author    Lubomir Litchev <llitchev@live.com>    2015-02-19 11:42:30 -0800
committer Lubomir Litchev <lubol@microsoft.com>  2015-10-20 14:20:36 -0700
commit    378e304f9e22b3c4d03c3b1b62c47b0aa58ceaf5 (patch)
tree      b83aec8f77caeb9ca94c6d7505a548b93cdb7259
parent    3015ff7afb4936a1c5c5856daa4e3482e6b390a9 (diff)
Implementation of System V ABI struct passing.
This PR adds support for System V x86_64 ABI struct classification and calling convention to the VM and the JIT, covering Unix-like targets including, but not limited to, Ubuntu Linux and Mac OS X. The general rules outlined in the System V x86_64 ABI (described at http://www.x86-64.org/documentation/abi.pdf) are followed, with a few exceptions described below:
1. The hidden argument for by-value passed structs is always placed after the 'this' parameter (if there is one). This is a difference from the System V ABI and affects only the internal JIT calling convention. For PInvoke calls the hidden argument is always the first parameter, since there is no 'this' parameter in that case.
2. Managed structs that have no fields are always passed by value on the stack.
3. The JIT proactively generates RBP-based frames (with RBP as a frame register) in order to aid the native OS tooling for stack unwinding and the like.
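For readers unfamiliar with the descriptor this change introduces in corinfo.h, the following is a minimal, standalone C++ sketch (illustration only, not part of this patch) of how a simple 16-byte struct maps onto the two-eightbyte classification data. The fields mirror SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR below; the example struct and the ClassifyExample helper are hypothetical and hard-code the result the ABI classification algorithm would produce for this layout.

// Illustration only (not part of this patch): a simplified stand-in for the
// SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR added to corinfo.h below.
#include <cstdint>
#include <cstdio>

// Subset of the CLR classification enum (values match the definitions in corinfo.h).
enum SystemVClassificationType : uint8_t
{
    SystemVClassificationTypeUnknown          = 0,
    SystemVClassificationTypeInteger          = 3,
    SystemVClassificationTypeIntegerReference = 4,
    SystemVClassificationTypeSSE              = 5,
};

// Simplified descriptor: at most two eightbytes are ever passed in registers.
struct StructRegPassingDescriptor
{
    bool                      passedInRegisters = false;
    uint8_t                   eightByteCount = 0;
    SystemVClassificationType eightByteClassifications[2] = {};
    uint8_t                   eightByteSizes[2] = {};
    uint8_t                   eightByteOffsets[2] = {};
};

// Hypothetical 16-byte struct: first eightbyte holds only the double, second only the long.
struct Example { double d; int64_t l; };

// Hard-coded classification for Example: eightbyte 0 is SSE, eightbyte 1 is INTEGER,
// so the struct is passed (and returned) in one XMM register and one GPR.
static StructRegPassingDescriptor ClassifyExample()
{
    StructRegPassingDescriptor desc;
    desc.passedInRegisters           = true; // <= 16 bytes and no MEMORY-classified field
    desc.eightByteCount              = 2;
    desc.eightByteClassifications[0] = SystemVClassificationTypeSSE;
    desc.eightByteSizes[0]           = 8;
    desc.eightByteOffsets[0]         = 0;
    desc.eightByteClassifications[1] = SystemVClassificationTypeInteger;
    desc.eightByteSizes[1]           = 8;
    desc.eightByteOffsets[1]         = 8;
    return desc;
}

int main()
{
    StructRegPassingDescriptor desc = ClassifyExample();
    for (int i = 0; i < desc.eightByteCount; i++)
    {
        printf("eightbyte %d: classification=%d size=%d offset=%d\n",
               i,
               (int)desc.eightByteClassifications[i],
               (int)desc.eightByteSizes[i],
               (int)desc.eightByteOffsets[i]);
    }
    return 0;
}

Under this classification such a struct would be returned in XMM0 and RAX, which is the two-register return case handled by the genStoreRegisterReturnInLclVar code added in codegenxarch.cpp below.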
-rw-r--r-- CMakeLists.txt 5
-rw-r--r-- src/debug/daccess/nidump.cpp 5
-rw-r--r-- src/inc/corinfo.h 44
-rw-r--r-- src/inc/winwrap.h 4
-rw-r--r-- src/jit/codegencommon.cpp 303
-rw-r--r-- src/jit/codegenlegacy.cpp 2
-rw-r--r-- src/jit/codegenlinear.h 21
-rw-r--r-- src/jit/codegenxarch.cpp 1276
-rw-r--r-- src/jit/compiler.cpp 121
-rw-r--r-- src/jit/compiler.h 158
-rw-r--r-- src/jit/compiler.hpp 15
-rw-r--r-- src/jit/ee_il_dll.cpp 64
-rw-r--r-- src/jit/emit.cpp 3
-rw-r--r-- src/jit/emitxarch.cpp 5
-rw-r--r-- src/jit/flowgraph.cpp 107
-rw-r--r-- src/jit/gentree.cpp 117
-rw-r--r-- src/jit/gentree.h 193
-rw-r--r-- src/jit/importer.cpp 266
-rw-r--r-- src/jit/jit.h 20
-rw-r--r-- src/jit/jitgcinfo.h 1
-rw-r--r-- src/jit/lclvars.cpp 613
-rw-r--r-- src/jit/lower.cpp 231
-rw-r--r-- src/jit/lower.h 4
-rw-r--r-- src/jit/lowerxarch.cpp 323
-rw-r--r-- src/jit/lsra.cpp 106
-rw-r--r-- src/jit/lsra.h 9
-rw-r--r-- src/jit/morph.cpp 825
-rw-r--r-- src/jit/regalloc.cpp 2
-rw-r--r-- src/jit/scopeinfo.cpp 62
-rw-r--r-- src/jit/target.h 26
-rw-r--r-- src/pal/src/cruntime/printfcpp.cpp 2
-rw-r--r-- src/vm/amd64/calldescrworkeramd64.S 36
-rw-r--r-- src/vm/amd64/cgenamd64.cpp 8
-rw-r--r-- src/vm/amd64/cgencpu.h 23
-rw-r--r-- src/vm/amd64/unixasmhelpers.S 58
-rw-r--r-- src/vm/argdestination.h 217
-rw-r--r-- src/vm/arm/stubs.cpp 2
-rw-r--r-- src/vm/callhelpers.cpp 10
-rw-r--r-- src/vm/callingconvention.h 275
-rw-r--r-- src/vm/class.cpp 2
-rw-r--r-- src/vm/class.h 83
-rw-r--r-- src/vm/class.inl 3
-rw-r--r-- src/vm/comdelegate.cpp 266
-rw-r--r-- src/vm/comdelegate.h 18
-rw-r--r-- src/vm/compile.cpp 17
-rw-r--r-- src/vm/crossdomaincalls.cpp 6
-rw-r--r-- src/vm/eetwain.cpp 6
-rw-r--r-- src/vm/fcall.h 3
-rw-r--r-- src/vm/field.h 1
-rw-r--r-- src/vm/fieldmarshaler.h 2
-rw-r--r-- src/vm/frames.cpp 5
-rw-r--r-- src/vm/i386/stublinkerx86.cpp 39
-rw-r--r-- src/vm/ilmarshalers.h 2
-rw-r--r-- src/vm/invokeutil.cpp 13
-rw-r--r-- src/vm/invokeutil.h 3
-rw-r--r-- src/vm/jitinterface.cpp 78
-rw-r--r-- src/vm/message.cpp 4
-rw-r--r-- src/vm/method.cpp 17
-rw-r--r-- src/vm/method.hpp 3
-rw-r--r-- src/vm/methodtable.cpp 915
-rw-r--r-- src/vm/methodtable.h 150
-rw-r--r-- src/vm/methodtable.inl 26
-rw-r--r-- src/vm/methodtablebuilder.cpp 102
-rw-r--r-- src/vm/methodtablebuilder.h 9
-rw-r--r-- src/vm/object.cpp 135
-rw-r--r-- src/vm/object.h 10
-rw-r--r-- src/vm/reflectioninvocation.cpp 28
-rw-r--r-- src/vm/siginfo.cpp 24
-rw-r--r-- src/vm/siginfo.hpp 5
-rw-r--r-- src/vm/stackbuildersink.cpp 7
-rw-r--r-- src/vm/threads.cpp 3
-rw-r--r-- src/vm/threads.h 28
-rw-r--r-- src/vm/threadsuspend.cpp 106
-rw-r--r-- tests/src/JIT/SIMD/project.lock.json 5
74 files changed, 6735 insertions, 951 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1be2864ecb..2ac0ebb07a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -345,6 +345,11 @@ endif (WIN32)
endif (OVERRIDE_CMAKE_CXX_FLAGS)
+if(CLR_CMAKE_PLATFORM_UNIX_TARGET_AMD64)
+add_definitions(-DFEATURE_UNIX_AMD64_STRUCT_PASSING_ITF)
+add_definitions(-DFEATURE_UNIX_AMD64_STRUCT_PASSING)
+endif (CLR_CMAKE_PLATFORM_UNIX_TARGET_AMD64)
+
OPTION(CMAKE_ENABLE_CODE_COVERAGE "Enable code coverage" OFF)
if(CMAKE_ENABLE_CODE_COVERAGE)
diff --git a/src/debug/daccess/nidump.cpp b/src/debug/daccess/nidump.cpp
index 44569d9874..c90c29f752 100644
--- a/src/debug/daccess/nidump.cpp
+++ b/src/debug/daccess/nidump.cpp
@@ -5678,7 +5678,12 @@ NativeImageDumper::EnumMnemonics s_MTFlagsLow[] =
MTFLAG_ENTRY(HasVariance),
MTFLAG_ENTRY(HasDefaultCtor),
MTFLAG_ENTRY(HasPreciseInitCctors),
+#if defined(FEATURE_HFA)
MTFLAG_ENTRY(IsHFA),
+#endif // FEATURE_HFA
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF)
+ MTFLAG_ENTRY(IsRegStructPassed),
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
MTFLAG_ENTRY(UNUSED_ComponentSize_4),
MTFLAG_ENTRY(UNUSED_ComponentSize_5),
MTFLAG_ENTRY(UNUSED_ComponentSize_6),
diff --git a/src/inc/corinfo.h b/src/inc/corinfo.h
index e0004a5948..cc2ce720b8 100644
--- a/src/inc/corinfo.h
+++ b/src/inc/corinfo.h
@@ -190,9 +190,10 @@ TODO: Talk about initializing strutures before use
#include <specstrings.h>
// For System V on the CLR type system number of registers to pass in and return a struct is the same.
-#define SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS 2
-#define SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_RETURN_IN_REGISTERS SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS
-#define SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS 16
+// The CLR type system allows only up to 2 eightbytes to be passed in registers. There are no SSEUP classification types.
+#define CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS 2
+#define CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_RETURN_IN_REGISTERS 2
+#define CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS 16
// System V struct passing
// The Classification types are described in the ABI spec at http://www.x86-64.org/documentation/abi.pdf
@@ -212,7 +213,7 @@ enum SystemVClassificationType : unsigned __int8
SystemVClassificationTypeMAX = 7,
};
-
+// Represents classification information for a struct.
struct SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR
{
SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR()
@@ -220,19 +221,40 @@ struct SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR
Initialize();
}
- bool canPassInRegisters;
- unsigned int eightByteCount;
- SystemVClassificationType eightByteClassifications[SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS];
- unsigned int eightByteSizes[SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS];
- unsigned int eightByteOffsets[SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS];
+ bool passedInRegisters; // Whether the struct can be passed (or returned) in registers.
+ unsigned __int8 eightByteCount; // Number of eightbytes for this struct.
+ SystemVClassificationType eightByteClassifications[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS]; // The eightbytes type classification.
+ unsigned __int8 eightByteSizes[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS]; // The size of each eightbyte (an eightbyte could include padding; this is the size without the padding).
+ unsigned __int8 eightByteOffsets[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS]; // The start offset of the eightbytes (in bytes).
+
+
+ //------------------------------------------------------------------------
+ // CopyFrom: Copies a struct classification into this one.
+ //
+ // Arguments:
+ // 'copyFrom' the struct classification to copy from.
+ //
+ void CopyFrom(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR& copyFrom)
+ {
+ passedInRegisters = copyFrom.passedInRegisters;
+ eightByteCount = copyFrom.eightByteCount;
+
+ for (int i = 0; i < CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS; i++)
+ {
+ eightByteClassifications[i] = copyFrom.eightByteClassifications[i];
+ eightByteSizes[i] = copyFrom.eightByteSizes[i];
+ eightByteOffsets[i] = copyFrom.eightByteOffsets[i];
+ }
+ }
// Members
+private:
void Initialize()
{
- canPassInRegisters = false;
+ passedInRegisters = false;
eightByteCount = 0;
- for (int i = 0; i < SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS; i++)
+ for (int i = 0; i < CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS; i++)
{
eightByteClassifications[i] = SystemVClassificationTypeUnknown;
eightByteSizes[i] = 0;
diff --git a/src/inc/winwrap.h b/src/inc/winwrap.h
index a670a51de0..c0c43eb74c 100644
--- a/src/inc/winwrap.h
+++ b/src/inc/winwrap.h
@@ -854,9 +854,13 @@ InterlockedCompareExchangePointer (
// Interlockedxxx64 that do not have intrinsics are only supported on Windows Server 2003
// or higher for X86 so define our own portable implementation
+#undef InterlockedIncrement64
#define InterlockedIncrement64 __InterlockedIncrement64
+#undef InterlockedDecrement64
#define InterlockedDecrement64 __InterlockedDecrement64
+#undef InterlockedExchange64
#define InterlockedExchange64 __InterlockedExchange64
+#undef InterlockedExchangeAdd64
#define InterlockedExchangeAdd64 __InterlockedExchangeAdd64
__forceinline LONGLONG __InterlockedIncrement64(LONGLONG volatile *Addend)
diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp
index 0828a160c9..ea3cce6cc8 100644
--- a/src/jit/codegencommon.cpp
+++ b/src/jit/codegencommon.cpp
@@ -3648,7 +3648,7 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
RegState *regState)
{
#ifdef DEBUG
- if (verbose)
+ if (verbose)
printf("*************** In genFnPrologCalleeRegArgs() for %s regs\n", regState->rsIsFloat ? "float" : "int");
#endif
@@ -3678,6 +3678,9 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
struct
{
unsigned varNum; // index into compiler->lvaTable[] for this register argument
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ var_types type; // the Jit type of this regArgTab entry
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
unsigned trashBy; // index into this regArgTab[] table of the register that will be copied to this register.
// That is, for regArgTab[x].trashBy = y, argument register number 'y' will be copied to
// argument register number 'x'. Only used when circular = true.
@@ -3691,18 +3694,20 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
unsigned varNum;
LclVarDsc * varDsc;
-
for (varNum = 0, varDsc = compiler->lvaTable;
varNum < compiler->lvaCount;
- varNum++ , varDsc++)
+ varNum++, varDsc++)
{
/* Is this variable a register arg? */
-
- if (!varDsc->lvIsParam)
+ if (!varDsc->lvIsParam)
+ {
continue;
+ }
- if (!varDsc->lvIsRegArg)
+ if (!varDsc->lvIsRegArg)
+ {
continue;
+ }
// When we have a promoted struct we have two possible LclVars that can represent the incoming argument
// in the regArgTab[], either the original TYP_STRUCT argument or the introduced lvStructField.
@@ -3726,13 +3731,17 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
// For register arguments that are independent promoted structs we put the promoted field varNum in the regArgTab[]
if (varDsc->lvPromoted)
+ {
continue;
+ }
}
else
{
// For register arguments that are not independent promoted structs we put the parent struct varNum in the regArgTab[]
if (varDsc->lvIsStructField)
+ {
continue;
+ }
}
}
@@ -3743,19 +3752,89 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
var_types regType = varDsc->TypeGet();
#endif // !_TARGET_ARM_
- if (isFloatRegType(regType) != doingFloat)
- continue;
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (regType != TYP_STRUCT)
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ // A struct might be passed partially in XMM registers for System V calls.
+ // So a single arg might use both register files.
+ if (isFloatRegType(regType) != doingFloat)
+ {
+ continue;
+ }
+ }
- /* Bingo - add it to our table */
-
- regArgNum = genMapRegNumToRegArgNum(varDsc->lvArgReg, regType);
- noway_assert(regArgNum < regState->rsCalleeRegArgNum);
- noway_assert(regArgTab[regArgNum].slot == 0); // we better not have added it already (there better not be multiple vars representing this argument register)
+ int slots = 0;
- regArgTab[regArgNum].varNum = varNum;
- regArgTab[regArgNum].slot = 1;
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ if (varDsc->TypeGet() == TYP_STRUCT)
+ {
+ CORINFO_CLASS_HANDLE typeHnd = varDsc->lvVerTypeInfo.GetClassHandle();
+ assert(typeHnd != nullptr);
+ compiler->eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc);
+ if (!structDesc.passedInRegisters)
+ {
+ // The var is not passed in registers.
+ continue;
+ }
- int slots = 1;
+ unsigned firstRegSlot = 0;
+ for (unsigned slotCounter = 0; slotCounter < structDesc.eightByteCount; slotCounter++)
+ {
+ regNumber regNum = varDsc->lvRegNumForSlot(slotCounter);
+
+ var_types regType = compiler->getEightByteType(structDesc, slotCounter);
+
+ regArgNum = genMapRegNumToRegArgNum(regNum, regType);
+
+ if ((!doingFloat &&
+ ((structDesc.eightByteClassifications[slotCounter] == SystemVClassificationTypeInteger) ||
+ (structDesc.eightByteClassifications[slotCounter] == SystemVClassificationTypeIntegerReference))) ||
+ (doingFloat && structDesc.eightByteClassifications[slotCounter] == SystemVClassificationTypeSSE))
+ {
+ // Store the reg for the first slot.
+ if (slots == 0)
+ {
+ firstRegSlot = regArgNum;
+ }
+
+ // Bingo - add it to our table
+ noway_assert(regArgNum < regState->rsCalleeRegArgNum);
+ noway_assert(regArgTab[regArgNum].slot == 0); // we better not have added it already (there better not be multiple vars representing this argument register)
+ regArgTab[regArgNum].varNum = varNum;
+ regArgTab[regArgNum].slot = (char)(slotCounter + 1);
+ regArgTab[regArgNum].type = regType;
+ slots++;
+ }
+ }
+
+ if (slots == 0)
+ {
+ continue; // Nothing to do for this regState set.
+ }
+
+ regArgNum = firstRegSlot;
+ }
+ else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ // Bingo - add it to our table
+ regArgNum = genMapRegNumToRegArgNum(varDsc->lvArgReg, regType);
+ noway_assert(regArgNum < regState->rsCalleeRegArgNum);
+ // we better not have added it already (there better not be multiple vars representing this argument register)
+ noway_assert(regArgTab[regArgNum].slot == 0);
+
+ // Set the register type.
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ regArgTab[regArgNum].type = regType;
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
+ regArgTab[regArgNum].varNum = varNum;
+ regArgTab[regArgNum].slot = 1;
+
+ slots = 1;
+ }
#ifdef _TARGET_ARM_
int lclSize = compiler->lvaLclSize(varNum);
@@ -3778,9 +3857,23 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
for (int i = 0; i < slots; i ++)
{
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // For structs passed in registers on System V systems,
+ // get the regType from the table for each slot.
+ if (regType == TYP_STRUCT)
+ {
+ regType = regArgTab[regArgNum + i].type;
+ }
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
regNumber regNum = genMapRegArgNumToRegNum(regArgNum + i, regType);
- assert((i > 0) || (regNum == varDsc->lvArgReg));
+ // lvArgReg could be INT or FLOAT reg. So the following assertion doesn't hold.
+ // The type of the register depends on the classification of the first eightbyte
+ // of the struct. For information on classification refer to the System V x86_64 ABI at:
+ // http://www.x86-64.org/documentation/abi.pdf
+#if !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ assert((i > 0) || (regNum == varDsc->lvArgReg));
+#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
// Is the arg dead on entry to the method ?
if ((regArgMaskLive & genRegMask(regNum)) == 0)
@@ -3831,8 +3924,8 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
/* If it goes on the stack or in a register that doesn't hold
* an argument anymore -> CANNOT form a circular dependency */
- if ( varDsc->lvIsInReg() &&
- (genRegMask(regNum) & regArgMaskLive) )
+ if (varDsc->lvIsInReg() &&
+ (genRegMask(regNum) & regArgMaskLive))
{
/* will trash another argument -> possible dependency
* We may need several passes after the table is constructed
@@ -3841,22 +3934,33 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
/* Maybe the argument stays in the register (IDEAL) */
if ((i == 0) && (varDsc->lvRegNum == regNum))
+ {
goto NON_DEP;
+ }
+#if !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if ((i == 1) && (varDsc->TypeGet() == TYP_STRUCT) &&
+ (varDsc->lvOtherReg == regNum))
+ {
+ goto NON_DEP;
+ }
+#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
if ((i == 1) && (genActualType(varDsc->TypeGet()) == TYP_LONG) &&
- (varDsc->lvOtherReg == regNum))
+ (varDsc->lvOtherReg == regNum))
+ {
goto NON_DEP;
+ }
if ((i == 1) && (genActualType(varDsc->TypeGet()) == TYP_DOUBLE) &&
- (REG_NEXT(varDsc->lvRegNum) == regNum))
+ (REG_NEXT(varDsc->lvRegNum) == regNum))
+ {
goto NON_DEP;
-
+ }
regArgTab[regArgNum+i].circular = true;
}
else
{
NON_DEP:
-
regArgTab[regArgNum+i].circular = false;
/* mark the argument register as free */
@@ -3870,7 +3974,6 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
* such that R1->R2 (that is, R1 needs to be moved to R2), R2->R3, ..., Rn->R1 */
bool change = true;
-
if (regArgMaskLive)
{
/* Possible circular dependencies still exist; the previous pass was not enough
@@ -3882,15 +3985,20 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
for (argNum = 0; argNum < regState->rsCalleeRegArgNum; argNum++)
{
- /* If we already marked the argument as non-circular then continue */
+ // If we already marked the argument as non-circular then continue
if (!regArgTab[argNum].circular)
+ {
continue;
+ }
if (regArgTab[argNum].slot == 0) // Not a register argument
+ {
continue;
+ }
- varNum = regArgTab[argNum].varNum; noway_assert(varNum < compiler->lvaCount);
+ varNum = regArgTab[argNum].varNum;
+ noway_assert(varNum < compiler->lvaCount);
varDsc = compiler->lvaTable + varNum;
noway_assert(varDsc->lvIsParam && varDsc->lvIsRegArg);
@@ -3899,11 +4007,19 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
noway_assert(!regArgTab[argNum].stackArg);
regNumber regNum = genMapRegArgNumToRegNum(argNum, varDsc->TypeGet());
+
regNumber destRegNum;
if (regArgTab[argNum].slot == 1)
{
destRegNum = varDsc->lvRegNum;
}
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ else
+ {
+ assert(regArgTab[argNum].slot == 2);
+ destRegNum = varDsc->lvOtherReg;
+ }
+#else // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
else if (regArgTab[argNum].slot == 2 &&
genActualType(varDsc->TypeGet()) == TYP_LONG)
{
@@ -3915,7 +4031,7 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
assert(varDsc->TypeGet() == TYP_DOUBLE);
destRegNum = REG_NEXT(varDsc->lvRegNum);
}
-
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
if (genRegMask(destRegNum) & regArgMaskLive)
{
/* we are trashing a live argument register - record it */
@@ -3949,33 +4065,47 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
}
#endif
- // TODO-AMD64-Bug? - homing of float argument registers with circular dependencies.
-#ifdef _TARGET_AMD64_
- NYI_IF((regArgMaskLive & RBM_FLTARG_REGS) != 0, "Homing of float argument registers with circular dependencies not implemented");
-#endif // _TARGET_AMD64_
+ // LSRA allocates registers to incoming parameters in order and will not overwrite
+ // a register still holding a live parameter.
+#ifndef LEGACY_BACKEND
+ noway_assert(((regArgMaskLive & RBM_FLTARG_REGS) == 0) && "Homing of float argument registers with circular dependencies not implemented.");
+#endif // LEGACY_BACKEND
/* Now move the arguments to their locations.
* First consider ones that go on the stack since they may
* free some registers. */
regArgMaskLive = regState->rsCalleeRegArgMaskLiveIn; // reset the live in to what it was at the start
-
for (argNum = 0; argNum < regState->rsCalleeRegArgNum; argNum++)
{
emitAttr size;
- /* If the arg is dead on entry to the method, skip it */
+ // If this is the wrong register file, just continue.
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (regArgTab[argNum].type == TYP_UNDEF)
+ {
+ // This could happen if the reg in regArgTab[argNum] is of the other register file -
+ // for System V register passed structs where the first reg is GPR and the second an XMM reg.
+ // The next register file processing will process it.
+ continue;
+ }
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // If the arg is dead on entry to the method, skip it
if (regArgTab[argNum].processed)
+ {
continue;
+ }
if (regArgTab[argNum].slot == 0) // Not a register argument
+ {
continue;
+ }
varNum = regArgTab[argNum].varNum; noway_assert(varNum < compiler->lvaCount);
varDsc = compiler->lvaTable + varNum;
- /* If not a stack arg go to the next one */
+ // If not a stack arg go to the next one
#ifndef _TARGET_64BIT_
if (varDsc->lvType == TYP_LONG)
@@ -3993,7 +4123,9 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
#endif // !_TARGET_64BIT_
{
if (!regArgTab[argNum].stackArg)
+ {
continue;
+ }
}
#if defined(_TARGET_ARM_)
@@ -4021,10 +4153,15 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
{
size = EA_SIZE(varDsc->lvSize());
#if defined(_TARGET_AMD64_)
- storeType = (var_types) ((size <= 4) ? TYP_INT : TYP_I_IMPL);
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ storeType = (var_types)((size <= 4) ? TYP_INT : TYP_I_IMPL);
// Must be 1, 2, 4, or 8, or else it wouldn't be passed in a register
noway_assert(EA_SIZE_IN_BYTES(size) <= 8);
assert((EA_SIZE_IN_BYTES(size) & (EA_SIZE_IN_BYTES(size) - 1)) == 0);
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+ storeType = regArgTab[argNum].type;
+ size = emitActualTypeSize(storeType);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
#elif defined(_TARGET_ARM64_)
// Must be <= 16 bytes or else it wouldn't be passed in registers
noway_assert(EA_SIZE_IN_BYTES(size) <= 16);
@@ -4060,7 +4197,7 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
regNumber srcRegNum = genMapRegArgNumToRegNum(argNum, storeType);
- /* Stack argument - if the ref count is 0 don't care about it */
+ // Stack argument - if the ref count is 0 don't care about it
if (!varDsc->lvOnFrame)
{
@@ -4084,6 +4221,7 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
regArgTab[argNum].processed = true;
regArgMaskLive &= ~genRegMask(srcRegNum);
+
#if defined(_TARGET_ARM_)
if (storeType == TYP_DOUBLE)
{
@@ -4094,7 +4232,6 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
}
/* Process any circular dependencies */
-
if (regArgMaskLive)
{
unsigned begReg, destReg, srcReg;
@@ -4105,21 +4242,39 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
if (doingFloat)
{
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
#if defined(_TARGET_ARM_)
insCopy = INS_vmov;
-
+#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ insCopy = INS_mov;
+#else
+#error Error. Wrong architecture.
+#endif
// Compute xtraReg here when we have a float argument
assert(xtraReg == REG_NA);
regMaskTP fpAvailMask;
fpAvailMask = RBM_FLT_CALLEE_TRASH & ~regArgMaskLive;
+#if defined(_TARGET_ARM_)
fpAvailMask &= RBM_DBL_REGS;
+#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ fpAvailMask &= RBM_ALLFLOAT;
+#else
+#error Error. Wrong architecture.
+#endif
+
if (fpAvailMask == RBM_NONE)
{
fpAvailMask = RBM_ALLFLOAT & ~regArgMaskLive;
+#if defined(_TARGET_ARM_)
fpAvailMask &= RBM_DBL_REGS;
+#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ fpAvailMask &= RBM_ALLFLOAT;
+#else
+#error Error. Wrong architecture.
+#endif
}
assert(fpAvailMask != RBM_NONE);
@@ -4135,23 +4290,30 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
for (argNum = 0; argNum < regState->rsCalleeRegArgNum; argNum++)
{
- /* If not a circular dependency then continue */
-
+ // If not a circular dependency then continue
if (!regArgTab[argNum].circular)
+ {
continue;
+ }
- /* If already processed the dependency then continue */
+ // If already processed the dependency then continue
if (regArgTab[argNum].processed)
+ {
continue;
+ }
if (regArgTab[argNum].slot == 0) // Not a register argument
+ {
continue;
-
+ }
+
destReg = begReg = argNum;
- srcReg = regArgTab[argNum].trashBy; noway_assert(srcReg < regState->rsCalleeRegArgNum);
+ srcReg = regArgTab[argNum].trashBy;
+ noway_assert(srcReg < regState->rsCalleeRegArgNum);
- varNumDest = regArgTab[destReg].varNum; noway_assert(varNumDest < compiler->lvaCount);
+ varNumDest = regArgTab[destReg].varNum;
+ noway_assert(varNumDest < compiler->lvaCount);
varDscDest = compiler->lvaTable + varNumDest;
noway_assert(varDscDest->lvIsParam && varDscDest->lvIsRegArg);
@@ -4376,6 +4538,18 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
varDsc = compiler->lvaTable + varNum;
regNumber regNum = genMapRegArgNumToRegNum(argNum, varDsc->TypeGet());
+ // If this is the wrong register file, just continue.
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (regArgTab[argNum].type == TYP_UNDEF)
+ {
+ // This could happen if the reg in regArgTab[argNum] is of the other register file -
+ // for System V register passed structs where the first reg is GPR and the second an XMM reg.
+ // The next register file processing will process it.
+ regArgMaskLive &= ~genRegMask(regNum);
+ continue;
+ }
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
noway_assert(varDsc->lvIsParam && varDsc->lvIsRegArg);
#ifndef _WIN64
//Right now we think that incoming arguments are not pointer sized. When we eventually
@@ -4506,7 +4680,7 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg,
}
#endif
}
-
+
noway_assert(regArgMaskLiveSave != regArgMaskLive); // if it doesn't change, we have an infinite loop
}
}
@@ -6729,12 +6903,14 @@ void CodeGen::genProfilingEnterCallback(regNumber initReg,
regNumber argReg = varDsc->lvArgReg;
getEmitter()->emitIns_R_S(ins_Load(loadType), emitTypeSize(loadType), argReg, varNum, 0);
+#if FEATURE_VARARG
if (compiler->info.compIsVarArgs && varTypeIsFloating(loadType))
{
regNumber intArgReg = compiler->getCallArgIntRegister(argReg);
instruction ins = ins_CopyFloatToInt(loadType, TYP_LONG);
inst_RV_RV(ins, argReg, intArgReg, loadType);
}
+#endif // FEATURE_VARARG
}
// If initReg is one of RBM_CALLEE_TRASH, then it needs to be zero'ed before using.
@@ -8495,6 +8671,7 @@ void CodeGen::genFnProlog()
#endif // !LEGACY_BACKEND
RegState *regState;
+
FOREACH_REGISTER_FILE(regState)
{
if (regState->rsCalleeRegArgMaskLiveIn)
@@ -10789,8 +10966,8 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
//------------------------------------------------------------------------
// ARM-specific methods used by both the classic and RyuJIT
//------------------------------------------------------------------------
-#ifdef _TARGET_ARM_
-CORINFO_CLASS_HANDLE Compiler::GetHfaClassHandle(GenTreePtr tree)
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+CORINFO_CLASS_HANDLE Compiler::GetStructClassHandle(GenTreePtr tree)
{
if (tree->TypeGet() == TYP_STRUCT)
{
@@ -10809,7 +10986,7 @@ CORINFO_CLASS_HANDLE Compiler::GetHfaClassHandle(GenTreePtr tree)
case GT_RETURN:
assert(tree->gtOp.gtOp1->gtOper == GT_LCL_VAR);
- return GetHfaClassHandle(tree->gtOp.gtOp1);
+ return GetStructClassHandle(tree->gtOp.gtOp1);
case GT_LDOBJ:
return tree->gtLdObj.gtClass;
@@ -10823,15 +11000,35 @@ CORINFO_CLASS_HANDLE Compiler::GetHfaClassHandle(GenTreePtr tree)
case GT_ASG:
assert(tree->gtOp.gtOp1->gtOper == GT_LCL_VAR || tree->gtOp.gtOp1->gtOper == GT_LCL_FLD);
- return GetHfaClassHandle(tree->gtOp.gtOp1);
-
+ return GetStructClassHandle(tree->gtOp.gtOp1);
default:
- unreached();
+ return NO_CLASS_HANDLE;
}
}
return NO_CLASS_HANDLE;
}
+#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+bool Compiler::IsRegisterPassable(CORINFO_CLASS_HANDLE hClass)
+{
+ if (hClass == NO_CLASS_HANDLE)
+ {
+ return false;
+ }
+
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(hClass, &structDesc);
+ return structDesc.passedInRegisters;
+}
+bool Compiler::IsRegisterPassable(GenTreePtr tree)
+{
+ return IsRegisterPassable(GetStructClassHandle(tree));
+}
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+#ifdef _TARGET_ARM_
bool Compiler::IsHfa(CORINFO_CLASS_HANDLE hClass)
{
return varTypeIsFloating(GetHfaType(hClass));
@@ -10839,12 +11036,12 @@ bool Compiler::IsHfa(CORINFO_CLASS_HANDLE hClass)
bool Compiler::IsHfa(GenTreePtr tree)
{
- return IsHfa(GetHfaClassHandle(tree));
+ return IsHfa(GetStructClassHandle(tree));
}
var_types Compiler::GetHfaType(GenTreePtr tree)
{
- return (tree->TypeGet() == TYP_STRUCT) ? GetHfaType(GetHfaClassHandle(tree)) : TYP_UNDEF;
+ return (tree->TypeGet() == TYP_STRUCT) ? GetHfaType(GetStructClassHandle(tree)) : TYP_UNDEF;
}
unsigned Compiler::GetHfaSlots(GenTreePtr tree)
diff --git a/src/jit/codegenlegacy.cpp b/src/jit/codegenlegacy.cpp
index e37322d3b4..0914f7d7d6 100644
--- a/src/jit/codegenlegacy.cpp
+++ b/src/jit/codegenlegacy.cpp
@@ -12870,7 +12870,7 @@ void CodeGen::genCodeForBBlist()
genStackLevel = 0;
#if FEATURE_STACK_FP_X87
genResetFPstkLevel();
-#endif //FEATURE_STACK_FP_X87
+#endif // FEATURE_STACK_FP_X87
#if !FEATURE_FIXED_OUT_ARGS
/* Check for inserted throw blocks and adjust genStackLevel */
diff --git a/src/jit/codegenlinear.h b/src/jit/codegenlinear.h
index 57eac7ced4..6a030eb926 100644
--- a/src/jit/codegenlinear.h
+++ b/src/jit/codegenlinear.h
@@ -103,6 +103,10 @@
void genConsumeBlockOp(GenTreeBlkOp* blkNode, regNumber dstReg, regNumber srcReg, regNumber sizeReg);
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ void genConsumePutArgStk(GenTreePutArgStk* putArgStkNode, regNumber dstReg, regNumber srcReg, regNumber sizeReg);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
void genConsumeRegs(GenTree* tree);
void genConsumeOperands(GenTreeOp* tree);
@@ -126,6 +130,11 @@
void genCodeForCpBlkUnroll (GenTreeCpBlk* cpBlkNode);
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ void genCodeForPutArgRepMovs(GenTreePutArgStk* putArgStkNode);
+ void genCodeForPutArgUnroll(GenTreePutArgStk* putArgStkNode);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
void genCodeForLoadOffset(instruction ins, emitAttr size, regNumber dst, GenTree* base, unsigned offset);
void genCodeForStoreOffset(instruction ins, emitAttr size, regNumber dst, GenTree* base, unsigned offset);
@@ -150,6 +159,18 @@
void genJmpMethod(GenTreePtr jmp);
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ void genGetStructTypeSizeOffset(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR& structDesc,
+ var_types* type0,
+ var_types* type1,
+ emitAttr* size0,
+ emitAttr* size1,
+ unsigned __int8* offset0,
+ unsigned __int8* offset1);
+
+ bool genStoreRegisterReturnInLclVar(GenTreePtr treeNode);
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
void genLclHeap(GenTreePtr tree);
bool genIsRegCandidateLocal (GenTreePtr tree)
diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp
index 076ba7c262..7064862c4c 100644
--- a/src/jit/codegenxarch.cpp
+++ b/src/jit/codegenxarch.cpp
@@ -785,7 +785,6 @@ void CodeGen::genCodeForBBlist()
#endif
/* Both stacks should always be empty on exit from a basic block */
-
noway_assert(genStackLevel == 0);
#ifdef _TARGET_AMD64_
@@ -1571,6 +1570,7 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
if (!treeNode->InReg() && !(treeNode->gtFlags & GTF_SPILLED))
{
assert(!isRegCandidate);
+
emit->emitIns_R_S(ins_Load(treeNode->TypeGet(), compiler->isSIMDTypeLocalAligned(lcl->gtLclNum)),
emitTypeSize(treeNode), treeNode->gtRegNum, lcl->gtLclNum, 0);
genProduceReg(treeNode);
@@ -1618,85 +1618,98 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
case GT_STORE_LCL_FLD:
{
- noway_assert(targetType != TYP_STRUCT);
- noway_assert(!treeNode->InReg());
- assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet()));
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (!genStoreRegisterReturnInLclVar(treeNode))
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ {
+ noway_assert(targetType != TYP_STRUCT);
+ noway_assert(!treeNode->InReg());
+ assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet()));
#ifdef FEATURE_SIMD
- // storing of TYP_SIMD12 (i.e. Vector3) field
- if (treeNode->TypeGet() == TYP_SIMD12)
- {
- genStoreLclFldTypeSIMD12(treeNode);
- break;
- }
+ // storing of TYP_SIMD12 (i.e. Vector3) field
+ if (treeNode->TypeGet() == TYP_SIMD12)
+ {
+ genStoreLclFldTypeSIMD12(treeNode);
+ break;
+ }
#endif
- GenTreePtr op1 = treeNode->gtOp.gtOp1;
- genConsumeRegs(op1);
- emit->emitInsBinary(ins_Store(targetType), emitTypeSize(treeNode), treeNode, op1);
+ GenTreePtr op1 = treeNode->gtOp.gtOp1;
+ genConsumeRegs(op1);
+ emit->emitInsBinary(ins_Store(targetType), emitTypeSize(treeNode), treeNode, op1);
+ }
}
break;
case GT_STORE_LCL_VAR:
{
- noway_assert(targetType != TYP_STRUCT);
- assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet()));
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (!genStoreRegisterReturnInLclVar(treeNode))
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ {
+ noway_assert(targetType != TYP_STRUCT);
+ assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet()));
- unsigned lclNum = treeNode->AsLclVarCommon()->gtLclNum;
- LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]);
+ unsigned lclNum = treeNode->AsLclVarCommon()->gtLclNum;
+ LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]);
- // Ensure that lclVar nodes are typed correctly.
- assert(!varDsc->lvNormalizeOnStore() || treeNode->TypeGet() == genActualType(varDsc->TypeGet()));
+ // Ensure that lclVar nodes are typed correctly.
+ assert(!varDsc->lvNormalizeOnStore() || treeNode->TypeGet() == genActualType(varDsc->TypeGet()));
#if !defined(_TARGET_64BIT_)
- if (treeNode->TypeGet() == TYP_LONG)
- {
- genStoreLongLclVar(treeNode);
- break;
- }
+ if (treeNode->TypeGet() == TYP_LONG)
+ {
+ genStoreLongLclVar(treeNode);
+ break;
+ }
#endif // !defined(_TARGET_64BIT_)
- GenTreePtr op1 = treeNode->gtOp.gtOp1;
- genConsumeRegs(op1);
- if (treeNode->gtRegNum == REG_NA)
- {
- // stack store
- emit->emitInsMov(ins_Store(targetType, compiler->isSIMDTypeLocalAligned(lclNum)), emitTypeSize(treeNode), treeNode);
- varDsc->lvRegNum = REG_STK;
- }
- else
- {
- bool containedOp1 = op1->isContained();
- // Look for the case where we have a constant zero which we've marked for reuse,
- // but which isn't actually in the register we want. In that case, it's better to create
- // zero in the target register, because an xor is smaller than a copy. Note that we could
- // potentially handle this in the register allocator, but we can't always catch it there
- // because the target may not have a register allocated for it yet.
- if (!containedOp1 && (op1->gtRegNum != treeNode->gtRegNum) && op1->IsZero())
+ GenTreePtr op1 = treeNode->gtOp.gtOp1;
+ genConsumeRegs(op1);
+
+ if (treeNode->gtRegNum == REG_NA)
{
- op1->gtRegNum = REG_NA;
- op1->ResetReuseRegVal();
- containedOp1 = true;
+ // stack store
+ emit->emitInsMov(ins_Store(targetType, compiler->isSIMDTypeLocalAligned(lclNum)), emitTypeSize(treeNode), treeNode);
+ varDsc->lvRegNum = REG_STK;
}
- if (containedOp1)
+ else
{
- // Currently, we assume that the contained source of a GT_STORE_LCL_VAR writing to a register
- // must be a constant. However, in the future we might want to support a contained memory op.
- // This is a bit tricky because we have to decide it's contained before register allocation,
- // and this would be a case where, once that's done, we need to mark that node as always
- // requiring a register - which we always assume now anyway, but once we "optimize" that
- // we'll have to take cases like this into account.
- assert((op1->gtRegNum == REG_NA) && op1->OperIsConst());
- genSetRegToConst(treeNode->gtRegNum, targetType, op1);
+ bool containedOp1 = op1->isContained();
+ // Look for the case where we have a constant zero which we've marked for reuse,
+ // but which isn't actually in the register we want. In that case, it's better to create
+ // zero in the target register, because an xor is smaller than a copy. Note that we could
+ // potentially handle this in the register allocator, but we can't always catch it there
+ // because the target may not have a register allocated for it yet.
+ if (!containedOp1 && (op1->gtRegNum != treeNode->gtRegNum) && op1->IsZero())
+ {
+ op1->gtRegNum = REG_NA;
+ op1->ResetReuseRegVal();
+ containedOp1 = true;
+ }
+ if (containedOp1)
+ {
+ // Currently, we assume that the contained source of a GT_STORE_LCL_VAR writing to a register
+ // must be a constant. However, in the future we might want to support a contained memory op.
+ // This is a bit tricky because we have to decide it's contained before register allocation,
+ // and this would be a case where, once that's done, we need to mark that node as always
+ // requiring a register - which we always assume now anyway, but once we "optimize" that
+ // we'll have to take cases like this into account.
+ assert((op1->gtRegNum == REG_NA) && op1->OperIsConst());
+ genSetRegToConst(treeNode->gtRegNum, targetType, op1);
+ }
+ else if (op1->gtRegNum != treeNode->gtRegNum)
+ {
+ assert(op1->gtRegNum != REG_NA);
+ emit->emitInsBinary(ins_Move_Extend(targetType, true), emitTypeSize(treeNode), treeNode, op1);
+ }
}
- else if (op1->gtRegNum != treeNode->gtRegNum)
+ if (treeNode->gtRegNum != REG_NA)
{
- assert(op1->gtRegNum != REG_NA);
- emit->emitInsBinary(ins_Move_Extend(targetType, true), emitTypeSize(treeNode), treeNode, op1);
+ genProduceReg(treeNode);
}
}
- if (treeNode->gtRegNum != REG_NA)
- genProduceReg(treeNode);
}
break;
@@ -1717,6 +1730,15 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
GenTreePtr op1 = treeNode->gtOp.gtOp1;
if (targetType == TYP_VOID)
{
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (compiler->info.compRetBuffArg != BAD_VAR_NUM)
+ {
+ // System V AMD64 spec requires that when a struct is returned by a hidden
+ // argument the RAX should contain the value of the hidden retbuf arg.
+ emit->emitIns_R_S(INS_mov, EA_BYREF, REG_RAX, compiler->info.compRetBuffArg, 0);
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
assert(op1 == nullptr);
}
#if !defined(_TARGET_64BIT_)
@@ -1742,53 +1764,233 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
#endif // !defined(_TARGET_64BIT_)
else
{
- assert(op1 != nullptr);
- noway_assert(op1->gtRegNum != REG_NA);
-
- // !! NOTE !! genConsumeReg will clear op1 as GC ref after it has
- // consumed a reg for the operand. This is because the variable
- // is dead after return. But we are issuing more instructions
- // like "profiler leave callback" after this consumption. So
- // if you are issuing more instructions after this point,
- // remember to keep the variable live up until the new method
- // exit point where it is actually dead.
- genConsumeReg(op1);
-
- regNumber retReg = varTypeIsFloating(treeNode) ? REG_FLOATRET : REG_INTRET;
-#ifdef _TARGET_X86_
- if (varTypeIsFloating(treeNode))
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (treeNode->TypeGet() == TYP_STRUCT &&
+ treeNode->gtOp.gtOp1->OperGet() == GT_LCL_VAR)
{
- if (genIsRegCandidateLocal(op1) && !compiler->lvaTable[op1->gtLclVarCommon.gtLclNum].lvRegister)
+ GenTreeLclVarCommon* lclVarPtr = treeNode->gtOp.gtOp1->AsLclVarCommon();
+ LclVarDsc* varDsc = &(compiler->lvaTable[lclVarPtr->gtLclNum]);
+ assert(varDsc->lvDontPromote);
+
+ CORINFO_CLASS_HANDLE typeHnd = varDsc->lvVerTypeInfo.GetClassHandle();
+ assert(typeHnd != nullptr);
+
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ compiler->eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc);
+ assert(structDesc.passedInRegisters);
+ assert(structDesc.eightByteCount == CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+
+ regNumber retReg0 = REG_NA;
+ emitAttr size0 = EA_UNKNOWN;
+ unsigned offset0 = structDesc.eightByteOffsets[0];
+ regNumber retReg1 = REG_NA;
+ emitAttr size1 = EA_UNKNOWN;
+ unsigned offset1 = structDesc.eightByteOffsets[1];
+
+ bool firstIntUsed = false;
+ bool firstFloatUsed = false;
+
+ var_types type0 = TYP_UNKNOWN;
+ var_types type1 = TYP_UNKNOWN;
+
+ // Set the first eightbyte data
+ switch (structDesc.eightByteClassifications[0])
{
- // Store local variable to its home location, if necessary.
- if ((op1->gtFlags & GTF_REG_VAL) != 0)
+ case SystemVClassificationTypeInteger:
+ if (structDesc.eightByteSizes[0] <= 4)
+ {
+ retReg0 = REG_INTRET;
+ size0 = EA_4BYTE;
+ type0 = TYP_INT;
+ firstIntUsed = true;
+ }
+ else if (structDesc.eightByteSizes[0] <= 8)
+ {
+ retReg0 = REG_LNGRET;
+ size0 = EA_8BYTE;
+ type0 = TYP_LONG;
+ firstIntUsed = true;
+ }
+ else
+ {
+ assert(false && "Bad int type.");
+ }
+ break;
+ case SystemVClassificationTypeIntegerReference:
+ assert(structDesc.eightByteSizes[0] == REGSIZE_BYTES);
+ retReg0 = REG_LNGRET;
+ size0 = EA_GCREF;
+ type0 = TYP_REF;
+ firstIntUsed = true;
+ break;
+ case SystemVClassificationTypeSSE:
+ if (structDesc.eightByteSizes[0] <= 4)
+ {
+ retReg0 = REG_FLOATRET;
+ size0 = EA_4BYTE;
+ type0 = TYP_FLOAT;
+ firstFloatUsed = true;
+ }
+ else if (structDesc.eightByteSizes[0] <= 8)
+ {
+ retReg0 = REG_DOUBLERET;
+ size0 = EA_8BYTE;
+ type0 = TYP_DOUBLE;
+ firstFloatUsed = true;
+ }
+ else
{
- op1->gtFlags &= ~GTF_REG_VAL;
- inst_TT_RV(ins_Store(op1->gtType, compiler->isSIMDTypeLocalAligned(op1->gtLclVarCommon.gtLclNum)), op1, op1->gtRegNum);
+ assert(false && "Bat float type."); // Not possible.
}
- // Now, load it to the fp stack.
- getEmitter()->emitIns_S(INS_fld, emitTypeSize(op1), op1->AsLclVarCommon()->gtLclNum, 0);
+ break;
+ default:
+ assert(false && "Bad EightByte classification.");
+ break;
}
- else
+
+ // Set the second eight byte data
+ switch (structDesc.eightByteClassifications[1])
{
- // Spill the value, which should be in a register, then load it to the fp stack.
- // TODO-X86-CQ: Deal with things that are already in memory (don't call genConsumeReg yet).
- op1->gtFlags |= GTF_SPILL;
- regSet.rsSpillTree(op1->gtRegNum, op1);
- op1->gtFlags |= GTF_SPILLED;
- op1->gtFlags &= ~GTF_SPILL;
-
- TempDsc* t = regSet.rsUnspillInPlace(op1);
- inst_FS_ST(INS_fld, emitActualTypeSize(op1->gtType), t, 0);
- op1->gtFlags &= ~GTF_SPILLED;
- compiler->tmpRlsTemp(t);
+ case SystemVClassificationTypeInteger:
+ if (structDesc.eightByteSizes[1] <= 4)
+ {
+ if (firstIntUsed)
+ {
+ retReg1 = REG_INTRET_1;
+ }
+ else
+ {
+ retReg1 = REG_INTRET;
+ }
+ type1 = TYP_INT;
+ size1 = EA_4BYTE;
+ }
+ else if (structDesc.eightByteSizes[1] <= 8)
+ {
+ if (firstIntUsed)
+ {
+ retReg1 = REG_LNGRET_1;
+ }
+ else
+ {
+ retReg1 = REG_LNGRET;
+ }
+ type1 = TYP_LONG;
+ size1 = EA_8BYTE;
+ }
+ else
+ {
+ assert(false && "Bad int type.");
+ }
+ break;
+ case SystemVClassificationTypeIntegerReference:
+ assert(structDesc.eightByteSizes[1] == REGSIZE_BYTES);
+ if (firstIntUsed)
+ {
+ retReg1 = REG_LNGRET_1;
+ }
+ else
+ {
+ retReg1 = REG_LNGRET;
+ }
+ type1 = TYP_REF;
+ size1 = EA_GCREF;
+ break;
+ case SystemVClassificationTypeSSE:
+ if (structDesc.eightByteSizes[1] <= 4)
+ {
+ if (firstFloatUsed)
+ {
+ retReg1 = REG_FLOATRET_1;
+ }
+ else
+ {
+ retReg1 = REG_FLOATRET;
+ }
+ type1 = TYP_FLOAT;
+ size1 = EA_4BYTE;
+ }
+ else if (structDesc.eightByteSizes[1] <= 8)
+ {
+ if (firstFloatUsed)
+ {
+ retReg1 = REG_DOUBLERET_1;
+ }
+ else
+ {
+ retReg1 = REG_DOUBLERET;
+ }
+ type1 = TYP_DOUBLE;
+ size1 = EA_8BYTE;
+ }
+ else
+ {
+ assert(false && "Bat float type."); // Not possible.
+ }
+ break;
+ default:
+ assert(false && "Bad EightByte classification.");
+ break;
}
+
+ // Move the values into the return registers.
+ //
+ emit->emitIns_R_S(ins_Load(type0), size0, retReg0, lclVarPtr->gtLclNum, offset0);
+ emit->emitIns_R_S(ins_Load(type1), size1, retReg1, lclVarPtr->gtLclNum, offset1);
}
else
-#endif // _TARGET_X86_
- if (op1->gtRegNum != retReg)
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
{
- inst_RV_RV(ins_Copy(targetType), retReg, op1->gtRegNum, targetType);
+ assert(op1 != nullptr);
+ noway_assert(op1->gtRegNum != REG_NA);
+
+ // !! NOTE !! genConsumeReg will clear op1 as GC ref after it has
+ // consumed a reg for the operand. This is because the variable
+ // is dead after return. But we are issuing more instructions
+ // like "profiler leave callback" after this consumption. So
+ // if you are issuing more instructions after this point,
+ // remember to keep the variable live up until the new method
+ // exit point where it is actually dead.
+ genConsumeReg(op1);
+
+ regNumber retReg = varTypeIsFloating(treeNode) ? REG_FLOATRET : REG_INTRET;
+#ifdef _TARGET_X86_
+ if (varTypeIsFloating(treeNode))
+ {
+ if (genIsRegCandidateLocal(op1) && !compiler->lvaTable[op1->gtLclVarCommon.gtLclNum].lvRegister)
+ {
+ // Store local variable to its home location, if necessary.
+ if ((op1->gtFlags & GTF_REG_VAL) != 0)
+ {
+ op1->gtFlags &= ~GTF_REG_VAL;
+ inst_TT_RV(ins_Store(op1->gtType, compiler->isSIMDTypeLocalAligned(op1->gtLclVarCommon.gtLclNum)), op1, op1->gtRegNum);
+ }
+ // Now, load it to the fp stack.
+ getEmitter()->emitIns_S(INS_fld, emitTypeSize(op1), op1->AsLclVarCommon()->gtLclNum, 0);
+ }
+ else
+ {
+ // Spill the value, which should be in a register, then load it to the fp stack.
+ // TODO-X86-CQ: Deal with things that are already in memory (don't call genConsumeReg yet).
+ op1->gtFlags |= GTF_SPILL;
+ regSet.rsSpillTree(op1->gtRegNum, op1);
+ op1->gtFlags |= GTF_SPILLED;
+ op1->gtFlags &= ~GTF_SPILL;
+
+ TempDsc* t = regSet.rsUnspillInPlace(op1);
+ inst_FS_ST(INS_fld, emitActualTypeSize(op1->gtType), t, 0);
+ op1->gtFlags &= ~GTF_SPILLED;
+ compiler->tmpRlsTemp(t);
+ }
+ }
+ else
+#endif // _TARGET_X86_
+ {
+ if (op1->gtRegNum != retReg)
+ {
+ inst_RV_RV(ins_Copy(targetType), retReg, op1->gtRegNum, targetType);
+ }
+ }
}
}
@@ -2468,6 +2670,14 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
genPutArgStk(treeNode);
#else // !_TARGET_X86_
{
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+ if (targetType == TYP_STRUCT)
+ {
+ genPutArgStk(treeNode);
+ break;
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
noway_assert(targetType != TYP_STRUCT);
assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet()));
@@ -2536,8 +2746,9 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
case GT_PUTARG_REG:
{
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
noway_assert(targetType != TYP_STRUCT);
-
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
// commas show up here commonly, as part of a nullchk operation
GenTree *op1 = treeNode->gtOp.gtOp1;
// If child node is not already in the register we need, move it
@@ -2546,8 +2757,8 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
{
inst_RV_RV(ins_Copy(targetType), treeNode->gtRegNum, op1->gtRegNum, targetType);
}
+ genProduceReg(treeNode);
}
- genProduceReg(treeNode);
break;
case GT_CALL:
@@ -2767,6 +2978,198 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
}
}
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+// This method handles storing double register return struct value to a
+// local homing stack location.
+// It returns true if this is a struct and storing of the returned
+// register value is handled. It returns false otherwise.
+bool
+CodeGen::genStoreRegisterReturnInLclVar(GenTreePtr treeNode)
+{
+ if (treeNode->TypeGet() == TYP_STRUCT)
+ {
+ noway_assert(!treeNode->InReg());
+
+ GenTreeLclVarCommon* lclVarPtr = treeNode->AsLclVarCommon();
+
+ LclVarDsc * varDsc = &(compiler->lvaTable[lclVarPtr->gtLclNum]);
+
+ CORINFO_CLASS_HANDLE typeHnd = varDsc->lvVerTypeInfo.GetClassHandle();
+ assert(typeHnd != nullptr);
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ compiler->eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc);
+
+ assert(structDesc.passedInRegisters);
+ assert(structDesc.eightByteCount == CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+
+ GenTreePtr op1 = treeNode->gtOp.gtOp1;
+ genConsumeRegs(op1);
+
+ regNumber retReg0 = REG_NA;
+ emitAttr size0 = EA_UNKNOWN;
+ unsigned offset0 = structDesc.eightByteOffsets[0];
+ regNumber retReg1 = REG_NA;
+ emitAttr size1 = EA_UNKNOWN;
+ unsigned offset1 = structDesc.eightByteOffsets[1];
+
+ bool firstIntUsed = false;
+ bool firstFloatUsed = false;
+
+ var_types type0 = TYP_UNKNOWN;
+ var_types type1 = TYP_UNKNOWN;
+
+ // Set the first eightbyte data
+ switch (structDesc.eightByteClassifications[0])
+ {
+ case SystemVClassificationTypeInteger:
+ if (structDesc.eightByteSizes[0] <= 4)
+ {
+ retReg0 = REG_INTRET;
+ size0 = EA_4BYTE;
+ type0 = TYP_INT;
+ firstIntUsed = true;
+ }
+ else if (structDesc.eightByteSizes[0] <= 8)
+ {
+ retReg0 = REG_LNGRET;
+ size0 = EA_8BYTE;
+ type0 = TYP_LONG;
+ firstIntUsed = true;
+ }
+ else
+ {
+ assert(false && "Bad int type.");
+ }
+ break;
+ case SystemVClassificationTypeIntegerReference:
+ assert(structDesc.eightByteSizes[0] == REGSIZE_BYTES);
+ retReg0 = REG_LNGRET;
+ size0 = EA_GCREF;
+ type0 = TYP_REF;
+ firstIntUsed = true;
+ break;
+ case SystemVClassificationTypeSSE:
+ if (structDesc.eightByteSizes[0] <= 4)
+ {
+ retReg0 = REG_FLOATRET;
+ size0 = EA_4BYTE;
+ type0 = TYP_FLOAT;
+ firstFloatUsed = true;
+ }
+ else if (structDesc.eightByteSizes[0] <= 8)
+ {
+ retReg0 = REG_DOUBLERET;
+ size0 = EA_8BYTE;
+ type0 = TYP_DOUBLE;
+ firstFloatUsed = true;
+ }
+ else
+ {
+ assert(false && "Bat float type."); // Not possible.
+ }
+ break;
+ default:
+ assert(false && "Bad EightByte classification.");
+ break;
+ }
+
+ // Set the second eight byte data
+ switch (structDesc.eightByteClassifications[1])
+ {
+ case SystemVClassificationTypeInteger:
+ if (structDesc.eightByteSizes[1] <= 4)
+ {
+ if (firstIntUsed)
+ {
+ retReg1 = REG_INTRET_1;
+ }
+ else
+ {
+ retReg1 = REG_INTRET;
+ }
+ type1 = TYP_INT;
+ size1 = EA_4BYTE;
+ }
+ else if (structDesc.eightByteSizes[1] <= 8)
+ {
+ if (firstIntUsed)
+ {
+ retReg1 = REG_LNGRET_1;
+ }
+ else
+ {
+ retReg1 = REG_LNGRET;
+ }
+ type1 = TYP_LONG;
+ size1 = EA_8BYTE;
+ }
+ else
+ {
+ assert(false && "Bad int type.");
+ }
+ break;
+ case SystemVClassificationTypeIntegerReference:
+ assert(structDesc.eightByteSizes[1] == REGSIZE_BYTES);
+ if (firstIntUsed)
+ {
+ retReg1 = REG_LNGRET_1;
+ }
+ else
+ {
+ retReg1 = REG_LNGRET;
+ }
+ type1 = TYP_REF;
+ size1 = EA_GCREF;
+ break;
+ case SystemVClassificationTypeSSE:
+ if (structDesc.eightByteSizes[1] <= 4)
+ {
+ if (firstFloatUsed)
+ {
+ retReg1 = REG_FLOATRET_1;
+ }
+ else
+ {
+ retReg1 = REG_FLOATRET;
+ }
+ type1 = TYP_FLOAT;
+ size1 = EA_4BYTE;
+ }
+ else if (structDesc.eightByteSizes[1] <= 8)
+ {
+ if (firstFloatUsed)
+ {
+ retReg1 = REG_DOUBLERET_1;
+ }
+ else
+ {
+ retReg1 = REG_DOUBLERET;
+ }
+ type1 = TYP_DOUBLE;
+ size1 = EA_8BYTE;
+ }
+ else
+ {
+ assert(false && "Bat float type."); // Not possible.
+ }
+ break;
+ default:
+ assert(false && "Bad EightByte classification.");
+ break;
+ }
+
+ // Move the values into the return registers.
+ //
+
+ getEmitter()->emitIns_S_R(ins_Store(type0), size0, retReg0, lclVarPtr->gtLclNum, offset0);
+ getEmitter()->emitIns_S_R(ins_Store(type1), size1, retReg1, lclVarPtr->gtLclNum, offset1);
+
+ return true;
+ }
+
+ return false;
+}
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
// Generate code for division (or mod) by power of two
// or negative powers of two. (meaning -1 * a power of two, not 2^(-1))
@@ -3366,40 +3769,55 @@ void CodeGen::genCodeForInitBlk(GenTreeInitBlk* initBlkNode)
// Generate code for a load from some address + offset
-// base: tree node which can be either a local address or arbitrary node
-// offset: distance from the base from which to load
-void CodeGen::genCodeForLoadOffset(instruction ins, emitAttr size, regNumber dst, GenTree* base, unsigned offset)
+// baseNode: tree node which can be either a local address or arbitrary node
+// offset: distance from the baseNode from which to load
+void CodeGen::genCodeForLoadOffset(instruction ins, emitAttr size, regNumber dst, GenTree* baseNode, unsigned offset)
{
emitter *emit = getEmitter();
- if (base->OperIsLocalAddr())
+ if (baseNode->OperIsLocalAddr())
{
- if (base->gtOper == GT_LCL_FLD_ADDR)
- offset += base->gtLclFld.gtLclOffs;
- emit->emitIns_R_S(ins, size, dst, base->gtLclVarCommon.gtLclNum, offset);
+ if (baseNode->gtOper == GT_LCL_FLD_ADDR)
+ offset += baseNode->gtLclFld.gtLclOffs;
+ emit->emitIns_R_S(ins, size, dst, baseNode->gtLclVarCommon.gtLclNum, offset);
}
else
{
- emit->emitIns_R_AR(ins, size, dst, base->gtRegNum, offset);
+ emit->emitIns_R_AR(ins, size, dst, baseNode->gtRegNum, offset);
}
}
// Generate code for a store to some address + offset
-// base: tree node which can be either a local address or arbitrary node
-// offset: distance from the base from which to load
-void CodeGen::genCodeForStoreOffset(instruction ins, emitAttr size, regNumber src, GenTree* base, unsigned offset)
+// baseNode: tree node which can be either a local address or arbitrary node
+// offset: distance from the baseNode from which to load
+void CodeGen::genCodeForStoreOffset(instruction ins, emitAttr size, regNumber src, GenTree* baseNode, unsigned offset)
{
emitter *emit = getEmitter();
- if (base->OperIsLocalAddr())
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (baseNode->OperGet() == GT_PUTARG_STK)
{
- if (base->gtOper == GT_LCL_FLD_ADDR)
- offset += base->gtLclFld.gtLclOffs;
- emit->emitIns_S_R(ins, size, src, base->gtLclVarCommon.gtLclNum, offset);
+ GenTreePutArgStk* putArgStkNode = baseNode->AsPutArgStk();
+ assert(putArgStkNode->gtOp.gtOp1->isContained());
+ assert(putArgStkNode->gtOp.gtOp1->gtOp.gtOper == GT_LDOBJ);
+
+ emit->emitIns_S_R(ins, size, src, compiler->lvaOutgoingArgSpaceVar,
+ (putArgStkNode->gtSlotNum * TARGET_POINTER_SIZE) + offset);
}
else
+#endif // #ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
{
- emit->emitIns_AR_R(ins, size, src, base->gtRegNum, offset);
+
+ if (baseNode->OperIsLocalAddr())
+ {
+ if (baseNode->gtOper == GT_LCL_FLD_ADDR)
+ offset += baseNode->gtLclFld.gtLclOffs;
+ emit->emitIns_S_R(ins, size, src, baseNode->gtLclVarCommon.gtLclNum, offset);
+ }
+ else
+ {
+ emit->emitIns_AR_R(ins, size, src, baseNode->gtRegNum, offset);
+ }
}
}
@@ -3523,6 +3941,126 @@ void CodeGen::genCodeForCpBlkRepMovs(GenTreeCpBlk* cpBlkNode)
instGen(INS_r_movsb);
}
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+// Generates PutArg code by performing a loop unroll
+//
+// TODO-Amd64-Unix: Try to share code with copyblk.
+// The difference for now is that the putarg_stk contains its children, while cpblk does not.
+// This creates differences in code. After some significant refactoring it could be reused.
+void CodeGen::genCodeForPutArgUnroll(GenTreePutArgStk* putArgNode)
+{
+ // Make sure we got the arguments of the cpblk operation in the right registers
+ GenTreePtr dstAddr = putArgNode;
+ GenTreePtr srcAddr = putArgNode->gtOp.gtOp1;
+
+ size_t size = putArgNode->gtNumSlots * TARGET_POINTER_SIZE;
+ assert(size <= CPBLK_UNROLL_LIMIT);
+
+ emitter *emit = getEmitter();
+
+ assert(srcAddr->isContained());
+ assert(srcAddr->gtOper == GT_LDOBJ);
+
+ if (!srcAddr->gtOp.gtOp1->isContained())
+ {
+ genConsumeReg(srcAddr->gtOp.gtOp1);
+ }
+
+ unsigned offset = 0;
+
+ // If the size of this struct is larger than 16 bytes
+ // let's use SSE2 to be able to do 16 byte at a time
+ // loads and stores.
+ if (size >= XMM_REGSIZE_BYTES)
+ {
+ assert(putArgNode->gtRsvdRegs != RBM_NONE);
+ regNumber xmmReg = genRegNumFromMask(putArgNode->gtRsvdRegs & RBM_ALLFLOAT);
+ assert(genIsValidFloatReg(xmmReg));
+ size_t slots = size / XMM_REGSIZE_BYTES;
+
+ while (slots-- > 0)
+ {
+ // Load
+ genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmReg, srcAddr->gtOp.gtOp1, offset); // Load from the address held by the child of the LdObj node.
+ // Store
+ genCodeForStoreOffset(INS_movdqu, EA_8BYTE, xmmReg, dstAddr, offset);
+ offset += XMM_REGSIZE_BYTES;
+ }
+ }
+
+ // Fill the remainder (15 bytes or less) if there's one.
+ if ((size & 0xf) != 0)
+ {
+ // Grab the integer temp register to emit the remaining loads and stores.
+ regNumber tmpReg = genRegNumFromMask(putArgNode->gtRsvdRegs & RBM_ALLINT);
+
+ if ((size & 8) != 0)
+ {
+#ifdef _TARGET_X86_
+ // TODO-X86-CQ: [1091735] Revisit block ops codegen. One example: use movq for 8 byte movs.
+ for (unsigned savedOffs = offset; offset < savedOffs + 8; offset += 4)
+ {
+ genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, srcAddr, offset);
+ genCodeForStoreOffset(INS_mov, EA_4BYTE, tmpReg, dstAddr, offset);
+ }
+#else // !_TARGET_X86_
+ genCodeForLoadOffset(INS_mov, EA_8BYTE, tmpReg, srcAddr->gtOp.gtOp1, offset);
+ genCodeForStoreOffset(INS_mov, EA_8BYTE, tmpReg, dstAddr, offset);
+ offset += 8;
+#endif // !_TARGET_X86_
+ }
+ if ((size & 4) != 0)
+ {
+ genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, srcAddr->gtOp.gtOp1, offset);
+ genCodeForStoreOffset(INS_mov, EA_4BYTE, tmpReg, dstAddr, offset);
+ offset += 4;
+ }
+ if ((size & 2) != 0)
+ {
+ genCodeForLoadOffset(INS_mov, EA_2BYTE, tmpReg, srcAddr->gtOp.gtOp1, offset);
+ genCodeForStoreOffset(INS_mov, EA_2BYTE, tmpReg, dstAddr, offset);
+ offset += 2;
+ }
+ if ((size & 1) != 0)
+ {
+ genCodeForLoadOffset(INS_mov, EA_1BYTE, tmpReg, srcAddr->gtOp.gtOp1, offset);
+ genCodeForStoreOffset(INS_mov, EA_1BYTE, tmpReg, dstAddr, offset);
+ }
+ }
+}
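For orientation, the unrolled copy above follows the same shape as the standalone sketch below. It is illustrative only: the JIT emits movdqu/mov instructions rather than calling a helper, and CopyArgSlots is a hypothetical name.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Hypothetical helper mirroring the copy order of genCodeForPutArgUnroll:
    // 16-byte (XMM-sized) chunks first, then an 8/4/2/1-byte tail.
    static void CopyArgSlots(uint8_t* dst, const uint8_t* src, size_t size)
    {
        size_t offset = 0;
        while (size - offset >= 16)        // movdqu-sized chunks
        {
            std::memcpy(dst + offset, src + offset, 16);
            offset += 16;
        }
        if (size & 8) { std::memcpy(dst + offset, src + offset, 8); offset += 8; }
        if (size & 4) { std::memcpy(dst + offset, src + offset, 4); offset += 4; }
        if (size & 2) { std::memcpy(dst + offset, src + offset, 2); offset += 2; }
        if (size & 1) { std::memcpy(dst + offset, src + offset, 1); }
    }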
+
+// Generate code for PutArgStk (for structs) by using rep movs
+// Preconditions:
+// The size argument of the PutArgStk (for structs) is a constant and is between
+// CPBLK_UNROLL_LIMIT and CPBLK_MOVS_LIMIT bytes.
+void CodeGen::genCodeForPutArgRepMovs(GenTreePutArgStk* putArgNode)
+{
+
+ // Make sure we got the arguments of the putarg_stk operation in the right registers
+ GenTreePtr dstAddr = putArgNode;
+ GenTreePtr srcAddr = putArgNode->gtOp.gtOp1;
+#ifdef DEBUG
+ size_t size = putArgNode->gtNumSlots * TARGET_POINTER_SIZE;
+#endif // DEBUG
+
+ // Validate state.
+ assert(putArgNode->gtRsvdRegs == (RBM_RDI | RBM_RCX | RBM_RSI));
+
+#ifdef DEBUG
+ assert(srcAddr->isContained());
+
+#ifdef _TARGET_AMD64_
+ assert(size > CPBLK_UNROLL_LIMIT);
+#else
+ assert(size > CPBLK_UNROLL_LIMIT && size < CPBLK_MOVS_LIMIT);
+#endif
+
+#endif // DEBUG
+ genConsumePutArgStk(putArgNode, REG_RDI, REG_RSI, REG_RCX);
+ instGen(INS_r_movsb);
+}
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
// Generate code for CpObj nodes which copy structs that have interleaved
// GC pointers.
// This will generate a sequence of movsq instructions for the cases of non-gc members
@@ -3686,7 +4224,7 @@ void CodeGen::genCodeForCpBlk(GenTreeCpBlk* cpBlkNode)
{
#ifdef _TARGET_AMD64_
// Make sure we got the arguments of the cpblk operation in the right registers
- GenTreePtr blockSize = cpBlkNode->Size();
+ GenTreePtr blockSize = cpBlkNode->Size();
GenTreePtr dstAddr = cpBlkNode->Dest();
GenTreePtr srcAddr = cpBlkNode->Source();
@@ -3705,7 +4243,7 @@ void CodeGen::genCodeForCpBlk(GenTreeCpBlk* cpBlkNode)
genEmitHelperCall(CORINFO_HELP_MEMCPY, 0, EA_UNKNOWN);
#else // !_TARGET_AMD64_
- NYI_X86("Helper call for CpBlk");
+ noway_assert(false && "Helper call for CpBlk is not needed.");
#endif // !_TARGET_AMD64_
}
@@ -4558,7 +5096,9 @@ regNumber CodeGen::genConsumeReg(GenTree *tree)
// genUpdateLife() will also spill local var if marked as GTF_SPILL by calling CodeGen::genSpillVar
genUpdateLife(tree);
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
assert(tree->gtRegNum != REG_NA);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
// there are three cases where consuming a reg means clearing the bit in the live mask
// 1. it was not produced by a local
@@ -4678,6 +5218,82 @@ void CodeGen::genConsumeOperands(GenTreeOp* tree)
}
}
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+void CodeGen::genConsumePutArgStk(GenTreePutArgStk* putArgNode, regNumber dstReg, regNumber srcReg, regNumber sizeReg)
+{
+ // The putArgNode children are always contained. We should not consume any registers.
+
+ GenTree* dst = putArgNode;
+
+#ifdef DEBUG
+ // Get the GT_ADDR node, which is GT_LCL_VAR_ADDR (asserted below.)
+ GenTree* src = putArgNode->gtOp.gtOp1;
+ assert(src->OperGet() == GT_LDOBJ);
+ src = src->gtOp.gtOp1;
+#else // !DEBUG
+ // Get the GT_ADDR node, which is GT_LCL_VAR_ADDR (asserted below.)
+ GenTree* src = putArgNode->gtOp.gtOp1->gtOp.gtOp1;
+#endif // !DEBUG
+
+ size_t size = putArgNode->gtNumSlots * TARGET_POINTER_SIZE;
+ GenTree* op1;
+ GenTree* op2;
+
+ regNumber reg1, reg2, reg3;
+ op1 = dst;
+ reg1 = dstReg;
+ op2 = src;
+ reg2 = srcReg;
+ reg3 = sizeReg;
+
+ if (reg2 != REG_NA && op2->gtRegNum != REG_NA)
+ {
+ genConsumeReg(op2);
+ }
+
+ if ((reg1 != REG_NA) && (op1->gtRegNum != reg1))
+ {
+#if FEATURE_FIXED_OUT_ARGS
+ // Generate an LEA instruction to load the address of the outgoing arg space var + SlotNum offset into RDI.
+ LclVarDsc * varDsc = &compiler->lvaTable[compiler->lvaOutgoingArgSpaceVar];
+ int offset = varDsc->lvStkOffs + putArgNode->gtSlotNum * TARGET_POINTER_SIZE;
+ // Outgoing area always on top of the stack (relative to rsp.)
+ getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, reg1, REG_SPBASE, offset);
+#else // !FEATURE_FIXED_OUT_ARGS
+ NYI_X86("Stack args for x86/RyuJIT");
+#endif // !FEATURE_FIXED_OUT_ARGS
+
+ }
+
+ if (op2->gtRegNum != reg2)
+ {
+ if (src->OperIsLocalAddr())
+ {
+ // The OperLocalAddr is always contained.
+ assert(src->isContained());
+ GenTreeLclVarCommon* lclNode = src->AsLclVarCommon();
+
+ // Generate LEA instruction to load the LclVar address in RSI.
+ LclVarDsc * varLclDsc = &compiler->lvaTable[lclNode->gtLclNum];
+ int offset = varLclDsc->lvStkOffs;
+
+ // Outgoing area is always on top of the stack (relative to rsp.)
+ getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, reg2, (isFramePointerUsed() ? getFramePointerReg() : REG_SPBASE), offset);
+ }
+ else
+ {
+ assert(src->gtRegNum != REG_NA);
+ getEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, reg2, src->gtRegNum);
+ }
+ }
+
+ if ((reg3 != REG_NA))
+ {
+ inst_RV_IV(INS_mov, reg3, size, EA_8BYTE);
+ }
+}
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
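As a reading aid, the register setup produced by genConsumePutArgStk for the rep-movs path boils down to the instruction pattern sketched below. This is an illustrative summary only; the RDI/RSI/RCX choices come from the callers above, and the symbolic offsets are stand-ins for the computed stack offsets.

    // lea  rdi, [rsp + outgoingArgSpaceOffset + gtSlotNum * TARGET_POINTER_SIZE]  ; destination arg slot
    // lea  rsi, [frameReg + localStackOffset]                                     ; source local (or: mov rsi, srcReg)
    // mov  rcx, gtNumSlots * TARGET_POINTER_SIZE                                  ; byte count
    // rep  movsb                                                                  ; emitted by genCodeForPutArgRepMovs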
+
void CodeGen::genConsumeBlockOp(GenTreeBlkOp* blkNode, regNumber dstReg, regNumber srcReg, regNumber sizeReg)
{
// We have to consume the registers, and perform any copies, in the actual execution order.
@@ -4827,7 +5443,6 @@ void CodeGen::genTransferRegGCState(regNumber dst, regNumber src)
}
}
-
// generates an ip-relative call or indirect call via reg ('call reg')
// pass in 'addr' for a relative call or 'base' for a indirect register call
// methHnd - optional, only used for pretty printing
@@ -4843,9 +5458,9 @@ void CodeGen::genEmitCall(int callType,
bool isJump,
bool isNoGC)
{
-#ifndef _TARGET_X86_
+#if !defined(_TARGET_X86_)
ssize_t argSize = 0;
-#endif // !_TARGET_X86_
+#endif // !defined(_TARGET_X86_)
getEmitter()->emitIns_Call(emitter::EmitCallType(callType),
methHnd,
INDEBUG_LDISASM_COMMA(sigInfo)
@@ -4867,14 +5482,14 @@ void CodeGen::genEmitCall(int callType,
void CodeGen::genEmitCall(int callType,
CORINFO_METHOD_HANDLE methHnd,
INDEBUG_LDISASM_COMMA(CORINFO_SIG_INFO* sigInfo)
- GenTreeIndir* indir
+ GenTreeIndir* indir
X86_ARG(ssize_t argSize),
emitAttr retSize,
IL_OFFSETX ilOffset)
{
-#ifndef _TARGET_X86_
+#if !defined(_TARGET_X86_)
ssize_t argSize = 0;
-#endif // !_TARGET_X86_
+#endif // !defined(_TARGET_X86_)
genConsumeAddress(indir->Addr());
getEmitter()->emitIns_Call(emitter::EmitCallType(callType),
@@ -4920,13 +5535,49 @@ void CodeGen::genCallInstruction(GenTreePtr node)
if (curArgTabEntry->regNum == REG_STK)
continue;
- regNumber argReg = curArgTabEntry->regNum;
- genConsumeReg(argNode);
- if (argNode->gtRegNum != argReg)
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // Deal with multi register passed struct args.
+ if (argNode->OperGet() == GT_LIST)
{
- inst_RV_RV(ins_Move_Extend(argNode->TypeGet(), argNode->InReg()), argReg, argNode->gtRegNum);
+ GenTreeArgList* argListPtr = argNode->AsArgList();
+ unsigned iterationNum = 0;
+ for (; argListPtr; argListPtr = argListPtr->Rest(), iterationNum++)
+ {
+ GenTreePtr putArgRegNode = argListPtr->gtOp.gtOp1;
+ assert(putArgRegNode->gtOper == GT_PUTARG_REG);
+ regNumber argReg = REG_NA;
+ if (iterationNum == 0)
+ {
+ argReg = curArgTabEntry->regNum;
+ }
+ else if (iterationNum == 1)
+ {
+ argReg = curArgTabEntry->otherRegNum;
+ }
+ else
+ {
+ assert(false); // Illegal state.
+ }
+
+ genConsumeReg(putArgRegNode);
+ if (putArgRegNode->gtRegNum != argReg)
+ {
+ inst_RV_RV(ins_Move_Extend(putArgRegNode->TypeGet(), putArgRegNode->InReg()), argReg, putArgRegNode->gtRegNum);
+ }
+ }
+ }
+ else
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ {
+ regNumber argReg = curArgTabEntry->regNum;
+ genConsumeReg(argNode);
+ if (argNode->gtRegNum != argReg)
+ {
+ inst_RV_RV(ins_Move_Extend(argNode->TypeGet(), argNode->InReg()), argReg, argNode->gtRegNum);
+ }
}
+#if FEATURE_VARARG
// In the case of a varargs call,
// the ABI dictates that if we have floating point args,
// we must pass the enregistered arguments in both the
@@ -4937,9 +5588,10 @@ void CodeGen::genCallInstruction(GenTreePtr node)
instruction ins = ins_CopyFloatToInt(argNode->TypeGet(), TYP_LONG);
inst_RV_RV(ins, argNode->gtRegNum, targetReg);
}
+#endif // FEATURE_VARARG
}
-#ifdef _TARGET_X86_
+#if defined(_TARGET_X86_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
// The call will pop its arguments.
// for each putarg_stk:
ssize_t stackArgBytes = 0;
@@ -4949,16 +5601,31 @@ void CodeGen::genCallInstruction(GenTreePtr node)
GenTreePtr arg = args->gtOp.gtOp1;
if (arg->OperGet() != GT_ARGPLACE && !(arg->gtFlags & GTF_LATE_ARG))
{
+#if defined(_TARGET_X86_)
assert((arg->OperGet() == GT_PUTARG_STK) || (arg->OperGet() == GT_LONG));
if (arg->OperGet() == GT_LONG)
{
assert((arg->gtGetOp1()->OperGet() == GT_PUTARG_STK) && (arg->gtGetOp2()->OperGet() == GT_PUTARG_STK));
}
+#endif // defined(_TARGET_X86_)
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (genActualType(arg->TypeGet()) == TYP_STRUCT)
+ {
+ if (arg->OperGet() == GT_PUTARG_STK)
+ {
+ GenTreeLdObj* ldObj = arg->gtGetOp1()->AsLdObj();
+ stackArgBytes = compiler->info.compCompHnd->getClassSize(ldObj->gtClass);
+ }
+ }
+ else
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
stackArgBytes += genTypeSize(genActualType(arg->TypeGet()));
}
args = args->gtOp.gtOp2;
}
-#endif // _TARGET_X86_
+#endif // defined(_TARGET_X86_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
// Insert a null check on "this" pointer if asked.
if (call->NeedsNullCheck())
@@ -5056,9 +5723,9 @@ void CodeGen::genCallInstruction(GenTreePtr node)
methHnd,
INDEBUG_LDISASM_COMMA(sigInfo)
(void*) target->AsIndir()->Base()->AsIntConCommon()->IconValue(),
-#ifdef _TARGET_X86_
+#if defined(_TARGET_X86_)
stackArgBytes,
-#endif // _TARGET_X86_
+#endif // defined(_TARGET_X86_)
retSize,
ilOffset);
}
@@ -5070,9 +5737,9 @@ void CodeGen::genCallInstruction(GenTreePtr node)
methHnd,
INDEBUG_LDISASM_COMMA(sigInfo)
target->AsIndir(),
-#ifdef _TARGET_X86_
+#if defined(_TARGET_X86_)
stackArgBytes,
-#endif // _TARGET_X86_
+#endif // defined(_TARGET_X86_)
retSize,
ilOffset);
}
@@ -5086,9 +5753,9 @@ void CodeGen::genCallInstruction(GenTreePtr node)
methHnd,
INDEBUG_LDISASM_COMMA(sigInfo)
nullptr, //addr
-#ifdef _TARGET_X86_
+#if defined(_TARGET_X86_)
stackArgBytes,
-#endif // _TARGET_X86_
+#endif // defined(_TARGET_X86_)
retSize,
ilOffset,
genConsumeReg(target));
@@ -5153,9 +5820,9 @@ void CodeGen::genCallInstruction(GenTreePtr node)
methHnd,
INDEBUG_LDISASM_COMMA(sigInfo)
addr,
-#ifdef _TARGET_X86_
+#if defined(_TARGET_X86_)
stackArgBytes,
-#endif // _TARGET_X86_
+#endif // defined(_TARGET_X86_)
retSize,
ilOffset);
}
@@ -5168,10 +5835,10 @@ void CodeGen::genCallInstruction(GenTreePtr node)
genPendingCallLabel = nullptr;
}
-#ifdef _TARGET_X86_
+#if defined(_TARGET_X86_)
// The call will pop its arguments.
genStackLevel -= stackArgBytes;
-#endif // _TARGET_X86_
+#endif // defined(_TARGET_X86_)
// Update GC info:
// All Callee arg registers are trashed and no longer contain any GC pointers.
@@ -5218,6 +5885,130 @@ void CodeGen::genCallInstruction(GenTreePtr node)
}
}
+//------------------------------------------------------------------------
+// genGetStructTypeSizeOffset: Gets the type, size and offset of the eightbytes of a struct for System V systems.
+//
+// Arguments:
+// 'structDesc' struct description
+// 'type0' returns the type of the first eightbyte.
+// 'type1' returns the type of the second eightbyte.
+// 'size0' returns the size of the first eightbyte.
+// 'size1' returns the size of the second eightbyte.
+// 'offset0' returns the offset of the first eightbyte.
+// 'offset1' returns the offset of the second eightbyte.
+//
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+void CodeGen::genGetStructTypeSizeOffset(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR& structDesc,
+ var_types* type0, var_types* type1, emitAttr* size0, emitAttr* size1,
+ unsigned __int8* offset0, unsigned __int8* offset1)
+{
+ *size0 = EA_UNKNOWN;
+ *offset0 = structDesc.eightByteOffsets[0];
+ *size1 = EA_UNKNOWN;
+ *offset1 = structDesc.eightByteOffsets[1];
+
+ *type0 = TYP_UNKNOWN;
+ *type1 = TYP_UNKNOWN;
+
+ // Set the first eightbyte data
+ if (structDesc.eightByteCount >= 1)
+ {
+ switch (structDesc.eightByteClassifications[0])
+ {
+ case SystemVClassificationTypeInteger:
+ if (structDesc.eightByteSizes[0] <= 4)
+ {
+ *size0 = EA_4BYTE;
+ *type0 = TYP_INT;
+ }
+ else if (structDesc.eightByteSizes[0] <= 8)
+ {
+ *size0 = EA_8BYTE;
+ *type0 = TYP_LONG;
+ }
+ else
+ {
+ assert(false && "Bad int type.");
+ }
+ break;
+ case SystemVClassificationTypeIntegerReference:
+ assert(structDesc.eightByteSizes[0] == REGSIZE_BYTES);
+ *size0 = EA_GCREF;
+ *type0 = TYP_REF;
+ break;
+ case SystemVClassificationTypeSSE:
+ if (structDesc.eightByteSizes[0] <= 4)
+ {
+ *size0 = EA_4BYTE;
+ *type0 = TYP_FLOAT;
+ }
+ else if (structDesc.eightByteSizes[0] <= 8)
+ {
+ *size0 = EA_8BYTE;
+ *type0 = TYP_DOUBLE;
+ }
+ else
+ {
+ assert(false && "Bat float type."); // Not possible.
+ }
+ break;
+ default:
+ assert(false && "Bad EightByte classification.");
+ break;
+ }
+ }
+
+ // Set the second eight byte data
+ if (structDesc.eightByteCount == 2)
+ {
+ switch (structDesc.eightByteClassifications[1])
+ {
+ case SystemVClassificationTypeInteger:
+ if (structDesc.eightByteSizes[1] <= 4)
+ {
+ *type1 = TYP_INT;
+ *size1 = EA_4BYTE;
+ }
+ else if (structDesc.eightByteSizes[1] <= 8)
+ {
+ *type1 = TYP_LONG;
+ *size1 = EA_8BYTE;
+ }
+ else
+ {
+ assert(false && "Bad int type.");
+ }
+ break;
+ case SystemVClassificationTypeIntegerReference:
+ assert(structDesc.eightByteSizes[1] == REGSIZE_BYTES);
+ *type1 = TYP_REF;
+ *size1 = EA_GCREF;
+ break;
+ case SystemVClassificationTypeSSE:
+ if (structDesc.eightByteSizes[1] <= 4)
+ {
+ *type1 = TYP_FLOAT;
+ *size1 = EA_4BYTE;
+ }
+ else if (structDesc.eightByteSizes[1] <= 8)
+ {
+ *type1 = TYP_DOUBLE;
+ *size1 = EA_8BYTE;
+ }
+ else
+ {
+ assert(false && "Bat float type."); // Not possible.
+ }
+ break;
+ default:
+ assert(false && "Bad EightByte classification.");
+ break;
+ }
+ }
+}
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
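A concrete example may help. Assuming the standard System V classification rules, a 16-byte struct containing a double followed by a 64-bit integer splits into an SSE eightbyte and an Integer eightbyte; the struct layout below and the reported values are illustrative only.

    // Hypothetical example struct (16 bytes):
    struct Example
    {
        double    d;   // eightbyte 0: SystemVClassificationTypeSSE
        long long l;   // eightbyte 1: SystemVClassificationTypeInteger
    };
    // genGetStructTypeSizeOffset would then report:
    //   type0 = TYP_DOUBLE, size0 = EA_8BYTE, offset0 = 0
    //   type1 = TYP_LONG,   size1 = EA_8BYTE, offset1 = 8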
+
// Produce code for a GT_JMP node.
// The arguments of the caller needs to be transferred to the callee before exiting caller.
// The actual jump to callee is generated as part of caller epilog sequence.
@@ -5319,36 +6110,94 @@ void CodeGen::genJmpMethod(GenTreePtr jmp)
if (!varDsc->lvIsRegArg)
continue;
- // Register argument
- noway_assert(isRegParamType(genActualType(varDsc->TypeGet())));
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (varDsc->lvType == TYP_STRUCT)
+ {
+ CORINFO_CLASS_HANDLE typeHnd = varDsc->lvVerTypeInfo.GetClassHandle();
+ assert(typeHnd != nullptr);
- // Is register argument already in the right register?
- // If not load it from its stack location.
- var_types loadType = varDsc->lvaArgType();
- regNumber argReg = varDsc->lvArgReg; // incoming arg register
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ compiler->eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc);
+ assert(structDesc.passedInRegisters);
- if (varDsc->lvRegNum != argReg)
- {
- assert(genIsValidReg(argReg));
+ emitAttr size0 = EA_UNKNOWN;
+ emitAttr size1 = EA_UNKNOWN;
+ unsigned __int8 offset0 = 0;
+ unsigned __int8 offset1 = 0;
+ var_types type0 = TYP_UNKNOWN;
+ var_types type1 = TYP_UNKNOWN;
+
+ // Get the eightbyte data
+ genGetStructTypeSizeOffset(structDesc, &type0, &type1, &size0, &size1, &offset0, &offset1);
+
+ // Move the values into the right registers.
+ //
+ if (type0 != TYP_UNKNOWN)
+ {
+ getEmitter()->emitIns_R_S(ins_Load(type0), size0, varDsc->lvArgReg, varNum, offset0);
+
+ // Update varDsc->lvArgReg life and GC Info to indicate varDsc stack slot is dead and argReg is going live.
+ // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it.
+ // Therefore manually update life of argReg. Note that GT_JMP marks the end of the basic block
+ // and after which reg life and gc info will be recomputed for the new block in genCodeForBBList().
+ regSet.rsMaskVars |= genRegMask(varDsc->lvArgReg);
+ gcInfo.gcMarkRegPtrVal(varDsc->lvArgReg, type0);
+ }
+
+ if (type1 != TYP_UNKNOWN)
+ {
+ getEmitter()->emitIns_R_S(ins_Load(type1), size1, varDsc->lvOtherArgReg, varNum, offset1);
- getEmitter()->emitIns_R_S(ins_Load(loadType), emitTypeSize(loadType), argReg, varNum, 0);
+ // Update varDsc->lvArgReg life and GC Info to indicate varDsc stack slot is dead and argReg is going live.
+ // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it.
+ // Therefore manually update life of argReg. Note that GT_JMP marks the end of the basic block
+ // and after which reg life and gc info will be recomputed for the new block in genCodeForBBList().
+ regSet.rsMaskVars |= genRegMask(varDsc->lvOtherArgReg);
+ gcInfo.gcMarkRegPtrVal(varDsc->lvOtherArgReg, type1);
+ }
- // Update argReg life and GC Info to indicate varDsc stack slot is dead and argReg is going live.
- // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it.
- // Therefore manually update life of argReg. Note that GT_JMP marks the end of the basic block
- // and after which reg life and gc info will be recomputed for the new block in genCodeForBBList().
- regSet.rsMaskVars |= genRegMask(argReg);
- gcInfo.gcMarkRegPtrVal(argReg, loadType);
if (varDsc->lvTracked)
{
- VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varNum);
+ VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varNum);
}
}
+ else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ // Register argument
+ noway_assert(isRegParamType(genActualType(varDsc->TypeGet())));
+ // Is register argument already in the right register?
+ // If not load it from its stack location.
+ var_types loadType = varDsc->lvaArgType();
+ regNumber argReg = varDsc->lvArgReg; // incoming arg register
+
+ if (varDsc->lvRegNum != argReg)
+ {
+ assert(genIsValidReg(argReg));
+ getEmitter()->emitIns_R_S(ins_Load(loadType), emitTypeSize(loadType), argReg, varNum, 0);
+
+ // Update argReg life and GC Info to indicate varDsc stack slot is dead and argReg is going live.
+ // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it.
+ // Therefore manually update life of argReg. Note that GT_JMP marks the end of the basic block
+ // and after which reg life and gc info will be recomputed for the new block in genCodeForBBList().
+ regSet.rsMaskVars |= genRegMask(argReg);
+ gcInfo.gcMarkRegPtrVal(argReg, loadType);
+ if (varDsc->lvTracked)
+ {
+ VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varNum);
+ }
+ }
+ }
+
+#if FEATURE_VARARG
// In case of a jmp call to a vararg method also pass the float/double arg in the corresponding int arg register.
if (compiler->info.compIsVarArgs)
{
regNumber intArgReg;
+ var_types loadType = varDsc->lvaArgType();
+ regNumber argReg = varDsc->lvArgReg; // incoming arg register
+
if (varTypeIsFloating(loadType))
{
intArgReg = compiler->getCallArgIntRegister(argReg);
@@ -5368,8 +6217,10 @@ void CodeGen::genJmpMethod(GenTreePtr jmp)
firstArgVarNum = varNum;
}
}
+#endif // FEATURE_VARARG
}
+#if FEATURE_VARARG
// Jmp call to a vararg method - if the method has fewer than 4 fixed arguments,
// load the remaining arg registers (both int and float) from the corresponding
// shadow stack slots. This is for the reason that we don't know the number and type
@@ -5409,7 +6260,7 @@ void CodeGen::genJmpMethod(GenTreePtr jmp)
getEmitter()->emitEnableGC();
}
}
-
+#endif // FEATURE_VARARG
}
// produce code for a GT_LEA subnode
@@ -6488,13 +7339,122 @@ CodeGen::genMathIntrinsic(GenTreePtr treeNode)
genProduceReg(treeNode);
}
-#ifdef _TARGET_X86_
+#if defined(_TARGET_X86_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+//---------------------------------------------------------------------
+// genPutArgStk - generate code for putting a struct arg on the stack by value.
+// In case there are references to heap objects in the struct,
+// it generates the gcinfo as well.
+//
+// Arguments
+// treeNode - the GT_PUTARG_STK node
+//
+// Return value:
+// None
+//
void
CodeGen::genPutArgStk(GenTreePtr treeNode)
{
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
assert(treeNode->OperGet() == GT_PUTARG_STK);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
var_types targetType = treeNode->TypeGet();
+#ifdef _TARGET_X86_
noway_assert(targetType != TYP_STRUCT);
+#elif defined (FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ noway_assert(targetType == TYP_STRUCT);
+
+ GenTreePutArgStk* putArgStk = treeNode->AsPutArgStk();
+ if (putArgStk->gtNumberReferenceSlots == 0)
+ {
+ switch (putArgStk->gtPutArgStkKind)
+ {
+ case GenTreePutArgStk::PutArgStkKindRepInstr:
+ genCodeForPutArgRepMovs(putArgStk);
+ break;
+ case GenTreePutArgStk::PutArgStkKindUnroll:
+ genCodeForPutArgUnroll(putArgStk);
+ break;
+ default:
+ unreached();
+ }
+ }
+ else
+ {
+ // No need to disable GC the way COPYOBJ does. Here the refs are always copied with atomic (pointer-sized) operations.
+
+ // Consume these registers.
+ // They may now contain gc pointers (depending on their type; gcMarkRegPtrVal will "do the right thing").
+ genConsumePutArgStk(putArgStk, REG_RDI, REG_RSI, REG_NA);
+ GenTreePtr dstAddr = putArgStk;
+ GenTreePtr srcAddr = putArgStk->gtOp.gtOp1;
+ gcInfo.gcMarkRegPtrVal(REG_RSI, srcAddr->TypeGet());
+ gcInfo.gcMarkRegPtrVal(REG_RDI, dstAddr->TypeGet());
+
+ unsigned slots = putArgStk->gtNumSlots;
+
+ // We are always copying to the stack, so we don't need to use the write barrier.
+ BYTE* gcPtrs = putArgStk->gtGcPtrs;
+ unsigned gcPtrCount = putArgStk->gtNumberReferenceSlots;
+
+ unsigned i = 0;
+ unsigned copiedSlots = 0;
+ while (i < slots)
+ {
+ switch (gcPtrs[i])
+ {
+ case TYPE_GC_NONE:
+ // Let's see if we can use rep movsq instead of a sequence of movsq instructions
+ // to save cycles and code size.
+ {
+ unsigned nonGcSlotCount = 0;
+
+ do
+ {
+ nonGcSlotCount++;
+ i++;
+ } while (i < slots && gcPtrs[i] == TYPE_GC_NONE);
+
+ // If we have a very small contiguous non-gc region, it's better just to
+ // emit a sequence of movsq instructions
+ if (nonGcSlotCount < CPOBJ_NONGC_SLOTS_LIMIT)
+ {
+ copiedSlots += nonGcSlotCount;
+ while (nonGcSlotCount > 0)
+ {
+ instGen(INS_movsq);
+ nonGcSlotCount--;
+ }
+ }
+ else
+ {
+ getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, nonGcSlotCount);
+ copiedSlots += nonGcSlotCount;
+ instGen(INS_r_movsq);
+ }
+ }
+ break;
+ default:
+ // We have a GC pointer
+ // TODO-Amd64-Unix: Here a better solution (for code size and CQ) would be to use movsq instruction,
+ // but the logic for emitting a GC info record is not available (it is internal for the emitter only.)
+ // See emitGCVarLiveUpd function. If we could call it separately, we could do instGen(INS_movsq); and emission of gc info.
+
+ getEmitter()->emitIns_R_AR(ins_Load(TYP_REF), EA_GCREF, REG_RCX, REG_RSI, 0);
+ getEmitter()->emitIns_S_R(ins_Store(TYP_REF), EA_GCREF, REG_RCX, compiler->lvaOutgoingArgSpaceVar,
+ ((copiedSlots + putArgStk->gtSlotNum) * TARGET_POINTER_SIZE));
+ getEmitter()->emitIns_R_I(INS_add, EA_8BYTE, REG_RSI, TARGET_POINTER_SIZE);
+ getEmitter()->emitIns_R_I(INS_add, EA_8BYTE, REG_RDI, TARGET_POINTER_SIZE);
+ copiedSlots++;
+ gcPtrCount--;
+ i++;
+ }
+ }
+
+ gcInfo.gcMarkRegSetNpt(RBM_RSI);
+ gcInfo.gcMarkRegSetNpt(RBM_RDI);
+ }
+ return;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet()));
GenTreePtr data = treeNode->gtOp.gtOp1;
@@ -6508,7 +7468,9 @@ CodeGen::genPutArgStk(GenTreePtr treeNode)
// Decrement SP.
int argSize = genTypeSize(genActualType(targetType));
inst_RV_IV(INS_sub, REG_SPBASE, argSize, emitActualTypeSize(TYP_I_IMPL));
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
genStackLevel += argSize;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
// TODO-Cleanup: Handle this in emitInsMov() in emitXArch.cpp?
if (data->isContained())
@@ -6522,7 +7484,7 @@ CodeGen::genPutArgStk(GenTreePtr treeNode)
getEmitter()->emitIns_AR_R(ins_Store(targetType), emitTypeSize(targetType), data->gtRegNum, REG_SPBASE, 0);
}
}
-#endif // _TARGET_X86_
+#endif // defined(_TARGET_X86_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
/*****************************************************************************
*
diff --git a/src/jit/compiler.cpp b/src/jit/compiler.cpp
index 427d778b90..b54657202a 100644
--- a/src/jit/compiler.cpp
+++ b/src/jit/compiler.cpp
@@ -2992,7 +2992,6 @@ void Compiler::compCompile(void * * methodCodePtr,
unsigned compileFlags)
{
hashBv::Init(this);
-
VarSetOps::AssignAllowUninitRhs(this, compCurLife, VarSetOps::UninitVal());
/* The temp holding the secret stub argument is used by fgImport() when importing the intrinsic. */
@@ -4042,7 +4041,6 @@ int Compiler::compCompileHelper (CORINFO_MODULE_HANDLE clas
unsigned compileFlags,
CorInfoInstantiationVerification instVerInfo)
{
-
CORINFO_METHOD_HANDLE methodHnd = info.compMethodHnd;
info.compCode = methodInfo->ILCode;
@@ -5027,6 +5025,125 @@ START:
return result;
}
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
+// GetTypeFromClassificationAndSizes:
+// Returns the type of the eightbyte accounting for the classification and size of the eightbyte.
+//
+// args:
+// classType: classification type
+// size: size of the eightbyte.
+//
+var_types Compiler::GetTypeFromClassificationAndSizes(SystemVClassificationType classType, int size)
+{
+ var_types type = TYP_UNKNOWN;
+ switch (classType)
+ {
+ case SystemVClassificationTypeInteger:
+ if (size == 1)
+ {
+ type = TYP_BYTE;
+ }
+ else if (size <= 2)
+ {
+ type = TYP_SHORT;
+ }
+ else if (size <= 4)
+ {
+ type = TYP_INT;
+ }
+ else if (size <= 8)
+ {
+ type = TYP_LONG;
+ }
+ else
+ {
+ assert(false && "GetTypeFromClassificationAndSizes Invalid Integer classification type.");
+ }
+ break;
+ case SystemVClassificationTypeIntegerReference:
+ type = TYP_REF;
+ break;
+ case SystemVClassificationTypeSSE:
+ if (size <= 4)
+ {
+ type = TYP_FLOAT;
+ }
+ else if (size <= 8)
+ {
+ type = TYP_DOUBLE;
+ }
+ else
+ {
+ assert(false && "GetTypeFromClassificationAndSizes Invalid SSE classification type.");
+ }
+ break;
+
+ default:
+ assert(false && "GetTypeFromClassificationAndSizes Invalid classification type.");
+ break;
+ }
+
+ return type;
+}
+
+// getEightByteType:
+// Returns the type of the eightbyte for the given struct classification description and eightbyte slot number.
+//
+// args:
+// structDesc: struct classification description.
+// slotNum: eightbyte slot number for the struct.
+//
+var_types Compiler::getEightByteType(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR& structDesc, unsigned slotNum)
+{
+ var_types eightByteType = TYP_UNDEF;
+ unsigned len = structDesc.eightByteSizes[slotNum];
+
+ switch (structDesc.eightByteClassifications[slotNum])
+ {
+ case SystemVClassificationTypeInteger:
+ // See typelist.h for jit type definition.
+ // All the types of size <= 4 bytes are of jit type TYP_INT.
+ if (structDesc.eightByteSizes[slotNum] <= 4)
+ {
+ eightByteType = TYP_INT;
+ }
+ else if (structDesc.eightByteSizes[slotNum] <= 8)
+ {
+ eightByteType = TYP_LONG;
+ }
+ else
+ {
+ assert(false && "getEightByteType Invalid Integer classification type.");
+ }
+ break;
+ case SystemVClassificationTypeIntegerReference:
+ assert(len == REGSIZE_BYTES);
+ eightByteType = TYP_REF;
+ break;
+ case SystemVClassificationTypeSSE:
+ if (structDesc.eightByteSizes[slotNum] <= 4)
+ {
+ eightByteType = TYP_FLOAT;
+ }
+ else if (structDesc.eightByteSizes[slotNum] <= 8)
+ {
+ eightByteType = TYP_DOUBLE;
+ }
+ else
+ {
+ assert(false && "getEightByteType Invalid SSE classification type.");
+ }
+ break;
+ default:
+ assert(false && "getEightByteType Invalid classification type.");
+ break;
+ }
+
+ return eightByteType;
+}
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
/*****************************************************************************/
/*****************************************************************************/
diff --git a/src/jit/compiler.h b/src/jit/compiler.h
index 520c94a462..bc851dcf1d 100644
--- a/src/jit/compiler.h
+++ b/src/jit/compiler.h
@@ -269,9 +269,12 @@ public:
unsigned char lvOverlappingFields :1; // True when we have a struct with possibly overlapping fields
unsigned char lvContainsHoles :1; // True when we have a promoted struct that contains holes
unsigned char lvCustomLayout :1; // True when this struct has "CustomLayout"
-#ifdef _TARGET_ARM_
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
unsigned char lvDontPromote:1; // Should struct promoter consider this variable for promotion?
- unsigned char lvIsHfaRegArg:1; // Is this argument variable holding a HFA register argument.
+#endif
+
+#ifdef _TARGET_ARM_
+ unsigned char lvIsHfaRegArg :1; // Is this argument variable holding a HFA register argument.
unsigned char lvHfaTypeIsFloat:1; // Is the HFA type float or double?
#endif
@@ -290,7 +293,7 @@ public:
unsigned char lvSIMDType :1; // This is a SIMD struct
unsigned char lvUsedInSIMDIntrinsic :1; // This tells lclvar is used for simd intrinsic
#endif // FEATURE_SIMD
- unsigned char lvRegStruct : 1; // This is a reg-sized non-field-addressed struct.
+ unsigned char lvRegStruct :1; // This is a reg-sized non-field-addressed struct.
union
{
@@ -305,6 +308,26 @@ public:
unsigned char lvFldOffset;
unsigned char lvFldOrdinal;
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ regNumber lvRegNumForSlot(unsigned slotNum)
+ {
+ if (slotNum == 0)
+ {
+ return lvArgReg;
+ }
+ else if (slotNum == 1)
+ {
+ return lvOtherArgReg;
+ }
+ else
+ {
+ assert(false && "Invalid slotNum!");
+ }
+
+ unreached();
+ }
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
private:
regNumberSmall _lvRegNum; // Used to store the register this variable is in (or, the low register of a register pair).
@@ -314,7 +337,13 @@ private:
#if !defined(_TARGET_64BIT_)
regNumberSmall _lvOtherReg; // Used for "upper half" of long var.
#endif // !defined(_TARGET_64BIT_)
+
regNumberSmall _lvArgReg; // The register in which this argument is passed.
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ regNumberSmall _lvOtherArgReg; // Used for the second part of the struct passed in a register.
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
#ifndef LEGACY_BACKEND
union
{
@@ -382,7 +411,7 @@ public:
regNumber lvArgReg;
regNumber GetArgReg() const
-{
+ {
return (regNumber) _lvArgReg;
}
@@ -392,6 +421,22 @@ public:
assert(_lvArgReg == reg);
}
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ __declspec(property(get = GetOtherArgReg, put = SetOtherArgReg))
+ regNumber lvOtherArgReg;
+
+ regNumber GetOtherArgReg() const
+ {
+ return (regNumber)_lvOtherArgReg;
+ }
+
+ void SetOtherArgReg(regNumber reg)
+ {
+ _lvOtherArgReg = (regNumberSmall)reg;
+ assert(_lvOtherArgReg == reg);
+ }
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
#ifdef FEATURE_SIMD
// Is this is a SIMD struct?
bool lvIsSIMDType() const
@@ -1139,6 +1184,15 @@ struct FuncInfoDsc
struct fgArgTabEntry
{
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ fgArgTabEntry()
+ {
+ otherRegNum = REG_NA;
+ isStruct = false; // is this a struct arg
+ }
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
GenTreePtr node; // Initially points at the Op1 field of 'parent', but if the argument is replaced with an GT_ASG or placeholder
// it will point at the actual argument in the gtCallLateArgs list.
GenTreePtr parent; // Points at the GT_LIST node in the gtCallArgs for this argument
@@ -1165,6 +1219,13 @@ struct fgArgTabEntry
bool isBackFilled :1; // True when the argument fills a register slot skipped due to alignment requirements of previous arguments.
bool isNonStandard:1; // True if it is an arg that is passed in a reg other than a standard arg reg
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ regNumber otherRegNum; // The (second) register to use when passing this argument.
+ bool isStruct; // is this a struct arg
+
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
void SetIsHfaRegArg(bool hfaRegArg)
{
isHfaRegArg = hfaRegArg;
@@ -1196,10 +1257,10 @@ class fgArgInfo
unsigned nextSlotNum; // Updatable slot count value
unsigned stkLevel; // Stack depth when we make this call (for x86)
- unsigned argTableSize; // size of argTable array (equal to the argCount when done with fgMorphArgs)
- bool argsComplete; // marker for state
- bool argsSorted; // marker for state
- fgArgTabEntryPtr * argTable; // variable sized array of per argument descrption: (i.e. argTable[argTableSize])
+ unsigned argTableSize; // size of argTable array (equal to the argCount when done with fgMorphArgs)
+ bool argsComplete; // marker for state
+ bool argsSorted; // marker for state
+ fgArgTabEntryPtr * argTable; // variable sized array of per-argument descriptions: (i.e. argTable[argTableSize])
private:
@@ -1217,11 +1278,24 @@ public:
unsigned numRegs,
unsigned alignment);
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ fgArgTabEntryPtr AddRegArg (unsigned argNum,
+ GenTreePtr node,
+ GenTreePtr parent,
+ regNumber regNum,
+ unsigned numRegs,
+ unsigned alignment,
+ const bool isStruct,
+ const regNumber otherRegNum = REG_NA,
+ const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* const structDescPtr = nullptr);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
fgArgTabEntryPtr AddStkArg (unsigned argNum,
GenTreePtr node,
GenTreePtr parent,
unsigned numSlots,
- unsigned alignment);
+ unsigned alignment
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(const bool isStruct));
void RemorphReset ();
fgArgTabEntryPtr RemorphRegArg (unsigned argNum,
@@ -1391,7 +1465,9 @@ public:
DWORD expensiveDebugCheckLevel;
#endif
-
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ CORINFO_CLASS_HANDLE GetStructClassHandle(GenTreePtr tree);
+#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
#ifdef _TARGET_ARM_
@@ -1403,8 +1479,6 @@ public:
// floating-point registers.
//
- inline CORINFO_CLASS_HANDLE GetHfaClassHandle(GenTreePtr tree);
-
bool IsHfa(CORINFO_CLASS_HANDLE hClass);
bool IsHfa(GenTreePtr tree);
@@ -1417,6 +1491,14 @@ public:
#endif // _TARGET_ARM_
//-------------------------------------------------------------------------
+ // The following is used for struct passing on System V system.
+ //
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ bool IsRegisterPassable(CORINFO_CLASS_HANDLE hClass);
+ bool IsRegisterPassable(GenTreePtr tree);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+ //-------------------------------------------------------------------------
// The following is used for validating format of EH table
//
@@ -2450,7 +2532,7 @@ public :
unsigned char fldOrdinal;
var_types fldType;
unsigned fldSize;
- CORINFO_CLASS_HANDLE fldTypeHnd;
+ CORINFO_CLASS_HANDLE fldTypeHnd;
};
// Info about struct to be promoted.
@@ -3006,9 +3088,12 @@ private:
bool impReturnInstruction(BasicBlock *block, int prefixFlags, OPCODE &opcode);
void impAbortInline(bool abortThisInlineOnly, bool contextDependent, const char *reason);
-#ifdef _TARGET_ARM_
+#if defined(_TARGET_ARM_)
void impMarkLclDstNotPromotable(unsigned tmpNum, GenTreePtr op, CORINFO_CLASS_HANDLE hClass);
- GenTreePtr impAssignHfaToVar(GenTreePtr op, CORINFO_CLASS_HANDLE hClass);
+#endif
+
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ GenTreePtr impAssignStructToVar(GenTreePtr op, CORINFO_CLASS_HANDLE hClass);
#endif
// A free list of linked list nodes used to represent to-do stacks of basic blocks.
@@ -3026,9 +3111,11 @@ private:
bool impIsValueType (typeInfo* pTypeInfo);
var_types mangleVarArgsType (var_types type);
+
+#if FEATURE_VARARG
regNumber getCallArgIntRegister (regNumber floatReg);
regNumber getCallArgFloatRegister (regNumber intReg);
-
+#endif // FEATURE_VARARG
//--------------------------- Inlining-------------------------------------
#if defined(DEBUG) || MEASURE_INLINING
@@ -4080,10 +4167,9 @@ public:
bool fgCastNeeded(GenTreePtr tree, var_types toType);
GenTreePtr fgDoNormalizeOnStore(GenTreePtr tree);
- GenTreePtr fgMakeTmpArgNode(unsigned tmpVarNum);
-
- /* The following check for loops that don't execute calls */
+ GenTreePtr fgMakeTmpArgNode(unsigned tmpVarNum FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(const bool passedInRegisters));
+ // The following check for loops that don't execute calls
bool fgLoopCallMarked;
void fgLoopCallTest (BasicBlock *srcBB,
@@ -4450,7 +4536,14 @@ private:
GenTreePtr fgMorphCast (GenTreePtr tree);
GenTreePtr fgUnwrapProxy (GenTreePtr objRef);
GenTreeCall* fgMorphArgs (GenTreeCall* call);
- void fgMakeOutgoingStructArgCopy(GenTreeCall* call, GenTree* args, unsigned argIndex, CORINFO_CLASS_HANDLE copyBlkClass);
+
+ void fgMakeOutgoingStructArgCopy(
+ GenTreeCall* call,
+ GenTree* args,
+ unsigned argIndex,
+ CORINFO_CLASS_HANDLE copyBlkClass
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* structDescPtr));
+
void fgFixupStructReturn (GenTreePtr call);
GenTreePtr fgMorphLocalVar (GenTreePtr tree);
bool fgAddrCouldBeNull (GenTreePtr addr);
@@ -4570,11 +4663,11 @@ private:
void fgInsertInlineeBlocks (InlineInfo * pInlineInfo);
GenTreePtr fgInlinePrependStatements(InlineInfo * inlineInfo);
-#ifdef _TARGET_ARM_
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
GenTreePtr fgGetStructAsStructPtr(GenTreePtr tree);
- GenTreePtr fgAssignHfaInlineeToVar(GenTreePtr child, CORINFO_CLASS_HANDLE retClsHnd);
- void fgAttachHfaInlineeToAsg(GenTreePtr tree, GenTreePtr child, CORINFO_CLASS_HANDLE retClsHnd);
-#endif
+ GenTreePtr fgAssignStructInlineeToVar(GenTreePtr child, CORINFO_CLASS_HANDLE retClsHnd);
+ void fgAttachStructInlineeToAsg(GenTreePtr tree, GenTreePtr child, CORINFO_CLASS_HANDLE retClsHnd);
+#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
static fgWalkPreFn fgUpdateInlineReturnExpressionPlaceHolder;
#ifdef DEBUG
@@ -6275,6 +6368,17 @@ public :
void eeSetEHinfo(unsigned EHnumber,
const CORINFO_EH_CLAUSE* clause);
+ // ICorStaticInfo wrapper functions
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+#ifdef DEBUG
+ static void dumpSystemVClassificationType(SystemVClassificationType ct);
+#endif // DEBUG
+
+ void eeGetSystemVAmd64PassStructInRegisterDescriptor(/*IN*/ CORINFO_CLASS_HANDLE structHnd,
+ /*OUT*/ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* structPassInRegDescPtr);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
// Utility functions
#if defined(DEBUG)
@@ -8433,6 +8537,11 @@ public:
static HelperCallProperties s_helperCallProperties;
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ var_types GetTypeFromClassificationAndSizes(SystemVClassificationType classType, int size);
+ var_types getEightByteType(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR& structDesc, unsigned slotNum);
+ void fgMorphSystemVStructArgs(GenTreeCall* call, bool hasStructArgument);
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
}; // end of class Compiler
// Inline methods of CompAllocator.
@@ -8466,7 +8575,6 @@ LclVarDsc::LclVarDsc(Compiler* comp)
{
}
-
/*
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
diff --git a/src/jit/compiler.hpp b/src/jit/compiler.hpp
index 1cdc939d16..e4168b0f18 100644
--- a/src/jit/compiler.hpp
+++ b/src/jit/compiler.hpp
@@ -651,7 +651,10 @@ bool Compiler::VarTypeIsMultiByteAndCanEnreg(var_types type,
if (type == TYP_STRUCT)
{
size = info.compCompHnd->getClassSize(typeClass);
-
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // Account for the classification of the struct.
+ result = IsRegisterPassable(typeClass);
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
switch(size)
{
case 1:
@@ -664,6 +667,7 @@ bool Compiler::VarTypeIsMultiByteAndCanEnreg(var_types type,
default:
break;
}
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
}
else
{
@@ -2268,8 +2272,10 @@ int Compiler::lvaFrameAddress(int varNum, bool * pFPbased)
if (lvaDoneFrameLayout > REGALLOC_FRAME_LAYOUT && !varDsc->lvOnFrame)
{
#ifdef _TARGET_AMD64_
- // On amd64, every param has a stack location.
+ // On amd64, every param has a stack location, except on Unix-like systems.
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
assert(varDsc->lvIsParam);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
#elif defined(_TARGET_X86_) && !defined(LEGACY_BACKEND)
// For !LEGACY_BACKEND on x86, a stack parameter that is enregistered will have a stack location.
assert(varDsc->lvIsParam && !varDsc->lvIsRegArg);
@@ -2589,6 +2595,8 @@ var_types Compiler::mangleVarArgsType(var_types type)
return type;
}
+// For CORECLR there is no vararg on System V systems.
+#if FEATURE_VARARG
inline regNumber Compiler::getCallArgIntRegister(regNumber floatReg)
{
#ifdef _TARGET_AMD64_
@@ -2630,10 +2638,11 @@ inline regNumber Compiler::getCallArgFloatRegister(regNumber intReg)
}
#else // !_TARGET_AMD64_
// How will float args be passed for RyuJIT/x86?
- NYI("getCallArgIntRegister for RyuJIT/x86");
+ NYI("getCallArgFloatRegister for RyuJIT/x86");
return REG_NA;
#endif // !_TARGET_AMD64_
}
+#endif // FEATURE_VARARG
/*
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
diff --git a/src/jit/ee_il_dll.cpp b/src/jit/ee_il_dll.cpp
index 90e50ed84a..4c8e2ff30e 100644
--- a/src/jit/ee_il_dll.cpp
+++ b/src/jit/ee_il_dll.cpp
@@ -281,6 +281,16 @@ unsigned Compiler::eeGetArgSize(CORINFO_ARG_LIST_HANDLE list, CORINFO_
// Everything fits into a single 'slot' size
// to accommodate irregular sized structs, they are passed byref
// TODO-ARM64-Bug?: structs <= 16 bytes get passed in 2 consecutive registers.
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ CORINFO_CLASS_HANDLE argClass;
+ CorInfoType argTypeJit = strip(info.compCompHnd->getArgType(sig, list, &argClass));
+ var_types argType = JITtype2varType(argTypeJit);
+ if (argType == TYP_STRUCT)
+ {
+ unsigned structSize = info.compCompHnd->getClassSize(argClass);
+ return structSize;
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
return sizeof(size_t);
#else // !_TARGET_AMD64_ && !_TARGET_ARM64_
@@ -920,6 +930,60 @@ int Compiler::eeGetJitDataOffs(CORINFO_FIELD_HANDLE field)
}
}
+
+/*****************************************************************************
+ *
+ * ICorStaticInfo wrapper functions
+ */
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
+#ifdef DEBUG
+void Compiler::dumpSystemVClassificationType(SystemVClassificationType ct)
+{
+ switch (ct)
+ {
+ case SystemVClassificationTypeUnknown: printf("UNKNOWN"); break;
+ case SystemVClassificationTypeStruct: printf("Struct"); break;
+ case SystemVClassificationTypeNoClass: printf("NoClass"); break;
+ case SystemVClassificationTypeMemory: printf("Memory"); break;
+ case SystemVClassificationTypeInteger: printf("Integer"); break;
+ case SystemVClassificationTypeIntegerReference: printf("IntegerReference"); break;
+ case SystemVClassificationTypeSSE: printf("SSE"); break;
+ default: printf("ILLEGAL"); break;
+ }
+}
+#endif // DEBUG
+
+void Compiler::eeGetSystemVAmd64PassStructInRegisterDescriptor(/*IN*/ CORINFO_CLASS_HANDLE structHnd,
+ /*OUT*/ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* structPassInRegDescPtr)
+{
+ bool ok = info.compCompHnd->getSystemVAmd64PassStructInRegisterDescriptor(structHnd, structPassInRegDescPtr);
+ noway_assert(ok);
+
+#ifdef DEBUG
+ if (verbose)
+ {
+ printf("**** getSystemVAmd64PassStructInRegisterDescriptor(0x%x (%s), ...) =>\n", dspPtr(structHnd), eeGetClassName(structHnd));
+ printf(" passedInRegisters = %s\n", dspBool(structPassInRegDescPtr->passedInRegisters));
+ if (structPassInRegDescPtr->passedInRegisters)
+ {
+ printf(" eightByteCount = %d\n", structPassInRegDescPtr->eightByteCount);
+ for (unsigned int i = 0; i < structPassInRegDescPtr->eightByteCount; i++)
+ {
+ printf(" eightByte #%d -- classification: ", i);
+ dumpSystemVClassificationType(structPassInRegDescPtr->eightByteClassifications[i]);
+ printf(", byteSize: %d, byteOffset: %d\n",
+ structPassInRegDescPtr->eightByteSizes[i],
+ structPassInRegDescPtr->eightByteOffsets[i]);
+ }
+ }
+ }
+#endif // DEBUG
+}
+
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
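A typical call site looks like the hypothetical sketch below, matching how genJmpMethod and the morpher use the descriptor; everything other than the two compiler methods and descriptor fields is illustrative.

    // SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
    // eeGetSystemVAmd64PassStructInRegisterDescriptor(structHnd, &structDesc);
    // if (structDesc.passedInRegisters)
    // {
    //     for (unsigned i = 0; i < structDesc.eightByteCount; i++)
    //     {
    //         var_types slotType = getEightByteType(structDesc, i); // per-eightbyte JIT type
    //         // ... pick the integer or SSE argument register for this eightbyte ...
    //     }
    // }
    // else
    // {
    //     // The struct is passed by value on the stack.
    // }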
+
/*****************************************************************************
*
* Utility functions
diff --git a/src/jit/emit.cpp b/src/jit/emit.cpp
index 20f8af3fa2..fa9d3597de 100644
--- a/src/jit/emit.cpp
+++ b/src/jit/emit.cpp
@@ -5653,8 +5653,9 @@ void emitter::emitRecordGCcall(BYTE * codePos,
call->cdGCrefRegs = (regMaskSmall)emitThisGCrefRegs;
call->cdByrefRegs = (regMaskSmall)emitThisByrefRegs;
#if EMIT_TRACK_STACK_DEPTH
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
noway_assert(FitsIn<USHORT>(emitCurStackLvl / ((unsigned)sizeof(unsigned))));
- call->cdArgBaseOffset = (USHORT)(emitCurStackLvl / ((unsigned)sizeof(unsigned)));
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
#endif
// Append the call descriptor to the list */
diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp
index 6f1c6c8fce..d6de1f2dba 100644
--- a/src/jit/emitxarch.cpp
+++ b/src/jit/emitxarch.cpp
@@ -3671,7 +3671,8 @@ void emitter::emitIns_C(instruction ins,
}
else if (ins == INS_pop)
{
- emitCurStackLvl -= emitCntStackDepth; assert((int)emitCurStackLvl >= 0);
+ emitCurStackLvl -= emitCntStackDepth;
+ assert((int)emitCurStackLvl >= 0);
}
#endif // !FEATURE_FIXED_OUT_ARGS
@@ -11010,7 +11011,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE**
&& id->idReg1() == REG_ESP)
{
assert((size_t)emitGetInsSC(id) < 0x00000000FFFFFFFFLL);
- emitStackPop (dst, /*isCall*/false, /*callInstrSize*/0, (unsigned)(emitGetInsSC(id) / sizeof(void*)));
+ emitStackPop(dst, /*isCall*/false, /*callInstrSize*/0, (unsigned)(emitGetInsSC(id) / sizeof(void*)));
}
break;
diff --git a/src/jit/flowgraph.cpp b/src/jit/flowgraph.cpp
index 84233d82c6..c26f221c3f 100644
--- a/src/jit/flowgraph.cpp
+++ b/src/jit/flowgraph.cpp
@@ -8148,17 +8148,67 @@ void Compiler::fgAddInternal()
// If there is a return value, then create a temp for it. Real returns will store the value in there and
// it'll be reloaded by the single return.
-
+ // TODO-ARM-Bug: Deal with multi-register genReturnLocal structs?
+ // TODO-ARM64: Does this apply for ARM64 too?
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // Create a local temp to store the return if the return type is not void and the
+ // native return type is not a struct or the native return type is a struct that is returned
+ // in registers (no RetBuffArg argument.)
+ // If we fold all returns into a single return statement, create a temp for struct type variables as well.
+ if (genReturnBB && ((info.compRetType != TYP_VOID && info.compRetNativeType != TYP_STRUCT) ||
+ (info.compRetNativeType == TYP_STRUCT && info.compRetBuffArg == BAD_VAR_NUM)))
+#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
if (genReturnBB && (info.compRetType != TYP_VOID && info.compRetNativeType != TYP_STRUCT))
+#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
{
genReturnLocal = lvaGrabTemp(true DEBUGARG("Single return block return value"));
- lvaTable[genReturnLocal].lvType = genActualType(info.compRetNativeType);
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ var_types retLocalType = TYP_STRUCT;
+ if (info.compRetNativeType == TYP_STRUCT)
+ {
+ // If the native ret type is a struct, make sure the right
+ // normalized type is assigned to the local variable.
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ assert(info.compMethodInfo->args.retTypeClass != nullptr);
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(info.compMethodInfo->args.retTypeClass, &structDesc);
+ if (structDesc.passedInRegisters && structDesc.eightByteCount <= 1)
+ {
+ retLocalType = lvaTable[genReturnLocal].lvType = getEightByteType(structDesc, 0);
+ }
+ else
+ {
+ lvaTable[genReturnLocal].lvType = TYP_STRUCT;
+ }
+ }
+ else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ lvaTable[genReturnLocal].lvType = genActualType(info.compRetNativeType);
+ }
if (varTypeIsFloating(lvaTable[genReturnLocal].lvType))
{
this->compFloatingPointUsed = true;
}
-
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // Handle a struct return type for System V Amd64 systems.
+ if (info.compRetNativeType == TYP_STRUCT)
+ {
+ // Handle the normalized return type.
+ if (retLocalType == TYP_STRUCT)
+ {
+ lvaSetStruct(genReturnLocal, info.compMethodInfo->args.retTypeClass, true);
+ }
+ else
+ {
+ lvaTable[genReturnLocal].lvVerTypeInfo = typeInfo(TI_STRUCT, info.compMethodInfo->args.retTypeClass);
+ }
+
+ lvaTable[genReturnLocal].lvDontPromote = true;
+ }
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
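Illustrative examples of the normalization above, assuming standard System V classification:

    // struct { int x; }             -> 1 Integer eightbyte -> genReturnLocal becomes TYP_INT
    // struct { double d; }          -> 1 SSE eightbyte     -> genReturnLocal becomes TYP_DOUBLE
    // struct { double d; long l; }  -> 2 eightbytes        -> genReturnLocal stays TYP_STRUCT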
+
if (!varTypeIsFloating(info.compRetType))
lvaTable[genReturnLocal].setPrefReg(REG_INTRET, this);
#ifdef REG_FLOATRET
@@ -8172,7 +8222,6 @@ void Compiler::fgAddInternal()
lvaTable[genReturnLocal].lvKeepType = 1;
#endif
}
-
else
{
genReturnLocal = BAD_VAR_NUM;
@@ -8442,7 +8491,11 @@ void Compiler::fgAddInternal()
//make sure to reload the return value as part of the return (it is saved by the "real return").
if (genReturnLocal != BAD_VAR_NUM)
{
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ noway_assert(info.compRetType != TYP_VOID);
+#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
noway_assert(info.compRetType != TYP_VOID && info.compRetNativeType != TYP_STRUCT);
+#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
GenTreePtr retTemp = gtNewLclvNode(genReturnLocal, lvaTable[genReturnLocal].TypeGet());
//make sure copy prop ignores this node (make sure it always does a reload from the temp).
@@ -21424,7 +21477,7 @@ void Compiler::fgInline()
#endif // DEBUG
}
-#ifdef _TARGET_ARM_
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
/*********************************************************************************
*
@@ -21463,16 +21516,16 @@ GenTreePtr Compiler::fgGetStructAsStructPtr(GenTreePtr tree)
/***************************************************************************************************
* child - The inlinee of the retExpr node.
- * retClsHnd - The HFA class handle of the type of the inlinee.
+ * retClsHnd - The struct class handle of the type of the inlinee.
*
* Assign the inlinee to a tmp, if it is a call, just assign it to a lclVar, else we can
* use a copyblock to do the assignment.
*/
-GenTreePtr Compiler::fgAssignHfaInlineeToVar(GenTreePtr child, CORINFO_CLASS_HANDLE retClsHnd)
+GenTreePtr Compiler::fgAssignStructInlineeToVar(GenTreePtr child, CORINFO_CLASS_HANDLE retClsHnd)
{
assert(child->gtOper != GT_RET_EXPR && child->gtOper != GT_MKREFANY);
- unsigned tmpNum = lvaGrabTemp(false DEBUGARG("RetBuf for HFA inline return candidates."));
+ unsigned tmpNum = lvaGrabTemp(false DEBUGARG("RetBuf for struct inline return candidates."));
lvaSetStruct(tmpNum, retClsHnd, false);
GenTreePtr dst = gtNewLclvNode(tmpNum, TYP_STRUCT);
@@ -21518,7 +21571,7 @@ GenTreePtr Compiler::fgAssignHfaInlineeToVar(GenTreePtr child, CORINFO_CLASS_HAN
/***************************************************************************************************
* tree - The tree pointer that has one of its child nodes as retExpr.
* child - The inlinee child.
- * retClsHnd - The HFA class handle of the type of the inlinee.
+ * retClsHnd - The struct class handle of the type of the inlinee.
*
* V04 = call() assignments are okay as we codegen it. Everything else needs to be a copy block or
* would need a temp. For example, a cast(ldobj) will then be, cast(v05 = ldobj, v05); But it is
@@ -21526,7 +21579,7 @@ GenTreePtr Compiler::fgAssignHfaInlineeToVar(GenTreePtr child, CORINFO_CLASS_HAN
* a lclVar/call. So it is not worthwhile to do pattern matching optimizations like addr(ldobj(op1))
* can just be op1.
*/
-void Compiler::fgAttachHfaInlineeToAsg(GenTreePtr tree, GenTreePtr child, CORINFO_CLASS_HANDLE retClsHnd)
+void Compiler::fgAttachStructInlineeToAsg(GenTreePtr tree, GenTreePtr child, CORINFO_CLASS_HANDLE retClsHnd)
{
// We are okay to have:
// 1. V02 = call();
@@ -21541,13 +21594,13 @@ void Compiler::fgAttachHfaInlineeToAsg(GenTreePtr tree, GenTreePtr child, CORINF
GenTreePtr dstAddr = fgGetStructAsStructPtr(tree->gtOp.gtOp1);
GenTreePtr srcAddr = fgGetStructAsStructPtr((child->gtOper == GT_CALL)
- ? fgAssignHfaInlineeToVar(child, retClsHnd) // Assign to a variable if it is a call.
+ ? fgAssignStructInlineeToVar(child, retClsHnd) // Assign to a variable if it is a call.
: child); // Just get the address, if not a call.
tree->CopyFrom(gtNewCpObjNode(dstAddr, srcAddr, retClsHnd, false), this);
}
-#endif // _TARGET_ARM_
+#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
/*****************************************************************************
* Callback to replace the inline return expression place holder (GT_RET_EXPR)
@@ -21562,12 +21615,12 @@ Compiler::fgWalkResult Compiler::fgUpdateInlineReturnExpressionPlaceHolder(
if (tree->gtOper == GT_RET_EXPR)
{
-#ifdef _TARGET_ARM_
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
// We are going to copy the tree from the inlinee, so save the handle now.
CORINFO_CLASS_HANDLE retClsHnd = (tree->TypeGet() == TYP_STRUCT)
? tree->gtRetExpr.gtRetClsHnd
: NO_CLASS_HANDLE;
-#endif // _TARGET_ARM_
+#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
do
{
@@ -21605,32 +21658,36 @@ Compiler::fgWalkResult Compiler::fgUpdateInlineReturnExpressionPlaceHolder(
}
while (tree->gtOper == GT_RET_EXPR);
-#ifdef _TARGET_ARM_
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+#if defined(_TARGET_ARM_)
if (retClsHnd != NO_CLASS_HANDLE && comp->IsHfa(retClsHnd))
+#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (retClsHnd != NO_CLASS_HANDLE && comp->IsRegisterPassable(retClsHnd))
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
{
GenTreePtr parent = data->parent;
// See assert below, we only look one level above for an asg parent.
if (parent->gtOper == GT_ASG)
{
// Either lhs is a call V05 = call(); or lhs is addr, and asg becomes a copyBlk.
- comp->fgAttachHfaInlineeToAsg(parent, tree, retClsHnd);
+ comp->fgAttachStructInlineeToAsg(parent, tree, retClsHnd);
}
else
{
// Just assign the inlinee to a variable to keep it simple.
- tree->CopyFrom(comp->fgAssignHfaInlineeToVar(tree, retClsHnd), comp);
+ tree->CopyFrom(comp->fgAssignStructInlineeToVar(tree, retClsHnd), comp);
}
}
-#endif // _TARGET_ARM_
+#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
}
-#if defined(DEBUG) && defined(_TARGET_ARM_)
+#if defined(DEBUG) && (defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING))
// Make sure we don't have a tree like so: V05 = (, , , retExpr);
// Since we only look one level above for the parent for '=' and
// do not check if there is a series of COMMAs. See above.
// Importer and FlowGraph will not generate such a tree, so just
// leaving an assert in here. This can be fixed by looking ahead
- // when we visit GT_ASG similar to fgAttachHfaInlineeToAsg.
+ // when we visit GT_ASG similar to fgAttachStructInlineeToAsg.
else if (tree->gtOper == GT_ASG &&
tree->gtOp.gtOp2->gtOper == GT_COMMA)
{
@@ -21642,11 +21699,17 @@ Compiler::fgWalkResult Compiler::fgUpdateInlineReturnExpressionPlaceHolder(
// empty
}
+#if defined(_TARGET_ARM_)
+ noway_assert(comma->gtType != TYP_STRUCT ||
+ comma->gtOper != GT_RET_EXPR ||
+ (!comp->IsHfa(comma->gtRetExpr.gtRetClsHnd)));
+#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
noway_assert(comma->gtType != TYP_STRUCT ||
comma->gtOper != GT_RET_EXPR ||
- !comp->IsHfa(comma->gtRetExpr.gtRetClsHnd));
+ (!comp->IsRegisterPassable(comma->gtRetExpr.gtRetClsHnd)));
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
}
-#endif // defined(DEBUG) && defined(_TARGET_ARM_)
+#endif // defined(DEBUG) && (defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING))
return WALK_CONTINUE;
}
diff --git a/src/jit/gentree.cpp b/src/jit/gentree.cpp
index 284000e55b..3c06925fe4 100644
--- a/src/jit/gentree.cpp
+++ b/src/jit/gentree.cpp
@@ -224,7 +224,15 @@ void GenTree::InitNodeSize()
GenTree::s_gtNodeSizes[op] = TREE_NODE_SZ_SMALL;
}
- /* Now set all of the appropriate entries to 'large' */
+ // Now set all of the appropriate entries to 'large'
+
+ // On ARM, and for System V struct returns, there is code that rewrites a GT_ASG tree into a CopyObj call.
+ // CopyObj is a large node and GT_ASG is small, which would trigger an exception.
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ GenTree::s_gtNodeSizes[GT_ASG ] = TREE_NODE_SZ_LARGE;
+ GenTree::s_gtNodeSizes[GT_RETURN ] = TREE_NODE_SZ_LARGE;
+#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
GenTree::s_gtNodeSizes[GT_CALL ] = TREE_NODE_SZ_LARGE;
GenTree::s_gtNodeSizes[GT_CAST ] = TREE_NODE_SZ_LARGE;
@@ -256,6 +264,15 @@ void GenTree::InitNodeSize()
GenTree::s_gtNodeSizes[GT_MOD ] = TREE_NODE_SZ_LARGE;
GenTree::s_gtNodeSizes[GT_UMOD ] = TREE_NODE_SZ_LARGE;
#endif
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ GenTree::s_gtNodeSizes[GT_PUTARG_STK ] = TREE_NODE_SZ_LARGE;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+    // In the importer, for HFA and register-returned structs we rewrite GT_ASG to GT_COPYOBJ/GT_COPYBLK.
+    // Make sure the sizes agree.
+    assert(GenTree::s_gtNodeSizes[GT_COPYOBJ] <= GenTree::s_gtNodeSizes[GT_ASG]);
+    assert(GenTree::s_gtNodeSizes[GT_COPYBLK] <= GenTree::s_gtNodeSizes[GT_ASG]);
+#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
assert(GenTree::s_gtNodeSizes[GT_RETURN] == GenTree::s_gtNodeSizes[GT_ASG]);
@@ -312,7 +329,12 @@ void GenTree::InitNodeSize()
static_assert_no_msg(sizeof(GenTreeArgPlace) <= TREE_NODE_SZ_SMALL);
static_assert_no_msg(sizeof(GenTreeLabel) <= TREE_NODE_SZ_SMALL);
static_assert_no_msg(sizeof(GenTreePhiArg) <= TREE_NODE_SZ_SMALL);
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
static_assert_no_msg(sizeof(GenTreePutArgStk) <= TREE_NODE_SZ_SMALL);
+#else // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ static_assert_no_msg(sizeof(GenTreePutArgStk) <= TREE_NODE_SZ_LARGE);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
#ifdef FEATURE_SIMD
static_assert_no_msg(sizeof(GenTreeSIMD) <= TREE_NODE_SZ_SMALL);
#endif // FEATURE_SIMD
@@ -4366,13 +4388,21 @@ void GenTree::InsertAfterSelf(GenTree* node, GenTreeStmt* stmt /* = n
// 'parent' must be non-null
//
// Notes:
-// Must not be called for GT_LDOBJ (which isn't used for RyuJIT, which is the only context
-// in which this method is used)
+// For non-System V systems with native struct passing (i.e. FEATURE_UNIX_AMD64_STRUCT_PASSING not defined)
+// this method must not be called for GT_LDOBJ (which isn't used for RyuJIT, which is the only context
+// in which this method is used).
+// If FEATURE_UNIX_AMD64_STRUCT_PASSING is defined we can get here with a GT_LDOBJ tree. This happens when
+// a struct is passed in two registers. The GT_LDOBJ is converted to a GT_LIST with two GT_LCL_FLDs later
+// in Lower/LowerXArch.
+//
GenTreePtr* GenTree::gtGetChildPointer(GenTreePtr parent)
{
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
noway_assert(parent->OperGet() != GT_LDOBJ);
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+
switch (parent->OperGet())
{
default:
@@ -4380,6 +4410,14 @@ GenTreePtr* GenTree::gtGetChildPointer(GenTreePtr parent)
if (this == parent->gtOp.gtOp1) return &(parent->gtOp.gtOp1);
if (this == parent->gtOp.gtOp2) return &(parent->gtOp.gtOp2);
break;
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ case GT_LDOBJ:
+ // Any GT_LDOBJ with a field must be lowered before this point.
+ noway_assert(parent->AsLdObj()->gtFldTreeList == nullptr);
+ break;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
case GT_CMPXCHG:
if (this == parent->gtCmpXchg.gtOpLocation) return &(parent->gtCmpXchg.gtOpLocation);
if (this == parent->gtCmpXchg.gtOpValue) return &(parent->gtCmpXchg.gtOpValue);
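As a side note on the behavior described in the notes above: when FEATURE_UNIX_AMD64_STRUCT_PASSING is defined, a struct passed in two registers shows up after lowering as a GT_LIST of two GT_LCL_FLDs, one per eightbyte. A minimal illustrative sketch of that shape, assuming the existing gtNewLclFldNode, gtNewArgList and gtNewListNode helpers and hypothetical lclNum/eightbyte-type locals (not part of this patch):

    // Illustration only: build the (lo, hi) argument list lowering produces for a
    // two-eightbyte struct local; offsets 0 and 8 select the first and second eightbyte.
    GenTreePtr      lo   = gtNewLclFldNode(lclNum, firstEightByteType,  0); // bytes [0..7]
    GenTreePtr      hi   = gtNewLclFldNode(lclNum, secondEightByteType, 8); // bytes [8..15]
    GenTreeArgList* list = gtNewListNode(lo, gtNewArgList(hi));             // GT_LIST of GT_LCL_FLDs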
@@ -5027,7 +5065,7 @@ GenTreePtr Compiler::gtNewInlineCandidateReturnExpr(GenTreePtr inline
GenTreePtr node = new(this, GT_RET_EXPR) GenTreeRetExpr(type);
node->gtRetExpr.gtInlineCandidate = inlineCandidate;
-#ifdef _TARGET_ARM_
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
if (inlineCandidate->gtType == TYP_STRUCT)
{
if (inlineCandidate->gtOper == GT_CALL)
@@ -5067,7 +5105,13 @@ GenTreeArgList* Compiler::gtNewListNode(GenTreePtr op1, GenTreeArgList* op2)
GenTreeArgList* Compiler::gtNewArgList(GenTreePtr op)
{
- assert((op != NULL) && (op->OperGet() != GT_LIST) && (op->OperGet() != GT_LIST));
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // With structs passed in multiple args we could have the arg
+ // GT_LIST containing a list of LCL_FLDs
+ assert((op != NULL) && ((!op->IsList()) || (op->IsListOfLclFlds())));
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+ assert((op != NULL) && (op->OperGet() != GT_LIST));
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
return new (this, GT_LIST) GenTreeArgList(op);
}
@@ -5079,8 +5123,15 @@ GenTreeArgList* Compiler::gtNewArgList(GenTreePtr op)
GenTreeArgList* Compiler::gtNewArgList(GenTreePtr op1, GenTreePtr op2)
{
- assert((op1 != NULL) && (op1->OperGet() != GT_LIST) && (op1->OperGet() != GT_LIST));
- assert((op2 != NULL) && (op2->OperGet() != GT_LIST) && (op2->OperGet() != GT_LIST));
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // With structs passed in multiple args we could have the arg
+ // GT_LIST containing a list of LCL_FLDs
+ assert((op1 != NULL) && ((!op1->IsList()) || (op1->IsListOfLclFlds())));
+ assert((op2 != NULL) && ((!op2->IsList()) || (op2->IsListOfLclFlds())));
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+ assert((op1 != NULL) && (!op1->IsList()));
+ assert((op2 != NULL) && (!op2->IsList()));
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
GenTreePtr tree;
@@ -5207,9 +5258,11 @@ GenTreePtr Compiler::gtNewAssignNode(GenTreePtr dst, GenTreePtr src DEB
// using struct assignment.
#ifdef _TARGET_ARM_
assert(isPhiDefn || type != TYP_STRUCT || IsHfa(dst) || IsHfa(src));
-#else
+#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
// You need to use GT_COPYBLK for assigning structs
// See impAssignStruct()
+ assert(isPhiDefn || type != TYP_STRUCT || IsRegisterPassable(dst) || IsRegisterPassable(src));
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
assert(isPhiDefn || type != TYP_STRUCT);
#endif
@@ -5553,7 +5606,6 @@ GenTreePtr Compiler::gtClone(GenTree * tree, bool complexOK)
tree->gtField.gtFldHnd,
objp,
tree->gtField.gtFldOffset);
-
}
else if (tree->gtOper == GT_ADD)
{
@@ -8629,6 +8681,51 @@ GenTreePtr Compiler::gtDispLinearTree(GenTreeStmt* curStmt,
// get child msg
if (tree->IsCall())
{
+ // If this is a call and the arg (listElem) is a GT_LIST (Unix LCL_FLD for passing a var in multiple registers)
+ // print the nodes of the nested list and continue to the next argument.
+ if (listElem->gtOper == GT_LIST)
+ {
+ GenTreePtr nextListNested = nullptr;
+ for (GenTreePtr listNested = listElem; listNested != nullptr; listNested = nextListNested)
+ {
+ GenTreePtr listElemNested;
+ if (listNested->gtOper == GT_LIST)
+ {
+ nextListNested = listNested->MoveNext();
+ listElemNested = listNested->Current();
+ }
+ else
+ {
+ // GT_LIST nodes (under initBlk, others?) can have a non-null op2 that's not a GT_LIST
+ nextListNested = nullptr;
+ listElemNested = listNested;
+ }
+
+ indentStack->Push(indentInfo);
+ if (child == tree->gtCall.gtCallArgs)
+ {
+ gtGetArgMsg(tree, listNested, listElemNum, bufp, BufLength);
+ }
+ else
+ {
+ assert(child == tree->gtCall.gtCallLateArgs);
+ gtGetLateArgMsg(tree, listNested, listElemNum, bufp, BufLength);
+ }
+ nextLinearNode = gtDispLinearTree(curStmt, nextLinearNode, listElemNested, indentStack, bufp);
+ indentStack->Pop();
+ }
+
+ // Skip the GT_LIST nodes, as we do not print them, and the next node to print will occur
+ // after the list.
+ while (nextLinearNode->OperGet() == GT_LIST)
+ {
+ nextLinearNode = nextLinearNode->gtNext;
+ }
+
+ listElemNum++;
+ continue;
+ }
+
if (child == tree->gtCall.gtCallArgs)
{
gtGetArgMsg(tree, listElem, listElemNum, bufp, BufLength);
@@ -8643,6 +8740,7 @@ GenTreePtr Compiler::gtDispLinearTree(GenTreeStmt* curStmt,
{
sprintf_s(bufp, sizeof(buf), "List Item %d", listElemNum);
}
+
indentStack->Push(indentInfo);
nextLinearNode = gtDispLinearTree(curStmt, nextLinearNode, listElem, indentStack, bufp);
indentStack->Pop();
@@ -10179,6 +10277,7 @@ LNG_ADD_CHKOVF:
}
}
}
+
lval1 = ltemp; break;
case GT_OR : lval1 |= lval2; break;
diff --git a/src/jit/gentree.h b/src/jit/gentree.h
index f6c850ea5a..1402445da0 100644
--- a/src/jit/gentree.h
+++ b/src/jit/gentree.h
@@ -1027,6 +1027,11 @@ public:
return OperIsCopyBlkOp(OperGet());
}
+ bool OperIsPutArgStk() const
+ {
+ return gtOper == GT_PUTARG_STK;
+ }
+
bool OperIsAddrMode() const
{
return OperIsAddrMode(OperGet());
@@ -1125,7 +1130,7 @@ public:
static
int OperIsSimple(genTreeOps gtOper)
{
- return (OperKind(gtOper) & GTK_SMPOP ) != 0;
+ return (OperKind(gtOper) & GTK_SMPOP ) != 0;
}
static
@@ -1294,7 +1299,7 @@ public:
static
inline bool RequiresNonNullOp2(genTreeOps oper);
-
+ bool IsListOfLclFlds();
#endif // DEBUG
inline bool IsZero();
@@ -2277,7 +2282,7 @@ struct GenTreeColon: public GenTreeOp
/* gtCall -- method call (GT_CALL) */
typedef class fgArgInfo * fgArgInfoPtr;
-struct GenTreeCall: public GenTree
+struct GenTreeCall final : public GenTree
{
GenTreePtr gtCallObjp; // The instance argument ('this' pointer)
GenTreeArgList* gtCallArgs; // The list of arguments in original evaluation order
@@ -2296,6 +2301,14 @@ struct GenTreeCall: public GenTree
CORINFO_SIG_INFO* callSig; // Used by tail calls and to register callsites with the EE
regMaskTP gtCallRegUsedMask; // mask of registers used to pass parameters
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+
+ void SetRegisterReturningStructState(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR& structDescIn)
+ {
+ structDesc.CopyFrom(structDescIn);
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
#define GTF_CALL_M_EXPLICIT_TAILCALL 0x0001 // GT_CALL -- the call is "tail" prefixed and importer has performed tail call checks
#define GTF_CALL_M_TAILCALL 0x0002 // GT_CALL -- the call is a tailcall
@@ -2438,9 +2451,12 @@ struct GenTreeCall: public GenTree
GenTreeCall(var_types type) :
GenTree(GT_CALL, type)
- {}
+ {
+ }
#if DEBUGGABLE_GENTREE
- GenTreeCall() : GenTree() {}
+ GenTreeCall() : GenTree()
+ {
+ }
#endif
};
@@ -3024,7 +3040,7 @@ struct GenTreeRetExpr: public GenTree
{
GenTreePtr gtInlineCandidate;
-#ifdef _TARGET_ARM_
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
CORINFO_CLASS_HANDLE gtRetClsHnd;
#endif
@@ -3243,10 +3259,26 @@ struct GenTreePutArgStk: public GenTreeUnOp
// Fast tail calls set this to true.
// In future if we need to add more such bool fields consider bit fields.
- GenTreePutArgStk(genTreeOps oper, var_types type, unsigned slotNum, bool _putInIncomingArgArea = false
- DEBUG_ARG(GenTreePtr callNode = NULL) DEBUG_ARG(bool largeNode = false)) :
- GenTreeUnOp(oper, type DEBUG_ARG(largeNode)),
- gtSlotNum(slotNum), putInIncomingArgArea(_putInIncomingArgArea)
+ GenTreePutArgStk(
+ genTreeOps oper,
+ var_types type,
+ unsigned slotNum
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(unsigned numSlots)
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(bool isStruct),
+ bool _putInIncomingArgArea = false
+ DEBUG_ARG(GenTreePtr callNode = NULL)
+ DEBUG_ARG(bool largeNode = false))
+ :
+ GenTreeUnOp(oper, type DEBUG_ARG(largeNode)),
+ gtSlotNum(slotNum),
+ putInIncomingArgArea(_putInIncomingArgArea)
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ , gtPutArgStkKind(PutArgStkKindInvalid),
+ gtNumSlots(numSlots),
+ gtIsStruct(isStruct),
+ gtNumberReferenceSlots(0),
+ gtGcPtrs(nullptr)
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
{
#ifdef DEBUG
gtCall = callNode;
@@ -3254,22 +3286,53 @@ struct GenTreePutArgStk: public GenTreeUnOp
}
- GenTreePutArgStk(genTreeOps oper, var_types type, GenTreePtr op1, unsigned slotNum, bool _putInIncomingArgArea = false
- DEBUG_ARG(GenTreePtr callNode = NULL) DEBUG_ARG(bool largeNode = false)) :
- GenTreeUnOp(oper, type, op1 DEBUG_ARG(largeNode)),
- gtSlotNum(slotNum), putInIncomingArgArea(_putInIncomingArgArea)
+ GenTreePutArgStk(
+ genTreeOps oper,
+ var_types type,
+ GenTreePtr op1,
+ unsigned slotNum
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(unsigned numSlots)
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(bool isStruct),
+ bool _putInIncomingArgArea = false
+ DEBUG_ARG(GenTreePtr callNode = NULL)
+ DEBUG_ARG(bool largeNode = false))
+ :
+ GenTreeUnOp(oper, type, op1 DEBUG_ARG(largeNode)),
+ gtSlotNum(slotNum),
+ putInIncomingArgArea(_putInIncomingArgArea)
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ , gtPutArgStkKind(PutArgStkKindInvalid),
+ gtNumSlots(numSlots),
+ gtIsStruct(isStruct),
+ gtNumberReferenceSlots(0),
+ gtGcPtrs(nullptr)
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
{
#ifdef DEBUG
gtCall = callNode;
#endif
}
-#else // !FEATURE_FASTTAIL_CALL
-
- GenTreePutArgStk(genTreeOps oper, var_types type, unsigned slotNum
- DEBUG_ARG(GenTreePtr callNode = NULL) DEBUG_ARG(bool largeNode = false)) :
- GenTreeUnOp(oper, type DEBUG_ARG(largeNode)),
- gtSlotNum(slotNum)
+#else // !FEATURE_FASTTAILCALL
+
+ GenTreePutArgStk(
+ genTreeOps oper,
+ var_types type,
+ unsigned slotNum
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(unsigned numSlots)
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(bool isStruct)
+ DEBUG_ARG(GenTreePtr callNode = NULL)
+ DEBUG_ARG(bool largeNode = false))
+ :
+ GenTreeUnOp(oper, type DEBUG_ARG(largeNode)),
+ gtSlotNum(slotNum)
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ , gtPutArgStkKind(PutArgStkKindInvalid),
+ gtNumSlots(numSlots),
+ gtIsStruct(isStruct),
+ gtNumberReferenceSlots(0),
+ gtGcPtrs(nullptr)
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
{
#ifdef DEBUG
gtCall = callNode;
@@ -3277,10 +3340,25 @@ struct GenTreePutArgStk: public GenTreeUnOp
}
- GenTreePutArgStk(genTreeOps oper, var_types type, GenTreePtr op1, unsigned slotNum
- DEBUG_ARG(GenTreePtr callNode = NULL) DEBUG_ARG(bool largeNode = false)) :
- GenTreeUnOp(oper, type, op1 DEBUG_ARG(largeNode)),
- gtSlotNum(slotNum)
+ GenTreePutArgStk(
+ genTreeOps oper,
+ var_types type,
+ GenTreePtr op1,
+ unsigned slotNum
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(unsigned numSlots)
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(bool isStruct)
+ DEBUG_ARG(GenTreePtr callNode = NULL)
+ DEBUG_ARG(bool largeNode = false))
+ :
+ GenTreeUnOp(oper, type, op1 DEBUG_ARG(largeNode)),
+ gtSlotNum(slotNum)
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ , gtPutArgStkKind(PutArgStkKindInvalid),
+ gtNumSlots(numSlots),
+ gtIsStruct(isStruct),
+ gtNumberReferenceSlots(0),
+ gtGcPtrs(nullptr)
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
{
#ifdef DEBUG
gtCall = callNode;
@@ -3288,10 +3366,53 @@ struct GenTreePutArgStk: public GenTreeUnOp
}
#endif // FEATURE_FASTTAILCALL
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ //------------------------------------------------------------------------
+ // setGcPointers: Sets the number of references and the layout of the struct object returned by the VM.
+ //
+ // Arguments:
+ // numPointers - Number of pointer references.
+ // pointers - layout of the struct (with pointers marked.)
+ //
+ // Return Value:
+ // None
+ //
+ // Notes:
+ // This data is used in the codegen for GT_PUTARG_STK to decide how to copy the struct to the stack by value.
+    //    If the struct contains no pointer references, block copy instructions are used.
+    //    Otherwise the pointer reference slots are copied atomically (pointer-sized moves) so that GC info is emitted for them,
+    //    and any non-pointer slots between the pointer reference slots are copied in block fashion.
+ //
+ void setGcPointers(unsigned numPointers, BYTE* pointers)
+ {
+ gtNumberReferenceSlots = numPointers;
+ gtGcPtrs = pointers;
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
#ifdef DEBUG
GenTreePtr gtCall; // the call node to which this argument belongs
#endif
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // Instruction selection: during codegen time, what code sequence we will be using
+ // to encode this operation.
+
+ enum PutArgStkKind : __int8
+ {
+ PutArgStkKindInvalid,
+ PutArgStkKindRepInstr,
+ PutArgStkKindUnroll,
+ };
+
+ PutArgStkKind gtPutArgStkKind;
+
+ unsigned gtNumSlots; // Number of slots for the argument to be passed on stack
+ bool gtIsStruct; // This stack arg is a struct.
+ unsigned gtNumberReferenceSlots; // Number of reference slots.
+ BYTE* gtGcPtrs; // gcPointers
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
#if DEBUGGABLE_GENTREE
GenTreePutArgStk() : GenTreeUnOp() {}
#endif
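The setGcPointers notes above describe the copy strategy for a stack-passed struct: block copies for runs without GC references, and individual pointer-sized moves for reference slots. A standalone, simplified sketch of that slot-walking logic (assumed types and names, independent of the JIT):

    #include <cstddef>
    #include <cstring>

    // Copy a struct to the outgoing stack area slot by slot. A non-zero byte in
    // gcLayout marks a slot holding a GC reference; such slots are moved one
    // pointer at a time, while runs of non-GC slots are block-copied.
    void CopyStructToStack(void** dst, void* const* src, size_t numSlots, const unsigned char* gcLayout)
    {
        size_t i = 0;
        while (i < numSlots)
        {
            if ((gcLayout != nullptr) && (gcLayout[i] != 0))
            {
                dst[i] = src[i]; // GC slot: single pointer-sized move (reported in GC info)
                i++;
            }
            else
            {
                size_t run = i;
                while ((run < numSlots) && ((gcLayout == nullptr) || (gcLayout[run] == 0)))
                {
                    run++;
                }
                std::memcpy(&dst[i], &src[i], (run - i) * sizeof(void*)); // block copy the non-GC run
                i = run;
            }
        }
    }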
@@ -3325,6 +3446,30 @@ inline GenTreePtr GenTree::MoveNext()
return gtOp.gtOp2;
}
+#ifdef DEBUG
+inline bool GenTree::IsListOfLclFlds()
+{
+ if (!IsList())
+ {
+ return false;
+ }
+
+ GenTree* gtListPtr = this;
+ while (gtListPtr->Current() != nullptr)
+ {
+ if (gtListPtr->Current()->OperGet() != GT_LCL_FLD)
+ {
+ return false;
+ }
+
+ gtListPtr = gtListPtr->MoveNext();
+ }
+
+ return true;
+}
+#endif // DEBUG
+
inline GenTreePtr GenTree::Current()
{
assert(IsList());
diff --git a/src/jit/importer.cpp b/src/jit/importer.cpp
index d56ca3ddda..0ee654c837 100644
--- a/src/jit/importer.cpp
+++ b/src/jit/importer.cpp
@@ -1152,13 +1152,22 @@ GenTreePtr Compiler::impAssignStructPtr(GenTreePtr dest,
BasicBlock * block /* = NULL */
)
{
- assert(src->TypeGet() == TYP_STRUCT);
-
+ assert(src->TypeGet() == TYP_STRUCT || (src->gtOper == GT_ADDR && src->TypeGet() == TYP_BYREF));
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // TODO-ARM-BUG: Does ARM need this?
+ // TODO-ARM64-BUG: Does ARM64 need this?
+ assert(src->gtOper == GT_LCL_VAR || src->gtOper == GT_FIELD ||
+ src->gtOper == GT_IND || src->gtOper == GT_LDOBJ ||
+ src->gtOper == GT_CALL || src->gtOper == GT_MKREFANY ||
+ src->gtOper == GT_RET_EXPR || src->gtOper == GT_COMMA ||
+ src->gtOper == GT_ADDR || GenTree::OperIsSIMD(src->gtOper));
+#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
assert(src->gtOper == GT_LCL_VAR || src->gtOper == GT_FIELD ||
src->gtOper == GT_IND || src->gtOper == GT_LDOBJ ||
src->gtOper == GT_CALL || src->gtOper == GT_MKREFANY ||
src->gtOper == GT_RET_EXPR || src->gtOper == GT_COMMA ||
GenTree::OperIsSIMD(src->gtOper));
+#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
if (src->gtOper == GT_CALL)
{
@@ -1187,8 +1196,14 @@ GenTreePtr Compiler::impAssignStructPtr(GenTreePtr dest,
fgLclFldAssign(lcl->gtLclVarCommon.gtLclNum);
lcl->gtType = src->gtType;
dest = lcl;
-#ifdef _TARGET_ARM_
+#if defined(_TARGET_ARM_)
impMarkLclDstNotPromotable(lcl->gtLclVarCommon.gtLclNum, src, structHnd);
+#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+            // Not allowed for FEATURE_CORECLR, which is the only SKU available for System V OSs.
+ assert(!src->gtCall.IsVarargs() && "varargs not allowed for System V OSs.");
+
+ // Make the struct non promotable. The eightbytes could contain multiple fields.
+ lvaTable[lcl->gtLclVarCommon.gtLclNum].lvDontPromote = true;
#endif
}
else
@@ -1207,6 +1222,7 @@ GenTreePtr Compiler::impAssignStructPtr(GenTreePtr dest,
{
GenTreePtr call = src->gtRetExpr.gtInlineCandidate;
noway_assert(call->gtOper == GT_CALL);
+
if (call->gtCall.gtCallMoreFlags & GTF_CALL_M_RETBUFFARG)
{
// insert the return value buffer into the argument list as first byref parameter
@@ -1274,7 +1290,8 @@ GenTreePtr Compiler::impAssignStructPtr(GenTreePtr dest,
}
else if (src->gtOper == GT_COMMA)
{
- assert(src->gtOp.gtOp2->gtType == TYP_STRUCT); // Second thing is the struct
+        // The second thing is the struct or its address.
+ assert(src->gtOp.gtOp2->gtType == TYP_STRUCT || src->gtOp.gtOp2->gtType == TYP_BYREF);
if (pAfterStmt)
{
* pAfterStmt = fgInsertStmtAfter(block, * pAfterStmt, gtNewStmt(src->gtOp.gtOp1, impCurStmtOffs));
@@ -1287,6 +1304,10 @@ GenTreePtr Compiler::impAssignStructPtr(GenTreePtr dest,
// evaluate the second thing using recursion
return impAssignStructPtr(dest, src->gtOp.gtOp2, structHnd, curLevel, pAfterStmt, block);
}
+ else if (src->gtOper == GT_ADDR)
+ {
+        // If src is already an address, use it directly to copy the struct.
+ }
else
{
src = gtNewOperNode(GT_ADDR, TYP_BYREF, src);
@@ -4528,8 +4549,7 @@ GenTreePtr Compiler::impTransformThis (GenTreePtr thisPtr,
GenTreePtr obj = thisPtr;
assert(obj->TypeGet() == TYP_BYREF || obj->TypeGet() == TYP_I_IMPL);
- obj = new (this, GT_LDOBJ) GenTreeLdObj(TYP_STRUCT, obj, pConstrainedResolvedToken->hClass
- );
+ obj = new (this, GT_LDOBJ) GenTreeLdObj(TYP_STRUCT, obj, pConstrainedResolvedToken->hClass);
obj->gtFlags |= GTF_EXCEPT;
CorInfoType jitTyp = info.compCompHnd->asCorInfoType(pConstrainedResolvedToken->hClass);
@@ -5948,7 +5968,14 @@ var_types Compiler::impImportCall (OPCODE opcode,
}
}
- /* Check for varargs */
+ // Check for varargs
+#if !FEATURE_VARARG
+ if ((sig->callConv & CORINFO_CALLCONV_MASK) == CORINFO_CALLCONV_VARARG ||
+ (sig->callConv & CORINFO_CALLCONV_MASK) == CORINFO_CALLCONV_NATIVEVARARG)
+ {
+ BADCODE("Varargs not supported.");
+ }
+#endif // !FEATURE_VARARG
if ((sig->callConv & CORINFO_CALLCONV_MASK) == CORINFO_CALLCONV_VARARG ||
(sig->callConv & CORINFO_CALLCONV_MASK) == CORINFO_CALLCONV_NATIVEVARARG)
@@ -6699,12 +6726,23 @@ bool Compiler::impMethodInfo_hasRetBuffArg(CORINFO_METHOD_INFO *
return false;
}
-#if defined(_TARGET_X86_) || defined(_TARGET_AMD64_)
+#if defined(_TARGET_AMD64_) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ assert(!info.compIsVarArgs && "Varargs not supported in CoreCLR on Unix.");
+ if (IsRegisterPassable(methInfo->args.retTypeClass))
+ {
+ return false;
+ }
+
+ // The struct is not aligned properly or it is bigger than 16 bytes,
+    // or it has a custom layout, or it is not passed in registers for some other reason.
+ return true;
+#elif defined(_TARGET_X86_) || defined(_TARGET_AMD64_)
+ // Check for TYP_STRUCT argument that can fit into a single register.
// We don't need a return buffer if:
// i) TYP_STRUCT argument that can fit into a single register and
// ii) Power of two sized TYP_STRUCT.
unsigned size = info.compCompHnd->getClassSize(methInfo->args.retTypeClass);
- return (size > TARGET_POINTER_SIZE) || ((size & (size-1)) != 0);
+ return (size > TARGET_POINTER_SIZE) || ((size & (size - 1)) != 0);
#elif defined(_TARGET_ARM_)
// Check for non HFA: in ARM HFAs are returned in registers.
if (!info.compIsVarArgs && IsHfa(methInfo->args.retTypeClass))
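The System V branch above decides whether a struct return needs a hidden buffer: it does unless the classifier says the value can come back in registers. A hedged standalone sketch of that decision (the 16-byte limit and the MEMORY classification are the relevant System V rules; names are illustrative, not the VM's):

    #include <cstddef>

    // A struct return needs a hidden return buffer when it cannot be returned in
    // registers: it was classified as MEMORY or it is larger than two eightbytes.
    bool NeedsReturnBuffer(size_t structSize, bool classifiedAsMemory)
    {
        const size_t maxRegisterReturnSize = 16; // two eightbytes (RAX/RDX and/or XMM0/XMM1)
        return classifiedAsMemory || (structSize > maxRegisterReturnSize);
    }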
@@ -6717,8 +6755,6 @@ bool Compiler::impMethodInfo_hasRetBuffArg(CORINFO_METHOD_INFO *
// TODO-ARM64-NYI: HFA/HVA arguments.
// Check for TYP_STRUCT argument that is greater than 16 bytes.
return info.compCompHnd->getClassSize(methInfo->args.retTypeClass) > 16;
-#elif defined(_TARGET_X86_)
- return true;
#else // _TARGET_*
#error Unsupported or unset target architecture
#endif // _TARGET_*
@@ -6792,7 +6828,6 @@ GenTreePtr Compiler::impFixupStructReturn(GenTreePtr call,
CORINFO_CLASS_HANDLE retClsHnd)
{
assert(call->gtOper == GT_CALL);
-
if (call->TypeGet() != TYP_STRUCT)
{
return call;
@@ -6826,13 +6861,46 @@ GenTreePtr Compiler::impFixupStructReturn(GenTreePtr call,
return call;
}
- return impAssignHfaToVar(call, retClsHnd);
+ return impAssignStructToVar(call, retClsHnd);
}
-#endif
+#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+    // Not allowed for FEATURE_CORECLR, which is the only SKU available for System V OSs.
+ assert(!call->gtCall.IsVarargs() && "varargs not allowed for System V OSs.");
+
+    // The return stays a TYP_STRUCT unless it is normalized to a single eightbyte return type below.
+ call->gtCall.gtReturnType = TYP_STRUCT;
+ // Get the classification for the struct.
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(retClsHnd, &structDesc);
+ if (structDesc.passedInRegisters)
+ {
+ call->gtCall.SetRegisterReturningStructState(structDesc);
+
+ if (structDesc.eightByteCount <= 1)
+ {
+ call->gtCall.gtReturnType = getEightByteType(structDesc, 0);
+ }
+ else
+ {
+ if (!call->gtCall.CanTailCall() && ((call->gtFlags & GTF_CALL_INLINE_CANDIDATE) == 0))
+ {
+                // If we can tail call a method returning a struct in registers, or inline a method
+                // that returns such a struct, don't shuffle the value through a temp variable;
+                // the spill to a temp below is only done when neither applies.
+ return impAssignStructToVar(call, retClsHnd);
+ }
+ }
+ }
+ else
+ {
+ call->gtCall.gtCallMoreFlags |= GTF_CALL_M_RETBUFFARG;
+ }
+
+ return call;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
unsigned size = info.compCompHnd->getClassSize(retClsHnd);
BYTE gcPtr = 0;
-
// Check for TYP_STRUCT argument that can fit into a single register
// change the type on those trees.
// TODO-ARM64-NYI: what about structs 9 to 16 bytes that fit in two consecutive registers?
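The impFixupStructReturn changes above normalize a one-eightbyte struct return to the scalar type of that eightbyte, keep a two-eightbyte return as TYP_STRUCT, and fall back to a return-buffer argument otherwise. A simplified standalone sketch of that three-way decision (types and names are stand-ins, not the JIT's):

    enum class EightByteClass { Integer, SSE };

    struct StructReturnDesc
    {
        bool           passedInRegisters;
        int            eightByteCount;   // 1 or 2 when passed in registers
        EightByteClass classes[2];
    };

    const char* DescribeStructReturn(const StructReturnDesc& d)
    {
        if (!d.passedInRegisters)
        {
            return "hidden return buffer";                     // the GTF_CALL_M_RETBUFFARG path
        }
        if (d.eightByteCount <= 1)
        {
            return (d.classes[0] == EightByteClass::SSE)
                       ? "normalized scalar return in XMM0"
                       : "normalized scalar return in RAX";
        }
        return "two-register return, kept as TYP_STRUCT";      // RAX:RDX, XMM0:XMM1, or mixed
    }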
@@ -6913,7 +6981,37 @@ GenTreePtr Compiler::impFixupStructReturnType(GenTreePtr op, CORINFO_CL
assert(info.compRetBuffArg == BAD_VAR_NUM);
#if defined(_TARGET_X86_) || defined(_TARGET_AMD64_)
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
assert(info.compRetNativeType != TYP_STRUCT);
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+ assert(!info.compIsVarArgs); // No VarArgs for CoreCLR.
+ if (info.compRetNativeType == TYP_STRUCT)
+ {
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(retClsHnd, &structDesc);
+
+ if (structDesc.passedInRegisters)
+ {
+ if (op->gtOper == GT_LCL_VAR)
+ {
+                // This LCL_VAR is a register return value; it stays a TYP_STRUCT.
+ unsigned lclNum = op->gtLclVarCommon.gtLclNum;
+ // Make sure this struct type stays as struct so that we can return it in registers.
+ lvaTable[lclNum].lvDontPromote = true;
+
+ return op;
+ }
+
+ if (op->gtOper == GT_CALL)
+ {
+ return op;
+ }
+
+ return impAssignStructToVar(op, retClsHnd);
+ }
+ }
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+
#elif defined(_TARGET_ARM_)
if (!info.compIsVarArgs && IsHfa(retClsHnd))
{
@@ -6941,7 +7039,7 @@ GenTreePtr Compiler::impFixupStructReturnType(GenTreePtr op, CORINFO_CL
return op;
}
}
- return impAssignHfaToVar(op, retClsHnd);
+ return impAssignStructToVar(op, retClsHnd);
}
#endif
@@ -7003,7 +7101,22 @@ REDO_RETURN_NODE:
}
else
{
- assert(info.compRetNativeType == op->gtCall.gtReturnType);
+#ifdef DEBUG
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (op->gtType == TYP_STRUCT)
+ {
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(retClsHnd, &structDesc);
+ assert(structDesc.eightByteCount < CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+ assert(getEightByteType(structDesc, 0) == op->gtCall.gtReturnType);
+ }
+ else
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+ {
+ assert(info.compRetNativeType == op->gtCall.gtReturnType);
+ }
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+#endif // DEBUG
// Don't change the gtType node just yet, it will get changed later
return op;
}
@@ -7012,8 +7125,19 @@ REDO_RETURN_NODE:
{
op->gtOp.gtOp2 = impFixupStructReturnType(op->gtOp.gtOp2, retClsHnd);
}
-
- op->gtType = info.compRetNativeType;
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (op->gtType == TYP_STRUCT)
+ {
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(retClsHnd, &structDesc);
+ assert(structDesc.eightByteCount < CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+ op->gtType = getEightByteType(structDesc, 0);
+ }
+ else
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ {
+ op->gtType = info.compRetNativeType;
+ }
return op;
}
@@ -11412,7 +11536,6 @@ DO_LDFTN:
}
eeGetFieldInfo(&resolvedToken, (CORINFO_ACCESS_FLAGS)aflags, &fieldInfo);
-
// Figure out the type of the member. We always call canAccessField, so you always need this
// handle
CorInfoType ciType = fieldInfo.fieldType;
@@ -11590,7 +11713,6 @@ DO_LDFTN:
/* Create the data member node */
op1 = gtNewFieldRef(lclTyp, resolvedToken.hField, NULL, fieldInfo.offset);
-
op1->gtFlags |= GTF_IND_TLS_REF; // fgMorphField will handle the transformation
if (isLoadAddress)
@@ -11850,7 +11972,6 @@ FIELD_DONE:
/* Create the data member node */
op1 = gtNewFieldRef(lclTyp, resolvedToken.hField, NULL, fieldInfo.offset);
-
op1->gtFlags |= GTF_IND_TLS_REF; // fgMorphField will handle the transformation
break;
@@ -12396,7 +12517,11 @@ FIELD_DONE:
| | | push the BYREF to this local |
|---------------------------------------------------------------------
| UNBOX_ANY | push a GT_LDOBJ of | push the STRUCT |
- | | the BYREF | |
+ | | the BYREF | For Linux when the |
+ | | | struct is returned in two |
+ | | | registers create a temp |
+ | | | which address is passed to |
+ | | | the unbox_nullable helper. |
|---------------------------------------------------------------------
*/
@@ -12434,11 +12559,40 @@ FIELD_DONE:
impPushOnStack(op1, tiRetVal);
oper = GT_LDOBJ;
goto LDOBJ;
- }
-
+ }
+
+ assert(helper == CORINFO_HELP_UNBOX_NULLABLE && "Make sure the helper is nullable!");
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (op1->gtType == TYP_STRUCT)
+ {
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(resolvedToken.hClass, &structDesc);
+ if (structDesc.passedInRegisters && structDesc.eightByteCount == CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS)
+ {
+ // Unbox nullable helper returns a TYP_STRUCT.
+                        // We need to spill it to a temp so that we can take the address of it.
+ // We need the temp so we can pass its address to the unbox_nullable jit helper function.
+ // This is needed for 2 register returned nullables.
+                        // Single-eightbyte returns are normalized; for structs bigger than 16 bytes a return buffer is already passed in RDI.
+
+ unsigned tmp = lvaGrabTemp(true DEBUGARG("UNBOXing a register returnable nullable"));
+ lvaTable[tmp].lvDontPromote = true;
+ lvaSetStruct(tmp, resolvedToken.hClass, true /* unsafe value cls check */);
+
+ op2 = gtNewLclvNode(tmp, TYP_STRUCT);
+ op1 = impAssignStruct(op2, op1, resolvedToken.hClass, (unsigned)CHECK_SPILL_ALL);
+ assert(op1->gtType == TYP_VOID); // We must be assigning the return struct to the temp.
+
+ op2 = gtNewLclvNode(tmp, TYP_STRUCT);
+ op2 = gtNewOperNode(GT_ADDR, TYP_BYREF, op2);
+ op1 = gtNewOperNode(GT_COMMA, TYP_STRUCT, op1, op2);
+ }
+ }
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
assert(op1->gtType == TYP_STRUCT);
tiRetVal = verMakeTypeInfo(resolvedToken.hClass);
- assert(tiRetVal.IsValueClass());
+ assert(tiRetVal.IsValueClass());
}
impPushOnStack(op1, tiRetVal);
@@ -12946,8 +13100,7 @@ LDOBJ:
// LDOBJ returns a struct
// and an inline argument which is the class token of the loaded obj
- op1 = new (this, GT_LDOBJ) GenTreeLdObj(TYP_STRUCT, op1, resolvedToken.hClass
- );
+ op1 = new (this, GT_LDOBJ) GenTreeLdObj(TYP_STRUCT, op1, resolvedToken.hClass);
op1->gtFlags |= GTF_EXCEPT;
CorInfoType jitTyp = info.compCompHnd->asCorInfoType(resolvedToken.hClass);
@@ -13231,7 +13384,7 @@ void Compiler::impLoadLoc(unsigned ilLclNum, IL_OFFSET offset)
}
}
-#ifdef _TARGET_ARM_
+#if defined(_TARGET_ARM_)
/**************************************************************************************
*
* When assigning a vararg call src to a HFA lcl dest, mark that we cannot promote the
@@ -13269,12 +13422,32 @@ void Compiler::impMarkLclDstNotPromotable(unsigned tmpNum, GenTreePtr src, CORIN
}
}
}
+#endif
-GenTreePtr Compiler::impAssignHfaToVar(GenTreePtr op, CORINFO_CLASS_HANDLE hClass)
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+GenTreePtr Compiler::impAssignStructToVar(GenTreePtr op, CORINFO_CLASS_HANDLE hClass)
{
- unsigned tmpNum = lvaGrabTemp(true DEBUGARG("Return value temp for HFA structs in ARM."));
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ unsigned tmpNum = lvaGrabTemp(true DEBUGARG("Return value temp for register returned structs in System V"));
+#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ unsigned tmpNum = lvaGrabTemp(true DEBUGARG("Return value temp for HFA structs in ARM"));
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
impAssignTempGen(tmpNum, op, hClass, (unsigned) CHECK_SPILL_NONE);
- return gtNewLclvNode(tmpNum, TYP_STRUCT);
+ GenTreePtr ret = gtNewLclvNode(tmpNum, TYP_STRUCT);
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+#ifdef DEBUG
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(hClass, &structDesc);
+ // If single eightbyte, the return type would have been normalized and there won't be a temp var.
+ // This code will be called only if the struct return has not been normalized (i.e. 2 eightbytes - max allowed.)
+ assert(structDesc.passedInRegisters && structDesc.eightByteCount == CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+#endif // DEBUG
+    // Mark the var that stores the eightbytes on the stack as not promotable.
+    // The return value is built from eightbytes, so all the fields need to be
+    // on the stack before each eightbyte is loaded into its return register.
+ lvaTable[tmpNum].lvDontPromote = true;
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ return ret;
}
#endif
@@ -13297,7 +13470,7 @@ bool Compiler::impReturnInstruction(BasicBlock *block, int prefixFlags, OPCODE &
Verify(!verIsByRefLike(tiDeclared) ||
verIsSafeToReturnByRef(tiVal)
, "byref return");
-
+
Verify(tiCompatibleWith(tiVal, tiDeclared.NormaliseForStack(), true), "type mismatch");
expectedStack=1;
}
@@ -13502,15 +13675,35 @@ bool Compiler::impReturnInstruction(BasicBlock *block, int prefixFlags, OPCODE &
se.seTypeInfo.GetClassHandle(),
(unsigned) CHECK_SPILL_ALL);
}
-#ifdef _TARGET_ARM_
+ // TODO-ARM64-NYI: HFA
+    // TODO-AMD64-Unix and TODO-ARM: once the ARM64 functionality is implemented, the
+    // ifdefs below could be refactored into a single method with the ifdef inside.
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+#if defined(_TARGET_ARM_)
if (IsHfa(retClsHnd))
{
// Same as !IsHfa but just don't bother with impAssignStructPtr.
+#else // !defined(_TARGET_ARM_)
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(retClsHnd, &structDesc);
+ if (structDesc.passedInRegisters)
+ {
+ // If single eightbyte, the return type would have been normalized and there won't be a temp var.
+ // This code will be called only if the struct return has not been normalized (i.e. 2 eightbytes - max allowed.)
+ assert(structDesc.eightByteCount == CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+ // Same as !structDesc.passedInRegisters but just don't bother with impAssignStructPtr.
+#endif // !defined(_TARGET_ARM_)
+
if (lvaInlineeReturnSpillTemp != BAD_VAR_NUM)
{
if (!impInlineInfo->retExpr)
{
+#if defined(_TARGET_ARM_)
impInlineInfo->retExpr = gtNewLclvNode(lvaInlineeReturnSpillTemp, TYP_STRUCT);
+#else // !defined(_TARGET_ARM_)
+ // The inlinee compiler has figured out the type of the temp already. Use it here.
+ impInlineInfo->retExpr = gtNewLclvNode(lvaInlineeReturnSpillTemp, lvaTable[lvaInlineeReturnSpillTemp].lvType);
+#endif // !defined(_TARGET_ARM_)
}
}
else
@@ -13519,7 +13712,7 @@ bool Compiler::impReturnInstruction(BasicBlock *block, int prefixFlags, OPCODE &
}
}
else
-#endif
+#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
{
assert(iciCall->gtCall.gtCallMoreFlags & GTF_CALL_M_RETBUFFARG);
GenTreePtr dest = gtCloneExpr(iciCall->gtCall.gtCallArgs->gtOp.gtOp1);
@@ -13575,8 +13768,9 @@ bool Compiler::impReturnInstruction(BasicBlock *block, int prefixFlags, OPCODE &
}
else if (info.compRetType == TYP_STRUCT)
{
-#ifndef _TARGET_ARM_
+#if !defined(_TARGET_ARM_) && !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
// In ARM HFA native types are maintained as structs.
+        // Multi-register System V AMD64 struct returns are also left as structs and not normalized.
// TODO-ARM64-NYI: HFA
noway_assert(info.compRetNativeType != TYP_STRUCT);
#endif
diff --git a/src/jit/jit.h b/src/jit/jit.h
index 9702da3ec9..2901ffd6eb 100644
--- a/src/jit/jit.h
+++ b/src/jit/jit.h
@@ -220,6 +220,22 @@
#define INDEBUG_LDISASM_COMMA(x)
#endif
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+#define FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(x) , x
+#define FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY(x) x
+#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+#define FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(x)
+#define FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY(x)
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
+#if defined(UNIX_AMD64_ABI)
+#define UNIX_AMD64_ABI_ONLY_ARG(x) , x
+#define UNIX_AMD64_ABI_ONLY(x) x
+#else // !defined(UNIX_AMD64_ABI)
+#define UNIX_AMD64_ABI_ONLY_ARG(x)
+#define UNIX_AMD64_ABI_ONLY(x)
+#endif // defined(UNIX_AMD64_ABI)
+
// To get rid of warning 4701 : local variable may be used without being initialized
#define DUMMY_INIT(x) (x)
@@ -605,7 +621,11 @@ unsigned int unsigned_abs(int x)
inline
size_t unsigned_abs(ssize_t x)
{
+#ifndef FEATURE_PAL
return ((size_t) abs(x));
+#else // !FEATURE_PAL
+ return ((size_t) labs(x));
+#endif // !FEATURE_PAL
}
#endif // _TARGET_64BIT_
diff --git a/src/jit/jitgcinfo.h b/src/jit/jitgcinfo.h
index 5c8d10f1b7..4063bafe15 100644
--- a/src/jit/jitgcinfo.h
+++ b/src/jit/jitgcinfo.h
@@ -253,7 +253,6 @@ public :
#endif
unsigned short cdArgCnt;
- unsigned short cdArgBaseOffset;
union
{
diff --git a/src/jit/lclvars.cpp b/src/jit/lclvars.cpp
index c12f735f68..b9e89f156d 100644
--- a/src/jit/lclvars.cpp
+++ b/src/jit/lclvars.cpp
@@ -103,8 +103,8 @@ void Compiler::lvaInitTypeRef()
/* Set compArgsCount and compLocalsCount */
info.compArgsCount = info.compMethodInfo->args.numArgs;
-
- /* Is there a 'this' pointer */
+
+ // Is there a 'this' pointer
if (!info.compIsStatic)
{
@@ -133,6 +133,18 @@ void Compiler::lvaInitTypeRef()
else
#endif
{
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(info.compMethodInfo->args.retTypeClass, &structDesc);
+ if (structDesc.eightByteCount > 1)
+ {
+ info.compRetNativeType = TYP_STRUCT;
+ }
+ else
+ {
+ info.compRetNativeType = getEightByteType(structDesc, 0);
+ }
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
unsigned size = info.compCompHnd->getClassSize(info.compMethodInfo->args.retTypeClass);
// Check for TYP_STRUCT argument that can fit into a single register
@@ -173,6 +185,7 @@ void Compiler::lvaInitTypeRef()
assert(!"Unexpected size when returning struct by value");
break;
}
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
}
}
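lvaInitTypeRef above calls getEightByteType to normalize a single-eightbyte struct return to a scalar type. A hedged sketch of that mapping, using simplified stand-in types (the real helper reads the classification and size out of the descriptor):

    enum class VarType { Byte, Short, Int, Long, Float, Double };

    // Map one eightbyte to a scalar type: SSE eightbytes become float/double depending
    // on size, integer eightbytes become the smallest integral type that covers them.
    VarType EightByteToVarType(bool isSse, unsigned eightByteSizeInBytes)
    {
        if (isSse)
        {
            return (eightByteSizeInBytes <= 4) ? VarType::Float : VarType::Double;
        }
        switch (eightByteSizeInBytes)
        {
        case 1:  return VarType::Byte;
        case 2:  return VarType::Short;
        case 4:  return VarType::Int;
        default: return VarType::Long;
        }
    }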
@@ -191,7 +204,9 @@ void Compiler::lvaInitTypeRef()
calling convention is varargs */
if (info.compIsVarArgs)
+ {
info.compArgsCount++;
+ }
// Is there an extra parameter used to pass instantiation info to
// shared generic methods and shared generic struct instance methods?
@@ -356,18 +371,17 @@ void Compiler::lvaInitArgs(InitVarDscInfo * varDscInfo)
//----------------------------------------------------------------------
- /* We have set info.compArgsCount in compCompile() */
-
+ // We have set info.compArgsCount in compCompile()
noway_assert(varDscInfo->varNum == info.compArgsCount);
assert (varDscInfo->intRegArgNum <= MAX_REG_ARG);
-
+
codeGen->intRegState.rsCalleeRegArgNum = varDscInfo->intRegArgNum;
#if !FEATURE_STACK_FP_X87
codeGen->floatRegState.rsCalleeRegArgNum = varDscInfo->floatRegArgNum;
#endif // FEATURE_STACK_FP_X87
- /* The total argument size must be aligned. */
+ // The total argument size must be aligned.
noway_assert((compArgSize % sizeof(void*)) == 0);
#ifdef _TARGET_X86_
@@ -440,6 +454,7 @@ void Compiler::lvaInitThisPtr(InitVarDscInfo * varDscInfo)
}
#endif
compArgSize += TARGET_POINTER_SIZE;
+
varDscInfo->varNum++;
varDscInfo->varDsc++;
}
@@ -449,7 +464,17 @@ void Compiler::lvaInitThisPtr(InitVarDscInfo * varDscInfo)
void Compiler::lvaInitRetBuffArg(InitVarDscInfo * varDscInfo)
{
LclVarDsc * varDsc = varDscInfo->varDsc;
- const bool hasRetBuffArg = impMethodInfo_hasRetBuffArg(info.compMethodInfo);
+ bool hasRetBuffArg = impMethodInfo_hasRetBuffArg(info.compMethodInfo);
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (info.compRetNativeType == TYP_STRUCT)
+ {
+ if (IsRegisterPassable(info.compMethodInfo->args.retTypeClass))
+ {
+ hasRetBuffArg = false;
+ }
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
if (hasRetBuffArg)
{
@@ -594,7 +619,6 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo * varDscInfo)
// the type as a float or double.
argType = hfaType;
}
-
if (isRegParamType(argType))
{
compArgSize += varDscInfo->alignReg(argType, cAlign) * REGSIZE_BYTES;
@@ -644,19 +668,94 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo * varDscInfo)
}
#else // !_TARGET_ARM_
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ if (argType == TYP_STRUCT)
+ {
+ assert(typeHnd != nullptr);
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc);
+ if (structDesc.passedInRegisters)
+ {
+ unsigned intRegCount = 0;
+ unsigned floatRegCount = 0;
- varDsc->lvOnFrame = true; // The final home for this incoming register might be our local stack frame
+ for (unsigned int i = 0; i < structDesc.eightByteCount; i++)
+ {
+ switch (structDesc.eightByteClassifications[i])
+ {
+ case SystemVClassificationTypeInteger:
+ case SystemVClassificationTypeIntegerReference:
+ intRegCount++;
+ break;
+ case SystemVClassificationTypeSSE:
+ floatRegCount++;
+ break;
+ default:
+ assert(false && "Invalid eightbyte classification type.");
+ break;
+ }
+ }
+
+ if (intRegCount != 0 && !varDscInfo->canEnreg(TYP_INT, intRegCount))
+ {
+ structDesc.passedInRegisters = false; // No register to enregister the eightbytes.
+ }
+
+ if (floatRegCount != 0 && !varDscInfo->canEnreg(TYP_FLOAT, floatRegCount))
+ {
+ structDesc.passedInRegisters = false; // No register to enregister the eightbytes.
+ }
+ }
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
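The block above counts how many integer and how many SSE registers the struct's eightbytes would consume, and clears passedInRegisters if either register file cannot supply them. A standalone sketch of that check, with simplified stand-in types (not the VM's descriptor):

    #include <cassert>

    enum SysVClassification { SysVInteger, SysVIntegerReference, SysVSSE };

    struct EightByteDesc
    {
        unsigned           eightByteCount;             // at most 2 for register passing
        SysVClassification eightByteClassifications[2];
    };

    // Returns true when enough free integer and SSE argument registers remain to pass
    // every eightbyte of the struct; otherwise the whole struct goes to the stack.
    bool FitsInRemainingRegisters(const EightByteDesc& desc, unsigned freeIntRegs, unsigned freeFloatRegs)
    {
        unsigned intRegCount = 0, floatRegCount = 0;
        for (unsigned i = 0; i < desc.eightByteCount; i++)
        {
            switch (desc.eightByteClassifications[i])
            {
            case SysVInteger:
            case SysVIntegerReference: intRegCount++;   break;
            case SysVSSE:              floatRegCount++; break;
            default: assert(false && "Invalid eightbyte classification."); break;
            }
        }
        return (intRegCount <= freeIntRegs) && (floatRegCount <= freeFloatRegs);
    }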
+
+ // The final home for this incoming register might be our local stack frame
+ // For System V platforms the final home will always be on the local stack frame.
+ varDsc->lvOnFrame = true;
#endif // !_TARGET_ARM_
- if (varDscInfo->canEnreg(argType, cSlotsToEnregister))
+ bool canPassArgInRegisters = false;
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (argType == TYP_STRUCT)
+ {
+ canPassArgInRegisters = structDesc.passedInRegisters;
+ }
+ else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ canPassArgInRegisters = varDscInfo->canEnreg(argType, cSlotsToEnregister);
+ }
+
+ if (canPassArgInRegisters)
{
/* Another register argument */
// Allocate the registers we need. allocRegArg() returns the first argument register number of the set.
// For non-HFA structs, we still "try" to enregister the whole thing; it will just max out if splitting
// to the stack happens.
- unsigned firstAllocatedRegArgNum = varDscInfo->allocRegArg(argType, cSlots);
+ unsigned firstAllocatedRegArgNum = 0;
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ unsigned secondAllocatedRegArgNum = 0;
+ var_types firstEightByteType = TYP_UNDEF;
+ var_types secondEightByteType = TYP_UNDEF;
+ varDsc->lvOtherArgReg = REG_NA;
+
+ if (argType == TYP_STRUCT)
+ {
+ if (structDesc.eightByteCount >= 1)
+ {
+ firstEightByteType = getEightByteType(structDesc, 0);
+ firstAllocatedRegArgNum = varDscInfo->allocRegArg(firstEightByteType, 1);
+ }
+ }
+ else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ firstAllocatedRegArgNum = varDscInfo->allocRegArg(argType, cSlots);
+ }
#ifdef _TARGET_ARM_
if (isHfaArg)
@@ -668,7 +767,31 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo * varDscInfo)
#endif // _TARGET_ARM_
varDsc->lvIsRegArg = 1;
- varDsc->lvArgReg = genMapRegArgNumToRegNum(firstAllocatedRegArgNum, argType);
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (argType == TYP_STRUCT)
+ {
+ varDsc->lvArgReg = genMapRegArgNumToRegNum(firstAllocatedRegArgNum, firstEightByteType);
+
+ // If there is a second eightbyte, get a register for it too and map the arg to the reg number.
+ if (structDesc.eightByteCount >= 2)
+ {
+ secondEightByteType = getEightByteType(structDesc, 1);
+ secondAllocatedRegArgNum = varDscInfo->allocRegArg(secondEightByteType, 1);
+ }
+
+ if (secondEightByteType != TYP_UNDEF)
+ {
+ varDsc->lvOtherArgReg = genMapRegArgNumToRegNum(secondAllocatedRegArgNum, secondEightByteType);
+ varDsc->addPrefReg(genRegMask(varDsc->lvOtherArgReg), this);
+ }
+ }
+ else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ varDsc->lvArgReg = genMapRegArgNumToRegNum(firstAllocatedRegArgNum, argType);
+ }
+
varDsc->setPrefReg(varDsc->lvArgReg, this);
#ifdef _TARGET_ARM_
@@ -682,52 +805,91 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo * varDscInfo)
#ifdef DEBUG
if (verbose)
{
- printf("Arg #%u passed in register ", varDscInfo->varNum);
-
- bool isFloat = varTypeIsFloating(argType);
- unsigned regArgNum = genMapRegNumToRegArgNum(varDsc->lvArgReg, argType);
+ printf("Arg #%u passed in register(s) ", varDscInfo->varNum);
+ bool isFloat = false;
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // In case of one eightbyte struct the type is already normalized earlier.
+ // The varTypeIsFloating(argType) is good for this case.
+ if ((argType == TYP_STRUCT) && (structDesc.eightByteCount >= 1))
+ {
+ isFloat = varTypeIsFloating(firstEightByteType);
+ }
+ else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+            {
+                isFloat = varTypeIsFloating(argType);
+            }
- for (unsigned ix = 0; ix < cSlots; ix++, regArgNum++)
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (argType == TYP_STRUCT)
{
- if (ix > 0)
- printf(",");
+ // Print both registers, just to be clear
+ if (firstEightByteType == TYP_UNDEF)
+ {
+ printf("firstEightByte: <not used>");
+ }
+ else
+ {
+ printf("firstEightByte: %s", getRegName(genMapRegArgNumToRegNum(firstAllocatedRegArgNum, firstEightByteType), isFloat));
+ }
- if (!isFloat && (regArgNum >= varDscInfo->maxIntRegArgNum)) // a struct has been split between registers and stack
+ if (secondEightByteType == TYP_UNDEF)
{
- printf(" stack slots:%d", cSlots - ix);
- break;
+ printf(", secondEightByte: <not used>");
}
+ else
+ {
+ printf(", secondEightByte: %s", getRegName(genMapRegArgNumToRegNum(secondAllocatedRegArgNum, secondEightByteType), varTypeIsFloating(secondEightByteType)));
+ }
+ }
+ else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ unsigned regArgNum = genMapRegNumToRegArgNum(varDsc->lvArgReg, argType);
-#ifdef _TARGET_ARM_
- if (isFloat)
+ for (unsigned ix = 0; ix < cSlots; ix++, regArgNum++)
{
- // Print register size prefix
- if (argType == TYP_DOUBLE)
+ if (ix > 0)
+ printf(",");
+
+ if (!isFloat && (regArgNum >= varDscInfo->maxIntRegArgNum)) // a struct has been split between registers and stack
+ {
+ printf(" stack slots:%d", cSlots - ix);
+ break;
+ }
+
+#ifdef _TARGET_ARM_
+ if (isFloat)
{
- // Print both registers, just to be clear
- printf("%s/%s", getRegName(genMapRegArgNumToRegNum(regArgNum, argType), isFloat),
- getRegName(genMapRegArgNumToRegNum(regArgNum + 1, argType), isFloat));
-
- // doubles take 2 slots
- assert(ix + 1 < cSlots);
- ++ix;
- ++regArgNum;
+ // Print register size prefix
+ if (argType == TYP_DOUBLE)
+ {
+ // Print both registers, just to be clear
+ printf("%s/%s", getRegName(genMapRegArgNumToRegNum(regArgNum, argType), isFloat),
+ getRegName(genMapRegArgNumToRegNum(regArgNum + 1, argType), isFloat));
+
+ // doubles take 2 slots
+ assert(ix + 1 < cSlots);
+ ++ix;
+ ++regArgNum;
+ }
+ else
+ {
+ printf("%s", getRegName(genMapRegArgNumToRegNum(regArgNum, argType), isFloat));
+ }
}
else
+#endif // _TARGET_ARM_
{
printf("%s", getRegName(genMapRegArgNumToRegNum(regArgNum, argType), isFloat));
}
}
- else
-#endif // _TARGET_ARM_
- {
- printf("%s", getRegName(genMapRegArgNumToRegNum(regArgNum, argType), isFloat));
- }
}
printf("\n");
}
#endif // DEBUG
- } // if canEnreg()
+ } // end if (canPassArgInRegisters)
else
{
#ifdef _TARGET_ARM_
@@ -739,8 +901,13 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo * varDscInfo)
#endif
}
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+    // The arg size is the number of bytes of the argument. For a struct it could be a size that is not a multiple of
+    // TARGET_POINTER_SIZE. The stack-allocated space must always be a multiple of TARGET_POINTER_SIZE, so round it up.
+ compArgSize += (unsigned)roundUp(argSize, TARGET_POINTER_SIZE);
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
compArgSize += argSize;
-
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
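The rounding above ensures a struct argument always consumes whole pointer-sized stack slots. A minimal standalone sketch of that rule, assuming 8-byte slots as on AMD64:

    #include <cstddef>

    constexpr size_t kTargetPointerSize = 8; // stack slot size on AMD64

    // Round an argument size up to a whole number of stack slots.
    constexpr size_t RoundUpToSlot(size_t byteSize)
    {
        return (byteSize + kTargetPointerSize - 1) & ~(kTargetPointerSize - 1);
    }

    static_assert(RoundUpToSlot(12) == 16, "a 12-byte struct occupies two 8-byte slots");
    static_assert(RoundUpToSlot(8)  == 8,  "an 8-byte struct occupies one slot");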
if (info.compIsVarArgs)
{
#if defined(_TARGET_X86_)
@@ -807,6 +974,7 @@ void Compiler::lvaInitGenericsCtxt(InitVarDscInfo * varDscInfo)
varDsc->lvArgReg = genMapRegArgNumToRegNum(varDscInfo->regArgNum(TYP_INT), varDsc->TypeGet());
varDsc->setPrefReg(varDsc->lvArgReg, this);
varDsc->lvOnFrame = true; // The final home for this incoming register might be our local stack frame
+
varDscInfo->intRegArgNum++;
#ifdef DEBUG
@@ -1180,11 +1348,6 @@ void Compiler::lvaCanPromoteStructType(CORINFO_CLASS_HANDLE typeHnd,
lvaStructPromotionInfo * StructPromotionInfo,
bool sortFields)
{
-#ifdef UNIX_AMD64_ABI
- // TODO-Amd64-Unix: For now don't promote structs on Linux.
- // This should be brought online with the full SystemVStruct passing work.
- return;
-#endif // UNIX_AMD64_ABI
assert(eeIsValueClass(typeHnd));
if (typeHnd != StructPromotionInfo->typeHnd)
@@ -2844,14 +3007,21 @@ void Compiler::lvaMarkLclRefs(GenTreePtr tree)
}
#endif // ASSERTION_PROP
+ bool allowStructs = false;
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // On System V the type of the var could be a TYP_STRUCT.
+ allowStructs = varDsc->lvType == TYP_STRUCT;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
/* Variables must be used as the same type throughout the method */
- noway_assert(tiVerificationNeeded ||
- varDsc->lvType == TYP_UNDEF || tree->gtType == TYP_UNKNOWN ||
- genActualType(varDsc->TypeGet()) == genActualType(tree->gtType) ||
- (tree->gtType == TYP_BYREF && varDsc->TypeGet() == TYP_I_IMPL) ||
- (tree->gtType == TYP_I_IMPL && varDsc->TypeGet() == TYP_BYREF) ||
- (tree->gtFlags & GTF_VAR_CAST) ||
- varTypeIsFloating(varDsc->TypeGet()) && varTypeIsFloating(tree->gtType));
+ noway_assert(tiVerificationNeeded ||
+ varDsc->lvType == TYP_UNDEF || tree->gtType == TYP_UNKNOWN ||
+ allowStructs ||
+ genActualType(varDsc->TypeGet()) == genActualType(tree->gtType) ||
+ (tree->gtType == TYP_BYREF && varDsc->TypeGet() == TYP_I_IMPL) ||
+ (tree->gtType == TYP_I_IMPL && varDsc->TypeGet() == TYP_BYREF) ||
+ (tree->gtFlags & GTF_VAR_CAST) ||
+ varTypeIsFloating(varDsc->TypeGet()) && varTypeIsFloating(tree->gtType));
/* Remember the type of the reference */
@@ -3690,7 +3860,6 @@ void Compiler::lvaFixVirtualFrameOffsets()
delta += codeGen->genTotalFrameSize() - codeGen->genSPtoFPdelta();
}
#endif //_TARGET_AMD64_
-
unsigned lclNum;
LclVarDsc * varDsc;
for (lclNum = 0, varDsc = lvaTable;
@@ -3735,6 +3904,7 @@ void Compiler::lvaFixVirtualFrameOffsets()
if (doAssignStkOffs)
{
varDsc->lvStkOffs += delta;
+
#if DOUBLE_ALIGN
if (genDoubleAlign() && !codeGen->isFramePointerUsed())
{
@@ -3886,11 +4056,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs()
{
noway_assert(lclNum == info.compThisArg);
#ifndef _TARGET_X86_
-#ifdef UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum, REGSIZE_BYTES, argOffs, &callerArgOffset);
-#else // !UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum, REGSIZE_BYTES, argOffs);
-#endif // !UNIX_AMD64_ABI
+ argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum, REGSIZE_BYTES, argOffs UNIX_AMD64_ABI_ONLY_ARG(&callerArgOffset));
#endif // _TARGET_X86_
lclNum++;
}
@@ -3902,11 +4068,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs()
noway_assert(lclNum == info.compRetBuffArg);
noway_assert(lvaTable[lclNum].lvIsRegArg);
#ifndef _TARGET_X86_
-#ifdef UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum, REGSIZE_BYTES, argOffs, &callerArgOffset);
-#else // !UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum, REGSIZE_BYTES, argOffs);
-#endif // !UNIX_AMD64_ABI
+ argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum, REGSIZE_BYTES, argOffs UNIX_AMD64_ABI_ONLY_ARG(&callerArgOffset));
#endif // _TARGET_X86_
lclNum++;
}
@@ -3917,20 +4079,12 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs()
if (info.compMethodInfo->args.callConv & CORINFO_CALLCONV_PARAMTYPE)
{
noway_assert(lclNum == (unsigned)info.compTypeCtxtArg);
-#ifdef UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs, &callerArgOffset);
-#else // UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs);
-#endif // UNIX_AMD64_ABI
+ argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs UNIX_AMD64_ABI_ONLY_ARG(&callerArgOffset));
}
if (info.compIsVarArgs)
{
-#ifdef UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs, &callerArgOffset);
-#else // !UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs);
-#endif // !UNIX_AMD64_ABI
+ argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs UNIX_AMD64_ABI_ONLY_ARG(&callerArgOffset));
}
#endif // USER_ARGS_COME_LAST
@@ -3976,18 +4130,10 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs()
if (lvaIsPreSpilled(preSpillLclNum, preSpillMask))
{
unsigned argSize = eeGetArgSize(argLst, &info.compMethodInfo->args);
-#ifdef UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(
- preSpillLclNum,
- argSize,
- argOffs,
- &callerArgOffset);
-#else // !UNIX_AMD64_ABI
argOffs = lvaAssignVirtualFrameOffsetToArg(
preSpillLclNum,
argSize,
argOffs);
-#endif // !UNIX_AMD64_ABI
argLcls++;
// Early out if we can. If size is 8 and base reg is 2, then the mask is 0x1100
@@ -4008,18 +4154,10 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs()
{
if (!lvaIsPreSpilled(stkLclNum, preSpillMask))
{
-#ifdef UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(
- stkLclNum,
- eeGetArgSize(argLst, &info.compMethodInfo->args),
- argOffs,
- &callerArgOffset);
-#else // !UNIX_AMD64_ABI
argOffs = lvaAssignVirtualFrameOffsetToArg(
stkLclNum,
eeGetArgSize(argLst, &info.compMethodInfo->args),
argOffs);
-#endif // !UNIX_AMD64_ABI
argLcls++;
}
argLst = info.compCompHnd->getArgNext(argLst);
@@ -4029,16 +4167,18 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs()
#else // !_TARGET_ARM_
for (unsigned i = 0; i < argSigLen; i++)
{
-#ifdef UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++,
- eeGetArgSize(argLst, &info.compMethodInfo->args),
- argOffs,
- &callerArgOffset);
-#else // !UNIX_AMD64_ABI
+ unsigned argumentSize = eeGetArgSize(argLst, &info.compMethodInfo->args);
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // On the stack frame the homed arg always takes a full number of slots
+ // for proper stack alignment. Make sure the real struct size is properly rounded up.
+ argumentSize = (unsigned)roundUp(argumentSize, TARGET_POINTER_SIZE);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++,
- eeGetArgSize(argLst, &info.compMethodInfo->args),
- argOffs);
-#endif // UNIX_AMD64_ABI
+ argumentSize,
+ argOffs
+ UNIX_AMD64_ABI_ONLY_ARG(&callerArgOffset));
argLst = info.compCompHnd->getArgNext(argLst);
}
#endif // !_TARGET_ARM_
@@ -4049,26 +4189,19 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs()
if (info.compMethodInfo->args.callConv & CORINFO_CALLCONV_PARAMTYPE)
{
noway_assert(lclNum == (unsigned)info.compTypeCtxtArg);
-#ifdef UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs, &callerArgOffset);
-#else // !UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs);
-#endif // !UNIX_AMD64_ABI
+        argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs UNIX_AMD64_ABI_ONLY_ARG(&callerArgOffset));
}
if (info.compIsVarArgs)
{
-#ifdef UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs, &callerArgOffset);
-#else // !UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs);
-#endif // !UNIX_AMD64_ABI
+        argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs UNIX_AMD64_ABI_ONLY_ARG(&callerArgOffset));
}
#endif // USER_ARGS_COME_LAST
}
+#ifdef UNIX_AMD64_ABI
//
// lvaAssignVirtualFrameOffsetToArg() : Assign virtual stack offsets to an
// individual argument, and return the offset for the next argument.
@@ -4076,12 +4209,9 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs()
// (if any - the RA might decide to spill(home on the stack) register passed arguments, if rarely used.)
// The final offset is calculated in lvaFixVirtualFrameOffsets method. It accounts for FP existence,
// ret address slot, stack frame padding, alloca instructions, etc.
+// Note: This is the implementation for UNIX_AMD64 System V platforms.
//
-#ifdef UNIX_AMD64_ABI
-int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize, int argOffs, int * callerArgOffset)
-#else // !UNIX_AMD64_ABI
-int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize, int argOffs)
-#endif // !UNIX_AMD64_ABI
+int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize, int argOffs UNIX_AMD64_ABI_ONLY_ARG(int * callerArgOffset))
{
noway_assert(lclNum < info.compArgsCount);
noway_assert(argSize);
@@ -4114,30 +4244,131 @@ int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize
if (varDsc->lvIsRegArg)
{
- /* Argument is passed in a register, don't count it
- * when updating the current offset on the stack */
-
-#ifndef _TARGET_ARM_
- noway_assert(argSize == sizeof(void *));
-#endif
+ // Argument is passed in a register, don't count it
+ // when updating the current offset on the stack.
-#if defined(_TARGET_X86_)
- argOffs += sizeof(void *);
-#elif defined(_TARGET_AMD64_)
-#ifdef UNIX_AMD64_ABI
if (varDsc->lvOnFrame)
-#endif
{
// The offset for args needs to be set only for the stack homed arguments for System V.
varDsc->lvStkOffs = argOffs;
- argOffs += sizeof(void *);
}
-#ifdef UNIX_AMD64_ABI
- else
+ else
{
varDsc->lvStkOffs = 0;
}
+ }
+ else
+ {
+ // For Windows AMD64 there are 4 slots for the register passed arguments on the top of the caller's stack. This is where they are always homed.
+ // So, they can be accessed with positive offset.
+ // On System V platforms, if the RA decides to home a register passed arg on the stack,
+ // it creates a stack location on the callee stack (like any other local var.) In such a case, the register passed, stack homed arguments
+ // are accessed using negative offsets and the stack passed arguments are accessed using positive offset (from the caller's stack.)
+        // For System V platforms, if there is no frame pointer the caller stack parameter offset should include the callee allocated space.
+        // If a frame register is used, the callee allocated space should not be included for accessing the caller stack parameters.
+        // The last two requirements are met in the lvaFixVirtualFrameOffsets method, which fixes the offsets based on frame pointer existence,
+        // existence of alloca instructions, ret address pushed, etc.
+
+ varDsc->lvStkOffs = *callerArgOffset;
+ // Structs passed on stack could be of size less than TARGET_POINTER_SIZE.
+ // Make sure they get at least TARGET_POINTER_SIZE on the stack - this is required for alignment.
+ if (varDsc->lvType == TYP_STRUCT)
+ {
+ *callerArgOffset += (int)roundUp(argSize, TARGET_POINTER_SIZE);
+ }
+ else
+ {
+ *callerArgOffset += TARGET_POINTER_SIZE;
+ }
+ }
+
+ // For struct promoted parameters we need to set the offsets for both LclVars.
+ //
+ // For a dependent promoted struct we also assign the struct fields stack offset
+ if (varDsc->lvPromotedStruct())
+ {
+ lvaPromotionType promotionType = lvaGetPromotionType(varDsc);
+
+ if (promotionType == PROMOTION_TYPE_DEPENDENT)
+ {
+ noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here
+
+ assert(fieldVarNum == varDsc->lvFieldLclStart);
+ lvaTable[fieldVarNum].lvStkOffs = varDsc->lvStkOffs;
+ }
+ }
+ // For an independent promoted struct field we also assign the parent struct stack offset
+ else if (varDsc->lvIsStructField)
+ {
+ noway_assert(varDsc->lvParentLcl < lvaCount);
+ lvaTable[varDsc->lvParentLcl].lvStkOffs = varDsc->lvStkOffs;
+ }
+
+ if (Target::g_tgtArgOrder == Target::ARG_ORDER_R2L && !varDsc->lvIsRegArg)
+ argOffs += argSize;
+
+ return argOffs;
+}
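The rules described in the comments above can be summarized with a small, purely illustrative layout (virtual offsets, before lvaFixVirtualFrameOffsets adjusts them; the method signature and numbers below are hypothetical, not taken from this patch):

    // void Foo(int a /* passed in RDI */, SixteenByteStruct s /* passed on the stack */)
    //
    //   Caller side (positive offsets, tracked through *callerArgOffset):
    //     s -> lvStkOffs = *callerArgOffset, then *callerArgOffset advances by
    //          roundUp(16, TARGET_POINTER_SIZE) = 16
    //
    //   Callee side (negative offsets once the frame is laid out):
    //     a -> homed on the callee frame only if the RA spills it (lvOnFrame),
    //          like any other local; register args never advance argOffs on System V.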
+
+#else // !UNIX_AMD64_ABI
+
+//
+// lvaAssignVirtualFrameOffsetToArg() : Assign virtual stack offsets to an
+// individual argument, and return the offset for the next argument.
+// Note: This method only calculates the initial offset of the stack passed/spilled arguments
+// (if any - the RA might decide to spill (home on the stack) register passed arguments, if rarely used.)
+// The final offset is calculated in the lvaFixVirtualFrameOffsets method. It accounts for FP existence,
+// ret address slot, stack frame padding, alloca instructions, etc.
+// Note: This is the implementation for all platforms except the UNIX_AMD64 OSs (System V 64 bit.)
+int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize, int argOffs UNIX_AMD64_ABI_ONLY_ARG(int * callerArgOffset))
+{
+ noway_assert(lclNum < info.compArgsCount);
+ noway_assert(argSize);
+
+ if (Target::g_tgtArgOrder == Target::ARG_ORDER_L2R)
+ argOffs -= argSize;
+
+ unsigned fieldVarNum = BAD_VAR_NUM;
+
+ noway_assert(lclNum < lvaCount);
+ LclVarDsc * varDsc = lvaTable + lclNum;
+
+ if (varDsc->lvPromotedStruct())
+ {
+ noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here
+ fieldVarNum = varDsc->lvFieldLclStart;
+
+ lvaPromotionType promotionType = lvaGetPromotionType(varDsc);
+
+ if (promotionType == PROMOTION_TYPE_INDEPENDENT)
+ {
+ lclNum = fieldVarNum;
+ noway_assert(lclNum < lvaCount);
+ varDsc = lvaTable + lclNum;
+ assert(varDsc->lvIsStructField);
+ }
+ }
+
+ noway_assert(varDsc->lvIsParam);
+
+ if (varDsc->lvIsRegArg)
+ {
+ /* Argument is passed in a register, don't count it
+ * when updating the current offset on the stack */
+
+#ifndef _TARGET_ARM_
+#if DEBUG
+ noway_assert(argSize == sizeof(void *));
+#endif // DEBUG
#endif
+
+#if defined(_TARGET_X86_)
+ argOffs += sizeof(void *);
+#elif defined(_TARGET_AMD64_)
+        // On Windows AMD64 register passed args are always homed in the caller's stack slots, so record the offset.
+ varDsc->lvStkOffs = argOffs;
+ // Register arguments also take stack space.
+ argOffs += sizeof(void *);
#elif defined(_TARGET_ARM64_)
// Register arguments don't take stack space.
#elif defined(_TARGET_ARM_)
@@ -4181,32 +4412,32 @@ int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize
case TYP_DOUBLE:
case TYP_LONG:
+ {
+ //
+ // Let's assign offsets to arg1, a double in r2. argOffs has to be 4 not 8.
+ //
+ // ------- CALLER SP -------
+ // r3
+ // r2 double -- argOffs = 4, but it doesn't need to be skipped, because there is no skipping.
+ // r1 VACookie -- argOffs = 0
+ // -------------------------
+ //
+ // Consider argOffs as if it accounts for number of prespilled registers before the current register.
+ // In the above example, for r2, it is r1 that is prespilled, but since r1 is accounted for by argOffs
+ // being 4, there should have been no skipping. Instead, if we didn't assign r1 to any variable, then
+ // argOffs would still be 0 which implies it is not accounting for r1, equivalently r1 is skipped.
+ //
+ // If prevRegsSize is unaccounted for by a corresponding argOffs, we must have skipped a register.
+ int prevRegsSize = genCountBits(codeGen->regSet.rsMaskPreSpillRegArg & (regMask - 1)) * TARGET_POINTER_SIZE;
+ if (argOffs < prevRegsSize)
{
- //
- // Let's assign offsets to arg1, a double in r2. argOffs has to be 4 not 8.
- //
- // ------- CALLER SP -------
- // r3
- // r2 double -- argOffs = 4, but it doesn't need to be skipped, because there is no skipping.
- // r1 VACookie -- argOffs = 0
- // -------------------------
- //
- // Consider argOffs as if it accounts for number of prespilled registers before the current register.
- // In the above example, for r2, it is r1 that is prespilled, but since r1 is accounted for by argOffs
- // being 4, there should have been no skipping. Instead, if we didn't assign r1 to any variable, then
- // argOffs would still be 0 which implies it is not accounting for r1, equivalently r1 is skipped.
- //
- // If prevRegsSize is unaccounted for by a corresponding argOffs, we must have skipped a register.
- int prevRegsSize = genCountBits(codeGen->regSet.rsMaskPreSpillRegArg & (regMask - 1)) * TARGET_POINTER_SIZE;
- if (argOffs < prevRegsSize)
- {
- // We must align up the argOffset to a multiple of 8 to account for skipped registers.
- argOffs = roundUp(argOffs, 2*TARGET_POINTER_SIZE);
- }
- // We should've skipped only a single register.
- assert(argOffs == prevRegsSize);
+ // We must align up the argOffset to a multiple of 8 to account for skipped registers.
+ argOffs = roundUp(argOffs, 2 * TARGET_POINTER_SIZE);
}
- break;
+ // We should've skipped only a single register.
+ assert(argOffs == prevRegsSize);
+ }
+ break;
default:
// No alignment of argOffs required
@@ -4292,16 +4523,16 @@ int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize
if (!compIsProfilerHookNeeded())
#endif
{
- bool cond = (info.compIsVarArgs &&
- // Does cur stk arg require double alignment?
- ((varDsc->lvType == TYP_STRUCT && varDsc->lvStructDoubleAlign) ||
- (varDsc->lvType == TYP_DOUBLE) ||
- (varDsc->lvType == TYP_LONG))
- ) ||
- // Did first reg arg require alignment?
- (codeGen->regSet.rsMaskPreSpillAlign & genRegMask(REG_ARG_LAST));
-
- noway_assert(cond);
+ bool cond = (info.compIsVarArgs &&
+ // Does cur stk arg require double alignment?
+ ((varDsc->lvType == TYP_STRUCT && varDsc->lvStructDoubleAlign) ||
+ (varDsc->lvType == TYP_DOUBLE) ||
+ (varDsc->lvType == TYP_LONG))
+ ) ||
+ // Did first reg arg require alignment?
+ (codeGen->regSet.rsMaskPreSpillAlign & genRegMask(REG_ARG_LAST));
+
+ noway_assert(cond);
noway_assert(sizeofPreSpillRegArgs <= argOffs + TARGET_POINTER_SIZE); // at most one register of alignment
}
argOffs = sizeofPreSpillRegArgs;
@@ -4321,7 +4552,7 @@ int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize
case TYP_DOUBLE:
case TYP_LONG:
// We must align up the argOffset to a multiple of 8
- argOffs = roundUp(argOffsWithoutPreSpillRegArgs, 2*TARGET_POINTER_SIZE) + sizeofPreSpillRegArgs;
+ argOffs = roundUp(argOffsWithoutPreSpillRegArgs, 2 * TARGET_POINTER_SIZE) + sizeofPreSpillRegArgs;
break;
default:
@@ -4330,21 +4561,7 @@ int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize
}
#endif // _TARGET_ARM_
-#ifdef UNIX_AMD64_ABI
- // For Windows there are 4 slots for the register passed arguments on the top of the caller's stack. This is where they are always homed.
- // So, they can be accessed with positive offset.
- // On System V platforms, if the RA decides to home a register passed arg on the stack,
- // it creates a stack location on the callee stack (like any other local var.) In such a case, the register passed, stack homed arguments
- // are accessed using negative offsets and the stack passed arguments are accessed using positive offset (from the caller's stack.)
- // For System V platforms if there is no frame pointer the caller stack parameter offset should include the callee allocated space.
- // If frame register is used, the callee allocated space should not be included for accessing the caller stack parameters.
- // The last two requirements are met in lvaFixVirtualFrameOffsets method, which fixes the offsets, based on frame pointer existence,
- // existence of alloca instructions, ret address pushed, ets.
- varDsc->lvStkOffs = *callerArgOffset;
- *callerArgOffset += TARGET_POINTER_SIZE;
-#else // !UNIX_AMD64_ABI
varDsc->lvStkOffs = argOffs;
-#endif // !UNIX_AMD64_ABI
}
// For struct promoted parameters we need to set the offsets for both LclVars.
@@ -4360,31 +4577,31 @@ int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize
}
else
#endif // !defined(_TARGET_64BIT_)
- if (varDsc->lvPromotedStruct())
- {
- lvaPromotionType promotionType = lvaGetPromotionType(varDsc);
-
- if (promotionType == PROMOTION_TYPE_DEPENDENT)
+ if (varDsc->lvPromotedStruct())
{
- noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here
+ lvaPromotionType promotionType = lvaGetPromotionType(varDsc);
- assert(fieldVarNum == varDsc->lvFieldLclStart);
- lvaTable[fieldVarNum].lvStkOffs = varDsc->lvStkOffs;
+ if (promotionType == PROMOTION_TYPE_DEPENDENT)
+ {
+ noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here
+
+ assert(fieldVarNum == varDsc->lvFieldLclStart);
+ lvaTable[fieldVarNum].lvStkOffs = varDsc->lvStkOffs;
+ }
}
- }
// For an independent promoted struct field we also assign the parent struct stack offset
- else if (varDsc->lvIsStructField)
- {
- noway_assert(varDsc->lvParentLcl < lvaCount);
- lvaTable[varDsc->lvParentLcl].lvStkOffs = varDsc->lvStkOffs;
- }
+ else if (varDsc->lvIsStructField)
+ {
+ noway_assert(varDsc->lvParentLcl < lvaCount);
+ lvaTable[varDsc->lvParentLcl].lvStkOffs = varDsc->lvStkOffs;
+ }
if (Target::g_tgtArgOrder == Target::ARG_ORDER_R2L && !varDsc->lvIsRegArg)
argOffs += argSize;
return argOffs;
}
-
+#endif // !UNIX_AMD64_ABI
/*****************************************************************************
* lvaAssignVirtualFrameOffsetsToLocals() : Assign virtual stack offsets to
@@ -5261,8 +5478,18 @@ void Compiler::lvaAssignFrameOffsetsToPromotedStructs()
{
// For promoted struct fields that are params, we will
// assign their offsets in lvaAssignVirtualFrameOffsetToArg().
+ // This is not true for the System V systems since there is no
+ // outgoing args space. Assign the dependently promoted fields properly.
//
- if (varDsc->lvIsStructField && !varDsc->lvIsParam)
+ if (varDsc->lvIsStructField
+#ifndef UNIX_AMD64_ABI
+ // For System V platforms there is no outgoing args space.
+ // A register passed struct arg is homed on the stack in a separate local var.
+            // The offset of these structs is already calculated in the lvaAssignVirtualFrameOffsetToArg method.
+ // Make sure the code below is not executed for these structs and the offset is not changed.
+ && !varDsc->lvIsParam
+#endif // !UNIX_AMD64_ABI
+ )
{
LclVarDsc * parentvarDsc = &lvaTable[varDsc->lvParentLcl];
lvaPromotionType promotionType = lvaGetPromotionType(parentvarDsc);
diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp
index bb69d103cf..5882ecfa71 100644
--- a/src/jit/lower.cpp
+++ b/src/jit/lower.cpp
@@ -1001,9 +1001,39 @@ void Lowering::SpliceInUnary(GenTreePtr parent, GenTreePtr* ppChild, GenTreePtr
oldChild->InsertAfterSelf(newNode);
}
+//------------------------------------------------------------------------
+// NewPutArg: rewrites the tree to put an arg in a register or on the stack.
+//
+// Arguments:
+// call - the call whose arg is being rewritten.
+// arg - the arg being rewritten.
+// fp - the ArgTabEntry for the argument.
+// type - the type of the argument.
+//
+// Return Value:
+// The new tree that was created to put the arg in the right place
+// or the incoming arg if the arg tree was not rewritten.
+//
+// Assumptions:
+// call, arg, and fp must be non-null.
+//
+// Notes:
+// For System V systems with native struct passing (i.e. FEATURE_UNIX_AMD64_STRUCT_PASSING defined)
+// this method allocates a single GT_PUTARG_REG for one-eightbyte structs and a GT_LIST of two GT_PUTARG_REGs
+// for two-eightbyte structs.
+//
+// For stack-passed structs the method generates a GT_PUTARG_STK tree. For System V systems with native struct passing
+// (i.e. FEATURE_UNIX_AMD64_STRUCT_PASSING defined) this method also sets the GC pointers count and the pointer
+// layout object, so the codegen of the GT_PUTARG_STK can use them to optimize copying the struct to the stack by value
+// (using block copy primitives for non-GC pointers and a single TARGET_POINTER_SIZE copy, with GC info recorded, for GC pointers.)
+//
GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryPtr fp, var_types type)
{
- GenTreePtr putArg;
+ assert(call != nullptr);
+ assert(arg != nullptr);
+ assert(fp != nullptr);
+
+ GenTreePtr putArg = nullptr;
bool updateArgTable = true;
#if !defined(_TARGET_64BIT_)
@@ -1015,7 +1045,22 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP
type = TYP_INT;
}
#endif // !defined(_TARGET_64BIT_)
- if (fp->regNum != REG_STK)
+
+ bool isOnStack = true;
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (type == TYP_STRUCT)
+ {
+ isOnStack = !fp->structDesc.passedInRegisters;
+ }
+ else
+ {
+ isOnStack = fp->regNum == REG_STK;
+ }
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+ isOnStack = fp->regNum == REG_STK;
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
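Whether a struct arg takes the register path here is driven by the System V eightbyte classification stored in fp->structDesc. A hedged illustration of what the classifier is expected to report (the classification names come from corinfo.h; the concrete struct is made up for this example):

    // Hypothetical input struct (not from this patch):
    struct Point { double x; long y; };   // 16 bytes -> two eightbytes

    // Expected descriptor contents, roughly:
    //   structDesc.passedInRegisters           == true, when enough XMM/GP argument registers remain
    //   structDesc.eightByteCount              == 2
    //   structDesc.eightByteClassifications[0] == SystemVClassificationTypeSSE      // 'x' -> an XMM register
    //   structDesc.eightByteClassifications[1] == SystemVClassificationTypeInteger  // 'y' -> a GP register
    //   structDesc.eightByteSizes[0] == 8, structDesc.eightByteSizes[1] == 8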
+
+ if (!isOnStack)
{
#ifdef FEATURE_SIMD
// We can have SIMD types that are handled as TYP_DOUBLE, but which need to be
@@ -1025,24 +1070,182 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP
type = TYP_LONG;
}
#endif //FEATURE_SIMD
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (fp->isStruct)
+ {
+ // The following code makes sure a register passed struct arg is moved to
+ // the register before the call is made.
+ // There are two cases (comments added in the code below.)
+ // 1. The struct is of size one eightbyte:
+            //    In this case a new GT_PUTARG_REG tree is created,
+            //    with the original argument as its op1.
+            // 2. The struct is contained in 2 eightbytes:
+            //    In this case the arg comes as a GT_LIST of two GT_LCL_FLDs - the two eightbytes of the struct.
+ // The code creates a GT_PUTARG_REG node for each GT_LCL_FLD in the GT_LIST
+ // and splices it in the list with the corresponding original GT_LCL_FLD tree as op1.
+
+ assert(fp->structDesc.eightByteCount != 0);
+
+ if (fp->structDesc.eightByteCount == 1)
+ {
+ // Case 1 above: Create a GT_PUTARG_REG node with op1 of the original tree.
+ //
+ // Here the IR for this operation:
+ // lowering call :
+ // N001(3, 2)[000017] ------ - N---- / --* &lclVar byref V00 loc0
+ // N003(6, 5)[000052] * --XG------ - / --* indir int
+ // N004(3, 2)[000046] ------ - N---- + --* &lclVar byref V02 tmp0
+ // (13, 11)[000070] -- - XG-- - R-- - arg0 in out + 00 / --* storeIndir int
+ // N009(3, 4)[000054] ------ - N----arg0 in rdi + --* lclFld int V02 tmp0[+0](last use)
+ // N011(33, 21)[000018] --CXG------ - *call void Test.Foo.test1
+ //
+ // args :
+ // lowering arg : (13, 11)[000070] -- - XG-- - R-- - *storeIndir int
+ //
+ // late :
+ // lowering arg : N009(3, 4)[000054] ------ - N---- * lclFld int V02 tmp0[+0](last use)
+ // new node is : (3, 4)[000071] ------------ * putarg_reg int RV
+ //
+ // after :
+ // N001(3, 2)[000017] ------ - N---- / --* &lclVar byref V00 loc0
+ // N003(6, 5)[000052] * --XG------ - / --* indir int
+ // N004(3, 2)[000046] ------ - N---- + --* &lclVar byref V02 tmp0
+ // (13, 11)[000070] -- - XG-- - R-- - arg0 in out + 00 / --* storeIndir int
+ // N009(3, 4)[000054] ------ - N---- | / --* lclFld int V02 tmp0[+0](last use)
+ // (3, 4)[000071] ------------arg0 in rdi + --* putarg_reg int RV
+ // N011(33, 21)[000018] --CXG------ - *call void Test.Foo.test1
+ //
+
+ putArg = comp->gtNewOperNode(GT_PUTARG_REG, type, arg);
+ }
+ else if (fp->structDesc.eightByteCount == 2)
+ {
+ // Case 2 above: Convert the LCL_FLDs to PUTARG_REG
+ //
+ // lowering call :
+ // N001(3, 2)[000025] ------ - N----Source / --* &lclVar byref V01 loc1
+ // N003(3, 2)[000056] ------ - N----Destination + --* &lclVar byref V03 tmp1
+ // N006(1, 1)[000058] ------------ + --* const int 16
+ // N007(12, 12)[000059] - A--G---- - L - arg0 SETUP / --* copyBlk void
+ // N009(3, 4)[000061] ------ - N----arg0 in rdi + --* lclFld long V03 tmp1[+0]
+ // N010(3, 4)[000063] ------------arg0 in rsi + --* lclFld long V03 tmp1[+8](last use)
+ // N014(40, 31)[000026] --CXG------ - *call void Test.Foo.test2
+ //
+ // args :
+ // lowering arg : N007(12, 12)[000059] - A--G---- - L - *copyBlk void
+ //
+ // late :
+ // lowering arg : N012(11, 13)[000065] ------------ * <list> struct
+ //
+ // after :
+ // N001(3, 2)[000025] ------ - N----Source / --* &lclVar byref V01 loc1
+ // N003(3, 2)[000056] ------ - N----Destination + --* &lclVar byref V03 tmp1
+ // N006(1, 1)[000058] ------------ + --* const int 16
+ // N007(12, 12)[000059] - A--G---- - L - arg0 SETUP / --* copyBlk void
+ // N009(3, 4)[000061] ------ - N---- | / --* lclFld long V03 tmp1[+0]
+ // (3, 4)[000072] ------------arg0 in rdi + --* putarg_reg long
+ // N010(3, 4)[000063] ------------ | / --* lclFld long V03 tmp1[+8](last use)
+ // (3, 4)[000073] ------------arg0 in rsi + --* putarg_reg long
+ // N014(40, 31)[000026] --CXG------ - *call void Test.Foo.test2
+ //
+
+ assert(arg->OperGet() == GT_LIST);
+ GenTreeArgList* argListPtr = arg->AsArgList();
+
+ for (unsigned ctr = 0; argListPtr != nullptr; argListPtr = argListPtr->Rest(), ctr++)
+ {
+ // Create a new GT_PUTARG_REG node with op1 the original GT_LCL_FLD.
+ GenTreePtr newOper = comp->gtNewOperNode(
+ GT_PUTARG_REG,
+ comp->GetTypeFromClassificationAndSizes(fp->structDesc.eightByteClassifications[ctr], fp->structDesc.eightByteSizes[ctr]),
+ argListPtr->gtOp.gtOp1);
+
+ // CopyCosts
+ newOper->CopyCosts(argListPtr->gtOp.gtOp1);
+
+ // Splice in the new GT_PUTARG_REG node in the GT_LIST
+ SpliceInUnary(argListPtr, &argListPtr->gtOp.gtOp1, newOper);
+ }
- putArg = comp->gtNewOperNode(GT_PUTARG_REG, type, arg);
+ // Just return arg. The GT_LIST is not replaced.
+ // Nothing more to do.
+ return arg;
+ }
+ else
+ {
+ assert(false && "Illegal count of eightbytes for the CLR type system"); // No more than 2 eightbytes for the CLR.
+
+ }
+ }
+ else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ putArg = comp->gtNewOperNode(GT_PUTARG_REG, type, arg);
+ }
}
else
{
// Mark this one as tail call arg if it is a fast tail call.
// This provides the info to put this argument in in-coming arg area slot
// instead of in out-going arg area slot.
+
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY(assert(fp->isStruct == (type == TYP_STRUCT))); // Make sure state is correct
+
#if FEATURE_FASTTAILCALL
- putArg = new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK, type, arg, fp->slotNum, call->IsFastTailCall() DEBUG_ARG(call));
+ putArg = new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK,
+ type,
+ arg,
+ fp->slotNum
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(fp->numSlots)
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(fp->isStruct),
+ call->IsFastTailCall()
+ DEBUG_ARG(call));
#else
- putArg = new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK, type, arg, fp->slotNum DEBUG_ARG(call));
+ putArg = new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK,
+ type,
+ arg,
+ fp->slotNum
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(fp->numSlots)
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(fp->isStruct)
+ DEBUG_ARG(call));
#endif
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+            // If the ArgTabEntry indicates that this arg is a struct,
+            // get and store the number of slots that are references.
+            // This is later used by the codegen for the GT_PUTARG_STK implementation
+            // for structs to decide whether, and for how many slots, single eight-byte
+            // copies should be done (only for reference slots), so that GC info is emitted.
+            // For the non-reference slots faster/smaller instructions are used -
+            // pair copying using XMM registers or rep movs instructions.
+ if (fp->isStruct)
+ {
+ assert(arg->OperGet() == GT_LDOBJ);
+
+ BYTE* gcLayout = new (comp, CMK_Codegen) BYTE[fp->numSlots];
+
+ unsigned numRefs = comp->info.compCompHnd->getClassGClayout(arg->gtLdObj.gtClass, gcLayout);
+
+ putArg->AsPutArgStk()->setGcPointers(numRefs, gcLayout);
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
}
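The gcLayout array gathered above records, per pointer-sized slot, whether that slot holds a GC pointer. A rough sketch of how a consumer could walk it when copying the struct to the outgoing area (illustrative only, not the actual GT_PUTARG_STK codegen; TYPE_GC_NONE is from the CorInfoGCType enum):

    // Sketch: choose a copy style per slot from the layout produced by getClassGClayout.
    void CopySlotsSketch(const BYTE* gcLayout, unsigned numSlots)
    {
        for (unsigned slot = 0; slot < numSlots; slot++)
        {
            if (gcLayout[slot] != TYPE_GC_NONE)
            {
                // GC slot: emit a single pointer-sized move so GC info is reported for it.
            }
            else
            {
                // Non-GC slot: free to be covered by XMM pair copies or rep movs.
            }
        }
    }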
+
putArg->CopyCosts(arg);
if (arg->InReg())
+ {
putArg->SetInReg();
+ }
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ else if (fp->isStruct)
+ {
+ if (fp->structDesc.passedInRegisters)
+ {
+ putArg->SetInReg();
+ }
+ }
+#endif
JITDUMP("new node is : ");
DISPNODE(putArg);
@@ -1076,10 +1279,14 @@ void Lowering::LowerArg(GenTreeCall* call, GenTreePtr* ppArg)
// assignments/stores at this level are not really placing an arg
// they are setting up temporary locals that will later be placed into
// outgoing regs or stack
- if (!arg->OperIsAssignment() &&
+ if (
+ !arg->OperIsAssignment() &&
!arg->OperIsStore() &&
!arg->IsArgPlaceHolderNode() &&
- !arg->IsNothingNode() &&
+ !arg->IsNothingNode() &&
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ !arg->OperIsPutArgStk() &&
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
!arg->OperIsCopyBlkOp()) // these are de facto placeholders (apparently)
{
fgArgTabEntryPtr fp = comp->gtArgEntryByNode(call, arg);
@@ -1153,7 +1360,15 @@ void Lowering::LowerArg(GenTreeCall* call, GenTreePtr* ppArg)
#endif // !defined(_TARGET_64BIT_)
{
putArg = NewPutArg(call, arg, fp, type);
- SpliceInUnary(call, ppArg, putArg);
+
+            // In the case of a register-passable struct (in one or two registers)
+            // NewPutArg returns a new node (a GT_PUTARG_REG, or a GT_LIST with two GT_PUTARG_REGs.)
+            // If an extra node is returned, splice it in the right place in the tree.
+            if (arg != putArg)
+            {
+                // putArg and arg are equal if arg is a GT_LIST (a list of multiple LCL_FLDs to be passed in registers.)
+ SpliceInUnary(call, ppArg, putArg);
+ }
}
}
}
diff --git a/src/jit/lower.h b/src/jit/lower.h
index ae1f73e5b8..6754b7b75d 100644
--- a/src/jit/lower.h
+++ b/src/jit/lower.h
@@ -134,6 +134,10 @@ private:
void TreeNodeInfoInitSIMD(GenTree* tree, LinearScan* lsra);
#endif // FEATURE_SIMD
+#if defined(_TARGET_XARCH_)
+ void TreeNodeInfoInitSimple(GenTree* tree, TreeNodeInfo* info, unsigned kind);
+#endif // defined(_TARGET_XARCH_)
+
void SpliceInUnary(GenTreePtr parent, GenTreePtr* ppChild, GenTreePtr newNode);
void DumpNodeInfoMap();
diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp
index 08c340cbee..a7b4600df9 100644
--- a/src/jit/lowerxarch.cpp
+++ b/src/jit/lowerxarch.cpp
@@ -103,7 +103,38 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
}
}
-
+// TreeNodeInfoInitSimple:
+//    Sets the srcCount and dstCount for trees that require no special handling, based on the node's kind flags.
+//
+// Arguments:
+// tree: The tree on which TreeNodeInfo's srcCount and dstCount are set.
+// info: The TreeNodeInfo on which to set the srcCount and dstCount.
+// This is the TreeNodeInfo corresponding to the tree parameter.
+// kind: The kind flags of the tree node.
+//
+void Lowering::TreeNodeInfoInitSimple(GenTree* tree, TreeNodeInfo* info, unsigned kind)
+{
+ info->dstCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
+ if (kind & (GTK_CONST | GTK_LEAF))
+ {
+ info->srcCount = 0;
+ }
+ else if (kind & (GTK_SMPOP))
+ {
+ if (tree->gtGetOp2() != nullptr)
+ {
+ info->srcCount = 2;
+ }
+ else
+ {
+ info->srcCount = 1;
+ }
+ }
+ else
+ {
+ unreached();
+ }
+}
/**
* Takes care of annotating the register requirements
@@ -138,26 +169,7 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
GenTree* op2;
default:
- info->dstCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
- if (kind & (GTK_CONST|GTK_LEAF))
- {
- info->srcCount = 0;
- }
- else if (kind & (GTK_SMPOP))
- {
- if (tree->gtGetOp2() != nullptr)
- {
- info->srcCount = 2;
- }
- else
- {
- info->srcCount = 1;
- }
- }
- else
- {
- unreached();
- }
+ TreeNodeInfoInitSimple(tree, info, kind);
break;
case GT_LCL_FLD:
@@ -275,6 +287,24 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
else
#endif // !defined(_TARGET_64BIT_)
{
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (tree->TypeGet() == TYP_STRUCT &&
+ tree->gtOp.gtOp1->OperGet() == GT_LCL_VAR)
+ {
+#ifdef DEBUG
+ GenTreeLclVarCommon* lclVarPtr = tree->gtOp.gtOp1->AsLclVarCommon();
+ LclVarDsc* varDsc = &(compiler->lvaTable[lclVarPtr->gtLclNum]);
+ assert(varDsc->lvDontPromote);
+#endif // DEBUG
+ // If this is a two eightbyte return, make the var
+ // contained by the return expression. The code gen will put
+ // the values in the right registers for return.
+ info->srcCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
+ info->dstCount = 0;
+ MakeSrcContained(tree, tree->gtOp.gtOp1);
+ break;
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
info->srcCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
info->dstCount = 0;
@@ -840,9 +870,10 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
}
// First, count reg args
-
+#if FEATURE_VARARG
bool callHasFloatRegArgs = false;
-
+#endif // FEATURE_VARARG
+
for (GenTreePtr list = tree->gtCall.gtCallLateArgs; list; list = list->MoveNext())
{
assert(list->IsList());
@@ -859,26 +890,52 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
assert(argNode->gtOper == GT_PUTARG_STK);
argNode->gtLsraInfo.srcCount = 1;
argNode->gtLsraInfo.dstCount = 0;
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // If the node is a struct and it is put on stack with
+ // putarg_stk operation, we consume and produce no registers.
+                // In this case the embedded LdObj node should not produce
+                // registers either, since it is contained.
+ if (argNode->TypeGet() == TYP_STRUCT)
+ {
+ assert(argNode != nullptr && argNode->gtOp.gtOp1 != nullptr && argNode->gtOp.gtOp1->OperGet() == GT_LDOBJ);
+ argNode->gtOp.gtOp1->gtLsraInfo.dstCount = 0;
+ argNode->gtLsraInfo.srcCount = 0;
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
continue;
}
- var_types argType = argNode->TypeGet();
+ regNumber argReg = REG_NA;
+ regMaskTP argMask = RBM_NONE;
+ short regCount = 0;
+ bool isOnStack = true;
+ if (curArgTabEntry->regNum != REG_STK)
+ {
+ isOnStack = false;
+ var_types argType = argNode->TypeGet();
- callHasFloatRegArgs |= varTypeIsFloating(argType);
+#if FEATURE_VARARG
+ callHasFloatRegArgs |= varTypeIsFloating(argType);
+#endif // FEATURE_VARARG
- regNumber argReg = curArgTabEntry->regNum;
- short regCount = 1;
- // Default case is that we consume one source; modify this later (e.g. for
- // promoted structs)
- info->srcCount++;
+ argReg = curArgTabEntry->regNum;
+ regCount = 1;
- regMaskTP argMask = genRegMask(argReg);
- argNode = argNode->gtEffectiveVal();
-
- if (argNode->TypeGet() == TYP_STRUCT)
+ // Default case is that we consume one source; modify this later (e.g. for
+ // promoted structs)
+ info->srcCount++;
+
+ argMask = genRegMask(argReg);
+ argNode = argNode->gtEffectiveVal();
+ }
+
+            // If the struct arg is wrapped in a CPYBLK, the type of the param will be TYP_VOID.
+            // Use the curArgTabEntry's isStruct to determine whether the param is a struct.
+ if (argNode->TypeGet() == TYP_STRUCT
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY(|| curArgTabEntry->isStruct))
{
unsigned originalSize = 0;
- bool isPromoted = false;
LclVarDsc* varDsc = nullptr;
if (argNode->gtOper == GT_LCL_VAR)
{
@@ -893,20 +950,70 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
{
noway_assert(!"GT_LDOBJ not supported for amd64");
}
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ else if (argNode->gtOper == GT_PUTARG_REG)
+ {
+ originalSize = genTypeSize(argNode->gtType);
+ }
+ else if (argNode->gtOper == GT_LIST)
+ {
+ originalSize = 0;
+
+ // There could be up to 2 PUTARG_REGs in the list
+ GenTreeArgList* argListPtr = argNode->AsArgList();
+ unsigned iterationNum = 0;
+ for (; argListPtr; argListPtr = argListPtr->Rest())
+ {
+ GenTreePtr putArgRegNode = argListPtr->gtOp.gtOp1;
+ assert(putArgRegNode->gtOper == GT_PUTARG_REG);
+
+ if (iterationNum == 0)
+ {
+ varDsc = compiler->lvaTable + putArgRegNode->gtOp.gtOp1->gtLclVarCommon.gtLclNum;
+ originalSize = varDsc->lvSize();
+ assert(originalSize != 0);
+ }
+ else
+ {
+                            // Need an extra source for every node but the first in the list.
+ info->srcCount++;
+
+ // Get the mask for the second putarg_reg
+ argMask = genRegMask(curArgTabEntry->otherRegNum);
+ }
+
+ putArgRegNode->gtLsraInfo.setDstCandidates(l, argMask);
+ putArgRegNode->gtLsraInfo.setSrcCandidates(l, argMask);
+
+ // To avoid redundant moves, have the argument child tree computed in the
+ // register in which the argument is passed to the call.
+ putArgRegNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(putArgRegNode));
+ iterationNum++;
+ }
+
+ assert(iterationNum <= CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
else
{
noway_assert(!"Can't predict unsupported TYP_STRUCT arg kind");
}
- unsigned slots = ((unsigned)(roundUp(originalSize, TARGET_POINTER_SIZE))) / REGSIZE_BYTES;
- regNumber reg = (regNumber)(argReg + 1);
- unsigned remainingSlots = slots - 1;
- while (remainingSlots > 0 && reg <= REG_ARG_LAST)
+ unsigned slots = ((unsigned)(roundUp(originalSize, TARGET_POINTER_SIZE))) / REGSIZE_BYTES;
+ unsigned remainingSlots = slots;
+
+ if (!isOnStack)
{
- argMask |= genRegMask(reg);
- reg = (regNumber)(reg + 1);
- remainingSlots--;
- regCount++;
+ remainingSlots = slots - 1;
+
+ regNumber reg = (regNumber)(argReg + 1);
+ while (remainingSlots > 0 && reg <= REG_ARG_LAST)
+ {
+ argMask |= genRegMask(reg);
+ reg = (regNumber)(reg + 1);
+ remainingSlots--;
+ regCount++;
+ }
}
short internalIntCount = 0;
@@ -915,9 +1022,21 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
// This TYP_STRUCT argument is also passed in the outgoing argument area
// We need a register to address the TYP_STRUCT
// And we may need 2
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ internalIntCount = 1;
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
internalIntCount = 2;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
}
argNode->gtLsraInfo.internalIntCount = internalIntCount;
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (argNode->gtOper == GT_PUTARG_REG)
+ {
+ argNode->gtLsraInfo.setDstCandidates(l, argMask);
+ argNode->gtLsraInfo.setSrcCandidates(l, argMask);
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
}
else
{
@@ -931,6 +1050,8 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
{
argNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(argNode));
}
+
+#if FEATURE_VARARG
// In the case of a varargs call, the ABI dictates that if we have floating point args,
// we must pass the enregistered arguments in both the integer and floating point registers.
// Since the integer register is not associated with this arg node, we will reserve it as
@@ -942,6 +1063,7 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
tree->gtLsraInfo.setInternalIntCount(tree->gtLsraInfo.internalIntCount + 1);
tree->gtLsraInfo.addInternalCandidates(l, genRegMask(targetReg));
}
+#endif // FEATURE_VARARG
}
// Now, count stack args
@@ -995,6 +1117,7 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
args = args->gtOp.gtOp2;
}
+#if FEATURE_VARARG
// If it is a fast tail call, it is already preferenced to use RAX.
// Therefore, no need set src candidates on call tgt again.
if (tree->gtCall.IsVarargs() &&
@@ -1007,6 +1130,7 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
// by Amd64 ABI.
ctrlExpr->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_ARG_REGS));
}
+#endif // FEATURE_VARARG
}
break;
@@ -1020,7 +1144,6 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
info->dstCount = 1;
}
break;
-
#ifdef _TARGET_X86_
case GT_LDOBJ:
NYI_X86("GT_LDOBJ");
@@ -1218,6 +1341,116 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
}
break;
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ case GT_PUTARG_STK:
+ {
+ if (tree->TypeGet() != TYP_STRUCT)
+ {
+ TreeNodeInfoInitSimple(tree, info, kind);
+ break;
+ }
+
+ GenTreePutArgStk* putArgStkTree = tree->AsPutArgStk();
+
+ GenTreePtr dstAddr = tree;
+ GenTreePtr srcAddr = tree->gtOp.gtOp1;
+
+ assert(srcAddr->OperGet() == GT_LDOBJ);
+ info->srcCount = srcAddr->gtLsraInfo.dstCount;
+
+        // If this is the address of a stack variable, make op1 contained
+        // so there is no unnecessary copying between registers.
+        // To avoid an assertion, increment the parent's source count here;
+        // it is decremented back below.
+ if (srcAddr->gtGetOp1()->OperIsLocalAddr())
+ {
+ info->srcCount += 1;
+ }
+
+ info->dstCount = 0;
+
+ // In case of a CpBlk we could use a helper call. In case of putarg_stk we
+ // can't do that since the helper call could kill some already set up outgoing args.
+ // TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj.
+ // The cpyXXXX code is rather complex and this could cause it to be more complex, but
+ // it might be the right thing to do.
+
+        // This threshold decides between using the helper and letting the JIT inline
+        // a code sequence of its choice.
+ ssize_t helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
+ ssize_t size = putArgStkTree->gtNumSlots * TARGET_POINTER_SIZE;
+
+        // TODO-X86-CQ: The helper call either is not supported on x86 or requires more work
+ // (I don't know which).
+
+ // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
+        // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
+ // our framework assemblies, so this is the main code generation scheme we'll use.
+ if (size <= CPBLK_UNROLL_LIMIT && putArgStkTree->gtNumberReferenceSlots == 0)
+ {
+ // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
+ //
+ // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
+ // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude
+ // RBM_NON_BYTE_REGS from internal candidates.
+ if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
+ {
+ info->internalIntCount++;
+ regMaskTP regMask = l->allRegs(TYP_INT);
+
+#ifdef _TARGET_X86_
+ if ((size % 2) != 0)
+ {
+ regMask &= ~RBM_NON_BYTE_REGS;
+ }
+#endif
+ info->setInternalCandidates(l, regMask);
+ }
+
+ if (size >= XMM_REGSIZE_BYTES)
+ {
+ // If we have a buffer larger than XMM_REGSIZE_BYTES,
+ // reserve an XMM register to use it for a
+ // series of 16-byte loads and stores.
+ info->internalFloatCount = 1;
+ info->addInternalCandidates(l, l->internalFloatRegCandidates());
+ }
+
+ if (srcAddr->gtGetOp1()->OperIsLocalAddr())
+ {
+ MakeSrcContained(putArgStkTree, srcAddr->gtGetOp1());
+ }
+
+ // If src or dst are on stack, we don't have to generate the address into a register
+ // because it's just some constant+SP
+ putArgStkTree->gtPutArgStkKind = GenTreePutArgStk::PutArgStkKindUnroll;
+ }
+ else
+ {
+ info->internalIntCount += 3;
+ info->setInternalCandidates(l, (RBM_RDI | RBM_RCX | RBM_RSI));
+ if (srcAddr->gtGetOp1()->OperIsLocalAddr())
+ {
+ MakeSrcContained(putArgStkTree, srcAddr->gtGetOp1());
+ }
+
+ putArgStkTree->gtPutArgStkKind = GenTreePutArgStk::PutArgStkKindRepInstr;
+ }
+
+ // Always mark the LDOBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree.
+ MakeSrcContained(putArgStkTree, srcAddr);
+
+ // Balance up the inc above.
+ if (srcAddr->gtGetOp1()->OperIsLocalAddr())
+ {
+ info->srcCount -= 1;
+ }
+ }
+
+ break;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
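As a hedged, concrete reading of the copy-strategy choice above (the CPBLK_* limits are named constants whose values are not shown here; the example simply assumes a 24-byte struct fits under CPBLK_UNROLL_LIMIT):

    // gtNumSlots = 3, gtNumberReferenceSlots = 0  ->  size = 24 bytes, no GC slots
    //   -> PutArgStkKindUnroll: one 16-byte XMM load/store pair plus one 8-byte integer move.
    // A struct over the limit, or one containing GC references,
    //   -> PutArgStkKindRepInstr: rep movs through RDI/RSI/RCX.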
+
case GT_COPYBLK:
{
// Sources are src, dest and size (or class token for CpObj).
@@ -2995,6 +3228,6 @@ bool Lowering:: IsContainableImmed(GenTree* parentNode, GenTree* childNode)
return true;
}
-#endif // _TARGET_AMD64_
+#endif // _TARGET_XARCH_
#endif // !LEGACY_BACKEND
diff --git a/src/jit/lsra.cpp b/src/jit/lsra.cpp
index d8341b1d7f..8f11af9878 100644
--- a/src/jit/lsra.cpp
+++ b/src/jit/lsra.cpp
@@ -2671,14 +2671,14 @@ LinearScan::buildInternalRegisterDefsForNode(GenTree *tree,
int internalIntCount = tree->gtLsraInfo.internalIntCount;
regMaskTP internalCands = tree->gtLsraInfo.getInternalCandidates(this);
- // If this is a varArgs call, the internal candidates represent the integer registers that
- // floating point arguments must be copied into. These must be handled as fixed regs.
+ // If the number of internal integer registers required is the same as the number of candidate integer registers in the candidate set,
+ // then they must be handled as fixed registers.
+ // (E.g. for the integer registers that floating point arguments must be copied into for a varargs call.)
bool fixedRegs = false;
- if ((internalIntCount != 0) && (tree->OperGet() == GT_CALL))
+ regMaskTP internalIntCandidates = (internalCands & allRegs(TYP_INT));
+ if (((int)genCountBits(internalIntCandidates)) == internalIntCount)
{
- assert(tree->gtCall.IsVarargs());
fixedRegs = true;
- assert((int)genCountBits(internalCands) == internalIntCount);
}
for (count = 0; count < internalIntCount; count++)
@@ -3317,6 +3317,50 @@ LinearScan::insertZeroInitRefPositions()
}
}
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+// -----------------------------------------------------------------------
+// Sets the register state for an argument of type STRUCT for System V systems.
+// See Compiler::raUpdateRegStateForArg(RegState *regState, LclVarDsc *argDsc) in regalloc.cpp
+// for how the state for an argument is updated for Unix non-structs and Windows AMD64 structs.
+void
+LinearScan::unixAmd64UpdateRegStateForArg(LclVarDsc* argDsc)
+{
+ assert(argDsc->lvType == TYP_STRUCT);
+ RegState * intRegState = &compiler->codeGen->intRegState;
+ RegState * floatRegState = &compiler->codeGen->floatRegState;
+
+ if ((argDsc->lvArgReg != REG_STK) && (argDsc->lvArgReg != REG_NA))
+ {
+ if (genRegMask(argDsc->lvArgReg) & (RBM_ALLFLOAT))
+ {
+ assert(genRegMask(argDsc->lvArgReg) & (RBM_FLTARG_REGS));
+ floatRegState->rsCalleeRegArgMaskLiveIn |= genRegMask(argDsc->lvArgReg);
+ }
+ else
+ {
+ assert(genRegMask(argDsc->lvArgReg) & (RBM_ARG_REGS));
+ intRegState->rsCalleeRegArgMaskLiveIn |= genRegMask(argDsc->lvArgReg);
+ }
+ }
+
+
+ if ((argDsc->lvOtherArgReg != REG_STK) && (argDsc->lvOtherArgReg != REG_NA))
+ {
+ if (genRegMask(argDsc->lvOtherArgReg) & (RBM_ALLFLOAT))
+ {
+ assert(genRegMask(argDsc->lvOtherArgReg) & (RBM_FLTARG_REGS));
+ floatRegState->rsCalleeRegArgMaskLiveIn |= genRegMask(argDsc->lvOtherArgReg);
+ }
+ else
+ {
+ assert(genRegMask(argDsc->lvOtherArgReg) & (RBM_ARG_REGS));
+ intRegState->rsCalleeRegArgMaskLiveIn |= genRegMask(argDsc->lvOtherArgReg);
+ }
+ }
+}
+
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
//------------------------------------------------------------------------
// updateRegStateForArg: Updates rsCalleeRegArgMaskLiveIn for the appropriate
// regState (either compiler->intRegState or compiler->floatRegState),
@@ -3339,31 +3383,41 @@ LinearScan::insertZeroInitRefPositions()
void
LinearScan::updateRegStateForArg(LclVarDsc* argDsc)
{
- RegState * intRegState = &compiler->codeGen->intRegState;
- RegState * floatRegState = &compiler->codeGen->floatRegState;
-
- // In the case of AMD64 we'll still use the floating point registers
- // to model the register usage for argument on vararg calls, so
- // we will ignore the varargs condition to determine whether we use
- // XMM registers or not for setting up the call.
- bool isFloat = (isFloatRegType(argDsc->lvType)
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // For System V AMD64 calls the argDsc can have 2 registers (for structs.)
+ // Handle them here.
+ if (argDsc->lvType == TYP_STRUCT)
+ {
+ unixAmd64UpdateRegStateForArg(argDsc);
+ }
+ else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ RegState * intRegState = &compiler->codeGen->intRegState;
+ RegState * floatRegState = &compiler->codeGen->floatRegState;
+ // In the case of AMD64 we'll still use the floating point registers
+ // to model the register usage for argument on vararg calls, so
+ // we will ignore the varargs condition to determine whether we use
+ // XMM registers or not for setting up the call.
+ bool isFloat = (isFloatRegType(argDsc->lvType)
#ifndef _TARGET_AMD64_
- && !compiler->info.compIsVarArgs
+ && !compiler->info.compIsVarArgs
#endif
- );
+ );
#ifdef _TARGET_ARM_
- if (argDsc->lvIsHfaRegArg) isFloat = true;
+ if (argDsc->lvIsHfaRegArg) isFloat = true;
#endif // _TARGET_ARM_
- if (isFloat)
- {
- JITDUMP("Float arg V%02u in reg %s\n", (argDsc - compiler->lvaTable), getRegName(argDsc->lvArgReg));
- compiler->raUpdateRegStateForArg(floatRegState, argDsc);
- }
- else
- {
- JITDUMP("Int arg V%02u in reg %s\n", (argDsc - compiler->lvaTable), getRegName(argDsc->lvArgReg));
- compiler->raUpdateRegStateForArg(intRegState, argDsc);
+ if (isFloat)
+ {
+ JITDUMP("Float arg V%02u in reg %s\n", (argDsc - compiler->lvaTable), getRegName(argDsc->lvArgReg));
+ compiler->raUpdateRegStateForArg(floatRegState, argDsc);
+ }
+ else
+ {
+ JITDUMP("Int arg V%02u in reg %s\n", (argDsc - compiler->lvaTable), getRegName(argDsc->lvArgReg));
+ compiler->raUpdateRegStateForArg(intRegState, argDsc);
+ }
}
}
@@ -3548,7 +3602,9 @@ LinearScan::buildIntervals()
// won't have done dataflow on it, but it needs to be marked as live-in so
// it will get saved in the prolog.
if (!compiler->compJmpOpUsed && argDsc->lvRefCnt == 0 && !compiler->opts.compDbgCode)
+ {
continue;
+ }
if (argDsc->lvIsRegArg) updateRegStateForArg(argDsc);
diff --git a/src/jit/lsra.h b/src/jit/lsra.h
index e57873fb65..cef6669513 100644
--- a/src/jit/lsra.h
+++ b/src/jit/lsra.h
@@ -574,6 +574,14 @@ private:
void buildUpperVectorRestoreRefPositions(GenTree *tree, LsraLocation currentLoc, VARSET_VALARG_TP liveLargeVectors);
#endif //FEATURE_SIMD
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+    // For AMD64 on System V machines. This method
+    // is called as a replacement for raUpdateRegStateForArg,
+    // which is used on Windows. On System V systems a struct can be passed
+    // partially using registers from the two register files.
+ void unixAmd64UpdateRegStateForArg(LclVarDsc* argDsc);
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
// Update reg state for an incoming register argument
void updateRegStateForArg(LclVarDsc* argDsc);
@@ -998,7 +1006,6 @@ private:
// Set of large vector (TYP_SIMD32 on AVX) variables to consider for callee-save registers.
VARSET_TP largeVectorCalleeSaveCandidateVars;
#endif // FEATURE_SIMD
-
};
/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp
index f3eb506b0d..b000f58969 100644
--- a/src/jit/morph.cpp
+++ b/src/jit/morph.cpp
@@ -926,6 +926,7 @@ fgArgInfo::fgArgInfo(Compiler * comp, GenTreePtr call, unsigned numArgs)
argTableSize = numArgs; // the allocated table size
argsComplete = false;
argsSorted = false;
+
if (argTableSize == 0)
argTable = NULL;
else
@@ -1127,7 +1128,6 @@ void fgArgInfo::AddArg(fgArgTabEntryPtr curArgTabEntry)
argCount++;
}
-
fgArgTabEntryPtr fgArgInfo::AddRegArg(unsigned argNum,
GenTreePtr node,
GenTreePtr parent,
@@ -1137,38 +1137,79 @@ fgArgTabEntryPtr fgArgInfo::AddRegArg(unsigned argNum,
{
fgArgTabEntryPtr curArgTabEntry = new(compiler, CMK_fgArgInfo) fgArgTabEntry;
- curArgTabEntry->argNum = argNum;
- curArgTabEntry->node = node;
- curArgTabEntry->parent = parent;
- curArgTabEntry->regNum = regNum;
- curArgTabEntry->slotNum = 0;
- curArgTabEntry->numRegs = numRegs;
- curArgTabEntry->numSlots = 0;
- curArgTabEntry->alignment = alignment;
- curArgTabEntry->lateArgInx = (unsigned) -1;
- curArgTabEntry->tmpNum = (unsigned) -1;
- curArgTabEntry->isSplit = false;
- curArgTabEntry->isTmp = false;
- curArgTabEntry->needTmp = false;
- curArgTabEntry->needPlace = false;
- curArgTabEntry->processed = false;
- curArgTabEntry->isHfaRegArg = false;
- curArgTabEntry->isBackFilled = false;
- curArgTabEntry->isNonStandard = false;
+ curArgTabEntry->argNum = argNum;
+ curArgTabEntry->node = node;
+ curArgTabEntry->parent = parent;
+ curArgTabEntry->regNum = regNum;
+ curArgTabEntry->slotNum = 0;
+ curArgTabEntry->numRegs = numRegs;
+ curArgTabEntry->numSlots = 0;
+ curArgTabEntry->alignment = alignment;
+ curArgTabEntry->lateArgInx = (unsigned)-1;
+ curArgTabEntry->tmpNum = (unsigned)-1;
+ curArgTabEntry->isSplit = false;
+ curArgTabEntry->isTmp = false;
+ curArgTabEntry->needTmp = false;
+ curArgTabEntry->needPlace = false;
+ curArgTabEntry->processed = false;
+ curArgTabEntry->isHfaRegArg = false;
+ curArgTabEntry->isBackFilled = false;
+ curArgTabEntry->isNonStandard = false;
AddArg(curArgTabEntry);
return curArgTabEntry;
}
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+fgArgTabEntryPtr fgArgInfo::AddRegArg(unsigned argNum,
+ GenTreePtr node,
+ GenTreePtr parent,
+ regNumber regNum,
+ unsigned numRegs,
+ unsigned alignment,
+ const bool isStruct,
+ const regNumber otherRegNum,
+ const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* const structDescPtr)
+{
+ fgArgTabEntryPtr curArgTabEntry = AddRegArg(argNum, node, parent, regNum, numRegs, alignment);
+ assert(curArgTabEntry != nullptr);
+
+ // The node of the ArgTabEntry could change after remorphing - it could be rewritten to a cpyblk or a
+ // PlaceHolder node (in case of needed late argument, for example.)
+    // This requires the use of an extra flag. At creation time the state is correct,
+    // and this assert enforces that.
+ assert((node->gtType == TYP_STRUCT && isStruct) || (node->gtType != TYP_STRUCT && !isStruct));
+ curArgTabEntry->otherRegNum = otherRegNum; // Second reg for the struct
+ curArgTabEntry->isStruct = isStruct; // is this a struct arg
+
+ if (isStruct && structDescPtr != nullptr)
+ {
+ curArgTabEntry->structDesc.CopyFrom(*structDescPtr);
+ }
+
+ return curArgTabEntry;
+}
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
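A hedged sketch of how a caller is expected to use this overload (simplified from the fgMorphArgs changes later in this diff; the local names are illustrative, not taken from the patch):

    // Classify the struct first, then record both registers and the descriptor.
    SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
    eeGetSystemVAmd64PassStructInRegisterDescriptor(clsHnd, &structDesc);

    if (structDesc.passedInRegisters)
    {
        // firstReg holds the first eightbyte; secondReg holds the second, or REG_STK
        // when there is none (mirroring the non-struct call sites that pass REG_STK).
        call->fgArgInfo->AddRegArg(argIndex, argx, parent,
                                   firstReg, numRegs, alignment,
                                   /* isStruct */ true, secondReg, &structDesc);
    }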
+
fgArgTabEntryPtr fgArgInfo::AddStkArg(unsigned argNum,
GenTreePtr node,
GenTreePtr parent,
unsigned numSlots,
- unsigned alignment)
+ unsigned alignment
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(const bool isStruct))
{
fgArgTabEntryPtr curArgTabEntry = new(compiler, CMK_fgArgInfo) fgArgTabEntry;
- nextSlotNum = (unsigned) roundUp(nextSlotNum, alignment);
+ nextSlotNum = (unsigned)roundUp(nextSlotNum, alignment);
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // The node of the ArgTabEntry could change after remorphing - it could be rewritten to a cpyblk or a
+ // PlaceHolder node (in case of needed late argument, for example.)
+    // This requires the use of an extra flag. At creation time the state is correct,
+    // and this assert enforces that.
+ assert((node->gtType == TYP_STRUCT && isStruct) || (node->gtType != TYP_STRUCT && !isStruct));
+ curArgTabEntry->isStruct = isStruct; // is this a struct arg
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
curArgTabEntry->argNum = argNum;
curArgTabEntry->node = node;
@@ -1399,9 +1440,24 @@ void fgArgInfo::ArgsComplete()
for (unsigned curInx = 0; curInx < argCount; curInx++)
{
- fgArgTabEntryPtr curArgTabEntry = argTable[curInx]; assert(curArgTabEntry != NULL);
+ fgArgTabEntryPtr curArgTabEntry = argTable[curInx];
+ assert(curArgTabEntry != NULL);
GenTreePtr argx = curArgTabEntry->node;
+        // If this is a struct, mark it as needing a tempVar.
+        // For the copyblk and store cases this should have minimal perf impact, since
+        // the local vars we copy/store to already exist and the temp var logic
+        // will not create a new one when making a tempVar from another tempVar.
+        // (Debugging through the code, no new copy of the data was created, nor a new tempVar.)
+        // The need for this arises from Lower::LowerArg.
+        // In the case of a copyblk or store operation, the NewPutArg method would
+        // not be invoked and the struct would not be loaded to be passed in
+        // registers or by value on the stack.
+ if (argx->TypeGet() == TYP_STRUCT FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY( || curArgTabEntry->isStruct))
+ {
+ curArgTabEntry->needTmp = true;
+ }
+
if (curArgTabEntry->regNum == REG_STK)
{
hasStackArgs = true;
@@ -1415,8 +1471,11 @@ void fgArgInfo::ArgsComplete()
}
else // we have a register argument, next we look for a TYP_STRUCT
{
- if (argx->TypeGet() == TYP_STRUCT)
+ if (argx->TypeGet() == TYP_STRUCT
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY( || curArgTabEntry->isStruct))
+ {
hasStructRegArg = true;
+ }
}
/* If the argument tree contains an assignment (GTF_ASG) then the argument and
@@ -1461,7 +1520,6 @@ void fgArgInfo::ArgsComplete()
}
}
-
#if FEATURE_FIXED_OUT_ARGS
// Like calls, if this argument has a tree that will do an inline throw,
// a call to a jit helper, then we need to treat it like a call (but only
@@ -1917,7 +1975,11 @@ void fgArgInfo::SortArgs()
argsSorted = true;
}
-GenTreePtr Compiler::fgMakeTmpArgNode(unsigned tmpVarNum)
+// This function creates a tmp var only if needed.
+// We need this to be done in order to enforce the ordering
+// of argument evaluation. There are times this function will not be called for an argument at all.
+GenTreePtr Compiler::fgMakeTmpArgNode(unsigned tmpVarNum
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(const bool passedInRegisters))
{
LclVarDsc * varDsc = &lvaTable[tmpVarNum];
assert(varDsc->lvIsTemp);
@@ -1926,9 +1988,12 @@ GenTreePtr Compiler::fgMakeTmpArgNode(unsigned tmpVarNum)
// Create a copy of the temp to go into the late argument list
GenTreePtr arg = gtNewLclvNode(tmpVarNum, type);
-#ifdef _TARGET_AMD64_
+#if defined(_TARGET_AMD64_)
if (type == TYP_STRUCT)
{
+
+
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
switch (lvaLclExactSize(tmpVarNum))
{
case 1: type = TYP_BYTE; break;
@@ -1953,6 +2018,8 @@ GenTreePtr Compiler::fgMakeTmpArgNode(unsigned tmpVarNum)
default:
break;
}
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+
// If we didn't change the type of the struct, it means
// its structure doesn't support to be passed directly through a
// register, so we need to pass a pointer to the destination where
@@ -1960,7 +2027,23 @@ GenTreePtr Compiler::fgMakeTmpArgNode(unsigned tmpVarNum)
if (type == TYP_STRUCT)
{
arg->gtFlags |= GTF_DONT_CSE;
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+            // If it is passed in registers, don't take the address of the var. Make it a
+            // GT_LCL_FLD instead. It will be loaded into registers with a putarg_reg tree in Lower.
+ if (passedInRegisters)
+ {
+ arg->ChangeOper(GT_LCL_FLD);
+ arg->gtType = type;
+ }
+ else
+ {
+ arg = gtNewOperNode(GT_ADDR, TYP_STRUCT, arg);
+ }
+#else // FEATURE_UNIX_AMD64_STRUCT_PASSING
arg = gtNewOperNode(GT_ADDR, TYP_I_IMPL, arg);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
}
else
{
@@ -1973,10 +2056,8 @@ GenTreePtr Compiler::fgMakeTmpArgNode(unsigned tmpVarNum)
arg->gtFlags |= GTF_DONT_CSE;
arg = gtNewOperNode(GT_ADDR, TYP_I_IMPL, arg);
-
// Ldobj the temp to use it as a call argument
- arg = new (this, GT_LDOBJ) GenTreeLdObj(TYP_STRUCT, arg, lvaGetStruct(tmpVarNum)
- );
+ arg = new (this, GT_LDOBJ) GenTreeLdObj(TYP_STRUCT, arg, lvaGetStruct(tmpVarNum));
arg->gtFlags |= GTF_EXCEPT;
#endif // _TARGET_AMD64_
@@ -2007,7 +2088,7 @@ void fgArgInfo::EvalArgsToTemps()
// Only the register arguments need to be replaced with placeholders node
// stacked arguments are evaluated and pushed in order
//
- if (curArgTabEntry->regNum == REG_STK)
+ if (curArgTabEntry->regNum == REG_STK && !curArgTabEntry->needTmp)
continue;
#endif
@@ -2019,9 +2100,11 @@ void fgArgInfo::EvalArgsToTemps()
{
// Create a copy of the temp to go into the late argument list
tmpVarNum = curArgTabEntry->tmpNum;
- defArg = compiler->fgMakeTmpArgNode(tmpVarNum);
+ defArg = compiler->fgMakeTmpArgNode(
+ tmpVarNum
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(argTable[curInx]->structDesc.passedInRegisters));
- /* mark the original node as a late argument */
+ // mark the original node as a late argument
argx->gtFlags |= GTF_LATE_ARG;
}
else
@@ -2036,7 +2119,7 @@ void fgArgInfo::EvalArgsToTemps()
}
#endif
-#ifdef _TARGET_AMD64_
+#if defined(_TARGET_AMD64_) && !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
noway_assert(argx->gtType != TYP_STRUCT);
#endif
@@ -2160,11 +2243,11 @@ void fgArgInfo::EvalArgsToTemps()
/* For a TYP_STRUCT we also need to record the class handle of the arg */
CORINFO_CLASS_HANDLE clsHnd = NULL;
-#ifdef _TARGET_AMD64_
+#if defined(_TARGET_AMD64_) && !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
noway_assert(argx->gtType != TYP_STRUCT);
-#else // _TARGET_AMD664_
+#else // _TARGET_AMD64_
if (defArg->gtType == TYP_STRUCT)
{
@@ -2429,6 +2512,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
#endif
unsigned argSlots = 0;
+ unsigned nonRegPassedStructSlots = 0;
bool lateArgsComputed = (call->gtCallLateArgs != nullptr);
bool callHasRetBuffArg = ((call->gtCallMoreFlags & GTF_CALL_M_RETBUFFARG) != 0);
@@ -2606,13 +2690,19 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
(call->gtCallObjp->gtType == TYP_I_IMPL));
/* this is a register argument - put it in the table */
- call->fgArgInfo->AddRegArg(argIndex, argx, NULL, genMapIntRegArgNumToRegNum(intArgRegNum), 1, 1);
+ call->fgArgInfo->AddRegArg(argIndex, argx, NULL, genMapIntRegArgNumToRegNum(intArgRegNum), 1, 1
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ , false, REG_STK, nullptr
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ );
}
else
{
/* this is a register argument - possibly update it in the table */
call->fgArgInfo->RemorphRegArg(argIndex, argx, NULL, genMapIntRegArgNumToRegNum(intArgRegNum), 1, 1);
}
+ // this can't be a struct.
+ assert(argx->gtType != TYP_STRUCT);
/* Increment the argument register count and argument index */
if (!varTypeIsFloating(argx->gtType))
@@ -2714,9 +2804,22 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
#endif // _TARGET_ARM_
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ bool nonRegPassableStruct = false;
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+ bool hasStructArgument = false;
for (args = call->gtCallArgs; args; args = args->gtOp.gtOp2)
{
GenTreePtr * parentArgx = &args->gtOp.gtOp1;
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (!hasStructArgument)
+ {
+ hasStructArgument = (args->gtOp.gtOp1->TypeGet() == TYP_STRUCT);
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
argx = fgMorphTree(*parentArgx);
*parentArgx = argx;
flagsSummary |= argx->gtFlags;
@@ -2741,7 +2844,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
unsigned size = 0;
CORINFO_CLASS_HANDLE copyBlkClass = NULL;
- bool isRegArg;
+ bool isRegArg = false;
fgArgTabEntryPtr argEntry = NULL;
@@ -2816,14 +2919,20 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
}
#elif defined(_TARGET_AMD64_)
-
- passUsingFloatRegs = varTypeIsFloating(argx);
-
#if defined(UNIX_AMD64_ABI)
+ if (lateArgsComputed)
+ {
+ passUsingFloatRegs = isValidFloatArgReg(argEntry->regNum);
+ }
+ else
+ {
+ passUsingFloatRegs = varTypeIsFloating(argx);
+ }
bool passUsingIntRegs;
passUsingIntRegs = passUsingFloatRegs ? false : (intArgRegNum < MAX_REG_ARG);
-#endif // UNIX_AMD64_ABI
-
+#else // !UNIX_AMD64_ABI
+ passUsingFloatRegs = varTypeIsFloating(argx);
+#endif // !UNIX_AMD64_ABI
#elif defined(_TARGET_X86_)
passUsingFloatRegs = false;
@@ -2836,6 +2945,12 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
unsigned nextFltArgRegNum = fltArgRegNum; // This is the next floating-point argument register number to use
var_types structBaseType = TYP_STRUCT;
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ unsigned int structFloatRegs = 0;
+ unsigned int structIntRegs = 0;
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ bool isStructArg = argx->gtType == TYP_STRUCT;
+
if (lateArgsComputed)
{
assert(argEntry != NULL);
@@ -2870,12 +2985,24 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
// stack slots, or both if the argument is split between the registers and the stack.
//
- if (argx->IsArgPlaceHolderNode() || (argx->gtType != TYP_STRUCT))
+ if (argx->IsArgPlaceHolderNode() || (!isStructArg))
{
#if defined(_TARGET_AMD64_)
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (!isStructArg)
+ {
+ size = 1; // On AMD64, all primitives fit in a single (64-bit) 'slot'
+ }
+ else
+ {
+ size = (unsigned)(roundUp(info.compCompHnd->getClassSize(argx->gtArgPlace.gtArgPlaceClsHnd), TARGET_POINTER_SIZE)) / TARGET_POINTER_SIZE;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(argx->gtArgPlace.gtArgPlaceClsHnd, &structDesc);
+ }
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
size = 1; // On AMD64, all primitives fit in a single (64-bit) 'slot'
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
#elif defined(_TARGET_ARM64_)
- if (argx->gtType == TYP_STRUCT)
+ if (isStructArg)
{
// Structs are either passed in 1 or 2 (64-bit) slots
size = (unsigned)(roundUp(info.compCompHnd->getClassSize(argx->gtArgPlace.gtArgPlaceClsHnd), TARGET_POINTER_SIZE)) / TARGET_POINTER_SIZE;
@@ -2891,7 +3018,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
size = 1; // On ARM64, all primitives fit in a single (64-bit) 'slot'
}
#elif defined(_TARGET_ARM_)
- if (argx->gtType == TYP_STRUCT)
+ if (isStructArg)
{
size = (unsigned)(roundUp(info.compCompHnd->getClassSize(argx->gtArgPlace.gtArgPlaceClsHnd), TARGET_POINTER_SIZE)) / TARGET_POINTER_SIZE;
}
@@ -2915,10 +3042,26 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
else // argx->gtType == TYP_STRUCT
{
/* We handle two opcodes: GT_MKREFANY and GT_LDOBJ */
- if (argx->gtOper == GT_MKREFANY)
+ if (argx->gtOper == GT_MKREFANY)
{
+ if (argx->TypeGet() == TYP_STRUCT)
+ {
+ isStructArg = true;
+ }
#ifdef _TARGET_AMD64_
- size = 1;
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (argx->TypeGet() == TYP_STRUCT)
+ {
+ size = info.compCompHnd->getClassSize(impGetRefAnyClass());
+ unsigned roundupSize = (unsigned)roundUp(size, TARGET_POINTER_SIZE);
+ size = roundupSize / TARGET_POINTER_SIZE;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(impGetRefAnyClass(), &structDesc);
+ }
+ else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ size = 1;
+ }
#else
size = 2;
#endif
@@ -2942,22 +3085,42 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
BADCODE("illegal argument tree in fgMorphArgs");
CORINFO_CLASS_HANDLE ldObjClass = argLdobj->gtLdObj.gtClass;
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(ldObjClass, &structDesc);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
unsigned originalSize = info.compCompHnd->getClassSize(ldObjClass);
+ originalSize = (originalSize == 0 ? TARGET_POINTER_SIZE : originalSize);
unsigned roundupSize = (unsigned)roundUp(originalSize, TARGET_POINTER_SIZE);
bool passStructByRef = false;
#ifndef _TARGET_X86_
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
// Check for TYP_STRUCT argument with size 1, 2, 4 or 8 bytes
// As we can optimize these by turning them into a GT_IND of the correct type
- if ((originalSize > TARGET_POINTER_SIZE) || ((originalSize & (originalSize-1)) != 0))
+ if ((originalSize > TARGET_POINTER_SIZE) || ((originalSize & (originalSize - 1)) != 0))
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
{
// Normalize 'size' to the number of pointer sized items
// 'size' is the number of register slots that we will use to pass the argument
size = roundupSize / TARGET_POINTER_SIZE;
#if defined(_TARGET_AMD64_)
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
size = 1; // This must be copied to a temp and passed by address
passStructByRef = true;
copyBlkClass = ldObjClass;
+#else // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (!structDesc.passedInRegisters)
+ {
+ passStructByRef = false;
+ copyBlkClass = NULL;
+ }
+ else
+ {
+ passStructByRef = true;
+ copyBlkClass = ldObjClass;
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
#elif defined(_TARGET_ARM64_)
if (size > 2)
{
@@ -2985,6 +3148,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
}
#endif // _TARGET_ARM_
}
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
else
{
// change our GT_LDOBJ into a GT_IND of the correct type
@@ -3109,10 +3273,10 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
size = 1;
}
-#endif // not _TARGET_X86_
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+#endif // not _TARGET_X86_
// We still have a TYP_STRUCT unless we converted the GT_LDOBJ into a GT_IND above...
-
if ((structBaseType == TYP_STRUCT) && !passStructByRef)
{
// if the valuetype size is not a multiple of sizeof(void*),
@@ -3158,8 +3322,23 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
//
// Figure out if the argument will be passed in a register.
//
+ bool passedInRegisters = true;
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ passedInRegisters = !isStructArg;
+ if (!passedInRegisters)
+ {
+ if (structDesc.passedInRegisters)
+ {
+ passedInRegisters = true;
+ }
+ else
+ {
+ passedInRegisters = false;
+ }
+ }
- if (isRegParamType(genActualType(argx->TypeGet())))
+#endif
+ if (passedInRegisters && isRegParamType(genActualType(argx->TypeGet())))
{
#ifdef _TARGET_ARM_
if (passUsingFloatRegs)
@@ -3192,13 +3371,48 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
}
#else // _TARGET_ARM_
#if defined(UNIX_AMD64_ABI)
- if (passUsingFloatRegs)
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // Here a struct can be passed in registers, following the classification of its members and its size.
+ // Now make sure there are actually enough registers to do so.
+ if (isStructArg)
{
- isRegArg = fltArgRegNum < MAX_FLOAT_REG_ARG;
+ for (unsigned int i = 0; i < structDesc.eightByteCount; i++)
+ {
+ if (structDesc.eightByteClassifications[i] == SystemVClassificationTypeInteger ||
+ structDesc.eightByteClassifications[i] == SystemVClassificationTypeIntegerReference)
+ {
+ structIntRegs++;
+ }
+ else if (structDesc.eightByteClassifications[i] == SystemVClassificationTypeSSE)
+ {
+ structFloatRegs++;
+ }
+ }
+
+ if (((nextFltArgRegNum + structFloatRegs) > MAX_FLOAT_REG_ARG) ||
+ ((intArgRegNum + structIntRegs) > MAX_REG_ARG))
+ {
+ isRegArg = false;
+ nonRegPassableStruct = true;
+ }
+ else
+ {
+ isRegArg = true;
+ nonRegPassableStruct = false;
+ }
}
else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
{
- isRegArg = intArgRegNum < MAX_REG_ARG;
+ if (passUsingFloatRegs)
+ {
+ isRegArg = nextFltArgRegNum < MAX_FLOAT_REG_ARG;
+ }
+ else
+ {
+ isRegArg = intArgRegNum < MAX_REG_ARG;
+ }
}
#else // !defined(UNIX_AMD64_ABI)
isRegArg = intArgRegNum < maxRegArgs;
@@ -3208,6 +3422,10 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
else
{
isRegArg = false;
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ nonRegPassableStruct = true;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
}
}
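A struct classified for register passing only stays a register argument if enough integer and SSE argument registers remain; otherwise the whole struct goes to the stack. A minimal standalone sketch of that check, assuming stand-in names for the classification enum and the System V register limits (the real code uses SystemVClassificationType*, MAX_REG_ARG and MAX_FLOAT_REG_ARG):

// Illustrative sketch, not the JIT's code: count the registers a classified
// struct needs and compare against what is still available.
enum SysVClass { SysVInteger, SysVIntegerReference, SysVSSE };

struct StructDesc
{
    unsigned  eightByteCount;               // at most 2 for structs passed in registers
    SysVClass eightByteClassifications[2];
};

const unsigned kMaxIntArgRegs   = 6;        // RDI, RSI, RDX, RCX, R8, R9
const unsigned kMaxFloatArgRegs = 8;        // XMM0-XMM7

bool FitsInRemainingRegs(const StructDesc& desc, unsigned intRegsUsed, unsigned fltRegsUsed)
{
    unsigned needInt = 0, needFlt = 0;
    for (unsigned i = 0; i < desc.eightByteCount; i++)
    {
        if (desc.eightByteClassifications[i] == SysVSSE)
            needFlt++;
        else
            needInt++;                      // Integer or IntegerReference eightbyte
    }
    // The ABI never splits a struct between registers and the stack, so either
    // both eightbytes fit in registers or the whole struct is passed on the stack.
    return (intRegsUsed + needInt <= kMaxIntArgRegs) &&
           (fltRegsUsed + needFlt <= kMaxFloatArgRegs);
}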
@@ -3245,16 +3463,67 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
}
#endif // _TARGET_ARM_
-
if (isRegArg)
{
- // fill in or update the argInfo table
+ regNumber nextRegNum = REG_STK;
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ regNumber nextOtherRegNum = REG_STK;
+
+ if (isStructArg)
+ {
+ // It is a struct passed in registers. Assign the next available register.
+ unsigned int curIntReg = intArgRegNum;
+ unsigned int curFloatReg = nextFltArgRegNum;
+ for (unsigned int i = 0; i < structDesc.eightByteCount; i++)
+ {
+ if (structDesc.eightByteClassifications[i] == SystemVClassificationTypeInteger ||
+ structDesc.eightByteClassifications[i] == SystemVClassificationTypeIntegerReference)
+ {
+ if (i == 0)
+ {
+ nextRegNum = genMapIntRegArgNumToRegNum(curIntReg);
+ }
+ else if (i == 1)
+ {
+ nextOtherRegNum = genMapIntRegArgNumToRegNum(curIntReg);
+ }
+ else
+ {
+ assert(false && "fgMorphArgs Invalid index for int classification.");
+ }
- regNumber nextRegNum = passUsingFloatRegs ? genMapFloatRegArgNumToRegNum(nextFltArgRegNum) : genMapIntRegArgNumToRegNum(intArgRegNum);
+ curIntReg++;
+ }
+ else if (structDesc.eightByteClassifications[i] == SystemVClassificationTypeSSE)
+ {
+ if (i == 0)
+ {
+ nextRegNum = genMapFloatRegArgNumToRegNum(curFloatReg);
+ }
+ else if (i == 1)
+ {
+ nextOtherRegNum = genMapFloatRegArgNumToRegNum(curFloatReg);
+ }
+ else
+ {
+ assert(false && "fgMorphArgs Invalid index for SSE classification.");
+ }
+ curFloatReg++;
+ }
+ }
+ }
+ else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ // fill in or update the argInfo table
+ nextRegNum = passUsingFloatRegs ? genMapFloatRegArgNumToRegNum(nextFltArgRegNum) : genMapIntRegArgNumToRegNum(intArgRegNum);
+ }
#ifdef _TARGET_AMD64_
- assert(size == 1);
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ assert(size == 1);
+#endif
#endif
#ifndef LEGACY_BACKEND
@@ -3263,14 +3532,18 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
//
// They should not affect the placement of any other args or stack space required.
// Example: on AMD64 R10 and R11 are used for indirect VSD (generic interface) and cookie calls.
-
bool nonStandardFound = false;
for (int i=0; i<nonStandardArgs.Height(); i++)
{
hasNonStandardArg = true;
if (argx == nonStandardArgs.Index(i).node)
{
- fgArgTabEntry* argEntry = call->fgArgInfo->AddRegArg(argIndex, argx, args, nonStandardArgs.Index(i).reg, size, argAlign);
+ fgArgTabEntry* argEntry = call->fgArgInfo->AddRegArg(argIndex, argx,
+ args, nonStandardArgs.Index(i).reg, size, argAlign
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ , isStructArg, nextOtherRegNum, &structDesc
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ );
argEntry->isNonStandard = true;
argIndex++;
nonStandardFound = true;
@@ -3283,9 +3556,13 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
if (!lateArgsComputed)
{
- /* This is a register argument - put it in the table */
-
- fgArgTabEntryPtr newArg = call->fgArgInfo->AddRegArg(argIndex, argx, args, nextRegNum, size, argAlign);
+ // This is a register argument - put it in the table
+ fgArgTabEntryPtr newArg = call->fgArgInfo->AddRegArg(
+ argIndex, argx, args, nextRegNum, size, argAlign
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ , isStructArg, nextOtherRegNum, &structDesc
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ );
(void)newArg; //prevent "unused variable" error from GCC
#ifdef _TARGET_ARM_
newArg->SetIsHfaRegArg(passUsingFloatRegs && isHfaArg); // Note that an HFA is passed in int regs for varargs
@@ -3294,7 +3571,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
}
else
{
- /* This is a register argument - possibly update it in the table */
+ // This is a register argument - possibly update it in the table
fgArgTabEntryPtr entry = call->fgArgInfo->RemorphRegArg(argIndex, argx, args, nextRegNum, size, argAlign);
if (entry->isNonStandard)
{
@@ -3306,45 +3583,55 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
// Setup the next argRegNum value
if (!isBackFilled)
{
- if (passUsingFloatRegs)
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (isStructArg)
{
- fltArgRegNum += size;
-#if defined(_TARGET_AMD64_) && !defined(UNIX_AMD64_ABI)
- argSkippedRegMask |= genMapArgNumToRegMask(intArgRegNum, TYP_I_IMPL);
- intArgRegNum = min(intArgRegNum + size, MAX_REG_ARG);
-#endif // _TARGET_AMD64_
-#ifdef _TARGET_ARM_
- if (fltArgRegNum > MAX_FLOAT_REG_ARG)
- {
- // This indicates a partial enregistration of a struct type
- assert(argx->gtType == TYP_STRUCT);
- unsigned numRegsPartial = size - (fltArgRegNum - MAX_FLOAT_REG_ARG);
- assert((unsigned char)numRegsPartial == numRegsPartial);
- call->fgArgInfo->SplitArg(argIndex, numRegsPartial, size - numRegsPartial);
- fltArgRegNum = MAX_FLOAT_REG_ARG;
- }
-#endif // _TARGET_ARM_
+ intArgRegNum += structIntRegs;
+ fltArgRegNum += structFloatRegs;
}
else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
{
- intArgRegNum += size;
+ if (passUsingFloatRegs)
+ {
+ fltArgRegNum += size;
#if defined(_TARGET_AMD64_) && !defined(UNIX_AMD64_ABI)
- fltArgSkippedRegMask |= genMapArgNumToRegMask(fltArgRegNum, TYP_DOUBLE);
- fltArgRegNum = min(fltArgRegNum + size, MAX_FLOAT_REG_ARG);
+ argSkippedRegMask |= genMapArgNumToRegMask(intArgRegNum, TYP_I_IMPL);
+ intArgRegNum = min(intArgRegNum + size, MAX_REG_ARG);
#endif // _TARGET_AMD64_
#ifdef _TARGET_ARM_
- if (intArgRegNum > MAX_REG_ARG)
- {
- // This indicates a partial enregistration of a struct type
- assert((argx->gtType == TYP_STRUCT) || argx->OperIsCopyBlkOp() ||
- (argx->gtOper == GT_COMMA && (args->gtFlags & GTF_ASG)));
- unsigned numRegsPartial = size - (intArgRegNum - MAX_REG_ARG);
- assert((unsigned char)numRegsPartial == numRegsPartial);
- call->fgArgInfo->SplitArg(argIndex, numRegsPartial, size - numRegsPartial);
- intArgRegNum = MAX_REG_ARG;
- fgPtrArgCntCur += size - numRegsPartial;
+ if (fltArgRegNum > MAX_FLOAT_REG_ARG)
+ {
+ // This indicates a partial enregistration of a struct type
+ assert(isStructArg);
+ unsigned numRegsPartial = size - (fltArgRegNum - MAX_FLOAT_REG_ARG);
+ assert((unsigned char)numRegsPartial == numRegsPartial);
+ call->fgArgInfo->SplitArg(argIndex, numRegsPartial, size - numRegsPartial);
+ fltArgRegNum = MAX_FLOAT_REG_ARG;
+ }
+#endif // _TARGET_ARM_
}
+ else
+ {
+ intArgRegNum += size;
+#if defined(_TARGET_AMD64_) && !defined(UNIX_AMD64_ABI)
+ fltArgSkippedRegMask |= genMapArgNumToRegMask(fltArgRegNum, TYP_DOUBLE);
+ fltArgRegNum = min(fltArgRegNum + size, MAX_FLOAT_REG_ARG);
+#endif // _TARGET_AMD64_
+#ifdef _TARGET_ARM_
+ if (intArgRegNum > MAX_REG_ARG)
+ {
+ // This indicates a partial enregistration of a struct type
+ assert((isStructArg) || argx->OperIsCopyBlkOp() ||
+ (argx->gtOper == GT_COMMA && (args->gtFlags & GTF_ASG)));
+ unsigned numRegsPartial = size - (intArgRegNum - MAX_REG_ARG);
+ assert((unsigned char)numRegsPartial == numRegsPartial);
+ call->fgArgInfo->SplitArg(argIndex, numRegsPartial, size - numRegsPartial);
+ intArgRegNum = MAX_REG_ARG;
+ fgPtrArgCntCur += size - numRegsPartial;
+ }
#endif // _TARGET_ARM_
+ }
}
}
}
@@ -3352,27 +3639,28 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
{
fgPtrArgCntCur += size;
- /* If the register arguments have not been determined then we must fill in the argInfo */
+ // If the register arguments have not been determined then we must fill in the argInfo
if (!lateArgsComputed)
{
- /* This is a stack argument - put it in the table */
- call->fgArgInfo->AddStkArg(argIndex, argx, args, size, argAlign);
+ // This is a stack argument - put it in the table
+ call->fgArgInfo->AddStkArg(argIndex, argx, args, size, argAlign FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(isStructArg));
+
}
else
{
- /* This is a stack argument - possibly update it in the table */
+ // This is a stack argument - possibly update it in the table
call->fgArgInfo->RemorphStkArg(argIndex, argx, args, size, argAlign);
}
}
-
if (copyBlkClass != NULL)
{
noway_assert(!lateArgsComputed);
- fgMakeOutgoingStructArgCopy(call, args, argIndex, copyBlkClass);
+ fgMakeOutgoingStructArgCopy(call, args, argIndex, copyBlkClass FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(&structDesc));
}
#ifdef _TARGET_AMD64_
+
if (argx->gtOper == GT_MKREFANY)
{
// 'Lower' the MKREFANY tree and insert it.
@@ -3406,10 +3694,15 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
}
#endif // _TARGET_AMD64_
-
argIndex++;
- argSlots += size;
-
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (nonRegPassableStruct)
+ {
+ nonRegPassedStructSlots += size;
+ }
+ else
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ argSlots += size;
} // end foreach argument loop
if (!lateArgsComputed)
@@ -3478,18 +3771,17 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
// and ignores floating point args (it is overly conservative in that case).
if (argSlots <= MAX_REG_ARG)
{
- preallocatedArgCount = 0;
+ preallocatedArgCount = nonRegPassedStructSlots;
}
else
{
- preallocatedArgCount = argSlots - MAX_REG_ARG;
+ preallocatedArgCount = argSlots + nonRegPassedStructSlots - MAX_REG_ARG;
}
#elif defined(_TARGET_AMD64_)
preallocatedArgCount = max(4, argSlots);
#else
#error Unsupported or unset target architecture
#endif // _TARGET_*
-
if (preallocatedArgCount * REGSIZE_BYTES > lvaOutgoingArgSpaceSize)
{
lvaOutgoingArgSpaceSize = preallocatedArgCount * REGSIZE_BYTES;
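The sizing rule in the branch above reserves outgoing stack slots both for arguments that overflow the integer argument registers and, new with this change, for struct arguments that cannot be passed in registers at all. A one-function sketch of that rule with illustrative names:

// Sketch only; mirrors the preallocation rule above, not the JIT's exact code.
unsigned PreallocatedArgSlots(unsigned argSlots,                // slots used by register-passable args
                              unsigned nonRegPassedStructSlots, // slots for structs forced to the stack
                              unsigned maxRegArgs)              // 6 integer arg registers on SysV AMD64
{
    if (argSlots <= maxRegArgs)
        return nonRegPassedStructSlots;
    return argSlots + nonRegPassedStructSlots - maxRegArgs;
}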
@@ -3514,39 +3806,242 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
// If the register arguments have already been determined
// or we have no register arguments then we are done.
- if (lateArgsComputed || (intArgRegNum == 0 && fltArgRegNum == 0 && !hasNonStandardArg))
+ bool needEvalArgsToTemps = true;
+
+ if (lateArgsComputed || (intArgRegNum == 0 && fltArgRegNum == 0 && !hasNonStandardArg && !hasStructArgument))
{
- return call;
+ needEvalArgsToTemps = false;
}
- // This is the first time that we morph this call AND it has register arguments.
- // Follow into the code below and do the 'defer or eval to temp' analysis.
+ if (needEvalArgsToTemps)
+ {
+ // This is the first time that we morph this call AND it has register arguments.
+ // Follow into the code below and do the 'defer or eval to temp' analysis.
- call->fgArgInfo->SortArgs();
+ call->fgArgInfo->SortArgs();
- call->fgArgInfo->EvalArgsToTemps();
+ call->fgArgInfo->EvalArgsToTemps();
- // We may have updated the arguments
- if (call->gtCallArgs)
- {
- UpdateGT_LISTFlags(call->gtCallArgs);
+ // We may have updated the arguments
+ if (call->gtCallArgs)
+ {
+ UpdateGT_LISTFlags(call->gtCallArgs);
+ }
}
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // Rewrite the struct args to be passed by value on stack or in registers.
+ fgMorphSystemVStructArgs(call, hasStructArgument);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
return call;
}
#ifdef _PREFAST_
#pragma warning(pop)
#endif
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+// fgMorphSystemVStructArgs:
+// Rewrite the struct args to be passed by value on stack or in registers.
+//
+// args:
+// call: The call whose arguments need to be morphed.
+// hasStructArgument: Whether this call has struct arguments.
+//
+void Compiler::fgMorphSystemVStructArgs(GenTreeCall* call, bool hasStructArgument)
+{
+ unsigned flagsSummary = 0;
+ GenTreePtr args;
+ GenTreePtr argx;
+
+ if (hasStructArgument)
+ {
+ fgArgInfoPtr allArgInfo = call->fgArgInfo;
+
+ for (args = call->gtCallArgs; args; args = args->gtOp.gtOp2)
+ {
+ // For late arguments the arg tree that is overridden is in the gtCallLateArgs list.
+ // For such late args the gtCallArgList contains the setup arg node (evaluating the arg.)
+ // The tree from the gtCallLateArgs list is passed to the callee. The fgArgEntry node contains the mapping
+ // between the nodes in both lists. If the arg is not a late arg, fgArgEntry->node points to itself;
+ // otherwise it points to the node in the late args list.
+ bool isLateArg = (args->gtOp.gtOp1->gtFlags & GTF_LATE_ARG) != 0;
+ fgArgTabEntryPtr fgEntryPtr = gtArgEntryByNode(call, args->gtOp.gtOp1);
+ assert(fgEntryPtr != nullptr);
+ GenTreePtr argx = fgEntryPtr->node;
+ GenTreePtr lateList = nullptr;
+ GenTreePtr lateNode = nullptr;
+
+ if (isLateArg)
+ {
+ for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext())
+ {
+ assert(list->IsList());
+
+ GenTreePtr argNode = list->Current();
+ if (argx == argNode)
+ {
+ lateList = list;
+ lateNode = argNode;
+ break;
+ }
+ }
+ assert(lateList != nullptr && lateNode != nullptr);
+ }
+ GenTreePtr arg = argx;
+ bool argListCreated = false;
+
+ var_types type = arg->TypeGet();
+
+ if (type == TYP_STRUCT)
+ {
+ // If we have already processed the arg...
+ if (arg->OperGet() == GT_LIST && arg->TypeGet() == TYP_STRUCT)
+ {
+ continue;
+ }
+
+ // If it is already a GT_LDOBJ, it has been set up properly.
+ if (arg->OperGet() == GT_LDOBJ)
+ {
+ assert(!fgEntryPtr->structDesc.passedInRegisters);
+ continue;
+ }
+
+ assert(
+ arg->OperGet() == GT_ADDR ||
+ arg->OperGet() == GT_LCL_FLD ||
+ arg->OperGet() == GT_LCL_VAR);
+
+ assert(
+ arg->OperGet() == GT_LCL_VAR ||
+ arg->OperGet() == GT_LCL_FLD ||
+ arg->gtOp.gtOp1->OperGet() == GT_LCL_FLD ||
+ arg->gtOp.gtOp1->OperGet() == GT_LCL_VAR);
+
+ GenTreeLclVarCommon* lclCommon = arg->OperGet() == GT_ADDR ?
+ arg->gtOp.gtOp1->AsLclVarCommon() : arg->AsLclVarCommon();
+ if (fgEntryPtr->structDesc.passedInRegisters)
+ {
+ if (fgEntryPtr->structDesc.eightByteCount == 1)
+ {
+ // Change the type; the code below will change the LclVar to a LCL_FLD.
+ type = GetTypeFromClassificationAndSizes(fgEntryPtr->structDesc.eightByteClassifications[0], fgEntryPtr->structDesc.eightByteSizes[0]);
+ }
+ else if (fgEntryPtr->structDesc.eightByteCount == 2)
+ {
+ // Create LCL_FLD for each eightbyte.
+ argListCreated = true;
+
+ // Second eightbyte.
+ GenTreeLclFld* newLclField = new(this, GT_LCL_FLD) GenTreeLclFld(
+ GetTypeFromClassificationAndSizes(
+ fgEntryPtr->structDesc.eightByteClassifications[1],
+ fgEntryPtr->structDesc.eightByteSizes[1]),
+ lclCommon->gtLclNum,
+ fgEntryPtr->structDesc.eightByteOffsets[1]);
+ GenTreeArgList* secondNode = gtNewListNode(newLclField, nullptr);
+ secondNode->gtType = TYP_STRUCT; // Preserve the TYP_STRUCT. It is a special case.
+ newLclField->gtFieldSeq = FieldSeqStore::NotAField();
+
+ // First field
+ arg->AsLclFld()->gtFieldSeq = FieldSeqStore::NotAField();
+ arg->gtType = GetTypeFromClassificationAndSizes(
+ fgEntryPtr->structDesc.eightByteClassifications[0],
+ fgEntryPtr->structDesc.eightByteSizes[0]);
+ arg = gtNewListNode(arg, secondNode);
+ arg->gtType = TYP_STRUCT; // Preserve the TYP_STRUCT. It is a special case.
+ }
+ else
+ {
+ assert(false && "More than two eightbytes detected for CLR."); // No more than two eightbytes for the CLR.
+ }
+ }
+
+ // If we didn't change the type of the struct, it means
+ // its classification doesn't allow it to be passed directly in
+ // registers, so we need to pass a pointer to the destination
+ // where we copied the struct to.
+ if (!argListCreated)
+ {
+ if (fgEntryPtr->structDesc.passedInRegisters)
+ {
+ arg->gtType = type;
+ }
+ else
+ {
+ arg->gtType = TYP_I_IMPL;
+
+ // Make sure this is an addr node.
+ if (arg->OperGet() != GT_ADDR && arg->OperGet() != GT_LCL_VAR_ADDR)
+ {
+ arg = gtNewOperNode(GT_ADDR, TYP_I_IMPL, arg);
+ }
+
+ assert(arg->OperGet() == GT_ADDR || arg->OperGet() == GT_LCL_VAR_ADDR);
+
+ // Ldobj the temp to use it as a call argument
+ arg = new (this, GT_LDOBJ) GenTreeLdObj(TYP_STRUCT, arg, lvaGetStruct(lclCommon->gtLclNum));
+ arg->gtFlags |= GTF_EXCEPT;
+ flagsSummary |= GTF_EXCEPT;
+ }
+ }
+ }
+
+ if (argx != arg)
+ {
+ bool isLateArg = (args->gtOp.gtOp1->gtFlags & GTF_LATE_ARG) != 0;
+ fgArgTabEntryPtr fgEntryPtr = gtArgEntryByNode(call, args->gtOp.gtOp1);
+ assert(fgEntryPtr != nullptr);
+ GenTreePtr argx = fgEntryPtr->node;
+ GenTreePtr lateList = nullptr;
+ GenTreePtr lateNode = nullptr;
+ if (isLateArg)
+ {
+ for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext())
+ {
+ assert(list->IsList());
+
+ GenTreePtr argNode = list->Current();
+ if (argx == argNode)
+ {
+ lateList = list;
+ lateNode = argNode;
+ break;
+ }
+ }
+ assert(lateList != nullptr && lateNode != nullptr);
+ }
+
+ fgEntryPtr->node = arg;
+ if (isLateArg)
+ {
+ lateList->gtOp.gtOp1 = arg;
+ }
+ else
+ {
+ args->gtOp.gtOp1 = arg;
+ }
+ }
+ }
+ }
+
+ // Update the flags
+ call->gtFlags |= (flagsSummary & GTF_ALL_EFFECT);
+}
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
// Make a copy of a struct variable if necessary, to pass to a callee.
// returns: tree that computes address of the outgoing arg
void
-Compiler::fgMakeOutgoingStructArgCopy(GenTreeCall* call, GenTree* args, unsigned argIndex, CORINFO_CLASS_HANDLE copyBlkClass)
+Compiler::fgMakeOutgoingStructArgCopy(GenTreeCall* call,
+ GenTree* args,
+ unsigned argIndex,
+ CORINFO_CLASS_HANDLE copyBlkClass
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* const structDescPtr))
{
GenTree* argx = args->Current();
-
noway_assert(argx->gtOper != GT_MKREFANY);
-
// See if we need to insert a copy at all
// Case 1: don't need a copy if it is the last use of a local. We can't determine that all of the time
// but if there is only one use and no loops, the use must be last.
@@ -3616,8 +4111,6 @@ Compiler::fgMakeOutgoingStructArgCopy(GenTreeCall* call, GenTree* args, unsigned
fgCurrentlyInUseArgTemps->setBit(tmp);
-
-
// TYP_SIMD structs should not be enregistered, since ABI requires it to be
// allocated on stack and address of it needs to be passed.
if (lclVarIsSIMDType(tmp))
@@ -3648,13 +4141,16 @@ Compiler::fgMakeOutgoingStructArgCopy(GenTreeCall* call, GenTree* args, unsigned
#if FEATURE_FIXED_OUT_ARGS
// Do the copy early, and evaluate the temp later (see EvalArgsToTemps)
+ // On Unix, create LCL_FLD nodes for structs passed in more than one register. See fgMakeTmpArgNode.
GenTreePtr arg = copyBlk;
#else // FEATURE_FIXED_OUT_ARGS
// Structs are always on the stack, and thus never need temps
// so we have to put the copy and temp all into one expression
- GenTreePtr arg = fgMakeTmpArgNode(tmp);
+ GenTreePtr arg = fgMakeTmpArgNode(
+ tmp
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(structDescPtr->passedInRegisters));
// Change the expression to "(tmp=val),tmp"
arg = gtNewOperNode(GT_COMMA, arg->TypeGet(), copyBlk, arg);
@@ -3718,30 +4214,60 @@ void Compiler::fgFixupStructReturn(GenTreePtr call)
{
bool callHasRetBuffArg = ((call->gtCall.gtCallMoreFlags & GTF_CALL_M_RETBUFFARG) != 0);
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ if (!callHasRetBuffArg && call->TypeGet() == TYP_STRUCT && call->gtCall.gtRetClsHnd != NO_CLASS_HANDLE)
+ {
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(GetStructClassHandle(call), &structDesc);
+ }
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
if (!callHasRetBuffArg && call->TypeGet() == TYP_STRUCT)
{
-#ifdef _TARGET_ARM_
+#if defined(_TARGET_ARM_)
if (call->gtCall.IsVarargs() || !IsHfa(call))
-#endif
+#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (!structDesc.passedInRegisters)
+#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
{
// Now that we are past the importer, re-type this node so the register predictor does
// the right thing
call->gtType = genActualType((var_types)call->gtCall.gtReturnType);
}
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ else
+ {
+ if (structDesc.passedInRegisters && structDesc.eightByteCount <= 1)
+ {
+ call->gtType = genActualType(getEightByteType(structDesc, 0));
+ }
+ }
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
}
-
#ifdef _TARGET_ARM_
// Either we don't have a struct now or if struct, then it is HFA returned in regs.
assert(call->TypeGet() != TYP_STRUCT || (IsHfa(call) && !callHasRetBuffArg));
#else
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // Either we don't have a struct now, or if we do, it is returned in registers or via a return buffer.
+ assert((call->TypeGet() != TYP_STRUCT) ||
+ (structDesc.passedInRegisters) ||
+ (callHasRetBuffArg));
+#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
// No more struct returns
assert(call->TypeGet() != TYP_STRUCT);
+#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
#endif
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // If a struct is returned in registers, there might still be a retbuf (homing space for the return) and the call type stays TYP_STRUCT.
+ assert(!callHasRetBuffArg || (call->TypeGet() == TYP_VOID) || (call->TypeGet() == TYP_STRUCT));
+#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
// If it was a struct return, it has been transformed into a call
// with a return buffer (that returns TYP_VOID) or into a return
// of a primitive/enregisterable type
assert(!callHasRetBuffArg || (call->TypeGet() == TYP_VOID));
+#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
}
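fgFixupStructReturn above retypes a call returning a single-eightbyte struct to that eightbyte's primitive type. A hedged sketch of the classification-and-size to type mapping it relies on; the real helpers are getEightByteType and GetTypeFromClassificationAndSizes, and the enum/type names below are stand-ins:

#include <cassert>

enum SysVClass { SysVInteger, SysVIntegerReference, SysVSSE };
enum VarType   { VT_BYTE, VT_SHORT, VT_INT, VT_LONG, VT_REF, VT_FLOAT, VT_DOUBLE };

VarType TypeFromClassificationAndSize(SysVClass cls, unsigned size)
{
    if (cls == SysVSSE)
    {
        return (size == 4) ? VT_FLOAT : VT_DOUBLE;  // 4- or 8-byte SSE eightbyte
    }
    if (cls == SysVIntegerReference)
    {
        assert(size == 8);                          // GC references are pointer sized
        return VT_REF;
    }
    switch (size)                                   // plain integer eightbyte
    {
        case 1:  return VT_BYTE;
        case 2:  return VT_SHORT;
        case 4:  return VT_INT;
        default: return VT_LONG;
    }
}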
@@ -4698,7 +5224,6 @@ GenTreePtr Compiler::fgMorphField(GenTreePtr tree, MorphAddrContext* ma
);
}
#endif
-
if (fldOffset != 0)
{
// Generate the "addr" node.
@@ -5180,6 +5705,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee)
}
// Get the size of the struct and see if it is 1, 2, 4 or 8 bytes in size
+ // For AMD64 Unix, the call below checks whether the struct can be passed in registers.
if (argx->OperGet() == GT_LDOBJ)
{
#ifdef _TARGET_AMD64_
@@ -5634,6 +6160,13 @@ GenTreePtr Compiler::fgMorphCall(GenTreeCall* call)
call->gtCallMoreFlags &= ~GTF_CALL_M_IMPLICIT_TAILCALL;
#endif
+#ifdef FEATURE_PAL
+ if (!canFastTailCall && szFailReason == nullptr)
+ {
+ szFailReason = "Non fast tail calls disabled for PAL based systems.";
+ }
+#endif // FEATURE_PAL
+
if (szFailReason != nullptr)
{
#ifdef DEBUG
@@ -5659,13 +6192,6 @@ GenTreePtr Compiler::fgMorphCall(GenTreeCall* call)
compCurBB->bbJumpKind = BBJ_RETURN;
#endif
-#ifdef FEATURE_PAL
- if (!canFastTailCall)
- {
- goto NO_TAIL_CALL;
- }
-#endif // FEATURE_PAL
-
// Set this flag before calling fgMorphCall() to prevent inlining this call.
call->gtCallMoreFlags |= GTF_CALL_M_TAILCALL;
@@ -5847,6 +6373,13 @@ GenTreePtr Compiler::fgMorphCall(GenTreeCall* call)
// This is a HFA, use float 0.
callType = TYP_FLOAT;
}
+#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // Return a dummy node, as the return is already removed.
+ if (callType == TYP_STRUCT)
+ {
+ // This is a register-returned struct. Return a 0.
+ callType = TYP_INT;
+ }
#endif
result = gtNewZeroConNode(genActualType(callType));
result = fgMorphTree(result);
@@ -5990,7 +6523,6 @@ NO_TAIL_CALL:
retValTmpNum = lvaGrabTemp(true DEBUGARG("substitute local for ret buff arg"));
lvaSetStruct(retValTmpNum, structHnd, true);
-
dest = gtNewOperNode(GT_ADDR, TYP_BYREF, gtNewLclvNode(retValTmpNum, TYP_STRUCT));
}
}
@@ -6400,6 +6932,7 @@ ONE_SIMPLE_ASG:
if (lclVarTree->TypeGet() == TYP_STRUCT &&
(lvaTable[lclNum].lvPromoted || lclVarIsSIMDType(lclNum)))
{
+
// Let fgMorphInitBlock handle it. (Since we'll need to do field-var-wise assignments.)
goto GENERAL_BLKOP;
}
@@ -7203,8 +7736,13 @@ GenTreePtr Compiler::fgMorphCopyBlock(GenTreePtr tree)
{
// Spill the (complex) address to a BYREF temp.
// Note, at most one address may need to be spilled.
-
addrSpillTemp = lvaGrabTemp(true DEBUGARG("BlockOp address local"));
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ lvaTable[addrSpillTemp].lvType = TYP_I_IMPL;
+
+ tree = gtNewAssignNode(gtNewLclvNode(addrSpillTemp, TYP_I_IMPL),
+ addrSpill);
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
lvaTable[addrSpillTemp].lvType = TYP_BYREF;
if (addrSpillIsStackDest)
@@ -7214,6 +7752,8 @@ GenTreePtr Compiler::fgMorphCopyBlock(GenTreePtr tree)
tree = gtNewAssignNode(gtNewLclvNode(addrSpillTemp, TYP_BYREF),
addrSpill);
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+
#ifndef LEGACY_BACKEND
// If we are assigning the address of a LclVar here
// liveness does not account for this kind of address taken use.
@@ -9529,7 +10069,7 @@ COMPARE:
case GT_ADD:
-CM_OVF_OP:
+ CM_OVF_OP :
if (tree->gtOverflow())
{
tree->gtRequestSetFlags();
@@ -10906,7 +11446,9 @@ ASG_OP:
if (add->IsCnsIntOrI() && (op2->GetScaleIndexMul() != 0))
{
if (tree->gtOverflow() || op1->gtOverflow())
+ {
break;
+ }
ssize_t imul = op2->gtIntCon.gtIconVal;
ssize_t iadd = add->gtIntCon.gtIconVal;
@@ -12825,7 +13367,11 @@ void Compiler::fgMorphBlocks()
//replace the GT_RETURN node to be a GT_ASG that stores the return value into genReturnLocal.
if (genReturnLocal != BAD_VAR_NUM)
{
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ noway_assert(info.compRetType != TYP_VOID);
+#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
noway_assert(info.compRetType != TYP_VOID && info.compRetNativeType != TYP_STRUCT);
+#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
noway_assert(block->bbTreeList);
GenTreePtr last = block->bbTreeList->gtPrev;
@@ -13834,9 +14380,9 @@ void Compiler::fgPromoteStructs()
break;
}
-#ifdef _TARGET_ARM_
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
if (!varDsc->lvDontPromote)
-#endif // _TARGET_ARM_
+#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
{
#ifdef FEATURE_SIMD
if (varDsc->lvSIMDType && varDsc->lvUsedInSIMDIntrinsic)
@@ -14154,6 +14700,8 @@ void Compiler::fgMarkImplicitByRefArgs()
size = info.compCompHnd->getClassSize(typeHnd);
}
+
+#if !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
#if defined(_TARGET_AMD64_)
if (size > REGSIZE_BYTES || (size & (size - 1)) != 0)
#elif defined(_TARGET_ARM64_)
@@ -14184,6 +14732,7 @@ void Compiler::fgMarkImplicitByRefArgs()
varDsc->lvKeepType = 1;
#endif // DEBUG
}
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
}
}
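The largest morph.cpp addition above, fgMorphSystemVStructArgs, re-expresses a struct argument passed in registers as one primitive piece per eightbyte, each typed by its classification (one piece retypes the node in place, two pieces become a GT_LIST of LCL_FLDs). A data-only sketch of that splitting idea, using mock descriptors instead of GenTree nodes:

#include <vector>

enum SysVClass { SysVInteger, SysVIntegerReference, SysVSSE };

struct EightByte   { SysVClass cls; unsigned size; unsigned offset; }; // mock of the struct descriptor
struct FieldAccess { unsigned offset; unsigned size; bool isFloat;  }; // stands in for a typed LCL_FLD

// Produce the per-eightbyte accesses that replace the original struct-typed argument.
std::vector<FieldAccess> SplitStructArg(const std::vector<EightByte>& layout)
{
    std::vector<FieldAccess> pieces;
    for (const EightByte& eb : layout)
    {
        pieces.push_back({ eb.offset, eb.size, eb.cls == SysVSSE });
    }
    return pieces; // one piece: retype the node; two pieces: build a list node
}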
diff --git a/src/jit/regalloc.cpp b/src/jit/regalloc.cpp
index 839f497f4a..89945301f0 100644
--- a/src/jit/regalloc.cpp
+++ b/src/jit/regalloc.cpp
@@ -667,7 +667,7 @@ void Compiler::raSetupArgMasks(RegState *regState)
#endif // LEGACY_BACKEND
// The code to set the regState for each arg is outlined for shared use
-// by linear scan
+// by linear scan. (It is not shared for the System V AMD64 platform.)
regNumber Compiler::raUpdateRegStateForArg(RegState *regState, LclVarDsc *argDsc)
{
regNumber inArgReg = argDsc->lvArgReg;
diff --git a/src/jit/scopeinfo.cpp b/src/jit/scopeinfo.cpp
index a108713792..53a5960967 100644
--- a/src/jit/scopeinfo.cpp
+++ b/src/jit/scopeinfo.cpp
@@ -909,21 +909,65 @@ void CodeGen::psiBegProlog()
psiScope * newScope = psiNewPrologScope(varScope->vsdLVnum,
varScope->vsdVarNum);
- if (lclVarDsc1->lvIsRegArg)
+ if (lclVarDsc1->lvIsRegArg)
{
-#ifdef DEBUG
- var_types regType = compiler->mangleVarArgsType(lclVarDsc1->TypeGet());
-#ifdef _TARGET_ARM_
- if (lclVarDsc1->lvIsHfaRegArg)
+ bool isStructHandled = false;
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ if (lclVarDsc1->TypeGet() == TYP_STRUCT)
{
- regType = lclVarDsc1->GetHfaType();
+ CORINFO_CLASS_HANDLE typeHnd = lclVarDsc1->lvVerTypeInfo.GetClassHandle();
+ assert(typeHnd != nullptr);
+ compiler->eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc);
+ assert(structDesc.passedInRegisters);
+
+ for (unsigned nCnt = 0; nCnt < structDesc.eightByteCount; nCnt++)
+ {
+ unsigned len = structDesc.eightByteSizes[nCnt];
+ var_types regType = TYP_UNDEF;
+ regNumber regNum = REG_NA;
+ if (nCnt == 0)
+ {
+ regNum = lclVarDsc1->lvArgReg;
+ }
+ else if (nCnt == 1)
+ {
+ regNum = lclVarDsc1->lvOtherArgReg;
+ }
+ else
+ {
+ assert(false && "Invalid eightbyte number.");
+ }
+
+ regType = compiler->getEightByteType(structDesc, nCnt);
+#ifdef DEBUG
+ regType = compiler->mangleVarArgsType(regType);
+ assert(genMapRegNumToRegArgNum(regNum, regType) != (unsigned)-1);
+#endif // DEBUG
+
+ newScope->scRegister = true;
+ newScope->u1.scRegNum = (regNumberSmall)regNum;
+ }
+
+ isStructHandled = true;
}
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (!isStructHandled)
+ {
+#ifdef DEBUG
+ var_types regType = compiler->mangleVarArgsType(lclVarDsc1->TypeGet());
+#ifdef _TARGET_ARM_
+ if (lclVarDsc1->lvIsHfaRegArg)
+ {
+ regType = lclVarDsc1->GetHfaType();
+ }
#endif // _TARGET_ARM_
- assert(genMapRegNumToRegArgNum(lclVarDsc1->lvArgReg, regType) != (unsigned)-1);
+ assert(genMapRegNumToRegArgNum(lclVarDsc1->lvArgReg, regType) != (unsigned)-1);
#endif // DEBUG
- newScope->scRegister = true;
- newScope->u1.scRegNum = (regNumberSmall) lclVarDsc1->lvArgReg;
+ newScope->scRegister = true;
+ newScope->u1.scRegNum = (regNumberSmall)lclVarDsc1->lvArgReg;
+ }
}
else
{
diff --git a/src/jit/target.h b/src/jit/target.h
index f4aad4e153..767eb31d8d 100644
--- a/src/jit/target.h
+++ b/src/jit/target.h
@@ -19,6 +19,12 @@
#endif
#endif
+#if (defined(FEATURE_CORECLR) && defined(PLATFORM_UNIX))
+#define FEATURE_VARARG 0
+#else // !(defined(FEATURE_CORECLR) && defined(PLATFORM_UNIX))
+#define FEATURE_VARARG 1
+#endif // !(defined(FEATURE_CORECLR) && defined(PLATFORM_UNIX))
+
/*****************************************************************************/
// The following are intended to capture only those #defines that cannot be replaced
// with static const members of Target
@@ -971,10 +977,28 @@ typedef unsigned short regPairNoSmall; // arm: need 12 bits
#define REG_LNGRET REG_EAX
#define RBM_LNGRET RBM_EAX
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ #define REG_INTRET_1 REG_RDX
+ #define RBM_INTRET_1 RBM_RDX
+
+ #define REG_LNGRET_1 REG_RDX
+ #define RBM_LNGRET_1 RBM_RDX
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+
#define REG_FLOATRET REG_XMM0
#define RBM_FLOATRET RBM_XMM0
+ #define REG_DOUBLERET REG_XMM0
#define RBM_DOUBLERET RBM_XMM0
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+#define REG_FLOATRET_1 REG_XMM1
+#define RBM_FLOATRET_1 RBM_XMM1
+
+#define REG_DOUBLERET_1 REG_XMM1
+#define RBM_DOUBLERET_1 RBM_XMM1
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
#define REG_FPBASE REG_EBP
#define RBM_FPBASE RBM_EBP
#define STR_FPBASE "rbp"
@@ -1872,7 +1896,7 @@ extern const regMaskSmall regMasks[REG_COUNT];
inline regMaskTP genRegMask(regNumber reg)
{
assert((unsigned)reg < ArrLen(regMasks));
-#if defined _TARGET_AMD64_
+#ifdef _TARGET_AMD64_
// shift is faster than a L1 hit on modern x86
// (L1 latency on sandy bridge is 4 cycles for [base] and 5 for [base + index*c] )
// the reason this is AMD-only is because the x86 BE will try to get reg masks for REG_STK
diff --git a/src/pal/src/cruntime/printfcpp.cpp b/src/pal/src/cruntime/printfcpp.cpp
index 87cd8a8aff..8adf3470c2 100644
--- a/src/pal/src/cruntime/printfcpp.cpp
+++ b/src/pal/src/cruntime/printfcpp.cpp
@@ -2306,7 +2306,7 @@ int CoreVfprintf(CPalThread *pthrCurrent, PAL_FILE *stream, const char *format,
if (!Length)
{
ASSERT("WideCharToMultiByte failed. Error is %d\n",
- GetLastError());
+ GetLastError());
PERF_EXIT(vfprintf);
va_end(ap);
return -1;
diff --git a/src/vm/amd64/calldescrworkeramd64.S b/src/vm/amd64/calldescrworkeramd64.S
index efee6f325a..ca4fd703c6 100644
--- a/src/vm/amd64/calldescrworkeramd64.S
+++ b/src/vm/amd64/calldescrworkeramd64.S
@@ -108,11 +108,43 @@ LOCAL_LABEL(NoFloatArguments):
je LOCAL_LABEL(ReturnsFloat)
cmp ecx, 8
je LOCAL_LABEL(ReturnsDouble)
- // unexpected
+
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // Struct with two integer eightbytes
+ cmp ecx, 16
+ jne LOCAL_LABEL(NotTwoIntegerEightbytes)
+ mov qword ptr [rbx+CallDescrData__returnValue], rax
+ mov qword ptr [rbx+CallDescrData__returnValue + 8], rdx
+ jmp LOCAL_LABEL(Epilog)
+
+LOCAL_LABEL(NotTwoIntegerEightbytes):
+ // Struct with the first eightbyte SSE and the second one integer
+ cmp ecx, 16 + 1
+ jne LOCAL_LABEL(NotFirstSSESecondIntegerEightbyte)
+ movsd real8 ptr [rbx+CallDescrData__returnValue], xmm0
+ mov qword ptr [rbx+CallDescrData__returnValue + 8], rax
+ jmp LOCAL_LABEL(Epilog)
+
+LOCAL_LABEL(NotFirstSSESecondIntegerEightbyte):
+ // Struct with the first eightbyte integer and the second one SSE
+ cmp ecx, 16 + 2
+ jne LOCAL_LABEL(NotFirstIntegerSecondSSEEightbyte)
+ mov qword ptr [rbx+CallDescrData__returnValue], rax
+ movsd real8 ptr [rbx+CallDescrData__returnValue + 8], xmm0
+ jmp LOCAL_LABEL(Epilog)
+
+LOCAL_LABEL(NotFirstIntegerSecondSSEEightbyte):
+ // Struct with two SSE eightbytes
+ cmp ecx, 16 + 3
+ jne LOCAL_LABEL(Epilog) // unexpected
+ movsd real8 ptr [rbx+CallDescrData__returnValue], xmm0
+ movsd real8 ptr [rbx+CallDescrData__returnValue + 8], xmm1
+#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
+
jmp LOCAL_LABEL(Epilog)
LOCAL_LABEL(ReturnsInt):
- mov [rbx+CallDescrData__returnValue], rax
+ mov qword ptr [rbx+CallDescrData__returnValue], rax
LOCAL_LABEL(Epilog):
lea rsp, [rbp - 8] // deallocate arguments
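The new branches above pick where to read a struct returned in registers based on a small flags value: 16 plus an SSE bit per eightbyte, as the cmp constants suggest. A hedged C++ sketch of that decoding; the constant names and the ReturnedRegs mock are made up for illustration:

#include <cstdint>
#include <cstring>

// Illustrative constants matching a reading of the cmp values above; not CLR names.
const uint32_t kRetStructInRegs       = 16; // struct returned in two eightbytes
const uint32_t kRetFirstEightbyteSSE  = 1;  // first eightbyte comes back in xmm0
const uint32_t kRetSecondEightbyteSSE = 2;  // second eightbyte comes back in an xmm register

struct ReturnedRegs { uint64_t rax, rdx; double xmm0, xmm1; }; // mock register snapshot

static uint64_t Bits(double d) { uint64_t b; std::memcpy(&b, &d, sizeof(b)); return b; }

// Store the 16-byte struct return into 'dest' the same way the stub fills
// CallDescrData::returnValue.
void StoreStructReturn(uint32_t kind, const ReturnedRegs& r, uint64_t dest[2])
{
    if (kind < kRetStructInRegs)
        return; // scalar return, handled by the earlier branches

    bool firstSSE  = (kind & kRetFirstEightbyteSSE)  != 0;
    bool secondSSE = (kind & kRetSecondEightbyteSSE) != 0;

    dest[0] = firstSSE ? Bits(r.xmm0) : r.rax;
    // If only the second eightbyte is SSE it is still the first SSE eightbyte
    // of the return, so it arrives in xmm0; the integer one arrives in rax.
    dest[1] = secondSSE ? Bits(firstSSE ? r.xmm1 : r.xmm0)
                        : (firstSSE ? r.rax : r.rdx);
}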
diff --git a/src/vm/amd64/cgenamd64.cpp b/src/vm/amd64/cgenamd64.cpp
index e9c1ad468b..51738684ad 100644
--- a/src/vm/amd64/cgenamd64.cpp
+++ b/src/vm/amd64/cgenamd64.cpp
@@ -323,8 +323,16 @@ void HijackFrame::UpdateRegDisplay(const PREGDISPLAY pRD)
UpdateRegDisplayFromCalleeSavedRegisters(pRD, &(m_Args->Regs));
+#ifdef UNIX_AMD64_ABI
+ pRD->pCurrentContextPointers->Rsi = NULL;
+ pRD->pCurrentContextPointers->Rdi = NULL;
+#endif
pRD->pCurrentContextPointers->Rcx = NULL;
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ pRD->pCurrentContextPointers->Rdx = (PULONG64)&m_Args->Rdx;
+#else // FEATURE_UNIX_AMD64_STRUCT_PASSING
pRD->pCurrentContextPointers->Rdx = NULL;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
pRD->pCurrentContextPointers->R8 = NULL;
pRD->pCurrentContextPointers->R9 = NULL;
pRD->pCurrentContextPointers->R10 = NULL;
diff --git a/src/vm/amd64/cgencpu.h b/src/vm/amd64/cgencpu.h
index 39b8ba91de..de64b1600b 100644
--- a/src/vm/amd64/cgencpu.h
+++ b/src/vm/amd64/cgencpu.h
@@ -66,14 +66,15 @@ EXTERN_C void FastCallFinalizeWorker(Object *obj, PCODE funcPtr);
#define CACHE_LINE_SIZE 64 // Current AMD64 processors have 64-byte cache lines as per AMD64 optmization manual
#define LOG2SLOT LOG2_PTRSIZE
-#define ENREGISTERED_RETURNTYPE_MAXSIZE 8 // bytes
#define ENREGISTERED_RETURNTYPE_INTEGER_MAXSIZE 8 // bytes
#define ENREGISTERED_PARAMTYPE_MAXSIZE 8 // bytes
#ifdef UNIX_AMD64_ABI
-#define CALLDESCR_ARGREGS 1 // CallDescrWorker has ArgumentRegister parameter
-#define CALLDESCR_FPARGREGS 1 // CallDescrWorker has FloatArgumentRegisters parameter
+#define ENREGISTERED_RETURNTYPE_MAXSIZE 16 // bytes
+#define CALLDESCR_ARGREGS 1 // CallDescrWorker has ArgumentRegister parameter
+#define CALLDESCR_FPARGREGS 1 // CallDescrWorker has FloatArgumentRegisters parameter
#else
+#define ENREGISTERED_RETURNTYPE_MAXSIZE 8 // bytes
#define COM_STUBS_SEPARATE_FP_LOCATIONS
#define CALLDESCR_REGTYPEMAP 1
#endif
@@ -265,9 +266,11 @@ struct CalleeSavedRegistersPointers {
#ifdef UNIX_AMD64_ABI
+#define NUM_FLOAT_ARGUMENT_REGISTERS 8
+
typedef DPTR(struct FloatArgumentRegisters) PTR_FloatArgumentRegisters;
struct FloatArgumentRegisters {
- M128A d[8]; // xmm0-xmm7
+ M128A d[NUM_FLOAT_ARGUMENT_REGISTERS]; // xmm0-xmm7
};
#endif
@@ -475,11 +478,23 @@ struct DECLSPEC_ALIGN(8) UMEntryThunkCode
struct HijackArgs
{
+#ifndef PLATFORM_UNIX
union
{
ULONG64 Rax;
ULONG64 ReturnValue;
};
+#else // PLATFORM_UNIX
+ union
+ {
+ struct
+ {
+ ULONG64 Rax;
+ ULONG64 Rdx;
+ };
+ ULONG64 ReturnValue[2];
+ };
+#endif // PLATFORM_UNIX
CalleeSavedRegisters Regs;
union
{
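The PLATFORM_UNIX HijackArgs layout above lets a hijack handler view the two return registers either individually or as a two-element array covering a 16-byte struct return. A tiny standalone mock (not the VM struct) showing the aliasing; the anonymous struct inside a union is a compiler extension the real header also relies on:

#include <cstdint>
#include <cassert>

struct MockHijackArgs
{
    union
    {
        struct { uint64_t Rax; uint64_t Rdx; }; // integer return registers
        uint64_t ReturnValue[2];                // same storage, viewed as a struct return
    };
};

int main()
{
    MockHijackArgs a{};
    a.Rax = 0x1111;
    a.Rdx = 0x2222;
    assert(a.ReturnValue[0] == 0x1111 && a.ReturnValue[1] == 0x2222);
    return 0;
}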
diff --git a/src/vm/amd64/unixasmhelpers.S b/src/vm/amd64/unixasmhelpers.S
index 21a8f63232..058a69a382 100644
--- a/src/vm/amd64/unixasmhelpers.S
+++ b/src/vm/amd64/unixasmhelpers.S
@@ -184,12 +184,13 @@ NESTED_ENTRY OnHijackScalarTripThread, _TEXT, NoHandler
PUSH_CALLEE_SAVED_REGISTERS
+ push_register rdx
// Push rax again - this is where integer/pointer return values are returned
push_register rax
mov rdi, rsp
- alloc_stack 0x20
+ alloc_stack 0x28
// First float return register
movdqa [rsp], xmm0
@@ -202,14 +203,55 @@ NESTED_ENTRY OnHijackScalarTripThread, _TEXT, NoHandler
movdqa xmm0, [rsp]
movdqa xmm1, [rsp+0x10]
- free_stack 0x20
+ free_stack 0x28
pop_register rax
+ pop_register rdx
POP_CALLEE_SAVED_REGISTERS
ret
NESTED_END OnHijackScalarTripThread, _TEXT
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+//------------------------------------------------
+// OnHijackStructInRegsTripThread
+//
+NESTED_ENTRY OnHijackStructInRegsTripThread, _TEXT, NoHandler
+
+ // Make room for the real return address (rip)
+ push_register rax
+
+ PUSH_CALLEE_SAVED_REGISTERS
+
+ push_register rdx
+ // Push rax again - this is where part of the struct gets returned
+ push_register rax
+
+ mov rdi, rsp
+
+ alloc_stack 0x28
+
+ // First float return register
+ movdqa [rsp], xmm0
+ // Second float return register
+ movdqa [rsp+0x10], xmm1
+
+ END_PROLOGUE
+
+ call C_FUNC(OnHijackStructInRegsWorker)
+
+ movdqa xmm0, [rsp]
+ movdqa xmm1, [rsp+0x10]
+ free_stack 0x28
+ pop_register rax
+ pop_register rdx
+
+ POP_CALLEE_SAVED_REGISTERS
+ ret
+
+NESTED_END OnHijackStructInRegsTripThread, _TEXT
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
//------------------------------------------------
// OnHijackObjectTripThread
//
@@ -220,16 +262,22 @@ NESTED_ENTRY OnHijackObjectTripThread, _TEXT, NoHandler
PUSH_CALLEE_SAVED_REGISTERS
+ push_register rdx
// Push rax again - this is where integer/pointer return values are returned
push_register rax
mov rdi, rsp
+ // align stack
+ alloc_stack 0x8
+
END_PROLOGUE
call C_FUNC(OnHijackObjectWorker)
+ free_stack 0x8
pop_register rax
+ pop_register rdx
POP_CALLEE_SAVED_REGISTERS
ret
@@ -246,16 +294,22 @@ NESTED_ENTRY OnHijackInteriorPointerTripThread, _TEXT, NoHandler
PUSH_CALLEE_SAVED_REGISTERS
+ push_register rdx
// Push rax again - this is where integer/pointer return values are returned
push_register rax
mov rdi, rsp
+ // align stack
+ alloc_stack 0x8
+
END_PROLOGUE
call C_FUNC(OnHijackInteriorPointerWorker)
+ free_stack 0x8
pop_register rax
+ pop_register rdx
POP_CALLEE_SAVED_REGISTERS
ret
diff --git a/src/vm/argdestination.h b/src/vm/argdestination.h
new file mode 100644
index 0000000000..5896414f35
--- /dev/null
+++ b/src/vm/argdestination.h
@@ -0,0 +1,217 @@
+//
+// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE file in the project root for full license information.
+//
+//
+
+#ifndef __ARGDESTINATION_H__
+#define __ARGDESTINATION_H__
+
+// The ArgDestination class represents a destination location of an argument.
+class ArgDestination
+{
+ // Base address to which the m_offset is applied to get the actual argument location.
+ PTR_VOID m_base;
+ // Offset of the argument relative to m_base. On AMD64 on Unix, it can have a special
+ // value that represents a struct containing both general purpose and floating point fields
+ // passed in registers.
+ int m_offset;
+ // For structs passed in registers, this member points to an ArgLocDesc that contains
+ // details on the layout of the struct in general purpose and floating point registers.
+ ArgLocDesc* m_argLocDescForStructInRegs;
+
+public:
+
+ // Construct the ArgDestination
+ ArgDestination(PTR_VOID base, int offset, ArgLocDesc* argLocDescForStructInRegs)
+ : m_base(base),
+ m_offset(offset),
+ m_argLocDescForStructInRegs(argLocDescForStructInRegs)
+ {
+ LIMITED_METHOD_CONTRACT;
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ _ASSERTE((argLocDescForStructInRegs != NULL) || (offset != TransitionBlock::StructInRegsOffset));
+#else
+ _ASSERTE(argLocDescForStructInRegs == NULL);
+#endif
+ }
+
+ // Get argument destination address for arguments that are not structs passed in registers.
+ PTR_VOID GetDestinationAddress()
+ {
+ LIMITED_METHOD_CONTRACT;
+ return dac_cast<PTR_VOID>(dac_cast<TADDR>(m_base) + m_offset);
+ }
+
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
+ // Returns true if the ArgDestination represents a struct passed in registers.
+ bool IsStructPassedInRegs()
+ {
+ LIMITED_METHOD_CONTRACT;
+ return m_offset == TransitionBlock::StructInRegsOffset;
+ }
+
+ // Get destination address for floating point fields of a struct passed in registers.
+ PTR_VOID GetStructFloatRegDestinationAddress()
+ {
+ LIMITED_METHOD_CONTRACT;
+ _ASSERTE(IsStructPassedInRegs());
+ int offset = TransitionBlock::GetOffsetOfFloatArgumentRegisters() + m_argLocDescForStructInRegs->m_idxFloatReg * 8;
+ return dac_cast<PTR_VOID>(dac_cast<TADDR>(m_base) + offset);
+ }
+
+ // Get destination address for non-floating point fields of a struct passed in registers.
+ PTR_VOID GetStructGenRegDestinationAddress()
+ {
+ LIMITED_METHOD_CONTRACT;
+ _ASSERTE(IsStructPassedInRegs());
+ int offset = TransitionBlock::GetOffsetOfArgumentRegisters() + m_argLocDescForStructInRegs->m_idxGenReg * 8;
+ return dac_cast<PTR_VOID>(dac_cast<TADDR>(m_base) + offset);
+ }
+
+#ifndef DACCESS_COMPILE
+ // Zero struct argument stored in registers described by the current ArgDestination.
+ // Arguments:
+ // fieldBytes - size of the structure
+ void ZeroStructInRegisters(int fieldBytes)
+ {
+ STATIC_CONTRACT_NOTHROW;
+ STATIC_CONTRACT_GC_NOTRIGGER;
+ STATIC_CONTRACT_FORBID_FAULT;
+ STATIC_CONTRACT_MODE_COOPERATIVE;
+
+ // To zero the struct, we create a zero filled array of large enough size and
+ // then copy it to the registers. It is implemented this way to keep the complexity
+ // of dealing with the eightbyte classification in a single function.
+ // This function is used rarely and so the overhead of reading the zeros from
+ // the stack is negligible.
+ long long zeros[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS] = {};
+ _ASSERTE(sizeof(zeros) >= fieldBytes);
+
+ CopyStructToRegisters(zeros, fieldBytes, 0);
+ }
+
+ // Copy struct argument into registers described by the current ArgDestination.
+ // Arguments:
+ // src - source data of the structure
+ // fieldBytes - size of the structure
+ // destOffset - nonzero when copying values into Nullable<T>, it is the offset
+ // of the T value inside of the Nullable<T>
+ void CopyStructToRegisters(void *src, int fieldBytes, int destOffset)
+ {
+ STATIC_CONTRACT_NOTHROW;
+ STATIC_CONTRACT_GC_NOTRIGGER;
+ STATIC_CONTRACT_FORBID_FAULT;
+ STATIC_CONTRACT_MODE_COOPERATIVE;
+
+ _ASSERTE(IsStructPassedInRegs());
+
+ BYTE* genRegDest = (BYTE*)GetStructGenRegDestinationAddress() + destOffset;
+ BYTE* floatRegDest = (BYTE*)GetStructFloatRegDestinationAddress();
+ INDEBUG(int remainingBytes = fieldBytes;)
+
+ EEClass* eeClass = m_argLocDescForStructInRegs->m_eeClass;
+ _ASSERTE(eeClass != NULL);
+
+ // We start at the first eightByte that the destOffset didn't skip completely.
+ for (int i = destOffset / 8; i < eeClass->GetNumberEightBytes(); i++)
+ {
+ int eightByteSize = eeClass->GetEightByteSize(i);
+ SystemVClassificationType eightByteClassification = eeClass->GetEightByteClassification(i);
+
+ // Adjust the size of the first eightByte by the destOffset
+ eightByteSize -= (destOffset & 7);
+ destOffset = 0;
+
+ _ASSERTE(remainingBytes >= eightByteSize);
+
+ if (eightByteClassification == SystemVClassificationTypeSSE)
+ {
+ if (eightByteSize == 8)
+ {
+ *(UINT64*)floatRegDest = *(UINT64*)src;
+ }
+ else
+ {
+ _ASSERTE(eightByteSize == 4);
+ *(UINT32*)floatRegDest = *(UINT32*)src;
+ }
+ floatRegDest += 8;
+ }
+ else
+ {
+ if (eightByteSize == 8)
+ {
+ _ASSERTE((eightByteClassification == SystemVClassificationTypeInteger) ||
+ (eightByteClassification == SystemVClassificationTypeIntegerReference));
+
+ _ASSERTE(IS_ALIGNED((SIZE_T)genRegDest, 8));
+ *(UINT64*)genRegDest = *(UINT64*)src;
+ }
+ else
+ {
+ _ASSERTE(eightByteClassification == SystemVClassificationTypeInteger);
+ memcpyNoGCRefs(genRegDest, src, eightByteSize);
+ }
+
+ genRegDest += eightByteSize;
+ }
+
+ src = (BYTE*)src + eightByteSize;
+ INDEBUG(remainingBytes -= eightByteSize;)
+ }
+
+ _ASSERTE(remainingBytes == 0);
+ }
+
+#endif //DACCESS_COMPILE
+
+ // Report managed object pointers in the struct in registers
+ // Arguments:
+ // fn - promotion function to apply to each managed object pointer
+ // sc - scan context to pass to the promotion function
+ // fieldBytes - size of the structure
+ void ReportPointersFromStructInRegisters(promote_func *fn, ScanContext *sc, int fieldBytes)
+ {
+ LIMITED_METHOD_CONTRACT;
+
+ _ASSERTE(IsStructPassedInRegs());
+
+ TADDR genRegDest = dac_cast<TADDR>(GetStructGenRegDestinationAddress());
+ INDEBUG(int remainingBytes = fieldBytes;)
+
+ EEClass* eeClass = m_argLocDescForStructInRegs->m_eeClass;
+ _ASSERTE(eeClass != NULL);
+
+ for (int i = 0; i < eeClass->GetNumberEightBytes(); i++)
+ {
+ int eightByteSize = eeClass->GetEightByteSize(i);
+ SystemVClassificationType eightByteClassification = eeClass->GetEightByteClassification(i);
+
+ _ASSERTE(remainingBytes >= eightByteSize);
+
+ if (eightByteClassification != SystemVClassificationTypeSSE)
+ {
+ if (eightByteClassification == SystemVClassificationTypeIntegerReference)
+ {
+ _ASSERTE(eightByteSize == 8);
+ _ASSERTE(IS_ALIGNED((SIZE_T)genRegDest, 8));
+
+ (*fn)(dac_cast<PTR_PTR_Object>(genRegDest), sc, 0);
+ }
+
+ genRegDest += eightByteSize;
+ }
+
+ INDEBUG(remainingBytes -= eightByteSize;)
+ }
+
+ _ASSERTE(remainingBytes == 0);
+ }
+
+#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+};
+
+#endif // __ARGDESTINATION_H__
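CopyStructToRegisters above routes each eightbyte of the source struct to either the general purpose or the XMM register save area of the transition block. A self-contained miniature of that routing (ignoring the destOffset/Nullable<T> adjustment), with a made-up descriptor standing in for EEClass:

#include <cstdint>
#include <cstring>

enum SysVClass { SysVInteger, SysVIntegerReference, SysVSSE };

struct EightByteDesc { SysVClass cls; int size; };

void CopyStructToRegAreas(const void* src,
                          const EightByteDesc* desc, int eightByteCount,
                          uint8_t* genRegArea, uint8_t* floatRegArea)
{
    const uint8_t* from = static_cast<const uint8_t*>(src);
    for (int i = 0; i < eightByteCount; i++)
    {
        if (desc[i].cls == SysVSSE)
        {
            // An SSE eightbyte always takes a full 8-byte XMM save slot,
            // even when only 4 bytes (a float) are meaningful.
            std::memcpy(floatRegArea, from, desc[i].size);
            floatRegArea += 8;
        }
        else
        {
            // Integer/reference eightbytes pack into consecutive GP register slots.
            std::memcpy(genRegArea, from, desc[i].size);
            genRegArea += desc[i].size;
        }
        from += desc[i].size;
    }
}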
diff --git a/src/vm/arm/stubs.cpp b/src/vm/arm/stubs.cpp
index 368e6cf810..342c73b0d0 100644
--- a/src/vm/arm/stubs.cpp
+++ b/src/vm/arm/stubs.cpp
@@ -1052,7 +1052,7 @@ void DispatchHolder::Initialize(PCODE implTarget, PCODE failTarget, size_t expe
// nop - insert padding
_stub._entryPoint[n++] = 0xbf00;
-
+
_ASSERTE(n == DispatchStub::entryPointLen);
// Make sure that the data members below are aligned
diff --git a/src/vm/callhelpers.cpp b/src/vm/callhelpers.cpp
index a910c0ea30..137dbb8656 100644
--- a/src/vm/callhelpers.cpp
+++ b/src/vm/callhelpers.cpp
@@ -401,7 +401,7 @@ ARG_SLOT MethodDescCallSite::CallTargetWorker(const ARG_SLOT *pArguments)
// Record this call if required
g_IBCLogger.LogMethodDescAccess(m_pMD);
- //
+ //
// All types must already be loaded. This macro also sets up a FAULT_FORBID region which is
// also required for critical calls since we cannot inject any failure points between the
// caller of MethodDesc::CallDescr and the actual transition to managed code.
@@ -537,9 +537,12 @@ ARG_SLOT MethodDescCallSite::CallTargetWorker(const ARG_SLOT *pArguments)
// have at least one such argument we point the call worker at the floating point area of the
// frame (we leave it null otherwise since the worker can perform a useful optimization if it
// knows no floating point registers need to be set up).
- if ((ofs < 0) && (pFloatArgumentRegisters == NULL))
+ if (TransitionBlock::HasFloatRegister(ofs, m_argIt.GetArgLocDescForStructInRegs()) &&
+ (pFloatArgumentRegisters == NULL))
+ {
pFloatArgumentRegisters = (FloatArgumentRegisters*)(pTransitionBlock +
TransitionBlock::GetOffsetOfFloatArgumentRegisters());
+ }
#endif
#if CHECK_APP_DOMAIN_LEAKS
@@ -553,6 +556,9 @@ ARG_SLOT MethodDescCallSite::CallTargetWorker(const ARG_SLOT *pArguments)
}
#endif // CHECK_APP_DOMAIN_LEAKS
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ _ASSERTE(ofs != TransitionBlock::StructInRegsOffset);
+#endif
PVOID pDest = pTransitionBlock + ofs;
UINT32 stackSize = m_argIt.GetArgSize();
diff --git a/src/vm/callingconvention.h b/src/vm/callingconvention.h
index 244a3df878..490ae3ce87 100644
--- a/src/vm/callingconvention.h
+++ b/src/vm/callingconvention.h
@@ -42,6 +42,12 @@ struct ArgLocDesc
int m_idxStack; // First stack slot used (or -1)
int m_cStack; // Count of stack slots used (or 0)
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
+ EEClass* m_eeClass; // For structs passed in register, it points to the EEClass of the struct
+
+#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
+
#if defined(_TARGET_ARM_)
BOOL m_fRequires64BitAlignment; // True if the argument should always be aligned (in registers or on the stack
#endif
@@ -63,6 +69,9 @@ struct ArgLocDesc
#if defined(_TARGET_ARM_)
m_fRequires64BitAlignment = FALSE;
#endif
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ m_eeClass = NULL;
+#endif
}
};
@@ -138,9 +147,13 @@ struct TransitionBlock
{
LIMITED_METHOD_CONTRACT;
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ return offset >= sizeof(TransitionBlock);
+#else
int ofsArgRegs = GetOffsetOfArgumentRegisters();
return offset >= (int) (ofsArgRegs + ARGUMENTREGISTERS_SIZE);
+#endif
}
static BOOL IsArgumentRegisterOffset(int offset)
@@ -156,14 +169,45 @@ struct TransitionBlock
static UINT GetArgumentIndexFromOffset(int offset)
{
LIMITED_METHOD_CONTRACT;
+
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ _ASSERTE(offset != TransitionBlock::StructInRegsOffset);
+#endif
return (offset - GetOffsetOfArgumentRegisters()) / sizeof(TADDR);
}
+
+ static UINT GetStackArgumentIndexFromOffset(int offset)
+ {
+ LIMITED_METHOD_CONTRACT;
+
+ return (offset - TransitionBlock::GetOffsetOfArgs()) / STACK_ELEM_SIZE;
+ }
+
#endif
#ifdef CALLDESCR_FPARGREGS
static BOOL IsFloatArgumentRegisterOffset(int offset)
{
LIMITED_METHOD_CONTRACT;
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ return (offset != TransitionBlock::StructInRegsOffset) && (offset < 0);
+#else
+ return offset < 0;
+#endif
+ }
+
+ // Check whether an argument uses a floating point register; that is, it is
+ // either a floating point argument or a struct passed in registers that
+ // has a floating point member.
+ static BOOL HasFloatRegister(int offset, ArgLocDesc* argLocDescForStructInRegs)
+ {
+ LIMITED_METHOD_CONTRACT;
+ #if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (offset == TransitionBlock::StructInRegsOffset)
+ {
+ return argLocDescForStructInRegs->m_cFloatReg > 0;
+ }
+ #endif
return offset < 0;
}
@@ -172,7 +216,7 @@ struct TransitionBlock
LIMITED_METHOD_CONTRACT;
return -GetNegSpaceSize();
}
-#endif
+#endif // CALLDESCR_FPARGREGS
static int GetOffsetOfCalleeSavedRegisters()
{
@@ -194,6 +238,11 @@ struct TransitionBlock
}
static const int InvalidOffset = -1;
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // Special offset value to represent struct passed in registers. Such a struct can span both
+ // general purpose and floating point registers, so it can have two different offsets.
+ static const int StructInRegsOffset = -2;
+#endif
};
//-----------------------------------------------------------------------
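The TransitionBlock changes above add a -2 sentinel offset for a struct that spans both register files; its real layout then travels in an ArgLocDesc. A small sketch of how such an offset is meant to be interpreted, mirroring HasFloatRegister with illustrative names:

// Sketch only; constants and parameter names are illustrative.
bool ArgUsesFloatRegs(int offset, int cFloatRegsInStructLayout)
{
    const int kStructInRegsOffset = -2; // stands in for TransitionBlock::StructInRegsOffset

    if (offset == kStructInRegsOffset)
    {
        // The offset alone cannot say; the ArgLocDesc carries the split layout.
        return cFloatRegsInStructLayout > 0;
    }
    // Otherwise negative offsets address the FloatArgumentRegisters save area.
    return offset < 0;
}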
@@ -340,11 +389,16 @@ public:
{
LIMITED_METHOD_CONTRACT;
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // No arguments are passed by reference on AMD64 on Unix
+ return FALSE;
+#else
// If the size is bigger than ENREGISTERED_PARAMTYPE_MAXSIZE, or if the size is NOT a power of 2, then
// the argument is passed by reference.
return (size > ENREGISTERED_PARAMTYPE_MAXSIZE) || ((size & (size-1)) != 0);
+#endif
}
-#endif
+#endif // _TARGET_AMD64_
// This overload should be used for varargs only.
static BOOL IsVarArgPassedByRef(size_t size)
@@ -352,7 +406,13 @@ public:
LIMITED_METHOD_CONTRACT;
#ifdef _TARGET_AMD64_
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ PORTABILITY_ASSERT("ArgIteratorTemplate::IsVarArgPassedByRef");
+ return FALSE;
+#else // FEATURE_UNIX_AMD64_STRUCT_PASSING
return IsArgPassedByRef(size);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
#else
return (size > ENREGISTERED_PARAMTYPE_MAXSIZE);
#endif
@@ -426,6 +486,15 @@ public:
void GetVASigCookieLoc(ArgLocDesc * pLoc) { WRAPPER_NO_CONTRACT; GetSimpleLoc(GetVASigCookieOffset(), pLoc); }
#endif // !_TARGET_X86_
+ ArgLocDesc* GetArgLocDescForStructInRegs()
+ {
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ return m_hasArgLocDescForStructInRegs ? &m_argLocDescForStructInRegs : NULL;
+#else
+ return NULL;
+#endif
+ }
+
#ifdef _TARGET_ARM_
// Get layout information for the argument that the ArgIterator is currently visiting.
void GetArgLoc(int argOffset, ArgLocDesc *pLoc)
@@ -463,7 +532,7 @@ public:
}
else
{
- pLoc->m_idxStack = TransitionBlock::GetArgumentIndexFromOffset(argOffset) - 4;
+ pLoc->m_idxStack = TransitionBlock::GetStackArgumentIndexFromOffset(argOffset);
pLoc->m_cStack = cSlots;
}
}
@@ -509,7 +578,7 @@ public:
}
else
{
- pLoc->m_idxStack = TransitionBlock::GetArgumentIndexFromOffset(argOffset) - 8;
+ pLoc->m_idxStack = TransitionBlock::GetStackArgumentIndexFromOffset(argOffset);
pLoc->m_cStack = cSlots;
}
}
@@ -517,37 +586,46 @@ public:
#if defined(_TARGET_AMD64_) && defined(UNIX_AMD64_ABI)
// Get layout information for the argument that the ArgIterator is currently visiting.
- void GetArgLoc(int argOffset, ArgLocDesc *pLoc)
+ void GetArgLoc(int argOffset, ArgLocDesc* pLoc)
{
LIMITED_METHOD_CONTRACT;
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (m_hasArgLocDescForStructInRegs)
+ {
+ *pLoc = m_argLocDescForStructInRegs;
+ return;
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+ if (argOffset == TransitionBlock::StructInRegsOffset)
+ {
+ // We always have an argLocDesc for structs passed in registers; it is
+ // computed in GetNextOffset since it is always needed.
+ _ASSERTE(false);
+ return;
+ }
+
pLoc->Init();
if (TransitionBlock::IsFloatArgumentRegisterOffset(argOffset))
{
// Dividing by 8 as size of each register in FloatArgumentRegisters is 8 bytes.
pLoc->m_idxFloatReg = (argOffset - TransitionBlock::GetOffsetOfFloatArgumentRegisters()) / 8;
-
- // UNIXTODO: Passing of structs, HFAs. For now, use the Windows convention.
pLoc->m_cFloatReg = 1;
- return;
}
-
- // UNIXTODO: Passing of structs, HFAs. For now, use the Windows convention.
- int cSlots = 1;
-
- if (!TransitionBlock::IsStackArgumentOffset(argOffset))
+ else if (!TransitionBlock::IsStackArgumentOffset(argOffset))
{
pLoc->m_idxGenReg = TransitionBlock::GetArgumentIndexFromOffset(argOffset);
- pLoc->m_cGenReg = cSlots;
- }
+ pLoc->m_cGenReg = 1;
+ }
else
{
- pLoc->m_idxStack = (argOffset - TransitionBlock::GetOffsetOfArgs()) / 8;
- pLoc->m_cStack = cSlots;
+ pLoc->m_idxStack = TransitionBlock::GetStackArgumentIndexFromOffset(argOffset);
+ pLoc->m_cStack = (GetArgSize() + STACK_ELEM_SIZE - 1) / STACK_ELEM_SIZE;
}
}
-#endif // _TARGET_ARM64_ && UNIX_AMD64_ABI
+#endif // _TARGET_AMD64_ && UNIX_AMD64_ABI
protected:
DWORD m_dwFlags; // Cached flags
@@ -559,6 +637,10 @@ protected:
CorElementType m_argType;
int m_argSize;
TypeHandle m_argTypeHandle;
+#if defined(_TARGET_AMD64_) && defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ ArgLocDesc m_argLocDescForStructInRegs;
+ bool m_hasArgLocDescForStructInRegs;
+#endif // _TARGET_AMD64_ && UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
#ifdef _TARGET_X86_
int m_curOfs; // Current position of the stack iterator
@@ -567,9 +649,12 @@ protected:
#ifdef _TARGET_AMD64_
#ifdef UNIX_AMD64_ABI
- int m_idxGenReg;
- int m_idxStack;
- int m_idxFPReg;
+ int m_idxGenReg; // Next general register to be assigned a value
+ int m_idxStack; // Next stack slot to be assigned a value
+ int m_idxFPReg; // Next floating point register to be assigned a value
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ bool m_fArgInRegisters; // Indicates that the current argument is stored in registers
+#endif
#else
int m_curOfs; // Current position of the stack iterator
#endif
@@ -843,6 +928,10 @@ int ArgIteratorTemplate<ARGITERATOR_BASE>::GetNextOffset()
m_argSize = argSize;
m_argTypeHandle = thValueType;
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ m_hasArgLocDescForStructInRegs = false;
+#endif
+
#ifdef _TARGET_X86_
#ifdef FEATURE_INTERPRETER
if (m_fUnmanagedCallConv)
@@ -862,7 +951,12 @@ int ArgIteratorTemplate<ARGITERATOR_BASE>::GetNextOffset()
return m_curOfs;
#elif defined(_TARGET_AMD64_)
#ifdef UNIX_AMD64_ABI
+
+ m_fArgInRegisters = true;
+
int cFPRegs = 0;
+ int cbArg = StackElemSize(argSize);
+ int cGenRegs = cbArg / 8; // GP reg size
switch (argType)
{
@@ -879,8 +973,56 @@ int ArgIteratorTemplate<ARGITERATOR_BASE>::GetNextOffset()
case ELEMENT_TYPE_VALUETYPE:
{
- // UNIXTODO: Passing of structs, HFAs. For now, use the Windows convention.
- argSize = sizeof(TADDR);
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ MethodTable *pMT = m_argTypeHandle.AsMethodTable();
+ if (pMT->IsRegPassedStruct())
+ {
+ EEClass* eeClass = pMT->GetClass();
+ cGenRegs = 0;
+ for (int i = 0; i < eeClass->GetNumberEightBytes(); i++)
+ {
+ switch (eeClass->GetEightByteClassification(i))
+ {
+ case SystemVClassificationTypeInteger:
+ case SystemVClassificationTypeIntegerReference:
+ cGenRegs++;
+ break;
+ case SystemVClassificationTypeSSE:
+ cFPRegs++;
+ break;
+ default:
+ _ASSERTE(false);
+ break;
+ }
+ }
+
+ // Check if we have enough registers available for the struct passing
+ if ((cFPRegs + m_idxFPReg <= NUM_FLOAT_ARGUMENT_REGISTERS) && (cGenRegs + m_idxGenReg) <= NUM_ARGUMENT_REGISTERS)
+ {
+ m_argLocDescForStructInRegs.Init();
+ m_argLocDescForStructInRegs.m_cGenReg = cGenRegs;
+ m_argLocDescForStructInRegs.m_cFloatReg = cFPRegs;
+ m_argLocDescForStructInRegs.m_idxGenReg = m_idxGenReg;
+ m_argLocDescForStructInRegs.m_idxFloatReg = m_idxFPReg;
+ m_argLocDescForStructInRegs.m_eeClass = eeClass;
+
+ m_hasArgLocDescForStructInRegs = true;
+
+ m_idxGenReg += cGenRegs;
+ m_idxFPReg += cFPRegs;
+
+ return TransitionBlock::StructInRegsOffset;
+ }
+ }
+
+ // Set the register counts to indicate that this argument will not be passed in registers
+ cFPRegs = 0;
+ cGenRegs = 0;
+
+#else // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ argSize = sizeof(TADDR);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
break;
}
@@ -888,33 +1030,31 @@ int ArgIteratorTemplate<ARGITERATOR_BASE>::GetNextOffset()
break;
}
- int cbArg = StackElemSize(argSize);
- int cArgSlots = cbArg / STACK_ELEM_SIZE;
-
- if (cFPRegs>0)
+ if ((cFPRegs > 0) && (cFPRegs + m_idxFPReg <= NUM_FLOAT_ARGUMENT_REGISTERS))
{
- if (cFPRegs + m_idxFPReg <= 8)
- {
- int argOfs = TransitionBlock::GetOffsetOfFloatArgumentRegisters() + m_idxFPReg * 8;
- m_idxFPReg += cFPRegs;
- return argOfs;
- }
+ int argOfs = TransitionBlock::GetOffsetOfFloatArgumentRegisters() + m_idxFPReg * 8;
+ m_idxFPReg += cFPRegs;
+ return argOfs;
}
- else
+ else if ((cGenRegs > 0) && (m_idxGenReg + cGenRegs <= NUM_ARGUMENT_REGISTERS))
{
- if (m_idxGenReg + cArgSlots <= 6)
- {
- int argOfs = TransitionBlock::GetOffsetOfArgumentRegisters() + m_idxGenReg * 8;
- m_idxGenReg += cArgSlots;
- return argOfs;
- }
+ int argOfs = TransitionBlock::GetOffsetOfArgumentRegisters() + m_idxGenReg * 8;
+ m_idxGenReg += cGenRegs;
+ return argOfs;
}
- int argOfs = TransitionBlock::GetOffsetOfArgs() + m_idxStack * 8;
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ m_fArgInRegisters = false;
+#endif
+
+ int argOfs = TransitionBlock::GetOffsetOfArgs() + m_idxStack * STACK_ELEM_SIZE;
+
+ int cArgSlots = cbArg / STACK_ELEM_SIZE;
m_idxStack += cArgSlots;
+
return argOfs;
#else
- // Each argument takes exactly one slot on AMD64
+ // Each argument takes exactly one slot on AMD64 on Windows
int argOfs = m_curOfs;
m_curOfs += sizeof(void *);
return argOfs;
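A worked example of the eightbyte counting in the Unix path of this hunk (the struct is hypothetical and only serves as an illustration):

    // struct S { double d; int i; };   // 16 bytes, classified into two eightbytes
    // eightbyte 0: SystemVClassificationTypeSSE      -> cFPRegs  == 1
    // eightbyte 1: SystemVClassificationTypeInteger  -> cGenRegs == 1
    // S is passed in one xmm and one general purpose register only while
    //     cFPRegs + m_idxFPReg   <= NUM_FLOAT_ARGUMENT_REGISTERS  and
    //     cGenRegs + m_idxGenReg <= NUM_ARGUMENT_REGISTERS
    // still hold; otherwise the whole struct goes to the stack.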
@@ -1203,6 +1343,40 @@ void ArgIteratorTemplate<ARGITERATOR_BASE>::ComputeReturnFlags()
{
_ASSERTE(!thValueType.IsNull());
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ MethodTable *pMT = thValueType.AsMethodTable();
+ if (pMT->IsRegPassedStruct())
+ {
+ EEClass* eeClass = pMT->GetClass();
+
+ if (eeClass->GetNumberEightBytes() == 1)
+ {
+ // Structs occupying just one eightbyte are treated as int / double
+ if (eeClass->GetEightByteClassification(0) == SystemVClassificationTypeSSE)
+ {
+ flags |= sizeof(double) << RETURN_FP_SIZE_SHIFT;
+ }
+ }
+ else
+ {
+ // Size of the struct is 16 bytes
+ flags |= (16 << RETURN_FP_SIZE_SHIFT);
+ // The lowest two bits of the size encode the order of the int and SSE fields
+ if (eeClass->GetEightByteClassification(0) == SystemVClassificationTypeSSE)
+ {
+ flags |= (1 << RETURN_FP_SIZE_SHIFT);
+ }
+
+ if (eeClass->GetEightByteClassification(1) == SystemVClassificationTypeSSE)
+ {
+ flags |= (2 << RETURN_FP_SIZE_SHIFT);
+ }
+ }
+
+ break;
+ }
+#else // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
+
#ifdef FEATURE_HFA
if (thValueType.IsHFA() && !this->IsVarArg())
{
@@ -1229,6 +1403,7 @@ void ArgIteratorTemplate<ARGITERATOR_BASE>::ComputeReturnFlags()
if (size <= ENREGISTERED_RETURNTYPE_INTEGER_MAXSIZE)
break;
+#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
}
#endif // ENREGISTERED_RETURNTYPE_INTEGER_MAXSIZE
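The return flags written above can be read back as sketched below, under the assumption that fpReturnSize is the value encoded at RETURN_FP_SIZE_SHIFT; the register names follow the System V return rules and the helper itself is not an API of this change:

    static void DescribeStructReturn(UINT fpReturnSize)
    {
        if (fpReturnSize == 0)
        {
            // No SSE eightbytes: returned in general purpose registers (or via a return buffer).
        }
        else if (fpReturnSize == sizeof(double))
        {
            // Single eightbyte classified SSE: returned in xmm0.
        }
        else
        {
            // 16-byte struct; bits 0 and 1 mark which eightbyte is SSE.
            bool eightByte0IsSSE = (fpReturnSize & 1) != 0;
            bool eightByte1IsSSE = (fpReturnSize & 2) != 0;
            // Each SSE eightbyte takes the next xmm register and each integer eightbyte
            // the next general purpose return register, per the System V ABI.
            (void)eightByte0IsSSE; (void)eightByte1IsSSE;
        }
    }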
@@ -1348,22 +1523,32 @@ void ArgIteratorTemplate<ARGITERATOR_BASE>::ForceSigWalk()
int maxOffset = TransitionBlock::GetOffsetOfArgs();
- int ofs;
+ int ofs;
while (TransitionBlock::InvalidOffset != (ofs = GetNextOffset()))
{
int stackElemSize;
#ifdef _TARGET_AMD64_
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (m_fArgInRegisters)
+ {
+ // Arguments passed in registers don't consume any stack
+ continue;
+ }
+
+ stackElemSize = StackElemSize(GetArgSize());
+#else // FEATURE_UNIX_AMD64_STRUCT_PASSING
// All stack arguments take just one stack slot on AMD64 because of arguments bigger
// than a stack slot are passed by reference.
stackElemSize = STACK_ELEM_SIZE;
-#else
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+#else // _TARGET_AMD64_
stackElemSize = StackElemSize(GetArgSize());
#if defined(ENREGISTERED_PARAMTYPE_MAXSIZE)
if (IsArgPassedByRef())
stackElemSize = STACK_ELEM_SIZE;
#endif
-#endif
+#endif // _TARGET_AMD64_
int endOfs = ofs + stackElemSize;
if (endOfs > maxOffset)
diff --git a/src/vm/class.cpp b/src/vm/class.cpp
index 932f8bed00..f45e6ebbfa 100644
--- a/src/vm/class.cpp
+++ b/src/vm/class.cpp
@@ -1679,7 +1679,7 @@ CorElementType MethodTable::GetHFAType()
default:
// This should never happen. MethodTable::IsHFA() should be set only on types
- // that have a valid HFA type
+ // that have a valid HFA type when the flag is used to track HFA status.
_ASSERTE(false);
return ELEMENT_TYPE_END;
}
diff --git a/src/vm/class.h b/src/vm/class.h
index 758a0dbaee..c53cf8ba72 100644
--- a/src/vm/class.h
+++ b/src/vm/class.h
@@ -428,21 +428,26 @@ class EEClassLayoutInfo
// to its unmanaged counterpart (i.e. no internal reference fields,
// no ansi-unicode char conversions required, etc.) Used to
// optimize marshaling.
- e_BLITTABLE = 0x01,
+ e_BLITTABLE = 0x01,
// Post V1.0 addition: Is this type also sequential in managed memory?
- e_MANAGED_SEQUENTIAL = 0x02,
+ e_MANAGED_SEQUENTIAL = 0x02,
// When a sequential/explicit type has no fields, it is conceptually
// zero-sized, but actually is 1 byte in length. This holds onto this
// fact and allows us to revert the 1 byte of padding when another
// explicit type inherits from this type.
- e_ZERO_SIZED = 0x04,
+ e_ZERO_SIZED = 0x04,
// The size of the struct is explicitly specified in the meta-data.
- e_HAS_EXPLICIT_SIZE = 0x08,
-
+ e_HAS_EXPLICIT_SIZE = 0x08,
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+#ifdef FEATURE_HFA
+#error Can't have FEATURE_HFA and FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF defined at the same time.
+#endif // FEATURE_HFA
+ e_NATIVE_PASS_IN_REGISTERS = 0x10, // Flag indicating whether a native struct is passed in registers.
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
#ifdef FEATURE_HFA
// HFA type of the unmanaged layout
- e_R4_HFA = 0x10,
- e_R8_HFA = 0x20,
+ e_R4_HFA = 0x10,
+ e_R8_HFA = 0x20,
#endif
};
@@ -527,6 +532,14 @@ class EEClassLayoutInfo
return m_cbPackingSize;
}
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+ bool IsNativeStructPassedInRegisters()
+ {
+ LIMITED_METHOD_CONTRACT;
+ return (m_bFlags & e_NATIVE_PASS_IN_REGISTERS) != 0;
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+
#ifdef FEATURE_HFA
bool IsNativeHFA()
{
@@ -579,6 +592,14 @@ class EEClassLayoutInfo
m_bFlags |= (hfaType == ELEMENT_TYPE_R4) ? e_R4_HFA : e_R8_HFA;
}
#endif
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+ void SetNativeStructPassedInRegisters()
+ {
+ LIMITED_METHOD_CONTRACT;
+ m_bFlags |= e_NATIVE_PASS_IN_REGISTERS;
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+
};
@@ -713,6 +734,15 @@ class EEClassOptionalFields
SecurityProperties m_SecProps;
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // Number of eightBytes in the following arrays
+ int m_numberEightBytes;
+ // Classification of the eightBytes
+ SystemVClassificationType m_eightByteClassifications[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS];
+ // Size of the data in each of the eightBytes
+ unsigned int m_eightByteSizes[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS];
+#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
+
// Set default values for optional fields.
inline void Init();
};
@@ -1811,6 +1841,45 @@ public:
GetOptionalFields()->m_dwReliabilityContract = dwValue;
}
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // Get number of eightbytes used by a struct passed in registers.
+ inline int GetNumberEightBytes()
+ {
+ LIMITED_METHOD_CONTRACT;
+ _ASSERTE(HasOptionalFields());
+ return GetOptionalFields()->m_numberEightBytes;
+ }
+
+ // Get eightbyte classification for the eightbyte with the specified index.
+ inline SystemVClassificationType GetEightByteClassification(int index)
+ {
+ LIMITED_METHOD_CONTRACT;
+ _ASSERTE(HasOptionalFields());
+ return GetOptionalFields()->m_eightByteClassifications[index];
+ }
+
+ // Get size of the data in the eightbyte with the specified index.
+ inline unsigned int GetEightByteSize(int index)
+ {
+ LIMITED_METHOD_CONTRACT;
+ _ASSERTE(HasOptionalFields());
+ return GetOptionalFields()->m_eightByteSizes[index];
+ }
+
+ // Set the eightByte classification
+ inline void SetEightByteClassification(int eightByteCount, SystemVClassificationType *eightByteClassifications, unsigned int *eightByteSizes)
+ {
+ LIMITED_METHOD_CONTRACT;
+ _ASSERTE(HasOptionalFields());
+ GetOptionalFields()->m_numberEightBytes = eightByteCount;
+ for (int i = 0; i < eightByteCount; i++)
+ {
+ GetOptionalFields()->m_eightByteClassifications[i] = eightByteClassifications[i];
+ GetOptionalFields()->m_eightByteSizes[i] = eightByteSizes[i];
+ }
+ }
+#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
+
#ifdef FEATURE_COMINTEROP
inline TypeHandle GetCoClassForInterface()
{
diff --git a/src/vm/class.inl b/src/vm/class.inl
index 12c5230fd2..a4c8276476 100644
--- a/src/vm/class.inl
+++ b/src/vm/class.inl
@@ -53,6 +53,9 @@ inline void EEClassOptionalFields::Init()
m_cbModuleDynamicID = MODULE_NON_DYNAMIC_STATICS;
m_dwReliabilityContract = RC_NULL;
m_SecProps = 0;
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ m_numberEightBytes = 0;
+#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
}
#endif // !DACCESS_COMPILE
diff --git a/src/vm/comdelegate.cpp b/src/vm/comdelegate.cpp
index a6c7e063b1..80742cdaca 100644
--- a/src/vm/comdelegate.cpp
+++ b/src/vm/comdelegate.cpp
@@ -72,37 +72,149 @@ static UINT16 ShuffleOfs(INT ofs, UINT stackSizeDelta = 0)
#else // Portable default implementation
-// Helpers used when calculating shuffle array entries in GenerateShuffleArray below.
-
-// Return true if the current argument still has slots left to shuffle in general registers or on the stack
-// (currently we never shuffle floating point registers since there's no need).
-static bool AnythingToShuffle(ArgLocDesc * pArg)
+// Iterator for extracting shuffle entries for an argument described by an ArgLocDesc.
+// Used when calculating shuffle array entries in GenerateShuffleArray below.
+class ShuffleIterator
{
- return (pArg->m_cGenReg > 0) || (pArg->m_cStack > 0);
-}
+ // Argument location description
+ ArgLocDesc* m_argLocDesc;
+
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // Current eightByte used for struct arguments in registers
+ int m_currentEightByte;
+#endif
+ // Current general purpose register index (relative to the ArgLocDesc::m_idxGenReg)
+ int m_currentGenRegIndex;
+ // Current floating point register index (relative to the ArgLocDesc::m_idxFloatReg)
+ int m_currentFloatRegIndex;
+ // Current stack slot index (relative to the ArgLocDesc::m_idxStack)
+ int m_currentStackSlotIndex;
+
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // Get the next shuffle offset for a struct passed in registers. There has to be at least one offset left.
+ UINT16 GetNextOfsInStruct()
+ {
+ EEClass* eeClass = m_argLocDesc->m_eeClass;
+ _ASSERTE(eeClass != NULL);
+
+ if (m_currentEightByte < eeClass->GetNumberEightBytes())
+ {
+ SystemVClassificationType eightByte = eeClass->GetEightByteClassification(m_currentEightByte);
+ unsigned int eightByteSize = eeClass->GetEightByteSize(m_currentEightByte);
-// Return an encoded shuffle entry describing a general register or stack offset that needs to be shuffled.
-static UINT16 ShuffleOfs(ArgLocDesc * pArg)
-{
- // Shuffle any registers first (the order matters since otherwise we could end up shuffling a stack slot
- // over a register we later need to shuffle down as well).
- if (pArg->m_cGenReg > 0)
- {
- pArg->m_cGenReg--;
- return (UINT16)(ShuffleEntry::REGMASK | pArg->m_idxGenReg++);
+ m_currentEightByte++;
+
+ int index;
+ UINT16 mask = ShuffleEntry::REGMASK;
+
+ if (eightByte == SystemVClassificationTypeSSE)
+ {
+ _ASSERTE(m_currentFloatRegIndex < m_argLocDesc->m_cFloatReg);
+ index = m_argLocDesc->m_idxFloatReg + m_currentFloatRegIndex;
+ m_currentFloatRegIndex++;
+
+ mask |= ShuffleEntry::FPREGMASK;
+ if (eightByteSize == 4)
+ {
+ mask |= ShuffleEntry::FPSINGLEMASK;
+ }
+ }
+ else
+ {
+ _ASSERTE(m_currentGenRegIndex < m_argLocDesc->m_cGenReg);
+ index = m_argLocDesc->m_idxGenReg + m_currentGenRegIndex;
+ m_currentGenRegIndex++;
+ }
+
+ return (UINT16)index | mask;
+ }
+
+ // There are no more offsets to get; the caller should not have called us.
+ _ASSERTE(false);
+ return 0;
}
+#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
- // If we get here we must have at least one stack slot left to shuffle (this method should only be called
- // when AnythingToShuffle(pArg) == true).
- _ASSERTE(pArg->m_cStack > 0);
- pArg->m_cStack--;
+public:
- // Delegates cannot handle overly large argument stacks due to shuffle entry encoding limitations.
- if (pArg->m_idxStack >= ShuffleEntry::REGMASK)
- COMPlusThrow(kNotSupportedException);
+ // Construct the iterator for the ArgLocDesc
+ ShuffleIterator(ArgLocDesc* argLocDesc)
+ :
+ m_argLocDesc(argLocDesc),
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ m_currentEightByte(0),
+#endif
+ m_currentGenRegIndex(0),
+ m_currentFloatRegIndex(0),
+ m_currentStackSlotIndex(0)
+ {
+ }
- return (UINT16)(pArg->m_idxStack++);
-}
+ // Check if there are more offsets to shuffle
+ bool HasNextOfs()
+ {
+ return (m_currentGenRegIndex < m_argLocDesc->m_cGenReg) ||
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ (m_currentFloatRegIndex < m_argLocDesc->m_cFloatReg) ||
+#endif
+ (m_currentStackSlotIndex < m_argLocDesc->m_cStack);
+ }
+
+ // Get next offset to shuffle. There has to be at least one offset left.
+ UINT16 GetNextOfs()
+ {
+ int index;
+
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
+ // Check if the argLocDesc is for a struct in registers
+ EEClass* eeClass = m_argLocDesc->m_eeClass;
+ if (m_argLocDesc->m_eeClass != 0)
+ {
+ return GetNextOfsInStruct();
+ }
+
+ // Shuffle float registers first
+ if (m_currentFloatRegIndex < m_argLocDesc->m_cFloatReg)
+ {
+ index = m_argLocDesc->m_idxFloatReg + m_currentFloatRegIndex;
+ m_currentFloatRegIndex++;
+
+ return (UINT16)index | ShuffleEntry::REGMASK | ShuffleEntry::FPREGMASK;
+ }
+#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+ // Shuffle any registers first (the order matters since otherwise we could end up shuffling a stack slot
+ // over a register we later need to shuffle down as well).
+ if (m_currentGenRegIndex < m_argLocDesc->m_cGenReg)
+ {
+ index = m_argLocDesc->m_idxGenReg + m_currentGenRegIndex;
+ m_currentGenRegIndex++;
+
+ return (UINT16)index | ShuffleEntry::REGMASK;
+ }
+
+ // If we get here we must have at least one stack slot left to shuffle (this method should only be called
+ // when HasNextOfs() returns true).
+ if (m_currentStackSlotIndex < m_argLocDesc->m_cStack)
+ {
+ index = m_argLocDesc->m_idxStack + m_currentStackSlotIndex;
+ m_currentStackSlotIndex++;
+
+ // Delegates cannot handle overly large argument stacks due to shuffle entry encoding limitations.
+ if (index >= ShuffleEntry::REGMASK)
+ {
+ COMPlusThrow(kNotSupportedException);
+ }
+
+ return (UINT16)index;
+ }
+
+ // There are no more offsets to get; the caller should not have called us.
+ _ASSERTE(false);
+ return 0;
+ }
+};
#endif
@@ -247,8 +359,11 @@ VOID GenerateShuffleArray(MethodDesc* pInvoke, MethodDesc *pTargetMeth, SArray<S
sArgPlacerSrc.GetThisLoc(&sArgDst);
- entry.srcofs = ShuffleOfs(&sArgSrc);
- entry.dstofs = ShuffleOfs(&sArgDst);
+ ShuffleIterator iteratorSrc(&sArgSrc);
+ ShuffleIterator iteratorDst(&sArgDst);
+
+ entry.srcofs = iteratorSrc.GetNextOfs();
+ entry.dstofs = iteratorDst.GetNextOfs();
pShuffleEntryArray->Append(entry);
}
@@ -261,8 +376,11 @@ VOID GenerateShuffleArray(MethodDesc* pInvoke, MethodDesc *pTargetMeth, SArray<S
sArgPlacerSrc.GetRetBuffArgLoc(&sArgSrc);
sArgPlacerDst.GetRetBuffArgLoc(&sArgDst);
- entry.srcofs = ShuffleOfs(&sArgSrc);
- entry.dstofs = ShuffleOfs(&sArgDst);
+ ShuffleIterator iteratorSrc(&sArgSrc);
+ ShuffleIterator iteratorDst(&sArgDst);
+
+ entry.srcofs = iteratorSrc.GetNextOfs();
+ entry.dstofs = iteratorDst.GetNextOfs();
// Depending on the type of target method (static vs instance) the return buffer argument may end up
// in the same register in both signatures. So we only commit the entry (by moving the entry pointer
@@ -271,34 +389,76 @@ VOID GenerateShuffleArray(MethodDesc* pInvoke, MethodDesc *pTargetMeth, SArray<S
pShuffleEntryArray->Append(entry);
}
- // Iterate all the regular arguments. mapping source registers and stack locations to the corresponding
- // destination locations.
- while ((ofsSrc = sArgPlacerSrc.GetNextOffset()) != TransitionBlock::InvalidOffset)
- {
- ofsDst = sArgPlacerDst.GetNextOffset();
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // The shuffle entries are produced in two passes on Unix AMD64. The first pass generates shuffle entries for
+ // all cases except shuffling a struct argument from the stack into registers, which is performed in the second pass.
+ // The reason is that if such a struct argument contains a floating point field and is followed by a
+ // floating point argument, generating the code that transfers the struct from the stack into registers would
+ // overwrite the xmm register of the floating point argument before it could be shuffled.
+ // For example, consider this case:
+ // struct S { int x; float y; };
+ // void fn(long a, long b, long c, long d, long e, S f, float g);
+ // src: rdi = this, rsi = a, rdx = b, rcx = c, r8 = d, r9 = e, stack: f, xmm0 = g
+ // dst: rdi = a, rsi = b, rdx = c, rcx = d, r8 = e, r9 = S.x, xmm0 = s.y, xmm1 = g
+ for (int pass = 0; pass < 2; pass++)
+#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
+ {
+ // Iterate all the regular arguments. mapping source registers and stack locations to the corresponding
+ // destination locations.
+ while ((ofsSrc = sArgPlacerSrc.GetNextOffset()) != TransitionBlock::InvalidOffset)
+ {
+ ofsDst = sArgPlacerDst.GetNextOffset();
- // Find the argument location mapping for both source and destination signature. A single argument can
- // occupy a floating point register (in which case we don't need to do anything, they're not shuffled)
- // or some combination of general registers and the stack.
- sArgPlacerSrc.GetArgLoc(ofsSrc, &sArgSrc);
- sArgPlacerDst.GetArgLoc(ofsDst, &sArgDst);
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ bool shuffleStructFromStackToRegs = (ofsSrc != TransitionBlock::StructInRegsOffset) && (ofsDst == TransitionBlock::StructInRegsOffset);
+ if (((pass == 0) && shuffleStructFromStackToRegs) ||
+ ((pass == 1) && !shuffleStructFromStackToRegs))
+ {
+ continue;
+ }
+#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // Find the argument location mapping for both source and destination signature. A single argument can
+ // occupy a floating point register (in which case we don't need to do anything, they're not shuffled)
+ // or some combination of general registers and the stack.
+ sArgPlacerSrc.GetArgLoc(ofsSrc, &sArgSrc);
+ sArgPlacerDst.GetArgLoc(ofsDst, &sArgDst);
+
+ ShuffleIterator iteratorSrc(&sArgSrc);
+ ShuffleIterator iteratorDst(&sArgDst);
+
+ // Shuffle each slot in the argument (register or stack slot) from source to destination.
+ while (iteratorSrc.HasNextOfs())
+ {
+ // Locate the next slot to shuffle in the source and destination and encode the transfer into a
+ // shuffle entry.
+ entry.srcofs = iteratorSrc.GetNextOfs();
+ entry.dstofs = iteratorDst.GetNextOfs();
+
+ // Only emit this entry if it's not a no-op (i.e. the source and destination locations are
+ // different).
+ if (entry.srcofs != entry.dstofs)
+ pShuffleEntryArray->Append(entry);
+ }
- // Shuffle each slot in the argument (register or stack slot) from source to destination.
- while (AnythingToShuffle(&sArgSrc))
+ // We should have run out of slots to shuffle in the destination at the same time as the source.
+ _ASSERTE(!iteratorDst.HasNextOfs());
+ }
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (pass == 0)
{
- // Locate the next slot to shuffle in the source and destination and encode the transfer into a
- // shuffle entry.
- entry.srcofs = ShuffleOfs(&sArgSrc);
- entry.dstofs = ShuffleOfs(&sArgDst);
+ // Reset the iterator for the 2nd pass
+ sSigSrc.Reset();
+ sSigDst.Reset();
- // Only emit this entry if it's not a no-op (i.e. the source and destination locations are
- // different).
- if (entry.srcofs != entry.dstofs)
- pShuffleEntryArray->Append(entry);
- }
+ sArgPlacerSrc = ArgIterator(&sSigSrc);
+ sArgPlacerDst = ArgIterator(&sSigDst);
- // We should have run out of slots to shuffle in the destination at the same time as the source.
- _ASSERTE(!AnythingToShuffle(&sArgDst));
+ if (sSigDst.HasThis())
+ {
+ sArgPlacerSrc.GetNextOffset();
+ }
+ }
+#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
}
entry.srcofs = ShuffleEntry::SENTINEL;
@@ -1323,7 +1483,7 @@ OBJECTREF COMDelegate::ConvertToDelegate(LPVOID pCallback, MethodTable* pMT)
// Lookup the callsite in the hash, if found, we can map this call back to its managed function.
// Otherwise, we'll treat this as an unmanaged callsite.
- // Make sure that the pointer doesn't have the value of 1 which is our hash table deleted item marker.
+ // Make sure that the pointer doesn't have the value of 1 which is our hash table deleted item marker.
LPVOID DelegateHnd = (pUMEntryThunk != NULL) && ((UPTR)pUMEntryThunk != (UPTR)1)
? COMDelegate::s_pDelegateToFPtrHash->LookupValue((UPTR)pUMEntryThunk, 0)
: (LPVOID)INVALIDENTRY;
diff --git a/src/vm/comdelegate.h b/src/vm/comdelegate.h
index cfb9afa783..ab8ca04338 100644
--- a/src/vm/comdelegate.h
+++ b/src/vm/comdelegate.h
@@ -211,10 +211,14 @@ void DistributeUnhandledExceptionReliably(OBJECTREF *pDelegate,
// signature.
struct ShuffleEntry
{
+ // Offset masks and special value
enum {
- REGMASK = 0x8000,
- OFSMASK = 0x7fff,
- SENTINEL = 0xffff,
+ REGMASK = 0x8000, // Register offset bit
+ FPREGMASK = 0x4000, // Floating point register bit
+ FPSINGLEMASK = 0x2000, // Single precision floating point register
+ OFSMASK = 0x7fff, // Mask to get stack offset
+ OFSREGMASK = 0x1fff, // Mask to get register index
+ SENTINEL = 0xffff, // Indicates end of shuffle array
};
#if defined(_TARGET_AMD64_) && !defined(UNIX_AMD64_ABI)
@@ -224,17 +228,11 @@ struct ShuffleEntry
};
#else
- // Special values:
- // -1 - indicates end of shuffle array: stacksizedelta
- // == difference in stack size between virtual and static sigs.
- // high bit - indicates a register argument: mask it off and
- // the result is an offset into ArgumentRegisters.
-
UINT16 srcofs;
union {
UINT16 dstofs; //if srcofs != SENTINEL
- UINT16 stacksizedelta; //if dstofs == SENTINEL
+ UINT16 stacksizedelta; //if dstofs == SENTINEL, difference in stack size between virtual and static sigs
};
#endif // _TARGET_AMD64_
};
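A short sketch of how a single shuffle offset built with these masks decodes; the helper is illustrative and not part of this change, EmitShuffleThunk further down is the real consumer:

    static void DescribeShuffleOfs(UINT16 ofs)
    {
        if (ofs & ShuffleEntry::REGMASK)
        {
            int regIndex  = ofs & ShuffleEntry::OFSREGMASK;          // register number
            bool isFloat  = (ofs & ShuffleEntry::FPREGMASK) != 0;    // xmm register if set
            bool isSingle = (ofs & ShuffleEntry::FPSINGLEMASK) != 0; // 4-byte float, loaded with movss
            (void)regIndex; (void)isFloat; (void)isSingle;
        }
        else
        {
            int stackSlot = ofs & ShuffleEntry::OFSMASK;             // stack slot index
            (void)stackSlot;
        }
    }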
diff --git a/src/vm/compile.cpp b/src/vm/compile.cpp
index 5b33792d35..23242df1db 100644
--- a/src/vm/compile.cpp
+++ b/src/vm/compile.cpp
@@ -76,6 +76,8 @@
#endif
#include "tritonstress.h"
+#include "argdestination.h"
+
#ifdef CROSSGEN_COMPILE
CompilationDomain * theDomain;
#endif
@@ -1483,7 +1485,8 @@ void FakeGcScanRoots(MetaSig& msig, ArgIterator& argit, MethodDesc * pMD, BYTE *
int argOffset;
while ((argOffset = argit.GetNextOffset()) != TransitionBlock::InvalidOffset)
{
- msig.GcScanRoots(pFrame + argOffset, &FakePromote, &sc, &FakePromoteCarefully);
+ ArgDestination argDest(pFrame, argOffset, argit.GetArgLocDescForStructInRegs());
+ msig.GcScanRoots(&argDest, &FakePromote, &sc, &FakePromoteCarefully);
}
}
@@ -1933,7 +1936,17 @@ BOOL CanDeduplicateCode(CORINFO_METHOD_HANDLE method, CORINFO_METHOD_HANDLE dupl
return FALSE;
#endif // _TARGET_X86_
- if (pMethod->ReturnsObject() != pDuplicateMethod->ReturnsObject())
+ MetaSig::RETURNTYPE returnType = pMethod->ReturnsObject();
+ MetaSig::RETURNTYPE returnTypeDuplicate = pDuplicateMethod->ReturnsObject();
+
+ if (returnType != returnTypeDuplicate)
+ return FALSE;
+
+ //
+ // Do not enable deduplication of structs returned in registers
+ //
+
+ if (returnType == MetaSig::RETVALUETYPE)
return FALSE;
//
diff --git a/src/vm/crossdomaincalls.cpp b/src/vm/crossdomaincalls.cpp
index fa04b57faa..dd695fe5f1 100644
--- a/src/vm/crossdomaincalls.cpp
+++ b/src/vm/crossdomaincalls.cpp
@@ -1264,7 +1264,7 @@ CrossDomainChannel::BlitAndCall()
MetaSig mSig(m_pCliMD, thDeclaringType);
ArgIterator argit(&mSig);
- int offset;
+ int offset;
while (TransitionBlock::InvalidOffset != (offset = argit.GetNextOffset()))
{
int regArgNum = TransitionBlock::GetArgumentIndexFromOffset(offset);
@@ -2068,7 +2068,7 @@ CrossDomainChannel::MarshalAndCall()
CDC_DETERMINE_DECLARING_TYPE(m_pCliMD, TypeHandle(CTPMethodTable::GetMethodTableBeingProxied(m_pFrame->GetThis())));
MetaSig mSig(m_pCliMD, thDeclaringType);
ArgIterator argit(&mSig);
- int ofs;
+ int ofs;
// NumFixedArgs() doesn't count the "this" object, but SizeOfFrameArgumentArray() does.
dwNumArgs = mSig.NumFixedArgs();
@@ -2141,7 +2141,7 @@ CrossDomainChannel::MarshalAndCall()
TADDR pTransitionBlock = m_pFrame->GetTransitionBlock();
for (int argNum = 0;
- TransitionBlock::InvalidOffset != (ofs = argit.GetNextOffset());
+ TransitionBlock::InvalidOffset != (ofs = argit.GetNextOffset());
argNum++
)
{
diff --git a/src/vm/eetwain.cpp b/src/vm/eetwain.cpp
index 5df7b6305a..dbbfac9000 100644
--- a/src/vm/eetwain.cpp
+++ b/src/vm/eetwain.cpp
@@ -18,6 +18,7 @@
#include "gcinfodecoder.h"
#endif
+#include "argdestination.h"
#define X86_INSTR_W_TEST_ESP 0x4485 // test [esp+N], eax
#define X86_INSTR_TEST_ESP_SIB 0x24
@@ -4071,7 +4072,10 @@ void promoteVarArgs(PTR_BYTE argsStart, PTR_VASigCookie varArgSig, GCCONTEXT* ct
// if skipFixedArgs is false we report all arguments
// otherwise we just report the varargs.
if (!skipFixedArgs || inVarArgs)
- msig.GcScanRoots(pFrameBase + argOffset, ctx->f, ctx->sc);
+ {
+ ArgDestination argDest(pFrameBase, argOffset, argit.GetArgLocDescForStructInRegs());
+ msig.GcScanRoots(&argDest, ctx->f, ctx->sc);
+ }
}
}
diff --git a/src/vm/fcall.h b/src/vm/fcall.h
index 2bf6080706..8cfcc3e68e 100644
--- a/src/vm/fcall.h
+++ b/src/vm/fcall.h
@@ -1318,9 +1318,8 @@ typedef UINT16 FC_UINT16_RET;
// FC_TypedByRef should be used for TypedReferences in FCall signatures
-#ifdef UNIX_AMD64_ABI
+#if defined(UNIX_AMD64_ABI) && !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
// Explicitly pass the TypedReferences by reference
-// UNIXTODO: Remove once the proper managed calling convention for struct is in place
#define FC_TypedByRef TypedByRef&
#define FC_DECIMAL DECIMAL&
#else
diff --git a/src/vm/field.h b/src/vm/field.h
index a278c4d12c..9fc5583c2f 100644
--- a/src/vm/field.h
+++ b/src/vm/field.h
@@ -223,7 +223,6 @@ public:
DWORD GetOffset()
{
LIMITED_METHOD_DAC_CONTRACT;
-
g_IBCLogger.LogFieldDescsAccess(this);
return GetOffset_NoLogging();
}
diff --git a/src/vm/fieldmarshaler.h b/src/vm/fieldmarshaler.h
index d67637e27c..ee464e4c05 100644
--- a/src/vm/fieldmarshaler.h
+++ b/src/vm/fieldmarshaler.h
@@ -396,7 +396,7 @@ public:
m_dwExternalOffset = dwExternalOffset;
}
- UINT32 GetExternalOffset()
+ UINT32 GetExternalOffset() const
{
LIMITED_METHOD_CONTRACT;
return m_dwExternalOffset;
diff --git a/src/vm/frames.cpp b/src/vm/frames.cpp
index 1c7f2f4348..f4d96e5f5d 100644
--- a/src/vm/frames.cpp
+++ b/src/vm/frames.cpp
@@ -45,6 +45,8 @@
#include "interpreter.h"
#endif // FEATURE_INTERPRETER
+#include "argdestination.h"
+
#if CHECK_APP_DOMAIN_LEAKS
#define CHECK_APP_DOMAIN GC_CALL_CHECK_APP_DOMAIN
#else
@@ -1278,7 +1280,8 @@ void TransitionFrame::PromoteCallerStackHelper(promote_func* fn, ScanContext* sc
int argOffset;
while ((argOffset = argit.GetNextOffset()) != TransitionBlock::InvalidOffset)
{
- pmsig->GcScanRoots(dac_cast<PTR_VOID>(pTransitionBlock + argOffset), fn, sc);
+ ArgDestination argDest(dac_cast<PTR_VOID>(pTransitionBlock), argOffset, argit.GetArgLocDescForStructInRegs());
+ pmsig->GcScanRoots(&argDest, fn, sc);
}
}
diff --git a/src/vm/i386/stublinkerx86.cpp b/src/vm/i386/stublinkerx86.cpp
index e42f7d792f..b86151243c 100644
--- a/src/vm/i386/stublinkerx86.cpp
+++ b/src/vm/i386/stublinkerx86.cpp
@@ -4001,16 +4001,49 @@ VOID StubLinkerCPU::EmitShuffleThunk(ShuffleEntry *pShuffleEntryArray)
{
// If source is present in register then destination must also be a register
_ASSERTE(pEntry->dstofs & ShuffleEntry::REGMASK);
+ // Both the srcofs and dstofs must be of the same kind of registers - float or general purpose.
+ _ASSERTE((pEntry->dstofs & ShuffleEntry::FPREGMASK) == (pEntry->srcofs & ShuffleEntry::FPREGMASK));
- X86EmitMovRegReg(c_argRegs[pEntry->dstofs & ShuffleEntry::OFSMASK], c_argRegs[pEntry->srcofs & ShuffleEntry::OFSMASK]);
+ int dstRegIndex = pEntry->dstofs & ShuffleEntry::OFSREGMASK;
+ int srcRegIndex = pEntry->srcofs & ShuffleEntry::OFSREGMASK;
+
+ if (pEntry->srcofs & ShuffleEntry::FPREGMASK)
+ {
+ // movdqa dstReg, srcReg
+ X64EmitMovXmmXmm((X86Reg)(kXMM0 + dstRegIndex), (X86Reg)(kXMM0 + srcRegIndex));
+ }
+ else
+ {
+ // mov dstReg, srcReg
+ X86EmitMovRegReg(c_argRegs[dstRegIndex], c_argRegs[srcRegIndex]);
+ }
}
else if (pEntry->dstofs & ShuffleEntry::REGMASK)
{
// source must be on the stack
_ASSERTE(!(pEntry->srcofs & ShuffleEntry::REGMASK));
- // mov dstreg, [rax + src]
- X86EmitIndexRegLoad(c_argRegs[pEntry->dstofs & ShuffleEntry::OFSMASK], SCRATCH_REGISTER_X86REG, (pEntry->srcofs + 1) * sizeof(void*));
+ int dstRegIndex = pEntry->dstofs & ShuffleEntry::OFSREGMASK;
+ int srcOffset = (pEntry->srcofs + 1) * sizeof(void*);
+
+ if (pEntry->dstofs & ShuffleEntry::FPREGMASK)
+ {
+ if (pEntry->dstofs & ShuffleEntry::FPSINGLEMASK)
+ {
+ // movss dstReg, [rax + src]
+ X64EmitMovSSFromMem((X86Reg)(kXMM0 + dstRegIndex), SCRATCH_REGISTER_X86REG, srcOffset);
+ }
+ else
+ {
+ // movsd dstReg, [rax + src]
+ X64EmitMovSDFromMem((X86Reg)(kXMM0 + dstRegIndex), SCRATCH_REGISTER_X86REG, srcOffset);
+ }
+ }
+ else
+ {
+ // mov dstreg, [rax + src]
+ X86EmitIndexRegLoad(c_argRegs[dstRegIndex], SCRATCH_REGISTER_X86REG, srcOffset);
+ }
}
else
{
diff --git a/src/vm/ilmarshalers.h b/src/vm/ilmarshalers.h
index 5a2453b603..1bd072f417 100644
--- a/src/vm/ilmarshalers.h
+++ b/src/vm/ilmarshalers.h
@@ -601,7 +601,7 @@ public:
nativeSize = wNativeSize;
}
-#ifndef _TARGET_ARM_
+#if !defined(_TARGET_ARM_) && !(defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING))
switch (nativeSize)
{
case 1: typ = ELEMENT_TYPE_U1; break;
diff --git a/src/vm/invokeutil.cpp b/src/vm/invokeutil.cpp
index ee80056abe..e17458ce1d 100644
--- a/src/vm/invokeutil.cpp
+++ b/src/vm/invokeutil.cpp
@@ -28,6 +28,7 @@
#include "eeconfig.h"
#include "generics.h"
#include "runtimehandles.h"
+#include "argdestination.h"
#ifndef CROSSGEN_COMPILE
@@ -130,7 +131,7 @@ void *InvokeUtil::GetIntPtrValue(OBJECTREF pObj) {
RETURN *(void **)((pObj)->UnBox());
}
-void InvokeUtil::CopyArg(TypeHandle th, OBJECTREF *pObjUNSAFE, void *pArgDst) {
+void InvokeUtil::CopyArg(TypeHandle th, OBJECTREF *pObjUNSAFE, ArgDestination *argDest) {
CONTRACTL {
THROWS;
GC_NOTRIGGER; // Caller does not protect object references
@@ -140,7 +141,9 @@ void InvokeUtil::CopyArg(TypeHandle th, OBJECTREF *pObjUNSAFE, void *pArgDst) {
INJECT_FAULT(COMPlusThrowOM());
}
CONTRACTL_END;
-
+
+ void *pArgDst = argDest->GetDestinationAddress();
+
OBJECTREF rObj = *pObjUNSAFE;
MethodTable* pMT;
CorElementType oType;
@@ -204,12 +207,12 @@ void InvokeUtil::CopyArg(TypeHandle th, OBJECTREF *pObjUNSAFE, void *pArgDst) {
case ELEMENT_TYPE_VALUETYPE:
{
- // If we got the univeral zero...Then assign it and exit.
+ // If we got the universal zero...Then assign it and exit.
if (rObj == 0) {
- InitValueClass(pArgDst, th.AsMethodTable());
+ InitValueClassArg(argDest, th.AsMethodTable());
}
else {
- if (!th.AsMethodTable()->UnBoxInto(pArgDst, rObj))
+ if (!th.AsMethodTable()->UnBoxIntoArg(argDest, rObj))
COMPlusThrow(kArgumentException, W("Arg_ObjObj"));
}
break;
diff --git a/src/vm/invokeutil.h b/src/vm/invokeutil.h
index f2acb61f9e..14d7dc8e14 100644
--- a/src/vm/invokeutil.h
+++ b/src/vm/invokeutil.h
@@ -44,6 +44,7 @@ struct InterfaceMapData
#include <poppack.h>
class ReflectMethodList;
+class ArgDestination;
// Structure used to track security access checks efficiently when applied
// across a range of methods, fields etc.
@@ -114,7 +115,7 @@ class InvokeUtil
{
public:
- static void CopyArg(TypeHandle th, OBJECTREF *obj, void *pArgDst);
+ static void CopyArg(TypeHandle th, OBJECTREF *obj, ArgDestination *argDest);
// Given a type, this routine will convert an return value representing that
// type into an ObjectReference. If the type is a primitive, the
diff --git a/src/vm/jitinterface.cpp b/src/vm/jitinterface.cpp
index ba6aebb3cc..442fb91186 100644
--- a/src/vm/jitinterface.cpp
+++ b/src/vm/jitinterface.cpp
@@ -58,7 +58,6 @@
#include "runtimehandles.h"
#include "sigbuilder.h"
#include "openum.h"
-
#ifdef HAVE_GCCOVER
#include "gccover.h"
#endif // HAVE_GCCOVER
@@ -1651,7 +1650,6 @@ void CEEInfo::getFieldInfo (CORINFO_RESOLVED_TOKEN * pResolvedToken,
DWORD fieldFlags = 0;
pResult->offset = pField->GetOffset();
-
if (pField->IsStatic())
{
#ifdef FEATURE_LEGACYNETCF
@@ -1850,7 +1848,6 @@ void CEEInfo::getFieldInfo (CORINFO_RESOLVED_TOKEN * pResolvedToken,
if (!(flags & CORINFO_ACCESS_INLINECHECK))
{
-
//get the field's type. Grab the class for structs.
pResult->fieldType = getFieldTypeInternal(pResolvedToken->hField, &pResult->structType, pResolvedToken->hClass);
@@ -2568,9 +2565,82 @@ bool CEEInfo::getSystemVAmd64PassStructInRegisterDescriptor(
/*IN*/ CORINFO_CLASS_HANDLE structHnd,
/*OUT*/ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* structPassInRegDescPtr)
{
- LIMITED_METHOD_CONTRACT;
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF)
+ JIT_TO_EE_TRANSITION();
+
+ _ASSERTE(structPassInRegDescPtr != nullptr);
+ TypeHandle th(structHnd);
+
+ // Make sure this is a value type.
+ if (th.IsValueType())
+ {
+ _ASSERTE(CorInfoType2UnixAmd64Classification(th.GetInternalCorElementType()) == SystemVClassificationTypeStruct);
+
+ MethodTable* methodTablePtr = nullptr;
+ bool isNativeStruct = false;
+ if (!th.IsTypeDesc())
+ {
+ methodTablePtr = th.AsMethodTable();
+ _ASSERTE(methodTablePtr != nullptr);
+ }
+ else if (th.IsTypeDesc())
+ {
+ if (th.IsNativeValueType())
+ {
+ methodTablePtr = th.AsNativeValueType();
+ isNativeStruct = true;
+ _ASSERTE(methodTablePtr != nullptr);
+ }
+ else
+ {
+ _ASSERTE(false && "Unhandled TypeHandle for struct!");
+ }
+ }
+
+ bool isPassableInRegs = false;
+
+ if (isNativeStruct)
+ {
+ isPassableInRegs = methodTablePtr->GetLayoutInfo()->IsNativeStructPassedInRegisters();
+ }
+ else
+ {
+ isPassableInRegs = methodTablePtr->IsRegPassedStruct();
+ }
+
+ if (!isPassableInRegs)
+ {
+ structPassInRegDescPtr->passedInRegisters = false;
+ }
+ else
+ {
+ structPassInRegDescPtr->passedInRegisters = true;
+
+ SystemVStructRegisterPassingHelper helper((unsigned int)th.GetSize());
+ bool result = methodTablePtr->ClassifyEightBytes(&helper, 0, 0);
+
+ structPassInRegDescPtr->eightByteCount = helper.eightByteCount;
+ _ASSERTE(structPassInRegDescPtr->eightByteCount <= CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+ for (unsigned int i = 0; i < CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS; i++)
+ {
+ structPassInRegDescPtr->eightByteClassifications[i] = helper.eightByteClassifications[i];
+ structPassInRegDescPtr->eightByteSizes[i] = helper.eightByteSizes[i];
+ structPassInRegDescPtr->eightByteOffsets[i] = helper.eightByteOffsets[i];
+ }
+ }
+ }
+ else
+ {
+ structPassInRegDescPtr->passedInRegisters = false;
+ }
+
+ EE_TO_JIT_TRANSITION();
+
+ return true;
+#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF)
return false;
+#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF)
}
/*********************************************************************/
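A hypothetical caller of the descriptor API above might look like the sketch below; the handle name pJitInfo and the surrounding control flow are illustrative, while the descriptor fields are the ones filled in by this change:

    SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR desc;
    if (pJitInfo->getSystemVAmd64PassStructInRegisterDescriptor(structHnd, &desc) &&
        desc.passedInRegisters)
    {
        for (unsigned int i = 0; i < desc.eightByteCount; i++)
        {
            // desc.eightByteClassifications[i] selects a general purpose vs SSE register;
            // desc.eightByteOffsets[i] and desc.eightByteSizes[i] describe the slice of
            // the struct value that is loaded into that register.
        }
    }
    else
    {
        // The struct is passed on the stack (or by the existing convention when
        // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF is not defined).
    }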
diff --git a/src/vm/message.cpp b/src/vm/message.cpp
index d8bdb3d2c8..dab78f46e6 100644
--- a/src/vm/message.cpp
+++ b/src/vm/message.cpp
@@ -752,7 +752,7 @@ FCIMPL2(FC_BOOL_RET, CMessage::Dispatch, MessageObject* pMessageUNSAFE, Object*
int ofs;
while ((ofs = argit.GetNextOffset()) != TransitionBlock::InvalidOffset)
{
- if (TransitionBlock::IsFloatArgumentRegisterOffset(ofs))
+ if (TransitionBlock::HasFloatRegister(ofs, argit.GetArgLocDescForStructInRegs()))
{
// Found a floating point argument register. The first time we find this we point
// pFloatArgumentRegisters to the part of the frame where these values were spilled (we don't do
@@ -772,7 +772,7 @@ FCIMPL2(FC_BOOL_RET, CMessage::Dispatch, MessageObject* pMessageUNSAFE, Object*
DWORD_PTR dwRegTypeMap = 0;
{
- int ofs;
+ int ofs;
while ((ofs = argit.GetNextOffset()) != TransitionBlock::InvalidOffset)
{
int regArgNum = TransitionBlock::GetArgumentIndexFromOffset(ofs);
diff --git a/src/vm/method.cpp b/src/vm/method.cpp
index 6926ce4b6e..3e7271b1fb 100644
--- a/src/vm/method.cpp
+++ b/src/vm/method.cpp
@@ -1396,8 +1396,9 @@ COR_ILMETHOD* MethodDesc::GetILHeader(BOOL fAllowOverrides /*=FALSE*/)
//*******************************************************************************
MetaSig::RETURNTYPE MethodDesc::ReturnsObject(
#ifdef _DEBUG
- bool supportStringConstructors
+ bool supportStringConstructors,
#endif
+ MethodTable** pMT
)
{
CONTRACTL
@@ -1439,7 +1440,19 @@ MetaSig::RETURNTYPE MethodDesc::ReturnsObject(
if (!thValueType.IsTypeDesc())
{
MethodTable * pReturnTypeMT = thValueType.AsMethodTable();
- if(pReturnTypeMT->ContainsPointers())
+ if (pMT != NULL)
+ {
+ *pMT = pReturnTypeMT;
+ }
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (pReturnTypeMT->IsRegPassedStruct())
+ {
+ return MetaSig::RETVALUETYPE;
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+ if (pReturnTypeMT->ContainsPointers())
{
_ASSERTE(pReturnTypeMT->GetNumInstanceFieldBytes() == sizeof(void*));
return MetaSig::RETOBJ;
diff --git a/src/vm/method.hpp b/src/vm/method.hpp
index 0f283e5c79..680662b94c 100644
--- a/src/vm/method.hpp
+++ b/src/vm/method.hpp
@@ -1611,8 +1611,9 @@ public:
// does this function return an object reference?
MetaSig::RETURNTYPE ReturnsObject(
#ifdef _DEBUG
- bool supportStringConstructors = false
+ bool supportStringConstructors = false,
#endif
+ MethodTable** pMT = NULL
);
diff --git a/src/vm/methodtable.cpp b/src/vm/methodtable.cpp
index de660268e4..e632ce3700 100644
--- a/src/vm/methodtable.cpp
+++ b/src/vm/methodtable.cpp
@@ -39,9 +39,12 @@
#include "dbginterface.h"
#include "comdelegate.h"
#include "eventtrace.h"
+#include "fieldmarshaler.h"
+
#ifdef FEATURE_REMOTING
#include "remoting.h"
#endif
+
#include "eeprofinterfaces.h"
#include "dllimportcallback.h"
#include "listlock.h"
@@ -2275,6 +2278,916 @@ BOOL MethodTable::IsClassPreInited()
#pragma optimize("", on)
#endif // _MSC_VER
+//========================================================================================
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF)
+
+#if defined(_DEBUG) && defined(LOGGING)
+static
+const char* GetSystemVClassificationTypeName(SystemVClassificationType t)
+{
+ switch (t)
+ {
+ case SystemVClassificationTypeUnknown: return "Unknown";
+ case SystemVClassificationTypeStruct: return "Struct";
+ case SystemVClassificationTypeNoClass: return "NoClass";
+ case SystemVClassificationTypeMemory: return "Memory";
+ case SystemVClassificationTypeInteger: return "Integer";
+ case SystemVClassificationTypeIntegerReference: return "IntegerReference";
+ case SystemVClassificationTypeSSE: return "SSE";
+ default: return "ERROR";
+ }
+};
+#endif // _DEBUG && LOGGING
+
+// If we have a field classification already, but there is a union, we must merge the classification type of the field. Returns the
+// new, merged classification type.
+/* static */
+SystemVClassificationType MethodTable::ReClassifyField(SystemVClassificationType originalClassification, SystemVClassificationType newFieldClassification)
+{
+ _ASSERTE((newFieldClassification == SystemVClassificationTypeInteger) ||
+ (newFieldClassification == SystemVClassificationTypeIntegerReference) ||
+ (newFieldClassification == SystemVClassificationTypeSSE));
+
+ switch (newFieldClassification)
+ {
+ case SystemVClassificationTypeInteger:
+ // Integer overrides everything; the resulting classification is Integer. Can't merge Integer and IntegerReference.
+ _ASSERTE((originalClassification == SystemVClassificationTypeInteger) ||
+ (originalClassification == SystemVClassificationTypeSSE));
+
+ return SystemVClassificationTypeInteger;
+
+ case SystemVClassificationTypeSSE:
+ // If the old and new classifications are both SSE, then the merge is SSE, otherwise it will be integer. Can't merge SSE and IntegerReference.
+ _ASSERTE((originalClassification == SystemVClassificationTypeInteger) ||
+ (originalClassification == SystemVClassificationTypeSSE));
+
+ if (originalClassification == SystemVClassificationTypeSSE)
+ {
+ return SystemVClassificationTypeSSE;
+ }
+ else
+ {
+ return SystemVClassificationTypeInteger;
+ }
+
+ case SystemVClassificationTypeIntegerReference:
+ // IntegerReference can only merge with IntegerReference.
+ _ASSERTE(originalClassification == SystemVClassificationTypeIntegerReference);
+ return SystemVClassificationTypeIntegerReference;
+
+ default:
+ _ASSERTE(false); // Unexpected type.
+ return SystemVClassificationTypeUnknown;
+ }
+}
+
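An illustrative use of the merge rule above, with hypothetical overlapping fields not taken from this change (the sketch assumes the helper is reachable from the calling context):

    // An SSE field overlapped by an Integer field at the same offset classifies as Integer.
    SystemVClassificationType merged =
        MethodTable::ReClassifyField(SystemVClassificationTypeSSE, SystemVClassificationTypeInteger);
    _ASSERTE(merged == SystemVClassificationTypeInteger);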
+// Returns 'true' if the struct is passed in registers, 'false' otherwise.
+bool MethodTable::ClassifyEightBytes(SystemVStructRegisterPassingHelperPtr helperPtr, unsigned int nestingLevel, unsigned int startOffsetOfStruct)
+{
+ CONTRACTL
+ {
+ THROWS;
+ GC_TRIGGERS;
+ SO_TOLERANT;
+ MODE_ANY;
+ }
+ CONTRACTL_END;
+
+ WORD numIntroducedFields = GetNumIntroducedInstanceFields();
+
+ // The VM gives a struct with no fields a size of 1.
+ // Do not pass such a struct in registers.
+ if (numIntroducedFields == 0)
+ {
+ return false;
+ }
+
+ // No struct register passing with explicit layout. There may be cases where explicit layout may be still
+ // eligible for register struct passing, but it is hard to tell the real intent. Make it simple and just
+ // unconditionally disable register struct passing for explicit layout.
+ if (GetClass()->HasExplicitFieldOffsetLayout())
+ {
+ LOG((LF_JIT, LL_EVERYTHING, "%*s**** ClassifyEightBytes: struct %s has explicit layout; will not be enregistered\n",
+ nestingLevel * 5, "", this->GetDebugClassName()));
+ return false;
+ }
+#ifdef _DEBUG
+ LOG((LF_JIT, LL_EVERYTHING, "%*s**** Classify %s (%p), startOffset %d, total struct size %d\n",
+ nestingLevel * 5, "", this->GetDebugClassName(), this, startOffsetOfStruct, helperPtr->structSize));
+ int fieldNum = -1;
+#endif // _DEBUG
+
+ FieldDesc *pField = GetApproxFieldDescListRaw();
+ FieldDesc *pFieldEnd = pField + numIntroducedFields;
+
+ for (; pField < pFieldEnd; pField++)
+ {
+#ifdef _DEBUG
+ ++fieldNum;
+#endif // _DEBUG
+
+ DWORD fieldOffset = pField->GetOffset();
+ unsigned normalizedFieldOffset = fieldOffset + startOffsetOfStruct;
+
+ unsigned int fieldSize = pField->GetSize();
+ _ASSERTE(fieldSize != (unsigned int)-1);
+
+ // The field can't span past the end of the struct.
+ if ((normalizedFieldOffset + fieldSize) > helperPtr->structSize)
+ {
+ _ASSERTE(false && "Invalid struct size. The size of fields and overall size don't agree");
+ return false;
+ }
+
+ CorElementType fieldType = pField->GetFieldType();
+
+ SystemVClassificationType fieldClassificationType = CorInfoType2UnixAmd64Classification(fieldType);
+
+#ifdef _DEBUG
+ LPCUTF8 fieldName;
+ pField->GetName_NoThrow(&fieldName);
+#endif // _DEBUG
+
+ if (fieldClassificationType == SystemVClassificationTypeStruct)
+ {
+ TypeHandle th = pField->GetApproxFieldTypeHandleThrowing();
+ _ASSERTE(!th.IsNull());
+ MethodTable* pFieldMT = th.GetMethodTable();
+
+ bool inEmbeddedStructPrev = helperPtr->inEmbeddedStruct;
+ helperPtr->inEmbeddedStruct = true;
+ bool structRet = pFieldMT->ClassifyEightBytes(helperPtr, nestingLevel + 1, normalizedFieldOffset);
+ helperPtr->inEmbeddedStruct = inEmbeddedStructPrev;
+
+ if (!structRet)
+ {
+ // If the nested struct says not to enregister, there's no need to continue analyzing at this level. Just return do not enregister.
+ return false;
+ }
+
+ continue;
+ }
+
+ if ((normalizedFieldOffset % fieldSize) != 0)
+ {
+ // The ABI expects the fields of a struct passed in registers to be at their natural
+ // alignment; if a field is not naturally aligned, do not enregister the struct.
+
+ LOG((LF_JIT, LL_EVERYTHING, " %*sxxxx Field %d %s: offset %d (normalized %d), size %d not at natural alignment; not enregistering struct\n",
+ nestingLevel * 5, "", fieldNum, fieldName, fieldOffset, normalizedFieldOffset, fieldSize));
+ return false;
+ }
+
+ if ((int)normalizedFieldOffset <= helperPtr->largestFieldOffset)
+ {
+ // Find the field corresponding to this offset and update the size if needed.
+ // We assume that either it matches the offset of a previously seen field, or
+ // it is an out-of-order offset (the VM does give us structs in non-increasing
+ // offset order sometimes) that doesn't overlap any other field.
+
+ // REVIEW: will the offset ever match a previously seen field offset for cases that are NOT ExplicitLayout?
+ // If not, we can get rid of this loop, and just assume the offset is from an out-of-order field. We wouldn't
+ // need to maintain largestFieldOffset, either, since we would then assume all fields are unique. We could
+ // also get rid of ReClassifyField().
+ int i;
+ for (i = helperPtr->currentUniqueOffsetField - 1; i >= 0; i--)
+ {
+ if (helperPtr->fieldOffsets[i] == normalizedFieldOffset)
+ {
+ if (fieldSize > helperPtr->fieldSizes[i])
+ {
+ helperPtr->fieldSizes[i] = fieldSize;
+ }
+
+ helperPtr->fieldClassifications[i] = ReClassifyField(helperPtr->fieldClassifications[i], fieldClassificationType);
+
+ LOG((LF_JIT, LL_EVERYTHING, " %*sxxxx Field %d %s: offset %d (normalized %d), size %d, union with uniqueOffsetField %d, field type classification %s, reclassified field to %s\n",
+ nestingLevel * 5, "", fieldNum, fieldName, fieldOffset, normalizedFieldOffset, fieldSize, i,
+ GetSystemVClassificationTypeName(fieldClassificationType),
+ GetSystemVClassificationTypeName(helperPtr->fieldClassifications[i])));
+
+ break;
+ }
+ // Make sure the field doesn't start in the middle of another field.
+ _ASSERTE((normalizedFieldOffset < helperPtr->fieldOffsets[i]) ||
+ (normalizedFieldOffset >= helperPtr->fieldOffsets[i] + helperPtr->fieldSizes[i]));
+ }
+
+ if (i >= 0)
+ {
+ // The proper size of the union set of fields has been set above; continue to the next field.
+ continue;
+ }
+ }
+ else
+ {
+ helperPtr->largestFieldOffset = (int)normalizedFieldOffset;
+ }
+
+ // Set the data for a new field.
+
+ // The new field classification must not have been initialized yet.
+ _ASSERTE(helperPtr->fieldClassifications[helperPtr->currentUniqueOffsetField] == SystemVClassificationTypeNoClass);
+
+ // There are only a few field classifications that are allowed.
+ _ASSERTE((fieldClassificationType == SystemVClassificationTypeInteger) ||
+ (fieldClassificationType == SystemVClassificationTypeIntegerReference) ||
+ (fieldClassificationType == SystemVClassificationTypeSSE));
+
+ helperPtr->fieldClassifications[helperPtr->currentUniqueOffsetField] = fieldClassificationType;
+ helperPtr->fieldSizes[helperPtr->currentUniqueOffsetField] = fieldSize;
+ helperPtr->fieldOffsets[helperPtr->currentUniqueOffsetField] = normalizedFieldOffset;
+
+ LOG((LF_JIT, LL_EVERYTHING, " %*s**** Field %d %s: offset %d (normalized %d), size %d, currentUniqueOffsetField %d, field type classification %s, chosen field classification %s\n",
+ nestingLevel * 5, "", fieldNum, fieldName, fieldOffset, normalizedFieldOffset, fieldSize, helperPtr->currentUniqueOffsetField,
+ GetSystemVClassificationTypeName(fieldClassificationType),
+ GetSystemVClassificationTypeName(helperPtr->fieldClassifications[helperPtr->currentUniqueOffsetField])));
+
+ helperPtr->currentUniqueOffsetField++;
+ _ASSERTE(helperPtr->currentUniqueOffsetField < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT);
+ } // end per-field for loop
+
+ if (!helperPtr->inEmbeddedStruct)
+ {
+ _ASSERTE(nestingLevel == 0);
+
+ // We're at the top level of the recursion, and we're done looking at the fields.
+ // Now sort the fields by offset and set the output data.
+
+ int sortedFieldOrder[SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT];
+ for (unsigned i = 0; i < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT; i++)
+ {
+ sortedFieldOrder[i] = -1;
+ }
+
+ for (unsigned i = 0; i < helperPtr->currentUniqueOffsetField; i++)
+ {
+ _ASSERTE(helperPtr->fieldOffsets[i] < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT);
+ _ASSERTE(sortedFieldOrder[helperPtr->fieldOffsets[i]] == -1); // we haven't seen this field offset yet.
+ sortedFieldOrder[helperPtr->fieldOffsets[i]] = i;
+ }
+
+ // Set the layoutSizes (includes holes from alignment of the fields.)
+ int lastField = -1;
+ for (unsigned i = 0; i < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT; i++)
+ {
+ int ordinal = sortedFieldOrder[i];
+ if (ordinal == -1)
+ {
+ continue;
+ }
+
+ if (lastField == -1)
+ {
+ lastField = ordinal;
+ continue;
+ }
+
+ helperPtr->fieldLayoutSizes[lastField] = helperPtr->fieldOffsets[ordinal] - helperPtr->fieldOffsets[lastField];
+
+ lastField = ordinal;
+ }
+ // Now the last field
+ _ASSERTE(lastField != -1); // if lastField==-1, then the struct has no fields!
+ helperPtr->fieldLayoutSizes[lastField] = helperPtr->structSize - helperPtr->fieldOffsets[lastField];
+
+ // Calculate the eightbytes and their types.
+ unsigned int accumulatedSizeForEightByte = 0;
+ unsigned int lastEightByteOffset = 0;
+ unsigned int currentEightByte = 0;
+
+ for (unsigned i = 0; i < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT; i++)
+ {
+ int ordinal = sortedFieldOrder[i];
+ if (ordinal == -1)
+ {
+ continue;
+ }
+
+ if ((accumulatedSizeForEightByte + helperPtr->fieldLayoutSizes[ordinal]) > SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES)
+ {
+ // Save data for this eightbyte.
+ helperPtr->eightByteSizes[currentEightByte] = accumulatedSizeForEightByte;
+ helperPtr->eightByteOffsets[currentEightByte] = lastEightByteOffset;
+
+ // Set up for next eightbyte.
+ currentEightByte++;
+ _ASSERTE(currentEightByte < CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+
+ lastEightByteOffset = helperPtr->fieldOffsets[ordinal];
+ accumulatedSizeForEightByte = 0;
+ }
+
+ accumulatedSizeForEightByte += helperPtr->fieldLayoutSizes[ordinal];
+
+ _ASSERTE(helperPtr->fieldClassifications[ordinal] != SystemVClassificationTypeMemory);
+
+ if (helperPtr->eightByteClassifications[currentEightByte] == helperPtr->fieldClassifications[ordinal])
+ {
+ // Do nothing. The eight-byte is already classified.
+ }
+ else if (helperPtr->eightByteClassifications[currentEightByte] == SystemVClassificationTypeNoClass)
+ {
+ helperPtr->eightByteClassifications[currentEightByte] = helperPtr->fieldClassifications[ordinal];
+ }
+ else if ((helperPtr->eightByteClassifications[currentEightByte] == SystemVClassificationTypeInteger) ||
+ (helperPtr->fieldClassifications[ordinal] == SystemVClassificationTypeInteger))
+ {
+ _ASSERTE(helperPtr->fieldClassifications[ordinal] != SystemVClassificationTypeIntegerReference);
+ helperPtr->eightByteClassifications[currentEightByte] = SystemVClassificationTypeInteger;
+ }
+ else if ((helperPtr->eightByteClassifications[currentEightByte] == SystemVClassificationTypeIntegerReference) ||
+ (helperPtr->fieldClassifications[ordinal] == SystemVClassificationTypeIntegerReference))
+ {
+ helperPtr->eightByteClassifications[currentEightByte] = SystemVClassificationTypeIntegerReference;
+ }
+ else
+ {
+ helperPtr->eightByteClassifications[currentEightByte] = SystemVClassificationTypeSSE;
+ }
+ }
+
+ helperPtr->eightByteCount = currentEightByte + 1;
+ helperPtr->eightByteSizes[currentEightByte] = accumulatedSizeForEightByte;
+ helperPtr->eightByteOffsets[currentEightByte] = lastEightByteOffset;
+ _ASSERTE(helperPtr->eightByteCount <= CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+
+#ifdef _DEBUG
+ LOG((LF_JIT, LL_EVERYTHING, " ----\n"));
+ LOG((LF_JIT, LL_EVERYTHING, " **** Number EightBytes: %d\n", helperPtr->eightByteCount));
+ for (unsigned i = 0; i < helperPtr->eightByteCount; i++)
+ {
+ LOG((LF_JIT, LL_EVERYTHING, " **** eightByte %d -- classType: %s, eightByteOffset: %d, eightByteSize: %d\n",
+ i, GetSystemVClassificationTypeName(helperPtr->eightByteClassifications[i]), helperPtr->eightByteOffsets[i], helperPtr->eightByteSizes[i]));
+ }
+#endif // _DEBUG
+ }
+
+ return true;
+}
+
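As a hedged illustration of the merging rules above (not taken from this change), here is how a simple sequential-layout struct would be classified into eightbytes:

    // Hypothetical example, assuming default sequential layout:
    //     struct S { double d; int a; int b; };        // 16 bytes total
    //
    // Per-field classification: d -> SSE (offset 0, size 8),
    //                           a -> Integer (offset 8, size 4),
    //                           b -> Integer (offset 12, size 4).
    //
    // Eightbyte 0 holds only 'd'             -> SystemVClassificationTypeSSE
    // Eightbyte 1 holds 'a' and 'b' (merged) -> SystemVClassificationTypeInteger
    //
    // So S would be passed in one SSE register and one general-purpose register.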
+// Returns 'true' if the struct is passed in registers, 'false' otherwise.
+bool MethodTable::ClassifyEightBytesForNativeStruct(SystemVStructRegisterPassingHelperPtr helperPtr, unsigned int nestingLevel, unsigned int startOffsetOfStruct)
+{
+ CONTRACTL
+ {
+ THROWS;
+ GC_TRIGGERS;
+ SO_TOLERANT;
+ MODE_ANY;
+ }
+ CONTRACTL_END;
+
+#ifdef DACCESS_COMPILE
+ // No register classification for this case.
+ return false;
+#else // DACCESS_COMPILE
+
+ if (!HasLayout())
+ {
+ return false;
+ }
+
+ const FieldMarshaler *pFieldMarshaler = GetLayoutInfo()->GetFieldMarshalers();
+ UINT numIntroducedFields = GetLayoutInfo()->GetNumCTMFields();
+
+ // No fields.
+ if (numIntroducedFields == 0)
+ {
+ return false;
+ }
+
+ // No struct register passing with explicit layout. There may be cases where a struct with explicit layout
+ // is still eligible for register struct passing, but it is hard to tell the real intent. Keep it simple and just
+ // unconditionally disable register struct passing for explicit layout.
+ if (GetClass()->HasExplicitFieldOffsetLayout())
+ {
+ LOG((LF_JIT, LL_EVERYTHING, "%*s**** ClassifyEightBytesForNativeStruct: struct %s has explicit layout; will not be enregistered\n",
+ nestingLevel * 5, "", this->GetDebugClassName()));
+ return false;
+ }
+#ifdef _DEBUG
+ LOG((LF_JIT, LL_EVERYTHING, "%*s**** Classify for native struct %s (%p), startOffset %d, total struct size %d\n",
+ nestingLevel * 5, "", this->GetDebugClassName(), this, startOffsetOfStruct, helperPtr->structSize));
+ int fieldNum = -1;
+#endif // _DEBUG
+
+ while (numIntroducedFields--)
+ {
+#ifdef _DEBUG
+ ++fieldNum;
+#endif // _DEBUG
+
+ FieldDesc *pField = pFieldMarshaler->GetFieldDesc();
+ CorElementType fieldType = pField->GetFieldType();
+
+ // Invalid field type.
+ if (fieldType == ELEMENT_TYPE_END)
+ {
+ return false;
+ }
+
+ DWORD fieldOffset = pFieldMarshaler->GetExternalOffset();
+ unsigned normalizedFieldOffset = fieldOffset + startOffsetOfStruct;
+
+ unsigned int fieldNativeSize = pFieldMarshaler->NativeSize();
+ if (fieldNativeSize > SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES)
+ {
+ // Pass on stack in this case.
+ return false;
+ }
+
+ _ASSERTE(fieldNativeSize != (unsigned int)-1);
+
+ // The field can't span past the end of the struct.
+ if ((normalizedFieldOffset + fieldNativeSize) > helperPtr->structSize)
+ {
+ _ASSERTE(false && "Invalid native struct size. The size of fields and overall size don't agree");
+ return false;
+ }
+
+ SystemVClassificationType fieldClassificationType = SystemVClassificationTypeUnknown;
+
+#ifdef _DEBUG
+ LPCUTF8 fieldName;
+ pField->GetName_NoThrow(&fieldName);
+#endif // _DEBUG
+
+ // Some NStruct Field Types have extra information and require special handling
+ NStructFieldType cls = pFieldMarshaler->GetNStructFieldType();
+ if (cls == NFT_FIXEDCHARARRAYANSI)
+ {
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ }
+ else if (cls == NFT_FIXEDARRAY)
+ {
+ VARTYPE vtElement = ((FieldMarshaler_FixedArray*)pFieldMarshaler)->GetElementVT();
+ switch (vtElement)
+ {
+ case VT_EMPTY:
+ case VT_NULL:
+ case VT_BOOL:
+ case VT_I1:
+ case VT_I2:
+ case VT_I4:
+ case VT_I8:
+ case VT_UI1:
+ case VT_UI2:
+ case VT_UI4:
+ case VT_UI8:
+ case VT_PTR:
+ case VT_INT:
+ case VT_UINT:
+ case VT_LPSTR:
+ case VT_LPWSTR:
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ break;
+ case VT_R4:
+ case VT_R8:
+ fieldClassificationType = SystemVClassificationTypeSSE;
+ break;
+ case VT_DECIMAL:
+ case VT_DATE:
+ case VT_BSTR:
+ case VT_UNKNOWN:
+ case VT_DISPATCH:
+ case VT_SAFEARRAY:
+ case VT_ERROR:
+ case VT_HRESULT:
+ case VT_CARRAY:
+ case VT_USERDEFINED:
+ case VT_RECORD:
+ case VT_FILETIME:
+ case VT_BLOB:
+ case VT_STREAM:
+ case VT_STORAGE:
+ case VT_STREAMED_OBJECT:
+ case VT_STORED_OBJECT:
+ case VT_BLOB_OBJECT:
+ case VT_CF:
+ case VT_CLSID:
+ default:
+ // Not supported.
+ return false;
+ }
+ }
+#ifdef FEATURE_COMINTEROP
+ else if (cls == NFT_INTERFACE)
+ {
+ // COMInterop not supported for CORECLR.
+ _ASSERTE(false && "COMInterop not supported for CORECLR.");
+ return false;
+ }
+#ifdef FEATURE_CLASSIC_COMINTEROP
+ else if (cls == NFT_SAFEARRAY)
+ {
+ // COMInterop not supported for CORECLR.
+ _ASSERTE(false && "COMInterop not supported for CORECLR.");
+ return false;
+ }
+#endif // FEATURE_CLASSIC_COMINTEROP
+#endif // FEATURE_COMINTEROP
+ else if (cls == NFT_NESTEDLAYOUTCLASS)
+ {
+ MethodTable* pFieldMT = ((FieldMarshaler_NestedLayoutClass*)pFieldMarshaler)->GetMethodTable();
+
+ bool inEmbeddedStructPrev = helperPtr->inEmbeddedStruct;
+ helperPtr->inEmbeddedStruct = true;
+ bool structRet = pFieldMT->ClassifyEightBytesForNativeStruct(helperPtr, nestingLevel + 1, normalizedFieldOffset);
+ helperPtr->inEmbeddedStruct = inEmbeddedStructPrev;
+
+ if (!structRet)
+ {
+ // If the nested struct says not to enregister, there's no need to continue analyzing at this level. Just return "do not enregister".
+ return false;
+ }
+
+ continue;
+ }
+ else if (cls == NFT_NESTEDVALUECLASS)
+ {
+ MethodTable* pFieldMT = ((FieldMarshaler_NestedValueClass*)pFieldMarshaler)->GetMethodTable();
+
+ bool inEmbeddedStructPrev = helperPtr->inEmbeddedStruct;
+ helperPtr->inEmbeddedStruct = true;
+ bool structRet = pFieldMT->ClassifyEightBytesForNativeStruct(helperPtr, nestingLevel + 1, normalizedFieldOffset);
+ helperPtr->inEmbeddedStruct = inEmbeddedStructPrev;
+
+ if (!structRet)
+ {
+ // If the nested struct says not to enregister, there's no need to continue analyzing at this level. Just return "do not enregister".
+ return false;
+ }
+
+ continue;
+ }
+ else if (cls == NFT_COPY1)
+ {
+ // The following CorElementTypes are the only ones handled with FieldMarshaler_Copy1.
+ switch (fieldType)
+ {
+ case ELEMENT_TYPE_I1:
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ break;
+
+ case ELEMENT_TYPE_U1:
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ break;
+
+ default:
+ // Invalid entry.
+ return false; // Pass on stack.
+ }
+ }
+ else if (cls == NFT_COPY2)
+ {
+ // The following CorElementTypes are the only ones handled with FieldMarshaler_Copy2.
+ switch (fieldType)
+ {
+ case ELEMENT_TYPE_CHAR:
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ break;
+
+ case ELEMENT_TYPE_I2:
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ break;
+
+ case ELEMENT_TYPE_U2:
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ break;
+
+ default:
+ // Invalid entry.
+ return false; // Pass on stack.
+ }
+ }
+ else if (cls == NFT_COPY4)
+ {
+ // The following CorElementTypes are the only ones handled with FieldMarshaler_Copy4.
+ switch (fieldType)
+ {
+ // At this point, ELEMENT_TYPE_I must be 4 bytes long. Same for ELEMENT_TYPE_U.
+ case ELEMENT_TYPE_I:
+ case ELEMENT_TYPE_I4:
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ break;
+
+ case ELEMENT_TYPE_U:
+ case ELEMENT_TYPE_U4:
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ break;
+
+ case ELEMENT_TYPE_R4:
+ fieldClassificationType = SystemVClassificationTypeSSE;
+ break;
+
+ case ELEMENT_TYPE_PTR:
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ break;
+
+ default:
+ // Invalid entry.
+ return false; // Pass on stack.
+ }
+ }
+ else if (cls == NFT_COPY8)
+ {
+ // The following CorElementTypes are the only ones handled with FieldMarshaler_Copy8.
+ switch (fieldType)
+ {
+ // At this point, ELEMENT_TYPE_I must be 8 bytes long. Same for ELEMENT_TYPE_U.
+ case ELEMENT_TYPE_I:
+ case ELEMENT_TYPE_I8:
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ break;
+
+ case ELEMENT_TYPE_U:
+ case ELEMENT_TYPE_U8:
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ break;
+
+ case ELEMENT_TYPE_R8:
+ fieldClassificationType = SystemVClassificationTypeSSE;
+ break;
+
+ case ELEMENT_TYPE_PTR:
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ break;
+
+ default:
+ // Invalid entry.
+ return false; // Pass on stack.
+ }
+ }
+ else if (cls == NFT_FIXEDSTRINGUNI)
+ {
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ }
+ else if (cls == NFT_FIXEDSTRINGANSI)
+ {
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ }
+ else
+ {
+ // All other NStruct Field Types which do not require special handling.
+ switch (cls)
+ {
+#ifdef FEATURE_COMINTEROP
+ case NFT_BSTR:
+ // COMInterop not supported for CORECLR.
+ _ASSERTE(false && "COMInterop not supported for CORECLR.");
+ return false;
+ case NFT_HSTRING:
+ // COMInterop not supported for CORECLR.
+ _ASSERTE(false && "COMInterop not supported for CORECLR.");
+ return false;
+#endif // FEATURE_COMINTEROP
+ case NFT_STRINGUNI:
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ break;
+ case NFT_STRINGANSI:
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ break;
+ case NFT_DELEGATE:
+ return false;
+#ifdef FEATURE_COMINTEROP
+ case NFT_VARIANT:
+ _ASSERTE(false && "COMInterop not supported for CORECLR.");
+ return false;
+#endif // FEATURE_COMINTEROP
+ case NFT_ANSICHAR:
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ break;
+ case NFT_WINBOOL:
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ break;
+ case NFT_CBOOL:
+ fieldClassificationType = SystemVClassificationTypeInteger;
+ break;
+ case NFT_DECIMAL:
+ return false;
+ case NFT_DATE:
+ return false;
+#ifdef FEATURE_COMINTEROP
+ case NFT_VARIANTBOOL:
+ _ASSERTE(false && "COMInterop not supported for CORECLR.");
+ return false;
+ case NFT_CURRENCY:
+ _ASSERTE(false && "COMInterop not supported for CORECLR.");
+ return false;
+#endif // FEATURE_COMINTEROP
+ case NFT_ILLEGAL:
+ return false;
+ case NFT_SAFEHANDLE:
+ return false;
+ case NFT_CRITICALHANDLE:
+ return false;
+ default:
+ return false;
+ }
+ }
+
+ if ((normalizedFieldOffset % fieldNativeSize) != 0)
+ {
+ // The spec requires that fields of a struct passed (even partially) in registers be at
+ // their natural alignment; otherwise the struct must be passed on the stack.
+
+ LOG((LF_JIT, LL_EVERYTHING, " %*sxxxx Native Field %d %s: offset %d (normalized %d), native size %d not at natural alignment; not enregistering struct\n",
+ nestingLevel * 5, "", fieldNum, fieldName, fieldOffset, normalizedFieldOffset, fieldNativeSize));
+ return false;
+ }
+
+ if ((int)normalizedFieldOffset <= helperPtr->largestFieldOffset)
+ {
+ // Find the field corresponding to this offset and update the size if needed.
+ // We assume that either it matches the offset of a previously seen field, or
+ // it is an out-of-order offset (the VM does give us structs in non-increasing
+ // offset order sometimes) that doesn't overlap any other field.
+
+ int i;
+ for (i = helperPtr->currentUniqueOffsetField - 1; i >= 0; i--)
+ {
+ if (helperPtr->fieldOffsets[i] == normalizedFieldOffset)
+ {
+ if (fieldNativeSize > helperPtr->fieldSizes[i])
+ {
+ helperPtr->fieldSizes[i] = fieldNativeSize;
+ }
+
+ helperPtr->fieldClassifications[i] = ReClassifyField(helperPtr->fieldClassifications[i], fieldClassificationType);
+
+ LOG((LF_JIT, LL_EVERYTHING, " %*sxxxx Native Field %d %s: offset %d (normalized %d), native size %d, union with uniqueOffsetField %d, field type classification %s, reclassified field to %s\n",
+ nestingLevel * 5, "", fieldNum, fieldName, fieldOffset, normalizedFieldOffset, fieldNativeSize, i,
+ GetSystemVClassificationTypeName(fieldClassificationType),
+ GetSystemVClassificationTypeName(helperPtr->fieldClassifications[i])));
+
+ break;
+ }
+ // Make sure the field doesn't start in the middle of another field.
+ _ASSERTE((normalizedFieldOffset < helperPtr->fieldOffsets[i]) ||
+ (normalizedFieldOffset >= helperPtr->fieldOffsets[i] + helperPtr->fieldSizes[i]));
+ }
+
+ if (i >= 0)
+ {
+ // The proper size of the union set of fields has been set above; continue to the next field.
+ continue;
+ }
+ }
+ else
+ {
+ helperPtr->largestFieldOffset = (int)normalizedFieldOffset;
+ }
+
+ // Set the data for a new field.
+
+ // The new field classification must not have been initialized yet.
+ _ASSERTE(helperPtr->fieldClassifications[helperPtr->currentUniqueOffsetField] == SystemVClassificationTypeNoClass);
+
+ // There are only a few field classifications that are allowed.
+ _ASSERTE((fieldClassificationType == SystemVClassificationTypeInteger) ||
+ (fieldClassificationType == SystemVClassificationTypeIntegerReference) ||
+ (fieldClassificationType == SystemVClassificationTypeSSE));
+
+ helperPtr->fieldClassifications[helperPtr->currentUniqueOffsetField] = fieldClassificationType;
+ helperPtr->fieldSizes[helperPtr->currentUniqueOffsetField] = fieldNativeSize;
+ helperPtr->fieldOffsets[helperPtr->currentUniqueOffsetField] = normalizedFieldOffset;
+
+ LOG((LF_JIT, LL_EVERYTHING, " %*s**** Native Field %d %s: offset %d (normalized %d), size %d, currentUniqueOffsetField %d, field type classification %s, chosen field classification %s\n",
+ nestingLevel * 5, "", fieldNum, fieldName, fieldOffset, normalizedFieldOffset, fieldNativeSize, helperPtr->currentUniqueOffsetField,
+ GetSystemVClassificationTypeName(fieldClassificationType),
+ GetSystemVClassificationTypeName(helperPtr->fieldClassifications[helperPtr->currentUniqueOffsetField])));
+
+ helperPtr->currentUniqueOffsetField++;
+ ((BYTE*&)pFieldMarshaler) += MAXFIELDMARSHALERSIZE;
+ _ASSERTE(helperPtr->currentUniqueOffsetField < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT);
+
+ } // end per-field for loop
+
+ if (!helperPtr->inEmbeddedStruct)
+ {
+ _ASSERTE(nestingLevel == 0);
+
+ // We're at the top level of the recursion, and we're done looking at the fields.
+ // Now sort the fields by offset and set the output data.
+
+ int sortedFieldOrder[SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT];
+ for (unsigned i = 0; i < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT; i++)
+ {
+ sortedFieldOrder[i] = -1;
+ }
+
+ for (unsigned i = 0; i < helperPtr->currentUniqueOffsetField; i++)
+ {
+ _ASSERTE(helperPtr->fieldOffsets[i] < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT);
+ _ASSERTE(sortedFieldOrder[helperPtr->fieldOffsets[i]] == -1); // we haven't seen this field offset yet.
+ sortedFieldOrder[helperPtr->fieldOffsets[i]] = i;
+ }
+
+ // Set the layoutSizes (includes holes from alignment of the fields.)
+ int lastField = -1;
+ for (unsigned i = 0; i < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT; i++)
+ {
+ int ordinal = sortedFieldOrder[i];
+ if (ordinal == -1)
+ {
+ continue;
+ }
+
+ if (lastField == -1)
+ {
+ lastField = ordinal;
+ continue;
+ }
+
+ helperPtr->fieldLayoutSizes[lastField] = helperPtr->fieldOffsets[ordinal] - helperPtr->fieldOffsets[lastField];
+
+ lastField = ordinal;
+ }
+ // Now the last field
+ _ASSERTE(lastField != -1); // if lastField==-1, then the struct has no fields!
+ helperPtr->fieldLayoutSizes[lastField] = helperPtr->structSize - helperPtr->fieldOffsets[lastField];
+
+ // Calculate the eightbytes and their types.
+ unsigned int accumulatedSizeForEightByte = 0;
+ unsigned int lastEightByteOffset = 0;
+ unsigned int currentEightByte = 0;
+
+ for (unsigned i = 0; i < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT; i++)
+ {
+ int ordinal = sortedFieldOrder[i];
+ if (ordinal == -1)
+ {
+ continue;
+ }
+
+ if ((accumulatedSizeForEightByte + helperPtr->fieldLayoutSizes[ordinal]) > SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES)
+ {
+ // Save data for this eightbyte.
+ helperPtr->eightByteSizes[currentEightByte] = accumulatedSizeForEightByte;
+ helperPtr->eightByteOffsets[currentEightByte] = lastEightByteOffset;
+
+ // Set up for next eightbyte.
+ currentEightByte++;
+ _ASSERTE(currentEightByte < CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+
+ lastEightByteOffset = helperPtr->fieldOffsets[ordinal];
+ accumulatedSizeForEightByte = 0;
+ }
+
+ accumulatedSizeForEightByte += helperPtr->fieldLayoutSizes[ordinal];
+
+ _ASSERTE(helperPtr->fieldClassifications[ordinal] != SystemVClassificationTypeMemory);
+
+ if (helperPtr->eightByteClassifications[currentEightByte] == helperPtr->fieldClassifications[ordinal])
+ {
+ // Do nothing. The eight-byte is already classified.
+ }
+ else if (helperPtr->eightByteClassifications[currentEightByte] == SystemVClassificationTypeNoClass)
+ {
+ helperPtr->eightByteClassifications[currentEightByte] = helperPtr->fieldClassifications[ordinal];
+ }
+ else if ((helperPtr->eightByteClassifications[currentEightByte] == SystemVClassificationTypeInteger) ||
+ (helperPtr->fieldClassifications[ordinal] == SystemVClassificationTypeInteger))
+ {
+ _ASSERTE(helperPtr->fieldClassifications[ordinal] != SystemVClassificationTypeIntegerReference);
+ helperPtr->eightByteClassifications[currentEightByte] = SystemVClassificationTypeInteger;
+ }
+ else if ((helperPtr->eightByteClassifications[currentEightByte] == SystemVClassificationTypeIntegerReference) ||
+ (helperPtr->fieldClassifications[ordinal] == SystemVClassificationTypeIntegerReference))
+ {
+ helperPtr->eightByteClassifications[currentEightByte] = SystemVClassificationTypeIntegerReference;
+ }
+ else
+ {
+ helperPtr->eightByteClassifications[currentEightByte] = SystemVClassificationTypeSSE;
+ }
+ }
+
+ helperPtr->eightByteCount = currentEightByte + 1;
+ helperPtr->eightByteSizes[currentEightByte] = accumulatedSizeForEightByte;
+ helperPtr->eightByteOffsets[currentEightByte] = lastEightByteOffset;
+ _ASSERTE(helperPtr->eightByteCount <= CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+
+#ifdef _DEBUG
+ LOG((LF_JIT, LL_EVERYTHING, " ----\n"));
+ LOG((LF_JIT, LL_EVERYTHING, " **** Number EightBytes: %d\n", helperPtr->eightByteCount));
+ for (unsigned i = 0; i < helperPtr->eightByteCount; i++)
+ {
+ LOG((LF_JIT, LL_EVERYTHING, " **** eightByte %d -- classType: %s, eightByteOffset: %d, eightByteSize: %d\n",
+ i, GetSystemVClassificationTypeName(helperPtr->eightByteClassifications[i]), helperPtr->eightByteOffsets[i], helperPtr->eightByteSizes[i]));
+ }
+#endif // _DEBUG
+ }
+
+ return true;
+#endif // DACCESS_COMPILE
+}
+
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF)
+
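For illustration only (not from this change): the native classification follows the marshaled layout, so the same managed type can classify differently here than it does in ClassifyEightBytes. The attribute usage below is an assumed example:

    // [StructLayout(LayoutKind.Sequential)]
    // struct P { [MarshalAs(UnmanagedType.ByValArray, SizeConst = 2)] int[] ids; double x; }
    //
    // Managed layout: 'ids' is an array reference -> eightbyte 0 is IntegerReference.
    // Native layout:  'ids' marshals to an inline int[2] blob (NFT_FIXEDARRAY, VT_I4 -> Integer),
    //                 so eightbyte 0 is Integer and eightbyte 1 (holding 'x') is SSE.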
#if !defined(DACCESS_COMPILE) && !defined(CROSSGEN_COMPILE)
//==========================================================================================
void MethodTable::AllocateRegularStaticBoxes()
@@ -2643,7 +3556,7 @@ void MethodTable::DoRunClassInitThrowing()
}
description = ".cctor lock";
-#if _DEBUG
+#ifdef _DEBUG
description = GetDebugClassName();
#endif
diff --git a/src/vm/methodtable.h b/src/vm/methodtable.h
index 8e6a59b6b3..e4aecf3140 100644
--- a/src/vm/methodtable.h
+++ b/src/vm/methodtable.h
@@ -53,7 +53,6 @@ class FCallMethodDesc;
class EEClass;
class EnCFieldDesc;
class FieldDesc;
-class FieldMarshaler;
class JIT_TrialAlloc;
struct LayoutRawFieldInfo;
class MetaSig;
@@ -80,6 +79,7 @@ class ComCallWrapperTemplate;
#ifdef FEATURE_COMINTEROP_UNMANAGED_ACTIVATION
class ClassFactoryBase;
#endif // FEATURE_COMINTEROP_UNMANAGED_ACTIVATION
+class ArgDestination;
//============================================================================
// This is the in-memory structure of a class and it will evolve.
@@ -625,6 +625,112 @@ public:
typedef DPTR(MethodTableWriteableData) PTR_MethodTableWriteableData;
typedef DPTR(MethodTableWriteableData const) PTR_Const_MethodTableWriteableData;
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+inline
+SystemVClassificationType CorInfoType2UnixAmd64Classification(CorElementType eeType)
+{
+ static const SystemVClassificationType toSystemVAmd64ClassificationTypeMap[] = {
+ SystemVClassificationTypeUnknown, // ELEMENT_TYPE_END
+ SystemVClassificationTypeUnknown, // ELEMENT_TYPE_VOID
+ SystemVClassificationTypeInteger, // ELEMENT_TYPE_BOOLEAN
+ SystemVClassificationTypeInteger, // ELEMENT_TYPE_CHAR
+ SystemVClassificationTypeInteger, // ELEMENT_TYPE_I1
+ SystemVClassificationTypeInteger, // ELEMENT_TYPE_U1
+ SystemVClassificationTypeInteger, // ELEMENT_TYPE_I2
+ SystemVClassificationTypeInteger, // ELEMENT_TYPE_U2
+ SystemVClassificationTypeInteger, // ELEMENT_TYPE_I4
+ SystemVClassificationTypeInteger, // ELEMENT_TYPE_U4
+ SystemVClassificationTypeInteger, // ELEMENT_TYPE_I8
+ SystemVClassificationTypeInteger, // ELEMENT_TYPE_U8
+ SystemVClassificationTypeSSE, // ELEMENT_TYPE_R4
+ SystemVClassificationTypeSSE, // ELEMENT_TYPE_R8
+ SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_STRING
+ SystemVClassificationTypeInteger, // ELEMENT_TYPE_PTR
+ SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_BYREF
+ SystemVClassificationTypeStruct, // ELEMENT_TYPE_VALUETYPE
+ SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_CLASS
+ SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_VAR - (type variable)
+ SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_ARRAY
+ SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_GENERICINST
+ SystemVClassificationTypeStruct, // ELEMENT_TYPE_TYPEDBYREF
+ SystemVClassificationTypeUnknown, // ELEMENT_TYPE_VALUEARRAY_UNSUPPORTED
+ SystemVClassificationTypeInteger, // ELEMENT_TYPE_I
+ SystemVClassificationTypeInteger, // ELEMENT_TYPE_U
+ SystemVClassificationTypeUnknown, // ELEMENT_TYPE_R_UNSUPPORTED
+
+ // put the correct type when we know our implementation
+ SystemVClassificationTypeInteger, // ELEMENT_TYPE_FNPTR
+ SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_OBJECT
+ SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_SZARRAY
+ SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_MVAR
+
+ SystemVClassificationTypeUnknown, // ELEMENT_TYPE_CMOD_REQD
+ SystemVClassificationTypeUnknown, // ELEMENT_TYPE_CMOD_OPT
+ SystemVClassificationTypeUnknown, // ELEMENT_TYPE_INTERNAL
+ };
+
+ _ASSERTE(sizeof(toSystemVAmd64ClassificationTypeMap) == ELEMENT_TYPE_MAX);
+ _ASSERTE(eeType < (CorElementType) sizeof(toSystemVAmd64ClassificationTypeMap));
+ // spot check of the map
+ _ASSERTE((SystemVClassificationType)toSystemVAmd64ClassificationTypeMap[ELEMENT_TYPE_I4] == SystemVClassificationTypeInteger);
+ _ASSERTE((SystemVClassificationType)toSystemVAmd64ClassificationTypeMap[ELEMENT_TYPE_PTR] == SystemVClassificationTypeInteger);
+ _ASSERTE((SystemVClassificationType)toSystemVAmd64ClassificationTypeMap[ELEMENT_TYPE_TYPEDBYREF] == SystemVClassificationTypeStruct);
+
+ return (((int)eeType) < ELEMENT_TYPE_MAX) ? (toSystemVAmd64ClassificationTypeMap[eeType]) : SystemVClassificationTypeUnknown;
+};
+
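A minimal usage sketch of the mapping above (illustrative only; the element types are standard CorElementType values):

    SystemVClassificationType intClass = CorInfoType2UnixAmd64Classification(ELEMENT_TYPE_I4);    // Integer
    SystemVClassificationType fpClass  = CorInfoType2UnixAmd64Classification(ELEMENT_TYPE_R8);    // SSE
    SystemVClassificationType refClass = CorInfoType2UnixAmd64Classification(ELEMENT_TYPE_CLASS); // IntegerReference

    // Value types map to SystemVClassificationTypeStruct and must then be broken down
    // field-by-field with MethodTable::ClassifyEightBytes.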
+#define SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES 8 // Size of an eightbyte in bytes.
+#define SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT 16 // Maximum number of fields in struct passed in registers
+
+struct SystemVStructRegisterPassingHelper
+{
+ SystemVStructRegisterPassingHelper(unsigned int totalStructSize) :
+ structSize(totalStructSize),
+ eightByteCount(0),
+ inEmbeddedStruct(false),
+ currentUniqueOffsetField(0),
+ largestFieldOffset(-1)
+ {
+ for (int i = 0; i < CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS; i++)
+ {
+ eightByteClassifications[i] = SystemVClassificationTypeNoClass;
+ eightByteSizes[i] = 0;
+ eightByteOffsets[i] = 0;
+ }
+
+ // Initialize the work arrays
+ for (int i = 0; i < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT; i++)
+ {
+ fieldClassifications[i] = SystemVClassificationTypeNoClass;
+ fieldSizes[i] = 0;
+ fieldLayoutSizes[i] = 0;
+ fieldOffsets[i] = 0;
+ }
+ }
+
+ // Input state.
+ unsigned int structSize;
+
+ // These fields are the output; these are what is computed by the classification algorithm.
+ unsigned int eightByteCount;
+ SystemVClassificationType eightByteClassifications[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS];
+ unsigned int eightByteSizes[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS];
+ unsigned int eightByteOffsets[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS];
+
+ // Helper members to track state.
+ bool inEmbeddedStruct;
+ unsigned int currentUniqueOffsetField; // A virtual field that could encompass many overlapping fields.
+ int largestFieldOffset;
+ SystemVClassificationType fieldClassifications[SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT];
+ unsigned int fieldSizes[SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT];
+ unsigned int fieldLayoutSizes[SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT];
+ unsigned int fieldOffsets[SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT];
+};
+
+typedef DPTR(SystemVStructRegisterPassingHelper) SystemVStructRegisterPassingHelperPtr;
+
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+
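A hedged sketch of how the helper is driven; it mirrors MethodTableBuilder::SystemVAmd64CheckForPassStructInRegister later in this change (pMT and structSizeInBytes are placeholders):

    SystemVStructRegisterPassingHelper helper(structSizeInBytes);
    if (pMT->ClassifyEightBytes(&helper, 0 /* nestingLevel */, 0 /* startOffsetOfStruct */))
    {
        // helper.eightByteCount, eightByteClassifications[], eightByteSizes[] and
        // eightByteOffsets[] now describe how the struct is split across registers.
    }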
//===============================================================================================
//
// GC data appears before the beginning of the MethodTable
@@ -941,6 +1047,16 @@ public:
// during object construction.
void CheckRunClassInitAsIfConstructingThrowing();
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF)
+ // Helper function for ClassifyEightBytes
+ static SystemVClassificationType ReClassifyField(SystemVClassificationType originalClassification, SystemVClassificationType newFieldClassification);
+
+ // Builds the internal data structures and classifies struct eightbytes for Amd System V calling convention.
+ bool ClassifyEightBytes(SystemVStructRegisterPassingHelperPtr helperPtr, unsigned int nestingLevel, unsigned int startOffsetOfStruct);
+ bool ClassifyEightBytesForNativeStruct(SystemVStructRegisterPassingHelperPtr helperPtr, unsigned int nestingLevel, unsigned int startOffsetOfStruct);
+
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF)
+
// Copy m_dwFlags from another method table
void CopyFlags(MethodTable * pOldMT)
{
@@ -1929,7 +2045,7 @@ public:
SetFlag(enum_flag_HasPreciseInitCctors);
}
-#ifdef FEATURE_HFA
+#if defined(FEATURE_HFA)
inline bool IsHFA()
{
LIMITED_METHOD_CONTRACT;
@@ -1941,6 +2057,23 @@ public:
LIMITED_METHOD_CONTRACT;
SetFlag(enum_flag_IsHFA);
}
+#endif // FEATURE_HFA
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF)
+ inline bool IsRegPassedStruct()
+ {
+ LIMITED_METHOD_CONTRACT;
+ return !!GetFlag(enum_flag_IsRegStructPassed);
+ }
+
+ inline void SetRegPassedStruct()
+ {
+ LIMITED_METHOD_CONTRACT;
+ SetFlag(enum_flag_IsRegStructPassed);
+ }
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF)
+
+#ifdef FEATURE_HFA
CorElementType GetHFAType();
@@ -2642,6 +2775,7 @@ public:
OBJECTREF FastBox(void** data);
#ifndef DACCESS_COMPILE
BOOL UnBoxInto(void *dest, OBJECTREF src);
+ BOOL UnBoxIntoArg(ArgDestination *argDest, OBJECTREF src);
void UnBoxIntoUnchecked(void *dest, OBJECTREF src);
#endif
@@ -3775,7 +3909,19 @@ private:
enum_flag_HasDefaultCtor = 0x00000200,
enum_flag_HasPreciseInitCctors = 0x00000400, // Do we need to run class constructors at allocation time? (Not perf important, could be moved to EEClass
+#if defined(FEATURE_HFA)
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF)
+#error Can't define both FEATURE_HFA and FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+#endif
enum_flag_IsHFA = 0x00000800, // This type is an HFA (Homogenous Floating-point Aggregate)
+#endif // FEATURE_HFA
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF)
+#if defined(FEATURE_HFA)
+#error Can't define both FEATURE_HFA and FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+#endif
+ enum_flag_IsRegStructPassed = 0x00000800, // This type is a System V register passed struct.
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
// In a perfect world we would fill these flags using other flags that we already have
// which have a constant value for something which has a component size.
diff --git a/src/vm/methodtable.inl b/src/vm/methodtable.inl
index a993556db6..aa07eea9d1 100644
--- a/src/vm/methodtable.inl
+++ b/src/vm/methodtable.inl
@@ -1716,6 +1716,32 @@ inline BOOL MethodTable::UnBoxInto(void *dest, OBJECTREF src)
}
//==========================================================================================
+// unbox src into argument, making sure src is of the correct type.
+
+inline BOOL MethodTable::UnBoxIntoArg(ArgDestination *argDest, OBJECTREF src)
+{
+ CONTRACTL
+ {
+ NOTHROW;
+ GC_NOTRIGGER;
+ SO_TOLERANT;
+ MODE_COOPERATIVE;
+ }
+ CONTRACTL_END;
+
+ if (Nullable::IsNullableType(TypeHandle(this)))
+ return Nullable::UnBoxIntoArgNoGC(argDest, src, this);
+ else
+ {
+ if (src == NULL || src->GetMethodTable() != this)
+ return FALSE;
+
+ CopyValueClassArg(argDest, src->UnBox(), this, src->GetAppDomain(), 0);
+ }
+ return TRUE;
+}
+
+//==========================================================================================
// unbox src into dest, No checks are done
inline void MethodTable::UnBoxIntoUnchecked(void *dest, OBJECTREF src)
diff --git a/src/vm/methodtablebuilder.cpp b/src/vm/methodtablebuilder.cpp
index e1d2dbb2e5..0e3cb45675 100644
--- a/src/vm/methodtablebuilder.cpp
+++ b/src/vm/methodtablebuilder.cpp
@@ -1897,8 +1897,23 @@ MethodTableBuilder::BuildMethodTableThrowing(
#ifdef FEATURE_HFA
CheckForHFA(pByValueClassCache);
#endif
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+#ifdef FEATURE_HFA
+#error Can't have FEATURE_HFA and FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF defined at the same time.
+#endif // FEATURE_HFA
+ SystemVAmd64CheckForPassStructInRegister();
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
}
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+#ifdef FEATURE_HFA
+#error Can't have FEATURE_HFA and FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF defined at the same time.
+#endif // FEATURE_HFA
+ if (HasLayout())
+ {
+ SystemVAmd64CheckForPassNativeStructInRegister();
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
#ifdef FEATURE_HFA
if (HasLayout())
{
@@ -8429,6 +8444,93 @@ DWORD MethodTableBuilder::GetFieldSize(FieldDesc *pFD)
return (1 << (DWORD)(DWORD_PTR&)(pFD->m_pMTOfEnclosingClass));
}
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+// checks whether the struct is enregisterable.
+void MethodTableBuilder::SystemVAmd64CheckForPassStructInRegister()
+{
+ STANDARD_VM_CONTRACT;
+
+ // This method should be called for valuetypes only
+ _ASSERTE(IsValueClass());
+
+ TypeHandle th(GetHalfBakedMethodTable());
+
+ if (th.IsTypeDesc())
+ {
+ // Not an enregisterable managed structure.
+ return;
+ }
+
+ DWORD totalStructSize = bmtFP->NumInstanceFieldBytes;
+
+ // If the total number of bytes for the fields is bigger than CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS,
+ // the struct is passed on the stack.
+ if (totalStructSize > CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS)
+ {
+ LOG((LF_JIT, LL_EVERYTHING, "**** SystemVAmd64CheckForPassStructInRegister: struct %s is too big to pass in registers (%d bytes)\n",
+ this->GetDebugClassName(), totalStructSize));
+ return;
+ }
+
+ // Iterate through the fields and make sure they meet requirements to pass in registers
+ SystemVStructRegisterPassingHelper helper((unsigned int)totalStructSize);
+
+ if (GetHalfBakedMethodTable()->ClassifyEightBytes(&helper, 0, 0))
+ {
+ // All the above tests passed. It's a register-passed struct!
+ GetHalfBakedMethodTable()->SetRegPassedStruct();
+
+ StoreEightByteClassification(&helper);
+ }
+}
+
+// checks whether the struct is enregisterable.
+void MethodTableBuilder::SystemVAmd64CheckForPassNativeStructInRegister()
+{
+ STANDARD_VM_CONTRACT;
+ DWORD totalStructSize = 0;
+
+ // If not a native value type, return.
+ if (!IsValueClass())
+ {
+ return;
+ }
+
+ totalStructSize = GetLayoutInfo()->GetNativeSize();
+
+ // If the total number of bytes for the fields is bigger than CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS,
+ // the struct is passed on the stack.
+ if (totalStructSize > CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS)
+ {
+ LOG((LF_JIT, LL_EVERYTHING, "**** SystemVAmd64CheckForPassNativeStructInRegister: struct %s is too big to pass in registers (%d bytes)\n",
+ this->GetDebugClassName(), totalStructSize));
+ return;
+ }
+
+ _ASSERTE(HasLayout());
+
+ // Classify the native layout for this struct.
+
+ // Iterate through the fields and make sure they meet requirements to pass in registers
+ SystemVStructRegisterPassingHelper helper((unsigned int)totalStructSize);
+ if (GetHalfBakedMethodTable()->ClassifyEightBytesForNativeStruct(&helper, 0, 0))
+ {
+ GetLayoutInfo()->SetNativeStructPassedInRegisters();
+ }
+}
+
+// Store the eightbyte classification into the EEClass
+void MethodTableBuilder::StoreEightByteClassification(SystemVStructRegisterPassingHelper* helper)
+{
+ EEClass* eeClass = GetHalfBakedMethodTable()->GetClass();
+ LoaderAllocator* pAllocator = MethodTableBuilder::GetLoaderAllocator();
+ AllocMemTracker* pamTracker = MethodTableBuilder::GetMemTracker();
+ EnsureOptionalFieldsAreAllocated(eeClass, pamTracker, pAllocator->GetLowFrequencyHeap());
+ eeClass->SetEightByteClassification(helper->eightByteCount, helper->eightByteClassifications, helper->eightByteSizes);
+}
+
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+
#ifdef FEATURE_HFA
//---------------------------------------------------------------------------------------
//
diff --git a/src/vm/methodtablebuilder.h b/src/vm/methodtablebuilder.h
index bc543c1bf8..10ba278535 100644
--- a/src/vm/methodtablebuilder.h
+++ b/src/vm/methodtablebuilder.h
@@ -2980,6 +2980,15 @@ private:
VOID CheckForNativeHFA();
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+ // checks whether the struct is enregisterable.
+ void SystemVAmd64CheckForPassStructInRegister();
+ void SystemVAmd64CheckForPassNativeStructInRegister();
+ // Store the eightbyte classification into the EEClass
+ void StoreEightByteClassification(SystemVStructRegisterPassingHelper* helper);
+
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+
// this accesses the field size which is temporarily stored in m_pMTOfEnclosingClass
// during class loading. Don't use any other time
DWORD GetFieldSize(FieldDesc *pFD);
diff --git a/src/vm/object.cpp b/src/vm/object.cpp
index 3b07a12543..25a7109905 100644
--- a/src/vm/object.cpp
+++ b/src/vm/object.cpp
@@ -24,6 +24,7 @@
#endif
#include "field.h"
#include "gcscan.h"
+#include "argdestination.h"
#ifdef FEATURE_COMPRESSEDSTACK
void* CompressedStackObject::GetUnmanagedCompressedStack()
@@ -1498,6 +1499,31 @@ void CopyValueClassChecked(void* dest, void* src, MethodTable *pMT, AppDomain *p
EX_END_CATCH(SwallowAllExceptions);
CopyValueClassUnchecked(dest,src,pMT);
}
+
+// Copy value class into the argument specified by the argDest, performing an appdomain check first.
+// The destOffset is nonzero when copying values into Nullable<T>; it is the offset
+// of the T value inside the Nullable<T>.
+void CopyValueClassArgChecked(ArgDestination *argDest, void* src, MethodTable *pMT, AppDomain *pDomain, int destOffset)
+{
+ STATIC_CONTRACT_DEBUG_ONLY;
+ STATIC_CONTRACT_NOTHROW;
+ STATIC_CONTRACT_GC_NOTRIGGER;
+ STATIC_CONTRACT_FORBID_FAULT;
+ STATIC_CONTRACT_MODE_COOPERATIVE;
+
+ DEBUG_ONLY_FUNCTION;
+
+ FAULT_NOT_FATAL();
+ EX_TRY
+ {
+ Object::AssignValueTypeAppDomain(pMT, src, pDomain);
+ }
+ EX_CATCH
+ {
+ }
+ EX_END_CATCH(SwallowAllExceptions);
+ CopyValueClassArgUnchecked(argDest, src, pMT, destOffset);
+}
#endif
void STDCALL CopyValueClassUnchecked(void* dest, void* src, MethodTable *pMT)
@@ -1563,6 +1589,51 @@ void STDCALL CopyValueClassUnchecked(void* dest, void* src, MethodTable *pMT)
}
}
+// Copy value class into the argument specified by the argDest.
+// The destOffset is nonzero when copying values into Nullable<T>; it is the offset
+// of the T value inside the Nullable<T>.
+void STDCALL CopyValueClassArgUnchecked(ArgDestination *argDest, void* src, MethodTable *pMT, int destOffset)
+{
+ STATIC_CONTRACT_NOTHROW;
+ STATIC_CONTRACT_GC_NOTRIGGER;
+ STATIC_CONTRACT_FORBID_FAULT;
+ STATIC_CONTRACT_MODE_COOPERATIVE;
+
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
+ if (argDest->IsStructPassedInRegs())
+ {
+ argDest->CopyStructToRegisters(src, pMT->GetNumInstanceFieldBytes(), destOffset);
+ return;
+ }
+
+#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // destOffset is only valid for Nullable<T> passed in registers
+ _ASSERTE(destOffset == 0);
+
+ CopyValueClassUnchecked(argDest->GetDestinationAddress(), src, pMT);
+}
+
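A sketch of the intended call pattern, inferred from the RuntimeMethodHandle::InvokeMethod changes later in this diff; argit, pTransitionBlock, ofs, pSrcValue, pMT and pDomain are placeholders:

    ArgDestination argDest(pTransitionBlock, ofs, argit.GetArgLocDescForStructInRegs());

    // For a struct split across registers this scatters the value into the general-purpose
    // and SSE register areas of the transition block; otherwise it degenerates to a plain
    // CopyValueClassUnchecked into the stack/register slot.
    CopyValueClassArg(&argDest, pSrcValue, pMT, pDomain, 0 /* destOffset */);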
+// Initialize the value class argument to zeros
+void InitValueClassArg(ArgDestination *argDest, MethodTable *pMT)
+{
+ STATIC_CONTRACT_NOTHROW;
+ STATIC_CONTRACT_GC_NOTRIGGER;
+ STATIC_CONTRACT_FORBID_FAULT;
+ STATIC_CONTRACT_MODE_COOPERATIVE;
+
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
+ if (argDest->IsStructPassedInRegs())
+ {
+ argDest->ZeroStructInRegisters(pMT->GetNumInstanceFieldBytes());
+ return;
+ }
+
+#endif
+ InitValueClass(argDest->GetDestinationAddress(), pMT);
+}
+
#if defined (VERIFY_HEAP)
#include "dbginterface.h"
@@ -3245,7 +3316,7 @@ BOOL Nullable::UnBox(void* destPtr, OBJECTREF boxedVal, MethodTable* destMT)
if (boxedVal == NULL)
{
- // logicall we are doing *dest->HasValueAddr(destMT) = false;
+ // Logically we are doing *dest->HasValueAddr(destMT) = false;
// We zero out the whole structure becasue it may contain GC references
// and these need to be initialized to zero. (could optimize in the non-GC case)
InitValueClass(destPtr, destMT);
@@ -3302,7 +3373,7 @@ BOOL Nullable::UnBoxNoGC(void* destPtr, OBJECTREF boxedVal, MethodTable* destMT)
if (boxedVal == NULL)
{
- // logicall we are doing *dest->HasValueAddr(destMT) = false;
+ // Logically we are doing *dest->HasValueAddr(destMT) = false;
// We zero out the whole structure becasue it may contain GC references
// and these need to be initialized to zero. (could optimize in the non-GC case)
InitValueClass(destPtr, destMT);
@@ -3328,6 +3399,64 @@ BOOL Nullable::UnBoxNoGC(void* destPtr, OBJECTREF boxedVal, MethodTable* destMT)
}
//===============================================================================
+// Special Logic to unbox a boxed T as a nullable<T> into an argument
+// specified by the argDest.
+// Does not handle type equivalence (may conservatively return FALSE)
+BOOL Nullable::UnBoxIntoArgNoGC(ArgDestination *argDest, OBJECTREF boxedVal, MethodTable* destMT)
+{
+ CONTRACTL
+ {
+ NOTHROW;
+ GC_NOTRIGGER;
+ MODE_COOPERATIVE;
+ SO_TOLERANT;
+ }
+ CONTRACTL_END;
+
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (argDest->IsStructPassedInRegs())
+ {
+ // We should only get here if we are unboxing a T as a Nullable<T>
+ _ASSERTE(IsNullableType(destMT));
+
+ // We better have a concrete instantiation, or our field offset asserts are not useful
+ _ASSERTE(!destMT->ContainsGenericVariables());
+
+ if (boxedVal == NULL)
+ {
+ // Logically we are doing *dest->HasValueAddr(destMT) = false;
+ // We zero out the whole structure because it may contain GC references
+ // and these need to be initialized to zero. (could optimize in the non-GC case)
+ InitValueClassArg(argDest, destMT);
+ }
+ else
+ {
+ if (!IsNullableForTypeNoGC(destMT, boxedVal->GetMethodTable()))
+ {
+ // For safety's sake, also allow true nullables to be unboxed normally.
+ // This should not happen normally, but we want to be robust
+ if (destMT == boxedVal->GetMethodTable())
+ {
+ CopyValueClassArg(argDest, boxedVal->GetData(), destMT, boxedVal->GetAppDomain(), 0);
+ return TRUE;
+ }
+ return FALSE;
+ }
+
+ Nullable* dest = (Nullable*)argDest->GetStructGenRegDestinationAddress();
+ *dest->HasValueAddr(destMT) = true;
+ int destOffset = (BYTE*)dest->ValueAddr(destMT) - (BYTE*)dest;
+ CopyValueClassArg(argDest, boxedVal->UnBox(), boxedVal->GetMethodTable(), boxedVal->GetAppDomain(), destOffset);
+ }
+ return TRUE;
+ }
+
+#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+ return UnBoxNoGC(argDest->GetDestinationAddress(), boxedVal, destMT);
+}
+
+//===============================================================================
// Special Logic to unbox a boxed T as a nullable<T>
// Does not do any type checks.
void Nullable::UnBoxNoCheck(void* destPtr, OBJECTREF boxedVal, MethodTable* destMT)
@@ -3350,7 +3479,7 @@ void Nullable::UnBoxNoCheck(void* destPtr, OBJECTREF boxedVal, MethodTable* dest
if (boxedVal == NULL)
{
- // logicall we are doing *dest->HasValueAddr(destMT) = false;
+ // Logically we are doing *dest->HasValueAddr(destMT) = false;
// We zero out the whole structure becasue it may contain GC references
// and these need to be initialized to zero. (could optimize in the non-GC case)
InitValueClass(destPtr, destMT);
diff --git a/src/vm/object.h b/src/vm/object.h
index abf15fa591..5808e6c0eb 100644
--- a/src/vm/object.h
+++ b/src/vm/object.h
@@ -94,6 +94,8 @@ class CtxStaticData;
class DomainAssembly;
class AssemblyNative;
class WaitHandleNative;
+class ArgDestination;
+
struct RCW;
#if CHECK_APP_DOMAIN_LEAKS
@@ -702,6 +704,7 @@ inline void ClearObjectReference(OBJECTREF* dst)
// CopyValueClass sets a value class field
void STDCALL CopyValueClassUnchecked(void* dest, void* src, MethodTable *pMT);
+void STDCALL CopyValueClassArgUnchecked(ArgDestination *argDest, void* src, MethodTable *pMT, int destOffset);
inline void InitValueClass(void *dest, MethodTable *pMT)
{
@@ -709,18 +712,24 @@ inline void InitValueClass(void *dest, MethodTable *pMT)
ZeroMemoryInGCHeap(dest, pMT->GetNumInstanceFieldBytes());
}
+// Initialize value class argument
+void InitValueClassArg(ArgDestination *argDest, MethodTable *pMT);
+
#if CHECK_APP_DOMAIN_LEAKS
void SetObjectReferenceChecked(OBJECTREF *dst,OBJECTREF ref, AppDomain *pAppDomain);
void CopyValueClassChecked(void* dest, void* src, MethodTable *pMT, AppDomain *pAppDomain);
+void CopyValueClassArgChecked(ArgDestination *argDest, void* src, MethodTable *pMT, AppDomain *pAppDomain, int destOffset);
#define SetObjectReference(_d,_r,_a) SetObjectReferenceChecked(_d, _r, _a)
#define CopyValueClass(_d,_s,_m,_a) CopyValueClassChecked(_d,_s,_m,_a)
+#define CopyValueClassArg(_d,_s,_m,_a,_o) CopyValueClassArgChecked(_d,_s,_m,_a,_o)
#else
#define SetObjectReference(_d,_r,_a) SetObjectReferenceUnchecked(_d, _r)
#define CopyValueClass(_d,_s,_m,_a) CopyValueClassUnchecked(_d,_s,_m)
+#define CopyValueClassArg(_d,_s,_m,_a,_o) CopyValueClassArgUnchecked(_d,_s,_m,_o)
#endif
@@ -4649,6 +4658,7 @@ public:
static OBJECTREF Box(void* src, MethodTable* nullable);
static BOOL UnBox(void* dest, OBJECTREF boxedVal, MethodTable* destMT);
static BOOL UnBoxNoGC(void* dest, OBJECTREF boxedVal, MethodTable* destMT);
+ static BOOL UnBoxIntoArgNoGC(ArgDestination *argDest, OBJECTREF boxedVal, MethodTable* destMT);
static void UnBoxNoCheck(void* dest, OBJECTREF boxedVal, MethodTable* destMT);
static OBJECTREF BoxedNullableNull(TypeHandle nullableType) { return 0; }
diff --git a/src/vm/reflectioninvocation.cpp b/src/vm/reflectioninvocation.cpp
index 777b120ad4..d3a3125ed0 100644
--- a/src/vm/reflectioninvocation.cpp
+++ b/src/vm/reflectioninvocation.cpp
@@ -34,6 +34,7 @@
#endif
#include "dbginterface.h"
+#include "argdestination.h"
// these flags are defined in XXXInfo.cs and only those that are used are replicated here
#define INVOCATION_FLAGS_UNKNOWN 0x00000000
@@ -1578,7 +1579,7 @@ FCIMPL4(Object*, RuntimeMethodHandle::InvokeMethod,
TypeHandle th = gc.pSig->GetArgumentAt(i);
- int ofs = argit.GetNextOffset();
+ int ofs = argit.GetNextOffset();
_ASSERTE(ofs != TransitionBlock::InvalidOffset);
#ifdef CALLDESCR_REGTYPEMAP
@@ -1590,16 +1591,22 @@ FCIMPL4(Object*, RuntimeMethodHandle::InvokeMethod,
// least one such argument we point the call worker at the floating point area of the frame (we leave
// it null otherwise since the worker can perform a useful optimization if it knows no floating point
// registers need to be set up).
- if ((ofs < 0) && (callDescrData.pFloatArgumentRegisters == NULL))
+
+ if (TransitionBlock::HasFloatRegister(ofs, argit.GetArgLocDescForStructInRegs()) &&
+ (callDescrData.pFloatArgumentRegisters == NULL))
+ {
callDescrData.pFloatArgumentRegisters = (FloatArgumentRegisters*) (pTransitionBlock +
- TransitionBlock::GetOffsetOfFloatArgumentRegisters());
+ TransitionBlock::GetOffsetOfFloatArgumentRegisters());
+ }
#endif
UINT structSize = argit.GetArgSize();
bool needsStackCopy = false;
- PVOID pArgDst = pTransitionBlock + ofs;
+ // A boxed Nullable<T> is represented as a boxed T. So to pass a Nullable<T> by reference,
+ // we have to create a Nullable<T> on the stack, copy the T into it, pass it to the callee, and
+ // after returning from the call, copy the T out of the Nullable<T> back into the boxed T.
TypeHandle nullableType = NullableTypeOfByref(th);
if (!nullableType.IsNull()) {
th = nullableType;
@@ -1607,17 +1614,21 @@ FCIMPL4(Object*, RuntimeMethodHandle::InvokeMethod,
needsStackCopy = true;
}
#ifdef ENREGISTERED_PARAMTYPE_MAXSIZE
- else
- if (argit.IsArgPassedByRef()) {
+ else if (argit.IsArgPassedByRef())
+ {
needsStackCopy = true;
}
#endif
+ ArgDestination argDest(pTransitionBlock, ofs, argit.GetArgLocDescForStructInRegs());
+
if(needsStackCopy)
{
MethodTable * pMT = th.GetMethodTable();
_ASSERTE(pMT && pMT->IsValueType());
+ PVOID pArgDst = argDest.GetDestinationAddress();
+
PVOID pStackCopy = _alloca(structSize);
*(PVOID *)pArgDst = pStackCopy;
pArgDst = pStackCopy;
@@ -1632,9 +1643,12 @@ FCIMPL4(Object*, RuntimeMethodHandle::InvokeMethod,
{
pValueClasses = new (_alloca(sizeof(ValueClassInfo))) ValueClassInfo(pStackCopy, pMT, pValueClasses);
}
+
+ // We need a new ArgDestination that points to the stack copy
+ argDest = ArgDestination(pStackCopy, 0, NULL);
}
- InvokeUtil::CopyArg(th, &(gc.args->m_Array[i]), pArgDst);
+ InvokeUtil::CopyArg(th, &(gc.args->m_Array[i]), &argDest);
}
ENDFORBIDGC();
diff --git a/src/vm/siginfo.cpp b/src/vm/siginfo.cpp
index 25fe157784..ec023e9d0b 100644
--- a/src/vm/siginfo.cpp
+++ b/src/vm/siginfo.cpp
@@ -25,6 +25,7 @@
#include "sigbuilder.h"
#include "../md/compiler/custattr.h"
#include <corhlprpriv.h>
+#include "argdestination.h"
/*******************************************************************/
const CorTypeInfo::CorTypeInfoEntry CorTypeInfo::info[ELEMENT_TYPE_MAX] =
@@ -4976,11 +4977,28 @@ void ReportPointersFromValueType(promote_func *fn, ScanContext *sc, PTR_MethodTa
} while (cur >= last);
}
+void ReportPointersFromValueTypeArg(promote_func *fn, ScanContext *sc, PTR_MethodTable pMT, ArgDestination *pSrc)
+{
+ WRAPPER_NO_CONTRACT;
+
+ if (!pMT->ContainsPointers())
+ return;
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (pSrc->IsStructPassedInRegs())
+ {
+ pSrc->ReportPointersFromStructInRegisters(fn, sc, pMT->GetNumInstanceFieldBytes());
+ return;
+ }
+#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+ ReportPointersFromValueType(fn, sc, pMT, pSrc->GetDestinationAddress());
+}
+
//------------------------------------------------------------------
// Perform type-specific GC promotion on the value (based upon the
// last type retrieved by NextArg()).
//------------------------------------------------------------------
-VOID MetaSig::GcScanRoots(PTR_VOID pValue,
+VOID MetaSig::GcScanRoots(ArgDestination *pValue,
promote_func *fn,
ScanContext* sc,
promote_carefully_func *fnc)
@@ -4997,7 +5015,7 @@ VOID MetaSig::GcScanRoots(PTR_VOID pValue,
CONTRACTL_END
- PTR_PTR_Object pArgPtr = (PTR_PTR_Object)pValue;
+ PTR_PTR_Object pArgPtr = (PTR_PTR_Object)pValue->GetDestinationAddress();
if (fnc == NULL)
fnc = &PromoteCarefully;
@@ -5083,7 +5101,7 @@ VOID MetaSig::GcScanRoots(PTR_VOID pValue,
}
#endif // ENREGISTERED_PARAMTYPE_MAXSIZE
- ReportPointersFromValueType(fn, sc, pMT, pArgPtr);
+ ReportPointersFromValueTypeArg(fn, sc, pMT, pValue);
}
break;
diff --git a/src/vm/siginfo.hpp b/src/vm/siginfo.hpp
index 06d3b66a24..586802b1b1 100644
--- a/src/vm/siginfo.hpp
+++ b/src/vm/siginfo.hpp
@@ -50,6 +50,7 @@ unsigned GetSizeForCorElementType(CorElementType etyp);
const ElementTypeInfo* GetElementTypeInfo(CorElementType etyp);
class SigBuilder;
+class ArgDestination;
typedef const struct HardCodedMetaSig *LPHARDCODEDMETASIG;
@@ -841,7 +842,7 @@ class MetaSig
// Perform type-specific GC promotion on the value (based upon the
// last type retrieved by NextArg()).
//------------------------------------------------------------------
- VOID GcScanRoots(PTR_VOID pValue, promote_func *fn,
+ VOID GcScanRoots(ArgDestination *pValue, promote_func *fn,
ScanContext* sc, promote_carefully_func *fnc = NULL);
//------------------------------------------------------------------
@@ -888,7 +889,7 @@ class MetaSig
BOOL IsReturnTypeVoid() const;
- enum RETURNTYPE {RETOBJ, RETBYREF, RETNONOBJ};
+ enum RETURNTYPE {RETOBJ, RETBYREF, RETNONOBJ, RETVALUETYPE};
CorElementType GetReturnTypeNormalized(TypeHandle * pthValueType = NULL) const;
diff --git a/src/vm/stackbuildersink.cpp b/src/vm/stackbuildersink.cpp
index bcd8d62f50..5d6aa7bb15 100644
--- a/src/vm/stackbuildersink.cpp
+++ b/src/vm/stackbuildersink.cpp
@@ -404,13 +404,16 @@ void CallDescrWithObjectArray(OBJECTREF& pServer,
#endif
#ifdef CALLDESCR_FPARGREGS
- // Under CALLDESCR_FPARGREGS -ve offsets indicate arguments in floating point registers. If we have at
+ // Under CALLDESCR_FPARGREGS we can have arguments in floating point registers. If we have at
// least one such argument we point the call worker at the floating point area of the frame (we leave
// it null otherwise since the worker can perform a useful optimization if it knows no floating point
// registers need to be set up).
- if (TransitionBlock::IsFloatArgumentRegisterOffset(ofs) && (pFloatArgumentRegisters == NULL))
+ if (TransitionBlock::HasFloatRegister(ofs, argit.GetArgLocDescForStructInRegs()) &&
+ (pFloatArgumentRegisters == NULL))
+ {
pFloatArgumentRegisters = (FloatArgumentRegisters*)(pTransitionBlock +
TransitionBlock::GetOffsetOfFloatArgumentRegisters());
+ }
#endif
if (argit.GetArgType() == ELEMENT_TYPE_BYREF)
diff --git a/src/vm/threads.cpp b/src/vm/threads.cpp
index 065c396929..5e4c05f514 100644
--- a/src/vm/threads.cpp
+++ b/src/vm/threads.cpp
@@ -2242,6 +2242,9 @@ Thread::Thread()
#endif
m_pAllLoggedTypes = NULL;
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ m_pHijackReturnTypeClass = NULL;
+#endif
}
diff --git a/src/vm/threads.h b/src/vm/threads.h
index 0ab550f741..da94c0e2ce 100644
--- a/src/vm/threads.h
+++ b/src/vm/threads.h
@@ -689,6 +689,9 @@ void InitThreadManager();
EXTERN_C void __stdcall OnHijackObjectTripThread(); // hijacked JIT code is returning an objectref
EXTERN_C void __stdcall OnHijackInteriorPointerTripThread(); // hijacked JIT code is returning a byref
EXTERN_C void __stdcall OnHijackScalarTripThread(); // hijacked JIT code is returning a non-objectref, non-FP
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+EXTERN_C void __stdcall OnHijackStructInRegsTripThread(); // hijacked JIT code is returning a struct in registers
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
#ifdef _TARGET_X86_
EXTERN_C void __stdcall OnHijackFloatingPointTripThread(); // hijacked JIT code is returning an FP value
@@ -1017,6 +1020,9 @@ typedef DWORD (*AppropriateWaitFunc) (void *args, DWORD timeout, DWORD option);
EXTERN_C void STDCALL OnHijackObjectWorker(HijackArgs * pArgs);
EXTERN_C void STDCALL OnHijackInteriorPointerWorker(HijackArgs * pArgs);
EXTERN_C void STDCALL OnHijackScalarWorker(HijackArgs * pArgs);
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+EXTERN_C void STDCALL OnHijackStructInRegsWorker(HijackArgs * pArgs);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
#endif // FEATURE_HIJACK
// This is the code we pass around for Thread.Interrupt, mainly for assertions
@@ -1067,7 +1073,9 @@ class Thread: public IUnknown
friend void STDCALL OnHijackObjectWorker(HijackArgs *pArgs);
friend void STDCALL OnHijackInteriorPointerWorker(HijackArgs *pArgs);
friend void STDCALL OnHijackScalarWorker(HijackArgs *pArgs);
-
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ friend void STDCALL OnHijackStructInRegsWorker(HijackArgs *pArgs);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
#ifdef PLATFORM_UNIX
friend void PALAPI HandleGCSuspensionForInterruptedThread(CONTEXT *interruptedContext);
#endif // PLATFORM_UNIX
@@ -5553,6 +5561,24 @@ public:
_ASSERTE(pAllLoggedTypes != NULL ? m_pAllLoggedTypes == NULL : TRUE);
m_pAllLoggedTypes = pAllLoggedTypes;
}
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+private:
+ EEClass* m_pHijackReturnTypeClass;
+public:
+ EEClass* GetHijackReturnTypeClass()
+ {
+ LIMITED_METHOD_CONTRACT;
+
+ return m_pHijackReturnTypeClass;
+ }
+
+ void SetHijackReturnTypeClass(EEClass* pClass)
+ {
+ LIMITED_METHOD_CONTRACT;
+
+ m_pHijackReturnTypeClass = pClass;
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
};
// End of class Thread
diff --git a/src/vm/threadsuspend.cpp b/src/vm/threadsuspend.cpp
index 10ea699faa..5d414192c4 100644
--- a/src/vm/threadsuspend.cpp
+++ b/src/vm/threadsuspend.cpp
@@ -7260,7 +7260,7 @@ void STDCALL OnHijackInteriorPointerWorker(HijackArgs * pArgs)
GC_ON_TRANSITIONS (GCOnTransition);
}
#endif
- pArgs->ReturnValue = (size_t)ptr;
+ *(size_t*)&pArgs->ReturnValue = (size_t)ptr;
}
GCPROTECT_END(); // trashes or here!
@@ -7327,6 +7327,90 @@ void STDCALL OnHijackScalarWorker(HijackArgs * pArgs)
#endif
}
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+// A hijacked method is returning a struct in registers to its caller.
+// The struct can possibly contain object references that we have to
+// protect.
+void STDCALL OnHijackStructInRegsWorker(HijackArgs * pArgs)
+{
+ CONTRACTL {
+ THROWS;
+ GC_TRIGGERS;
+ SO_TOLERANT;
+ } CONTRACTL_END;
+
+#ifdef HIJACK_NONINTERRUPTIBLE_THREADS
+ Thread *thread = GetThread();
+
+ EEClass* eeClass = thread->GetHijackReturnTypeClass();
+
+ OBJECTREF oref[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS];
+ int orefCount = 0;
+ for (int i = 0; i < eeClass->GetNumberEightBytes(); i++)
+ {
+ if (eeClass->GetEightByteClassification(i) == SystemVClassificationTypeIntegerReference)
+ {
+ oref[orefCount++] = ObjectToOBJECTREF(*(Object **) &pArgs->ReturnValue[i]);
+ }
+ }
+
+#ifdef FEATURE_STACK_PROBE
+ if (GetEEPolicy()->GetActionOnFailure(FAIL_StackOverflow) == eRudeUnloadAppDomain)
+ {
+ RetailStackProbe(ADJUST_PROBE(DEFAULT_ENTRY_PROBE_AMOUNT), thread);
+ }
+#endif
+
+ CONTRACT_VIOLATION(SOToleranceViolation);
+
+ thread->ResetThreadState(Thread::TS_Hijacked);
+
+ // Fix up our caller's stack, so it can resume from the hijack correctly
+ pArgs->ReturnAddress = (size_t)thread->m_pvHJRetAddr;
+
+ // Build a frame so that stack crawling can proceed from here back to where
+ // we will resume execution.
+ FrameWithCookie<HijackFrame> frame((void *)pArgs->ReturnAddress, thread, pArgs);
+
+ GCPROTECT_ARRAY_BEGIN(oref[0], orefCount)
+ {
+#ifdef _DEBUG
+ BOOL GCOnTransition = FALSE;
+ if (g_pConfig->FastGCStressLevel()) {
+ GCOnTransition = GC_ON_TRANSITIONS (FALSE);
+ }
+#endif
+
+#ifdef TIME_SUSPEND
+ g_SuspendStatistics.cntHijackTrap++;
+#endif
+
+ CommonTripThread();
+#ifdef _DEBUG
+ if (g_pConfig->FastGCStressLevel()) {
+ GC_ON_TRANSITIONS (GCOnTransition);
+ }
+#endif
+
+ // Update the references in the returned struct
+ orefCount = 0;
+ for (int i = 0; i < eeClass->GetNumberEightBytes(); i++)
+ {
+ if (eeClass->GetEightByteClassification(i) == SystemVClassificationTypeIntegerReference)
+ {
+ *((OBJECTREF *) &pArgs->ReturnValue[i]) = oref[orefCount++];
+ }
+ }
+ }
+ GCPROTECT_END();
+
+ frame.Pop();
+#else
+ PORTABILITY_ASSERT("OnHijackStructInRegsWorker not implemented on this platform.");
+#endif
+}
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
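For illustration, how the eightbyte classification decides which return-value slots get GC protection in the worker above (S is a hypothetical struct):

    // struct S { Object o; double d; }  is returned in RAX (eightbyte 0) and XMM0 (eightbyte 1):
    //     eightbyte 0 -> SystemVClassificationTypeIntegerReference => captured in oref[]
    //     eightbyte 1 -> SystemVClassificationTypeSSE              => left untouched
    // Only the reference eightbytes are saved before CommonTripThread() and written back after,
    // so a relocating GC during the hijack cannot leave stale pointers in the returned struct.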
#ifndef PLATFORM_UNIX
// Get the ExecutionState for the specified SwitchIn thread. Note that this is
@@ -7806,11 +7890,19 @@ BOOL Thread::HandledJITCase(BOOL ForTaskSwitchIn)
else
#endif // _TARGET_X86_
{
- MetaSig::RETURNTYPE type = esb.m_pFD->ReturnsObject();
+ MethodTable* pMT = NULL;
+ MetaSig::RETURNTYPE type = esb.m_pFD->ReturnsObject(INDEBUG_COMMA(false) &pMT);
if (type == MetaSig::RETOBJ)
pvHijackAddr = OnHijackObjectTripThread;
else if (type == MetaSig::RETBYREF)
pvHijackAddr = OnHijackInteriorPointerTripThread;
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ else if (type == MetaSig::RETVALUETYPE)
+ {
+ pThread->SetHijackReturnTypeClass(pMT->GetClass());
+ pvHijackAddr = OnHijackStructInRegsTripThread;
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
}
}
@@ -8354,7 +8446,8 @@ void PALAPI HandleGCSuspensionForInterruptedThread(CONTEXT *interruptedContext)
// Hijack the return address to point to the appropriate routine based on the method's return type.
void *pvHijackAddr = OnHijackScalarTripThread;
MethodDesc *pMethodDesc = codeInfo.GetMethodDesc();
- MetaSig::RETURNTYPE type = pMethodDesc->ReturnsObject();
+ MethodTable* pMT = NULL;
+ MetaSig::RETURNTYPE type = pMethodDesc->ReturnsObject(INDEBUG_COMMA(false) &pMT);
if (type == MetaSig::RETOBJ)
{
pvHijackAddr = OnHijackObjectTripThread;
@@ -8363,6 +8456,13 @@ void PALAPI HandleGCSuspensionForInterruptedThread(CONTEXT *interruptedContext)
{
pvHijackAddr = OnHijackInteriorPointerTripThread;
}
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ else if (type == MetaSig::RETVALUETYPE)
+ {
+ pThread->SetHijackReturnTypeClass(pMT->GetClass());
+ pvHijackAddr = OnHijackStructInRegsTripThread;
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
pThread->HijackThread(pvHijackAddr, &executionState);
}
diff --git a/tests/src/JIT/SIMD/project.lock.json b/tests/src/JIT/SIMD/project.lock.json
index 5a0680001c..6cf037e202 100644
--- a/tests/src/JIT/SIMD/project.lock.json
+++ b/tests/src/JIT/SIMD/project.lock.json
@@ -242,7 +242,10 @@
"ref/MonoTouch10/_._",
"ref/net46/System.Console.dll",
"ref/xamarinios10/_._",
- "ref/xamarinmac20/_._"
+ "ref/xamarinmac20/_._",
+ "ru/System.Console.xml",
+ "zh-hans/System.Console.xml",
+ "zh-hant/System.Console.xml"
]
},
"System.Diagnostics.Debug/4.0.10": {