74 files changed, 6735 insertions, 951 deletions
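This commit enables FEATURE_UNIX_AMD64_STRUCT_PASSING: structs of up to 16 bytes are split into one or two "eightbytes", each classified as INTEGER or SSE per the System V x86_64 ABI, and then passed or returned in GPRs or XMM registers accordingly. The JIT consumes that classification through the SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR introduced in src/inc/corinfo.h below. As a rough standalone sketch of the model, with hypothetical names (EightByteClass, Descriptor, classifyLongDouble are illustrative only, not the VM's actual classifier):

// Minimal sketch only, assuming the System V AMD64 classification rules.
// The real descriptor is SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR;
// the real classifier lives in the VM's type system.
#include <cstdio>

enum class EightByteClass { Integer, SSE };

struct Descriptor
{
    bool           passedInRegisters;
    unsigned       eightByteCount;
    EightByteClass eightByteClassifications[2];
    unsigned       eightByteSizes[2];
    unsigned       eightByteOffsets[2];
};

// A 16-byte struct such as { long l; double d; } splits into two eightbytes:
// eightbyte 0 is INTEGER (passed in a GPR such as RDI, returned in RAX),
// eightbyte 1 is SSE (passed/returned in XMM0).
Descriptor classifyLongDouble()
{
    Descriptor d{};              // zero-initialize all fields
    d.passedInRegisters = true;  // <= 16 bytes and register-classifiable
    d.eightByteCount    = 2;
    d.eightByteClassifications[0] = EightByteClass::Integer;
    d.eightByteSizes[0]   = 8;
    d.eightByteOffsets[0] = 0;
    d.eightByteClassifications[1] = EightByteClass::SSE;
    d.eightByteSizes[1]   = 8;
    d.eightByteOffsets[1] = 8;
    return d;
}

int main()
{
    Descriptor d = classifyLongDouble();
    for (unsigned i = 0; i < d.eightByteCount; i++)
    {
        std::printf("eightbyte %u: %s, size %u, offset %u\n",
                    i,
                    d.eightByteClassifications[i] == EightByteClass::Integer ? "INTEGER" : "SSE",
                    d.eightByteSizes[i],
                    d.eightByteOffsets[i]);
    }
    return 0;
}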
diff --git a/CMakeLists.txt b/CMakeLists.txt index 1be2864ecb..2ac0ebb07a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -345,6 +345,11 @@ endif (WIN32) endif (OVERRIDE_CMAKE_CXX_FLAGS) +if(CLR_CMAKE_PLATFORM_UNIX_TARGET_AMD64) +add_definitions(-DFEATURE_UNIX_AMD64_STRUCT_PASSING_ITF) +add_definitions(-DFEATURE_UNIX_AMD64_STRUCT_PASSING) +endif (CLR_CMAKE_PLATFORM_UNIX_TARGET_AMD64) + OPTION(CMAKE_ENABLE_CODE_COVERAGE "Enable code coverage" OFF) if(CMAKE_ENABLE_CODE_COVERAGE) diff --git a/src/debug/daccess/nidump.cpp b/src/debug/daccess/nidump.cpp index 44569d9874..c90c29f752 100644 --- a/src/debug/daccess/nidump.cpp +++ b/src/debug/daccess/nidump.cpp @@ -5678,7 +5678,12 @@ NativeImageDumper::EnumMnemonics s_MTFlagsLow[] = MTFLAG_ENTRY(HasVariance), MTFLAG_ENTRY(HasDefaultCtor), MTFLAG_ENTRY(HasPreciseInitCctors), +#if defined(FEATURE_HFA) MTFLAG_ENTRY(IsHFA), +#endif // FEATURE_HFA +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF) + MTFLAG_ENTRY(IsRegStructPassed), +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF MTFLAG_ENTRY(UNUSED_ComponentSize_4), MTFLAG_ENTRY(UNUSED_ComponentSize_5), MTFLAG_ENTRY(UNUSED_ComponentSize_6), diff --git a/src/inc/corinfo.h b/src/inc/corinfo.h index e0004a5948..cc2ce720b8 100644 --- a/src/inc/corinfo.h +++ b/src/inc/corinfo.h @@ -190,9 +190,10 @@ TODO: Talk about initializing strutures before use #include <specstrings.h> // For System V on the CLR type system number of registers to pass in and return a struct is the same. -#define SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS 2 -#define SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_RETURN_IN_REGISTERS SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS -#define SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS 16 +// The CLR type system allows only up to 2 eightbytes to be passed in registers. There are no SSEUP classification types. +#define CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS 2 +#define CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_RETURN_IN_REGISTERS 2 +#define CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS 16 // System V struct passing // The Classification types are described in the ABI spec at http://www.x86-64.org/documentation/abi.pdf @@ -212,7 +213,7 @@ enum SystemVClassificationType : unsigned __int8 SystemVClassificationTypeMAX = 7, }; - +// Represents classification information for a struct. struct SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR { SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR() @@ -220,19 +221,40 @@ struct SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR Initialize(); } - bool canPassInRegisters; - unsigned int eightByteCount; - SystemVClassificationType eightByteClassifications[SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS]; - unsigned int eightByteSizes[SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS]; - unsigned int eightByteOffsets[SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS]; + bool passedInRegisters; // Whether the struct is passable/passed in registers (this includes struct returns). + unsigned __int8 eightByteCount; // Number of eightbytes for this struct. + SystemVClassificationType eightByteClassifications[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS]; // The eightbytes type classification. + unsigned __int8 eightByteSizes[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS]; // The size of the eightbytes (an eightbyte could include padding; this is the size of the eightbyte without the padding). 
+ unsigned __int8 eightByteOffsets[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS]; // The start offset of the eightbytes (in bytes). + + + //------------------------------------------------------------------------ + // CopyFrom: Copies a struct classification into this one. + // + // Arguments: + // 'copyFrom' the struct classification to copy from. + // + void CopyFrom(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR& copyFrom) + { + passedInRegisters = copyFrom.passedInRegisters; + eightByteCount = copyFrom.eightByteCount; + + for (int i = 0; i < CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS; i++) + { + eightByteClassifications[i] = copyFrom.eightByteClassifications[i]; + eightByteSizes[i] = copyFrom.eightByteSizes[i]; + eightByteOffsets[i] = copyFrom.eightByteOffsets[i]; + } + } // Members +private: void Initialize() { - canPassInRegisters = false; + passedInRegisters = false; eightByteCount = 0; - for (int i = 0; i < SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS; i++) + for (int i = 0; i < CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS; i++) { eightByteClassifications[i] = SystemVClassificationTypeUnknown; eightByteSizes[i] = 0; diff --git a/src/inc/winwrap.h b/src/inc/winwrap.h index a670a51de0..c0c43eb74c 100644 --- a/src/inc/winwrap.h +++ b/src/inc/winwrap.h @@ -854,9 +854,13 @@ InterlockedCompareExchangePointer ( // Interlockedxxx64 that do not have intrinsics are only supported on Windows Server 2003 // or higher for X86 so define our own portable implementation +#undef InterlockedIncrement64 #define InterlockedIncrement64 __InterlockedIncrement64 +#undef InterlockedDecrement64 #define InterlockedDecrement64 __InterlockedDecrement64 +#undef InterlockedExchange64 #define InterlockedExchange64 __InterlockedExchange64 +#undef InterlockedExchangeAdd64 #define InterlockedExchangeAdd64 __InterlockedExchangeAdd64 __forceinline LONGLONG __InterlockedIncrement64(LONGLONG volatile *Addend) diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp index 0828a160c9..ea3cce6cc8 100644 --- a/src/jit/codegencommon.cpp +++ b/src/jit/codegencommon.cpp @@ -3648,7 +3648,7 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, RegState *regState) { #ifdef DEBUG - if (verbose) + if (verbose) printf("*************** In genFnPrologCalleeRegArgs() for %s regs\n", regState->rsIsFloat ? "float" : "int"); #endif @@ -3678,6 +3678,9 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, struct { unsigned varNum; // index into compiler->lvaTable[] for this register argument +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + var_types type; // the Jit type of this regArgTab entry +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) unsigned trashBy; // index into this regArgTab[] table of the register that will be copied to this register. // That is, for regArgTab[x].trashBy = y, argument register number 'y' will be copied to // argument register number 'x'. Only used when circular = true. @@ -3691,18 +3694,20 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, unsigned varNum; LclVarDsc * varDsc; - for (varNum = 0, varDsc = compiler->lvaTable; varNum < compiler->lvaCount; - varNum++ , varDsc++) + varNum++, varDsc++) { /* Is this variable a register arg? 
*/ - - if (!varDsc->lvIsParam) + if (!varDsc->lvIsParam) + { continue; + } - if (!varDsc->lvIsRegArg) + if (!varDsc->lvIsRegArg) + { continue; + } // When we have a promoted struct we have two possible LclVars that can represent the incoming argument // in the regArgTab[], either the original TYP_STRUCT argument or the introduced lvStructField. @@ -3726,13 +3731,17 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, // For register arguments that are independent promoted structs we put the promoted field varNum in the regArgTab[] if (varDsc->lvPromoted) + { continue; + } } else { // For register arguments that are not independent promoted structs we put the parent struct varNum in the regArgTab[] if (varDsc->lvIsStructField) + { continue; + } } } @@ -3743,19 +3752,89 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, var_types regType = varDsc->TypeGet(); #endif // !_TARGET_ARM_ - if (isFloatRegType(regType) != doingFloat) - continue; +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + if (regType != TYP_STRUCT) +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + { + // A struct might be passed partially in XMM register for System V calls. + // So a single arg might use both register files. + if (isFloatRegType(regType) != doingFloat) + { + continue; + } + } - /* Bingo - add it to our table */ - - regArgNum = genMapRegNumToRegArgNum(varDsc->lvArgReg, regType); - noway_assert(regArgNum < regState->rsCalleeRegArgNum); - noway_assert(regArgTab[regArgNum].slot == 0); // we better not have added it already (there better not be multiple vars representing this argument register) + int slots = 0; - regArgTab[regArgNum].varNum = varNum; - regArgTab[regArgNum].slot = 1; +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; + if (varDsc->TypeGet() == TYP_STRUCT) + { + CORINFO_CLASS_HANDLE typeHnd = varDsc->lvVerTypeInfo.GetClassHandle(); + assert(typeHnd != nullptr); + compiler->eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc); + if (!structDesc.passedInRegisters) + { + // The var is not passed in registers. + continue; + } - int slots = 1; + unsigned firstRegSlot = 0; + for (unsigned slotCounter = 0; slotCounter < structDesc.eightByteCount; slotCounter++) + { + regNumber regNum = varDsc->lvRegNumForSlot(slotCounter); + + var_types regType = compiler->getEightByteType(structDesc, slotCounter); + + regArgNum = genMapRegNumToRegArgNum(regNum, regType); + + if ((!doingFloat && + ((structDesc.eightByteClassifications[slotCounter] == SystemVClassificationTypeInteger) || + (structDesc.eightByteClassifications[slotCounter] == SystemVClassificationTypeIntegerReference))) || + (doingFloat && structDesc.eightByteClassifications[slotCounter] == SystemVClassificationTypeSSE)) + { + // Store the reg for the first slot. + if (slots == 0) + { + firstRegSlot = regArgNum; + } + + // Bingo - add it to our table + noway_assert(regArgNum < regState->rsCalleeRegArgNum); + noway_assert(regArgTab[regArgNum].slot == 0); // we better not have added it already (there better not be multiple vars representing this argument register) + regArgTab[regArgNum].varNum = varNum; + regArgTab[regArgNum].slot = (char)(slotCounter + 1); + regArgTab[regArgNum].type = regType; + slots++; + } + } + + if (slots == 0) + { + continue; // Nothing to do for this regState set. 
+ } + + regArgNum = firstRegSlot; + } + else +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + { + // Bingo - add it to our table + regArgNum = genMapRegNumToRegArgNum(varDsc->lvArgReg, regType); + noway_assert(regArgNum < regState->rsCalleeRegArgNum); + // we better not have added it already (there better not be multiple vars representing this argument register) + noway_assert(regArgTab[regArgNum].slot == 0); + + // Set the register type. +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + regArgTab[regArgNum].type = regType; +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + + regArgTab[regArgNum].varNum = varNum; + regArgTab[regArgNum].slot = 1; + + slots = 1; + } #ifdef _TARGET_ARM_ int lclSize = compiler->lvaLclSize(varNum); @@ -3778,9 +3857,23 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, for (int i = 0; i < slots; i ++) { +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // For structs passed in registers on System V systems, + // get the regType from the table for each slot. + if (regType == TYP_STRUCT) + { + regType = regArgTab[regArgNum + i].type; + } +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) regNumber regNum = genMapRegArgNumToRegNum(regArgNum + i, regType); - assert((i > 0) || (regNum == varDsc->lvArgReg)); + // lvArgReg could be INT or FLOAT reg. So the following assertion doesn't hold. + // The type of the register depends on the classification of the first eightbyte + // of the struct. For information on classification refer to the System V x86_64 ABI at: + // http://www.x86-64.org/documentation/abi.pdf +#if !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + assert((i > 0) || (regNum == varDsc->lvArgReg)); +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) // Is the arg dead on entry to the method ? if ((regArgMaskLive & genRegMask(regNum)) == 0) @@ -3831,8 +3924,8 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, /* If it goes on the stack or in a register that doesn't hold * an argument anymore -> CANNOT form a circular dependency */ - if ( varDsc->lvIsInReg() && - (genRegMask(regNum) & regArgMaskLive) ) + if (varDsc->lvIsInReg() && + (genRegMask(regNum) & regArgMaskLive)) { /* will trash another argument -> possible dependency * We may need several passes after the table is constructed @@ -3841,22 +3934,33 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, /* Maybe the argument stays in the register (IDEAL) */ if ((i == 0) && (varDsc->lvRegNum == regNum)) + { goto NON_DEP; + } +#if !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + if ((i == 1) && (varDsc->TypeGet() == TYP_STRUCT) && + (varDsc->lvOtherReg == regNum)) + { + goto NON_DEP; + } +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) if ((i == 1) && (genActualType(varDsc->TypeGet()) == TYP_LONG) && - (varDsc->lvOtherReg == regNum)) + (varDsc->lvOtherReg == regNum)) + { goto NON_DEP; + } if ((i == 1) && (genActualType(varDsc->TypeGet()) == TYP_DOUBLE) && - (REG_NEXT(varDsc->lvRegNum) == regNum)) + (REG_NEXT(varDsc->lvRegNum) == regNum)) + { goto NON_DEP; - + } regArgTab[regArgNum+i].circular = true; } else { NON_DEP: - regArgTab[regArgNum+i].circular = false; /* mark the argument register as free */ @@ -3870,7 +3974,6 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, * such that R1->R2 (that is, R1 needs to be moved to R2), R2->R3, ..., Rn->R1 */ bool change = true; - if (regArgMaskLive) { /* Possible circular dependencies still exist; the previous pass was not enough @@ -3882,15 +3985,20 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, for 
(argNum = 0; argNum < regState->rsCalleeRegArgNum; argNum++) { - /* If we already marked the argument as non-circular then continue */ + // If we already marked the argument as non-circular then continue if (!regArgTab[argNum].circular) + { continue; + } if (regArgTab[argNum].slot == 0) // Not a register argument + { continue; + } - varNum = regArgTab[argNum].varNum; noway_assert(varNum < compiler->lvaCount); + varNum = regArgTab[argNum].varNum; + noway_assert(varNum < compiler->lvaCount); varDsc = compiler->lvaTable + varNum; noway_assert(varDsc->lvIsParam && varDsc->lvIsRegArg); @@ -3899,11 +4007,19 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, noway_assert(!regArgTab[argNum].stackArg); regNumber regNum = genMapRegArgNumToRegNum(argNum, varDsc->TypeGet()); + regNumber destRegNum; if (regArgTab[argNum].slot == 1) { destRegNum = varDsc->lvRegNum; } +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + else + { + assert(regArgTab[argNum].slot == 2); + destRegNum = varDsc->lvOtherReg; + } +#else // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) else if (regArgTab[argNum].slot == 2 && genActualType(varDsc->TypeGet()) == TYP_LONG) { @@ -3915,7 +4031,7 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, assert(varDsc->TypeGet() == TYP_DOUBLE); destRegNum = REG_NEXT(varDsc->lvRegNum); } - +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) if (genRegMask(destRegNum) & regArgMaskLive) { /* we are trashing a live argument register - record it */ @@ -3949,33 +4065,47 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, } #endif - // TODO-AMD64-Bug? - homing of float argument registers with circular dependencies. -#ifdef _TARGET_AMD64_ - NYI_IF((regArgMaskLive & RBM_FLTARG_REGS) != 0, "Homing of float argument registers with circular dependencies not implemented"); -#endif // _TARGET_AMD64_ + // LSRA allocates registers to incoming parameters in order and will not overwrite + // a register still holding a live parameter. +#ifndef LEGACY_BACKEND + noway_assert(((regArgMaskLive & RBM_FLTARG_REGS) == 0) && "Homing of float argument registers with circular dependencies not implemented."); +#endif // LEGACY_BACKEND /* Now move the arguments to their locations. * First consider ones that go on the stack since they may * free some registers. */ regArgMaskLive = regState->rsCalleeRegArgMaskLiveIn; // reset the live in to what it was at the start - for (argNum = 0; argNum < regState->rsCalleeRegArgNum; argNum++) { emitAttr size; - /* If the arg is dead on entry to the method, skip it */ + // If this is the wrong register file, just continue. +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + if (regArgTab[argNum].type == TYP_UNDEF) + { + // This could happen if the reg in regArgTab[argNum] is of the other register file - + // for System V register passed structs where the first reg is GPR and the second an XMM reg. + // The next register file processing will process it. 
+ continue; + } +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // If the arg is dead on entry to the method, skip it if (regArgTab[argNum].processed) + { continue; + } if (regArgTab[argNum].slot == 0) // Not a register argument + { continue; + } varNum = regArgTab[argNum].varNum; noway_assert(varNum < compiler->lvaCount); varDsc = compiler->lvaTable + varNum; - /* If not a stack arg go to the next one */ + // If not a stack arg go to the next one #ifndef _TARGET_64BIT_ if (varDsc->lvType == TYP_LONG) @@ -3993,7 +4123,9 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, #endif // !_TARGET_64BIT_ { if (!regArgTab[argNum].stackArg) + { continue; + } } #if defined(_TARGET_ARM_) @@ -4021,10 +4153,15 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, { size = EA_SIZE(varDsc->lvSize()); #if defined(_TARGET_AMD64_) - storeType = (var_types) ((size <= 4) ? TYP_INT : TYP_I_IMPL); +#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING + storeType = (var_types)((size <= 4) ? TYP_INT : TYP_I_IMPL); // Must be 1, 2, 4, or 8, or else it wouldn't be passed in a register noway_assert(EA_SIZE_IN_BYTES(size) <= 8); assert((EA_SIZE_IN_BYTES(size) & (EA_SIZE_IN_BYTES(size) - 1)) == 0); +#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING + storeType = regArgTab[argNum].type; + size = emitActualTypeSize(storeType); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING #elif defined(_TARGET_ARM64_) // Must be <= 16 bytes or else it wouldn't be passed in registers noway_assert(EA_SIZE_IN_BYTES(size) <= 16); @@ -4060,7 +4197,7 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, regNumber srcRegNum = genMapRegArgNumToRegNum(argNum, storeType); - /* Stack argument - if the ref count is 0 don't care about it */ + // Stack argument - if the ref count is 0 don't care about it if (!varDsc->lvOnFrame) { @@ -4084,6 +4221,7 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, regArgTab[argNum].processed = true; regArgMaskLive &= ~genRegMask(srcRegNum); + #if defined(_TARGET_ARM_) if (storeType == TYP_DOUBLE) { @@ -4094,7 +4232,6 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, } /* Process any circular dependencies */ - if (regArgMaskLive) { unsigned begReg, destReg, srcReg; @@ -4105,21 +4242,39 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, if (doingFloat) { +#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) #if defined(_TARGET_ARM_) insCopy = INS_vmov; - +#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + insCopy = INS_mov; +#else +#error Error. Wrong architecture. +#endif // Compute xtraReg here when we have a float argument assert(xtraReg == REG_NA); regMaskTP fpAvailMask; fpAvailMask = RBM_FLT_CALLEE_TRASH & ~regArgMaskLive; +#if defined(_TARGET_ARM_) fpAvailMask &= RBM_DBL_REGS; +#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + fpAvailMask &= RBM_ALLFLOAT; +#else +#error Error. Wrong architecture. +#endif + if (fpAvailMask == RBM_NONE) { fpAvailMask = RBM_ALLFLOAT & ~regArgMaskLive; +#if defined(_TARGET_ARM_) fpAvailMask &= RBM_DBL_REGS; +#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + fpAvailMask &= RBM_ALLFLOAT; +#else +#error Error. Wrong architecture. 
+#endif } assert(fpAvailMask != RBM_NONE); @@ -4135,23 +4290,30 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, for (argNum = 0; argNum < regState->rsCalleeRegArgNum; argNum++) { - /* If not a circular dependency then continue */ - + // If not a circular dependency then continue if (!regArgTab[argNum].circular) + { continue; + } - /* If already processed the dependency then continue */ + // If already processed the dependency then continue if (regArgTab[argNum].processed) + { continue; + } if (regArgTab[argNum].slot == 0) // Not a register argument + { continue; - + } + destReg = begReg = argNum; - srcReg = regArgTab[argNum].trashBy; noway_assert(srcReg < regState->rsCalleeRegArgNum); + srcReg = regArgTab[argNum].trashBy; + noway_assert(srcReg < regState->rsCalleeRegArgNum); - varNumDest = regArgTab[destReg].varNum; noway_assert(varNumDest < compiler->lvaCount); + varNumDest = regArgTab[destReg].varNum; + noway_assert(varNumDest < compiler->lvaCount); varDscDest = compiler->lvaTable + varNumDest; noway_assert(varDscDest->lvIsParam && varDscDest->lvIsRegArg); @@ -4376,6 +4538,18 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, varDsc = compiler->lvaTable + varNum; regNumber regNum = genMapRegArgNumToRegNum(argNum, varDsc->TypeGet()); + // If this is the wrong register file, just continue. +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + if (regArgTab[argNum].type == TYP_UNDEF) + { + // This could happen if the reg in regArgTab[argNum] is of the other register file - + // for System V register passed structs where the first reg is GPR and the second an XMM reg. + // The next register file processing will process it. + regArgMaskLive &= ~genRegMask(regNum); + continue; + } +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + noway_assert(varDsc->lvIsParam && varDsc->lvIsRegArg); #ifndef _WIN64 //Right now we think that incoming arguments are not pointer sized. When we eventually @@ -4506,7 +4680,7 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, } #endif } - + noway_assert(regArgMaskLiveSave != regArgMaskLive); // if it doesn't change, we have an infinite loop } } @@ -6729,12 +6903,14 @@ void CodeGen::genProfilingEnterCallback(regNumber initReg, regNumber argReg = varDsc->lvArgReg; getEmitter()->emitIns_R_S(ins_Load(loadType), emitTypeSize(loadType), argReg, varNum, 0); +#if FEATURE_VARARG if (compiler->info.compIsVarArgs && varTypeIsFloating(loadType)) { regNumber intArgReg = compiler->getCallArgIntRegister(argReg); instruction ins = ins_CopyFloatToInt(loadType, TYP_LONG); inst_RV_RV(ins, argReg, intArgReg, loadType); } +#endif // FEATURE_VARARG } // If initReg is one of RBM_CALLEE_TRASH, then it needs to be zero'ed before using. 
@@ -8495,6 +8671,7 @@ void CodeGen::genFnProlog() #endif // !LEGACY_BACKEND RegState *regState; + FOREACH_REGISTER_FILE(regState) { if (regState->rsCalleeRegArgMaskLiveIn) @@ -10789,8 +10966,8 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize) //------------------------------------------------------------------------ // ARM-specific methods used by both the classic and RyuJIT //------------------------------------------------------------------------ -#ifdef _TARGET_ARM_ -CORINFO_CLASS_HANDLE Compiler::GetHfaClassHandle(GenTreePtr tree) +#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) +CORINFO_CLASS_HANDLE Compiler::GetStructClassHandle(GenTreePtr tree) { if (tree->TypeGet() == TYP_STRUCT) { @@ -10809,7 +10986,7 @@ CORINFO_CLASS_HANDLE Compiler::GetHfaClassHandle(GenTreePtr tree) case GT_RETURN: assert(tree->gtOp.gtOp1->gtOper == GT_LCL_VAR); - return GetHfaClassHandle(tree->gtOp.gtOp1); + return GetStructClassHandle(tree->gtOp.gtOp1); case GT_LDOBJ: return tree->gtLdObj.gtClass; @@ -10823,15 +11000,35 @@ CORINFO_CLASS_HANDLE Compiler::GetHfaClassHandle(GenTreePtr tree) case GT_ASG: assert(tree->gtOp.gtOp1->gtOper == GT_LCL_VAR || tree->gtOp.gtOp1->gtOper == GT_LCL_FLD); - return GetHfaClassHandle(tree->gtOp.gtOp1); - + return GetStructClassHandle(tree->gtOp.gtOp1); default: - unreached(); + return NO_CLASS_HANDLE; } } return NO_CLASS_HANDLE; } +#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING +bool Compiler::IsRegisterPassable(CORINFO_CLASS_HANDLE hClass) +{ + if (hClass == NO_CLASS_HANDLE) + { + return false; + } + + SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; + eeGetSystemVAmd64PassStructInRegisterDescriptor(hClass, &structDesc); + return structDesc.passedInRegisters; +} +bool Compiler::IsRegisterPassable(GenTreePtr tree) +{ + return IsRegisterPassable(GetStructClassHandle(tree)); +} +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + +#ifdef _TARGET_ARM_ bool Compiler::IsHfa(CORINFO_CLASS_HANDLE hClass) { return varTypeIsFloating(GetHfaType(hClass)); @@ -10839,12 +11036,12 @@ bool Compiler::IsHfa(CORINFO_CLASS_HANDLE hClass) bool Compiler::IsHfa(GenTreePtr tree) { - return IsHfa(GetHfaClassHandle(tree)); + return IsHfa(GetStructClassHandle(tree)); } var_types Compiler::GetHfaType(GenTreePtr tree) { - return (tree->TypeGet() == TYP_STRUCT) ? GetHfaType(GetHfaClassHandle(tree)) : TYP_UNDEF; + return (tree->TypeGet() == TYP_STRUCT) ? 
GetHfaType(GetStructClassHandle(tree)) : TYP_UNDEF; } unsigned Compiler::GetHfaSlots(GenTreePtr tree) diff --git a/src/jit/codegenlegacy.cpp b/src/jit/codegenlegacy.cpp index e37322d3b4..0914f7d7d6 100644 --- a/src/jit/codegenlegacy.cpp +++ b/src/jit/codegenlegacy.cpp @@ -12870,7 +12870,7 @@ void CodeGen::genCodeForBBlist() genStackLevel = 0; #if FEATURE_STACK_FP_X87 genResetFPstkLevel(); -#endif //FEATURE_STACK_FP_X87 +#endif // FEATURE_STACK_FP_X87 #if !FEATURE_FIXED_OUT_ARGS /* Check for inserted throw blocks and adjust genStackLevel */ diff --git a/src/jit/codegenlinear.h b/src/jit/codegenlinear.h index 57eac7ced4..6a030eb926 100644 --- a/src/jit/codegenlinear.h +++ b/src/jit/codegenlinear.h @@ -103,6 +103,10 @@ void genConsumeBlockOp(GenTreeBlkOp* blkNode, regNumber dstReg, regNumber srcReg, regNumber sizeReg); +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + void genConsumePutArgStk(GenTreePutArgStk* putArgStkNode, regNumber dstReg, regNumber srcReg, regNumber sizeReg); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + void genConsumeRegs(GenTree* tree); void genConsumeOperands(GenTreeOp* tree); @@ -126,6 +130,11 @@ void genCodeForCpBlkUnroll (GenTreeCpBlk* cpBlkNode); +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + void genCodeForPutArgRepMovs(GenTreePutArgStk* putArgStkNode); + void genCodeForPutArgUnroll(GenTreePutArgStk* putArgStkNode); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + void genCodeForLoadOffset(instruction ins, emitAttr size, regNumber dst, GenTree* base, unsigned offset); void genCodeForStoreOffset(instruction ins, emitAttr size, regNumber dst, GenTree* base, unsigned offset); @@ -150,6 +159,18 @@ void genJmpMethod(GenTreePtr jmp); +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + void genGetStructTypeSizeOffset(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR& structDesc, + var_types* type0, + var_types* type1, + emitAttr* size0, + emitAttr* size1, + unsigned __int8* offset0, + unsigned __int8* offset1); + + bool genStoreRegisterReturnInLclVar(GenTreePtr treeNode); +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + void genLclHeap(GenTreePtr tree); bool genIsRegCandidateLocal (GenTreePtr tree) diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp index 076ba7c262..7064862c4c 100644 --- a/src/jit/codegenxarch.cpp +++ b/src/jit/codegenxarch.cpp @@ -785,7 +785,6 @@ void CodeGen::genCodeForBBlist() #endif /* Both stacks should always be empty on exit from a basic block */ - noway_assert(genStackLevel == 0); #ifdef _TARGET_AMD64_ @@ -1571,6 +1570,7 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode) if (!treeNode->InReg() && !(treeNode->gtFlags & GTF_SPILLED)) { assert(!isRegCandidate); + emit->emitIns_R_S(ins_Load(treeNode->TypeGet(), compiler->isSIMDTypeLocalAligned(lcl->gtLclNum)), emitTypeSize(treeNode), treeNode->gtRegNum, lcl->gtLclNum, 0); genProduceReg(treeNode); @@ -1618,85 +1618,98 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode) case GT_STORE_LCL_FLD: { - noway_assert(targetType != TYP_STRUCT); - noway_assert(!treeNode->InReg()); - assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet())); +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (!genStoreRegisterReturnInLclVar(treeNode)) +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + { + noway_assert(targetType != TYP_STRUCT); + noway_assert(!treeNode->InReg()); + assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet())); #ifdef FEATURE_SIMD - // storing of TYP_SIMD12 (i.e. 
Vector3) field - if (treeNode->TypeGet() == TYP_SIMD12) - { - genStoreLclFldTypeSIMD12(treeNode); - break; - } + // storing of TYP_SIMD12 (i.e. Vector3) field + if (treeNode->TypeGet() == TYP_SIMD12) + { + genStoreLclFldTypeSIMD12(treeNode); + break; + } #endif - GenTreePtr op1 = treeNode->gtOp.gtOp1; - genConsumeRegs(op1); - emit->emitInsBinary(ins_Store(targetType), emitTypeSize(treeNode), treeNode, op1); + GenTreePtr op1 = treeNode->gtOp.gtOp1; + genConsumeRegs(op1); + emit->emitInsBinary(ins_Store(targetType), emitTypeSize(treeNode), treeNode, op1); + } } break; case GT_STORE_LCL_VAR: { - noway_assert(targetType != TYP_STRUCT); - assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet())); +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (!genStoreRegisterReturnInLclVar(treeNode)) +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + { + noway_assert(targetType != TYP_STRUCT); + assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet())); - unsigned lclNum = treeNode->AsLclVarCommon()->gtLclNum; - LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]); + unsigned lclNum = treeNode->AsLclVarCommon()->gtLclNum; + LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]); - // Ensure that lclVar nodes are typed correctly. - assert(!varDsc->lvNormalizeOnStore() || treeNode->TypeGet() == genActualType(varDsc->TypeGet())); + // Ensure that lclVar nodes are typed correctly. + assert(!varDsc->lvNormalizeOnStore() || treeNode->TypeGet() == genActualType(varDsc->TypeGet())); #if !defined(_TARGET_64BIT_) - if (treeNode->TypeGet() == TYP_LONG) - { - genStoreLongLclVar(treeNode); - break; - } + if (treeNode->TypeGet() == TYP_LONG) + { + genStoreLongLclVar(treeNode); + break; + } #endif // !defined(_TARGET_64BIT_) - GenTreePtr op1 = treeNode->gtOp.gtOp1; - genConsumeRegs(op1); - if (treeNode->gtRegNum == REG_NA) - { - // stack store - emit->emitInsMov(ins_Store(targetType, compiler->isSIMDTypeLocalAligned(lclNum)), emitTypeSize(treeNode), treeNode); - varDsc->lvRegNum = REG_STK; - } - else - { - bool containedOp1 = op1->isContained(); - // Look for the case where we have a constant zero which we've marked for reuse, - // but which isn't actually in the register we want. In that case, it's better to create - // zero in the target register, because an xor is smaller than a copy. Note that we could - // potentially handle this in the register allocator, but we can't always catch it there - // because the target may not have a register allocated for it yet. - if (!containedOp1 && (op1->gtRegNum != treeNode->gtRegNum) && op1->IsZero()) + GenTreePtr op1 = treeNode->gtOp.gtOp1; + genConsumeRegs(op1); + + if (treeNode->gtRegNum == REG_NA) { - op1->gtRegNum = REG_NA; - op1->ResetReuseRegVal(); - containedOp1 = true; + // stack store + emit->emitInsMov(ins_Store(targetType, compiler->isSIMDTypeLocalAligned(lclNum)), emitTypeSize(treeNode), treeNode); + varDsc->lvRegNum = REG_STK; } - if (containedOp1) + else { - // Currently, we assume that the contained source of a GT_STORE_LCL_VAR writing to a register - // must be a constant. However, in the future we might want to support a contained memory op. - // This is a bit tricky because we have to decide it's contained before register allocation, - // and this would be a case where, once that's done, we need to mark that node as always - // requiring a register - which we always assume now anyway, but once we "optimize" that - // we'll have to take cases like this into account. 
- assert((op1->gtRegNum == REG_NA) && op1->OperIsConst()); - genSetRegToConst(treeNode->gtRegNum, targetType, op1); + bool containedOp1 = op1->isContained(); + // Look for the case where we have a constant zero which we've marked for reuse, + // but which isn't actually in the register we want. In that case, it's better to create + // zero in the target register, because an xor is smaller than a copy. Note that we could + // potentially handle this in the register allocator, but we can't always catch it there + // because the target may not have a register allocated for it yet. + if (!containedOp1 && (op1->gtRegNum != treeNode->gtRegNum) && op1->IsZero()) + { + op1->gtRegNum = REG_NA; + op1->ResetReuseRegVal(); + containedOp1 = true; + } + if (containedOp1) + { + // Currently, we assume that the contained source of a GT_STORE_LCL_VAR writing to a register + // must be a constant. However, in the future we might want to support a contained memory op. + // This is a bit tricky because we have to decide it's contained before register allocation, + // and this would be a case where, once that's done, we need to mark that node as always + // requiring a register - which we always assume now anyway, but once we "optimize" that + // we'll have to take cases like this into account. + assert((op1->gtRegNum == REG_NA) && op1->OperIsConst()); + genSetRegToConst(treeNode->gtRegNum, targetType, op1); + } + else if (op1->gtRegNum != treeNode->gtRegNum) + { + assert(op1->gtRegNum != REG_NA); + emit->emitInsBinary(ins_Move_Extend(targetType, true), emitTypeSize(treeNode), treeNode, op1); + } } - else if (op1->gtRegNum != treeNode->gtRegNum) + if (treeNode->gtRegNum != REG_NA) { - assert(op1->gtRegNum != REG_NA); - emit->emitInsBinary(ins_Move_Extend(targetType, true), emitTypeSize(treeNode), treeNode, op1); + genProduceReg(treeNode); } } - if (treeNode->gtRegNum != REG_NA) - genProduceReg(treeNode); } break; @@ -1717,6 +1730,15 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode) GenTreePtr op1 = treeNode->gtOp.gtOp1; if (targetType == TYP_VOID) { +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (compiler->info.compRetBuffArg != BAD_VAR_NUM) + { + // System V AMD64 spec requires that when a struct is returned by a hidden + // argument the RAX should contain the value of the hidden retbuf arg. + emit->emitIns_R_S(INS_mov, EA_BYREF, REG_RAX, compiler->info.compRetBuffArg, 0); + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + assert(op1 == nullptr); } #if !defined(_TARGET_64BIT_) @@ -1742,53 +1764,233 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode) #endif // !defined(_TARGET_64BIT_) else { - assert(op1 != nullptr); - noway_assert(op1->gtRegNum != REG_NA); - - // !! NOTE !! genConsumeReg will clear op1 as GC ref after it has - // consumed a reg for the operand. This is because the variable - // is dead after return. But we are issuing more instructions - // like "profiler leave callback" after this consumption. So - // if you are issuing more instructions after this point, - // remember to keep the variable live up until the new method - // exit point where it is actually dead. - genConsumeReg(op1); - - regNumber retReg = varTypeIsFloating(treeNode) ? 
REG_FLOATRET : REG_INTRET; -#ifdef _TARGET_X86_ - if (varTypeIsFloating(treeNode)) +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (treeNode->TypeGet() == TYP_STRUCT && + treeNode->gtOp.gtOp1->OperGet() == GT_LCL_VAR) { - if (genIsRegCandidateLocal(op1) && !compiler->lvaTable[op1->gtLclVarCommon.gtLclNum].lvRegister) + GenTreeLclVarCommon* lclVarPtr = treeNode->gtOp.gtOp1->AsLclVarCommon(); + LclVarDsc* varDsc = &(compiler->lvaTable[lclVarPtr->gtLclNum]); + assert(varDsc->lvDontPromote); + + CORINFO_CLASS_HANDLE typeHnd = varDsc->lvVerTypeInfo.GetClassHandle(); + assert(typeHnd != nullptr); + + SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; + compiler->eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc); + assert(structDesc.passedInRegisters); + assert(structDesc.eightByteCount == CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS); + + regNumber retReg0 = REG_NA; + emitAttr size0 = EA_UNKNOWN; + unsigned offset0 = structDesc.eightByteOffsets[0]; + regNumber retReg1 = REG_NA; + emitAttr size1 = EA_UNKNOWN; + unsigned offset1 = structDesc.eightByteOffsets[1]; + + bool firstIntUsed = false; + bool firstFloatUsed = false; + + var_types type0 = TYP_UNKNOWN; + var_types type1 = TYP_UNKNOWN; + + // Set the first eightbyte data + switch (structDesc.eightByteClassifications[0]) { - // Store local variable to its home location, if necessary. - if ((op1->gtFlags & GTF_REG_VAL) != 0) + case SystemVClassificationTypeInteger: + if (structDesc.eightByteSizes[0] <= 4) + { + retReg0 = REG_INTRET; + size0 = EA_4BYTE; + type0 = TYP_INT; + firstIntUsed = true; + } + else if (structDesc.eightByteSizes[0] <= 8) + { + retReg0 = REG_LNGRET; + size0 = EA_8BYTE; + type0 = TYP_LONG; + firstIntUsed = true; + } + else + { + assert(false && "Bad int type."); + } + break; + case SystemVClassificationTypeIntegerReference: + assert(structDesc.eightByteSizes[0] == REGSIZE_BYTES); + retReg0 = REG_LNGRET; + size0 = EA_GCREF; + type0 = TYP_REF; + firstIntUsed = true; + break; + case SystemVClassificationTypeSSE: + if (structDesc.eightByteSizes[0] <= 4) + { + retReg0 = REG_FLOATRET; + size0 = EA_4BYTE; + type0 = TYP_FLOAT; + firstFloatUsed = true; + } + else if (structDesc.eightByteSizes[0] <= 8) + { + retReg0 = REG_DOUBLERET; + size0 = EA_8BYTE; + type0 = TYP_DOUBLE; + firstFloatUsed = true; + } + else { - op1->gtFlags &= ~GTF_REG_VAL; - inst_TT_RV(ins_Store(op1->gtType, compiler->isSIMDTypeLocalAligned(op1->gtLclVarCommon.gtLclNum)), op1, op1->gtRegNum); + assert(false && "Bad float type."); // Not possible. } - // Now, load it to the fp stack. - getEmitter()->emitIns_S(INS_fld, emitTypeSize(op1), op1->AsLclVarCommon()->gtLclNum, 0); + break; + default: + assert(false && "Bad EightByte classification."); + break; } - else + + // Set the second eight byte data + switch (structDesc.eightByteClassifications[1]) { - // Spill the value, which should be in a register, then load it to the fp stack. - // TODO-X86-CQ: Deal with things that are already in memory (don't call genConsumeReg yet). - op1->gtFlags |= GTF_SPILL; - regSet.rsSpillTree(op1->gtRegNum, op1); - op1->gtFlags |= GTF_SPILLED; - op1->gtFlags &= ~GTF_SPILL; - - TempDsc* t = regSet.rsUnspillInPlace(op1); - inst_FS_ST(INS_fld, emitActualTypeSize(op1->gtType), t, 0); - op1->gtFlags &= ~GTF_SPILLED; - compiler->tmpRlsTemp(t); + case SystemVClassificationTypeInteger: + if (structDesc.eightByteSizes[1] <= 4) + { + if (firstIntUsed) + { + retReg1 = REG_INTRET_1; + } + else + { + retReg1 = REG_INTRET; + } + type1 = TYP_INT; + size1 = EA_4BYTE; + } + else if (structDesc.eightByteSizes[1] <= 8) + { + if (firstIntUsed) + { + retReg1 = REG_LNGRET_1; + } + else + { + retReg1 = REG_LNGRET; + } + type1 = TYP_LONG; + size1 = EA_8BYTE; + } + else + { + assert(false && "Bad int type."); + } + break; + case SystemVClassificationTypeIntegerReference: + assert(structDesc.eightByteSizes[1] == REGSIZE_BYTES); + if (firstIntUsed) + { + retReg1 = REG_LNGRET_1; + } + else + { + retReg1 = REG_LNGRET; + } + type1 = TYP_REF; + size1 = EA_GCREF; + break; + case SystemVClassificationTypeSSE: + if (structDesc.eightByteSizes[1] <= 4) + { + if (firstFloatUsed) + { + retReg1 = REG_FLOATRET_1; + } + else + { + retReg1 = REG_FLOATRET; + } + type1 = TYP_FLOAT; + size1 = EA_4BYTE; + } + else if (structDesc.eightByteSizes[1] <= 8) + { + if (firstFloatUsed) + { + retReg1 = REG_DOUBLERET_1; + } + else + { + retReg1 = REG_DOUBLERET; + } + type1 = TYP_DOUBLE; + size1 = EA_8BYTE; + } + else + { + assert(false && "Bad float type."); // Not possible. + } + break; + default: + assert(false && "Bad EightByte classification."); + break; } + + // Move the values into the return registers. + // + emit->emitIns_R_S(ins_Load(type0), size0, retReg0, lclVarPtr->gtLclNum, offset0); + emit->emitIns_R_S(ins_Load(type1), size1, retReg1, lclVarPtr->gtLclNum, offset1); } else -#endif // _TARGET_X86_ - if (op1->gtRegNum != retReg) +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING { - inst_RV_RV(ins_Copy(targetType), retReg, op1->gtRegNum, targetType); + assert(op1 != nullptr); + noway_assert(op1->gtRegNum != REG_NA); + + // !! NOTE !! genConsumeReg will clear op1 as GC ref after it has + // consumed a reg for the operand. This is because the variable + // is dead after return. But we are issuing more instructions + // like "profiler leave callback" after this consumption. So + // if you are issuing more instructions after this point, + // remember to keep the variable live up until the new method + // exit point where it is actually dead. + genConsumeReg(op1); + + regNumber retReg = varTypeIsFloating(treeNode) ? REG_FLOATRET : REG_INTRET; +#ifdef _TARGET_X86_ + if (varTypeIsFloating(treeNode)) + { + if (genIsRegCandidateLocal(op1) && !compiler->lvaTable[op1->gtLclVarCommon.gtLclNum].lvRegister) + { + // Store local variable to its home location, if necessary. + if ((op1->gtFlags & GTF_REG_VAL) != 0) + { + op1->gtFlags &= ~GTF_REG_VAL; + inst_TT_RV(ins_Store(op1->gtType, compiler->isSIMDTypeLocalAligned(op1->gtLclVarCommon.gtLclNum)), op1, op1->gtRegNum); + } + // Now, load it to the fp stack. + getEmitter()->emitIns_S(INS_fld, emitTypeSize(op1), op1->AsLclVarCommon()->gtLclNum, 0); + } + else + { + // Spill the value, which should be in a register, then load it to the fp stack. + // TODO-X86-CQ: Deal with things that are already in memory (don't call genConsumeReg yet). + op1->gtFlags |= GTF_SPILL; + regSet.rsSpillTree(op1->gtRegNum, op1); + op1->gtFlags |= GTF_SPILLED; + op1->gtFlags &= ~GTF_SPILL; + + TempDsc* t = regSet.rsUnspillInPlace(op1); + inst_FS_ST(INS_fld, emitActualTypeSize(op1->gtType), t, 0); + op1->gtFlags &= ~GTF_SPILLED; + compiler->tmpRlsTemp(t); + } + } + else +#endif // _TARGET_X86_ + { + if (op1->gtRegNum != retReg) + { + inst_RV_RV(ins_Copy(targetType), retReg, op1->gtRegNum, targetType); + } + } } } @@ -2468,6 +2670,14 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode) genPutArgStk(treeNode); #else // !_TARGET_X86_ { +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + + if (targetType == TYP_STRUCT) + { + genPutArgStk(treeNode); + break; + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING noway_assert(targetType != TYP_STRUCT); assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet())); @@ -2536,8 +2746,9 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode) case GT_PUTARG_REG: { +#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING noway_assert(targetType != TYP_STRUCT); - +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING // commas show up here commonly, as part of a nullchk operation GenTree *op1 = treeNode->gtOp.gtOp1; // If child node is not already in the register we need, move it @@ -2546,8 +2757,8 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode) { inst_RV_RV(ins_Copy(targetType), treeNode->gtRegNum, op1->gtRegNum, targetType); } + genProduceReg(treeNode); } - genProduceReg(treeNode); break; case GT_CALL: @@ -2767,6 +2978,198 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode) } } +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING +// This method handles storing a struct value that is returned in two registers +// to the local variable's home stack location. +// It returns true if this is a struct and storing of the returned +// register value is handled. It returns false otherwise. +bool +CodeGen::genStoreRegisterReturnInLclVar(GenTreePtr treeNode) +{ + if (treeNode->TypeGet() == TYP_STRUCT) + { + noway_assert(!treeNode->InReg()); + + GenTreeLclVarCommon* lclVarPtr = treeNode->AsLclVarCommon(); + + LclVarDsc * varDsc = &(compiler->lvaTable[lclVarPtr->gtLclNum]); + + CORINFO_CLASS_HANDLE typeHnd = varDsc->lvVerTypeInfo.GetClassHandle(); + assert(typeHnd != nullptr); + SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; + compiler->eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc); + + assert(structDesc.passedInRegisters); + assert(structDesc.eightByteCount == CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS); + + GenTreePtr op1 = treeNode->gtOp.gtOp1; + genConsumeRegs(op1); + + regNumber retReg0 = REG_NA; + emitAttr size0 = EA_UNKNOWN; + unsigned offset0 = structDesc.eightByteOffsets[0]; + regNumber retReg1 = REG_NA; + emitAttr size1 = EA_UNKNOWN; + unsigned offset1 = structDesc.eightByteOffsets[1]; + + bool firstIntUsed = false; + bool firstFloatUsed = false; + + var_types type0 = TYP_UNKNOWN; + var_types type1 = TYP_UNKNOWN; + + // Set the first eightbyte data + switch (structDesc.eightByteClassifications[0]) + { + case SystemVClassificationTypeInteger: + if (structDesc.eightByteSizes[0] <= 4) + { + retReg0 = REG_INTRET; + size0 = EA_4BYTE; + type0 = TYP_INT; + firstIntUsed = true; + } + else if (structDesc.eightByteSizes[0] <= 8) + { + retReg0 = REG_LNGRET; + size0 = EA_8BYTE; + type0 = TYP_LONG; + firstIntUsed = true; + } + else + { + assert(false && "Bad int type."); + } + break; + case SystemVClassificationTypeIntegerReference: + assert(structDesc.eightByteSizes[0] == REGSIZE_BYTES); + retReg0 = REG_LNGRET; + size0 = EA_GCREF; + type0 = TYP_REF; + firstIntUsed = true; + break; + case SystemVClassificationTypeSSE: + if (structDesc.eightByteSizes[0] <= 4) + { + retReg0 = REG_FLOATRET; + size0 = EA_4BYTE; + type0 = TYP_FLOAT; + firstFloatUsed = true; + } + else if (structDesc.eightByteSizes[0] <= 8) + { + retReg0 = REG_DOUBLERET; + size0 = EA_8BYTE; + type0 = TYP_DOUBLE; + firstFloatUsed = true; + } + else + { + assert(false && "Bad float type."); // Not possible. + } + break; + default: + assert(false && "Bad EightByte classification."); + break; + } + + // Set the second eight byte data + switch (structDesc.eightByteClassifications[1]) + { + case SystemVClassificationTypeInteger: + if (structDesc.eightByteSizes[1] <= 4) + { + if (firstIntUsed) + { + retReg1 = REG_INTRET_1; + } + else + { + retReg1 = REG_INTRET; + } + type1 = TYP_INT; + size1 = EA_4BYTE; + } + else if (structDesc.eightByteSizes[1] <= 8) + { + if (firstIntUsed) + { + retReg1 = REG_LNGRET_1; + } + else + { + retReg1 = REG_LNGRET; + } + type1 = TYP_LONG; + size1 = EA_8BYTE; + } + else + { + assert(false && "Bad int type."); + } + break; + case SystemVClassificationTypeIntegerReference: + assert(structDesc.eightByteSizes[1] == REGSIZE_BYTES); + if (firstIntUsed) + { + retReg1 = REG_LNGRET_1; + } + else + { + retReg1 = REG_LNGRET; + } + type1 = TYP_REF; + size1 = EA_GCREF; + break; + case SystemVClassificationTypeSSE: + if (structDesc.eightByteSizes[1] <= 4) + { + if (firstFloatUsed) + { + retReg1 = REG_FLOATRET_1; + } + else + { + retReg1 = REG_FLOATRET; + } + type1 = TYP_FLOAT; + size1 = EA_4BYTE; + } + else if (structDesc.eightByteSizes[1] <= 8) + { + if (firstFloatUsed) + { + retReg1 = REG_DOUBLERET_1; + } + else + { + retReg1 = REG_DOUBLERET; + } + type1 = TYP_DOUBLE; + size1 = EA_8BYTE; + } + else + { + assert(false && "Bad float type."); // Not possible. + } + break; + default: + assert(false && "Bad EightByte classification."); + break; + } + + // Store the values from the return registers into the local variable's stack slots. + // + + getEmitter()->emitIns_S_R(ins_Store(type0), size0, retReg0, lclVarPtr->gtLclNum, offset0); + getEmitter()->emitIns_S_R(ins_Store(type1), size1, retReg1, lclVarPtr->gtLclNum, offset1); + + return true; + } + + return false; +} +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING // Generate code for division (or mod) by power of two // or negative powers of two. (meaning -1 * a power of two, not 2^(-1)) @@ -3366,40 +3769,55 @@ void CodeGen::genCodeForInitBlk(GenTreeInitBlk* initBlkNode) // Generate code for a load from some address + offset -// base: tree node which can be either a local address or arbitrary node -// offset: distance from the base from which to load -void CodeGen::genCodeForLoadOffset(instruction ins, emitAttr size, regNumber dst, GenTree* base, unsigned offset) +// baseNode: tree node which can be either a local address or arbitrary node +// offset: distance from the baseNode from which to load +void CodeGen::genCodeForLoadOffset(instruction ins, emitAttr size, regNumber dst, GenTree* baseNode, unsigned offset) { emitter *emit = getEmitter(); - if (base->OperIsLocalAddr()) { - if (base->gtOper == GT_LCL_FLD_ADDR) - offset += base->gtLclFld.gtLclOffs; - emit->emitIns_R_S(ins, size, dst, base->gtLclVarCommon.gtLclNum, offset); + if (baseNode->OperIsLocalAddr()) { + if (baseNode->gtOper == GT_LCL_FLD_ADDR) + offset += baseNode->gtLclFld.gtLclOffs; + emit->emitIns_R_S(ins, size, dst, baseNode->gtLclVarCommon.gtLclNum, offset); } else { - emit->emitIns_R_AR(ins, size, dst, base->gtRegNum, offset); + emit->emitIns_R_AR(ins, size, dst, baseNode->gtRegNum, offset); } } // Generate code for a store to some address + offset -// base: tree node which can be either a local address or arbitrary node -// offset: distance from the base from which to load -void CodeGen::genCodeForStoreOffset(instruction ins, emitAttr size, regNumber src, GenTree* base, unsigned offset) +// baseNode: tree node which can be either a local address or arbitrary node +// offset: distance from the baseNode from which to load +void CodeGen::genCodeForStoreOffset(instruction ins, emitAttr size, regNumber src, GenTree* baseNode, unsigned offset) { emitter *emit = getEmitter(); - if (base->OperIsLocalAddr()) +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (baseNode->OperGet() == GT_PUTARG_STK) { - if (base->gtOper == GT_LCL_FLD_ADDR) - offset += base->gtLclFld.gtLclOffs; - emit->emitIns_S_R(ins, size, src, base->gtLclVarCommon.gtLclNum, offset); + GenTreePutArgStk* putArgStkNode = baseNode->AsPutArgStk(); + assert(putArgStkNode->gtOp.gtOp1->isContained()); + assert(putArgStkNode->gtOp.gtOp1->gtOp.gtOper == GT_LDOBJ); + + emit->emitIns_S_R(ins, size, src, compiler->lvaOutgoingArgSpaceVar, + (putArgStkNode->gtSlotNum * TARGET_POINTER_SIZE) + offset); } else +#endif // #ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING { - emit->emitIns_AR_R(ins, size, src, base->gtRegNum, offset); + + if (baseNode->OperIsLocalAddr()) + { + if (baseNode->gtOper == GT_LCL_FLD_ADDR) + offset += baseNode->gtLclFld.gtLclOffs; + emit->emitIns_S_R(ins, size, src, baseNode->gtLclVarCommon.gtLclNum, offset); + } + else + { + emit->emitIns_AR_R(ins, size, src, baseNode->gtRegNum, offset); + } } } @@ -3523,6 +3941,126 @@ void CodeGen::genCodeForCpBlkRepMovs(GenTreeCpBlk* cpBlkNode) instGen(INS_r_movsb); } +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING +// Generates PutArg code by performing a loop unroll + // + // 
TODO-Amd64-Unix: Try to share code with copyblk. +// The difference for now is thethe putarg_stk contains it's children, while cpyblk not. +// This creates differences in code. After some significant refactoring it could be reused. +void CodeGen::genCodeForPutArgUnroll(GenTreePutArgStk* putArgNode) +{ + // Make sure we got the arguments of the cpblk operation in the right registers + GenTreePtr dstAddr = putArgNode; + GenTreePtr srcAddr = putArgNode->gtOp.gtOp1; + + size_t size = putArgNode->gtNumSlots * TARGET_POINTER_SIZE; + assert(size <= CPBLK_UNROLL_LIMIT); + + emitter *emit = getEmitter(); + + assert(srcAddr->isContained()); + assert(srcAddr->gtOper == GT_LDOBJ); + + if (!srcAddr->gtOp.gtOp1->isContained()) + { + genConsumeReg(srcAddr->gtOp.gtOp1); + } + + unsigned offset = 0; + + // If the size of this struct is larger than 16 bytes + // let's use SSE2 to be able to do 16 byte at a time + // loads and stores. + if (size >= XMM_REGSIZE_BYTES) + { + assert(putArgNode->gtRsvdRegs != RBM_NONE); + regNumber xmmReg = genRegNumFromMask(putArgNode->gtRsvdRegs & RBM_ALLFLOAT); + assert(genIsValidFloatReg(xmmReg)); + size_t slots = size / XMM_REGSIZE_BYTES; + + while (slots-- > 0) + { + // Load + genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmReg, srcAddr->gtOp.gtOp1, offset); // Load the address of the child of the LdObj node. + // Store + genCodeForStoreOffset(INS_movdqu, EA_8BYTE, xmmReg, dstAddr, offset); + offset += XMM_REGSIZE_BYTES; + } + } + + // Fill the remainder (15 bytes or less) if there's one. + if ((size & 0xf) != 0) + { + // Grab the integer temp register to emit the remaining loads and stores. + regNumber tmpReg = genRegNumFromMask(putArgNode->gtRsvdRegs & RBM_ALLINT); + + if ((size & 8) != 0) + { +#ifdef _TARGET_X86_ + // TODO-X86-CQ: [1091735] Revisit block ops codegen. One example: use movq for 8 byte movs. + for (unsigned savedOffs = offset; offset < savedOffs + 8; offset += 4) + { + genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, srcAddr, offset); + genCodeForStoreOffset(INS_mov, EA_4BYTE, tmpReg, dstAddr, offset); + } +#else // !_TARGET_X86_ + genCodeForLoadOffset(INS_mov, EA_8BYTE, tmpReg, srcAddr->gtOp.gtOp1, offset); + genCodeForStoreOffset(INS_mov, EA_8BYTE, tmpReg, dstAddr, offset); + offset += 8; +#endif // !_TARGET_X86_ + } + if ((size & 4) != 0) + { + genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, srcAddr->gtOp.gtOp1, offset); + genCodeForStoreOffset(INS_mov, EA_4BYTE, tmpReg, dstAddr, offset); + offset += 4; + } + if ((size & 2) != 0) + { + genCodeForLoadOffset(INS_mov, EA_2BYTE, tmpReg, srcAddr->gtOp.gtOp1, offset); + genCodeForStoreOffset(INS_mov, EA_2BYTE, tmpReg, dstAddr, offset); + offset += 2; + } + if ((size & 1) != 0) + { + genCodeForLoadOffset(INS_mov, EA_1BYTE, tmpReg, srcAddr->gtOp.gtOp1, offset); + genCodeForStoreOffset(INS_mov, EA_1BYTE, tmpReg, dstAddr, offset); + } + } +} + +// Generate code for CpBlk by using rep movs +// Preconditions: +// The size argument of the PutArgStk (for structs) is a constant and is between +// CPBLK_UNROLL_LIMIT and CPBLK_MOVS_LIMIT bytes. +void CodeGen::genCodeForPutArgRepMovs(GenTreePutArgStk* putArgNode) +{ + + // Make sure we got the arguments of the cpblk operation in the right registers + GenTreePtr dstAddr = putArgNode; + GenTreePtr srcAddr = putArgNode->gtOp.gtOp1; +#ifdef DEBUG + size_t size = putArgNode->gtNumSlots * TARGET_POINTER_SIZE; +#endif // DEBUG + + // Validate state. 
+ assert(putArgNode->gtRsvdRegs == (RBM_RDI | RBM_RCX | RBM_RSI)); + +#ifdef DEBUG + assert(srcAddr->isContained()); + +#ifdef _TARGET_AMD64_ + assert(size > CPBLK_UNROLL_LIMIT); +#else + assert(size > CPBLK_UNROLL_LIMIT && size < CPBLK_MOVS_LIMIT); +#endif + +#endif // DEBUG + genConsumePutArgStk(putArgNode, REG_RDI, REG_RSI, REG_RCX); + instGen(INS_r_movsb); +} +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + // Generate code for CpObj nodes wich copy structs that have interleaved // GC pointers. // This will generate a sequence of movsq instructions for the cases of non-gc members @@ -3686,7 +4224,7 @@ void CodeGen::genCodeForCpBlk(GenTreeCpBlk* cpBlkNode) { #ifdef _TARGET_AMD64_ // Make sure we got the arguments of the cpblk operation in the right registers - GenTreePtr blockSize = cpBlkNode->Size(); + GenTreePtr blockSize = cpBlkNode->Size(); GenTreePtr dstAddr = cpBlkNode->Dest(); GenTreePtr srcAddr = cpBlkNode->Source(); @@ -3705,7 +4243,7 @@ void CodeGen::genCodeForCpBlk(GenTreeCpBlk* cpBlkNode) genEmitHelperCall(CORINFO_HELP_MEMCPY, 0, EA_UNKNOWN); #else // !_TARGET_AMD64_ - NYI_X86("Helper call for CpBlk"); + noway_assert(false && "Helper call for CpBlk is not needed."); #endif // !_TARGET_AMD64_ } @@ -4558,7 +5096,9 @@ regNumber CodeGen::genConsumeReg(GenTree *tree) // genUpdateLife() will also spill local var if marked as GTF_SPILL by calling CodeGen::genSpillVar genUpdateLife(tree); +#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING assert(tree->gtRegNum != REG_NA); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING // there are three cases where consuming a reg means clearing the bit in the live mask // 1. it was not produced by a local @@ -4678,6 +5218,82 @@ void CodeGen::genConsumeOperands(GenTreeOp* tree) } } +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING +void CodeGen::genConsumePutArgStk(GenTreePutArgStk* putArgNode, regNumber dstReg, regNumber srcReg, regNumber sizeReg) +{ + // The putArgNode children are always contained. We should not consume any registers. + + GenTree* dst = putArgNode; + +#ifdef DEBUG + // Get the GT_ADDR node, which is GT_LCL_VAR_ADDR (asserted below.) + GenTree* src = putArgNode->gtOp.gtOp1; + assert(src->OperGet() == GT_LDOBJ); + src = src->gtOp.gtOp1; +#else // !DEBUG + // Get the GT_ADDR node, which is GT_LCL_VAR_ADDR (asserted below.) + GenTree* src = putArgNode->gtOp.gtOp1->gtOp.gtOp1; +#endif // !DEBUG + + size_t size = putArgNode->gtNumSlots * TARGET_POINTER_SIZE; + GenTree* op1; + GenTree* op2; + + regNumber reg1, reg2, reg3; + op1 = dst; + reg1 = dstReg; + op2 = src; + reg2 = srcReg; + reg3 = sizeReg; + + if (reg2 != REG_NA && op2->gtRegNum != REG_NA) + { + genConsumeReg(op2); + } + + if ((reg1 != REG_NA) && (op1->gtRegNum != reg1)) + { +#if FEATURE_FIXED_OUT_ARGS + // Generate LEA instruction to load the stack of the outgoing var + SlotNum offset in RDI. + LclVarDsc * varDsc = &compiler->lvaTable[compiler->lvaOutgoingArgSpaceVar]; + int offset = varDsc->lvStkOffs + putArgNode->gtSlotNum * TARGET_POINTER_SIZE; + // Outgoing area always on top of the stack (relative to rsp.) + getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, reg1, REG_SPBASE, offset); +#else // !FEATURE_FIXED_OUT_ARGS + NYI_X86("Stack args for x86/RyuJIT"); +#endif // !FEATURE_FIXED_OUT_ARGS + + } + + if (op2->gtRegNum != reg2) + { + if (src->OperIsLocalAddr()) + { + // The OperLocalAddr is always contained. + assert(src->isContained()); + GenTreeLclVarCommon* lclNode = src->AsLclVarCommon(); + + // Generate LEA instruction to load the LclVar address in RSI. 
+ LclVarDsc * varLclDsc = &compiler->lvaTable[lclNode->gtLclNum]; + int offset = varLclDsc->lvStkOffs; + + // Outgoing area is always on top of the stack (relative to rsp.) + getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, reg2, (isFramePointerUsed() ? getFramePointerReg() : REG_SPBASE), offset); + } + else + { + assert(src->gtRegNum != REG_NA); + getEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, reg2, src->gtRegNum); + } + } + + if ((reg3 != REG_NA)) + { + inst_RV_IV(INS_mov, reg3, size, EA_8BYTE); + } +} +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + void CodeGen::genConsumeBlockOp(GenTreeBlkOp* blkNode, regNumber dstReg, regNumber srcReg, regNumber sizeReg) { // We have to consume the registers, and perform any copies, in the actual execution order. @@ -4827,7 +5443,6 @@ void CodeGen::genTransferRegGCState(regNumber dst, regNumber src) } } - // generates an ip-relative call or indirect call via reg ('call reg') // pass in 'addr' for a relative call or 'base' for an indirect register call // methHnd - optional, only used for pretty printing @@ -4843,9 +5458,9 @@ void CodeGen::genEmitCall(int callType, bool isJump, bool isNoGC) { -#ifndef _TARGET_X86_ ssize_t argSize = 0; -#endif // !_TARGET_X86_ +#if !defined(_TARGET_X86_) ssize_t argSize = 0; +#endif // !defined(_TARGET_X86_) getEmitter()->emitIns_Call(emitter::EmitCallType(callType), methHnd, INDEBUG_LDISASM_COMMA(sigInfo) @@ -4867,14 +5482,14 @@ void CodeGen::genEmitCall(int callType, CORINFO_METHOD_HANDLE methHnd, INDEBUG_LDISASM_COMMA(CORINFO_SIG_INFO* sigInfo) - GenTreeIndir* indir + GenTreeIndir* indir X86_ARG(ssize_t argSize), emitAttr retSize, IL_OFFSETX ilOffset) { -#ifndef _TARGET_X86_ ssize_t argSize = 0; -#endif // !_TARGET_X86_ +#if !defined(_TARGET_X86_) ssize_t argSize = 0; +#endif // !defined(_TARGET_X86_) genConsumeAddress(indir->Addr()); getEmitter()->emitIns_Call(emitter::EmitCallType(callType), @@ -4920,13 +5535,49 @@ void CodeGen::genCallInstruction(GenTreePtr node) if (curArgTabEntry->regNum == REG_STK) continue; - regNumber argReg = curArgTabEntry->regNum; - genConsumeReg(argNode); - if (argNode->gtRegNum != argReg) +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + // Deal with multi-register passed struct args. + if (argNode->OperGet() == GT_LIST) { - inst_RV_RV(ins_Move_Extend(argNode->TypeGet(), argNode->InReg()), argReg, argNode->gtRegNum); + GenTreeArgList* argListPtr = argNode->AsArgList(); + unsigned iterationNum = 0; + for (; argListPtr; argListPtr = argListPtr->Rest(), iterationNum++) + { + GenTreePtr putArgRegNode = argListPtr->gtOp.gtOp1; + assert(putArgRegNode->gtOper == GT_PUTARG_REG); + regNumber argReg = REG_NA; + if (iterationNum == 0) + { + argReg = curArgTabEntry->regNum; + } + else if (iterationNum == 1) + { + argReg = curArgTabEntry->otherRegNum; + } + else + { + assert(false); // Illegal state.
+ } + + genConsumeReg(putArgRegNode); + if (putArgRegNode->gtRegNum != argReg) + { + inst_RV_RV(ins_Move_Extend(putArgRegNode->TypeGet(), putArgRegNode->InReg()), argReg, putArgRegNode->gtRegNum); + } + } + } + else +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + { + regNumber argReg = curArgTabEntry->regNum; + genConsumeReg(argNode); + if (argNode->gtRegNum != argReg) + { + inst_RV_RV(ins_Move_Extend(argNode->TypeGet(), argNode->InReg()), argReg, argNode->gtRegNum); + } } +#if FEATURE_VARARG // In the case of a varargs call, // the ABI dictates that if we have floating point args, // we must pass the enregistered arguments in both the @@ -4937,9 +5588,10 @@ void CodeGen::genCallInstruction(GenTreePtr node) instruction ins = ins_CopyFloatToInt(argNode->TypeGet(), TYP_LONG); inst_RV_RV(ins, argNode->gtRegNum, targetReg); } +#endif // FEATURE_VARARG } -#ifdef _TARGET_X86_ +#if defined(_TARGET_X86_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) // The call will pop its arguments. // for each putarg_stk: ssize_t stackArgBytes = 0; @@ -4949,16 +5601,31 @@ void CodeGen::genCallInstruction(GenTreePtr node) GenTreePtr arg = args->gtOp.gtOp1; if (arg->OperGet() != GT_ARGPLACE && !(arg->gtFlags & GTF_LATE_ARG)) { +#if defined(_TARGET_X86_) assert((arg->OperGet() == GT_PUTARG_STK) || (arg->OperGet() == GT_LONG)); if (arg->OperGet() == GT_LONG) { assert((arg->gtGetOp1()->OperGet() == GT_PUTARG_STK) && (arg->gtGetOp2()->OperGet() == GT_PUTARG_STK)); } +#endif // defined(_TARGET_X86_) + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (genActualType(arg->TypeGet()) == TYP_STRUCT) + { + if (arg->OperGet() == GT_PUTARG_STK) + { + GenTreeLdObj* ldObj = arg->gtGetOp1()->AsLdObj(); + stackArgBytes = compiler->info.compCompHnd->getClassSize(ldObj->gtClass); + } + } + else +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + stackArgBytes += genTypeSize(genActualType(arg->TypeGet())); } args = args->gtOp.gtOp2; } -#endif // _TARGET_X86_ +#endif // defined(_TARGET_X86_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) // Insert a null check on "this" pointer if asked. if (call->NeedsNullCheck()) @@ -5056,9 +5723,9 @@ void CodeGen::genCallInstruction(GenTreePtr node) methHnd, INDEBUG_LDISASM_COMMA(sigInfo) (void*) target->AsIndir()->Base()->AsIntConCommon()->IconValue(), -#ifdef _TARGET_X86_ +#if defined(_TARGET_X86_) stackArgBytes, -#endif // _TARGET_X86_ +#endif // defined(_TARGET_X86_) retSize, ilOffset); } @@ -5070,9 +5737,9 @@ void CodeGen::genCallInstruction(GenTreePtr node) methHnd, INDEBUG_LDISASM_COMMA(sigInfo) target->AsIndir(), -#ifdef _TARGET_X86_ +#if defined(_TARGET_X86_) stackArgBytes, -#endif // _TARGET_X86_ +#endif // defined(_TARGET_X86_) retSize, ilOffset); } @@ -5086,9 +5753,9 @@ void CodeGen::genCallInstruction(GenTreePtr node) methHnd, INDEBUG_LDISASM_COMMA(sigInfo) nullptr, //addr -#ifdef _TARGET_X86_ +#if defined(_TARGET_X86_) stackArgBytes, -#endif // _TARGET_X86_ +#endif // defined(_TARGET_X86_) retSize, ilOffset, genConsumeReg(target)); @@ -5153,9 +5820,9 @@ void CodeGen::genCallInstruction(GenTreePtr node) methHnd, INDEBUG_LDISASM_COMMA(sigInfo) addr, -#ifdef _TARGET_X86_ +#if defined(_TARGET_X86_) stackArgBytes, -#endif // _TARGET_X86_ +#endif // defined(_TARGET_X86_) retSize, ilOffset); } @@ -5168,10 +5835,10 @@ void CodeGen::genCallInstruction(GenTreePtr node) genPendingCallLabel = nullptr; } -#ifdef _TARGET_X86_ +#if defined(_TARGET_X86_) // The call will pop its arguments.
genStackLevel -= stackArgBytes; -#endif // _TARGET_X86_ +#endif // defined(_TARGET_X86_) // Update GC info: // All Callee arg registers are trashed and no longer contain any GC pointers. @@ -5218,6 +5885,130 @@ } } +//------------------------------------------------------------------------ +// genGetStructTypeSizeOffset: Gets the type, size and offset of the eightbytes of a struct for System V systems. +// +// Arguments: +// 'structDesc' struct description +// 'type0' returns the type of the first eightbyte. +// 'type1' returns the type of the second eightbyte. +// 'size0' returns the size of the first eightbyte. +// 'size1' returns the size of the second eightbyte. +// 'offset0' returns the offset of the first eightbyte. +// 'offset1' returns the offset of the second eightbyte. +// + +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) +void CodeGen::genGetStructTypeSizeOffset(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR& structDesc, + var_types* type0, var_types* type1, emitAttr* size0, emitAttr* size1, + unsigned __int8* offset0, unsigned __int8* offset1) +{ + *size0 = EA_UNKNOWN; + *offset0 = structDesc.eightByteOffsets[0]; + *size1 = EA_UNKNOWN; + *offset1 = structDesc.eightByteOffsets[1]; + + *type0 = TYP_UNKNOWN; + *type1 = TYP_UNKNOWN; + + // Set the first eightbyte data + if (structDesc.eightByteCount >= 1) + { + switch (structDesc.eightByteClassifications[0]) + { + case SystemVClassificationTypeInteger: + if (structDesc.eightByteSizes[0] <= 4) + { + *size0 = EA_4BYTE; + *type0 = TYP_INT; + } + else if (structDesc.eightByteSizes[0] <= 8) + { + *size0 = EA_8BYTE; + *type0 = TYP_LONG; + } + else + { + assert(false && "Bad int type."); + } + break; + case SystemVClassificationTypeIntegerReference: + assert(structDesc.eightByteSizes[0] == REGSIZE_BYTES); + *size0 = EA_GCREF; + *type0 = TYP_REF; + break; + case SystemVClassificationTypeSSE: + if (structDesc.eightByteSizes[0] <= 4) + { + *size0 = EA_4BYTE; + *type0 = TYP_FLOAT; + } + else if (structDesc.eightByteSizes[0] <= 8) + { + *size0 = EA_8BYTE; + *type0 = TYP_DOUBLE; + } + else + { + assert(false && "Bad float type."); // Not possible. + } + break; + default: + assert(false && "Bad EightByte classification."); + break; + } + } + + // Set the second eightbyte data + if (structDesc.eightByteCount == 2) + { + switch (structDesc.eightByteClassifications[1]) + { + case SystemVClassificationTypeInteger: + if (structDesc.eightByteSizes[1] <= 4) + { + *type1 = TYP_INT; + *size1 = EA_4BYTE; + } + else if (structDesc.eightByteSizes[1] <= 8) + { + *type1 = TYP_LONG; + *size1 = EA_8BYTE; + } + else + { + assert(false && "Bad int type."); + } + break; + case SystemVClassificationTypeIntegerReference: + assert(structDesc.eightByteSizes[1] == REGSIZE_BYTES); + *type1 = TYP_REF; + *size1 = EA_GCREF; + break; + case SystemVClassificationTypeSSE: + if (structDesc.eightByteSizes[1] <= 4) + { + *type1 = TYP_FLOAT; + *size1 = EA_4BYTE; + } + else if (structDesc.eightByteSizes[1] <= 8) + { + *type1 = TYP_DOUBLE; + *size1 = EA_8BYTE; + } + else + { + assert(false && "Bad float type."); // Not possible. + } + break; + default: + assert(false && "Bad EightByte classification."); + break; + } + } +} +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // Produce code for a GT_JMP node. // The arguments of the caller need to be transferred to the callee before exiting the caller. // The actual jump to callee is generated as part of caller epilog sequence.
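// Illustrative example (editorial note, not part of the patch): how a concrete
// struct maps onto the eightbyte descriptor consumed by genGetStructTypeSizeOffset
// above. Per the System V ABI classification, a hypothetical
//
//     struct Example { double d; int a; int b; };   // 16 bytes
//
// yields eightByteCount = 2, eightByteClassifications = { SSE, Integer },
// eightByteSizes = { 8, 8 } and eightByteOffsets = { 0, 8 }. The function above
// then reports type0 = TYP_DOUBLE (EA_8BYTE, offset 0) and type1 = TYP_LONG
// (EA_8BYTE, offset 8), so the value travels in one XMM and one integer register.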
@@ -5319,36 +6110,94 @@ void CodeGen::genJmpMethod(GenTreePtr jmp) if (!varDsc->lvIsRegArg) continue; - // Register argument - noway_assert(isRegParamType(genActualType(varDsc->TypeGet()))); +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + if (varDsc->lvType == TYP_STRUCT) + { + CORINFO_CLASS_HANDLE typeHnd = varDsc->lvVerTypeInfo.GetClassHandle(); + assert(typeHnd != nullptr); - // Is register argument already in the right register? - // If not load it from its stack location. - var_types loadType = varDsc->lvaArgType(); - regNumber argReg = varDsc->lvArgReg; // incoming arg register + SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; + compiler->eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc); + assert(structDesc.passedInRegisters); - if (varDsc->lvRegNum != argReg) - { - assert(genIsValidReg(argReg)); + emitAttr size0 = EA_UNKNOWN; + emitAttr size1 = EA_UNKNOWN; + unsigned __int8 offset0 = 0; + unsigned __int8 offset1 = 0; + var_types type0 = TYP_UNKNOWN; + var_types type1 = TYP_UNKNOWN; + + // Get the eightbyte data + genGetStructTypeSizeOffset(structDesc, &type0, &type1, &size0, &size1, &offset0, &offset1); + + // Move the values into the right registers. + // + if (type0 != TYP_UNKNOWN) + { + getEmitter()->emitIns_R_S(ins_Load(type0), size0, varDsc->lvArgReg, varNum, offset0); + + // Update varDsc->lvArgReg life and GC Info to indicate varDsc stack slot is dead and argReg is going live. + // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it. + // Therefore manually update life of argReg. Note that GT_JMP marks the end of the basic block + // and after which reg life and gc info will be recomputed for the new block in genCodeForBBList(). + regSet.rsMaskVars |= genRegMask(varDsc->lvArgReg); + gcInfo.gcMarkRegPtrVal(varDsc->lvArgReg, type0); + } + + if (type1 != TYP_UNKNOWN) + { + getEmitter()->emitIns_R_S(ins_Load(type1), size1, varDsc->lvOtherArgReg, varNum, offset1); - getEmitter()->emitIns_R_S(ins_Load(loadType), emitTypeSize(loadType), argReg, varNum, 0); + // Update varDsc->lvArgReg life and GC Info to indicate varDsc stack slot is dead and argReg is going live. + // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it. + // Therefore manually update life of argReg. Note that GT_JMP marks the end of the basic block + // and after which reg life and gc info will be recomputed for the new block in genCodeForBBList(). + regSet.rsMaskVars |= genRegMask(varDsc->lvOtherArgReg); + gcInfo.gcMarkRegPtrVal(varDsc->lvOtherArgReg, type1); + } - // Update argReg life and GC Info to indicate varDsc stack slot is dead and argReg is going live. - // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it. - // Therefore manually update life of argReg. Note that GT_JMP marks the end of the basic block - // and after which reg life and gc info will be recomputed for the new block in genCodeForBBList(). - regSet.rsMaskVars |= genRegMask(argReg); - gcInfo.gcMarkRegPtrVal(argReg, loadType); if (varDsc->lvTracked) { - VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varNum); + VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varNum); } } + else +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + { + // Register argument + noway_assert(isRegParamType(genActualType(varDsc->TypeGet()))); + // Is register argument already in the right register? + // If not load it from its stack location.
+ var_types loadType = varDsc->lvaArgType(); + regNumber argReg = varDsc->lvArgReg; // incoming arg register + + if (varDsc->lvRegNum != argReg) + { + assert(genIsValidReg(argReg)); + getEmitter()->emitIns_R_S(ins_Load(loadType), emitTypeSize(loadType), argReg, varNum, 0); + + // Update argReg life and GC Info to indicate varDsc stack slot is dead and argReg is going live. + // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it. + // Therefore manually update life of argReg. Note that GT_JMP marks the end of the basic block + // and after which reg life and gc info will be recomputed for the new block in genCodeForBBList(). + regSet.rsMaskVars |= genRegMask(argReg); + gcInfo.gcMarkRegPtrVal(argReg, loadType); + if (varDsc->lvTracked) + { + VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varNum); + } + } + } + +#if FEATURE_VARARG // In case of a jmp call to a vararg method also pass the float/double arg in the corresponding int arg register. if (compiler->info.compIsVarArgs) { regNumber intArgReg; + var_types loadType = varDsc->lvaArgType(); + regNumber argReg = varDsc->lvArgReg; // incoming arg register + if (varTypeIsFloating(loadType)) { intArgReg = compiler->getCallArgIntRegister(argReg); @@ -5368,8 +6217,10 @@ void CodeGen::genJmpMethod(GenTreePtr jmp) firstArgVarNum = varNum; } } +#endif // FEATURE_VARARG } +#if FEATURE_VARARG // Jmp call to a vararg method - if the method has fewer than 4 fixed arguments, // load the remaining arg registers (both int and float) from the corresponding // shadow stack slots. This is for the reason that we don't know the number and type @@ -5409,7 +6260,7 @@ void CodeGen::genJmpMethod(GenTreePtr jmp) getEmitter()->emitEnableGC(); } } - +#endif // FEATURE_VARARG } // produce code for a GT_LEA subnode @@ -6488,13 +7339,122 @@ CodeGen::genMathIntrinsic(GenTreePtr treeNode) genProduceReg(treeNode); } -#ifdef _TARGET_X86_ +#if defined(_TARGET_X86_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) +//--------------------------------------------------------------------- +// genPutArgStk - generate code for putting a struct arg on the stack by value. +// In case there are references to heap objects in the struct, +// it generates the gcinfo as well. +// +// Arguments +// treeNode - the GT_PUTARG_STK node +// +// Return value: +// None +// void CodeGen::genPutArgStk(GenTreePtr treeNode) { +#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING assert(treeNode->OperGet() == GT_PUTARG_STK); +#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING var_types targetType = treeNode->TypeGet(); +#ifdef _TARGET_X86_ noway_assert(targetType != TYP_STRUCT); +#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + noway_assert(targetType == TYP_STRUCT); + + GenTreePutArgStk* putArgStk = treeNode->AsPutArgStk(); + if (putArgStk->gtNumberReferenceSlots == 0) + { + switch (putArgStk->gtPutArgStkKind) + { + case GenTreePutArgStk::PutArgStkKindRepInstr: + genCodeForPutArgRepMovs(putArgStk); + break; + case GenTreePutArgStk::PutArgStkKindUnroll: + genCodeForPutArgUnroll(putArgStk); + break; + default: + unreached(); + } + } + else + { + // No need to disable GC the way COPYOBJ does. Here the refs are always copied with atomic operations. + + // Consume these registers. + // They may now contain gc pointers (depending on their type; gcMarkRegPtrVal will "do the right thing").
+ genConsumePutArgStk(putArgStk, REG_RDI, REG_RSI, REG_NA); + GenTreePtr dstAddr = putArgStk; + GenTreePtr srcAddr = putArgStk->gtOp.gtOp1; + gcInfo.gcMarkRegPtrVal(REG_RSI, srcAddr->TypeGet()); + gcInfo.gcMarkRegPtrVal(REG_RDI, dstAddr->TypeGet()); + + unsigned slots = putArgStk->gtNumSlots; + + // Since we are always writing to the stack, we don't need to use a write barrier. + BYTE* gcPtrs = putArgStk->gtGcPtrs; + unsigned gcPtrCount = putArgStk->gtNumberReferenceSlots; + + unsigned i = 0; + unsigned copiedSlots = 0; + while (i < slots) + { + switch (gcPtrs[i]) + { + case TYPE_GC_NONE: + // Let's see if we can use rep movsq instead of a sequence of movsq instructions + // to save cycles and code size. + { + unsigned nonGcSlotCount = 0; + + do + { + nonGcSlotCount++; + i++; + } while (i < slots && gcPtrs[i] == TYPE_GC_NONE); + + // If we have a very small contiguous non-gc region, it's better just to + // emit a sequence of movsq instructions + if (nonGcSlotCount < CPOBJ_NONGC_SLOTS_LIMIT) + { + copiedSlots += nonGcSlotCount; + while (nonGcSlotCount > 0) + { + instGen(INS_movsq); + nonGcSlotCount--; + } + } + else + { + getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, nonGcSlotCount); + copiedSlots += nonGcSlotCount; + instGen(INS_r_movsq); + } + } + break; + default: + // We have a GC pointer + // TODO-Amd64-Unix: Here a better solution (for code size and CQ) would be to use the movsq instruction, + // but the logic for emitting a GC info record is not available (it is internal for the emitter only.) + // See emitGCVarLiveUpd function. If we could call it separately, we could do instGen(INS_movsq); and emission of gc info. + + getEmitter()->emitIns_R_AR(ins_Load(TYP_REF), EA_GCREF, REG_RCX, REG_RSI, 0); + getEmitter()->emitIns_S_R(ins_Store(TYP_REF), EA_GCREF, REG_RCX, compiler->lvaOutgoingArgSpaceVar, + ((copiedSlots + putArgStk->gtSlotNum) * TARGET_POINTER_SIZE)); + getEmitter()->emitIns_R_I(INS_add, EA_8BYTE, REG_RSI, TARGET_POINTER_SIZE); + getEmitter()->emitIns_R_I(INS_add, EA_8BYTE, REG_RDI, TARGET_POINTER_SIZE); + copiedSlots++; + gcPtrCount--; + i++; + } + } + + gcInfo.gcMarkRegSetNpt(RBM_RSI); + gcInfo.gcMarkRegSetNpt(RBM_RDI); + } + return; +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet())); GenTreePtr data = treeNode->gtOp.gtOp1; @@ -6508,7 +7468,9 @@ CodeGen::genPutArgStk(GenTreePtr treeNode) // Decrement SP. int argSize = genTypeSize(genActualType(targetType)); inst_RV_IV(INS_sub, REG_SPBASE, argSize, emitActualTypeSize(TYP_I_IMPL)); +#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING genStackLevel += argSize; +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING // TODO-Cleanup: Handle this in emitInsMov() in emitXArch.cpp?
if (data->isContained()) @@ -6522,7 +7484,7 @@ CodeGen::genPutArgStk(GenTreePtr treeNode) getEmitter()->emitIns_AR_R(ins_Store(targetType), emitTypeSize(targetType), data->gtRegNum, REG_SPBASE, 0); } } -#endif // _TARGET_X86_ +#endif // defined(_TARGET_X86_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) /***************************************************************************** * diff --git a/src/jit/compiler.cpp b/src/jit/compiler.cpp index 427d778b90..b54657202a 100644 --- a/src/jit/compiler.cpp +++ b/src/jit/compiler.cpp @@ -2992,7 +2992,6 @@ void Compiler::compCompile(void * * methodCodePtr, unsigned compileFlags) { hashBv::Init(this); - VarSetOps::AssignAllowUninitRhs(this, compCurLife, VarSetOps::UninitVal()); /* The temp holding the secret stub argument is used by fgImport() when importing the intrinsic. */ @@ -4042,7 +4041,6 @@ int Compiler::compCompileHelper (CORINFO_MODULE_HANDLE clas unsigned compileFlags, CorInfoInstantiationVerification instVerInfo) { - CORINFO_METHOD_HANDLE methodHnd = info.compMethodHnd; info.compCode = methodInfo->ILCode; @@ -5027,6 +5025,125 @@ START: return result; } +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + +// GetTypeFromClassificationAndSizes: +// Returns the type of the eightbyte accounting for the classification and size of the eightbyte. +// +// args: +// classType: classification type +// size: size of the eightbyte. +// +var_types Compiler::GetTypeFromClassificationAndSizes(SystemVClassificationType classType, int size) +{ + var_types type = TYP_UNKNOWN; + switch (classType) + { + case SystemVClassificationTypeInteger: + if (size == 1) + { + type = TYP_BYTE; + } + else if (size <= 2) + { + type = TYP_SHORT; + } + else if (size <= 4) + { + type = TYP_INT; + } + else if (size <= 8) + { + type = TYP_LONG; + } + else + { + assert(false && "GetTypeFromClassificationAndSizes Invalid Integer classification type."); + } + break; + case SystemVClassificationTypeIntegerReference: + type = TYP_REF; + break; + case SystemVClassificationTypeSSE: + if (size <= 4) + { + type = TYP_FLOAT; + } + else if (size <= 8) + { + type = TYP_DOUBLE; + } + else + { + assert(false && "GetTypeFromClassificationAndSizes Invalid SSE classification type."); + } + break; + + default: + assert(false && "GetTypeFromClassificationAndSizes Invalid classification type."); + break; + } + + return type; +} + +// getEightByteType: +// Returns the JIT type of the eightbyte at the given slot number of a struct classification description. +// +// args: +// structDesc: struct classification description. +// slotNum: eightbyte slot number for the struct. +// +var_types Compiler::getEightByteType(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR& structDesc, unsigned slotNum) +{ + var_types eightByteType = TYP_UNDEF; + unsigned len = structDesc.eightByteSizes[slotNum]; + + switch (structDesc.eightByteClassifications[slotNum]) + { + case SystemVClassificationTypeInteger: + // See typelist.h for jit type definition. + // All types of size <= 4 bytes are of jit type TYP_INT.
+ if (structDesc.eightByteSizes[slotNum] <= 4) + { + eightByteType = TYP_INT; + } + else if (structDesc.eightByteSizes[slotNum] <= 8) + { + eightByteType = TYP_LONG; + } + else + { + assert(false && "getEightByteType Invalid Integer classification type."); + } + break; + case SystemVClassificationTypeIntegerReference: + assert(len == REGSIZE_BYTES); + eightByteType = TYP_REF; + break; + case SystemVClassificationTypeSSE: + if (structDesc.eightByteSizes[slotNum] <= 4) + { + eightByteType = TYP_FLOAT; + } + else if (structDesc.eightByteSizes[slotNum] <= 8) + { + eightByteType = TYP_DOUBLE; + } + else + { + assert(false && "getEightByteType Invalid SSE classification type."); + } + break; + default: + assert(false && "getEightByteType Invalid classification type."); + break; + } + + return eightByteType; +} +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + /*****************************************************************************/ /*****************************************************************************/ diff --git a/src/jit/compiler.h b/src/jit/compiler.h index 520c94a462..bc851dcf1d 100644 --- a/src/jit/compiler.h +++ b/src/jit/compiler.h @@ -269,9 +269,12 @@ public: unsigned char lvOverlappingFields :1; // True when we have a struct with possibly overlapping fields unsigned char lvContainsHoles :1; // True when we have a promoted struct that contains holes unsigned char lvCustomLayout :1; // True when this struct has "CustomLayout" -#ifdef _TARGET_ARM_ +#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) unsigned char lvDontPromote:1; // Should struct promoter consider this variable for promotion? - unsigned char lvIsHfaRegArg:1; // Is this argument variable holding a HFA register argument. +#endif + +#ifdef _TARGET_ARM_ + unsigned char lvIsHfaRegArg :1; // Is this argument variable holding a HFA register argument. unsigned char lvHfaTypeIsFloat:1; // Is the HFA type float or double? #endif @@ -290,7 +293,7 @@ public: unsigned char lvSIMDType :1; // This is a SIMD struct unsigned char lvUsedInSIMDIntrinsic :1; // This tells lclvar is used for simd intrinsic #endif // FEATURE_SIMD - unsigned char lvRegStruct : 1; // This is a reg-sized non-field-addressed struct. + unsigned char lvRegStruct :1; // This is a reg-sized non-field-addressed struct. union { @@ -305,6 +308,26 @@ public: unsigned char lvFldOffset; unsigned char lvFldOrdinal; +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + regNumber lvRegNumForSlot(unsigned slotNum) + { + if (slotNum == 0) + { + return lvArgReg; + } + else if (slotNum == 1) + { + return lvOtherArgReg; + } + else + { + assert(false && "Invalid slotNum!"); + } + + unreached(); + } +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + private: regNumberSmall _lvRegNum; // Used to store the register this variable is in (or, the low register of a register pair). @@ -314,7 +337,13 @@ private: #if !defined(_TARGET_64BIT_) regNumberSmall _lvOtherReg; // Used for "upper half" of long var. #endif // !defined(_TARGET_64BIT_) + regNumberSmall _lvArgReg; // The register in which this argument is passed. + +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + regNumberSmall _lvOtherArgReg; // Used for the second part of the struct passed in a register. 
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + #ifndef LEGACY_BACKEND union { @@ -382,7 +411,7 @@ public: regNumber lvArgReg; regNumber GetArgReg() const -{ + { return (regNumber) _lvArgReg; } @@ -392,6 +421,22 @@ public: assert(_lvArgReg == reg); } +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + __declspec(property(get = GetOtherArgReg, put = SetOtherArgReg)) + regNumber lvOtherArgReg; + + regNumber GetOtherArgReg() const + { + return (regNumber)_lvOtherArgReg; + } + + void SetOtherArgReg(regNumber reg) + { + _lvOtherArgReg = (regNumberSmall)reg; + assert(_lvOtherArgReg == reg); + } +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + #ifdef FEATURE_SIMD // Is this is a SIMD struct? bool lvIsSIMDType() const @@ -1139,6 +1184,15 @@ struct FuncInfoDsc struct fgArgTabEntry { + +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + fgArgTabEntry() + { + otherRegNum = REG_NA; + isStruct = false; // is this a struct arg + } +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + GenTreePtr node; // Initially points at the Op1 field of 'parent', but if the argument is replaced with a GT_ASG or placeholder // it will point at the actual argument in the gtCallLateArgs list. GenTreePtr parent; // Points at the GT_LIST node in the gtCallArgs for this argument @@ -1165,6 +1219,13 @@ struct fgArgTabEntry bool isBackFilled :1; // True when the argument fills a register slot skipped due to alignment requirements of previous arguments. bool isNonStandard:1; // True if it is an arg that is passed in a reg other than a standard arg reg +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + regNumber otherRegNum; // The (second) register to use when passing this argument. + bool isStruct; // is this a struct arg + + SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + void SetIsHfaRegArg(bool hfaRegArg) { isHfaRegArg = hfaRegArg; @@ -1196,10 +1257,10 @@ class fgArgInfo unsigned nextSlotNum; // Updatable slot count value unsigned stkLevel; // Stack depth when we make this call (for x86) - unsigned argTableSize; // size of argTable array (equal to the argCount when done with fgMorphArgs) - bool argsComplete; // marker for state - bool argsSorted; // marker for state - fgArgTabEntryPtr * argTable; // variable sized array of per argument descrption: (i.e. argTable[argTableSize]) + unsigned argTableSize; // size of argTable array (equal to the argCount when done with fgMorphArgs) + bool argsComplete; // marker for state + bool argsSorted; // marker for state + fgArgTabEntryPtr * argTable; // variable sized array of per argument description: (i.e.
argTable[argTableSize]) private: @@ -1217,11 +1278,24 @@ public: unsigned numRegs, unsigned alignment); +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + fgArgTabEntryPtr AddRegArg (unsigned argNum, + GenTreePtr node, + GenTreePtr parent, + regNumber regNum, + unsigned numRegs, + unsigned alignment, + const bool isStruct, + const regNumber otherRegNum = REG_NA, + const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* const structDescPtr = nullptr); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + fgArgTabEntryPtr AddStkArg (unsigned argNum, GenTreePtr node, GenTreePtr parent, unsigned numSlots, - unsigned alignment); + unsigned alignment + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(const bool isStruct)); void RemorphReset (); fgArgTabEntryPtr RemorphRegArg (unsigned argNum, @@ -1391,7 +1465,9 @@ public: DWORD expensiveDebugCheckLevel; #endif - +#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + CORINFO_CLASS_HANDLE GetStructClassHandle(GenTreePtr tree); +#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) #ifdef _TARGET_ARM_ @@ -1403,8 +1479,6 @@ public: // floating-point registers. // - inline CORINFO_CLASS_HANDLE GetHfaClassHandle(GenTreePtr tree); - bool IsHfa(CORINFO_CLASS_HANDLE hClass); bool IsHfa(GenTreePtr tree); @@ -1417,6 +1491,14 @@ public: #endif // _TARGET_ARM_ //------------------------------------------------------------------------- + // The following is used for struct passing on System V systems. + // +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + bool IsRegisterPassable(CORINFO_CLASS_HANDLE hClass); + bool IsRegisterPassable(GenTreePtr tree); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + + //------------------------------------------------------------------------- // The following is used for validating format of EH table // @@ -2450,7 +2532,7 @@ public : unsigned char fldOrdinal; var_types fldType; unsigned fldSize; - CORINFO_CLASS_HANDLE fldTypeHnd; + CORINFO_CLASS_HANDLE fldTypeHnd; }; // Info about struct to be promoted. @@ -3006,9 +3088,12 @@ private: bool impReturnInstruction(BasicBlock *block, int prefixFlags, OPCODE &opcode); void impAbortInline(bool abortThisInlineOnly, bool contextDependent, const char *reason); -#ifdef _TARGET_ARM_ +#if defined(_TARGET_ARM_) void impMarkLclDstNotPromotable(unsigned tmpNum, GenTreePtr op, CORINFO_CLASS_HANDLE hClass); - GenTreePtr impAssignHfaToVar(GenTreePtr op, CORINFO_CLASS_HANDLE hClass); +#endif + +#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + GenTreePtr impAssignStructToVar(GenTreePtr op, CORINFO_CLASS_HANDLE hClass); #endif // A free list of linked list nodes used to represent to-do stacks of basic blocks.
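// Illustrative sketch (editorial note; the call site and variable names are
// invented, only the AddRegArg overload above is from the change): registering
// a struct argument that classifies into two eightbytes.
//
//     SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
//     eeGetSystemVAmd64PassStructInRegisterDescriptor(clsHnd, &structDesc);
//     if (structDesc.passedInRegisters && (structDesc.eightByteCount == 2))
//     {
//         // The first eightbyte goes in regNum, the second in otherRegNum; the
//         // descriptor is stored on the fgArgTabEntry for Lower/codegen to use.
//         call->fgArgInfo->AddRegArg(argNum, node, parent, regNum, 2, 1,
//                                    /* isStruct */ true, otherRegNum, &structDesc);
//     }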
@@ -3026,9 +3111,11 @@ private: bool impIsValueType (typeInfo* pTypeInfo); var_types mangleVarArgsType (var_types type); + +#if FEATURE_VARARG regNumber getCallArgIntRegister (regNumber floatReg); regNumber getCallArgFloatRegister (regNumber intReg); - +#endif // FEATURE_VARARG //--------------------------- Inlining------------------------------------- #if defined(DEBUG) || MEASURE_INLINING @@ -4080,10 +4167,9 @@ public: bool fgCastNeeded(GenTreePtr tree, var_types toType); GenTreePtr fgDoNormalizeOnStore(GenTreePtr tree); - GenTreePtr fgMakeTmpArgNode(unsigned tmpVarNum); - - /* The following check for loops that don't execute calls */ + GenTreePtr fgMakeTmpArgNode(unsigned tmpVarNum FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(const bool passedInRegisters)); + // The following check for loops that don't execute calls bool fgLoopCallMarked; void fgLoopCallTest (BasicBlock *srcBB, @@ -4450,7 +4536,14 @@ private: GenTreePtr fgMorphCast (GenTreePtr tree); GenTreePtr fgUnwrapProxy (GenTreePtr objRef); GenTreeCall* fgMorphArgs (GenTreeCall* call); - void fgMakeOutgoingStructArgCopy(GenTreeCall* call, GenTree* args, unsigned argIndex, CORINFO_CLASS_HANDLE copyBlkClass); + + void fgMakeOutgoingStructArgCopy( + GenTreeCall* call, + GenTree* args, + unsigned argIndex, + CORINFO_CLASS_HANDLE copyBlkClass + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* structDescPtr)); + void fgFixupStructReturn (GenTreePtr call); GenTreePtr fgMorphLocalVar (GenTreePtr tree); bool fgAddrCouldBeNull (GenTreePtr addr); @@ -4570,11 +4663,11 @@ private: void fgInsertInlineeBlocks (InlineInfo * pInlineInfo); GenTreePtr fgInlinePrependStatements(InlineInfo * inlineInfo); -#ifdef _TARGET_ARM_ +#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) GenTreePtr fgGetStructAsStructPtr(GenTreePtr tree); - GenTreePtr fgAssignHfaInlineeToVar(GenTreePtr child, CORINFO_CLASS_HANDLE retClsHnd); - void fgAttachHfaInlineeToAsg(GenTreePtr tree, GenTreePtr child, CORINFO_CLASS_HANDLE retClsHnd); -#endif + GenTreePtr fgAssignStructInlineeToVar(GenTreePtr child, CORINFO_CLASS_HANDLE retClsHnd); + void fgAttachStructInlineeToAsg(GenTreePtr tree, GenTreePtr child, CORINFO_CLASS_HANDLE retClsHnd); +#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) static fgWalkPreFn fgUpdateInlineReturnExpressionPlaceHolder; #ifdef DEBUG @@ -6275,6 +6368,17 @@ public : void eeSetEHinfo(unsigned EHnumber, const CORINFO_EH_CLAUSE* clause); + // ICorStaticInfo wrapper functions + +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) +#ifdef DEBUG + static void dumpSystemVClassificationType(SystemVClassificationType ct); +#endif // DEBUG + + void eeGetSystemVAmd64PassStructInRegisterDescriptor(/*IN*/ CORINFO_CLASS_HANDLE structHnd, + /*OUT*/ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* structPassInRegDescPtr); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + // Utility functions #if defined(DEBUG) @@ -8433,6 +8537,11 @@ public: static HelperCallProperties s_helperCallProperties; +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + var_types GetTypeFromClassificationAndSizes(SystemVClassificationType classType, int size); + var_types getEightByteType(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR& structDesc, unsigned slotNum); + void fgMorphSystemVStructArgs(GenTreeCall* call, bool hasStructArgument); +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) }; // end of class Compiler // Inline methods of CompAllocator. 
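// Illustrative examples (editorial note) of the classification-to-JIT-type
// mapping that the GetTypeFromClassificationAndSizes and getEightByteType
// helpers declared above implement:
//
//     GetTypeFromClassificationAndSizes(SystemVClassificationTypeInteger,          1) -> TYP_BYTE
//     GetTypeFromClassificationAndSizes(SystemVClassificationTypeInteger,          4) -> TYP_INT
//     GetTypeFromClassificationAndSizes(SystemVClassificationTypeSSE,              8) -> TYP_DOUBLE
//     GetTypeFromClassificationAndSizes(SystemVClassificationTypeIntegerReference, 8) -> TYP_REF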
@@ -8466,7 +8575,6 @@ LclVarDsc::LclVarDsc(Compiler* comp) { } - /* XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX diff --git a/src/jit/compiler.hpp b/src/jit/compiler.hpp index 1cdc939d16..e4168b0f18 100644 --- a/src/jit/compiler.hpp +++ b/src/jit/compiler.hpp @@ -651,7 +651,10 @@ bool Compiler::VarTypeIsMultiByteAndCanEnreg(var_types type, if (type == TYP_STRUCT) { size = info.compCompHnd->getClassSize(typeClass); - +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + // Account for the classification of the struct. + result = IsRegisterPassable(typeClass); +#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING switch(size) { case 1: @@ -664,6 +667,7 @@ bool Compiler::VarTypeIsMultiByteAndCanEnreg(var_types type, default: break; } +#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING } else { @@ -2268,8 +2272,10 @@ int Compiler::lvaFrameAddress(int varNum, bool * pFPbased) if (lvaDoneFrameLayout > REGALLOC_FRAME_LAYOUT && !varDsc->lvOnFrame) { #ifdef _TARGET_AMD64_ - // On amd64, every param has a stack location. + // On amd64, every param has a stack location, except on Unix-like systems. +#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING assert(varDsc->lvIsParam); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING #elif defined(_TARGET_X86_) && !defined(LEGACY_BACKEND) // For !LEGACY_BACKEND on x86, a stack parameter that is enregistered will have a stack location. assert(varDsc->lvIsParam && !varDsc->lvIsRegArg); @@ -2589,6 +2595,8 @@ var_types Compiler::mangleVarArgsType(var_types type) return type; } +// For CORECLR there is no vararg on System V systems. +#if FEATURE_VARARG inline regNumber Compiler::getCallArgIntRegister(regNumber floatReg) { #ifdef _TARGET_AMD64_ @@ -2630,10 +2638,11 @@ inline regNumber Compiler::getCallArgFloatRegister(regNumber intReg) } #else // !_TARGET_AMD64_ // How will float args be passed for RyuJIT/x86? - NYI("getCallArgIntRegister for RyuJIT/x86"); + NYI("getCallArgFloatRegister for RyuJIT/x86"); return REG_NA; #endif // !_TARGET_AMD64_ } +#endif // FEATURE_VARARG /* XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX diff --git a/src/jit/ee_il_dll.cpp b/src/jit/ee_il_dll.cpp index 90e50ed84a..4c8e2ff30e 100644 --- a/src/jit/ee_il_dll.cpp +++ b/src/jit/ee_il_dll.cpp @@ -281,6 +281,16 @@ unsigned Compiler::eeGetArgSize(CORINFO_ARG_LIST_HANDLE list, CORINFO_ // Everything fits into a single 'slot' size // to accommodate irregular sized structs, they are passed byref // TODO-ARM64-Bug?: structs <= 16 bytes get passed in 2 consecutive registers. 
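+// Editorial note (illustrative, not from the original change): with the System V
+// path below, a 24-byte struct argument reports its actual class size (three
+// stack slots), whereas on Windows AMD64 such an irregular struct is passed
+// byref and eeGetArgSize still returns sizeof(size_t).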
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + CORINFO_CLASS_HANDLE argClass; + CorInfoType argTypeJit = strip(info.compCompHnd->getArgType(sig, list, &argClass)); + var_types argType = JITtype2varType(argTypeJit); + if (argType == TYP_STRUCT) + { + unsigned structSize = info.compCompHnd->getClassSize(argClass); + return structSize; + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING return sizeof(size_t); #else // !_TARGET_AMD64_ && !_TARGET_ARM64_ @@ -920,6 +930,60 @@ int Compiler::eeGetJitDataOffs(CORINFO_FIELD_HANDLE field) } } + +/***************************************************************************** + * + * ICorStaticInfo wrapper functions + */ + +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + +#ifdef DEBUG +void Compiler::dumpSystemVClassificationType(SystemVClassificationType ct) +{ + switch (ct) + { + case SystemVClassificationTypeUnknown: printf("UNKNOWN"); break; + case SystemVClassificationTypeStruct: printf("Struct"); break; + case SystemVClassificationTypeNoClass: printf("NoClass"); break; + case SystemVClassificationTypeMemory: printf("Memory"); break; + case SystemVClassificationTypeInteger: printf("Integer"); break; + case SystemVClassificationTypeIntegerReference: printf("IntegerReference"); break; + case SystemVClassificationTypeSSE: printf("SSE"); break; + default: printf("ILLEGAL"); break; + } +} +#endif // DEBUG + +void Compiler::eeGetSystemVAmd64PassStructInRegisterDescriptor(/*IN*/ CORINFO_CLASS_HANDLE structHnd, + /*OUT*/ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* structPassInRegDescPtr) +{ + bool ok = info.compCompHnd->getSystemVAmd64PassStructInRegisterDescriptor(structHnd, structPassInRegDescPtr); + noway_assert(ok); + +#ifdef DEBUG + if (verbose) + { + printf("**** getSystemVAmd64PassStructInRegisterDescriptor(0x%x (%s), ...) 
=>\n", dspPtr(structHnd), eeGetClassName(structHnd)); + printf(" passedInRegisters = %s\n", dspBool(structPassInRegDescPtr->passedInRegisters)); + if (structPassInRegDescPtr->passedInRegisters) + { + printf(" eightByteCount = %d\n", structPassInRegDescPtr->eightByteCount); + for (unsigned int i = 0; i < structPassInRegDescPtr->eightByteCount; i++) + { + printf(" eightByte #%d -- classification: ", i); + dumpSystemVClassificationType(structPassInRegDescPtr->eightByteClassifications[i]); + printf(", byteSize: %d, byteOffset: %d\n", + structPassInRegDescPtr->eightByteSizes[i], + structPassInRegDescPtr->eightByteOffsets[i]); + } + } + } +#endif // DEBUG +} + +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + /***************************************************************************** * * Utility functions diff --git a/src/jit/emit.cpp b/src/jit/emit.cpp index 20f8af3fa2..fa9d3597de 100644 --- a/src/jit/emit.cpp +++ b/src/jit/emit.cpp @@ -5653,8 +5653,9 @@ void emitter::emitRecordGCcall(BYTE * codePos, call->cdGCrefRegs = (regMaskSmall)emitThisGCrefRegs; call->cdByrefRegs = (regMaskSmall)emitThisByrefRegs; #if EMIT_TRACK_STACK_DEPTH +#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING noway_assert(FitsIn<USHORT>(emitCurStackLvl / ((unsigned)sizeof(unsigned)))); - call->cdArgBaseOffset = (USHORT)(emitCurStackLvl / ((unsigned)sizeof(unsigned))); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING #endif // Append the call descriptor to the list */ diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp index 6f1c6c8fce..d6de1f2dba 100644 --- a/src/jit/emitxarch.cpp +++ b/src/jit/emitxarch.cpp @@ -3671,7 +3671,8 @@ void emitter::emitIns_C(instruction ins, } else if (ins == INS_pop) { - emitCurStackLvl -= emitCntStackDepth; assert((int)emitCurStackLvl >= 0); + emitCurStackLvl -= emitCntStackDepth; + assert((int)emitCurStackLvl >= 0); } #endif // !FEATURE_FIXED_OUT_ARGS @@ -11010,7 +11011,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** && id->idReg1() == REG_ESP) { assert((size_t)emitGetInsSC(id) < 0x00000000FFFFFFFFLL); - emitStackPop (dst, /*isCall*/false, /*callInstrSize*/0, (unsigned)(emitGetInsSC(id) / sizeof(void*))); + emitStackPop(dst, /*isCall*/false, /*callInstrSize*/0, (unsigned)(emitGetInsSC(id) / sizeof(void*))); } break; diff --git a/src/jit/flowgraph.cpp b/src/jit/flowgraph.cpp index 84233d82c6..c26f221c3f 100644 --- a/src/jit/flowgraph.cpp +++ b/src/jit/flowgraph.cpp @@ -8148,17 +8148,67 @@ void Compiler::fgAddInternal() // If there is a return value, then create a temp for it. Real returns will store the value in there and // it'll be reloaded by the single return. - + // TODO-ARM-Bug: Deal with multi-register genReturnLocaled structs? + // TODO-ARM64: Does this apply for ARM64 too? +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // Create a local temp to store the return if the return type is not void and the + // native return type is not a struct or the native return type is a struct that is returned + // in registers (no RetBuffArg argument.) + // If we fold all returns into a single return statement, create a temp for struct type variables as well. 
+ if (genReturnBB && ((info.compRetType != TYP_VOID && info.compRetNativeType != TYP_STRUCT) || + (info.compRetNativeType == TYP_STRUCT && info.compRetBuffArg == BAD_VAR_NUM))) +#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) if (genReturnBB && (info.compRetType != TYP_VOID && info.compRetNativeType != TYP_STRUCT)) +#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) { genReturnLocal = lvaGrabTemp(true DEBUGARG("Single return block return value")); - lvaTable[genReturnLocal].lvType = genActualType(info.compRetNativeType); +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + var_types retLocalType = TYP_STRUCT; + if (info.compRetNativeType == TYP_STRUCT) + { + // If the native ret type is a struct, make sure the right + // normalized type is assigned to the local variable. + SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; + assert(info.compMethodInfo->args.retTypeClass != nullptr); + eeGetSystemVAmd64PassStructInRegisterDescriptor(info.compMethodInfo->args.retTypeClass, &structDesc); + if (structDesc.passedInRegisters && structDesc.eightByteCount <= 1) + { + retLocalType = lvaTable[genReturnLocal].lvType = getEightByteType(structDesc, 0); + } + else + { + lvaTable[genReturnLocal].lvType = TYP_STRUCT; + } + } + else +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + { + lvaTable[genReturnLocal].lvType = genActualType(info.compRetNativeType); + } if (varTypeIsFloating(lvaTable[genReturnLocal].lvType)) { this->compFloatingPointUsed = true; } - + +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // Handle a struct return type for System V Amd64 systems. + if (info.compRetNativeType == TYP_STRUCT) + { + // Handle the normalized return type. + if (retLocalType == TYP_STRUCT) + { + lvaSetStruct(genReturnLocal, info.compMethodInfo->args.retTypeClass, true); + } + else + { + lvaTable[genReturnLocal].lvVerTypeInfo = typeInfo(TI_STRUCT, info.compMethodInfo->args.retTypeClass); + } + + lvaTable[genReturnLocal].lvDontPromote = true; + } +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + if (!varTypeIsFloating(info.compRetType)) lvaTable[genReturnLocal].setPrefReg(REG_INTRET, this); #ifdef REG_FLOATRET @@ -8172,7 +8222,6 @@ void Compiler::fgAddInternal() lvaTable[genReturnLocal].lvKeepType = 1; #endif } - else { genReturnLocal = BAD_VAR_NUM; @@ -8442,7 +8491,11 @@ void Compiler::fgAddInternal() //make sure to reload the return value as part of the return (it is saved by the "real return"). if (genReturnLocal != BAD_VAR_NUM) { +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + noway_assert(info.compRetType != TYP_VOID); +#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) noway_assert(info.compRetType != TYP_VOID && info.compRetNativeType != TYP_STRUCT); +#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) GenTreePtr retTemp = gtNewLclvNode(genReturnLocal, lvaTable[genReturnLocal].TypeGet()); //make sure copy prop ignores this node (make sure it always does a reload from the temp). @@ -21424,7 +21477,7 @@ void Compiler::fgInline() #endif // DEBUG } -#ifdef _TARGET_ARM_ +#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) /********************************************************************************* * @@ -21463,16 +21516,16 @@ GenTreePtr Compiler::fgGetStructAsStructPtr(GenTreePtr tree) /*************************************************************************************************** * child - The inlinee of the retExpr node. - * retClsHnd - The HFA class handle of the type of the inlinee. 
+ * retClsHnd - The struct class handle of the type of the inlinee. * * Assign the inlinee to a tmp, if it is a call, just assign it to a lclVar, else we can * use a copyblock to do the assignment. */ -GenTreePtr Compiler::fgAssignHfaInlineeToVar(GenTreePtr child, CORINFO_CLASS_HANDLE retClsHnd) +GenTreePtr Compiler::fgAssignStructInlineeToVar(GenTreePtr child, CORINFO_CLASS_HANDLE retClsHnd) { assert(child->gtOper != GT_RET_EXPR && child->gtOper != GT_MKREFANY); - unsigned tmpNum = lvaGrabTemp(false DEBUGARG("RetBuf for HFA inline return candidates.")); + unsigned tmpNum = lvaGrabTemp(false DEBUGARG("RetBuf for struct inline return candidates.")); lvaSetStruct(tmpNum, retClsHnd, false); GenTreePtr dst = gtNewLclvNode(tmpNum, TYP_STRUCT); @@ -21518,7 +21571,7 @@ GenTreePtr Compiler::fgAssignHfaInlineeToVar(GenTreePtr child, CORINFO_CLASS_HAN /*************************************************************************************************** * tree - The tree pointer that has one of its child nodes as retExpr. * child - The inlinee child. - * retClsHnd - The HFA class handle of the type of the inlinee. + * retClsHnd - The struct class handle of the type of the inlinee. * * V04 = call() assignments are okay as we codegen it. Everything else needs to be a copy block or * would need a temp. For example, a cast(ldobj) will then be, cast(v05 = ldobj, v05); But it is @@ -21526,7 +21579,7 @@ GenTreePtr Compiler::fgAssignHfaInlineeToVar(GenTreePtr child, CORINFO_CLASS_HAN * a lclVar/call. So it is not worthwhile to do pattern matching optimizations like addr(ldobj(op1)) * can just be op1. */ -void Compiler::fgAttachHfaInlineeToAsg(GenTreePtr tree, GenTreePtr child, CORINFO_CLASS_HANDLE retClsHnd) +void Compiler::fgAttachStructInlineeToAsg(GenTreePtr tree, GenTreePtr child, CORINFO_CLASS_HANDLE retClsHnd) { // We are okay to have: // 1. V02 = call(); @@ -21541,13 +21594,13 @@ void Compiler::fgAttachHfaInlineeToAsg(GenTreePtr tree, GenTreePtr child, CORINF GenTreePtr dstAddr = fgGetStructAsStructPtr(tree->gtOp.gtOp1); GenTreePtr srcAddr = fgGetStructAsStructPtr((child->gtOper == GT_CALL) - ? fgAssignHfaInlineeToVar(child, retClsHnd) // Assign to a variable if it is a call. + ? fgAssignStructInlineeToVar(child, retClsHnd) // Assign to a variable if it is a call. : child); // Just get the address, if not a call. tree->CopyFrom(gtNewCpObjNode(dstAddr, srcAddr, retClsHnd, false), this); } -#endif // _TARGET_ARM_ +#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) /***************************************************************************** * Callback to replace the inline return expression place holder (GT_RET_EXPR) @@ -21562,12 +21615,12 @@ Compiler::fgWalkResult Compiler::fgUpdateInlineReturnExpressionPlaceHolder( if (tree->gtOper == GT_RET_EXPR) { -#ifdef _TARGET_ARM_ +#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) // We are going to copy the tree from the inlinee, so save the handle now. CORINFO_CLASS_HANDLE retClsHnd = (tree->TypeGet() == TYP_STRUCT) ? 
tree->gtRetExpr.gtRetClsHnd : NO_CLASS_HANDLE; -#endif // _TARGET_ARM_ +#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) do { @@ -21605,32 +21658,36 @@ Compiler::fgWalkResult Compiler::fgUpdateInlineReturnExpressionPlaceHolder( } while (tree->gtOper == GT_RET_EXPR); -#ifdef _TARGET_ARM_ +#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) +#if defined(_TARGET_ARM_) if (retClsHnd != NO_CLASS_HANDLE && comp->IsHfa(retClsHnd)) +#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + if (retClsHnd != NO_CLASS_HANDLE && comp->IsRegisterPassable(retClsHnd)) +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) { GenTreePtr parent = data->parent; // See assert below, we only look one level above for an asg parent. if (parent->gtOper == GT_ASG) { // Either lhs is a call V05 = call(); or lhs is addr, and asg becomes a copyBlk. - comp->fgAttachHfaInlineeToAsg(parent, tree, retClsHnd); + comp->fgAttachStructInlineeToAsg(parent, tree, retClsHnd); } else { // Just assign the inlinee to a variable to keep it simple. - tree->CopyFrom(comp->fgAssignHfaInlineeToVar(tree, retClsHnd), comp); + tree->CopyFrom(comp->fgAssignStructInlineeToVar(tree, retClsHnd), comp); } } -#endif // _TARGET_ARM_ +#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) } -#if defined(DEBUG) && defined(_TARGET_ARM_) +#if defined(DEBUG) && (defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)) // Make sure we don't have a tree like so: V05 = (, , , retExpr); // Since we only look one level above for the parent for '=' and // do not check if there is a series of COMMAs. See above. // Importer and FlowGraph will not generate such a tree, so just // leaving an assert in here. This can be fixed by looking ahead - // when we visit GT_ASG similar to fgAttachHfaInlineeToAsg. + // when we visit GT_ASG similar to fgAttachStructInlineeToAsg. else if (tree->gtOper == GT_ASG && tree->gtOp.gtOp2->gtOper == GT_COMMA) { @@ -21642,11 +21699,17 @@ Compiler::fgWalkResult Compiler::fgUpdateInlineReturnExpressionPlaceHolder( // empty } +#if defined(_TARGET_ARM_) + noway_assert(comma->gtType != TYP_STRUCT || + comma->gtOper != GT_RET_EXPR || + (!comp->IsHfa(comma->gtRetExpr.gtRetClsHnd))); +#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) noway_assert(comma->gtType != TYP_STRUCT || comma->gtOper != GT_RET_EXPR || - !comp->IsHfa(comma->gtRetExpr.gtRetClsHnd)); + (!comp->IsRegisterPassable(comma->gtRetExpr.gtRetClsHnd))); +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) } -#endif // defined(DEBUG) && defined(_TARGET_ARM_) +#endif // defined(DEBUG) && (defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)) return WALK_CONTINUE; } diff --git a/src/jit/gentree.cpp b/src/jit/gentree.cpp index 284000e55b..3c06925fe4 100644 --- a/src/jit/gentree.cpp +++ b/src/jit/gentree.cpp @@ -224,7 +224,15 @@ void GenTree::InitNodeSize() GenTree::s_gtNodeSizes[op] = TREE_NODE_SZ_SMALL; } - /* Now set all of the appropriate entries to 'large' */ + // Now set all of the appropriate entries to 'large' + + // On ARM, and on System V systems for struct returns, there + // is code that copies a CopyObj node over a GT_ASG node (tree.CopyFrom). + // CopyObj is a large node and GT_ASG is small, and copying a large node over a small one triggers an exception.
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + GenTree::s_gtNodeSizes[GT_ASG ] = TREE_NODE_SZ_LARGE; + GenTree::s_gtNodeSizes[GT_RETURN ] = TREE_NODE_SZ_LARGE; +#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) GenTree::s_gtNodeSizes[GT_CALL ] = TREE_NODE_SZ_LARGE; GenTree::s_gtNodeSizes[GT_CAST ] = TREE_NODE_SZ_LARGE; @@ -256,6 +264,15 @@ void GenTree::InitNodeSize() GenTree::s_gtNodeSizes[GT_MOD ] = TREE_NODE_SZ_LARGE; GenTree::s_gtNodeSizes[GT_UMOD ] = TREE_NODE_SZ_LARGE; #endif +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + GenTree::s_gtNodeSizes[GT_PUTARG_STK ] = TREE_NODE_SZ_LARGE; +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING +#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // In the importer, for HFA and register-returned structs, we rewrite GT_ASG to GT_COPYOBJ/GT_COPYBLK. + // Make sure the sizes agree. + assert(GenTree::s_gtNodeSizes[GT_COPYOBJ] <= GenTree::s_gtNodeSizes[GT_ASG]); + assert(GenTree::s_gtNodeSizes[GT_COPYBLK] <= GenTree::s_gtNodeSizes[GT_ASG]); +#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) assert(GenTree::s_gtNodeSizes[GT_RETURN] == GenTree::s_gtNodeSizes[GT_ASG]); @@ -312,7 +329,12 @@ void GenTree::InitNodeSize() static_assert_no_msg(sizeof(GenTreeArgPlace) <= TREE_NODE_SZ_SMALL); static_assert_no_msg(sizeof(GenTreeLabel) <= TREE_NODE_SZ_SMALL); static_assert_no_msg(sizeof(GenTreePhiArg) <= TREE_NODE_SZ_SMALL); +#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING static_assert_no_msg(sizeof(GenTreePutArgStk) <= TREE_NODE_SZ_SMALL); +#else // FEATURE_UNIX_AMD64_STRUCT_PASSING + static_assert_no_msg(sizeof(GenTreePutArgStk) <= TREE_NODE_SZ_LARGE); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + #ifdef FEATURE_SIMD static_assert_no_msg(sizeof(GenTreeSIMD) <= TREE_NODE_SZ_SMALL); #endif // FEATURE_SIMD @@ -4366,13 +4388,21 @@ void GenTree::InsertAfterSelf(GenTree* node, GenTreeStmt* stmt /* = n // 'parent' must be non-null // // Notes: -// Must not be called for GT_LDOBJ (which isn't used for RyuJIT, which is the only context -// in which this method is used) +// For non-System V systems with native struct passing (i.e. FEATURE_UNIX_AMD64_STRUCT_PASSING not defined) +// this method must not be called for GT_LDOBJ (which isn't used for RyuJIT, which is the only context +// in which this method is used). +// If FEATURE_UNIX_AMD64_STRUCT_PASSING is defined we can get here with a GT_LDOBJ tree. This happens when +// a struct is passed in two registers. The GT_LDOBJ is converted to a GT_LIST with two GT_LCL_FLDs later +// in Lower/LowerXArch. +// GenTreePtr* GenTree::gtGetChildPointer(GenTreePtr parent) { +#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING noway_assert(parent->OperGet() != GT_LDOBJ); +#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING + switch (parent->OperGet()) { default: @@ -4380,6 +4410,14 @@ GenTreePtr* GenTree::gtGetChildPointer(GenTreePtr parent) if (this == parent->gtOp.gtOp1) return &(parent->gtOp.gtOp1); if (this == parent->gtOp.gtOp2) return &(parent->gtOp.gtOp2); break; + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + case GT_LDOBJ: + // Any GT_LDOBJ with a field list must be lowered before this point.
+ noway_assert(parent->AsLdObj()->gtFldTreeList == nullptr); + break; +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + case GT_CMPXCHG: if (this == parent->gtCmpXchg.gtOpLocation) return &(parent->gtCmpXchg.gtOpLocation); if (this == parent->gtCmpXchg.gtOpValue) return &(parent->gtCmpXchg.gtOpValue); @@ -5027,7 +5065,7 @@ GenTreePtr Compiler::gtNewInlineCandidateReturnExpr(GenTreePtr inline GenTreePtr node = new(this, GT_RET_EXPR) GenTreeRetExpr(type); node->gtRetExpr.gtInlineCandidate = inlineCandidate; -#ifdef _TARGET_ARM_ +#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) if (inlineCandidate->gtType == TYP_STRUCT) { if (inlineCandidate->gtOper == GT_CALL) @@ -5067,7 +5105,13 @@ GenTreeArgList* Compiler::gtNewListNode(GenTreePtr op1, GenTreeArgList* op2) GenTreeArgList* Compiler::gtNewArgList(GenTreePtr op) { - assert((op != NULL) && (op->OperGet() != GT_LIST) && (op->OperGet() != GT_LIST)); +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + // With structs passed in multiple args we could have the arg + // GT_LIST containing a list of LCL_FLDs + assert((op != NULL) && ((!op->IsList()) || (op->IsListOfLclFlds()))); +#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING + assert((op != NULL) && (op->OperGet() != GT_LIST)); +#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING return new (this, GT_LIST) GenTreeArgList(op); } @@ -5079,8 +5123,15 @@ GenTreeArgList* Compiler::gtNewArgList(GenTreePtr op) GenTreeArgList* Compiler::gtNewArgList(GenTreePtr op1, GenTreePtr op2) { - assert((op1 != NULL) && (op1->OperGet() != GT_LIST) && (op1->OperGet() != GT_LIST)); - assert((op2 != NULL) && (op2->OperGet() != GT_LIST) && (op2->OperGet() != GT_LIST)); +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + // With structs passed in multiple args we could have the arg + // GT_LIST containing a list of LCL_FLDs + assert((op1 != NULL) && ((!op1->IsList()) || (op1->IsListOfLclFlds()))); + assert((op2 != NULL) && ((!op2->IsList()) || (op2->IsListOfLclFlds()))); +#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING + assert((op1 != NULL) && (!op1->IsList())); + assert((op2 != NULL) && (!op2->IsList())); +#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING GenTreePtr tree; @@ -5207,9 +5258,11 @@ GenTreePtr Compiler::gtNewAssignNode(GenTreePtr dst, GenTreePtr src DEB // using struct assignment. #ifdef _TARGET_ARM_ assert(isPhiDefn || type != TYP_STRUCT || IsHfa(dst) || IsHfa(src)); -#else +#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) // You need to use GT_COPYBLK for assigning structs // See impAssignStruct() + assert(isPhiDefn || type != TYP_STRUCT || IsRegisterPassable(dst) || IsRegisterPassable(src)); +#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING assert(isPhiDefn || type != TYP_STRUCT); #endif @@ -5553,7 +5606,6 @@ GenTreePtr Compiler::gtClone(GenTree * tree, bool complexOK) tree->gtField.gtFldHnd, objp, tree->gtField.gtFldOffset); - } else if (tree->gtOper == GT_ADD) { @@ -8629,6 +8681,51 @@ GenTreePtr Compiler::gtDispLinearTree(GenTreeStmt* curStmt, // get child msg if (tree->IsCall()) { + // If this is a call and the arg (listElem) is a GT_LIST (Unix LCL_FLD for passing a var in multiple registers) + // print the nodes of the nested list and continue to the next argument. 
+ if (listElem->gtOper == GT_LIST) + { + GenTreePtr nextListNested = nullptr; + for (GenTreePtr listNested = listElem; listNested != nullptr; listNested = nextListNested) + { + GenTreePtr listElemNested; + if (listNested->gtOper == GT_LIST) + { + nextListNested = listNested->MoveNext(); + listElemNested = listNested->Current(); + } + else + { + // GT_LIST nodes (under initBlk, others?) can have a non-null op2 that's not a GT_LIST + nextListNested = nullptr; + listElemNested = listNested; + } + + indentStack->Push(indentInfo); + if (child == tree->gtCall.gtCallArgs) + { + gtGetArgMsg(tree, listNested, listElemNum, bufp, BufLength); + } + else + { + assert(child == tree->gtCall.gtCallLateArgs); + gtGetLateArgMsg(tree, listNested, listElemNum, bufp, BufLength); + } + nextLinearNode = gtDispLinearTree(curStmt, nextLinearNode, listElemNested, indentStack, bufp); + indentStack->Pop(); + } + + // Skip the GT_LIST nodes, as we do not print them, and the next node to print will occur + // after the list. + while (nextLinearNode->OperGet() == GT_LIST) + { + nextLinearNode = nextLinearNode->gtNext; + } + + listElemNum++; + continue; + } + if (child == tree->gtCall.gtCallArgs) { gtGetArgMsg(tree, listElem, listElemNum, bufp, BufLength); @@ -8643,6 +8740,7 @@ GenTreePtr Compiler::gtDispLinearTree(GenTreeStmt* curStmt, { sprintf_s(bufp, sizeof(buf), "List Item %d", listElemNum); } + indentStack->Push(indentInfo); nextLinearNode = gtDispLinearTree(curStmt, nextLinearNode, listElem, indentStack, bufp); indentStack->Pop(); @@ -10179,6 +10277,7 @@ LNG_ADD_CHKOVF: } } } + lval1 = ltemp; break; case GT_OR : lval1 |= lval2; break; diff --git a/src/jit/gentree.h b/src/jit/gentree.h index f6c850ea5a..1402445da0 100644 --- a/src/jit/gentree.h +++ b/src/jit/gentree.h @@ -1027,6 +1027,11 @@ public: return OperIsCopyBlkOp(OperGet()); } + bool OperIsPutArgStk() const + { + return gtOper == GT_PUTARG_STK; + } + bool OperIsAddrMode() const { return OperIsAddrMode(OperGet()); @@ -1125,7 +1130,7 @@ public: static int OperIsSimple(genTreeOps gtOper) { - return (OperKind(gtOper) & GTK_SMPOP ) != 0; + return (OperKind(gtOper) & GTK_SMPOP ) != 0; } static @@ -1294,7 +1299,7 @@ public: static inline bool RequiresNonNullOp2(genTreeOps oper); - + bool IsListOfLclFlds(); #endif // DEBUG inline bool IsZero(); @@ -2277,7 +2282,7 @@ struct GenTreeColon: public GenTreeOp /* gtCall -- method call (GT_CALL) */ typedef class fgArgInfo * fgArgInfoPtr; -struct GenTreeCall: public GenTree +struct GenTreeCall final : public GenTree { GenTreePtr gtCallObjp; // The instance argument ('this' pointer) GenTreeArgList* gtCallArgs; // The list of arguments in original evaluation order @@ -2296,6 +2301,14 @@ struct GenTreeCall: public GenTree CORINFO_SIG_INFO* callSig; // Used by tail calls and to register callsites with the EE regMaskTP gtCallRegUsedMask; // mask of registers used to pass parameters +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; + + void SetRegisterReturningStructState(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR& structDescIn) + { + structDesc.CopyFrom(structDescIn); + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING #define GTF_CALL_M_EXPLICIT_TAILCALL 0x0001 // GT_CALL -- the call is "tail" prefixed and importer has performed tail call checks #define GTF_CALL_M_TAILCALL 0x0002 // GT_CALL -- the call is a tailcall @@ -2438,9 +2451,12 @@ struct GenTreeCall: public GenTree GenTreeCall(var_types type) : GenTree(GT_CALL, type) - {} + { + } #if 
DEBUGGABLE_GENTREE - GenTreeCall() : GenTree() {} + GenTreeCall() : GenTree() + { + } #endif }; @@ -3024,7 +3040,7 @@ struct GenTreeRetExpr: public GenTree { GenTreePtr gtInlineCandidate; -#ifdef _TARGET_ARM_ +#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) CORINFO_CLASS_HANDLE gtRetClsHnd; #endif @@ -3243,10 +3259,26 @@ struct GenTreePutArgStk: public GenTreeUnOp // Fast tail calls set this to true. // In future if we need to add more such bool fields consider bit fields. - GenTreePutArgStk(genTreeOps oper, var_types type, unsigned slotNum, bool _putInIncomingArgArea = false - DEBUG_ARG(GenTreePtr callNode = NULL) DEBUG_ARG(bool largeNode = false)) : - GenTreeUnOp(oper, type DEBUG_ARG(largeNode)), - gtSlotNum(slotNum), putInIncomingArgArea(_putInIncomingArgArea) + GenTreePutArgStk( + genTreeOps oper, + var_types type, + unsigned slotNum + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(unsigned numSlots) + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(bool isStruct), + bool _putInIncomingArgArea = false + DEBUG_ARG(GenTreePtr callNode = NULL) + DEBUG_ARG(bool largeNode = false)) + : + GenTreeUnOp(oper, type DEBUG_ARG(largeNode)), + gtSlotNum(slotNum), + putInIncomingArgArea(_putInIncomingArgArea) +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + , gtPutArgStkKind(PutArgStkKindInvalid), + gtNumSlots(numSlots), + gtIsStruct(isStruct), + gtNumberReferenceSlots(0), + gtGcPtrs(nullptr) +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING { #ifdef DEBUG gtCall = callNode; @@ -3254,22 +3286,53 @@ struct GenTreePutArgStk: public GenTreeUnOp } - GenTreePutArgStk(genTreeOps oper, var_types type, GenTreePtr op1, unsigned slotNum, bool _putInIncomingArgArea = false - DEBUG_ARG(GenTreePtr callNode = NULL) DEBUG_ARG(bool largeNode = false)) : - GenTreeUnOp(oper, type, op1 DEBUG_ARG(largeNode)), - gtSlotNum(slotNum), putInIncomingArgArea(_putInIncomingArgArea) + GenTreePutArgStk( + genTreeOps oper, + var_types type, + GenTreePtr op1, + unsigned slotNum + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(unsigned numSlots) + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(bool isStruct), + bool _putInIncomingArgArea = false + DEBUG_ARG(GenTreePtr callNode = NULL) + DEBUG_ARG(bool largeNode = false)) + : + GenTreeUnOp(oper, type, op1 DEBUG_ARG(largeNode)), + gtSlotNum(slotNum), + putInIncomingArgArea(_putInIncomingArgArea) +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + , gtPutArgStkKind(PutArgStkKindInvalid), + gtNumSlots(numSlots), + gtIsStruct(isStruct), + gtNumberReferenceSlots(0), + gtGcPtrs(nullptr) +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING { #ifdef DEBUG gtCall = callNode; #endif } -#else // !FEATURE_FASTTAIL_CALL - - GenTreePutArgStk(genTreeOps oper, var_types type, unsigned slotNum - DEBUG_ARG(GenTreePtr callNode = NULL) DEBUG_ARG(bool largeNode = false)) : - GenTreeUnOp(oper, type DEBUG_ARG(largeNode)), - gtSlotNum(slotNum) +#else // !FEATURE_FASTTAILCALL + + GenTreePutArgStk( + genTreeOps oper, + var_types type, + unsigned slotNum + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(unsigned numSlots) + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(bool isStruct) + DEBUG_ARG(GenTreePtr callNode = NULL) + DEBUG_ARG(bool largeNode = false)) + : + GenTreeUnOp(oper, type DEBUG_ARG(largeNode)), + gtSlotNum(slotNum) +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + , gtPutArgStkKind(PutArgStkKindInvalid), + gtNumSlots(numSlots), + gtIsStruct(isStruct), + gtNumberReferenceSlots(0), + gtGcPtrs(nullptr) +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING { #ifdef DEBUG gtCall = callNode; @@ -3277,10 +3340,25 @@ struct GenTreePutArgStk: 
public GenTreeUnOp
 }
- GenTreePutArgStk(genTreeOps oper, var_types type, GenTreePtr op1, unsigned slotNum
- DEBUG_ARG(GenTreePtr callNode = NULL) DEBUG_ARG(bool largeNode = false)) :
- GenTreeUnOp(oper, type, op1 DEBUG_ARG(largeNode)),
- gtSlotNum(slotNum)
+ GenTreePutArgStk(
+ genTreeOps oper,
+ var_types type,
+ GenTreePtr op1,
+ unsigned slotNum
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(unsigned numSlots)
+ FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(bool isStruct)
+ DEBUG_ARG(GenTreePtr callNode = NULL)
+ DEBUG_ARG(bool largeNode = false))
+ :
+ GenTreeUnOp(oper, type, op1 DEBUG_ARG(largeNode)),
+ gtSlotNum(slotNum)
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ , gtPutArgStkKind(PutArgStkKindInvalid),
+ gtNumSlots(numSlots),
+ gtIsStruct(isStruct),
+ gtNumberReferenceSlots(0),
+ gtGcPtrs(nullptr)
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
 {
 #ifdef DEBUG
 gtCall = callNode;
@@ -3288,10 +3366,53 @@ struct GenTreePutArgStk: public GenTreeUnOp
 }
 #endif // FEATURE_FASTTAILCALL
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ //------------------------------------------------------------------------
+ // setGcPointers: Sets the number of references and the layout of the struct object returned by the VM.
+ //
+ // Arguments:
+ // numPointers - Number of pointer references.
+ // pointers - layout of the struct (with pointers marked).
+ //
+ // Return Value:
+ // None
+ //
+ // Notes:
+ // This data is used in the codegen for GT_PUTARG_STK to decide how to copy the struct to the stack by value.
+ // If the struct contains no pointer references, block copy instructions are used.
+ // Otherwise the pointer reference slots are copied one by one with instructions for which GC info is emitted.
+ // Any non-pointer slots between the pointer reference slots are copied in block fashion.
+ //
+ void setGcPointers(unsigned numPointers, BYTE* pointers)
+ {
+ gtNumberReferenceSlots = numPointers;
+ gtGcPtrs = pointers;
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
 #ifdef DEBUG
 GenTreePtr gtCall; // the call node to which this argument belongs
 #endif
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // Instruction selection: during codegen time, what code sequence we will be using
+ // to encode this operation.
+
+ enum PutArgStkKind : __int8
+ {
+ PutArgStkKindInvalid,
+ PutArgStkKindRepInstr,
+ PutArgStkKindUnroll,
+ };
+
+ PutArgStkKind gtPutArgStkKind;
+
+ unsigned gtNumSlots; // Number of slots for the argument to be passed on stack
+ bool gtIsStruct; // This stack arg is a struct.
+ unsigned gtNumberReferenceSlots; // Number of reference slots.
+ BYTE* gtGcPtrs; // GC pointer layout of the struct
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
 #if DEBUGGABLE_GENTREE
 GenTreePutArgStk() : GenTreeUnOp() {}
 #endif
@@ -3325,6 +3446,30 @@ inline GenTreePtr GenTree::MoveNext()
 return gtOp.gtOp2;
 }
+#ifdef DEBUG
+inline bool GenTree::IsListOfLclFlds()
+{
+ if (!IsList())
+ {
+ return false;
+ }
+
+ GenTree* gtListPtr = this;
+ while (gtListPtr->Current() != nullptr)
+ {
+ if (gtListPtr->Current()->OperGet() != GT_LCL_FLD)
+ {
+ return false;
+ }
+
+ gtListPtr = gtListPtr->MoveNext();
+ }
+
+ return true;
+}
+#endif // DEBUG
+
 inline GenTreePtr GenTree::Current()
 {
 assert(IsList());
diff --git a/src/jit/importer.cpp b/src/jit/importer.cpp
index d56ca3ddda..0ee654c837 100644
--- a/src/jit/importer.cpp
+++ b/src/jit/importer.cpp
@@ -1152,13 +1152,22 @@ GenTreePtr Compiler::impAssignStructPtr(GenTreePtr dest,
 BasicBlock * block /* = NULL */
 )
 {
- assert(src->TypeGet() == TYP_STRUCT);
-
+ assert(src->TypeGet() == TYP_STRUCT || (src->gtOper == GT_ADDR && src->TypeGet() == TYP_BYREF));
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // TODO-ARM-BUG: Does ARM need this?
+ // TODO-ARM64-BUG: Does ARM64 need this?
+ assert(src->gtOper == GT_LCL_VAR || src->gtOper == GT_FIELD ||
+ src->gtOper == GT_IND || src->gtOper == GT_LDOBJ ||
+ src->gtOper == GT_CALL || src->gtOper == GT_MKREFANY ||
+ src->gtOper == GT_RET_EXPR || src->gtOper == GT_COMMA ||
+ src->gtOper == GT_ADDR || GenTree::OperIsSIMD(src->gtOper));
+#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
 assert(src->gtOper == GT_LCL_VAR || src->gtOper == GT_FIELD ||
 src->gtOper == GT_IND || src->gtOper == GT_LDOBJ ||
 src->gtOper == GT_CALL || src->gtOper == GT_MKREFANY ||
 src->gtOper == GT_RET_EXPR || src->gtOper == GT_COMMA ||
 GenTree::OperIsSIMD(src->gtOper));
+#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
 if (src->gtOper == GT_CALL)
 {
@@ -1187,8 +1196,14 @@ GenTreePtr Compiler::impAssignStructPtr(GenTreePtr dest,
 fgLclFldAssign(lcl->gtLclVarCommon.gtLclNum);
 lcl->gtType = src->gtType;
 dest = lcl;
-#ifdef _TARGET_ARM_
+#if defined(_TARGET_ARM_)
 impMarkLclDstNotPromotable(lcl->gtLclVarCommon.gtLclNum, src, structHnd);
+#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // Not allowed for FEATURE_CORECLR which is the only SKU available for System V OSs.
+ assert(!src->gtCall.IsVarargs() && "varargs not allowed for System V OSs.");
+
+ // Make the struct non-promotable. The eightbytes could contain multiple fields.
+ lvaTable[lcl->gtLclVarCommon.gtLclNum].lvDontPromote = true;
 #endif
 }
 else
@@ -1207,6 +1222,7 @@ GenTreePtr Compiler::impAssignStructPtr(GenTreePtr dest,
 {
 GenTreePtr call = src->gtRetExpr.gtInlineCandidate;
 noway_assert(call->gtOper == GT_CALL);
+
 if (call->gtCall.gtCallMoreFlags & GTF_CALL_M_RETBUFFARG)
 {
 // insert the return value buffer into the argument list as first byref parameter
@@ -1274,7 +1290,8 @@ GenTreePtr Compiler::impAssignStructPtr(GenTreePtr dest,
 }
 else if (src->gtOper == GT_COMMA)
 {
- assert(src->gtOp.gtOp2->gtType == TYP_STRUCT); // Second thing is the struct
+ // The second thing is the struct or its address.
+ assert(src->gtOp.gtOp2->gtType == TYP_STRUCT || src->gtOp.gtOp2->gtType == TYP_BYREF);
 if (pAfterStmt)
 {
 * pAfterStmt = fgInsertStmtAfter(block, * pAfterStmt, gtNewStmt(src->gtOp.gtOp1, impCurStmtOffs));
 }
@@ -1287,6 +1304,10 @@ GenTreePtr Compiler::impAssignStructPtr(GenTreePtr dest,
 // evaluate the second thing using recursion
 return impAssignStructPtr(dest, src->gtOp.gtOp2, structHnd, curLevel, pAfterStmt, block);
 }
+ else if (src->gtOper == GT_ADDR)
+ {
+ // The address is already in src; use it to copy the struct.
+ }
 else
 {
 src = gtNewOperNode(GT_ADDR, TYP_BYREF, src);
@@ -4528,8 +4549,7 @@ GenTreePtr Compiler::impTransformThis (GenTreePtr thisPtr,
 GenTreePtr obj = thisPtr;
 assert(obj->TypeGet() == TYP_BYREF || obj->TypeGet() == TYP_I_IMPL);
- obj = new (this, GT_LDOBJ) GenTreeLdObj(TYP_STRUCT, obj, pConstrainedResolvedToken->hClass
- );
+ obj = new (this, GT_LDOBJ) GenTreeLdObj(TYP_STRUCT, obj, pConstrainedResolvedToken->hClass);
 obj->gtFlags |= GTF_EXCEPT;
 CorInfoType jitTyp = info.compCompHnd->asCorInfoType(pConstrainedResolvedToken->hClass);
@@ -5948,7 +5968,14 @@ var_types Compiler::impImportCall (OPCODE opcode,
 }
 }
- /* Check for varargs */
+ // Check for varargs
+#if !FEATURE_VARARG
+ if ((sig->callConv & CORINFO_CALLCONV_MASK) == CORINFO_CALLCONV_VARARG ||
+ (sig->callConv & CORINFO_CALLCONV_MASK) == CORINFO_CALLCONV_NATIVEVARARG)
+ {
+ BADCODE("Varargs not supported.");
+ }
+#endif // !FEATURE_VARARG
 if ((sig->callConv & CORINFO_CALLCONV_MASK) == CORINFO_CALLCONV_VARARG ||
 (sig->callConv & CORINFO_CALLCONV_MASK) == CORINFO_CALLCONV_NATIVEVARARG)
@@ -6699,12 +6726,23 @@ bool Compiler::impMethodInfo_hasRetBuffArg(CORINFO_METHOD_INFO *
 return false;
 }
-#if defined(_TARGET_X86_) || defined(_TARGET_AMD64_)
+#if defined(_TARGET_AMD64_) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ assert(!info.compIsVarArgs && "Varargs not supported in CoreCLR on Unix.");
+ if (IsRegisterPassable(methInfo->args.retTypeClass))
+ {
+ return false;
+ }
+
+ // The struct is not aligned properly, or it is bigger than 16 bytes,
+ // or it has a custom layout, or it is not passed in registers for any other reason.
+ return true;
+#elif defined(_TARGET_X86_) || defined(_TARGET_AMD64_)
+
+ // Check for TYP_STRUCT argument that can fit into a single register.
 // We don't need a return buffer if:
 // i) TYP_STRUCT argument that can fit into a single register and
 // ii) Power of two sized TYP_STRUCT.
 unsigned size = info.compCompHnd->getClassSize(methInfo->args.retTypeClass);
- return (size > TARGET_POINTER_SIZE) || ((size & (size-1)) != 0);
+ return (size > TARGET_POINTER_SIZE) || ((size & (size - 1)) != 0);
 #elif defined(_TARGET_ARM_)
 // Check for non HFA: in ARM HFAs are returned in registers.
 if (!info.compIsVarArgs && IsHfa(methInfo->args.retTypeClass))
@@ -6717,8 +6755,6 @@ bool Compiler::impMethodInfo_hasRetBuffArg(CORINFO_METHOD_INFO *
 // TODO-ARM64-NYI: HFA/HVA arguments.
 // Check for TYP_STRUCT argument that is greater than 16 bytes.
 return info.compCompHnd->getClassSize(methInfo->args.retTypeClass) > 16;
-#elif defined(_TARGET_X86_)
- return true;
 #else // _TARGET_*
 #error Unsupported or unset target architecture
 #endif // _TARGET_*
@@ -6792,7 +6828,6 @@ GenTreePtr Compiler::impFixupStructReturn(GenTreePtr call,
 CORINFO_CLASS_HANDLE retClsHnd)
 {
 assert(call->gtOper == GT_CALL);
-
 if (call->TypeGet() != TYP_STRUCT)
 {
 return call;
 }
@@ -6826,13 +6861,46 @@ GenTreePtr Compiler::impFixupStructReturn(GenTreePtr call,
 return call;
 }
- return impAssignHfaToVar(call, retClsHnd);
+ return impAssignStructToVar(call, retClsHnd);
 }
-#endif
+#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // Not allowed for FEATURE_CORECLR which is the only SKU available for System V OSs.
+ assert(!call->gtCall.IsVarargs() && "varargs not allowed for System V OSs.");
+
+ // The return is a struct if not normalized to a single eightbyte return type below.
+ call->gtCall.gtReturnType = TYP_STRUCT;
+ // Get the classification for the struct.
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(retClsHnd, &structDesc);
+ if (structDesc.passedInRegisters)
+ {
+ call->gtCall.SetRegisterReturningStructState(structDesc);
+
+ if (structDesc.eightByteCount <= 1)
+ {
+ call->gtCall.gtReturnType = getEightByteType(structDesc, 0);
+ }
+ else
+ {
+ if (!call->gtCall.CanTailCall() && ((call->gtFlags & GTF_CALL_INLINE_CANDIDATE) == 0))
+ {
+ // Spill the register-returned struct to a temp variable, unless the call
+ // can be a tail call or is an inline candidate; for those we must not copy
+ // the return value back and forth through a variable.
+ return impAssignStructToVar(call, retClsHnd);
+ }
+ }
+ }
+ else
+ {
+ call->gtCall.gtCallMoreFlags |= GTF_CALL_M_RETBUFFARG;
+ }
+
+ return call;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
 unsigned size = info.compCompHnd->getClassSize(retClsHnd);
 BYTE gcPtr = 0;
-
 // Check for TYP_STRUCT argument that can fit into a single register
 // change the type on those trees.
 // TODO-ARM64-NYI: what about structs 9 to 16 bytes that fit in two consecutive registers?
@@ -6913,7 +6981,37 @@ GenTreePtr Compiler::impFixupStructReturnType(GenTreePtr op, CORINFO_CL
 assert(info.compRetBuffArg == BAD_VAR_NUM);
 #if defined(_TARGET_X86_) || defined(_TARGET_AMD64_)
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
 assert(info.compRetNativeType != TYP_STRUCT);
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+ assert(!info.compIsVarArgs); // No VarArgs for CoreCLR.
+ if (info.compRetNativeType == TYP_STRUCT)
+ {
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(retClsHnd, &structDesc);
+
+ if (structDesc.passedInRegisters)
+ {
+ if (op->gtOper == GT_LCL_VAR)
+ {
+ // This LCL_VAR is a register return value; it stays a TYP_STRUCT.
+ unsigned lclNum = op->gtLclVarCommon.gtLclNum;
+ // Make sure this struct type stays as struct so that we can return it in registers.
+ lvaTable[lclNum].lvDontPromote = true;
+
+ return op;
+ }
+
+ if (op->gtOper == GT_CALL)
+ {
+ return op;
+ }
+
+ return impAssignStructToVar(op, retClsHnd);
+ }
+ }
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+
 #elif defined(_TARGET_ARM_)
 if (!info.compIsVarArgs && IsHfa(retClsHnd))
 {
@@ -6941,7 +7039,7 @@ GenTreePtr Compiler::impFixupStructReturnType(GenTreePtr op, CORINFO_CL
 return op;
 }
 }
- return impAssignHfaToVar(op, retClsHnd);
+ return impAssignStructToVar(op, retClsHnd);
 }
 #endif
@@ -7003,7 +7101,22 @@ REDO_RETURN_NODE:
 }
 else
 {
- assert(info.compRetNativeType == op->gtCall.gtReturnType);
+#ifdef DEBUG
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (op->gtType == TYP_STRUCT)
+ {
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(retClsHnd, &structDesc);
+ assert(structDesc.eightByteCount < CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+ assert(getEightByteType(structDesc, 0) == op->gtCall.gtReturnType);
+ }
+ else
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ {
+ assert(info.compRetNativeType == op->gtCall.gtReturnType);
+ }
+#endif // DEBUG
 // Don't change the gtType node just yet, it will get changed later
 return op;
 }
@@ -7012,8 +7125,19 @@ REDO_RETURN_NODE:
 {
 op->gtOp.gtOp2 = impFixupStructReturnType(op->gtOp.gtOp2, retClsHnd);
 }
-
- op->gtType = info.compRetNativeType;
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (op->gtType == TYP_STRUCT)
+ {
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(retClsHnd, &structDesc);
+ assert(structDesc.eightByteCount < CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+ op->gtType = getEightByteType(structDesc, 0);
+ }
+ else
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ {
+ op->gtType = info.compRetNativeType;
+ }
 return op;
 }
@@ -11412,7 +11536,6 @@ DO_LDFTN:
 }
 eeGetFieldInfo(&resolvedToken, (CORINFO_ACCESS_FLAGS)aflags, &fieldInfo);
-
 // Figure out the type of the member. We always call canAccessField, so you always need this
 // handle
 CorInfoType ciType = fieldInfo.fieldType;
@@ -11590,7 +11713,6 @@ DO_LDFTN:
 /* Create the data member node */
 op1 = gtNewFieldRef(lclTyp, resolvedToken.hField, NULL, fieldInfo.offset);
-
 op1->gtFlags |= GTF_IND_TLS_REF; // fgMorphField will handle the transformation
 if (isLoadAddress)
@@ -11850,7 +11972,6 @@ FIELD_DONE:
 /* Create the data member node */
 op1 = gtNewFieldRef(lclTyp, resolvedToken.hField, NULL, fieldInfo.offset);
-
 op1->gtFlags |= GTF_IND_TLS_REF; // fgMorphField will handle the transformation
 break;
@@ -12396,7 +12517,11 @@ FIELD_DONE:
 | | | push the BYREF to this local |
 |---------------------------------------------------------------------
 | UNBOX_ANY | push a GT_LDOBJ of | push the STRUCT |
- | | the BYREF | |
+ | | the BYREF | For Linux when the |
+ | | | struct is returned in two |
+ | | | registers create a temp |
+ | | | whose address is passed to |
+ | | | the unbox_nullable helper.
| |---------------------------------------------------------------------
 */
@@ -12434,11 +12559,40 @@ FIELD_DONE:
 impPushOnStack(op1, tiRetVal);
 oper = GT_LDOBJ;
 goto LDOBJ;
- }
-
+ }
+
+ assert(helper == CORINFO_HELP_UNBOX_NULLABLE && "Make sure the helper is nullable!");
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (op1->gtType == TYP_STRUCT)
+ {
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(resolvedToken.hClass, &structDesc);
+ if (structDesc.passedInRegisters && structDesc.eightByteCount == CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS)
+ {
+ // The unbox nullable helper returns a TYP_STRUCT.
+ // We need to spill it to a temp so that we can take the address of it.
+ // We need the temp so we can pass its address to the unbox_nullable jit helper function.
+ // This is needed for nullables returned in 2 registers.
+ // The single-register cases are normalized earlier. For structs bigger than 16 bytes a retbuf is already passed in rdi.
+
+ unsigned tmp = lvaGrabTemp(true DEBUGARG("UNBOXing a register returnable nullable"));
+ lvaTable[tmp].lvDontPromote = true;
+ lvaSetStruct(tmp, resolvedToken.hClass, true /* unsafe value cls check */);
+
+ op2 = gtNewLclvNode(tmp, TYP_STRUCT);
+ op1 = impAssignStruct(op2, op1, resolvedToken.hClass, (unsigned)CHECK_SPILL_ALL);
+ assert(op1->gtType == TYP_VOID); // We must be assigning the return struct to the temp.
+
+ op2 = gtNewLclvNode(tmp, TYP_STRUCT);
+ op2 = gtNewOperNode(GT_ADDR, TYP_BYREF, op2);
+ op1 = gtNewOperNode(GT_COMMA, TYP_STRUCT, op1, op2);
+ }
+ }
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
 assert(op1->gtType == TYP_STRUCT);
 tiRetVal = verMakeTypeInfo(resolvedToken.hClass);
- assert(tiRetVal.IsValueClass());
+ assert(tiRetVal.IsValueClass());
 }
 impPushOnStack(op1, tiRetVal);
@@ -12946,8 +13100,7 @@ LDOBJ:
 // LDOBJ returns a struct
 // and an inline argument which is the class token of the loaded obj
- op1 = new (this, GT_LDOBJ) GenTreeLdObj(TYP_STRUCT, op1, resolvedToken.hClass
- );
+ op1 = new (this, GT_LDOBJ) GenTreeLdObj(TYP_STRUCT, op1, resolvedToken.hClass);
 op1->gtFlags |= GTF_EXCEPT;
 CorInfoType jitTyp = info.compCompHnd->asCorInfoType(resolvedToken.hClass);
@@ -13231,7 +13384,7 @@ void Compiler::impLoadLoc(unsigned ilLclNum, IL_OFFSET offset)
 }
 }
-#ifdef _TARGET_ARM_
+#if defined(_TARGET_ARM_)
 /**************************************************************************************
 *
 * When assigning a vararg call src to a HFA lcl dest, mark that we cannot promote the
@@ -13269,12 +13422,32 @@ void Compiler::impMarkLclDstNotPromotable(unsigned tmpNum, GenTreePtr src, CORIN
 }
 }
 }
+#endif
-GenTreePtr Compiler::impAssignHfaToVar(GenTreePtr op, CORINFO_CLASS_HANDLE hClass)
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+GenTreePtr Compiler::impAssignStructToVar(GenTreePtr op, CORINFO_CLASS_HANDLE hClass)
 {
- unsigned tmpNum = lvaGrabTemp(true DEBUGARG("Return value temp for HFA structs in ARM."));
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ unsigned tmpNum = lvaGrabTemp(true DEBUGARG("Return value temp for register returned structs in System V"));
+#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ unsigned tmpNum = lvaGrabTemp(true DEBUGARG("Return value temp for HFA structs in ARM"));
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
 impAssignTempGen(tmpNum, op, hClass, (unsigned) CHECK_SPILL_NONE);
- return gtNewLclvNode(tmpNum, TYP_STRUCT);
+ GenTreePtr ret = gtNewLclvNode(tmpNum, TYP_STRUCT);
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+#ifdef DEBUG
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(hClass, &structDesc);
+ // If single eightbyte, the return type would have been normalized and there won't be a temp var.
+ // This code will be called only if the struct return has not been normalized (i.e. 2 eightbytes, the max allowed.)
+ assert(structDesc.passedInRegisters && structDesc.eightByteCount == CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+#endif // DEBUG
+ // Mark the var that stores the eightbytes on the stack as non-promotable.
+ // The return value is based on eightbytes, so all the fields need
+ // to be on stack before loading the eightbyte in the corresponding return register.
+ lvaTable[tmpNum].lvDontPromote = true;
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ return ret;
 }
 #endif
@@ -13297,7 +13470,7 @@ bool Compiler::impReturnInstruction(BasicBlock *block, int prefixFlags, OPCODE &
 Verify(!verIsByRefLike(tiDeclared) ||
 verIsSafeToReturnByRef(tiVal)
 , "byref return");
-
+
 Verify(tiCompatibleWith(tiVal, tiDeclared.NormaliseForStack(), true), "type mismatch");
 expectedStack=1;
 }
@@ -13502,15 +13675,35 @@ bool Compiler::impReturnInstruction(BasicBlock *block, int prefixFlags, OPCODE &
 se.seTypeInfo.GetClassHandle(),
 (unsigned) CHECK_SPILL_ALL);
 }
-#ifdef _TARGET_ARM_
+ // TODO-ARM64-NYI: HFA
+ // TODO-AMD64-Unix and TODO-ARM: once the ARM64 functionality is implemented, the
+ // ifdefs below could be refactored into a single method with the ifdef inside.
+#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+#if defined(_TARGET_ARM_)
 if (IsHfa(retClsHnd))
 {
 // Same as !IsHfa but just don't bother with impAssignStructPtr.
+#else // !defined(_TARGET_ARM_)
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ eeGetSystemVAmd64PassStructInRegisterDescriptor(retClsHnd, &structDesc);
+ if (structDesc.passedInRegisters)
+ {
+ // If single eightbyte, the return type would have been normalized and there won't be a temp var.
+ // This code will be called only if the struct return has not been normalized (i.e. 2 eightbytes, the max allowed.)
+ assert(structDesc.eightByteCount == CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+ // Same as !structDesc.passedInRegisters but just don't bother with impAssignStructPtr.
+#endif // !defined(_TARGET_ARM_)
+
 if (lvaInlineeReturnSpillTemp != BAD_VAR_NUM)
 {
 if (!impInlineInfo->retExpr)
 {
+#if defined(_TARGET_ARM_)
 impInlineInfo->retExpr = gtNewLclvNode(lvaInlineeReturnSpillTemp, TYP_STRUCT);
+#else // !defined(_TARGET_ARM_)
+ // The inlinee compiler has figured out the type of the temp already. Use it here.
+ impInlineInfo->retExpr = gtNewLclvNode(lvaInlineeReturnSpillTemp, lvaTable[lvaInlineeReturnSpillTemp].lvType);
+#endif // !defined(_TARGET_ARM_)
 }
 }
 else
@@ -13519,7 +13712,7 @@ bool Compiler::impReturnInstruction(BasicBlock *block, int prefixFlags, OPCODE &
 }
 }
 else
-#endif
+#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
 {
 assert(iciCall->gtCall.gtCallMoreFlags & GTF_CALL_M_RETBUFFARG);
 GenTreePtr dest = gtCloneExpr(iciCall->gtCall.gtCallArgs->gtOp.gtOp1);
@@ -13575,8 +13768,9 @@ bool Compiler::impReturnInstruction(BasicBlock *block, int prefixFlags, OPCODE &
 }
 else if (info.compRetType == TYP_STRUCT)
 {
-#ifndef _TARGET_ARM_
+#if !defined(_TARGET_ARM_) && !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
 // In ARM HFA native types are maintained as structs.
+ // The multi register System V AMD64 return structs are also left as structs and not normalized. // TODO-ARM64-NYI: HFA noway_assert(info.compRetNativeType != TYP_STRUCT); #endif diff --git a/src/jit/jit.h b/src/jit/jit.h index 9702da3ec9..2901ffd6eb 100644 --- a/src/jit/jit.h +++ b/src/jit/jit.h @@ -220,6 +220,22 @@ #define INDEBUG_LDISASM_COMMA(x) #endif +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) +#define FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(x) , x +#define FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY(x) x +#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) +#define FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(x) +#define FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY(x) +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + +#if defined(UNIX_AMD64_ABI) +#define UNIX_AMD64_ABI_ONLY_ARG(x) , x +#define UNIX_AMD64_ABI_ONLY(x) x +#else // !defined(UNIX_AMD64_ABI) +#define UNIX_AMD64_ABI_ONLY_ARG(x) +#define UNIX_AMD64_ABI_ONLY(x) +#endif // defined(UNIX_AMD64_ABI) + // To get rid of warning 4701 : local variable may be used without being initialized #define DUMMY_INIT(x) (x) @@ -605,7 +621,11 @@ unsigned int unsigned_abs(int x) inline size_t unsigned_abs(ssize_t x) { +#ifndef FEATURE_PAL return ((size_t) abs(x)); +#else // !FEATURE_PAL + return ((size_t) labs(x)); +#endif // !FEATURE_PAL } #endif // _TARGET_64BIT_ diff --git a/src/jit/jitgcinfo.h b/src/jit/jitgcinfo.h index 5c8d10f1b7..4063bafe15 100644 --- a/src/jit/jitgcinfo.h +++ b/src/jit/jitgcinfo.h @@ -253,7 +253,6 @@ public : #endif unsigned short cdArgCnt; - unsigned short cdArgBaseOffset; union { diff --git a/src/jit/lclvars.cpp b/src/jit/lclvars.cpp index c12f735f68..b9e89f156d 100644 --- a/src/jit/lclvars.cpp +++ b/src/jit/lclvars.cpp @@ -103,8 +103,8 @@ void Compiler::lvaInitTypeRef() /* Set compArgsCount and compLocalsCount */ info.compArgsCount = info.compMethodInfo->args.numArgs; - - /* Is there a 'this' pointer */ + + // Is there a 'this' pointer if (!info.compIsStatic) { @@ -133,6 +133,18 @@ void Compiler::lvaInitTypeRef() else #endif { +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; + eeGetSystemVAmd64PassStructInRegisterDescriptor(info.compMethodInfo->args.retTypeClass, &structDesc); + if (structDesc.eightByteCount > 1) + { + info.compRetNativeType = TYP_STRUCT; + } + else + { + info.compRetNativeType = getEightByteType(structDesc, 0); + } +#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING unsigned size = info.compCompHnd->getClassSize(info.compMethodInfo->args.retTypeClass); // Check for TYP_STRUCT argument that can fit into a single register @@ -173,6 +185,7 @@ void Compiler::lvaInitTypeRef() assert(!"Unexpected size when returning struct by value"); break; } +#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING } } @@ -191,7 +204,9 @@ void Compiler::lvaInitTypeRef() calling convention is varargs */ if (info.compIsVarArgs) + { info.compArgsCount++; + } // Is there an extra parameter used to pass instantiation info to // shared generic methods and shared generic struct instance methods? 
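As an aside, the return-type normalization introduced in the lvaInitTypeRef hunk above (a single eightbyte folds to a scalar type, two eightbytes stay TYP_STRUCT) can be illustrated with a small standalone sketch. The enum values and the eightByteToVarType mapping below are simplified, hypothetical stand-ins for the JIT's SystemVClassificationType and getEightByteType, not the actual implementations:

    #include <cassert>

    // Hypothetical stand-ins for the JIT's types; the exact width-to-type
    // mapping here is an illustrative assumption, not the real getEightByteType.
    enum Classification { ClsInteger, ClsIntegerReference, ClsSSE };
    enum VarType { TYP_BYTE, TYP_SHORT, TYP_INT, TYP_LONG, TYP_REF, TYP_FLOAT, TYP_DOUBLE };

    VarType eightByteToVarType(Classification cls, unsigned sizeInBytes)
    {
        if (cls == ClsIntegerReference)
        {
            assert(sizeInBytes == 8); // object references are pointer sized
            return TYP_REF;
        }
        if (cls == ClsSSE)
        {
            return (sizeInBytes <= 4) ? TYP_FLOAT : TYP_DOUBLE;
        }
        // ClsInteger: pick the smallest integral type covering the eightbyte.
        switch (sizeInBytes)
        {
        case 1:  return TYP_BYTE;
        case 2:  return TYP_SHORT;
        case 3:
        case 4:  return TYP_INT;
        default: return TYP_LONG; // 5..8 bytes occupy a full register
        }
    }

    int main()
    {
        // struct { float x; }   -> one 4-byte SSE eightbyte     -> returned in xmm0 as a float
        assert(eightByteToVarType(ClsSSE, 4) == TYP_FLOAT);
        // struct { int a, b; }  -> one 8-byte INTEGER eightbyte -> returned in rax as a long
        assert(eightByteToVarType(ClsInteger, 8) == TYP_LONG);
        return 0;
    }

A struct with two eightbytes (e.g. a double field followed by a long field) is described by both classifications and keeps TYP_STRUCT, which is why the diff only normalizes when eightByteCount is at most 1.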
@@ -356,18 +371,17 @@ void Compiler::lvaInitArgs(InitVarDscInfo * varDscInfo) //---------------------------------------------------------------------- - /* We have set info.compArgsCount in compCompile() */ - + // We have set info.compArgsCount in compCompile() noway_assert(varDscInfo->varNum == info.compArgsCount); assert (varDscInfo->intRegArgNum <= MAX_REG_ARG); - + codeGen->intRegState.rsCalleeRegArgNum = varDscInfo->intRegArgNum; #if !FEATURE_STACK_FP_X87 codeGen->floatRegState.rsCalleeRegArgNum = varDscInfo->floatRegArgNum; #endif // FEATURE_STACK_FP_X87 - /* The total argument size must be aligned. */ + // The total argument size must be aligned. noway_assert((compArgSize % sizeof(void*)) == 0); #ifdef _TARGET_X86_ @@ -440,6 +454,7 @@ void Compiler::lvaInitThisPtr(InitVarDscInfo * varDscInfo) } #endif compArgSize += TARGET_POINTER_SIZE; + varDscInfo->varNum++; varDscInfo->varDsc++; } @@ -449,7 +464,17 @@ void Compiler::lvaInitThisPtr(InitVarDscInfo * varDscInfo) void Compiler::lvaInitRetBuffArg(InitVarDscInfo * varDscInfo) { LclVarDsc * varDsc = varDscInfo->varDsc; - const bool hasRetBuffArg = impMethodInfo_hasRetBuffArg(info.compMethodInfo); + bool hasRetBuffArg = impMethodInfo_hasRetBuffArg(info.compMethodInfo); + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (info.compRetNativeType == TYP_STRUCT) + { + if (IsRegisterPassable(info.compMethodInfo->args.retTypeClass)) + { + hasRetBuffArg = false; + } + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING if (hasRetBuffArg) { @@ -594,7 +619,6 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo * varDscInfo) // the type as a float or double. argType = hfaType; } - if (isRegParamType(argType)) { compArgSize += varDscInfo->alignReg(argType, cAlign) * REGSIZE_BYTES; @@ -644,19 +668,94 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo * varDscInfo) } #else // !_TARGET_ARM_ +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; + if (argType == TYP_STRUCT) + { + assert(typeHnd != nullptr); + eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc); + if (structDesc.passedInRegisters) + { + unsigned intRegCount = 0; + unsigned floatRegCount = 0; - varDsc->lvOnFrame = true; // The final home for this incoming register might be our local stack frame + for (unsigned int i = 0; i < structDesc.eightByteCount; i++) + { + switch (structDesc.eightByteClassifications[i]) + { + case SystemVClassificationTypeInteger: + case SystemVClassificationTypeIntegerReference: + intRegCount++; + break; + case SystemVClassificationTypeSSE: + floatRegCount++; + break; + default: + assert(false && "Invalid eightbyte classification type."); + break; + } + } + + if (intRegCount != 0 && !varDscInfo->canEnreg(TYP_INT, intRegCount)) + { + structDesc.passedInRegisters = false; // No register to enregister the eightbytes. + } + + if (floatRegCount != 0 && !varDscInfo->canEnreg(TYP_FLOAT, floatRegCount)) + { + structDesc.passedInRegisters = false; // No register to enregister the eightbytes. + } + } + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + + // The final home for this incoming register might be our local stack frame + // For System V platforms the final home will always be on the local stack frame. 
+ varDsc->lvOnFrame = true;
 #endif // !_TARGET_ARM_
- if (varDscInfo->canEnreg(argType, cSlotsToEnregister))
+ bool canPassArgInRegisters = false;
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (argType == TYP_STRUCT)
+ {
+ canPassArgInRegisters = structDesc.passedInRegisters;
+ }
+ else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ canPassArgInRegisters = varDscInfo->canEnreg(argType, cSlotsToEnregister);
+ }
+
+ if (canPassArgInRegisters)
 {
 /* Another register argument */
 // Allocate the registers we need. allocRegArg() returns the first argument register number of the set.
 // For non-HFA structs, we still "try" to enregister the whole thing; it will just max out if splitting
 // to the stack happens.
- unsigned firstAllocatedRegArgNum = varDscInfo->allocRegArg(argType, cSlots);
+ unsigned firstAllocatedRegArgNum = 0;
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ unsigned secondAllocatedRegArgNum = 0;
+ var_types firstEightByteType = TYP_UNDEF;
+ var_types secondEightByteType = TYP_UNDEF;
+ varDsc->lvOtherArgReg = REG_NA;
+
+ if (argType == TYP_STRUCT)
+ {
+ if (structDesc.eightByteCount >= 1)
+ {
+ firstEightByteType = getEightByteType(structDesc, 0);
+ firstAllocatedRegArgNum = varDscInfo->allocRegArg(firstEightByteType, 1);
+ }
+ }
+ else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ firstAllocatedRegArgNum = varDscInfo->allocRegArg(argType, cSlots);
+ }
 #ifdef _TARGET_ARM_
 if (isHfaArg)
@@ -668,7 +767,31 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo * varDscInfo)
 #endif // _TARGET_ARM_
 varDsc->lvIsRegArg = 1;
- varDsc->lvArgReg = genMapRegArgNumToRegNum(firstAllocatedRegArgNum, argType);
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (argType == TYP_STRUCT)
+ {
+ varDsc->lvArgReg = genMapRegArgNumToRegNum(firstAllocatedRegArgNum, firstEightByteType);
+
+ // If there is a second eightbyte, get a register for it too and map the arg to the reg number.
+ if (structDesc.eightByteCount >= 2)
+ {
+ secondEightByteType = getEightByteType(structDesc, 1);
+ secondAllocatedRegArgNum = varDscInfo->allocRegArg(secondEightByteType, 1);
+ }
+
+ if (secondEightByteType != TYP_UNDEF)
+ {
+ varDsc->lvOtherArgReg = genMapRegArgNumToRegNum(secondAllocatedRegArgNum, secondEightByteType);
+ varDsc->addPrefReg(genRegMask(varDsc->lvOtherArgReg), this);
+ }
+ }
+ else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ varDsc->lvArgReg = genMapRegArgNumToRegNum(firstAllocatedRegArgNum, argType);
+ }
+
 varDsc->setPrefReg(varDsc->lvArgReg, this);
 #ifdef _TARGET_ARM_
@@ -682,52 +805,91 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo * varDscInfo)
 #ifdef DEBUG
 if (verbose)
 {
- printf("Arg #%u passed in register ", varDscInfo->varNum);
-
- bool isFloat = varTypeIsFloating(argType);
- unsigned regArgNum = genMapRegNumToRegArgNum(varDsc->lvArgReg, argType);
+ printf("Arg #%u passed in register(s) ", varDscInfo->varNum);
+ bool isFloat = false;
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // In case of one eightbyte struct the type is already normalized earlier.
+ // The varTypeIsFloating(argType) is good for this case.
+ if ((argType == TYP_STRUCT) && (structDesc.eightByteCount >= 1))
+ {
+ isFloat = varTypeIsFloating(firstEightByteType);
+ }
+ else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ isFloat = varTypeIsFloating(argType);
+ }
- for (unsigned ix = 0; ix < cSlots; ix++, regArgNum++)
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (argType == TYP_STRUCT)
 {
- if (ix > 0)
- printf(",");
+ // Print both registers, just to be clear
+ if (firstEightByteType == TYP_UNDEF)
+ {
+ printf("firstEightByte: <not used>");
+ }
+ else
+ {
+ printf("firstEightByte: %s", getRegName(genMapRegArgNumToRegNum(firstAllocatedRegArgNum, firstEightByteType), isFloat));
+ }
- if (!isFloat && (regArgNum >= varDscInfo->maxIntRegArgNum)) // a struct has been split between registers and stack
+ if (secondEightByteType == TYP_UNDEF)
 {
- printf(" stack slots:%d", cSlots - ix);
- break;
+ printf(", secondEightByte: <not used>");
 }
+ else
+ {
+ printf(", secondEightByte: %s", getRegName(genMapRegArgNumToRegNum(secondAllocatedRegArgNum, secondEightByteType), varTypeIsFloating(secondEightByteType)));
+ }
+ }
+ else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ unsigned regArgNum = genMapRegNumToRegArgNum(varDsc->lvArgReg, argType);
-#ifdef _TARGET_ARM_
- if (isFloat)
+ for (unsigned ix = 0; ix < cSlots; ix++, regArgNum++)
 {
- // Print register size prefix
- if (argType == TYP_DOUBLE)
+ if (ix > 0)
+ printf(",");
+
+ if (!isFloat && (regArgNum >= varDscInfo->maxIntRegArgNum)) // a struct has been split between registers and stack
+ {
+ printf(" stack slots:%d", cSlots - ix);
+ break;
+ }
+
+#ifdef _TARGET_ARM_
+ if (isFloat)
 {
- // Print both registers, just to be clear
- printf("%s/%s", getRegName(genMapRegArgNumToRegNum(regArgNum, argType), isFloat),
- getRegName(genMapRegArgNumToRegNum(regArgNum + 1, argType), isFloat));
-
- // doubles take 2 slots
- assert(ix + 1 < cSlots);
- ++ix;
- ++regArgNum;
+ // Print register size prefix
+ if (argType == TYP_DOUBLE)
+ {
+ // Print both registers, just to be clear
+ printf("%s/%s", getRegName(genMapRegArgNumToRegNum(regArgNum, argType), isFloat),
+ getRegName(genMapRegArgNumToRegNum(regArgNum + 1, argType), isFloat));
+
+ // doubles take 2 slots
+ assert(ix + 1 < cSlots);
+ ++ix;
+ ++regArgNum;
+ }
+ else
+ {
+ printf("%s", getRegName(genMapRegArgNumToRegNum(regArgNum, argType), isFloat));
+ }
 }
 else
+#endif // _TARGET_ARM_
 {
 printf("%s", getRegName(genMapRegArgNumToRegNum(regArgNum, argType), isFloat));
 }
 }
- else
-#endif // _TARGET_ARM_
- {
- printf("%s", getRegName(genMapRegArgNumToRegNum(regArgNum, argType), isFloat));
- }
 }
 printf("\n");
 }
 #endif // DEBUG
- } // if canEnreg()
+ } // end if (canPassArgInRegisters)
 else
 {
 #ifdef _TARGET_ARM_
@@ -739,8 +901,13 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo * varDscInfo)
 #endif
 }
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // The arg size is the number of bytes of the argument. For a struct it could be a size that is not a multiple of
+ // TARGET_POINTER_SIZE. The stack allocated space should always be a multiple of TARGET_POINTER_SIZE, so round it up.
+ compArgSize += (unsigned)roundUp(argSize, TARGET_POINTER_SIZE); +#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING compArgSize += argSize; - +#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING if (info.compIsVarArgs) { #if defined(_TARGET_X86_) @@ -807,6 +974,7 @@ void Compiler::lvaInitGenericsCtxt(InitVarDscInfo * varDscInfo) varDsc->lvArgReg = genMapRegArgNumToRegNum(varDscInfo->regArgNum(TYP_INT), varDsc->TypeGet()); varDsc->setPrefReg(varDsc->lvArgReg, this); varDsc->lvOnFrame = true; // The final home for this incoming register might be our local stack frame + varDscInfo->intRegArgNum++; #ifdef DEBUG @@ -1180,11 +1348,6 @@ void Compiler::lvaCanPromoteStructType(CORINFO_CLASS_HANDLE typeHnd, lvaStructPromotionInfo * StructPromotionInfo, bool sortFields) { -#ifdef UNIX_AMD64_ABI - // TODO-Amd64-Unix: For now don't promote structs on Linux. - // This should be brought online with the full SystemVStruct passing work. - return; -#endif // UNIX_AMD64_ABI assert(eeIsValueClass(typeHnd)); if (typeHnd != StructPromotionInfo->typeHnd) @@ -2844,14 +3007,21 @@ void Compiler::lvaMarkLclRefs(GenTreePtr tree) } #endif // ASSERTION_PROP + bool allowStructs = false; +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + // On System V the type of the var could be a TYP_STRUCT. + allowStructs = varDsc->lvType == TYP_STRUCT; +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + /* Variables must be used as the same type throughout the method */ - noway_assert(tiVerificationNeeded || - varDsc->lvType == TYP_UNDEF || tree->gtType == TYP_UNKNOWN || - genActualType(varDsc->TypeGet()) == genActualType(tree->gtType) || - (tree->gtType == TYP_BYREF && varDsc->TypeGet() == TYP_I_IMPL) || - (tree->gtType == TYP_I_IMPL && varDsc->TypeGet() == TYP_BYREF) || - (tree->gtFlags & GTF_VAR_CAST) || - varTypeIsFloating(varDsc->TypeGet()) && varTypeIsFloating(tree->gtType)); + noway_assert(tiVerificationNeeded || + varDsc->lvType == TYP_UNDEF || tree->gtType == TYP_UNKNOWN || + allowStructs || + genActualType(varDsc->TypeGet()) == genActualType(tree->gtType) || + (tree->gtType == TYP_BYREF && varDsc->TypeGet() == TYP_I_IMPL) || + (tree->gtType == TYP_I_IMPL && varDsc->TypeGet() == TYP_BYREF) || + (tree->gtFlags & GTF_VAR_CAST) || + varTypeIsFloating(varDsc->TypeGet()) && varTypeIsFloating(tree->gtType)); /* Remember the type of the reference */ @@ -3690,7 +3860,6 @@ void Compiler::lvaFixVirtualFrameOffsets() delta += codeGen->genTotalFrameSize() - codeGen->genSPtoFPdelta(); } #endif //_TARGET_AMD64_ - unsigned lclNum; LclVarDsc * varDsc; for (lclNum = 0, varDsc = lvaTable; @@ -3735,6 +3904,7 @@ void Compiler::lvaFixVirtualFrameOffsets() if (doAssignStkOffs) { varDsc->lvStkOffs += delta; + #if DOUBLE_ALIGN if (genDoubleAlign() && !codeGen->isFramePointerUsed()) { @@ -3886,11 +4056,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs() { noway_assert(lclNum == info.compThisArg); #ifndef _TARGET_X86_ -#ifdef UNIX_AMD64_ABI - argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum, REGSIZE_BYTES, argOffs, &callerArgOffset); -#else // !UNIX_AMD64_ABI - argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum, REGSIZE_BYTES, argOffs); -#endif // !UNIX_AMD64_ABI + argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum, REGSIZE_BYTES, argOffs UNIX_AMD64_ABI_ONLY_ARG(&callerArgOffset)); #endif // _TARGET_X86_ lclNum++; } @@ -3902,11 +4068,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs() noway_assert(lclNum == info.compRetBuffArg); noway_assert(lvaTable[lclNum].lvIsRegArg); #ifndef _TARGET_X86_ -#ifdef UNIX_AMD64_ABI - argOffs = 
lvaAssignVirtualFrameOffsetToArg(lclNum, REGSIZE_BYTES, argOffs, &callerArgOffset); -#else // !UNIX_AMD64_ABI - argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum, REGSIZE_BYTES, argOffs); -#endif // !UNIX_AMD64_ABI + argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum, REGSIZE_BYTES, argOffs UNIX_AMD64_ABI_ONLY_ARG(&callerArgOffset)); #endif // _TARGET_X86_ lclNum++; } @@ -3917,20 +4079,12 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs() if (info.compMethodInfo->args.callConv & CORINFO_CALLCONV_PARAMTYPE) { noway_assert(lclNum == (unsigned)info.compTypeCtxtArg); -#ifdef UNIX_AMD64_ABI - argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs, &callerArgOffset); -#else // UNIX_AMD64_ABI - argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs); -#endif // UNIX_AMD64_ABI + argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs UNIX_AMD64_ABI_ONLY_ARG(&callerArgOffset)); } if (info.compIsVarArgs) { -#ifdef UNIX_AMD64_ABI - argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs, &callerArgOffset); -#else // !UNIX_AMD64_ABI - argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs); -#endif // !UNIX_AMD64_ABI + argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs UNIX_AMD64_ABI_ONLY_ARG(&callerArgOffset)); } #endif // USER_ARGS_COME_LAST @@ -3976,18 +4130,10 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs() if (lvaIsPreSpilled(preSpillLclNum, preSpillMask)) { unsigned argSize = eeGetArgSize(argLst, &info.compMethodInfo->args); -#ifdef UNIX_AMD64_ABI - argOffs = lvaAssignVirtualFrameOffsetToArg( - preSpillLclNum, - argSize, - argOffs, - &callerArgOffset); -#else // !UNIX_AMD64_ABI argOffs = lvaAssignVirtualFrameOffsetToArg( preSpillLclNum, argSize, argOffs); -#endif // !UNIX_AMD64_ABI argLcls++; // Early out if we can. If size is 8 and base reg is 2, then the mask is 0x1100 @@ -4008,18 +4154,10 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs() { if (!lvaIsPreSpilled(stkLclNum, preSpillMask)) { -#ifdef UNIX_AMD64_ABI - argOffs = lvaAssignVirtualFrameOffsetToArg( - stkLclNum, - eeGetArgSize(argLst, &info.compMethodInfo->args), - argOffs, - &callerArgOffset); -#else // !UNIX_AMD64_ABI argOffs = lvaAssignVirtualFrameOffsetToArg( stkLclNum, eeGetArgSize(argLst, &info.compMethodInfo->args), argOffs); -#endif // !UNIX_AMD64_ABI argLcls++; } argLst = info.compCompHnd->getArgNext(argLst); @@ -4029,16 +4167,18 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs() #else // !_TARGET_ARM_ for (unsigned i = 0; i < argSigLen; i++) { -#ifdef UNIX_AMD64_ABI - argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, - eeGetArgSize(argLst, &info.compMethodInfo->args), - argOffs, - &callerArgOffset); -#else // !UNIX_AMD64_ABI + unsigned argumentSize = eeGetArgSize(argLst, &info.compMethodInfo->args); + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + // On the stack frame the homed arg always takes a full number of slots + // for proper stack alignment. Make sure the real struct size is properly rounded up. 
+ argumentSize = (unsigned)roundUp(argumentSize, TARGET_POINTER_SIZE);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
 argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++,
- eeGetArgSize(argLst, &info.compMethodInfo->args),
- argOffs);
-#endif // UNIX_AMD64_ABI
+ argumentSize,
+ argOffs
+ UNIX_AMD64_ABI_ONLY_ARG(&callerArgOffset));
 argLst = info.compCompHnd->getArgNext(argLst);
 }
 #endif // !_TARGET_ARM_
@@ -4049,26 +4189,19 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs()
 if (info.compMethodInfo->args.callConv & CORINFO_CALLCONV_PARAMTYPE)
 {
 noway_assert(lclNum == (unsigned)info.compTypeCtxtArg);
-#ifdef UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs, &callerArgOffset);
-#else // !UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs);
-#endif // !UNIX_AMD64_ABI
+ argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs UNIX_AMD64_ABI_ONLY_ARG(&callerArgOffset));
 }
 if (info.compIsVarArgs)
 {
-#ifdef UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs, &callerArgOffset);
-#else // !UNIX_AMD64_ABI
- argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs);
-#endif // !UNIX_AMD64_ABI
+ argOffs = lvaAssignVirtualFrameOffsetToArg(lclNum++, sizeof(void *), argOffs UNIX_AMD64_ABI_ONLY_ARG(&callerArgOffset));
 }
 #endif // USER_ARGS_COME_LAST
 }
+#ifdef UNIX_AMD64_ABI
 //
 // lvaAssignVirtualFrameOffsetToArg() : Assign virtual stack offsets to an
 // individual argument, and return the offset for the next argument.
@@ -4076,12 +4209,9 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs()
 // (if any - the RA might decide to spill(home on the stack) register passed arguments, if rarely used.)
 // The final offset is calculated in lvaFixVirtualFrameOffsets method. It accounts for FP existance,
 // ret address slot, stack frame padding, alloca instructions, etc.
+// Note: This is the implementation for UNIX_AMD64 System V platforms.
 //
-#ifdef UNIX_AMD64_ABI
-int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize, int argOffs, int * callerArgOffset)
-#else // !UNIX_AMD64_ABI
-int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize, int argOffs)
-#endif // !UNIX_AMD64_ABI
+int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize, int argOffs UNIX_AMD64_ABI_ONLY_ARG(int * callerArgOffset))
 {
 noway_assert(lclNum < info.compArgsCount);
 noway_assert(argSize);
@@ -4114,30 +4244,131 @@ int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize
 if (varDsc->lvIsRegArg)
 {
- /* Argument is passed in a register, don't count it
- * when updating the current offset on the stack */
-
-#ifndef _TARGET_ARM_
- noway_assert(argSize == sizeof(void *));
-#endif
+ // Argument is passed in a register, don't count it
+ // when updating the current offset on the stack.
-#if defined(_TARGET_X86_)
- argOffs += sizeof(void *);
-#elif defined(_TARGET_AMD64_)
-#ifdef UNIX_AMD64_ABI
 if (varDsc->lvOnFrame)
-#endif
 {
 // The offset for args needs to be set only for the stack homed arguments for System V.
 varDsc->lvStkOffs = argOffs;
- argOffs += sizeof(void *);
 }
-#ifdef UNIX_AMD64_ABI
- else
+ else
 {
 varDsc->lvStkOffs = 0;
 }
+ }
+ else
+ {
+ // For Windows AMD64 there are 4 slots for the register passed arguments on the top of the caller's stack. This is where they are always homed.
+ // So, they can be accessed with positive offset.
+ // On System V platforms, if the RA decides to home a register passed arg on the stack,
+ // it creates a stack location on the callee stack (like any other local var). In such a case, the register passed, stack homed arguments
+ // are accessed using negative offsets and the stack passed arguments are accessed using positive offsets (from the caller's stack).
+ // For System V platforms if there is no frame pointer the caller stack parameter offset should include the callee allocated space.
+ // If a frame register is used, the callee allocated space should not be included for accessing the caller stack parameters.
+ // The last two requirements are met in the lvaFixVirtualFrameOffsets method, which fixes the offsets based on frame pointer existence,
+ // existence of alloca instructions, ret address pushed, etc.
+
+ varDsc->lvStkOffs = *callerArgOffset;
+ // Structs passed on stack could be of size less than TARGET_POINTER_SIZE.
+ // Make sure they get at least TARGET_POINTER_SIZE on the stack - this is required for alignment.
+ if (varDsc->lvType == TYP_STRUCT)
+ {
+ *callerArgOffset += (int)roundUp(argSize, TARGET_POINTER_SIZE);
+ }
+ else
+ {
+ *callerArgOffset += TARGET_POINTER_SIZE;
+ }
+ }
+
+ // For struct promoted parameters we need to set the offsets for both LclVars.
+ //
+ // For a dependent promoted struct we also assign the struct fields stack offset
+ if (varDsc->lvPromotedStruct())
+ {
+ lvaPromotionType promotionType = lvaGetPromotionType(varDsc);
+
+ if (promotionType == PROMOTION_TYPE_DEPENDENT)
+ {
+ noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here
+
+ assert(fieldVarNum == varDsc->lvFieldLclStart);
+ lvaTable[fieldVarNum].lvStkOffs = varDsc->lvStkOffs;
+ }
+ }
+ // For an independent promoted struct field we also assign the parent struct stack offset
+ else if (varDsc->lvIsStructField)
+ {
+ noway_assert(varDsc->lvParentLcl < lvaCount);
+ lvaTable[varDsc->lvParentLcl].lvStkOffs = varDsc->lvStkOffs;
+ }
+
+ if (Target::g_tgtArgOrder == Target::ARG_ORDER_R2L && !varDsc->lvIsRegArg)
+ argOffs += argSize;
+
+ return argOffs;
+}
+
+#else // !UNIX_AMD64_ABI
+
+//
+// lvaAssignVirtualFrameOffsetToArg() : Assign virtual stack offsets to an
+// individual argument, and return the offset for the next argument.
+// Note: This method only calculates the initial offset of the stack passed/spilled arguments
+// (if any - the RA might decide to spill(home on the stack) register passed arguments, if rarely used.)
+// The final offset is calculated in the lvaFixVirtualFrameOffsets method. It accounts for FP existence,
+// ret address slot, stack frame padding, alloca instructions, etc.
+// Note: This is the implementation for all platforms except the UNIX_AMD64 OSs (System V 64 bit).
+int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize, int argOffs UNIX_AMD64_ABI_ONLY_ARG(int * callerArgOffset))
+{
+ noway_assert(lclNum < info.compArgsCount);
+ noway_assert(argSize);
+
+ if (Target::g_tgtArgOrder == Target::ARG_ORDER_L2R)
+ argOffs -= argSize;
+
+ unsigned fieldVarNum = BAD_VAR_NUM;
+
+ noway_assert(lclNum < lvaCount);
+ LclVarDsc * varDsc = lvaTable + lclNum;
+
+ if (varDsc->lvPromotedStruct())
+ {
+ noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here
+ fieldVarNum = varDsc->lvFieldLclStart;
+
+ lvaPromotionType promotionType = lvaGetPromotionType(varDsc);
+
+ if (promotionType == PROMOTION_TYPE_INDEPENDENT)
+ {
+ lclNum = fieldVarNum;
+ noway_assert(lclNum < lvaCount);
+ varDsc = lvaTable + lclNum;
+ assert(varDsc->lvIsStructField);
+ }
+ }
+
+ noway_assert(varDsc->lvIsParam);
+
+ if (varDsc->lvIsRegArg)
+ {
+ /* Argument is passed in a register, don't count it
+ * when updating the current offset on the stack */
+
+#ifndef _TARGET_ARM_
+#if DEBUG
+ noway_assert(argSize == sizeof(void *));
+#endif // DEBUG
+#endif
+
+#if defined(_TARGET_X86_)
+ argOffs += sizeof(void *);
+#elif defined(_TARGET_AMD64_)
+ // On Windows AMD64, register passed arguments have a home slot on the caller's stack; record its offset.
+ varDsc->lvStkOffs = argOffs;
+ // Register arguments also take stack space.
+ argOffs += sizeof(void *);
 #elif defined(_TARGET_ARM64_)
 // Register arguments don't take stack space.
 #elif defined(_TARGET_ARM_)
@@ -4181,32 +4412,32 @@ int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize
 case TYP_DOUBLE:
 case TYP_LONG:
+ {
+ //
+ // Let's assign offsets to arg1, a double in r2. argOffs has to be 4 not 8.
+ //
+ // ------- CALLER SP -------
+ // r3
+ // r2 double -- argOffs = 4, but it doesn't need to be skipped, because there is no skipping.
+ // r1 VACookie -- argOffs = 0
+ // -------------------------
+ //
+ // Consider argOffs as if it accounts for number of prespilled registers before the current register.
+ // In the above example, for r2, it is r1 that is prespilled, but since r1 is accounted for by argOffs
+ // being 4, there should have been no skipping. Instead, if we didn't assign r1 to any variable, then
+ // argOffs would still be 0 which implies it is not accounting for r1, equivalently r1 is skipped.
+ //
+ // If prevRegsSize is unaccounted for by a corresponding argOffs, we must have skipped a register.
+ int prevRegsSize = genCountBits(codeGen->regSet.rsMaskPreSpillRegArg & (regMask - 1)) * TARGET_POINTER_SIZE;
+ if (argOffs < prevRegsSize)
 {
- //
- // Let's assign offsets to arg1, a double in r2. argOffs has to be 4 not 8.
- //
- // ------- CALLER SP -------
- // r3
- // r2 double -- argOffs = 4, but it doesn't need to be skipped, because there is no skipping.
- // r1 VACookie -- argOffs = 0
- // -------------------------
- //
- // Consider argOffs as if it accounts for number of prespilled registers before the current register.
- // In the above example, for r2, it is r1 that is prespilled, but since r1 is accounted for by argOffs
- // being 4, there should have been no skipping. Instead, if we didn't assign r1 to any variable, then
- // argOffs would still be 0 which implies it is not accounting for r1, equivalently r1 is skipped.
- //
- // If prevRegsSize is unaccounted for by a corresponding argOffs, we must have skipped a register.
- int prevRegsSize = genCountBits(codeGen->regSet.rsMaskPreSpillRegArg & (regMask - 1)) * TARGET_POINTER_SIZE; - if (argOffs < prevRegsSize) - { - // We must align up the argOffset to a multiple of 8 to account for skipped registers. - argOffs = roundUp(argOffs, 2*TARGET_POINTER_SIZE); - } - // We should've skipped only a single register. - assert(argOffs == prevRegsSize); + // We must align up the argOffset to a multiple of 8 to account for skipped registers. + argOffs = roundUp(argOffs, 2 * TARGET_POINTER_SIZE); } - break; + // We should've skipped only a single register. + assert(argOffs == prevRegsSize); + } + break; default: // No alignment of argOffs required @@ -4292,16 +4523,16 @@ int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize if (!compIsProfilerHookNeeded()) #endif { - bool cond = (info.compIsVarArgs && - // Does cur stk arg require double alignment? - ((varDsc->lvType == TYP_STRUCT && varDsc->lvStructDoubleAlign) || - (varDsc->lvType == TYP_DOUBLE) || - (varDsc->lvType == TYP_LONG)) - ) || - // Did first reg arg require alignment? - (codeGen->regSet.rsMaskPreSpillAlign & genRegMask(REG_ARG_LAST)); - - noway_assert(cond); + bool cond = (info.compIsVarArgs && + // Does cur stk arg require double alignment? + ((varDsc->lvType == TYP_STRUCT && varDsc->lvStructDoubleAlign) || + (varDsc->lvType == TYP_DOUBLE) || + (varDsc->lvType == TYP_LONG)) + ) || + // Did first reg arg require alignment? + (codeGen->regSet.rsMaskPreSpillAlign & genRegMask(REG_ARG_LAST)); + + noway_assert(cond); noway_assert(sizeofPreSpillRegArgs <= argOffs + TARGET_POINTER_SIZE); // at most one register of alignment } argOffs = sizeofPreSpillRegArgs; @@ -4321,7 +4552,7 @@ int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize case TYP_DOUBLE: case TYP_LONG: // We must align up the argOffset to a multiple of 8 - argOffs = roundUp(argOffsWithoutPreSpillRegArgs, 2*TARGET_POINTER_SIZE) + sizeofPreSpillRegArgs; + argOffs = roundUp(argOffsWithoutPreSpillRegArgs, 2 * TARGET_POINTER_SIZE) + sizeofPreSpillRegArgs; break; default: @@ -4330,21 +4561,7 @@ int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize } #endif // _TARGET_ARM_ -#ifdef UNIX_AMD64_ABI - // For Windows there are 4 slots for the register passed arguments on the top of the caller's stack. This is where they are always homed. - // So, they can be accessed with positive offset. - // On System V platforms, if the RA decides to home a register passed arg on the stack, - // it creates a stack location on the callee stack (like any other local var.) In such a case, the register passed, stack homed arguments - // are accessed using negative offsets and the stack passed arguments are accessed using positive offset (from the caller's stack.) - // For System V platforms if there is no frame pointer the caller stack parameter offset should include the callee allocated space. - // If frame register is used, the callee allocated space should not be included for accessing the caller stack parameters. - // The last two requirements are met in lvaFixVirtualFrameOffsets method, which fixes the offsets, based on frame pointer existence, - // existence of alloca instructions, ret address pushed, ets. - varDsc->lvStkOffs = *callerArgOffset; - *callerArgOffset += TARGET_POINTER_SIZE; -#else // !UNIX_AMD64_ABI varDsc->lvStkOffs = argOffs; -#endif // !UNIX_AMD64_ABI } // For struct promoted parameters we need to set the offsets for both LclVars. 
@@ -4360,31 +4577,31 @@ int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, unsigned argSize
         }
         else
 #endif // !defined(_TARGET_64BIT_)
-        if (varDsc->lvPromotedStruct())
-        {
-            lvaPromotionType promotionType = lvaGetPromotionType(varDsc);
-
-            if (promotionType == PROMOTION_TYPE_DEPENDENT)
+        if (varDsc->lvPromotedStruct())
         {
-            noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here
+            lvaPromotionType promotionType = lvaGetPromotionType(varDsc);
 
-            assert(fieldVarNum == varDsc->lvFieldLclStart);
-            lvaTable[fieldVarNum].lvStkOffs = varDsc->lvStkOffs;
+            if (promotionType == PROMOTION_TYPE_DEPENDENT)
+            {
+                noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here
+
+                assert(fieldVarNum == varDsc->lvFieldLclStart);
+                lvaTable[fieldVarNum].lvStkOffs = varDsc->lvStkOffs;
+            }
         }
-        }
         // For an independent promoted struct field we also assign the parent struct stack offset
-        else if (varDsc->lvIsStructField)
-        {
-            noway_assert(varDsc->lvParentLcl < lvaCount);
-            lvaTable[varDsc->lvParentLcl].lvStkOffs = varDsc->lvStkOffs;
-        }
+        else if (varDsc->lvIsStructField)
+        {
+            noway_assert(varDsc->lvParentLcl < lvaCount);
+            lvaTable[varDsc->lvParentLcl].lvStkOffs = varDsc->lvStkOffs;
+        }
 
     if (Target::g_tgtArgOrder == Target::ARG_ORDER_R2L && !varDsc->lvIsRegArg)
         argOffs += argSize;
 
     return argOffs;
 }
-
+#endif // !UNIX_AMD64_ABI
 
 /*****************************************************************************
  * lvaAssignVirtualFrameOffsetsToLocals() : Assign virtual stack offsets to
@@ -5261,8 +5478,18 @@ void Compiler::lvaAssignFrameOffsetsToPromotedStructs()
         {
             // For promoted struct fields that are params, we will
             // assign their offsets in lvaAssignVirtualFrameOffsetToArg().
+            // This is not the case for System V systems, since there is no
+            // outgoing args space. Assign the dependently promoted fields properly.
             //
-            if (varDsc->lvIsStructField && !varDsc->lvIsParam)
+            if (varDsc->lvIsStructField
+#ifndef UNIX_AMD64_ABI
+                // For System V platforms there is no outgoing args space.
+                // A register passed struct arg is homed on the stack in a separate local var.
+                // The offset of these structs is already calculated in the lvaAssignVirtualFrameOffsetToArg method.
+                // Make sure the code below is not executed for these structs and the offset is not changed.
+                && !varDsc->lvIsParam
+#endif // UNIX_AMD64_ABI
+                )
             {
                 LclVarDsc * parentvarDsc = &lvaTable[varDsc->lvParentLcl];
                 lvaPromotionType promotionType = lvaGetPromotionType(parentvarDsc);
diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp
index bb69d103cf..5882ecfa71 100644
--- a/src/jit/lower.cpp
+++ b/src/jit/lower.cpp
@@ -1001,9 +1001,39 @@ void Lowering::SpliceInUnary(GenTreePtr parent, GenTreePtr* ppChild, GenTreePtr
     oldChild->InsertAfterSelf(newNode);
 }
 
+//------------------------------------------------------------------------
+// NewPutArg: rewrites the tree to put an arg in a register or on the stack.
+//
+// Arguments:
+//    call - the call whose arg is being rewritten.
+//    arg  - the arg being rewritten.
+//    fp   - the ArgTabEntry for the argument.
+//    type - the type of the argument.
+//
+// Return Value:
+//    The new tree that was created to put the arg in the right place
+//    or the incoming arg if the arg tree was not rewritten.
+//
+// Assumptions:
+//    call, arg, and fp must be non-null.
+//
+// Notes:
+//    For System V systems with native struct passing (i.e.
FEATURE_UNIX_AMD64_STRUCT_PASSING defined)
+//    this method allocates a single GT_PUTARG_REG for one-eightbyte structs and a GT_LIST of two GT_PUTARG_REGs
+//    for two-eightbyte structs.
+//
+//    For structs passed on the stack, the method generates a GT_PUTARG_STK tree. For System V systems with native
+//    struct passing (i.e. FEATURE_UNIX_AMD64_STRUCT_PASSING defined) this method also sets the GC pointers count
+//    and the pointers layout object, so the codegen of GT_PUTARG_STK can use this to optimize copying to the stack
+//    by value (using block copy primitives for non-GC pointers and a single TARGET_POINTER_SIZE copy with GC info
+//    recording.)
+//
 GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryPtr fp, var_types type)
 {
-    GenTreePtr putArg;
+    assert(call != nullptr);
+    assert(arg != nullptr);
+    assert(fp != nullptr);
+
+    GenTreePtr putArg = nullptr;
     bool updateArgTable = true;
 
 #if !defined(_TARGET_64BIT_)
@@ -1015,7 +1045,22 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP
         type = TYP_INT;
     }
 #endif // !defined(_TARGET_64BIT_)
-    if (fp->regNum != REG_STK)
+
+    bool isOnStack = true;
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+    if (type == TYP_STRUCT)
+    {
+        isOnStack = !fp->structDesc.passedInRegisters;
+    }
+    else
+    {
+        isOnStack = fp->regNum == REG_STK;
+    }
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+    isOnStack = fp->regNum == REG_STK;
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+    if (!isOnStack)
     {
 #ifdef FEATURE_SIMD
         // We can have SIMD types that are handled as TYP_DOUBLE, but which need to be
@@ -1025,24 +1070,182 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP
             type = TYP_LONG;
         }
 #endif //FEATURE_SIMD
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+        if (fp->isStruct)
+        {
+            // The following code makes sure a register passed struct arg is moved to
+            // its registers before the call is made.
+            // There are two cases (comments added in the code below.)
+            // 1. The struct is of size one eightbyte:
+            //    In this case a new tree is created that is a GT_PUTARG_REG
+            //    with the original argument as op1.
+            // 2. The struct is contained in 2 eightbytes:
+            //    In this case the arg comes as a GT_LIST of two GT_LCL_FLDs - the two eightbytes of the struct.
+            //    The code creates a GT_PUTARG_REG node for each GT_LCL_FLD in the GT_LIST
+            //    and splices it into the list with the corresponding original GT_LCL_FLD tree as op1.
+
+            assert(fp->structDesc.eightByteCount != 0);
+
+            if (fp->structDesc.eightByteCount == 1)
+            {
+                // Case 1 above: Create a GT_PUTARG_REG node with the original tree as op1.
+ // + // Here the IR for this operation: + // lowering call : + // N001(3, 2)[000017] ------ - N---- / --* &lclVar byref V00 loc0 + // N003(6, 5)[000052] * --XG------ - / --* indir int + // N004(3, 2)[000046] ------ - N---- + --* &lclVar byref V02 tmp0 + // (13, 11)[000070] -- - XG-- - R-- - arg0 in out + 00 / --* storeIndir int + // N009(3, 4)[000054] ------ - N----arg0 in rdi + --* lclFld int V02 tmp0[+0](last use) + // N011(33, 21)[000018] --CXG------ - *call void Test.Foo.test1 + // + // args : + // lowering arg : (13, 11)[000070] -- - XG-- - R-- - *storeIndir int + // + // late : + // lowering arg : N009(3, 4)[000054] ------ - N---- * lclFld int V02 tmp0[+0](last use) + // new node is : (3, 4)[000071] ------------ * putarg_reg int RV + // + // after : + // N001(3, 2)[000017] ------ - N---- / --* &lclVar byref V00 loc0 + // N003(6, 5)[000052] * --XG------ - / --* indir int + // N004(3, 2)[000046] ------ - N---- + --* &lclVar byref V02 tmp0 + // (13, 11)[000070] -- - XG-- - R-- - arg0 in out + 00 / --* storeIndir int + // N009(3, 4)[000054] ------ - N---- | / --* lclFld int V02 tmp0[+0](last use) + // (3, 4)[000071] ------------arg0 in rdi + --* putarg_reg int RV + // N011(33, 21)[000018] --CXG------ - *call void Test.Foo.test1 + // + + putArg = comp->gtNewOperNode(GT_PUTARG_REG, type, arg); + } + else if (fp->structDesc.eightByteCount == 2) + { + // Case 2 above: Convert the LCL_FLDs to PUTARG_REG + // + // lowering call : + // N001(3, 2)[000025] ------ - N----Source / --* &lclVar byref V01 loc1 + // N003(3, 2)[000056] ------ - N----Destination + --* &lclVar byref V03 tmp1 + // N006(1, 1)[000058] ------------ + --* const int 16 + // N007(12, 12)[000059] - A--G---- - L - arg0 SETUP / --* copyBlk void + // N009(3, 4)[000061] ------ - N----arg0 in rdi + --* lclFld long V03 tmp1[+0] + // N010(3, 4)[000063] ------------arg0 in rsi + --* lclFld long V03 tmp1[+8](last use) + // N014(40, 31)[000026] --CXG------ - *call void Test.Foo.test2 + // + // args : + // lowering arg : N007(12, 12)[000059] - A--G---- - L - *copyBlk void + // + // late : + // lowering arg : N012(11, 13)[000065] ------------ * <list> struct + // + // after : + // N001(3, 2)[000025] ------ - N----Source / --* &lclVar byref V01 loc1 + // N003(3, 2)[000056] ------ - N----Destination + --* &lclVar byref V03 tmp1 + // N006(1, 1)[000058] ------------ + --* const int 16 + // N007(12, 12)[000059] - A--G---- - L - arg0 SETUP / --* copyBlk void + // N009(3, 4)[000061] ------ - N---- | / --* lclFld long V03 tmp1[+0] + // (3, 4)[000072] ------------arg0 in rdi + --* putarg_reg long + // N010(3, 4)[000063] ------------ | / --* lclFld long V03 tmp1[+8](last use) + // (3, 4)[000073] ------------arg0 in rsi + --* putarg_reg long + // N014(40, 31)[000026] --CXG------ - *call void Test.Foo.test2 + // + + assert(arg->OperGet() == GT_LIST); + GenTreeArgList* argListPtr = arg->AsArgList(); + + for (unsigned ctr = 0; argListPtr != nullptr; argListPtr = argListPtr->Rest(), ctr++) + { + // Create a new GT_PUTARG_REG node with op1 the original GT_LCL_FLD. + GenTreePtr newOper = comp->gtNewOperNode( + GT_PUTARG_REG, + comp->GetTypeFromClassificationAndSizes(fp->structDesc.eightByteClassifications[ctr], fp->structDesc.eightByteSizes[ctr]), + argListPtr->gtOp.gtOp1); + + // CopyCosts + newOper->CopyCosts(argListPtr->gtOp.gtOp1); + + // Splice in the new GT_PUTARG_REG node in the GT_LIST + SpliceInUnary(argListPtr, &argListPtr->gtOp.gtOp1, newOper); + } - putArg = comp->gtNewOperNode(GT_PUTARG_REG, type, arg); + // Just return arg. 
The GT_LIST is not replaced. + // Nothing more to do. + return arg; + } + else + { + assert(false && "Illegal count of eightbytes for the CLR type system"); // No more than 2 eightbytes for the CLR. + + } + } + else +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + { + putArg = comp->gtNewOperNode(GT_PUTARG_REG, type, arg); + } } else { // Mark this one as tail call arg if it is a fast tail call. // This provides the info to put this argument in in-coming arg area slot // instead of in out-going arg area slot. + + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY(assert(fp->isStruct == (type == TYP_STRUCT))); // Make sure state is correct + #if FEATURE_FASTTAILCALL - putArg = new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK, type, arg, fp->slotNum, call->IsFastTailCall() DEBUG_ARG(call)); + putArg = new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK, + type, + arg, + fp->slotNum + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(fp->numSlots) + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(fp->isStruct), + call->IsFastTailCall() + DEBUG_ARG(call)); #else - putArg = new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK, type, arg, fp->slotNum DEBUG_ARG(call)); + putArg = new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK, + type, + arg, + fp->slotNum + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(fp->numSlots) + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(fp->isStruct) + DEBUG_ARG(call)); #endif + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + // If the ArgTabEntry indicates that this arg is a struct + // get and store the number of slots that are references. + // This is later used in the codegen for PUT_ARG_STK implementation + // for struct to decide whether and how many single eight-byte copies + // to be done (only for reference slots), so gcinfo is emitted. + // For non-reference slots faster/smaller size instructions are used - + // pair copying using XMM registers or rep mov instructions. 
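The gcLayout bookkeeping described above can be illustrated with a small standalone sketch. The shapes here are assumptions only: the real layout comes from getClassGClayout (used in the code below) and the real copy decisions live in the PUT_ARG_STK codegen; everything in this sketch is a stand-in:

    #include <cstdio>

    int main()
    {
        // Hypothetical 4-slot struct: { long; object; long; object }.
        // Non-zero entries mark slots holding GC pointers, mirroring the
        // BYTE array that getClassGClayout fills in.
        unsigned char gcLayout[] = { 0, 1, 0, 1 };
        const unsigned numSlots = sizeof(gcLayout) / sizeof(gcLayout[0]);

        for (unsigned slot = 0; slot < numSlots; slot++)
        {
            if (gcLayout[slot] != 0)
            {
                // Reference slot: one pointer-sized copy, reported to GC info.
                printf("slot %u: single pointer-size copy, GC reported\n", slot);
            }
            else
            {
                // Non-reference slot: candidate for XMM pair copy or rep movs.
                printf("slot %u: bulk copy\n", slot);
            }
        }
        return 0;
    }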
+ if (fp->isStruct) + { + assert(arg->OperGet() == GT_LDOBJ); + + BYTE* gcLayout = new (comp, CMK_Codegen) BYTE[fp->numSlots]; + + unsigned numRefs = comp->info.compCompHnd->getClassGClayout(arg->gtLdObj.gtClass, gcLayout); + + putArg->AsPutArgStk()->setGcPointers(numRefs, gcLayout); + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING } + putArg->CopyCosts(arg); if (arg->InReg()) + { putArg->SetInReg(); + } +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + else if (fp->isStruct) + { + if (fp->structDesc.passedInRegisters) + { + putArg->SetInReg(); + } + } +#endif JITDUMP("new node is : "); DISPNODE(putArg); @@ -1076,10 +1279,14 @@ void Lowering::LowerArg(GenTreeCall* call, GenTreePtr* ppArg) // assignments/stores at this level are not really placing an arg // they are setting up temporary locals that will later be placed into // outgoing regs or stack - if (!arg->OperIsAssignment() && + if ( + !arg->OperIsAssignment() && !arg->OperIsStore() && !arg->IsArgPlaceHolderNode() && - !arg->IsNothingNode() && + !arg->IsNothingNode() && +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + !arg->OperIsPutArgStk() && +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING !arg->OperIsCopyBlkOp()) // these are de facto placeholders (apparently) { fgArgTabEntryPtr fp = comp->gtArgEntryByNode(call, arg); @@ -1153,7 +1360,15 @@ void Lowering::LowerArg(GenTreeCall* call, GenTreePtr* ppArg) #endif // !defined(_TARGET_64BIT_) { putArg = NewPutArg(call, arg, fp, type); - SpliceInUnary(call, ppArg, putArg); + + // In the case of register passable struct (in one or two registers) + // the NewPutArg returns a new node (GT_PUTARG_REG or a GT_LIST with two GT_PUTARG_REGs.) + // If an extra node is returned, splice it in the right place in the tree. + if (arg != putArg) + { + // putArg and arg are equals if arg is GT_LIST (a list of multiple LCL_FLDs to be passed in registers.) + SpliceInUnary(call, ppArg, putArg); + } } } } diff --git a/src/jit/lower.h b/src/jit/lower.h index ae1f73e5b8..6754b7b75d 100644 --- a/src/jit/lower.h +++ b/src/jit/lower.h @@ -134,6 +134,10 @@ private: void TreeNodeInfoInitSIMD(GenTree* tree, LinearScan* lsra); #endif // FEATURE_SIMD +#if defined(_TARGET_XARCH_) + void TreeNodeInfoInitSimple(GenTree* tree, TreeNodeInfo* info, unsigned kind); +#endif // defined(_TARGET_XARCH_) + void SpliceInUnary(GenTreePtr parent, GenTreePtr* ppChild, GenTreePtr newNode); void DumpNodeInfoMap(); diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index 08c340cbee..a7b4600df9 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -103,7 +103,38 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc) } } - +// TreeNodeInfoInitSimple: +// Sets the srcCount and dstCount for all the trees without special handling based on the tree node type. +// +// args: +// tree: The tree on which TreeNodeInfo's srcCount and dstCount are set. +// info: The TreeNodeInfo on which to set the srcCount and dstCount. +// This is the TreeNodeInfo corresponding to the tree parameter. +// kind: The kind flags of the tree node. +// +void Lowering::TreeNodeInfoInitSimple(GenTree* tree, TreeNodeInfo* info, unsigned kind) +{ + info->dstCount = (tree->TypeGet() == TYP_VOID) ? 
0 : 1; + if (kind & (GTK_CONST | GTK_LEAF)) + { + info->srcCount = 0; + } + else if (kind & (GTK_SMPOP)) + { + if (tree->gtGetOp2() != nullptr) + { + info->srcCount = 2; + } + else + { + info->srcCount = 1; + } + } + else + { + unreached(); + } +} /** * Takes care of annotating the register requirements @@ -138,26 +169,7 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt) GenTree* op2; default: - info->dstCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1; - if (kind & (GTK_CONST|GTK_LEAF)) - { - info->srcCount = 0; - } - else if (kind & (GTK_SMPOP)) - { - if (tree->gtGetOp2() != nullptr) - { - info->srcCount = 2; - } - else - { - info->srcCount = 1; - } - } - else - { - unreached(); - } + TreeNodeInfoInitSimple(tree, info, kind); break; case GT_LCL_FLD: @@ -275,6 +287,24 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt) else #endif // !defined(_TARGET_64BIT_) { +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (tree->TypeGet() == TYP_STRUCT && + tree->gtOp.gtOp1->OperGet() == GT_LCL_VAR) + { +#ifdef DEBUG + GenTreeLclVarCommon* lclVarPtr = tree->gtOp.gtOp1->AsLclVarCommon(); + LclVarDsc* varDsc = &(compiler->lvaTable[lclVarPtr->gtLclNum]); + assert(varDsc->lvDontPromote); +#endif // DEBUG + // If this is a two eightbyte return, make the var + // contained by the return expression. The code gen will put + // the values in the right registers for return. + info->srcCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1; + info->dstCount = 0; + MakeSrcContained(tree, tree->gtOp.gtOp1); + break; + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING info->srcCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1; info->dstCount = 0; @@ -840,9 +870,10 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt) } // First, count reg args - +#if FEATURE_VARARG bool callHasFloatRegArgs = false; - +#endif // !FEATURE_VARARG + for (GenTreePtr list = tree->gtCall.gtCallLateArgs; list; list = list->MoveNext()) { assert(list->IsList()); @@ -859,26 +890,52 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt) assert(argNode->gtOper == GT_PUTARG_STK); argNode->gtLsraInfo.srcCount = 1; argNode->gtLsraInfo.dstCount = 0; + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + // If the node is a struct and it is put on stack with + // putarg_stk operation, we consume and produce no registers. + // In this case the embedded LdObj node should not produce + // registers too since it is contained. + if (argNode->TypeGet() == TYP_STRUCT) + { + assert(argNode != nullptr && argNode->gtOp.gtOp1 != nullptr && argNode->gtOp.gtOp1->OperGet() == GT_LDOBJ); + argNode->gtOp.gtOp1->gtLsraInfo.dstCount = 0; + argNode->gtLsraInfo.srcCount = 0; + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING continue; } - var_types argType = argNode->TypeGet(); + regNumber argReg = REG_NA; + regMaskTP argMask = RBM_NONE; + short regCount = 0; + bool isOnStack = true; + if (curArgTabEntry->regNum != REG_STK) + { + isOnStack = false; + var_types argType = argNode->TypeGet(); - callHasFloatRegArgs |= varTypeIsFloating(argType); +#if FEATURE_VARARG + callHasFloatRegArgs |= varTypeIsFloating(argType); +#endif // !FEATURE_VARARG - regNumber argReg = curArgTabEntry->regNum; - short regCount = 1; - // Default case is that we consume one source; modify this later (e.g. for - // promoted structs) - info->srcCount++; + argReg = curArgTabEntry->regNum; + regCount = 1; - regMaskTP argMask = genRegMask(argReg); - argNode = argNode->gtEffectiveVal(); - - if (argNode->TypeGet() == TYP_STRUCT) + // Default case is that we consume one source; modify this later (e.g. 
for
+                // promoted structs)
+                info->srcCount++;
+
+                argMask = genRegMask(argReg);
+                argNode = argNode->gtEffectiveVal();
+            }
+
+            // If the struct arg is wrapped in a CPYBLK, the type of the param will be TYP_VOID.
+            // Use the curArgTabEntry's isStruct to get whether the param is a struct.
+            if (argNode->TypeGet() == TYP_STRUCT
+                FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY(|| curArgTabEntry->isStruct))
             {
                 unsigned originalSize = 0;
-                bool isPromoted = false;
                 LclVarDsc* varDsc = nullptr;
                 if (argNode->gtOper == GT_LCL_VAR)
                 {
@@ -893,20 +950,70 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
                 {
                     noway_assert(!"GT_LDOBJ not supported for amd64");
                 }
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+                else if (argNode->gtOper == GT_PUTARG_REG)
+                {
+                    originalSize = genTypeSize(argNode->gtType);
+                }
+                else if (argNode->gtOper == GT_LIST)
+                {
+                    originalSize = 0;
+
+                    // There could be up to 2 PUTARG_REGs in the list
+                    GenTreeArgList* argListPtr = argNode->AsArgList();
+                    unsigned iterationNum = 0;
+                    for (; argListPtr; argListPtr = argListPtr->Rest())
+                    {
+                        GenTreePtr putArgRegNode = argListPtr->gtOp.gtOp1;
+                        assert(putArgRegNode->gtOper == GT_PUTARG_REG);
+
+                        if (iterationNum == 0)
+                        {
+                            varDsc = compiler->lvaTable + putArgRegNode->gtOp.gtOp1->gtLclVarCommon.gtLclNum;
+                            originalSize = varDsc->lvSize();
+                            assert(originalSize != 0);
+                        }
+                        else
+                        {
+                            // Need an extra source for every node but the first in the list.
+                            info->srcCount++;
+
+                            // Get the mask for the second putarg_reg
+                            argMask = genRegMask(curArgTabEntry->otherRegNum);
+                        }
+
+                        putArgRegNode->gtLsraInfo.setDstCandidates(l, argMask);
+                        putArgRegNode->gtLsraInfo.setSrcCandidates(l, argMask);
+
+                        // To avoid redundant moves, have the argument child tree computed in the
+                        // register in which the argument is passed to the call.
+ putArgRegNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(putArgRegNode)); + iterationNum++; + } + + assert(iterationNum <= CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS); + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING else { noway_assert(!"Can't predict unsupported TYP_STRUCT arg kind"); } - unsigned slots = ((unsigned)(roundUp(originalSize, TARGET_POINTER_SIZE))) / REGSIZE_BYTES; - regNumber reg = (regNumber)(argReg + 1); - unsigned remainingSlots = slots - 1; - while (remainingSlots > 0 && reg <= REG_ARG_LAST) + unsigned slots = ((unsigned)(roundUp(originalSize, TARGET_POINTER_SIZE))) / REGSIZE_BYTES; + unsigned remainingSlots = slots; + + if (!isOnStack) { - argMask |= genRegMask(reg); - reg = (regNumber)(reg + 1); - remainingSlots--; - regCount++; + remainingSlots = slots - 1; + + regNumber reg = (regNumber)(argReg + 1); + while (remainingSlots > 0 && reg <= REG_ARG_LAST) + { + argMask |= genRegMask(reg); + reg = (regNumber)(reg + 1); + remainingSlots--; + regCount++; + } } short internalIntCount = 0; @@ -915,9 +1022,21 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt) // This TYP_STRUCT argument is also passed in the outgoing argument area // We need a register to address the TYP_STRUCT // And we may need 2 +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + internalIntCount = 1; +#else // FEATURE_UNIX_AMD64_STRUCT_PASSING internalIntCount = 2; +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING } argNode->gtLsraInfo.internalIntCount = internalIntCount; + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (argNode->gtOper == GT_PUTARG_REG) + { + argNode->gtLsraInfo.setDstCandidates(l, argMask); + argNode->gtLsraInfo.setSrcCandidates(l, argMask); + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING } else { @@ -931,6 +1050,8 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt) { argNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(argNode)); } + +#if FEATURE_VARARG // In the case of a varargs call, the ABI dictates that if we have floating point args, // we must pass the enregistered arguments in both the integer and floating point registers. // Since the integer register is not associated with this arg node, we will reserve it as @@ -942,6 +1063,7 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt) tree->gtLsraInfo.setInternalIntCount(tree->gtLsraInfo.internalIntCount + 1); tree->gtLsraInfo.addInternalCandidates(l, genRegMask(targetReg)); } +#endif // FEATURE_VARARG } // Now, count stack args @@ -995,6 +1117,7 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt) args = args->gtOp.gtOp2; } +#if FEATURE_VARARG // If it is a fast tail call, it is already preferenced to use RAX. // Therefore, no need set src candidates on call tgt again. if (tree->gtCall.IsVarargs() && @@ -1007,6 +1130,7 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt) // by Amd64 ABI. 
ctrlExpr->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_ARG_REGS)); } +#endif // !FEATURE_VARARG } break; @@ -1020,7 +1144,6 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt) info->dstCount = 1; } break; - #ifdef _TARGET_X86_ case GT_LDOBJ: NYI_X86("GT_LDOBJ"); @@ -1218,6 +1341,116 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt) } break; +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + case GT_PUTARG_STK: + { + if (tree->TypeGet() != TYP_STRUCT) + { + TreeNodeInfoInitSimple(tree, info, kind); + break; + } + + GenTreePutArgStk* putArgStkTree = tree->AsPutArgStk(); + + GenTreePtr dstAddr = tree; + GenTreePtr srcAddr = tree->gtOp.gtOp1; + + assert(srcAddr->OperGet() == GT_LDOBJ); + info->srcCount = srcAddr->gtLsraInfo.dstCount; + + // If this is a stack variable address, + // make the op1 contained, so this way + // there is no unnecessary copying between registers. + // To avoid assertion, increment the parent's source. + // It is recovered below. + if (srcAddr->gtGetOp1()->OperIsLocalAddr()) + { + info->srcCount += 1; + } + + info->dstCount = 0; + + // In case of a CpBlk we could use a helper call. In case of putarg_stk we + // can't do that since the helper call could kill some already set up outgoing args. + // TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj. + // The cpyXXXX code is rather complex and this could cause it to be more complex, but + // it might be the right thing to do. + + // This threshold will decide from using the helper or let the JIT decide to inline + // a code sequence of its choice. + ssize_t helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT); + ssize_t size = putArgStkTree->gtNumSlots * TARGET_POINTER_SIZE; + + // TODO-X86-CQ: The helper call either is not supported on x86 or required more work + // (I don't know which). + + // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2. + // Structs and buffer with sizes <= CPBLK_UNROLL_LIMIT bytes are occurring in more than 95% of + // our framework assemblies, so this is the main code generation scheme we'll use. + if (size <= CPBLK_UNROLL_LIMIT && putArgStkTree->gtNumberReferenceSlots == 0) + { + // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg. + // + // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte. + // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude + // RBM_NON_BYTE_REGS from internal candidates. + if ((size & (XMM_REGSIZE_BYTES - 1)) != 0) + { + info->internalIntCount++; + regMaskTP regMask = l->allRegs(TYP_INT); + +#ifdef _TARGET_X86_ + if ((size % 2) != 0) + { + regMask &= ~RBM_NON_BYTE_REGS; + } +#endif + info->setInternalCandidates(l, regMask); + } + + if (size >= XMM_REGSIZE_BYTES) + { + // If we have a buffer larger than XMM_REGSIZE_BYTES, + // reserve an XMM register to use it for a + // series of 16-byte loads and stores. 
+ info->internalFloatCount = 1; + info->addInternalCandidates(l, l->internalFloatRegCandidates()); + } + + if (srcAddr->gtGetOp1()->OperIsLocalAddr()) + { + MakeSrcContained(putArgStkTree, srcAddr->gtGetOp1()); + } + + // If src or dst are on stack, we don't have to generate the address into a register + // because it's just some constant+SP + putArgStkTree->gtPutArgStkKind = GenTreePutArgStk::PutArgStkKindUnroll; + } + else + { + info->internalIntCount += 3; + info->setInternalCandidates(l, (RBM_RDI | RBM_RCX | RBM_RSI)); + if (srcAddr->gtGetOp1()->OperIsLocalAddr()) + { + MakeSrcContained(putArgStkTree, srcAddr->gtGetOp1()); + } + + putArgStkTree->gtPutArgStkKind = GenTreePutArgStk::PutArgStkKindRepInstr; + } + + // Always mark the LDOBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree. + MakeSrcContained(putArgStkTree, srcAddr); + + // Balance up the inc above. + if (srcAddr->gtGetOp1()->OperIsLocalAddr()) + { + info->srcCount -= 1; + } + } + + break; +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + case GT_COPYBLK: { // Sources are src, dest and size (or class token for CpObj). @@ -2995,6 +3228,6 @@ bool Lowering:: IsContainableImmed(GenTree* parentNode, GenTree* childNode) return true; } -#endif // _TARGET_AMD64_ +#endif // _TARGET_XARCH_ #endif // !LEGACY_BACKEND diff --git a/src/jit/lsra.cpp b/src/jit/lsra.cpp index d8341b1d7f..8f11af9878 100644 --- a/src/jit/lsra.cpp +++ b/src/jit/lsra.cpp @@ -2671,14 +2671,14 @@ LinearScan::buildInternalRegisterDefsForNode(GenTree *tree, int internalIntCount = tree->gtLsraInfo.internalIntCount; regMaskTP internalCands = tree->gtLsraInfo.getInternalCandidates(this); - // If this is a varArgs call, the internal candidates represent the integer registers that - // floating point arguments must be copied into. These must be handled as fixed regs. + // If the number of internal integer registers required is the same as the number of candidate integer registers in the candidate set, + // then they must be handled as fixed registers. + // (E.g. for the integer registers that floating point arguments must be copied into for a varargs call.) bool fixedRegs = false; - if ((internalIntCount != 0) && (tree->OperGet() == GT_CALL)) + regMaskTP internalIntCandidates = (internalCands & allRegs(TYP_INT)); + if (((int)genCountBits(internalIntCandidates)) == internalIntCount) { - assert(tree->gtCall.IsVarargs()); fixedRegs = true; - assert((int)genCountBits(internalCands) == internalIntCount); } for (count = 0; count < internalIntCount; count++) @@ -3317,6 +3317,50 @@ LinearScan::insertZeroInitRefPositions() } } +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) +// ----------------------------------------------------------------------- +// Sets the register state for an argument of type STRUCT for System V systems. +// See Compiler::raUpdateRegStateForArg(RegState *regState, LclVarDsc *argDsc) in regalloc.cpp +// for how state for argument is updated for unix non-structs and Windows AMD64 structs. 
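A hedged sketch of the masking idea the function below implements: each of a struct arg's (up to) two registers is routed to either the integer or the float live-in mask. The masks and register numbering here are simplified stand-ins, not the JIT's actual register tables:

    #include <cstdio>

    typedef unsigned regMask;

    static regMask maskOf(int reg) { return 1u << reg; }

    int main()
    {
        const regMask kAllFloat = 0xFF00;   // pretend XMM0-XMM7 occupy bits 8-15
        regMask intLiveIn = 0, fltLiveIn = 0;

        // struct { void* p; double d; } passed in an int reg (bit 0, RDI-like)
        // and a float reg (bit 8, XMM0-like).
        int argRegs[2] = { 0, 8 };
        for (int i = 0; i < 2; i++)
        {
            regMask m = maskOf(argRegs[i]);
            if (m & kAllFloat)
                fltLiveIn |= m;             // goes to the float reg state
            else
                intLiveIn |= m;             // goes to the int reg state
        }

        printf("int live-in mask 0x%x, float live-in mask 0x%x\n", intLiveIn, fltLiveIn);
        return 0;
    }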
+void +LinearScan::unixAmd64UpdateRegStateForArg(LclVarDsc* argDsc) +{ + assert(argDsc->lvType == TYP_STRUCT); + RegState * intRegState = &compiler->codeGen->intRegState; + RegState * floatRegState = &compiler->codeGen->floatRegState; + + if ((argDsc->lvArgReg != REG_STK) && (argDsc->lvArgReg != REG_NA)) + { + if (genRegMask(argDsc->lvArgReg) & (RBM_ALLFLOAT)) + { + assert(genRegMask(argDsc->lvArgReg) & (RBM_FLTARG_REGS)); + floatRegState->rsCalleeRegArgMaskLiveIn |= genRegMask(argDsc->lvArgReg); + } + else + { + assert(genRegMask(argDsc->lvArgReg) & (RBM_ARG_REGS)); + intRegState->rsCalleeRegArgMaskLiveIn |= genRegMask(argDsc->lvArgReg); + } + } + + + if ((argDsc->lvOtherArgReg != REG_STK) && (argDsc->lvOtherArgReg != REG_NA)) + { + if (genRegMask(argDsc->lvOtherArgReg) & (RBM_ALLFLOAT)) + { + assert(genRegMask(argDsc->lvOtherArgReg) & (RBM_FLTARG_REGS)); + floatRegState->rsCalleeRegArgMaskLiveIn |= genRegMask(argDsc->lvOtherArgReg); + } + else + { + assert(genRegMask(argDsc->lvOtherArgReg) & (RBM_ARG_REGS)); + intRegState->rsCalleeRegArgMaskLiveIn |= genRegMask(argDsc->lvOtherArgReg); + } + } +} + +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + //------------------------------------------------------------------------ // updateRegStateForArg: Updates rsCalleeRegArgMaskLiveIn for the appropriate // regState (either compiler->intRegState or compiler->floatRegState), @@ -3339,31 +3383,41 @@ LinearScan::insertZeroInitRefPositions() void LinearScan::updateRegStateForArg(LclVarDsc* argDsc) { - RegState * intRegState = &compiler->codeGen->intRegState; - RegState * floatRegState = &compiler->codeGen->floatRegState; - - // In the case of AMD64 we'll still use the floating point registers - // to model the register usage for argument on vararg calls, so - // we will ignore the varargs condition to determine whether we use - // XMM registers or not for setting up the call. - bool isFloat = (isFloatRegType(argDsc->lvType) +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // For System V AMD64 calls the argDsc can have 2 registers (for structs.) + // Handle them here. + if (argDsc->lvType == TYP_STRUCT) + { + unixAmd64UpdateRegStateForArg(argDsc); + } + else +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + { + RegState * intRegState = &compiler->codeGen->intRegState; + RegState * floatRegState = &compiler->codeGen->floatRegState; + // In the case of AMD64 we'll still use the floating point registers + // to model the register usage for argument on vararg calls, so + // we will ignore the varargs condition to determine whether we use + // XMM registers or not for setting up the call. 
+ bool isFloat = (isFloatRegType(argDsc->lvType) #ifndef _TARGET_AMD64_ - && !compiler->info.compIsVarArgs + && !compiler->info.compIsVarArgs #endif - ); + ); #ifdef _TARGET_ARM_ - if (argDsc->lvIsHfaRegArg) isFloat = true; + if (argDsc->lvIsHfaRegArg) isFloat = true; #endif // _TARGET_ARM_ - if (isFloat) - { - JITDUMP("Float arg V%02u in reg %s\n", (argDsc - compiler->lvaTable), getRegName(argDsc->lvArgReg)); - compiler->raUpdateRegStateForArg(floatRegState, argDsc); - } - else - { - JITDUMP("Int arg V%02u in reg %s\n", (argDsc - compiler->lvaTable), getRegName(argDsc->lvArgReg)); - compiler->raUpdateRegStateForArg(intRegState, argDsc); + if (isFloat) + { + JITDUMP("Float arg V%02u in reg %s\n", (argDsc - compiler->lvaTable), getRegName(argDsc->lvArgReg)); + compiler->raUpdateRegStateForArg(floatRegState, argDsc); + } + else + { + JITDUMP("Int arg V%02u in reg %s\n", (argDsc - compiler->lvaTable), getRegName(argDsc->lvArgReg)); + compiler->raUpdateRegStateForArg(intRegState, argDsc); + } } } @@ -3548,7 +3602,9 @@ LinearScan::buildIntervals() // won't have done dataflow on it, but it needs to be marked as live-in so // it will get saved in the prolog. if (!compiler->compJmpOpUsed && argDsc->lvRefCnt == 0 && !compiler->opts.compDbgCode) + { continue; + } if (argDsc->lvIsRegArg) updateRegStateForArg(argDsc); diff --git a/src/jit/lsra.h b/src/jit/lsra.h index e57873fb65..cef6669513 100644 --- a/src/jit/lsra.h +++ b/src/jit/lsra.h @@ -574,6 +574,14 @@ private: void buildUpperVectorRestoreRefPositions(GenTree *tree, LsraLocation currentLoc, VARSET_VALARG_TP liveLargeVectors); #endif //FEATURE_SIMD +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // For AMD64 on SystemV machines. This method + // is called as replacement for raUpdateRegStateForArg + // that is used on Windows. On System V systems a struct can be passed + // partially using registers from the 2 register files. + void unixAmd64UpdateRegStateForArg(LclVarDsc* argDsc); +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // Update reg state for an incoming register argument void updateRegStateForArg(LclVarDsc* argDsc); @@ -998,7 +1006,6 @@ private: // Set of large vector (TYP_SIMD32 on AVX) variables to consider for callee-save registers. 
VARSET_TP largeVectorCalleeSaveCandidateVars;
 #endif // FEATURE_SIMD
-
 };
 
 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp
index f3eb506b0d..b000f58969 100644
--- a/src/jit/morph.cpp
+++ b/src/jit/morph.cpp
@@ -926,6 +926,7 @@ fgArgInfo::fgArgInfo(Compiler * comp, GenTreePtr call, unsigned numArgs)
     argTableSize = numArgs; // the allocated table size
     argsComplete = false;
     argsSorted   = false;
+
     if (argTableSize == 0)
         argTable = NULL;
     else
@@ -1127,7 +1128,6 @@ void fgArgInfo::AddArg(fgArgTabEntryPtr curArgTabEntry)
     argCount++;
 }
 
-
 fgArgTabEntryPtr fgArgInfo::AddRegArg(unsigned argNum,
                                       GenTreePtr node,
                                       GenTreePtr parent,
@@ -1137,38 +1137,79 @@ fgArgTabEntryPtr fgArgInfo::AddRegArg(unsigned argNum,
 {
     fgArgTabEntryPtr curArgTabEntry = new(compiler, CMK_fgArgInfo) fgArgTabEntry;
 
-    curArgTabEntry->argNum = argNum;
-    curArgTabEntry->node = node;
-    curArgTabEntry->parent = parent;
-    curArgTabEntry->regNum = regNum;
-    curArgTabEntry->slotNum = 0;
-    curArgTabEntry->numRegs = numRegs;
-    curArgTabEntry->numSlots = 0;
-    curArgTabEntry->alignment = alignment;
-    curArgTabEntry->lateArgInx = (unsigned) -1;
-    curArgTabEntry->tmpNum = (unsigned) -1;
-    curArgTabEntry->isSplit = false;
-    curArgTabEntry->isTmp = false;
-    curArgTabEntry->needTmp = false;
-    curArgTabEntry->needPlace = false;
-    curArgTabEntry->processed = false;
-    curArgTabEntry->isHfaRegArg = false;
-    curArgTabEntry->isBackFilled = false;
-    curArgTabEntry->isNonStandard = false;
+    curArgTabEntry->argNum        = argNum;
+    curArgTabEntry->node          = node;
+    curArgTabEntry->parent        = parent;
+    curArgTabEntry->regNum        = regNum;
+    curArgTabEntry->slotNum       = 0;
+    curArgTabEntry->numRegs       = numRegs;
+    curArgTabEntry->numSlots      = 0;
+    curArgTabEntry->alignment     = alignment;
+    curArgTabEntry->lateArgInx    = (unsigned)-1;
+    curArgTabEntry->tmpNum        = (unsigned)-1;
+    curArgTabEntry->isSplit       = false;
+    curArgTabEntry->isTmp         = false;
+    curArgTabEntry->needTmp       = false;
+    curArgTabEntry->needPlace     = false;
+    curArgTabEntry->processed     = false;
+    curArgTabEntry->isHfaRegArg   = false;
+    curArgTabEntry->isBackFilled  = false;
+    curArgTabEntry->isNonStandard = false;
 
     AddArg(curArgTabEntry);
     return curArgTabEntry;
 }
 
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+fgArgTabEntryPtr fgArgInfo::AddRegArg(unsigned argNum,
+                                      GenTreePtr node,
+                                      GenTreePtr parent,
+                                      regNumber regNum,
+                                      unsigned numRegs,
+                                      unsigned alignment,
+                                      const bool isStruct,
+                                      const regNumber otherRegNum,
+                                      const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* const structDescPtr)
+{
+    fgArgTabEntryPtr curArgTabEntry = AddRegArg(argNum, node, parent, regNum, numRegs, alignment);
+    assert(curArgTabEntry != nullptr);
+
+    // The node of the ArgTabEntry could change after remorphing - it could be rewritten to a cpyblk or a
+    // PlaceHolder node (in case of a needed late argument, for example.)
+    // This requires the use of an extra flag. At creation time the state is correct,
+    // and this assert enforces that.
+    assert((node->gtType == TYP_STRUCT && isStruct) || (node->gtType != TYP_STRUCT && !isStruct));
+    curArgTabEntry->otherRegNum = otherRegNum; // Second reg for the struct
+    curArgTabEntry->isStruct    = isStruct;    // is this a struct arg
+
+    if (isStruct && structDescPtr != nullptr)
+    {
+        curArgTabEntry->structDesc.CopyFrom(*structDescPtr);
+    }
+
+    return curArgTabEntry;
+}
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
 fgArgTabEntryPtr fgArgInfo::AddStkArg(unsigned argNum,
                                       GenTreePtr node,
                                       GenTreePtr parent,
                                       unsigned numSlots,
-                                      unsigned alignment)
+                                      unsigned alignment
+                                      FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(const bool isStruct))
 {
     fgArgTabEntryPtr curArgTabEntry = new(compiler, CMK_fgArgInfo) fgArgTabEntry;
 
-    nextSlotNum = (unsigned) roundUp(nextSlotNum, alignment);
+    nextSlotNum = (unsigned)roundUp(nextSlotNum, alignment);
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+    // The node of the ArgTabEntry could change after remorphing - it could be rewritten to a cpyblk or a
+    // PlaceHolder node (in case of a needed late argument, for example.)
+    // This requires the use of an extra flag. At creation time the state is correct,
+    // and this assert enforces that.
+    assert((node->gtType == TYP_STRUCT && isStruct) || (node->gtType != TYP_STRUCT && !isStruct));
+    curArgTabEntry->isStruct = isStruct; // is this a struct arg
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
 
     curArgTabEntry->argNum = argNum;
     curArgTabEntry->node = node;
@@ -1399,9 +1440,24 @@ void fgArgInfo::ArgsComplete()
 
     for (unsigned curInx = 0; curInx < argCount; curInx++)
     {
-        fgArgTabEntryPtr curArgTabEntry = argTable[curInx]; assert(curArgTabEntry != NULL);
+        fgArgTabEntryPtr curArgTabEntry = argTable[curInx];
+        assert(curArgTabEntry != NULL);
         GenTreePtr argx = curArgTabEntry->node;
 
+        // If this is a struct, mark it as needing a tempVar.
+        // In the copyblk and store cases this should have minimal perf impact since
+        // the local vars we copy/store to already exist, and the temp var logic
+        // will not create a new one when making a tempVar from another tempVar.
+        // (Stepping through the code showed that no new copy of the data, and no new tempVar, was created.)
+        // The need for this arises from Lower::LowerArg.
+        // In the case of a copyblk or store operation, the NewPutArg method will
+        // not be invoked and the struct will not be loaded to be passed in
+        // registers or by value on the stack.
+        if (argx->TypeGet() == TYP_STRUCT FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY( || curArgTabEntry->isStruct))
+        {
+            curArgTabEntry->needTmp = true;
+        }
+
         if (curArgTabEntry->regNum == REG_STK)
         {
             hasStackArgs = true;
@@ -1415,8 +1471,11 @@ void fgArgInfo::ArgsComplete()
         }
         else // we have a register argument, next we look for a TYP_STRUCT
         {
-            if (argx->TypeGet() == TYP_STRUCT)
+            if (argx->TypeGet() == TYP_STRUCT
+                FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY( || curArgTabEntry->isStruct))
+            {
                 hasStructRegArg = true;
+            }
         }
 
         /* If the argument tree contains an assignment (GTF_ASG) then the argument and
@@ -1461,7 +1520,6 @@ void fgArgInfo::ArgsComplete()
             }
         }
 
-
 #if FEATURE_FIXED_OUT_ARGS
         // Like calls, if this argument has a tree that will do an inline throw,
         // a call to a jit helper, then we need to treat it like a call (but only
@@ -1917,7 +1975,11 @@ void fgArgInfo::SortArgs()
     argsSorted = true;
 }
 
-GenTreePtr Compiler::fgMakeTmpArgNode(unsigned tmpVarNum)
+// This function creates a tmp var only if needed.
+// We need this to be done in order to enforce ordering
+// of the evaluation of arguments.
There are times this function will not be called for an argument at all. +GenTreePtr Compiler::fgMakeTmpArgNode(unsigned tmpVarNum + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(const bool passedInRegisters)) { LclVarDsc * varDsc = &lvaTable[tmpVarNum]; assert(varDsc->lvIsTemp); @@ -1926,9 +1988,12 @@ GenTreePtr Compiler::fgMakeTmpArgNode(unsigned tmpVarNum) // Create a copy of the temp to go into the late argument list GenTreePtr arg = gtNewLclvNode(tmpVarNum, type); -#ifdef _TARGET_AMD64_ +#if defined(_TARGET_AMD64_) if (type == TYP_STRUCT) { + + +#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING switch (lvaLclExactSize(tmpVarNum)) { case 1: type = TYP_BYTE; break; @@ -1953,6 +2018,8 @@ GenTreePtr Compiler::fgMakeTmpArgNode(unsigned tmpVarNum) default: break; } +#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING + // If we didn't change the type of the struct, it means // its structure doesn't support to be passed directly through a // register, so we need to pass a pointer to the destination where @@ -1960,7 +2027,23 @@ GenTreePtr Compiler::fgMakeTmpArgNode(unsigned tmpVarNum) if (type == TYP_STRUCT) { arg->gtFlags |= GTF_DONT_CSE; + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + + // If it is passed in registers, don't get the address of the var. Make it a + // field instead. It will be loaded in registers with putarg_reg tree in lower. + if (passedInRegisters) + { + arg->ChangeOper(GT_LCL_FLD); + arg->gtType = type; + } + else + { + arg = gtNewOperNode(GT_ADDR, TYP_STRUCT, arg); + } +#else // FEATURE_UNIX_AMD64_STRUCT_PASSING arg = gtNewOperNode(GT_ADDR, TYP_I_IMPL, arg); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING } else { @@ -1973,10 +2056,8 @@ GenTreePtr Compiler::fgMakeTmpArgNode(unsigned tmpVarNum) arg->gtFlags |= GTF_DONT_CSE; arg = gtNewOperNode(GT_ADDR, TYP_I_IMPL, arg); - // Ldobj the temp to use it as a call argument - arg = new (this, GT_LDOBJ) GenTreeLdObj(TYP_STRUCT, arg, lvaGetStruct(tmpVarNum) - ); + arg = new (this, GT_LDOBJ) GenTreeLdObj(TYP_STRUCT, arg, lvaGetStruct(tmpVarNum)); arg->gtFlags |= GTF_EXCEPT; #endif // _TARGET_AMD64_ @@ -2007,7 +2088,7 @@ void fgArgInfo::EvalArgsToTemps() // Only the register arguments need to be replaced with placeholders node // stacked arguments are evaluated and pushed in order // - if (curArgTabEntry->regNum == REG_STK) + if (curArgTabEntry->regNum == REG_STK && !curArgTabEntry->needTmp) continue; #endif @@ -2019,9 +2100,11 @@ void fgArgInfo::EvalArgsToTemps() { // Create a copy of the temp to go into the late argument list tmpVarNum = curArgTabEntry->tmpNum; - defArg = compiler->fgMakeTmpArgNode(tmpVarNum); + defArg = compiler->fgMakeTmpArgNode( + tmpVarNum + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(argTable[curInx]->structDesc.passedInRegisters)); - /* mark the original node as a late argument */ + // mark the original node as a late argument argx->gtFlags |= GTF_LATE_ARG; } else @@ -2036,7 +2119,7 @@ void fgArgInfo::EvalArgsToTemps() } #endif -#ifdef _TARGET_AMD64_ +#if defined(_TARGET_AMD64_) && !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) noway_assert(argx->gtType != TYP_STRUCT); #endif @@ -2160,11 +2243,11 @@ void fgArgInfo::EvalArgsToTemps() /* For a TYP_STRUCT we also need to record the class handle of the arg */ CORINFO_CLASS_HANDLE clsHnd = NULL; -#ifdef _TARGET_AMD64_ +#if defined(_TARGET_AMD64_) && !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) noway_assert(argx->gtType != TYP_STRUCT); -#else // _TARGET_AMD664_ +#else // _TARGET_AMD64_ if (defArg->gtType == TYP_STRUCT) { @@ -2429,6 +2512,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* 
callNode) #endif unsigned argSlots = 0; + unsigned nonRegPassedStructSlots = 0; bool lateArgsComputed = (call->gtCallLateArgs != nullptr); bool callHasRetBuffArg = ((call->gtCallMoreFlags & GTF_CALL_M_RETBUFFARG) != 0); @@ -2606,13 +2690,19 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) (call->gtCallObjp->gtType == TYP_I_IMPL)); /* this is a register argument - put it in the table */ - call->fgArgInfo->AddRegArg(argIndex, argx, NULL, genMapIntRegArgNumToRegNum(intArgRegNum), 1, 1); + call->fgArgInfo->AddRegArg(argIndex, argx, NULL, genMapIntRegArgNumToRegNum(intArgRegNum), 1, 1 +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + , false, REG_STK, nullptr +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + ); } else { /* this is a register argument - possibly update it in the table */ call->fgArgInfo->RemorphRegArg(argIndex, argx, NULL, genMapIntRegArgNumToRegNum(intArgRegNum), 1, 1); } + // this can't be a struct. + assert(argx->gtType != TYP_STRUCT); /* Increment the argument register count and argument index */ if (!varTypeIsFloating(argx->gtType)) @@ -2714,9 +2804,22 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) #endif // _TARGET_ARM_ +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + bool nonRegPassableStruct = false; + SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + + bool hasStructArgument = false; for (args = call->gtCallArgs; args; args = args->gtOp.gtOp2) { GenTreePtr * parentArgx = &args->gtOp.gtOp1; + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (!hasStructArgument) + { + hasStructArgument = (args->gtOp.gtOp1->TypeGet() == TYP_STRUCT); + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING argx = fgMorphTree(*parentArgx); *parentArgx = argx; flagsSummary |= argx->gtFlags; @@ -2741,7 +2844,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) unsigned size = 0; CORINFO_CLASS_HANDLE copyBlkClass = NULL; - bool isRegArg; + bool isRegArg = false; fgArgTabEntryPtr argEntry = NULL; @@ -2816,14 +2919,20 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) } #elif defined(_TARGET_AMD64_) - - passUsingFloatRegs = varTypeIsFloating(argx); - #if defined(UNIX_AMD64_ABI) + if (lateArgsComputed) + { + passUsingFloatRegs = isValidFloatArgReg(argEntry->regNum); + } + else + { + passUsingFloatRegs = varTypeIsFloating(argx); + } bool passUsingIntRegs; passUsingIntRegs = passUsingFloatRegs ? false : (intArgRegNum < MAX_REG_ARG); -#endif // UNIX_AMD64_ABI - +#else // !UNIX_AMD64_ABI + passUsingFloatRegs = varTypeIsFloating(argx); +#endif // !UNIX_AMD64_ABI #elif defined(_TARGET_X86_) passUsingFloatRegs = false; @@ -2836,6 +2945,12 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) unsigned nextFltArgRegNum = fltArgRegNum; // This is the next floating-point argument register number to use var_types structBaseType = TYP_STRUCT; +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + unsigned int structFloatRegs = 0; + unsigned int structIntRegs = 0; +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + bool isStructArg = argx->gtType == TYP_STRUCT; + if (lateArgsComputed) { assert(argEntry != NULL); @@ -2870,12 +2985,24 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) // stack slots, or both if the argument is split between the registers and the stack. 
// - if (argx->IsArgPlaceHolderNode() || (argx->gtType != TYP_STRUCT)) + if (argx->IsArgPlaceHolderNode() || (!isStructArg)) { #if defined(_TARGET_AMD64_) +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (!isStructArg) + { + size = 1; // On AMD64, all primitives fit in a single (64-bit) 'slot' + } + else + { + size = (unsigned)(roundUp(info.compCompHnd->getClassSize(argx->gtArgPlace.gtArgPlaceClsHnd), TARGET_POINTER_SIZE)) / TARGET_POINTER_SIZE; + eeGetSystemVAmd64PassStructInRegisterDescriptor(argx->gtArgPlace.gtArgPlaceClsHnd, &structDesc); + } +#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING size = 1; // On AMD64, all primitives fit in a single (64-bit) 'slot' +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING #elif defined(_TARGET_ARM64_) - if (argx->gtType == TYP_STRUCT) + if (isStructArg) { // Structs are eith passed in 1 or 2 (64-bit) slots size = (unsigned)(roundUp(info.compCompHnd->getClassSize(argx->gtArgPlace.gtArgPlaceClsHnd), TARGET_POINTER_SIZE)) / TARGET_POINTER_SIZE; @@ -2891,7 +3018,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) size = 1; // On ARM64, all primitives fit in a single (64-bit) 'slot' } #elif defined(_TARGET_ARM_) - if (argx->gtType == TYP_STRUCT) + if (isStructArg) { size = (unsigned)(roundUp(info.compCompHnd->getClassSize(argx->gtArgPlace.gtArgPlaceClsHnd), TARGET_POINTER_SIZE)) / TARGET_POINTER_SIZE; } @@ -2915,10 +3042,26 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) else // argx->gtType == TYP_STRUCT { /* We handle two opcodes: GT_MKREFANY and GT_LDOBJ */ - if (argx->gtOper == GT_MKREFANY) + if (argx->gtOper == GT_MKREFANY) { + if (argx->TypeGet() == TYP_STRUCT) + { + isStructArg = true; + } #ifdef _TARGET_AMD64_ - size = 1; +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + if (argx->TypeGet() == TYP_STRUCT) + { + size = info.compCompHnd->getClassSize(impGetRefAnyClass()); + unsigned roundupSize = (unsigned)roundUp(size, TARGET_POINTER_SIZE); + size = roundupSize / TARGET_POINTER_SIZE; + eeGetSystemVAmd64PassStructInRegisterDescriptor(impGetRefAnyClass(), &structDesc); + } + else +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + { + size = 1; + } #else size = 2; #endif @@ -2942,22 +3085,42 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) BADCODE("illegal argument tree in fgMorphArgs"); CORINFO_CLASS_HANDLE ldObjClass = argLdobj->gtLdObj.gtClass; +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + eeGetSystemVAmd64PassStructInRegisterDescriptor(ldObjClass, &structDesc); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + unsigned originalSize = info.compCompHnd->getClassSize(ldObjClass); + originalSize = (originalSize == 0 ? 
TARGET_POINTER_SIZE : originalSize); unsigned roundupSize = (unsigned)roundUp(originalSize, TARGET_POINTER_SIZE); bool passStructByRef = false; #ifndef _TARGET_X86_ +#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING // Check for TYP_STRUCT argument with size 1, 2, 4 or 8 bytes // As we can optimize these by turning them into a GT_IND of the correct type - if ((originalSize > TARGET_POINTER_SIZE) || ((originalSize & (originalSize-1)) != 0)) + if ((originalSize > TARGET_POINTER_SIZE) || ((originalSize & (originalSize - 1)) != 0)) +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING { // Normalize 'size' to the number of pointer sized items // 'size' is the number of register slots that we will use to pass the argument size = roundupSize / TARGET_POINTER_SIZE; #if defined(_TARGET_AMD64_) +#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING size = 1; // This must be copied to a temp and passed by address passStructByRef = true; copyBlkClass = ldObjClass; +#else // FEATURE_UNIX_AMD64_STRUCT_PASSING + if (!structDesc.passedInRegisters) + { + passStructByRef = false; + copyBlkClass = NULL; + } + else + { + passStructByRef = true; + copyBlkClass = ldObjClass; + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING #elif defined(_TARGET_ARM64_) if (size > 2) { @@ -2985,6 +3148,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) } #endif // _TARGET_ARM_ } +#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING else { // change our GT_LDOBJ into a GT_IND of the correct type @@ -3109,10 +3273,10 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) size = 1; } -#endif // not _TARGET_X86_ +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING +#endif // not _TARGET_X86_ // We still have a TYP_STRUCT unless we converted the GT_LDOBJ into a GT_IND above... - if ((structBaseType == TYP_STRUCT) && !passStructByRef) { // if the valuetype size is not a multiple of sizeof(void*), @@ -3158,8 +3322,23 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) // // Figure out if the argument will be passed in a register. // + bool passedInRegisters = true; +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + passedInRegisters = !isStructArg; + if (!passedInRegisters) + { + if (structDesc.passedInRegisters) + { + passedInRegisters = true; + } + else + { + passedInRegisters = false; + } + } - if (isRegParamType(genActualType(argx->TypeGet()))) +#endif + if (passedInRegisters && isRegParamType(genActualType(argx->TypeGet()))) { #ifdef _TARGET_ARM_ if (passUsingFloatRegs) @@ -3192,13 +3371,48 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) } #else // _TARGET_ARM_ #if defined(UNIX_AMD64_ABI) - if (passUsingFloatRegs) + +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // Here a struct can be passed in register following the classifications of its members and size. + // Now make sure there are actually enough registers to do so. 
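To make the register-budget check that follows concrete, here is a small sketch under stated assumptions (the enum and limits are local stand-ins for SystemVClassificationType and the MAX_REG_ARG / MAX_FLOAT_REG_ARG constants). The rule is all-or-nothing: if either register file lacks room for the struct's eightbytes, the whole struct goes to the stack:

    #include <cstdio>

    enum Classification { ClsInteger, ClsIntegerReference, ClsSSE };

    int main()
    {
        const unsigned kMaxIntRegs = 6, kMaxFloatRegs = 8; // System V AMD64 limits
        unsigned intArgRegNum = 5, fltArgRegNum = 0;       // five int regs already taken

        // struct { void* p; double d; } classifies as one integer eightbyte
        // and one SSE eightbyte.
        Classification eightBytes[] = { ClsIntegerReference, ClsSSE };

        unsigned structIntRegs = 0, structFloatRegs = 0;
        for (unsigned i = 0; i < 2; i++)
        {
            if (eightBytes[i] == ClsInteger || eightBytes[i] == ClsIntegerReference)
                structIntRegs++;
            else if (eightBytes[i] == ClsSSE)
                structFloatRegs++;
        }

        // All-or-nothing: both register files must have room.
        bool isRegArg = (intArgRegNum + structIntRegs <= kMaxIntRegs) &&
                        (fltArgRegNum + structFloatRegs <= kMaxFloatRegs);
        printf("struct passed in registers: %s\n", isRegArg ? "yes" : "no");
        return 0;
    }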
+ if (isStructArg) { - isRegArg = fltArgRegNum < MAX_FLOAT_REG_ARG; + for (unsigned int i = 0; i < structDesc.eightByteCount; i++) + { + if (structDesc.eightByteClassifications[i] == SystemVClassificationTypeInteger || + structDesc.eightByteClassifications[i] == SystemVClassificationTypeIntegerReference) + { + structIntRegs++; + } + else if (structDesc.eightByteClassifications[i] == SystemVClassificationTypeSSE) + { + structFloatRegs++; + } + } + + if (((nextFltArgRegNum + structFloatRegs) > MAX_FLOAT_REG_ARG) || + ((intArgRegNum + structIntRegs) > MAX_REG_ARG)) + { + isRegArg = false; + nonRegPassableStruct = true; + } + else + { + isRegArg = true; + nonRegPassableStruct = false; + } } else +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) { - isRegArg = intArgRegNum < MAX_REG_ARG; + if (passUsingFloatRegs) + { + isRegArg = nextFltArgRegNum < MAX_FLOAT_REG_ARG; + } + else + { + isRegArg = intArgRegNum < MAX_REG_ARG; + } } #else // !defined(UNIX_AMD64_ABI) isRegArg = intArgRegNum < maxRegArgs; @@ -3208,6 +3422,10 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) else { isRegArg = false; + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + nonRegPassableStruct = true; +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING } } @@ -3245,16 +3463,67 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) } #endif // _TARGET_ARM_ - if (isRegArg) { - // fill in or update the argInfo table + regNumber nextRegNum = REG_STK; +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + regNumber nextOtherRegNum = REG_STK; + + if (isStructArg) + { + // It is a struct passed in registers. Assign the next available register. + unsigned int curIntReg = intArgRegNum; + unsigned int curFloatReg = nextFltArgRegNum; + for (unsigned int i = 0; i < structDesc.eightByteCount; i++) + { + if (structDesc.eightByteClassifications[i] == SystemVClassificationTypeInteger || + structDesc.eightByteClassifications[i] == SystemVClassificationTypeIntegerReference) + { + if (i == 0) + { + nextRegNum = genMapIntRegArgNumToRegNum(curIntReg); + } + else if (i == 1) + { + nextOtherRegNum = genMapIntRegArgNumToRegNum(curIntReg); + } + else + { + assert(false && "fgMorphArgs Invalid index for int classification."); + } - regNumber nextRegNum = passUsingFloatRegs ? genMapFloatRegArgNumToRegNum(nextFltArgRegNum) : genMapIntRegArgNumToRegNum(intArgRegNum); + curIntReg++; + } + else if (structDesc.eightByteClassifications[i] == SystemVClassificationTypeSSE) + { + if (i == 0) + { + nextRegNum = genMapFloatRegArgNumToRegNum(curFloatReg); + } + else if (i == 1) + { + nextOtherRegNum = genMapFloatRegArgNumToRegNum(curFloatReg); + } + else + { + assert(false && "fgMorphArgs Invalid index for SSE classification."); + } + curFloatReg++; + } + } + } + else +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + { + // fill in or update the argInfo table + nextRegNum = passUsingFloatRegs ? genMapFloatRegArgNumToRegNum(nextFltArgRegNum) : genMapIntRegArgNumToRegNum(intArgRegNum); + } #ifdef _TARGET_AMD64_ - assert(size == 1); +#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING + assert(size == 1); +#endif #endif #ifndef LEGACY_BACKEND @@ -3263,14 +3532,18 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) // // They should not affect the placement of any other args or stack space required. // Example: on AMD64 R10 and R11 are used for indirect VSD (generic interface) and cookie calls. 
- bool nonStandardFound = false; for (int i=0; i<nonStandardArgs.Height(); i++) { hasNonStandardArg = true; if (argx == nonStandardArgs.Index(i).node) { - fgArgTabEntry* argEntry = call->fgArgInfo->AddRegArg(argIndex, argx, args, nonStandardArgs.Index(i).reg, size, argAlign); + fgArgTabEntry* argEntry = call->fgArgInfo->AddRegArg(argIndex, argx, + args, nonStandardArgs.Index(i).reg, size, argAlign +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + , isStructArg, nextOtherRegNum, &structDesc +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + ); argEntry->isNonStandard = true; argIndex++; nonStandardFound = true; @@ -3283,9 +3556,13 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) if (!lateArgsComputed) { - /* This is a register argument - put it in the table */ - - fgArgTabEntryPtr newArg = call->fgArgInfo->AddRegArg(argIndex, argx, args, nextRegNum, size, argAlign); + // This is a register argument - put it in the table + fgArgTabEntryPtr newArg = call->fgArgInfo->AddRegArg( + argIndex, argx, args, nextRegNum, size, argAlign +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + , isStructArg, nextOtherRegNum, &structDesc +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + ); (void)newArg; //prevent "unused variable" error from GCC #ifdef _TARGET_ARM_ newArg->SetIsHfaRegArg(passUsingFloatRegs && isHfaArg); // Note that an HFA is passed in int regs for varargs @@ -3294,7 +3571,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) } else { - /* This is a register argument - possibly update it in the table */ + // This is a register argument - possibly update it in the table fgArgTabEntryPtr entry = call->fgArgInfo->RemorphRegArg(argIndex, argx, args, nextRegNum, size, argAlign); if (entry->isNonStandard) { @@ -3306,45 +3583,55 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) // Setup the next argRegNum value if (!isBackFilled) { - if (passUsingFloatRegs) +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + if (isStructArg) { - fltArgRegNum += size; -#if defined(_TARGET_AMD64_) && !defined(UNIX_AMD64_ABI) - argSkippedRegMask |= genMapArgNumToRegMask(intArgRegNum, TYP_I_IMPL); - intArgRegNum = min(intArgRegNum + size, MAX_REG_ARG); -#endif // _TARGET_AMD64_ -#ifdef _TARGET_ARM_ - if (fltArgRegNum > MAX_FLOAT_REG_ARG) - { - // This indicates a partial enregistration of a struct type - assert(argx->gtType == TYP_STRUCT); - unsigned numRegsPartial = size - (fltArgRegNum - MAX_FLOAT_REG_ARG); - assert((unsigned char)numRegsPartial == numRegsPartial); - call->fgArgInfo->SplitArg(argIndex, numRegsPartial, size - numRegsPartial); - fltArgRegNum = MAX_FLOAT_REG_ARG; - } -#endif // _TARGET_ARM_ + intArgRegNum += structIntRegs; + fltArgRegNum += structFloatRegs; } else +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) { - intArgRegNum += size; + if (passUsingFloatRegs) + { + fltArgRegNum += size; #if defined(_TARGET_AMD64_) && !defined(UNIX_AMD64_ABI) - fltArgSkippedRegMask |= genMapArgNumToRegMask(fltArgRegNum, TYP_DOUBLE); - fltArgRegNum = min(fltArgRegNum + size, MAX_FLOAT_REG_ARG); + argSkippedRegMask |= genMapArgNumToRegMask(intArgRegNum, TYP_I_IMPL); + intArgRegNum = min(intArgRegNum + size, MAX_REG_ARG); #endif // _TARGET_AMD64_ #ifdef _TARGET_ARM_ - if (intArgRegNum > MAX_REG_ARG) - { - // This indicates a partial enregistration of a struct type - assert((argx->gtType == TYP_STRUCT) || argx->OperIsCopyBlkOp() || - (argx->gtOper == GT_COMMA && (args->gtFlags & GTF_ASG))); - unsigned numRegsPartial = size - (intArgRegNum - MAX_REG_ARG); - assert((unsigned 
char)numRegsPartial == numRegsPartial); - call->fgArgInfo->SplitArg(argIndex, numRegsPartial, size - numRegsPartial); - intArgRegNum = MAX_REG_ARG; - fgPtrArgCntCur += size - numRegsPartial; + if (fltArgRegNum > MAX_FLOAT_REG_ARG) + { + // This indicates a partial enregistration of a struct type + assert(isStructArg); + unsigned numRegsPartial = size - (fltArgRegNum - MAX_FLOAT_REG_ARG); + assert((unsigned char)numRegsPartial == numRegsPartial); + call->fgArgInfo->SplitArg(argIndex, numRegsPartial, size - numRegsPartial); + fltArgRegNum = MAX_FLOAT_REG_ARG; + } +#endif // _TARGET_ARM_ } + else + { + intArgRegNum += size; +#if defined(_TARGET_AMD64_) && !defined(UNIX_AMD64_ABI) + fltArgSkippedRegMask |= genMapArgNumToRegMask(fltArgRegNum, TYP_DOUBLE); + fltArgRegNum = min(fltArgRegNum + size, MAX_FLOAT_REG_ARG); +#endif // _TARGET_AMD64_ +#ifdef _TARGET_ARM_ + if (intArgRegNum > MAX_REG_ARG) + { + // This indicates a partial enregistration of a struct type + assert((isStructArg) || argx->OperIsCopyBlkOp() || + (argx->gtOper == GT_COMMA && (args->gtFlags & GTF_ASG))); + unsigned numRegsPartial = size - (intArgRegNum - MAX_REG_ARG); + assert((unsigned char)numRegsPartial == numRegsPartial); + call->fgArgInfo->SplitArg(argIndex, numRegsPartial, size - numRegsPartial); + intArgRegNum = MAX_REG_ARG; + fgPtrArgCntCur += size - numRegsPartial; + } #endif // _TARGET_ARM_ + } } } } @@ -3352,27 +3639,28 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) { fgPtrArgCntCur += size; - /* If the register arguments have not been determined then we must fill in the argInfo */ + // If the register arguments have not been determined then we must fill in the argInfo if (!lateArgsComputed) { - /* This is a stack argument - put it in the table */ - call->fgArgInfo->AddStkArg(argIndex, argx, args, size, argAlign); + // This is a stack argument - put it in the table + call->fgArgInfo->AddStkArg(argIndex, argx, args, size, argAlign FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(isStructArg)); + } else { - /* This is a stack argument - possibly update it in the table */ + // This is a stack argument - possibly update it in the table call->fgArgInfo->RemorphStkArg(argIndex, argx, args, size, argAlign); } } - if (copyBlkClass != NULL) { noway_assert(!lateArgsComputed); - fgMakeOutgoingStructArgCopy(call, args, argIndex, copyBlkClass); + fgMakeOutgoingStructArgCopy(call, args, argIndex, copyBlkClass FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(&structDesc)); } #ifdef _TARGET_AMD64_ + if (argx->gtOper == GT_MKREFANY) { // 'Lower' the MKREFANY tree and insert it. @@ -3406,10 +3694,15 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) } #endif // _TARGET_AMD64_ - argIndex++; - argSlots += size; - +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (nonRegPassableStruct) + { + nonRegPassedStructSlots += size; + } + else +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + argSlots += size; } // end foreach argument loop if (!lateArgsComputed) @@ -3478,18 +3771,17 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) // and ignores floating point args (it is overly conservative in that case). 
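The hunk that follows adjusts the preallocated outgoing argument area: slots of structs forced to the stack are accumulated in nonRegPassedStructSlots and added on top of the register-overflow slots. A sketch of the arithmetic, assuming MAX_REG_ARG register-argument slots and REGSIZE_BYTES bytes per slot (the helper itself is illustrative):

// Illustrative computation of the preallocated outgoing argument space.
unsigned PreallocatedOutgoingArgBytes(unsigned argSlots,
                                      unsigned nonRegPassedStructSlots,
                                      unsigned maxRegArg,     // e.g. 6
                                      unsigned regSizeBytes)  // e.g. 8
{
    unsigned preallocatedArgCount;
    if (argSlots <= maxRegArg)
    {
        // Everything else fits in registers; only stack-passed structs need space.
        preallocatedArgCount = nonRegPassedStructSlots;
    }
    else
    {
        // Register overflow plus the stack-passed struct slots.
        preallocatedArgCount = argSlots + nonRegPassedStructSlots - maxRegArg;
    }
    return preallocatedArgCount * regSizeBytes;
}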
if (argSlots <= MAX_REG_ARG) { - preallocatedArgCount = 0; + preallocatedArgCount = nonRegPassedStructSlots; } else { - preallocatedArgCount = argSlots - MAX_REG_ARG; + preallocatedArgCount = argSlots + nonRegPassedStructSlots - MAX_REG_ARG; } #elif defined(_TARGET_AMD64_) preallocatedArgCount = max(4, argSlots); #else #error Unsupported or unset target architecture #endif // _TARGET_* - if (preallocatedArgCount * REGSIZE_BYTES > lvaOutgoingArgSpaceSize) { lvaOutgoingArgSpaceSize = preallocatedArgCount * REGSIZE_BYTES; @@ -3514,39 +3806,242 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode) // If the register arguments have already been determined // or we have no register arguments then we are done. - if (lateArgsComputed || (intArgRegNum == 0 && fltArgRegNum == 0 && !hasNonStandardArg)) + bool needEvalArgsToTemps = true; + + if (lateArgsComputed || (intArgRegNum == 0 && fltArgRegNum == 0 && !hasNonStandardArg && !hasStructArgument)) { - return call; + needEvalArgsToTemps = false; } - // This is the first time that we morph this call AND it has register arguments. - // Follow into the code below and do the 'defer or eval to temp' analysis. + if (needEvalArgsToTemps) + { + // This is the first time that we morph this call AND it has register arguments. + // Follow into the code below and do the 'defer or eval to temp' analysis. - call->fgArgInfo->SortArgs(); + call->fgArgInfo->SortArgs(); - call->fgArgInfo->EvalArgsToTemps(); + call->fgArgInfo->EvalArgsToTemps(); - // We may have updated the arguments - if (call->gtCallArgs) - { - UpdateGT_LISTFlags(call->gtCallArgs); + // We may have updated the arguments + if (call->gtCallArgs) + { + UpdateGT_LISTFlags(call->gtCallArgs); + } } +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + // Rewrite the struct args to be passed by value on stack or in registers. + fgMorphSystemVStructArgs(call, hasStructArgument); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + return call; } #ifdef _PREFAST_ #pragma warning(pop) #endif +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING +// fgMorphSystemVStructArgs: +// Rewrite the struct args to be passed by value on stack or in registers. +// +// Arguments: +// call: The call whose arguments need to be morphed. +// hasStructArgument: Whether this call has struct arguments. +// +void Compiler::fgMorphSystemVStructArgs(GenTreeCall* call, bool hasStructArgument) +{ + unsigned flagsSummary = 0; + GenTreePtr args; + GenTreePtr argx; + + if (hasStructArgument) + { + fgArgInfoPtr allArgInfo = call->fgArgInfo; + + for (args = call->gtCallArgs; args; args = args->gtOp.gtOp2) + { + // For late arguments the arg tree that is overridden is in the gtCallLateArgs list. + // For such late args the gtCallArgList contains the setup arg node (evaluating the arg.) + // The tree from the gtCallLateArgs list is passed to the callee. The fgArgEntry node contains the mapping + // between the nodes in both lists. If the arg is not a late arg, the fgArgEntry->node points to itself, + // otherwise it points to the node in the late args list.
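The rewrite in fgMorphSystemVStructArgs below relies on mapping an eightbyte's classification and payload size to a scalar JIT type (GetTypeFromClassificationAndSizes in the diff). A hedged sketch of what such a mapping plausibly does; the enum values and names are stand-ins, not the JIT's actual definitions, and the real mapping also covers 1- and 2-byte integer eightbytes:

enum class JitType { Int, Long, Ref, Float, Double, Unknown };
enum class SvClass { Integer, IntegerReference, SSE };

// Map one eightbyte to the scalar type used to load/store it.
JitType TypeFromEightByte(SvClass cls, unsigned sizeBytes)
{
    switch (cls)
    {
    case SvClass::IntegerReference:
        return JitType::Ref;                                    // GC reference, always 8 bytes
    case SvClass::Integer:
        return (sizeBytes <= 4) ? JitType::Int : JitType::Long; // payload size without padding
    case SvClass::SSE:
        return (sizeBytes <= 4) ? JitType::Float : JitType::Double;
    }
    return JitType::Unknown;
}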
+ bool isLateArg = (args->gtOp.gtOp1->gtFlags & GTF_LATE_ARG) != 0; + fgArgTabEntryPtr fgEntryPtr = gtArgEntryByNode(call, args->gtOp.gtOp1); + assert(fgEntryPtr != nullptr); + GenTreePtr argx = fgEntryPtr->node; + GenTreePtr lateList = nullptr; + GenTreePtr lateNode = nullptr; + + if (isLateArg) + { + for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext()) + { + assert(list->IsList()); + + GenTreePtr argNode = list->Current(); + if (argx == argNode) + { + lateList = list; + lateNode = argNode; + break; + } + } + assert(lateList != nullptr && lateNode != nullptr); + } + GenTreePtr arg = argx; + bool argListCreated = false; + + var_types type = arg->TypeGet(); + + if (type == TYP_STRUCT) + { + // If we have already processed the arg... + if (arg->OperGet() == GT_LIST && arg->TypeGet() == TYP_STRUCT) + { + continue; + } + + // If the arg is already a GT_LDOBJ, it is set up properly. + if (arg->OperGet() == GT_LDOBJ) + { + assert(!fgEntryPtr->structDesc.passedInRegisters); + continue; + } + + assert( + arg->OperGet() == GT_ADDR || + arg->OperGet() == GT_LCL_FLD || + arg->OperGet() == GT_LCL_VAR); + + assert( + arg->OperGet() == GT_LCL_VAR || + arg->OperGet() == GT_LCL_FLD || + arg->gtOp.gtOp1->OperGet() == GT_LCL_FLD || + arg->gtOp.gtOp1->OperGet() == GT_LCL_VAR); + + GenTreeLclVarCommon* lclCommon = arg->OperGet() == GT_ADDR ? + arg->gtOp.gtOp1->AsLclVarCommon() : arg->AsLclVarCommon(); + if (fgEntryPtr->structDesc.passedInRegisters) + { + if (fgEntryPtr->structDesc.eightByteCount == 1) + { + // Change the type; the code below will change the LclVar to a LCL_FLD. + type = GetTypeFromClassificationAndSizes(fgEntryPtr->structDesc.eightByteClassifications[0], fgEntryPtr->structDesc.eightByteSizes[0]); + } + else if (fgEntryPtr->structDesc.eightByteCount == 2) + { + // Create LCL_FLD for each eightbyte. + argListCreated = true; + + // Second eightbyte. + GenTreeLclFld* newLclField = new(this, GT_LCL_FLD) GenTreeLclFld( + GetTypeFromClassificationAndSizes( + fgEntryPtr->structDesc.eightByteClassifications[1], + fgEntryPtr->structDesc.eightByteSizes[1]), + lclCommon->gtLclNum, + fgEntryPtr->structDesc.eightByteOffsets[1]); + GenTreeArgList* secondNode = gtNewListNode(newLclField, nullptr); + secondNode->gtType = TYP_STRUCT; // Preserve the TYP_STRUCT. It is a special case. + newLclField->gtFieldSeq = FieldSeqStore::NotAField(); + + // First eightbyte. + arg->AsLclFld()->gtFieldSeq = FieldSeqStore::NotAField(); + arg->gtType = GetTypeFromClassificationAndSizes( + fgEntryPtr->structDesc.eightByteClassifications[0], + fgEntryPtr->structDesc.eightByteSizes[0]); + arg = gtNewListNode(arg, secondNode); + arg->gtType = TYP_STRUCT; // Preserve the TYP_STRUCT. It is a special case. + } + else + { + assert(false && "More than two eightbytes detected for CLR."); // No more than two eightbytes for the CLR. + } + } + + // If we didn't change the type of the struct, it means + // its classification doesn't allow it to be passed directly in a + // register, so we need to pass a pointer to the destination + // where we copied the struct to. + if (!argListCreated) + { + if (fgEntryPtr->structDesc.passedInRegisters) + { + arg->gtType = type; + } + else + { + arg->gtType = TYP_I_IMPL; + + // Make sure this is an addr node.
+ if (arg->OperGet() != GT_ADDR && arg->OperGet() != GT_LCL_VAR_ADDR) + { + arg = gtNewOperNode(GT_ADDR, TYP_I_IMPL, arg); + } + + assert(arg->OperGet() == GT_ADDR || arg->OperGet() == GT_LCL_VAR_ADDR); + + // Ldobj the temp to use it as a call argument + arg = new (this, GT_LDOBJ) GenTreeLdObj(TYP_STRUCT, arg, lvaGetStruct(lclCommon->gtLclNum)); + arg->gtFlags |= GTF_EXCEPT; + flagsSummary |= GTF_EXCEPT; + } + } + } + + if (argx != arg) + { + bool isLateArg = (args->gtOp.gtOp1->gtFlags & GTF_LATE_ARG) != 0; + fgArgTabEntryPtr fgEntryPtr = gtArgEntryByNode(call, args->gtOp.gtOp1); + assert(fgEntryPtr != nullptr); + GenTreePtr argx = fgEntryPtr->node; + GenTreePtr lateList = nullptr; + GenTreePtr lateNode = nullptr; + if (isLateArg) + { + for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext()) + { + assert(list->IsList()); + + GenTreePtr argNode = list->Current(); + if (argx == argNode) + { + lateList = list; + lateNode = argNode; + break; + } + } + assert(lateList != nullptr && lateNode != nullptr); + } + + fgEntryPtr->node = arg; + if (isLateArg) + { + lateList->gtOp.gtOp1 = arg; + } + else + { + args->gtOp.gtOp1 = arg; + } + } + } + } + + // Update the flags + call->gtFlags |= (flagsSummary & GTF_ALL_EFFECT); +} +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + // Make a copy of a struct variable if necessary, to pass to a callee. // returns: tree that computes address of the outgoing arg void -Compiler::fgMakeOutgoingStructArgCopy(GenTreeCall* call, GenTree* args, unsigned argIndex, CORINFO_CLASS_HANDLE copyBlkClass) +Compiler::fgMakeOutgoingStructArgCopy(GenTreeCall* call, + GenTree* args, + unsigned argIndex, + CORINFO_CLASS_HANDLE copyBlkClass + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* const structDescPtr)) { GenTree* argx = args->Current(); - noway_assert(argx->gtOper != GT_MKREFANY); - // See if we need to insert a copy at all // Case 1: don't need a copy if it is the last use of a local. We can't determine that all of the time // but if there is only one use and no loops, the use must be last. @@ -3616,8 +4111,6 @@ Compiler::fgMakeOutgoingStructArgCopy(GenTreeCall* call, GenTree* args, unsigned fgCurrentlyInUseArgTemps->setBit(tmp); - - // TYP_SIMD structs should not be enregistered, since ABI requires it to be // allocated on stack and address of it needs to be passed. if (lclVarIsSIMDType(tmp)) { @@ -3648,13 +4141,16 @@ Compiler::fgMakeOutgoingStructArgCopy(GenTreeCall* call, GenTree* args, unsigned #if FEATURE_FIXED_OUT_ARGS // Do the copy early, and evaluate the temp later (see EvalArgsToTemps) + // When on Unix, create a LCL_FLD for structs passed in more than one register.
See fgMakeTmpArgNode GenTreePtr arg = copyBlk; #else // FEATURE_FIXED_OUT_ARGS // Structs are always on the stack, and thus never need temps // so we have to put the copy and temp all into one expression - GenTreePtr arg = fgMakeTmpArgNode(tmp); + GenTreePtr arg = fgMakeTmpArgNode( + tmp + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(structDescPtr->passedInRegisters)); // Change the expression to "(tmp=val),tmp" arg = gtNewOperNode(GT_COMMA, arg->TypeGet(), copyBlk, arg); @@ -3718,30 +4214,60 @@ void Compiler::fgFixupStructReturn(GenTreePtr call) { bool callHasRetBuffArg = ((call->gtCall.gtCallMoreFlags & GTF_CALL_M_RETBUFFARG) != 0); +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; + if (!callHasRetBuffArg && call->TypeGet() == TYP_STRUCT && call->gtCall.gtRetClsHnd != NO_CLASS_HANDLE) + { + eeGetSystemVAmd64PassStructInRegisterDescriptor(GetStructClassHandle(call), &structDesc); + } +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + if (!callHasRetBuffArg && call->TypeGet() == TYP_STRUCT) { -#ifdef _TARGET_ARM_ +#if defined(_TARGET_ARM_) if (call->gtCall.IsVarargs() || !IsHfa(call)) -#endif +#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + if (!structDesc.passedInRegisters) +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) { // Now that we are past the importer, re-type this node so the register predictor does // the right thing call->gtType = genActualType((var_types)call->gtCall.gtReturnType); } +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + else + { + if (structDesc.passedInRegisters && structDesc.eightByteCount <= 1) + { + call->gtType = genActualType(getEightByteType(structDesc, 0)); + } + } +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) } - #ifdef _TARGET_ARM_ // Either we don't have a struct now or if struct, then it is HFA returned in regs. assert(call->TypeGet() != TYP_STRUCT || (IsHfa(call) && !callHasRetBuffArg)); #else +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // Either we don't have a struct now, or if we do, it is returned in registers or via a return buffer. + assert((call->TypeGet() != TYP_STRUCT) || + (structDesc.passedInRegisters) || + (callHasRetBuffArg)); +#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) // No more struct returns assert(call->TypeGet() != TYP_STRUCT); +#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) #endif +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // When a struct is returned in registers, there might still be a retbuf (homing space for the return) and the call keeps type TYP_STRUCT. + assert(!callHasRetBuffArg || (call->TypeGet() == TYP_VOID) || (call->TypeGet() == TYP_STRUCT)); +#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) // If it was a struct return, it has been transformed into a call // with a return buffer (that returns TYP_VOID) or into a return // of a primitive/enregisterable type assert(!callHasRetBuffArg || (call->TypeGet() == TYP_VOID)); +#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) } @@ -4698,7 +5224,6 @@ GenTreePtr Compiler::fgMorphField(GenTreePtr tree, MorphAddrContext* ma ); } #endif - if (fldOffset != 0) { // Generate the "addr" node. @@ -5180,6 +5705,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) } // Get the size of the struct and see if it is 1, 2, 4 or 8 bytes in size + // For Amd64-Unix the call below checks to see if the struct is register passable.
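For orientation: a register-returned struct uses RAX then RDX for its integer eightbytes and XMM0 then XMM1 for its SSE eightbytes, assigned in eightbyte order. This is the System V rule that fgFixupStructReturn above depends on, and that calldescrworkeramd64.S materializes later in this diff. A small sketch of the assignment, with illustrative names, assuming at most two eightbytes as in the CLR:

#include <cassert>

// Return the names of the registers carrying each eightbyte of a
// register-returned struct under the System V AMD64 convention.
void StructReturnRegisters(int eightByteCount, const bool isSSE[2],
                           const char* regs[2])
{
    assert(eightByteCount >= 1 && eightByteCount <= 2);
    const char* intRegs[] = { "rax", "rdx" };   // integer return registers, in order
    const char* sseRegs[] = { "xmm0", "xmm1" }; // SSE return registers, in order
    int intIdx = 0, sseIdx = 0;
    for (int i = 0; i < eightByteCount; i++)
    {
        regs[i] = isSSE[i] ? sseRegs[sseIdx++] : intRegs[intIdx++];
    }
}

// Example: a { long; double } struct returns its first eightbyte in rax and
// its second in xmm0, matching the "first integer, second SSE" case in the
// CallDescrWorker return path later in this diff.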
if (argx->OperGet() == GT_LDOBJ) { #ifdef _TARGET_AMD64_ @@ -5634,6 +6160,13 @@ GenTreePtr Compiler::fgMorphCall(GenTreeCall* call) call->gtCallMoreFlags &= ~GTF_CALL_M_IMPLICIT_TAILCALL; #endif +#ifdef FEATURE_PAL + if (!canFastTailCall && szFailReason == nullptr) + { + szFailReason = "Non fast tail calls disabled for PAL based systems."; + } +#endif // FEATURE_PAL + if (szFailReason != nullptr) { #ifdef DEBUG @@ -5659,13 +6192,6 @@ GenTreePtr Compiler::fgMorphCall(GenTreeCall* call) compCurBB->bbJumpKind = BBJ_RETURN; #endif -#ifdef FEATURE_PAL - if (!canFastTailCall) - { - goto NO_TAIL_CALL; - } -#endif // FEATURE_PAL - // Set this flag before calling fgMorphCall() to prevent inlining this call. call->gtCallMoreFlags |= GTF_CALL_M_TAILCALL; @@ -5847,6 +6373,13 @@ GenTreePtr Compiler::fgMorphCall(GenTreeCall* call) // This is a HFA, use float 0. callType = TYP_FLOAT; } +#elif defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // Return a dummy node, as the return is already removed. + if (callType == TYP_STRUCT) + { + // This is a register-returned struct. Return a 0. + callType = TYP_INT; + } #endif result = gtNewZeroConNode(genActualType(callType)); result = fgMorphTree(result); @@ -5990,7 +6523,6 @@ NO_TAIL_CALL: retValTmpNum = lvaGrabTemp(true DEBUGARG("substitute local for ret buff arg")); lvaSetStruct(retValTmpNum, structHnd, true); - dest = gtNewOperNode(GT_ADDR, TYP_BYREF, gtNewLclvNode(retValTmpNum, TYP_STRUCT)); } } @@ -6400,6 +6932,7 @@ ONE_SIMPLE_ASG: if (lclVarTree->TypeGet() == TYP_STRUCT && (lvaTable[lclNum].lvPromoted || lclVarIsSIMDType(lclNum))) { + // Let fgMorphInitBlock handle it. (Since we'll need to do field-var-wise assignments.) goto GENERAL_BLKOP; } @@ -7203,8 +7736,13 @@ GenTreePtr Compiler::fgMorphCopyBlock(GenTreePtr tree) { // Spill the (complex) address to a BYREF temp. // Note, at most one address may need to be spilled. - addrSpillTemp = lvaGrabTemp(true DEBUGARG("BlockOp address local")); +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + lvaTable[addrSpillTemp].lvType = TYP_I_IMPL; + + tree = gtNewAssignNode(gtNewLclvNode(addrSpillTemp, TYP_I_IMPL), + addrSpill); +#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING lvaTable[addrSpillTemp].lvType = TYP_BYREF; if (addrSpillIsStackDest) { @@ -7214,6 +7752,8 @@ GenTreePtr Compiler::fgMorphCopyBlock(GenTreePtr tree) tree = gtNewAssignNode(gtNewLclvNode(addrSpillTemp, TYP_BYREF), addrSpill); +#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING + #ifndef LEGACY_BACKEND // If we are assigning the address of a LclVar here // liveness does not account for this kind of address taken use. @@ -9529,7 +10069,7 @@ COMPARE: case GT_ADD: -CM_OVF_OP: + CM_OVF_OP : if (tree->gtOverflow()) { tree->gtRequestSetFlags(); @@ -10906,7 +11446,9 @@ ASG_OP: if (add->IsCnsIntOrI() && (op2->GetScaleIndexMul() != 0)) { if (tree->gtOverflow() || op1->gtOverflow()) + { break; + } ssize_t imul = op2->gtIntCon.gtIconVal; ssize_t iadd = add->gtIntCon.gtIconVal; @@ -12825,7 +13367,11 @@ void Compiler::fgMorphBlocks() //replace the GT_RETURN node to be a GT_ASG that stores the return value into genReturnLocal.
if (genReturnLocal != BAD_VAR_NUM) { +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + noway_assert(info.compRetType != TYP_VOID); +#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) noway_assert(info.compRetType != TYP_VOID && info.compRetNativeType != TYP_STRUCT); +#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) noway_assert(block->bbTreeList); GenTreePtr last = block->bbTreeList->gtPrev; @@ -13834,9 +14380,9 @@ void Compiler::fgPromoteStructs() break; } -#ifdef _TARGET_ARM_ +#if defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) if (!varDsc->lvDontPromote) -#endif // _TARGET_ARM_ +#endif // defined(_TARGET_ARM_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) { #ifdef FEATURE_SIMD if (varDsc->lvSIMDType && varDsc->lvUsedInSIMDIntrinsic) @@ -14154,6 +14700,8 @@ void Compiler::fgMarkImplicitByRefArgs() size = info.compCompHnd->getClassSize(typeHnd); } + +#if !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) #if defined(_TARGET_AMD64_) if (size > REGSIZE_BYTES || (size & (size - 1)) != 0) #elif defined(_TARGET_ARM64_) @@ -14184,6 +14732,7 @@ void Compiler::fgMarkImplicitByRefArgs() varDsc->lvKeepType = 1; #endif // DEBUG } +#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING } } diff --git a/src/jit/regalloc.cpp b/src/jit/regalloc.cpp index 839f497f4a..89945301f0 100644 --- a/src/jit/regalloc.cpp +++ b/src/jit/regalloc.cpp @@ -667,7 +667,7 @@ void Compiler::raSetupArgMasks(RegState *regState) #endif // LEGACY_BACKEND // The code to set the regState for each arg is outlined for shared use -// by linear scan +// by linear scan. (It is not shared for the System V AMD64 platform.) regNumber Compiler::raUpdateRegStateForArg(RegState *regState, LclVarDsc *argDsc) { regNumber inArgReg = argDsc->lvArgReg; diff --git a/src/jit/scopeinfo.cpp b/src/jit/scopeinfo.cpp index a108713792..53a5960967 100644 --- a/src/jit/scopeinfo.cpp +++ b/src/jit/scopeinfo.cpp @@ -909,21 +909,65 @@ void CodeGen::psiBegProlog() psiScope * newScope = psiNewPrologScope(varScope->vsdLVnum, varScope->vsdVarNum); - if (lclVarDsc1->lvIsRegArg) + if (lclVarDsc1->lvIsRegArg) { -#ifdef DEBUG - var_types regType = compiler->mangleVarArgsType(lclVarDsc1->TypeGet()); -#ifdef _TARGET_ARM_ - if (lclVarDsc1->lvIsHfaRegArg) + bool isStructHandled = false; +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; + if (lclVarDsc1->TypeGet() == TYP_STRUCT) { - regType = lclVarDsc1->GetHfaType(); + CORINFO_CLASS_HANDLE typeHnd = lclVarDsc1->lvVerTypeInfo.GetClassHandle(); + assert(typeHnd != nullptr); + compiler->eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc); + assert(structDesc.passedInRegisters); + + for (unsigned nCnt = 0; nCnt < structDesc.eightByteCount; nCnt++) + { + unsigned len = structDesc.eightByteSizes[nCnt]; + var_types regType = TYP_UNDEF; + regNumber regNum = REG_NA; + if (nCnt == 0) + { + regNum = lclVarDsc1->lvArgReg; + } + else if (nCnt == 1) + { + regNum = lclVarDsc1->lvOtherArgReg; + } + else + { + assert(false && "Invalid eightbyte number."); + } + + regType = compiler->getEightByteType(structDesc, nCnt); +#ifdef DEBUG + regType = compiler->mangleVarArgsType(regType); + assert(genMapRegNumToRegArgNum(regNum, regType) != (unsigned)-1); +#endif // DEBUG + + newScope->scRegister = true; + newScope->u1.scRegNum = (regNumberSmall)regNum; + } + + isStructHandled = true; } +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + if (!isStructHandled) + { +#ifdef DEBUG + var_types regType = compiler->mangleVarArgsType(lclVarDsc1->TypeGet());
+#ifdef _TARGET_ARM_ + if (lclVarDsc1->lvIsHfaRegArg) + { + regType = lclVarDsc1->GetHfaType(); + } #endif // _TARGET_ARM_ - assert(genMapRegNumToRegArgNum(lclVarDsc1->lvArgReg, regType) != (unsigned)-1); + assert(genMapRegNumToRegArgNum(lclVarDsc1->lvArgReg, regType) != (unsigned)-1); #endif // DEBUG - newScope->scRegister = true; - newScope->u1.scRegNum = (regNumberSmall) lclVarDsc1->lvArgReg; + newScope->scRegister = true; + newScope->u1.scRegNum = (regNumberSmall)lclVarDsc1->lvArgReg; + } } else { diff --git a/src/jit/target.h b/src/jit/target.h index f4aad4e153..767eb31d8d 100644 --- a/src/jit/target.h +++ b/src/jit/target.h @@ -19,6 +19,12 @@ #endif #endif +#if (defined(FEATURE_CORECLR) && defined(PLATFORM_UNIX)) +#define FEATURE_VARARG 0 +#else // !(defined(FEATURE_CORECLR) && defined(PLATFORM_UNIX)) +#define FEATURE_VARARG 1 +#endif // !(defined(FEATURE_CORECLR) && defined(PLATFORM_UNIX)) + /*****************************************************************************/ // The following are intended to capture only those #defines that cannot be replaced // with static const members of Target @@ -971,10 +977,28 @@ typedef unsigned short regPairNoSmall; // arm: need 12 bits #define REG_LNGRET REG_EAX #define RBM_LNGRET RBM_EAX +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + #define REG_INTRET_1 REG_RDX + #define RBM_INTRET_1 RBM_RDX + + #define REG_LNGRET_1 REG_RDX + #define RBM_LNGRET_1 RBM_RDX +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + + #define REG_FLOATRET REG_XMM0 #define RBM_FLOATRET RBM_XMM0 + #define REG_DOUBLERET REG_XMM0 #define RBM_DOUBLERET RBM_XMM0 +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING +#define REG_FLOATRET_1 REG_XMM1 +#define RBM_FLOATRET_1 RBM_XMM1 + +#define REG_DOUBLERET_1 REG_XMM1 +#define RBM_DOUBLERET_1 RBM_XMM1 +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + #define REG_FPBASE REG_EBP #define RBM_FPBASE RBM_EBP #define STR_FPBASE "rbp" @@ -1872,7 +1896,7 @@ extern const regMaskSmall regMasks[REG_COUNT]; inline regMaskTP genRegMask(regNumber reg) { assert((unsigned)reg < ArrLen(regMasks)); -#if defined _TARGET_AMD64_ +#ifdef _TARGET_AMD64_ // shift is faster than a L1 hit on modern x86 // (L1 latency on sandy bridge is 4 cycles for [base] and 5 for [base + index*c] ) // the reason this is AMD-only is because the x86 BE will try to get reg masks for REG_STK diff --git a/src/pal/src/cruntime/printfcpp.cpp b/src/pal/src/cruntime/printfcpp.cpp index 87cd8a8aff..8adf3470c2 100644 --- a/src/pal/src/cruntime/printfcpp.cpp +++ b/src/pal/src/cruntime/printfcpp.cpp @@ -2306,7 +2306,7 @@ int CoreVfprintf(CPalThread *pthrCurrent, PAL_FILE *stream, const char *format, if (!Length) { ASSERT("WideCharToMultiByte failed. 
Error is %d\n", - GetLastError()); + GetLastError()); PERF_EXIT(vfprintf); va_end(ap); return -1; diff --git a/src/vm/amd64/calldescrworkeramd64.S b/src/vm/amd64/calldescrworkeramd64.S index efee6f325a..ca4fd703c6 100644 --- a/src/vm/amd64/calldescrworkeramd64.S +++ b/src/vm/amd64/calldescrworkeramd64.S @@ -108,11 +108,43 @@ LOCAL_LABEL(NoFloatArguments): je LOCAL_LABEL(ReturnsFloat) cmp ecx, 8 je LOCAL_LABEL(ReturnsDouble) - // unexpected + +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // Struct with two integer eightbytes + cmp ecx, 16 + jne LOCAL_LABEL(NotTwoIntegerEightbytes) + mov qword ptr [rbx+CallDescrData__returnValue], rax + mov qword ptr [rbx+CallDescrData__returnValue + 8], rdx + jmp LOCAL_LABEL(Epilog) + +LOCAL_LABEL(NotTwoIntegerEightbytes): + // Struct with the first eightbyte SSE and the second one integer + cmp ecx, 16 + 1 + jne LOCAL_LABEL(NotFirstSSESecondIntegerEightbyte) + movsd real8 ptr [rbx+CallDescrData__returnValue], xmm0 + mov qword ptr [rbx+CallDescrData__returnValue + 8], rax + jmp LOCAL_LABEL(Epilog) + +LOCAL_LABEL(NotFirstSSESecondIntegerEightbyte): + // Struct with the first eightbyte integer and the second one SSE + cmp ecx, 16 + 2 + jne LOCAL_LABEL(NotFirstIntegerSecondSSEEightbyte) + mov qword ptr [rbx+CallDescrData__returnValue], rax + movsd real8 ptr [rbx+CallDescrData__returnValue + 8], xmm0 + jmp LOCAL_LABEL(Epilog) + +LOCAL_LABEL(NotFirstIntegerSecondSSEEightbyte): + // Struct with two SSE eightbytes + cmp ecx, 16 + 3 + jne LOCAL_LABEL(Epilog) // unexpected + movsd real8 ptr [rbx+CallDescrData__returnValue], xmm0 + movsd real8 ptr [rbx+CallDescrData__returnValue + 8], xmm1 +#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING + jmp LOCAL_LABEL(Epilog) LOCAL_LABEL(ReturnsInt): - mov [rbx+CallDescrData__returnValue], rax + mov qword ptr [rbx+CallDescrData__returnValue], rax LOCAL_LABEL(Epilog): lea rsp, [rbp - 8] // deallocate arguments diff --git a/src/vm/amd64/cgenamd64.cpp b/src/vm/amd64/cgenamd64.cpp index e9c1ad468b..51738684ad 100644 --- a/src/vm/amd64/cgenamd64.cpp +++ b/src/vm/amd64/cgenamd64.cpp @@ -323,8 +323,16 @@ void HijackFrame::UpdateRegDisplay(const PREGDISPLAY pRD) UpdateRegDisplayFromCalleeSavedRegisters(pRD, &(m_Args->Regs)); +#ifdef UNIX_AMD64_ABI + pRD->pCurrentContextPointers->Rsi = NULL; + pRD->pCurrentContextPointers->Rdi = NULL; +#endif pRD->pCurrentContextPointers->Rcx = NULL; +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + pRD->pCurrentContextPointers->Rdx = (PULONG64)&m_Args->Rdx; +#else // FEATURE_UNIX_AMD64_STRUCT_PASSING pRD->pCurrentContextPointers->Rdx = NULL; +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING pRD->pCurrentContextPointers->R8 = NULL; pRD->pCurrentContextPointers->R9 = NULL; pRD->pCurrentContextPointers->R10 = NULL; diff --git a/src/vm/amd64/cgencpu.h b/src/vm/amd64/cgencpu.h index 39b8ba91de..de64b1600b 100644 --- a/src/vm/amd64/cgencpu.h +++ b/src/vm/amd64/cgencpu.h @@ -66,14 +66,15 @@ EXTERN_C void FastCallFinalizeWorker(Object *obj, PCODE funcPtr); #define CACHE_LINE_SIZE 64 // Current AMD64 processors have 64-byte cache lines as per AMD64 optmization manual #define LOG2SLOT LOG2_PTRSIZE -#define ENREGISTERED_RETURNTYPE_MAXSIZE 8 // bytes #define ENREGISTERED_RETURNTYPE_INTEGER_MAXSIZE 8 // bytes #define ENREGISTERED_PARAMTYPE_MAXSIZE 8 // bytes #ifdef UNIX_AMD64_ABI -#define CALLDESCR_ARGREGS 1 // CallDescrWorker has ArgumentRegister parameter -#define CALLDESCR_FPARGREGS 1 // CallDescrWorker has FloatArgumentRegisters parameter +#define ENREGISTERED_RETURNTYPE_MAXSIZE 
16 // bytes +#define CALLDESCR_ARGREGS 1 // CallDescrWorker has ArgumentRegister parameter +#define CALLDESCR_FPARGREGS 1 // CallDescrWorker has FloatArgumentRegisters parameter #else +#define ENREGISTERED_RETURNTYPE_MAXSIZE 8 // bytes #define COM_STUBS_SEPARATE_FP_LOCATIONS #define CALLDESCR_REGTYPEMAP 1 #endif @@ -265,9 +266,11 @@ struct CalleeSavedRegistersPointers { #ifdef UNIX_AMD64_ABI +#define NUM_FLOAT_ARGUMENT_REGISTERS 8 + typedef DPTR(struct FloatArgumentRegisters) PTR_FloatArgumentRegisters; struct FloatArgumentRegisters { - M128A d[8]; // xmm0-xmm7 + M128A d[NUM_FLOAT_ARGUMENT_REGISTERS]; // xmm0-xmm7 }; #endif @@ -475,11 +478,23 @@ struct DECLSPEC_ALIGN(8) UMEntryThunkCode struct HijackArgs { +#ifndef PLATFORM_UNIX union { ULONG64 Rax; ULONG64 ReturnValue; }; +#else // PLATFORM_UNIX + union + { + struct + { + ULONG64 Rax; + ULONG64 Rdx; + }; + ULONG64 ReturnValue[2]; + }; +#endif // PLATFORM_UNIX CalleeSavedRegisters Regs; union { diff --git a/src/vm/amd64/unixasmhelpers.S b/src/vm/amd64/unixasmhelpers.S index 21a8f63232..058a69a382 100644 --- a/src/vm/amd64/unixasmhelpers.S +++ b/src/vm/amd64/unixasmhelpers.S @@ -184,12 +184,13 @@ NESTED_ENTRY OnHijackScalarTripThread, _TEXT, NoHandler PUSH_CALLEE_SAVED_REGISTERS + push_register rdx // Push rax again - this is where integer/pointer return values are returned push_register rax mov rdi, rsp - alloc_stack 0x20 + alloc_stack 0x28 // First float return register movdqa [rsp], xmm0 @@ -202,14 +203,55 @@ NESTED_ENTRY OnHijackScalarTripThread, _TEXT, NoHandler movdqa xmm0, [rsp] movdqa xmm1, [rsp+0x10] - free_stack 0x20 + free_stack 0x28 pop_register rax + pop_register rdx POP_CALLEE_SAVED_REGISTERS ret NESTED_END OnHijackScalarTripThread, _TEXT +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING +//------------------------------------------------ +// OnHijackStructInRegsTripThread +// +NESTED_ENTRY OnHijackStructInRegsTripThread, _TEXT, NoHandler + + // Make room for the real return address (rip) + push_register rax + + PUSH_CALLEE_SAVED_REGISTERS + + push_register rdx + // Push rax again - this is where part of the struct gets returned + push_register rax + + mov rdi, rsp + + alloc_stack 0x28 + + // First float return register + movdqa [rsp], xmm0 + // Second float return register + movdqa [rsp+0x10], xmm1 + + END_PROLOGUE + + call C_FUNC(OnHijackStructInRegsWorker) + + movdqa xmm0, [rsp] + movdqa xmm1, [rsp+0x10] + free_stack 0x28 + pop_register rax + pop_register rdx + + POP_CALLEE_SAVED_REGISTERS + ret + +NESTED_END OnHijackStructInRegsTripThread, _TEXT +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + //------------------------------------------------ // OnHijackObjectTripThread // @@ -220,16 +262,22 @@ NESTED_ENTRY OnHijackObjectTripThread, _TEXT, NoHandler PUSH_CALLEE_SAVED_REGISTERS + push_register rdx // Push rax again - this is where integer/pointer return values are returned push_register rax mov rdi, rsp + // align stack + alloc_stack 0x8 + END_PROLOGUE call C_FUNC(OnHijackObjectWorker) + free_stack 0x8 pop_register rax + pop_register rdx POP_CALLEE_SAVED_REGISTERS ret @@ -246,16 +294,22 @@ NESTED_ENTRY OnHijackInteriorPointerTripThread, _TEXT, NoHandler PUSH_CALLEE_SAVED_REGISTERS + push_register rdx // Push rax again - this is where integer/pointer return values are returned push_register rax mov rdi, rsp + // align stack + alloc_stack 0x8 + END_PROLOGUE call C_FUNC(OnHijackInteriorPointerWorker) + free_stack 0x8 pop_register rax + pop_register rdx POP_CALLEE_SAVED_REGISTERS ret diff --git a/src/vm/argdestination.h 
b/src/vm/argdestination.h new file mode 100644 index 0000000000..5896414f35 --- /dev/null +++ b/src/vm/argdestination.h @@ -0,0 +1,217 @@ +// +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. +// +// + +#ifndef __ARGDESTINATION_H__ +#define __ARGDESTINATION_H__ + +// The ArgDestination class represents a destination location of an argument. +class ArgDestination +{ + // Base address to which the m_offset is applied to get the actual argument location. + PTR_VOID m_base; + // Offset of the argument relative to the m_base. On AMD64 on Unix, it can have a special + // value that represents a struct that contains both general purpose and floating point fields + // passed in registers. + int m_offset; + // For structs passed in registers, this member points to an ArgLocDesc that contains + // details on the layout of the struct in general purpose and floating point registers. + ArgLocDesc* m_argLocDescForStructInRegs; + +public: + + // Construct the ArgDestination + ArgDestination(PTR_VOID base, int offset, ArgLocDesc* argLocDescForStructInRegs) + : m_base(base), + m_offset(offset), + m_argLocDescForStructInRegs(argLocDescForStructInRegs) + { + LIMITED_METHOD_CONTRACT; +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + _ASSERTE((argLocDescForStructInRegs != NULL) || (offset != TransitionBlock::StructInRegsOffset)); +#else + _ASSERTE(argLocDescForStructInRegs == NULL); +#endif + } + + // Get argument destination address for arguments that are not structs passed in registers. + PTR_VOID GetDestinationAddress() + { + LIMITED_METHOD_CONTRACT; + return dac_cast<PTR_VOID>(dac_cast<TADDR>(m_base) + m_offset); + } + +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + + // Returns true if the ArgDestination represents a struct passed in registers. + bool IsStructPassedInRegs() + { + LIMITED_METHOD_CONTRACT; + return m_offset == TransitionBlock::StructInRegsOffset; + } + + // Get destination address for floating point fields of a struct passed in registers. + PTR_VOID GetStructFloatRegDestinationAddress() + { + LIMITED_METHOD_CONTRACT; + _ASSERTE(IsStructPassedInRegs()); + int offset = TransitionBlock::GetOffsetOfFloatArgumentRegisters() + m_argLocDescForStructInRegs->m_idxFloatReg * 8; + return dac_cast<PTR_VOID>(dac_cast<TADDR>(m_base) + offset); + } + + // Get destination address for non-floating point fields of a struct passed in registers. + PTR_VOID GetStructGenRegDestinationAddress() + { + LIMITED_METHOD_CONTRACT; + _ASSERTE(IsStructPassedInRegs()); + int offset = TransitionBlock::GetOffsetOfArgumentRegisters() + m_argLocDescForStructInRegs->m_idxGenReg * 8; + return dac_cast<PTR_VOID>(dac_cast<TADDR>(m_base) + offset); + } + +#ifndef DACCESS_COMPILE + // Zero struct argument stored in registers described by the current ArgDestination. + // Arguments: + // fieldBytes - size of the structure + void ZeroStructInRegisters(int fieldBytes) + { + STATIC_CONTRACT_NOTHROW; + STATIC_CONTRACT_GC_NOTRIGGER; + STATIC_CONTRACT_FORBID_FAULT; + STATIC_CONTRACT_MODE_COOPERATIVE; + + // To zero the struct, we create a zero-filled array of large enough size and + // then copy it to the registers. It is implemented this way to keep the complexity + // of dealing with the eightbyte classification in a single function. + // This function is used rarely and so the overhead of reading the zeros from + // the stack is negligible.
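Before the method bodies below, a hypothetical caller-side sketch of how this new ArgDestination API is meant to be used. The struct Point and the call site are invented for illustration, but IsStructPassedInRegs, CopyStructToRegisters, and GetDestinationAddress are the methods introduced by this file:

#include <cstring>

// Hypothetical example type: classifies as one INTEGER eightbyte (x)
// followed by one SSE eightbyte (y).
struct Point { long long x; double y; };

void PassPointArg(ArgDestination& dest, Point& value)
{
    if (dest.IsStructPassedInRegs())
    {
        // x lands in the next general purpose argument register and y in the
        // next XMM register, per the eightbyte classification held in the
        // ArgLocDesc this destination was constructed with.
        dest.CopyStructToRegisters(&value, sizeof(Point), 0);
    }
    else
    {
        // Stack destination: a plain copy to the computed address suffices.
        memcpy(dest.GetDestinationAddress(), &value, sizeof(Point));
    }
}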
+ long long zeros[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS] = {}; + _ASSERTE(sizeof(zeros) >= fieldBytes); + + CopyStructToRegisters(zeros, fieldBytes, 0); + } + + // Copy struct argument into registers described by the current ArgDestination. + // Arguments: + // src = source data of the structure + // fieldBytes - size of the structure + // destOffset - nonzero when copying values into Nullable<T>, it is the offset + // of the T value inside of the Nullable<T> + void CopyStructToRegisters(void *src, int fieldBytes, int destOffset) + { + STATIC_CONTRACT_NOTHROW; + STATIC_CONTRACT_GC_NOTRIGGER; + STATIC_CONTRACT_FORBID_FAULT; + STATIC_CONTRACT_MODE_COOPERATIVE; + + _ASSERTE(IsStructPassedInRegs()); + + BYTE* genRegDest = (BYTE*)GetStructGenRegDestinationAddress() + destOffset; + BYTE* floatRegDest = (BYTE*)GetStructFloatRegDestinationAddress(); + INDEBUG(int remainingBytes = fieldBytes;) + + EEClass* eeClass = m_argLocDescForStructInRegs->m_eeClass; + _ASSERTE(eeClass != NULL); + + // We start at the first eightByte that the destOffset didn't skip completely. + for (int i = destOffset / 8; i < eeClass->GetNumberEightBytes(); i++) + { + int eightByteSize = eeClass->GetEightByteSize(i); + SystemVClassificationType eightByteClassification = eeClass->GetEightByteClassification(i); + + // Adjust the size of the first eightByte by the destOffset + eightByteSize -= (destOffset & 7); + destOffset = 0; + + _ASSERTE(remainingBytes >= eightByteSize); + + if (eightByteClassification == SystemVClassificationTypeSSE) + { + if (eightByteSize == 8) + { + *(UINT64*)floatRegDest = *(UINT64*)src; + } + else + { + _ASSERTE(eightByteSize == 4); + *(UINT32*)floatRegDest = *(UINT32*)src; + } + floatRegDest += 8; + } + else + { + if (eightByteSize == 8) + { + _ASSERTE((eightByteClassification == SystemVClassificationTypeInteger) || + (eightByteClassification == SystemVClassificationTypeIntegerReference)); + + _ASSERTE(IS_ALIGNED((SIZE_T)genRegDest, 8)); + *(UINT64*)genRegDest = *(UINT64*)src; + } + else + { + _ASSERTE(eightByteClassification == SystemVClassificationTypeInteger); + memcpyNoGCRefs(genRegDest, src, eightByteSize); + } + + genRegDest += eightByteSize; + } + + src = (BYTE*)src + eightByteSize; + INDEBUG(remainingBytes -= eightByteSize;) + } + + _ASSERTE(remainingBytes == 0); + } + +#endif //DACCESS_COMPILE + + // Report managed object pointers in the struct in registers + // Arguments: + // fn - promotion function to apply to each managed object pointer + // sc - scan context to pass to the promotion function + // fieldBytes - size of the structure + void ReportPointersFromStructInRegisters(promote_func *fn, ScanContext *sc, int fieldBytes) + { + LIMITED_METHOD_CONTRACT; + + _ASSERTE(IsStructPassedInRegs()); + + TADDR genRegDest = dac_cast<TADDR>(GetStructGenRegDestinationAddress()); + INDEBUG(int remainingBytes = fieldBytes;) + + EEClass* eeClass = m_argLocDescForStructInRegs->m_eeClass; + _ASSERTE(eeClass != NULL); + + for (int i = 0; i < eeClass->GetNumberEightBytes(); i++) + { + int eightByteSize = eeClass->GetEightByteSize(i); + SystemVClassificationType eightByteClassification = eeClass->GetEightByteClassification(i); + + _ASSERTE(remainingBytes >= eightByteSize); + + if (eightByteClassification != SystemVClassificationTypeSSE) + { + if (eightByteClassification == SystemVClassificationTypeIntegerReference) + { + _ASSERTE(eightByteSize == 8); + _ASSERTE(IS_ALIGNED((SIZE_T)genRegDest, 8)); + + (*fn)(dac_cast<PTR_PTR_Object>(genRegDest), sc, 0); + } + + genRegDest += eightByteSize; + } 
+ + INDEBUG(remainingBytes -= eightByteSize;) + } + + _ASSERTE(remainingBytes == 0); + } + +#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING + +}; + +#endif // __ARGDESTINATION_H__ diff --git a/src/vm/arm/stubs.cpp b/src/vm/arm/stubs.cpp index 368e6cf810..342c73b0d0 100644 --- a/src/vm/arm/stubs.cpp +++ b/src/vm/arm/stubs.cpp @@ -1052,7 +1052,7 @@ void DispatchHolder::Initialize(PCODE implTarget, PCODE failTarget, size_t expe // nop - insert padding _stub._entryPoint[n++] = 0xbf00; - + _ASSERTE(n == DispatchStub::entryPointLen); // Make sure that the data members below are aligned diff --git a/src/vm/callhelpers.cpp b/src/vm/callhelpers.cpp index a910c0ea30..137dbb8656 100644 --- a/src/vm/callhelpers.cpp +++ b/src/vm/callhelpers.cpp @@ -401,7 +401,7 @@ ARG_SLOT MethodDescCallSite::CallTargetWorker(const ARG_SLOT *pArguments) // Record this call if required g_IBCLogger.LogMethodDescAccess(m_pMD); - // + // // All types must already be loaded. This macro also sets up a FAULT_FORBID region which is // also required for critical calls since we cannot inject any failure points between the // caller of MethodDesc::CallDescr and the actual transition to managed code. @@ -537,9 +537,12 @@ ARG_SLOT MethodDescCallSite::CallTargetWorker(const ARG_SLOT *pArguments) // have at least one such argument we point the call worker at the floating point area of the // frame (we leave it null otherwise since the worker can perform a useful optimization if it // knows no floating point registers need to be set up). - if ((ofs < 0) && (pFloatArgumentRegisters == NULL)) + if (TransitionBlock::HasFloatRegister(ofs, m_argIt.GetArgLocDescForStructInRegs()) && + (pFloatArgumentRegisters == NULL)) + { pFloatArgumentRegisters = (FloatArgumentRegisters*)(pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters()); + } #endif #if CHECK_APP_DOMAIN_LEAKS @@ -553,6 +556,9 @@ ARG_SLOT MethodDescCallSite::CallTargetWorker(const ARG_SLOT *pArguments) } #endif // CHECK_APP_DOMAIN_LEAKS +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + _ASSERTE(ofs != TransitionBlock::StructInRegsOffset); +#endif PVOID pDest = pTransitionBlock + ofs; UINT32 stackSize = m_argIt.GetArgSize(); diff --git a/src/vm/callingconvention.h b/src/vm/callingconvention.h index 244a3df878..490ae3ce87 100644 --- a/src/vm/callingconvention.h +++ b/src/vm/callingconvention.h @@ -42,6 +42,12 @@ struct ArgLocDesc int m_idxStack; // First stack slot used (or -1) int m_cStack; // Count of stack slots used (or 0) +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + + EEClass* m_eeClass; // For structs passed in register, it points to the EEClass of the struct + +#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING + #if defined(_TARGET_ARM_) BOOL m_fRequires64BitAlignment; // True if the argument should always be aligned (in registers or on the stack #endif @@ -63,6 +69,9 @@ struct ArgLocDesc #if defined(_TARGET_ARM_) m_fRequires64BitAlignment = FALSE; #endif +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + m_eeClass = NULL; +#endif } }; @@ -138,9 +147,13 @@ struct TransitionBlock { LIMITED_METHOD_CONTRACT; +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + return offset >= sizeof(TransitionBlock); +#else int ofsArgRegs = GetOffsetOfArgumentRegisters(); return offset >= (int) (ofsArgRegs + ARGUMENTREGISTERS_SIZE); +#endif } static BOOL IsArgumentRegisterOffset(int offset) @@ -156,14 +169,45 @@ struct TransitionBlock static UINT 
GetArgumentIndexFromOffset(int offset) { LIMITED_METHOD_CONTRACT; + +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + _ASSERTE(offset != TransitionBlock::StructInRegsOffset); +#endif return (offset - GetOffsetOfArgumentRegisters()) / sizeof(TADDR); } + + static UINT GetStackArgumentIndexFromOffset(int offset) + { + LIMITED_METHOD_CONTRACT; + + return (offset - TransitionBlock::GetOffsetOfArgs()) / STACK_ELEM_SIZE; + } + #endif #ifdef CALLDESCR_FPARGREGS static BOOL IsFloatArgumentRegisterOffset(int offset) { LIMITED_METHOD_CONTRACT; +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + return (offset != TransitionBlock::StructInRegsOffset) && (offset < 0); +#else + return offset < 0; +#endif + } + + // Check if an argument uses a floating point register, that is, whether it is + // either a floating point argument or a struct passed in registers that + // has a floating point member. + static BOOL HasFloatRegister(int offset, ArgLocDesc* argLocDescForStructInRegs) + { + LIMITED_METHOD_CONTRACT; + #if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + if (offset == TransitionBlock::StructInRegsOffset) + { + return argLocDescForStructInRegs->m_cFloatReg > 0; + } + #endif return offset < 0; } @@ -172,7 +216,7 @@ struct TransitionBlock LIMITED_METHOD_CONTRACT; return -GetNegSpaceSize(); } -#endif +#endif // CALLDESCR_FPARGREGS static int GetOffsetOfCalleeSavedRegisters() { @@ -194,6 +238,11 @@ struct TransitionBlock } static const int InvalidOffset = -1; +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // Special offset value to represent a struct passed in registers. Such a struct can span both + // general purpose and floating point registers, so it can have two different offsets. + static const int StructInRegsOffset = -2; +#endif }; //----------------------------------------------------------------------- @@ -340,11 +389,16 @@ public: { LIMITED_METHOD_CONTRACT; +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + // No arguments are passed by reference on AMD64 on Unix + return FALSE; +#else // If the size is bigger than ENREGISTERED_PARAM_TYPE_MAXSIZE, or if the size is NOT a power of 2, then // the argument is passed by reference. return (size > ENREGISTERED_PARAMTYPE_MAXSIZE) || ((size & (size-1)) != 0); +#endif } -#endif +#endif // _TARGET_AMD64_ // This overload should be used for varargs only. static BOOL IsVarArgPassedByRef(size_t size) { LIMITED_METHOD_CONTRACT; #ifdef _TARGET_AMD64_ +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + PORTABILITY_ASSERT("ArgIteratorTemplate::IsVarArgPassedByRef"); + return FALSE; +#else // FEATURE_UNIX_AMD64_STRUCT_PASSING return IsArgPassedByRef(size); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + #else return (size > ENREGISTERED_PARAMTYPE_MAXSIZE); #endif @@ -426,6 +486,15 @@ public: void GetVASigCookieLoc(ArgLocDesc * pLoc) { WRAPPER_NO_CONTRACT; GetSimpleLoc(GetVASigCookieOffset(), pLoc); } #endif // !_TARGET_X86_ + ArgLocDesc* GetArgLocDescForStructInRegs() + { +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + return m_hasArgLocDescForStructInRegs ? &m_argLocDescForStructInRegs : NULL; +#else + return NULL; +#endif + } + #ifdef _TARGET_ARM_ // Get layout information for the argument that the ArgIterator is currently visiting.
void GetArgLoc(int argOffset, ArgLocDesc *pLoc) { @@ -463,7 +532,7 @@ public: } else { - pLoc->m_idxStack = TransitionBlock::GetArgumentIndexFromOffset(argOffset) - 4; + pLoc->m_idxStack = TransitionBlock::GetStackArgumentIndexFromOffset(argOffset); pLoc->m_cStack = cSlots; } } @@ -509,7 +578,7 @@ public: } else { - pLoc->m_idxStack = TransitionBlock::GetArgumentIndexFromOffset(argOffset) - 8; + pLoc->m_idxStack = TransitionBlock::GetStackArgumentIndexFromOffset(argOffset); pLoc->m_cStack = cSlots; } } #if defined(_TARGET_AMD64_) && defined(UNIX_AMD64_ABI) // Get layout information for the argument that the ArgIterator is currently visiting. - void GetArgLoc(int argOffset, ArgLocDesc *pLoc) + void GetArgLoc(int argOffset, ArgLocDesc* pLoc) { LIMITED_METHOD_CONTRACT; +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + if (m_hasArgLocDescForStructInRegs) + { + *pLoc = m_argLocDescForStructInRegs; + return; + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + + if (argOffset == TransitionBlock::StructInRegsOffset) + { + // We already have the argLocDesc for structs passed in registers; we + // compute it in GetNextOffset, since it is always needed there. + _ASSERTE(false); + return; + } + pLoc->Init(); if (TransitionBlock::IsFloatArgumentRegisterOffset(argOffset)) { // Dividing by 8 as size of each register in FloatArgumentRegisters is 8 bytes. pLoc->m_idxFloatReg = (argOffset - TransitionBlock::GetOffsetOfFloatArgumentRegisters()) / 8; - - // UNIXTODO: Passing of structs, HFAs. For now, use the Windows convention. pLoc->m_cFloatReg = 1; - return; } - - // UNIXTODO: Passing of structs, HFAs. For now, use the Windows convention. - int cSlots = 1; - - if (!TransitionBlock::IsStackArgumentOffset(argOffset)) + else if (!TransitionBlock::IsStackArgumentOffset(argOffset)) { pLoc->m_idxGenReg = TransitionBlock::GetArgumentIndexFromOffset(argOffset); - pLoc->m_cGenReg = cSlots; - } + pLoc->m_cGenReg = 1; + } else { - pLoc->m_idxStack = (argOffset - TransitionBlock::GetOffsetOfArgs()) / 8; - pLoc->m_cStack = cSlots; + pLoc->m_idxStack = TransitionBlock::GetStackArgumentIndexFromOffset(argOffset); + pLoc->m_cStack = (GetArgSize() + STACK_ELEM_SIZE - 1) / STACK_ELEM_SIZE; } } -#endif // _TARGET_ARM64_ && UNIX_AMD64_ABI +#endif // _TARGET_AMD64_ && UNIX_AMD64_ABI protected: DWORD m_dwFlags; // Cached flags @@ -559,6 +637,10 @@ protected: CorElementType m_argType; int m_argSize; TypeHandle m_argTypeHandle; +#if defined(_TARGET_AMD64_) && defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + ArgLocDesc m_argLocDescForStructInRegs; + bool m_hasArgLocDescForStructInRegs; +#endif // _TARGET_AMD64_ && UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING #ifdef _TARGET_X86_ int m_curOfs; // Current position of the stack iterator #endif #ifdef _TARGET_AMD64_ #ifdef UNIX_AMD64_ABI - int m_idxGenReg; - int m_idxStack; - int m_idxFPReg; + int m_idxGenReg; // Next general register to be assigned a value + int m_idxStack; // Next stack slot to be assigned a value + int m_idxFPReg; // Next floating point register to be assigned a value +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + bool m_fArgInRegisters; // Indicates that the current argument is stored in registers +#endif #else int m_curOfs; // Current position of the stack iterator #endif @@ -843,6 +928,10 @@ int ArgIteratorTemplate<ARGITERATOR_BASE>::GetNextOffset() m_argSize = argSize; m_argTypeHandle = thValueType; +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) +
m_hasArgLocDescForStructInRegs = false; +#endif + #ifdef _TARGET_X86_ #ifdef FEATURE_INTERPRETER if (m_fUnmanagedCallConv) @@ -862,7 +951,12 @@ int ArgIteratorTemplate<ARGITERATOR_BASE>::GetNextOffset() return m_curOfs; #elif defined(_TARGET_AMD64_) #ifdef UNIX_AMD64_ABI + + m_fArgInRegisters = true; + int cFPRegs = 0; + int cbArg = StackElemSize(argSize); + int cGenRegs = cbArg / 8; // GP reg size switch (argType) { @@ -879,8 +973,56 @@ int ArgIteratorTemplate<ARGITERATOR_BASE>::GetNextOffset() case ELEMENT_TYPE_VALUETYPE: { - // UNIXTODO: Passing of structs, HFAs. For now, use the Windows convention. - argSize = sizeof(TADDR); +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + MethodTable *pMT = m_argTypeHandle.AsMethodTable(); + if (pMT->IsRegPassedStruct()) + { + EEClass* eeClass = pMT->GetClass(); + cGenRegs = 0; + for (int i = 0; i < eeClass->GetNumberEightBytes(); i++) + { + switch (eeClass->GetEightByteClassification(i)) + { + case SystemVClassificationTypeInteger: + case SystemVClassificationTypeIntegerReference: + cGenRegs++; + break; + case SystemVClassificationTypeSSE: + cFPRegs++; + break; + default: + _ASSERTE(false); + break; + } + } + + // Check if we have enough registers available for the struct passing + if ((cFPRegs + m_idxFPReg <= NUM_FLOAT_ARGUMENT_REGISTERS) && (cGenRegs + m_idxGenReg) <= NUM_ARGUMENT_REGISTERS) + { + m_argLocDescForStructInRegs.Init(); + m_argLocDescForStructInRegs.m_cGenReg = cGenRegs; + m_argLocDescForStructInRegs.m_cFloatReg = cFPRegs; + m_argLocDescForStructInRegs.m_idxGenReg = m_idxGenReg; + m_argLocDescForStructInRegs.m_idxFloatReg = m_idxFPReg; + m_argLocDescForStructInRegs.m_eeClass = eeClass; + + m_hasArgLocDescForStructInRegs = true; + + m_idxGenReg += cGenRegs; + m_idxFPReg += cFPRegs; + + return TransitionBlock::StructInRegsOffset; + } + } + + // Set the register counts to indicate that this argument will not be passed in registers + cFPRegs = 0; + cGenRegs = 0; + +#else // FEATURE_UNIX_AMD64_STRUCT_PASSING + argSize = sizeof(TADDR); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + break; } @@ -888,33 +1030,31 @@ int ArgIteratorTemplate<ARGITERATOR_BASE>::GetNextOffset() break; } - int cbArg = StackElemSize(argSize); - int cArgSlots = cbArg / STACK_ELEM_SIZE; - - if (cFPRegs>0) + if ((cFPRegs > 0) && (cFPRegs + m_idxFPReg <= NUM_FLOAT_ARGUMENT_REGISTERS)) { - if (cFPRegs + m_idxFPReg <= 8) - { - int argOfs = TransitionBlock::GetOffsetOfFloatArgumentRegisters() + m_idxFPReg * 8; - m_idxFPReg += cFPRegs; - return argOfs; - } + int argOfs = TransitionBlock::GetOffsetOfFloatArgumentRegisters() + m_idxFPReg * 8; + m_idxFPReg += cFPRegs; + return argOfs; } - else + else if ((cGenRegs > 0) && (m_idxGenReg + cGenRegs <= NUM_ARGUMENT_REGISTERS)) { - if (m_idxGenReg + cArgSlots <= 6) - { - int argOfs = TransitionBlock::GetOffsetOfArgumentRegisters() + m_idxGenReg * 8; - m_idxGenReg += cArgSlots; - return argOfs; - } + int argOfs = TransitionBlock::GetOffsetOfArgumentRegisters() + m_idxGenReg * 8; + m_idxGenReg += cGenRegs; + return argOfs; } - int argOfs = TransitionBlock::GetOffsetOfArgs() + m_idxStack * 8; +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + m_fArgInRegisters = false; +#endif + + int argOfs = TransitionBlock::GetOffsetOfArgs() + m_idxStack * STACK_ELEM_SIZE; + + int cArgSlots = cbArg / STACK_ELEM_SIZE; m_idxStack += cArgSlots; + return argOfs; #else - // Each argument takes exactly one slot on AMD64 + // Each argument takes exactly one slot on AMD64 on Windows int argOfs = m_curOfs; m_curOfs += sizeof(void *); 
return argOfs; @@ -1203,6 +1343,40 @@ void ArgIteratorTemplate<ARGITERATOR_BASE>::ComputeReturnFlags() { _ASSERTE(!thValueType.IsNull()); +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + MethodTable *pMT = thValueType.AsMethodTable(); + if (pMT->IsRegPassedStruct()) + { + EEClass* eeClass = pMT->GetClass(); + + if (eeClass->GetNumberEightBytes() == 1) + { + // Structs occupying just one eightbyte are treated as int / double + if (eeClass->GetEightByteClassification(0) == SystemVClassificationTypeSSE) + { + flags |= sizeof(double) << RETURN_FP_SIZE_SHIFT; + } + } + else + { + // Size of the struct is 16 bytes + flags |= (16 << RETURN_FP_SIZE_SHIFT); + // The lowest two bits of the size encode the order of the int and SSE fields + if (eeClass->GetEightByteClassification(0) == SystemVClassificationTypeSSE) + { + flags |= (1 << RETURN_FP_SIZE_SHIFT); + } + + if (eeClass->GetEightByteClassification(1) == SystemVClassificationTypeSSE) + { + flags |= (2 << RETURN_FP_SIZE_SHIFT); + } + } + + break; + } +#else // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING + #ifdef FEATURE_HFA if (thValueType.IsHFA() && !this->IsVarArg()) { @@ -1229,6 +1403,7 @@ void ArgIteratorTemplate<ARGITERATOR_BASE>::ComputeReturnFlags() if (size <= ENREGISTERED_RETURNTYPE_INTEGER_MAXSIZE) break; +#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING } #endif // ENREGISTERED_RETURNTYPE_INTEGER_MAXSIZE @@ -1348,22 +1523,32 @@ void ArgIteratorTemplate<ARGITERATOR_BASE>::ForceSigWalk() int maxOffset = TransitionBlock::GetOffsetOfArgs(); - int ofs; + int ofs; while (TransitionBlock::InvalidOffset != (ofs = GetNextOffset())) { int stackElemSize; #ifdef _TARGET_AMD64_ +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (m_fArgInRegisters) + { + // Arguments passed in registers don't consume any stack + continue; + } + + stackElemSize = StackElemSize(GetArgSize()); +#else // FEATURE_UNIX_AMD64_STRUCT_PASSING // All stack arguments take just one stack slot on AMD64 because of arguments bigger // than a stack slot are passed by reference. stackElemSize = STACK_ELEM_SIZE; -#else +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING +#else // _TARGET_AMD64_ stackElemSize = StackElemSize(GetArgSize()); #if defined(ENREGISTERED_PARAMTYPE_MAXSIZE) if (IsArgPassedByRef()) stackElemSize = STACK_ELEM_SIZE; #endif -#endif +#endif // _TARGET_AMD64_ int endOfs = ofs + stackElemSize; if (endOfs > maxOffset) diff --git a/src/vm/class.cpp b/src/vm/class.cpp index 932f8bed00..f45e6ebbfa 100644 --- a/src/vm/class.cpp +++ b/src/vm/class.cpp @@ -1679,7 +1679,7 @@ CorElementType MethodTable::GetHFAType() default: // This should never happen. MethodTable::IsHFA() should be set only on types - // that have a valid HFA type + // that have a valid HFA type when the flag is used to track HFA status. _ASSERTE(false); return ELEMENT_TYPE_END; } diff --git a/src/vm/class.h b/src/vm/class.h index 758a0dbaee..c53cf8ba72 100644 --- a/src/vm/class.h +++ b/src/vm/class.h @@ -428,21 +428,26 @@ class EEClassLayoutInfo // to its unmanaged counterpart (i.e. no internal reference fields, // no ansi-unicode char conversions required, etc.) Used to // optimize marshaling. - e_BLITTABLE = 0x01, + e_BLITTABLE = 0x01, // Post V1.0 addition: Is this type also sequential in managed memory? - e_MANAGED_SEQUENTIAL = 0x02, + e_MANAGED_SEQUENTIAL = 0x02, // When a sequential/explicit type has no fields, it is conceptually // zero-sized, but actually is 1 byte in length. 
This holds onto this // fact and allows us to revert the 1 byte of padding when another // explicit type inherits from this type. - e_ZERO_SIZED = 0x04, + e_ZERO_SIZED = 0x04, // The size of the struct is explicitly specified in the meta-data. - e_HAS_EXPLICIT_SIZE = 0x08, - + e_HAS_EXPLICIT_SIZE = 0x08, +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF +#ifdef FEATURE_HFA +#error Can't have FEATURE_HFA and FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF defined at the same time. +#endif // FEATURE_HFA + e_NATIVE_PASS_IN_REGISTERS = 0x10, // Flag whether a native struct is passed in registers. +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF #ifdef FEATURE_HFA // HFA type of the unmanaged layout - e_R4_HFA = 0x10, - e_R8_HFA = 0x20, + e_R4_HFA = 0x10, + e_R8_HFA = 0x20, #endif }; @@ -527,6 +532,14 @@ class EEClassLayoutInfo return m_cbPackingSize; } +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF + bool IsNativeStructPassedInRegisters() + { + LIMITED_METHOD_CONTRACT; + return (m_bFlags & e_NATIVE_PASS_IN_REGISTERS) != 0; + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF + #ifdef FEATURE_HFA bool IsNativeHFA() { @@ -579,6 +592,14 @@ class EEClassLayoutInfo m_bFlags |= (hfaType == ELEMENT_TYPE_R4) ? e_R4_HFA : e_R8_HFA; } #endif +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF + void SetNativeStructPassedInRegisters() + { + LIMITED_METHOD_CONTRACT; + m_bFlags |= e_NATIVE_PASS_IN_REGISTERS; + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF + }; @@ -713,6 +734,15 @@ class EEClassOptionalFields SecurityProperties m_SecProps; +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // Number of eightBytes in the following arrays + int m_numberEightBytes; + // Classification of the eightBytes + SystemVClassificationType m_eightByteClassifications[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS]; + // Size of the data in the eightBytes + unsigned int m_eightByteSizes[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS]; +#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING + // Set default values for optional fields. inline void Init(); }; @@ -1811,6 +1841,45 @@ public: GetOptionalFields()->m_dwReliabilityContract = dwValue; } +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // Get number of eightbytes used by a struct passed in registers. + inline int GetNumberEightBytes() + { + LIMITED_METHOD_CONTRACT; + _ASSERTE(HasOptionalFields()); + return GetOptionalFields()->m_numberEightBytes; + } + + // Get eightbyte classification for the eightbyte with the specified index. + inline SystemVClassificationType GetEightByteClassification(int index) + { + LIMITED_METHOD_CONTRACT; + _ASSERTE(HasOptionalFields()); + return GetOptionalFields()->m_eightByteClassifications[index]; + } + + // Get size of the data in the eightbyte with the specified index.
+ inline unsigned int GetEightByteSize(int index) + { + LIMITED_METHOD_CONTRACT; + _ASSERTE(HasOptionalFields()); + return GetOptionalFields()->m_eightByteSizes[index]; + } + + // Set the eightByte classification + inline void SetEightByteClassification(int eightByteCount, SystemVClassificationType *eightByteClassifications, unsigned int *eightByteSizes) + { + LIMITED_METHOD_CONTRACT; + _ASSERTE(HasOptionalFields()); + GetOptionalFields()->m_numberEightBytes = eightByteCount; + for (int i = 0; i < eightByteCount; i++) + { + GetOptionalFields()->m_eightByteClassifications[i] = eightByteClassifications[i]; + GetOptionalFields()->m_eightByteSizes[i] = eightByteSizes[i]; + } + } +#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING + #ifdef FEATURE_COMINTEROP inline TypeHandle GetCoClassForInterface() { diff --git a/src/vm/class.inl b/src/vm/class.inl index 12c5230fd2..a4c8276476 100644 --- a/src/vm/class.inl +++ b/src/vm/class.inl @@ -53,6 +53,9 @@ inline void EEClassOptionalFields::Init() m_cbModuleDynamicID = MODULE_NON_DYNAMIC_STATICS; m_dwReliabilityContract = RC_NULL; m_SecProps = 0; +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + m_numberEightBytes = 0; +#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING } #endif // !DACCESS_COMPILE diff --git a/src/vm/comdelegate.cpp b/src/vm/comdelegate.cpp index a6c7e063b1..80742cdaca 100644 --- a/src/vm/comdelegate.cpp +++ b/src/vm/comdelegate.cpp @@ -72,37 +72,149 @@ static UINT16 ShuffleOfs(INT ofs, UINT stackSizeDelta = 0) #else // Portable default implementation -// Helpers used when calculating shuffle array entries in GenerateShuffleArray below. - -// Return true if the current argument still has slots left to shuffle in general registers or on the stack -// (currently we never shuffle floating point registers since there's no need). -static bool AnythingToShuffle(ArgLocDesc * pArg) +// Iterator for extracting shuffle entries for an argument described by an ArgLocDesc. +// Used when calculating shuffle array entries in GenerateShuffleArray below. +class ShuffleIterator { - return (pArg->m_cGenReg > 0) || (pArg->m_cStack > 0); -} + // Argument location description + ArgLocDesc* m_argLocDesc; +
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // Current eightByte used for struct arguments in registers + int m_currentEightByte; +#endif + // Current general purpose register index (relative to the ArgLocDesc::m_idxGenReg) + int m_currentGenRegIndex; + // Current floating point register index (relative to the ArgLocDesc::m_idxFloatReg) + int m_currentFloatRegIndex; + // Current stack slot index (relative to the ArgLocDesc::m_idxStack) + int m_currentStackSlotIndex; + +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // Get next shuffle offset for struct passed in registers. There has to be at least one offset left. + UINT16 GetNextOfsInStruct() + { + EEClass* eeClass = m_argLocDesc->m_eeClass; + _ASSERTE(eeClass != NULL); + + if (m_currentEightByte < eeClass->GetNumberEightBytes()) + { + SystemVClassificationType eightByte = eeClass->GetEightByteClassification(m_currentEightByte); + unsigned int eightByteSize = eeClass->GetEightByteSize(m_currentEightByte); -// Return an encoded shuffle entry describing a general register or stack offset that needs to be shuffled.
-static UINT16 ShuffleOfs(ArgLocDesc * pArg) -{ - // Shuffle any registers first (the order matters since otherwise we could end up shuffling a stack slot - // over a register we later need to shuffle down as well). - if (pArg->m_cGenReg > 0) - { - pArg->m_cGenReg--; - return (UINT16)(ShuffleEntry::REGMASK | pArg->m_idxGenReg++); + m_currentEightByte++; + + int index; + UINT16 mask = ShuffleEntry::REGMASK; + + if (eightByte == SystemVClassificationTypeSSE) + { + _ASSERTE(m_currentFloatRegIndex < m_argLocDesc->m_cFloatReg); + index = m_argLocDesc->m_idxFloatReg + m_currentFloatRegIndex; + m_currentFloatRegIndex++; + + mask |= ShuffleEntry::FPREGMASK; + if (eightByteSize == 4) + { + mask |= ShuffleEntry::FPSINGLEMASK; + } + } + else + { + _ASSERTE(m_currentGenRegIndex < m_argLocDesc->m_cGenReg); + index = m_argLocDesc->m_idxGenReg + m_currentGenRegIndex; + m_currentGenRegIndex++; + } + + return (UINT16)index | mask; + } + + // There are no more offsets to get, the caller should not have called us + _ASSERTE(false); + return 0; } +#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING - // If we get here we must have at least one stack slot left to shuffle (this method should only be called - // when AnythingToShuffle(pArg) == true). - _ASSERTE(pArg->m_cStack > 0); - pArg->m_cStack--; +public: - // Delegates cannot handle overly large argument stacks due to shuffle entry encoding limitations. - if (pArg->m_idxStack >= ShuffleEntry::REGMASK) - COMPlusThrow(kNotSupportedException); + // Construct the iterator for the ArgLocDesc + ShuffleIterator(ArgLocDesc* argLocDesc) + : + m_argLocDesc(argLocDesc), +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + m_currentEightByte(0), +#endif + m_currentGenRegIndex(0), + m_currentFloatRegIndex(0), + m_currentStackSlotIndex(0) + { + } - return (UINT16)(pArg->m_idxStack++); -} + // Check if there are more offsets to shuffle + bool HasNextOfs() + { + return (m_currentGenRegIndex < m_argLocDesc->m_cGenReg) || +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + (m_currentFloatRegIndex < m_argLocDesc->m_cFloatReg) || +#endif + (m_currentStackSlotIndex < m_argLocDesc->m_cStack); + } + + // Get next offset to shuffle. There has to be at least one offset left. + UINT16 GetNextOfs() + { + int index; + +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + + // Check if the argLocDesc is for a struct in registers + EEClass* eeClass = m_argLocDesc->m_eeClass; + if (eeClass != NULL) + { + return GetNextOfsInStruct(); + } + + // Shuffle float registers first + if (m_currentFloatRegIndex < m_argLocDesc->m_cFloatReg) + { + index = m_argLocDesc->m_idxFloatReg + m_currentFloatRegIndex; + m_currentFloatRegIndex++; + + return (UINT16)index | ShuffleEntry::REGMASK | ShuffleEntry::FPREGMASK; + } +#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING + + // Shuffle any registers first (the order matters since otherwise we could end up shuffling a stack slot + // over a register we later need to shuffle down as well). + if (m_currentGenRegIndex < m_argLocDesc->m_cGenReg) + { + index = m_argLocDesc->m_idxGenReg + m_currentGenRegIndex; + m_currentGenRegIndex++; + + return (UINT16)index | ShuffleEntry::REGMASK; + } + + // If we get here we must have at least one stack slot left to shuffle (this method should only be called + // when HasNextOfs() returns true).
+ if (m_currentStackSlotIndex < m_argLocDesc->m_cStack) + { + index = m_argLocDesc->m_idxStack + m_currentStackSlotIndex; + m_currentStackSlotIndex++; + + // Delegates cannot handle overly large argument stacks due to shuffle entry encoding limitations. + if (index >= ShuffleEntry::REGMASK) + { + COMPlusThrow(kNotSupportedException); + } + + return (UINT16)index; + } + + // There are no more offsets to get, the caller should not have called us + _ASSERTE(false); + return 0; + } +}; #endif @@ -247,8 +359,11 @@ VOID GenerateShuffleArray(MethodDesc* pInvoke, MethodDesc *pTargetMeth, SArray<S sArgPlacerSrc.GetThisLoc(&sArgDst); - entry.srcofs = ShuffleOfs(&sArgSrc); - entry.dstofs = ShuffleOfs(&sArgDst); + ShuffleIterator iteratorSrc(&sArgSrc); + ShuffleIterator iteratorDst(&sArgDst); + + entry.srcofs = iteratorSrc.GetNextOfs(); + entry.dstofs = iteratorDst.GetNextOfs(); pShuffleEntryArray->Append(entry); } @@ -261,8 +376,11 @@ VOID GenerateShuffleArray(MethodDesc* pInvoke, MethodDesc *pTargetMeth, SArray<S sArgPlacerSrc.GetRetBuffArgLoc(&sArgSrc); sArgPlacerDst.GetRetBuffArgLoc(&sArgDst); - entry.srcofs = ShuffleOfs(&sArgSrc); - entry.dstofs = ShuffleOfs(&sArgDst); + ShuffleIterator iteratorSrc(&sArgSrc); + ShuffleIterator iteratorDst(&sArgDst); + + entry.srcofs = iteratorSrc.GetNextOfs(); + entry.dstofs = iteratorDst.GetNextOfs(); // Depending on the type of target method (static vs instance) the return buffer argument may end up // in the same register in both signatures. So we only commit the entry (by moving the entry pointer @@ -271,34 +389,76 @@ VOID GenerateShuffleArray(MethodDesc* pInvoke, MethodDesc *pTargetMeth, SArray<S pShuffleEntryArray->Append(entry); } - // Iterate all the regular arguments. mapping source registers and stack locations to the corresponding - // destination locations. - while ((ofsSrc = sArgPlacerSrc.GetNextOffset()) != TransitionBlock::InvalidOffset) - { - ofsDst = sArgPlacerDst.GetNextOffset(); +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // The shuffle entries are produced in two passes on Unix AMD64. The first pass generates shuffle entries for + // all cases except shuffling a struct argument from the stack to registers, which is performed in the second pass. + // The reason is that if such a struct argument contained a floating point field and it was followed by a + // floating point argument, generating code for transferring the struct from the stack into registers would + // overwrite the xmm register of the floating point argument before it could actually be shuffled. + // For example, consider this case: + // struct S { int x; float y; }; + // void fn(long a, long b, long c, long d, long e, S f, float g); + // src: rdi = this, rsi = a, rdx = b, rcx = c, r8 = d, r9 = e, stack: f, xmm0 = g + // dst: rdi = a, rsi = b, rdx = c, rcx = d, r8 = e, r9 = S.x, xmm0 = S.y, xmm1 = g + for (int pass = 0; pass < 2; pass++) +#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING + { + // Iterate all the regular arguments, mapping source registers and stack locations to the corresponding + // destination locations. + while ((ofsSrc = sArgPlacerSrc.GetNextOffset()) != TransitionBlock::InvalidOffset) + { + ofsDst = sArgPlacerDst.GetNextOffset(); - // Find the argument location mapping for both source and destination signature. A single argument can - // occupy a floating point register (in which case we don't need to do anything, they're not shuffled) - // or some combination of general registers and the stack.
- sArgPlacerSrc.GetArgLoc(ofsSrc, &sArgSrc); - sArgPlacerDst.GetArgLoc(ofsDst, &sArgDst); +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + bool shuffleStructFromStackToRegs = (ofsSrc != TransitionBlock::StructInRegsOffset) && (ofsDst == TransitionBlock::StructInRegsOffset); + if (((pass == 0) && shuffleStructFromStackToRegs) || + ((pass == 1) && !shuffleStructFromStackToRegs)) + { + continue; + } +#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING + // Find the argument location mapping for both source and destination signature. A single argument can + // occupy a floating point register (in which case we don't need to do anything, they're not shuffled) + // or some combination of general registers and the stack. + sArgPlacerSrc.GetArgLoc(ofsSrc, &sArgSrc); + sArgPlacerDst.GetArgLoc(ofsDst, &sArgDst); + + ShuffleIterator iteratorSrc(&sArgSrc); + ShuffleIterator iteratorDst(&sArgDst); + + // Shuffle each slot in the argument (register or stack slot) from source to destination. + while (iteratorSrc.HasNextOfs()) + { + // Locate the next slot to shuffle in the source and destination and encode the transfer into a + // shuffle entry. + entry.srcofs = iteratorSrc.GetNextOfs(); + entry.dstofs = iteratorDst.GetNextOfs(); + + // Only emit this entry if it's not a no-op (i.e. the source and destination locations are + // different). + if (entry.srcofs != entry.dstofs) + pShuffleEntryArray->Append(entry); + } - // Shuffle each slot in the argument (register or stack slot) from source to destination. - while (AnythingToShuffle(&sArgSrc)) + // We should have run out of slots to shuffle in the destination at the same time as the source. + _ASSERTE(!iteratorDst.HasNextOfs()); + } +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + if (pass == 0) { - // Locate the next slot to shuffle in the source and destination and encode the transfer into a - // shuffle entry. - entry.srcofs = ShuffleOfs(&sArgSrc); - entry.dstofs = ShuffleOfs(&sArgDst); + // Reset the iterator for the 2nd pass + sSigSrc.Reset(); + sSigDst.Reset(); - // Only emit this entry if it's not a no-op (i.e. the source and destination locations are - // different). - if (entry.srcofs != entry.dstofs) - pShuffleEntryArray->Append(entry); - } + sArgPlacerSrc = ArgIterator(&sSigSrc); + sArgPlacerDst = ArgIterator(&sSigDst); - // We should have run out of slots to shuffle in the destination at the same time as the source. - _ASSERTE(!AnythingToShuffle(&sArgDst)); + if (sSigDst.HasThis()) + { + sArgPlacerSrc.GetNextOffset(); + } + } +#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING } entry.srcofs = ShuffleEntry::SENTINEL; @@ -1323,7 +1483,7 @@ OBJECTREF COMDelegate::ConvertToDelegate(LPVOID pCallback, MethodTable* pMT) // Lookup the callsite in the hash, if found, we can map this call back to its managed function. // Otherwise, we'll treat this as an unmanaged callsite. - // Make sure that the pointer doesn't have the value of 1 which is our hash table deleted item marker. + // Make sure that the pointer doesn't have the value of 1 which is our hash table deleted item marker. LPVOID DelegateHnd = (pUMEntryThunk != NULL) && ((UPTR)pUMEntryThunk != (UPTR)1) ? 
COMDelegate::s_pDelegateToFPtrHash->LookupValue((UPTR)pUMEntryThunk, 0) : (LPVOID)INVALIDENTRY; diff --git a/src/vm/comdelegate.h b/src/vm/comdelegate.h index cfb9afa783..ab8ca04338 100644 --- a/src/vm/comdelegate.h +++ b/src/vm/comdelegate.h @@ -211,10 +211,14 @@ void DistributeUnhandledExceptionReliably(OBJECTREF *pDelegate, // signature. struct ShuffleEntry { + // Offset masks and special values enum { - REGMASK = 0x8000, - OFSMASK = 0x7fff, - SENTINEL = 0xffff, + REGMASK = 0x8000, // Register offset bit + FPREGMASK = 0x4000, // Floating point register bit + FPSINGLEMASK = 0x2000, // Single precision floating point register + OFSMASK = 0x7fff, // Mask to get stack offset + OFSREGMASK = 0x1fff, // Mask to get register index + SENTINEL = 0xffff, // Indicates end of shuffle array }; #if defined(_TARGET_AMD64_) && !defined(UNIX_AMD64_ABI) @@ -224,17 +228,11 @@ struct ShuffleEntry }; #else - // Special values: - // -1 - indicates end of shuffle array: stacksizedelta - // == difference in stack size between virtual and static sigs. - // high bit - indicates a register argument: mask it off and - // the result is an offset into ArgumentRegisters. - UINT16 srcofs; union { UINT16 dstofs; //if srcofs != SENTINEL - UINT16 stacksizedelta; //if dstofs == SENTINEL + UINT16 stacksizedelta; //if dstofs == SENTINEL, difference in stack size between virtual and static sigs }; #endif // _TARGET_AMD64_ }; diff --git a/src/vm/compile.cpp b/src/vm/compile.cpp index 5b33792d35..23242df1db 100644 --- a/src/vm/compile.cpp +++ b/src/vm/compile.cpp @@ -76,6 +76,8 @@ #endif #include "tritonstress.h" +#include "argdestination.h" + #ifdef CROSSGEN_COMPILE CompilationDomain * theDomain; #endif @@ -1483,7 +1485,8 @@ void FakeGcScanRoots(MetaSig& msig, ArgIterator& argit, MethodDesc * pMD, BYTE * int argOffset; while ((argOffset = argit.GetNextOffset()) != TransitionBlock::InvalidOffset) { - msig.GcScanRoots(pFrame + argOffset, &FakePromote, &sc, &FakePromoteCarefully); + ArgDestination argDest(pFrame, argOffset, argit.GetArgLocDescForStructInRegs()); + msig.GcScanRoots(&argDest, &FakePromote, &sc, &FakePromoteCarefully); } } @@ -1933,7 +1936,17 @@ BOOL CanDeduplicateCode(CORINFO_METHOD_HANDLE method, CORINFO_METHOD_HANDLE dupl return FALSE; #endif // _TARGET_X86_ - if (pMethod->ReturnsObject() != pDuplicateMethod->ReturnsObject()) + MetaSig::RETURNTYPE returnType = pMethod->ReturnsObject(); + MetaSig::RETURNTYPE returnTypeDuplicate = pDuplicateMethod->ReturnsObject(); + + if (returnType != returnTypeDuplicate) + return FALSE; + + // + // Do not enable deduplication of structs returned in registers + // + + if (returnType == MetaSig::RETVALUETYPE) + return FALSE; // diff --git a/src/vm/crossdomaincalls.cpp b/src/vm/crossdomaincalls.cpp index fa04b57faa..dd695fe5f1 100644 --- a/src/vm/crossdomaincalls.cpp +++ b/src/vm/crossdomaincalls.cpp @@ -1264,7 +1264,7 @@ CrossDomainChannel::BlitAndCall() MetaSig mSig(m_pCliMD, thDeclaringType); ArgIterator argit(&mSig); - int offset; + int offset; while (TransitionBlock::InvalidOffset != (offset = argit.GetNextOffset())) { int regArgNum = TransitionBlock::GetArgumentIndexFromOffset(offset); @@ -2068,7 +2068,7 @@ CrossDomainChannel::MarshalAndCall() CDC_DETERMINE_DECLARING_TYPE(m_pCliMD, TypeHandle(CTPMethodTable::GetMethodTableBeingProxied(m_pFrame->GetThis()))); MetaSig mSig(m_pCliMD, thDeclaringType); ArgIterator argit(&mSig); - int ofs; + int ofs; // NumFixedArgs() doesn't count the "this" object, but SizeOfFrameArgumentArray() does.
dwNumArgs = mSig.NumFixedArgs(); @@ -2141,7 +2141,7 @@ CrossDomainChannel::MarshalAndCall() TADDR pTransitionBlock = m_pFrame->GetTransitionBlock(); for (int argNum = 0; - TransitionBlock::InvalidOffset != (ofs = argit.GetNextOffset()); + TransitionBlock::InvalidOffset != (ofs = argit.GetNextOffset()); argNum++ ) { diff --git a/src/vm/eetwain.cpp b/src/vm/eetwain.cpp index 5df7b6305a..dbbfac9000 100644 --- a/src/vm/eetwain.cpp +++ b/src/vm/eetwain.cpp @@ -18,6 +18,7 @@ #include "gcinfodecoder.h" #endif +#include "argdestination.h" #define X86_INSTR_W_TEST_ESP 0x4485 // test [esp+N], eax #define X86_INSTR_TEST_ESP_SIB 0x24 @@ -4071,7 +4072,10 @@ void promoteVarArgs(PTR_BYTE argsStart, PTR_VASigCookie varArgSig, GCCONTEXT* ct // if skipFixedArgs is false we report all arguments // otherwise we just report the varargs. if (!skipFixedArgs || inVarArgs) - msig.GcScanRoots(pFrameBase + argOffset, ctx->f, ctx->sc); + { + ArgDestination argDest(pFrameBase, argOffset, argit.GetArgLocDescForStructInRegs()); + msig.GcScanRoots(&argDest, ctx->f, ctx->sc); + } } } diff --git a/src/vm/fcall.h b/src/vm/fcall.h index 2bf6080706..8cfcc3e68e 100644 --- a/src/vm/fcall.h +++ b/src/vm/fcall.h @@ -1318,9 +1318,8 @@ typedef UINT16 FC_UINT16_RET; // FC_TypedByRef should be used for TypedReferences in FCall signatures -#ifdef UNIX_AMD64_ABI +#if defined(UNIX_AMD64_ABI) && !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) // Explicitly pass the TypedReferences by reference -// UNIXTODO: Remove once the proper managed calling convention for struct is in place #define FC_TypedByRef TypedByRef& #define FC_DECIMAL DECIMAL& #else diff --git a/src/vm/field.h b/src/vm/field.h index a278c4d12c..9fc5583c2f 100644 --- a/src/vm/field.h +++ b/src/vm/field.h @@ -223,7 +223,6 @@ public: DWORD GetOffset() { LIMITED_METHOD_DAC_CONTRACT; - g_IBCLogger.LogFieldDescsAccess(this); return GetOffset_NoLogging(); } diff --git a/src/vm/fieldmarshaler.h b/src/vm/fieldmarshaler.h index d67637e27c..ee464e4c05 100644 --- a/src/vm/fieldmarshaler.h +++ b/src/vm/fieldmarshaler.h @@ -396,7 +396,7 @@ public: m_dwExternalOffset = dwExternalOffset; } - UINT32 GetExternalOffset() + UINT32 GetExternalOffset() const { LIMITED_METHOD_CONTRACT; return m_dwExternalOffset; diff --git a/src/vm/frames.cpp b/src/vm/frames.cpp index 1c7f2f4348..f4d96e5f5d 100644 --- a/src/vm/frames.cpp +++ b/src/vm/frames.cpp @@ -45,6 +45,8 @@ #include "interpreter.h" #endif // FEATURE_INTERPRETER +#include "argdestination.h" + #if CHECK_APP_DOMAIN_LEAKS #define CHECK_APP_DOMAIN GC_CALL_CHECK_APP_DOMAIN #else @@ -1278,7 +1280,8 @@ void TransitionFrame::PromoteCallerStackHelper(promote_func* fn, ScanContext* sc int argOffset; while ((argOffset = argit.GetNextOffset()) != TransitionBlock::InvalidOffset) { - pmsig->GcScanRoots(dac_cast<PTR_VOID>(pTransitionBlock + argOffset), fn, sc); + ArgDestination argDest(dac_cast<PTR_VOID>(pTransitionBlock), argOffset, argit.GetArgLocDescForStructInRegs()); + pmsig->GcScanRoots(&argDest, fn, sc); } } diff --git a/src/vm/i386/stublinkerx86.cpp b/src/vm/i386/stublinkerx86.cpp index e42f7d792f..b86151243c 100644 --- a/src/vm/i386/stublinkerx86.cpp +++ b/src/vm/i386/stublinkerx86.cpp @@ -4001,16 +4001,49 @@ VOID StubLinkerCPU::EmitShuffleThunk(ShuffleEntry *pShuffleEntryArray) { // If source is present in register then destination must also be a register _ASSERTE(pEntry->dstofs & ShuffleEntry::REGMASK); + // Both the srcofs and dstofs must be of the same kind of registers - float or general purpose. 
+ _ASSERTE((pEntry->dstofs & ShuffleEntry::FPREGMASK) == (pEntry->srcofs & ShuffleEntry::FPREGMASK)); - X86EmitMovRegReg(c_argRegs[pEntry->dstofs & ShuffleEntry::OFSMASK], c_argRegs[pEntry->srcofs & ShuffleEntry::OFSMASK]); + int dstRegIndex = pEntry->dstofs & ShuffleEntry::OFSREGMASK; + int srcRegIndex = pEntry->srcofs & ShuffleEntry::OFSREGMASK; + + if (pEntry->srcofs & ShuffleEntry::FPREGMASK) + { + // movdqa dstReg, srcReg + X64EmitMovXmmXmm((X86Reg)(kXMM0 + dstRegIndex), (X86Reg)(kXMM0 + srcRegIndex)); + } + else + { + // mov dstReg, srcReg + X86EmitMovRegReg(c_argRegs[dstRegIndex], c_argRegs[srcRegIndex]); + } } else if (pEntry->dstofs & ShuffleEntry::REGMASK) { // source must be on the stack _ASSERTE(!(pEntry->srcofs & ShuffleEntry::REGMASK)); - // mov dstreg, [rax + src] - X86EmitIndexRegLoad(c_argRegs[pEntry->dstofs & ShuffleEntry::OFSMASK], SCRATCH_REGISTER_X86REG, (pEntry->srcofs + 1) * sizeof(void*)); + int dstRegIndex = pEntry->dstofs & ShuffleEntry::OFSREGMASK; + int srcOffset = (pEntry->srcofs + 1) * sizeof(void*); + + if (pEntry->dstofs & ShuffleEntry::FPREGMASK) + { + if (pEntry->dstofs & ShuffleEntry::FPSINGLEMASK) + { + // movss dstReg, [rax + src] + X64EmitMovSSFromMem((X86Reg)(kXMM0 + dstRegIndex), SCRATCH_REGISTER_X86REG, srcOffset); + } + else + { + // movsd dstReg, [rax + src] + X64EmitMovSDFromMem((X86Reg)(kXMM0 + dstRegIndex), SCRATCH_REGISTER_X86REG, srcOffset); + } + } + else + { + // mov dstreg, [rax + src] + X86EmitIndexRegLoad(c_argRegs[dstRegIndex], SCRATCH_REGISTER_X86REG, srcOffset); + } } else { diff --git a/src/vm/ilmarshalers.h b/src/vm/ilmarshalers.h index 5a2453b603..1bd072f417 100644 --- a/src/vm/ilmarshalers.h +++ b/src/vm/ilmarshalers.h @@ -601,7 +601,7 @@ public: nativeSize = wNativeSize; } -#ifndef _TARGET_ARM_ +#if !defined(_TARGET_ARM_) && !(defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)) switch (nativeSize) { case 1: typ = ELEMENT_TYPE_U1; break; diff --git a/src/vm/invokeutil.cpp b/src/vm/invokeutil.cpp index ee80056abe..e17458ce1d 100644 --- a/src/vm/invokeutil.cpp +++ b/src/vm/invokeutil.cpp @@ -28,6 +28,7 @@ #include "eeconfig.h" #include "generics.h" #include "runtimehandles.h" +#include "argdestination.h" #ifndef CROSSGEN_COMPILE @@ -130,7 +131,7 @@ void *InvokeUtil::GetIntPtrValue(OBJECTREF pObj) { RETURN *(void **)((pObj)->UnBox()); } -void InvokeUtil::CopyArg(TypeHandle th, OBJECTREF *pObjUNSAFE, void *pArgDst) { +void InvokeUtil::CopyArg(TypeHandle th, OBJECTREF *pObjUNSAFE, ArgDestination *argDest) { CONTRACTL { THROWS; GC_NOTRIGGER; // Caller does not protect object references @@ -140,7 +141,9 @@ void InvokeUtil::CopyArg(TypeHandle th, OBJECTREF *pObjUNSAFE, void *pArgDst) { INJECT_FAULT(COMPlusThrowOM()); } CONTRACTL_END; - + + void *pArgDst = argDest->GetDestinationAddress(); + OBJECTREF rObj = *pObjUNSAFE; MethodTable* pMT; CorElementType oType; @@ -204,12 +207,12 @@ void InvokeUtil::CopyArg(TypeHandle th, OBJECTREF *pObjUNSAFE, void *pArgDst) { case ELEMENT_TYPE_VALUETYPE: { - // If we got the univeral zero...Then assign it and exit.
if (rObj == 0) { - InitValueClass(pArgDst, th.AsMethodTable()); + InitValueClassArg(argDest, th.AsMethodTable()); } else { - if (!th.AsMethodTable()->UnBoxInto(pArgDst, rObj)) + if (!th.AsMethodTable()->UnBoxIntoArg(argDest, rObj)) COMPlusThrow(kArgumentException, W("Arg_ObjObj")); } break; diff --git a/src/vm/invokeutil.h b/src/vm/invokeutil.h index f2acb61f9e..14d7dc8e14 100644 --- a/src/vm/invokeutil.h +++ b/src/vm/invokeutil.h @@ -44,6 +44,7 @@ struct InterfaceMapData #include <poppack.h> class ReflectMethodList; +class ArgDestination; // Structure used to track security access checks efficiently when applied // across a range of methods, fields etc. @@ -114,7 +115,7 @@ class InvokeUtil { public: - static void CopyArg(TypeHandle th, OBJECTREF *obj, void *pArgDst); + static void CopyArg(TypeHandle th, OBJECTREF *obj, ArgDestination *argDest); // Given a type, this routine will convert an return value representing that // type into an ObjectReference. If the type is a primitive, the diff --git a/src/vm/jitinterface.cpp b/src/vm/jitinterface.cpp index ba6aebb3cc..442fb91186 100644 --- a/src/vm/jitinterface.cpp +++ b/src/vm/jitinterface.cpp @@ -58,7 +58,6 @@ #include "runtimehandles.h" #include "sigbuilder.h" #include "openum.h" - #ifdef HAVE_GCCOVER #include "gccover.h" #endif // HAVE_GCCOVER @@ -1651,7 +1650,6 @@ void CEEInfo::getFieldInfo (CORINFO_RESOLVED_TOKEN * pResolvedToken, DWORD fieldFlags = 0; pResult->offset = pField->GetOffset(); - if (pField->IsStatic()) { #ifdef FEATURE_LEGACYNETCF @@ -1850,7 +1848,6 @@ void CEEInfo::getFieldInfo (CORINFO_RESOLVED_TOKEN * pResolvedToken, if (!(flags & CORINFO_ACCESS_INLINECHECK)) { - //get the field's type. Grab the class for structs. pResult->fieldType = getFieldTypeInternal(pResolvedToken->hField, &pResult->structType, pResolvedToken->hClass); @@ -2568,9 +2565,82 @@ bool CEEInfo::getSystemVAmd64PassStructInRegisterDescriptor( /*IN*/ CORINFO_CLASS_HANDLE structHnd, /*OUT*/ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* structPassInRegDescPtr) { - LIMITED_METHOD_CONTRACT; +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF) + JIT_TO_EE_TRANSITION(); + + _ASSERTE(structPassInRegDescPtr != nullptr); + TypeHandle th(structHnd); + + // Make sure this is a value type. 
+ if (th.IsValueType()) + { + _ASSERTE(CorInfoType2UnixAmd64Classification(th.GetInternalCorElementType()) == SystemVClassificationTypeStruct); + + MethodTable* methodTablePtr = nullptr; + bool isNativeStruct = false; + if (!th.IsTypeDesc()) + { + methodTablePtr = th.AsMethodTable(); + _ASSERTE(methodTablePtr != nullptr); + } + else if (th.IsTypeDesc()) + { + if (th.IsNativeValueType()) + { + methodTablePtr = th.AsNativeValueType(); + isNativeStruct = true; + _ASSERTE(methodTablePtr != nullptr); + } + else + { + _ASSERTE(false && "Unhandled TypeHandle for struct!"); + } + } + + bool isPassableInRegs = false; + + if (isNativeStruct) + { + isPassableInRegs = methodTablePtr->GetLayoutInfo()->IsNativeStructPassedInRegisters(); + } + else + { + isPassableInRegs = methodTablePtr->IsRegPassedStruct(); + } + + if (!isPassableInRegs) + { + structPassInRegDescPtr->passedInRegisters = false; + } + else + { + structPassInRegDescPtr->passedInRegisters = true; + + SystemVStructRegisterPassingHelper helper((unsigned int)th.GetSize()); + bool result = methodTablePtr->ClassifyEightBytes(&helper, 0, 0); + + structPassInRegDescPtr->eightByteCount = helper.eightByteCount; + _ASSERTE(structPassInRegDescPtr->eightByteCount <= CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS); + for (unsigned int i = 0; i < CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS; i++) + { + structPassInRegDescPtr->eightByteClassifications[i] = helper.eightByteClassifications[i]; + structPassInRegDescPtr->eightByteSizes[i] = helper.eightByteSizes[i]; + structPassInRegDescPtr->eightByteOffsets[i] = helper.eightByteOffsets[i]; + } + } + } + else + { + structPassInRegDescPtr->passedInRegisters = false; + } + + EE_TO_JIT_TRANSITION(); + + return true; +#else // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF) return false; +#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF) } /*********************************************************************/ diff --git a/src/vm/message.cpp b/src/vm/message.cpp index d8bdb3d2c8..dab78f46e6 100644 --- a/src/vm/message.cpp +++ b/src/vm/message.cpp @@ -752,7 +752,7 @@ FCIMPL2(FC_BOOL_RET, CMessage::Dispatch, MessageObject* pMessageUNSAFE, Object* int ofs; while ((ofs = argit.GetNextOffset()) != TransitionBlock::InvalidOffset) { - if (TransitionBlock::IsFloatArgumentRegisterOffset(ofs)) + if (TransitionBlock::HasFloatRegister(ofs, argit.GetArgLocDescForStructInRegs())) { // Found a floating point argument register. 
The first time we find this we point // pFloatArgumentRegisters to the part of the frame where these values were spilled (we don't do @@ -772,7 +772,7 @@ FCIMPL2(FC_BOOL_RET, CMessage::Dispatch, MessageObject* pMessageUNSAFE, Object* DWORD_PTR dwRegTypeMap = 0; { - int ofs; + int ofs; while ((ofs = argit.GetNextOffset()) != TransitionBlock::InvalidOffset) { int regArgNum = TransitionBlock::GetArgumentIndexFromOffset(ofs); diff --git a/src/vm/method.cpp b/src/vm/method.cpp index 6926ce4b6e..3e7271b1fb 100644 --- a/src/vm/method.cpp +++ b/src/vm/method.cpp @@ -1396,8 +1396,9 @@ COR_ILMETHOD* MethodDesc::GetILHeader(BOOL fAllowOverrides /*=FALSE*/) //******************************************************************************* MetaSig::RETURNTYPE MethodDesc::ReturnsObject( #ifdef _DEBUG - bool supportStringConstructors + bool supportStringConstructors, #endif + MethodTable** pMT ) { CONTRACTL @@ -1439,7 +1440,19 @@ MetaSig::RETURNTYPE MethodDesc::ReturnsObject( if (!thValueType.IsTypeDesc()) { MethodTable * pReturnTypeMT = thValueType.AsMethodTable(); - if(pReturnTypeMT->ContainsPointers()) + if (pMT != NULL) + { + *pMT = pReturnTypeMT; + } + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (pReturnTypeMT->IsRegPassedStruct()) + { + return MetaSig::RETVALUETYPE; + } +#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING + + if (pReturnTypeMT->ContainsPointers()) { _ASSERTE(pReturnTypeMT->GetNumInstanceFieldBytes() == sizeof(void*)); return MetaSig::RETOBJ; diff --git a/src/vm/method.hpp b/src/vm/method.hpp index 0f283e5c79..680662b94c 100644 --- a/src/vm/method.hpp +++ b/src/vm/method.hpp @@ -1611,8 +1611,9 @@ public: // does this function return an object reference? MetaSig::RETURNTYPE ReturnsObject( #ifdef _DEBUG - bool supportStringConstructors = false + bool supportStringConstructors = false, #endif + MethodTable** pMT = NULL ); diff --git a/src/vm/methodtable.cpp b/src/vm/methodtable.cpp index de660268e4..e632ce3700 100644 --- a/src/vm/methodtable.cpp +++ b/src/vm/methodtable.cpp @@ -39,9 +39,12 @@ #include "dbginterface.h" #include "comdelegate.h" #include "eventtrace.h" +#include "fieldmarshaler.h" + #ifdef FEATURE_REMOTING #include "remoting.h" #endif + #include "eeprofinterfaces.h" #include "dllimportcallback.h" #include "listlock.h" @@ -2275,6 +2278,916 @@ BOOL MethodTable::IsClassPreInited() #pragma optimize("", on) #endif // _MSC_VER +//======================================================================================== + +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF) + +#if defined(_DEBUG) && defined(LOGGING) +static +const char* GetSystemVClassificationTypeName(SystemVClassificationType t) +{ + switch (t) + { + case SystemVClassificationTypeUnknown: return "Unknown"; + case SystemVClassificationTypeStruct: return "Struct"; + case SystemVClassificationTypeNoClass: return "NoClass"; + case SystemVClassificationTypeMemory: return "Memory"; + case SystemVClassificationTypeInteger: return "Integer"; + case SystemVClassificationTypeIntegerReference: return "IntegerReference"; + case SystemVClassificationTypeSSE: return "SSE"; + default: return "ERROR"; + } +}; +#endif // _DEBUG && LOGGING + +// If we have a field classification already, but there is a union, we must merge the classification type of the field. Returns the +// new, merged classification type. 
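+// For example (hypothetical overlap, for illustration only): in a union such as +// union IntOrFloat { int i; float f; }; +// both fields start at offset 0, so the Integer classification of 'i' merges with the SSE classification of 'f'. +// Per the rules below, Integer wins; only SSE merged with SSE stays SSE, and IntegerReference may only merge with IntegerReference.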
+/* static */ +SystemVClassificationType MethodTable::ReClassifyField(SystemVClassificationType originalClassification, SystemVClassificationType newFieldClassification) +{ + _ASSERTE((newFieldClassification == SystemVClassificationTypeInteger) || + (newFieldClassification == SystemVClassificationTypeIntegerReference) || + (newFieldClassification == SystemVClassificationTypeSSE)); + + switch (newFieldClassification) + { + case SystemVClassificationTypeInteger: + // Integer overrides everything; the resulting classification is Integer. Can't merge Integer and IntegerReference. + _ASSERTE((originalClassification == SystemVClassificationTypeInteger) || + (originalClassification == SystemVClassificationTypeSSE)); + + return SystemVClassificationTypeInteger; + + case SystemVClassificationTypeSSE: + // If the old and new classifications are both SSE, then the merge is SSE, otherwise it will be integer. Can't merge SSE and IntegerReference. + _ASSERTE((originalClassification == SystemVClassificationTypeInteger) || + (originalClassification == SystemVClassificationTypeSSE)); + + if (originalClassification == SystemVClassificationTypeSSE) + { + return SystemVClassificationTypeSSE; + } + else + { + return SystemVClassificationTypeInteger; + } + + case SystemVClassificationTypeIntegerReference: + // IntegerReference can only merge with IntegerReference. + _ASSERTE(originalClassification == SystemVClassificationTypeIntegerReference); + return SystemVClassificationTypeIntegerReference; + + default: + _ASSERTE(false); // Unexpected type. + return SystemVClassificationTypeUnknown; + } +} + +// Returns 'true' if the struct is passed in registers, 'false' otherwise. +bool MethodTable::ClassifyEightBytes(SystemVStructRegisterPassingHelperPtr helperPtr, unsigned int nestingLevel, unsigned int startOffsetOfStruct) +{ + CONTRACTL + { + THROWS; + GC_TRIGGERS; + SO_TOLERANT; + MODE_ANY; + } + CONTRACTL_END; + + WORD numIntroducedFields = GetNumIntroducedInstanceFields(); + + // It appears the VM gives a struct with no fields a size of 1. + // Don't pass such a structure in registers. + if (numIntroducedFields == 0) + { + return false; + } + + // No struct register passing with explicit layout. There may be cases where explicit layout may be still + // eligible for register struct passing, but it is hard to tell the real intent. Make it simple and just + // unconditionally disable register struct passing for explicit layout. + if (GetClass()->HasExplicitFieldOffsetLayout()) + { + LOG((LF_JIT, LL_EVERYTHING, "%*s**** ClassifyEightBytes: struct %s has explicit layout; will not be enregistered\n", + nestingLevel * 5, "", this->GetDebugClassName())); + return false; + } +#ifdef _DEBUG + LOG((LF_JIT, LL_EVERYTHING, "%*s**** Classify %s (%p), startOffset %d, total struct size %d\n", + nestingLevel * 5, "", this->GetDebugClassName(), this, startOffsetOfStruct, helperPtr->structSize)); + int fieldNum = -1; +#endif // _DEBUG + + FieldDesc *pField = GetApproxFieldDescListRaw(); + FieldDesc *pFieldEnd = pField + numIntroducedFields; + + for (; pField < pFieldEnd; pField++) + { +#ifdef _DEBUG + ++fieldNum; +#endif // _DEBUG + + DWORD fieldOffset = pField->GetOffset(); + unsigned normalizedFieldOffset = fieldOffset + startOffsetOfStruct; + + unsigned int fieldSize = pField->GetSize(); + _ASSERTE(fieldSize != (unsigned int)-1); + + // The field can't span past the end of the struct. + if ((normalizedFieldOffset + fieldSize) > helperPtr->structSize) + { + _ASSERTE(false && "Invalid struct size.
The size of fields and overall size don't agree"); + return false; + } + + CorElementType fieldType = pField->GetFieldType(); + + SystemVClassificationType fieldClassificationType = CorInfoType2UnixAmd64Classification(fieldType); + +#ifdef _DEBUG + LPCUTF8 fieldName; + pField->GetName_NoThrow(&fieldName); +#endif // _DEBUG + + if (fieldClassificationType == SystemVClassificationTypeStruct) + { + TypeHandle th = pField->GetApproxFieldTypeHandleThrowing(); + _ASSERTE(!th.IsNull()); + MethodTable* pFieldMT = th.GetMethodTable(); + + bool inEmbeddedStructPrev = helperPtr->inEmbeddedStruct; + helperPtr->inEmbeddedStruct = true; + bool structRet = pFieldMT->ClassifyEightBytes(helperPtr, nestingLevel + 1, normalizedFieldOffset); + helperPtr->inEmbeddedStruct = inEmbeddedStructPrev; + + if (!structRet) + { + // If the nested struct says not to enregister, there's no need to continue analyzing at this level. Just return 'do not enregister'. + return false; + } + + continue; + } + + if ((normalizedFieldOffset % fieldSize) != 0) + { + // The spec requires that struct values on the stack from register passed fields expect + // those fields to be at their natural alignment. + + LOG((LF_JIT, LL_EVERYTHING, " %*sxxxx Field %d %s: offset %d (normalized %d), size %d not at natural alignment; not enregistering struct\n", + nestingLevel * 5, "", fieldNum, fieldName, fieldOffset, normalizedFieldOffset, fieldSize)); + return false; + } + + if ((int)normalizedFieldOffset <= helperPtr->largestFieldOffset) + { + // Find the field corresponding to this offset and update the size if needed. + // We assume that either it matches the offset of a previously seen field, or + // it is an out-of-order offset (the VM does give us structs in non-increasing + // offset order sometimes) that doesn't overlap any other field. + + // REVIEW: will the offset ever match a previously seen field offset for cases that are NOT ExplicitLayout? + // If not, we can get rid of this loop, and just assume the offset is from an out-of-order field. We wouldn't + // need to maintain largestFieldOffset, either, since we would then assume all fields are unique. We could + // also get rid of ReClassifyField(). + int i; + for (i = helperPtr->currentUniqueOffsetField - 1; i >= 0; i--) + { + if (helperPtr->fieldOffsets[i] == normalizedFieldOffset) + { + if (fieldSize > helperPtr->fieldSizes[i]) + { + helperPtr->fieldSizes[i] = fieldSize; + } + + helperPtr->fieldClassifications[i] = ReClassifyField(helperPtr->fieldClassifications[i], fieldClassificationType); + + LOG((LF_JIT, LL_EVERYTHING, " %*sxxxx Field %d %s: offset %d (normalized %d), size %d, union with uniqueOffsetField %d, field type classification %s, reclassified field to %s\n", + nestingLevel * 5, "", fieldNum, fieldName, fieldOffset, normalizedFieldOffset, fieldSize, i, + GetSystemVClassificationTypeName(fieldClassificationType), + GetSystemVClassificationTypeName(helperPtr->fieldClassifications[i]))); + + break; + } + // Make sure the field doesn't start in the middle of another field. + _ASSERTE((normalizedFieldOffset < helperPtr->fieldOffsets[i]) || + (normalizedFieldOffset >= helperPtr->fieldOffsets[i] + helperPtr->fieldSizes[i])); + } + + if (i >= 0) + { + // The proper size of the union set of fields has been set above; continue to the next field. + continue; + } + } + else + { + helperPtr->largestFieldOffset = (int)normalizedFieldOffset; + } + + // Set the data for a new field. + + // The new field classification must not have been initialized yet.
+ _ASSERTE(helperPtr->fieldClassifications[helperPtr->currentUniqueOffsetField] == SystemVClassificationTypeNoClass); + + // There are only a few field classifications that are allowed. + _ASSERTE((fieldClassificationType == SystemVClassificationTypeInteger) || + (fieldClassificationType == SystemVClassificationTypeIntegerReference) || + (fieldClassificationType == SystemVClassificationTypeSSE)); + + helperPtr->fieldClassifications[helperPtr->currentUniqueOffsetField] = fieldClassificationType; + helperPtr->fieldSizes[helperPtr->currentUniqueOffsetField] = fieldSize; + helperPtr->fieldOffsets[helperPtr->currentUniqueOffsetField] = normalizedFieldOffset; + + LOG((LF_JIT, LL_EVERYTHING, " %*s**** Field %d %s: offset %d (normalized %d), size %d, currentUniqueOffsetField %d, field type classification %s, chosen field classification %s\n", + nestingLevel * 5, "", fieldNum, fieldName, fieldOffset, normalizedFieldOffset, fieldSize, helperPtr->currentUniqueOffsetField, + GetSystemVClassificationTypeName(fieldClassificationType), + GetSystemVClassificationTypeName(helperPtr->fieldClassifications[helperPtr->currentUniqueOffsetField]))); + + helperPtr->currentUniqueOffsetField++; + _ASSERTE(helperPtr->currentUniqueOffsetField < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT); + } // end per-field for loop + + if (!helperPtr->inEmbeddedStruct) + { + _ASSERTE(nestingLevel == 0); + + // We're at the top level of the recursion, and we're done looking at the fields. + // Now sort the fields by offset and set the output data. + + int sortedFieldOrder[SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT]; + for (unsigned i = 0; i < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT; i++) + { + sortedFieldOrder[i] = -1; + } + + for (unsigned i = 0; i < helperPtr->currentUniqueOffsetField; i++) + { + _ASSERTE(helperPtr->fieldOffsets[i] < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT); + _ASSERTE(sortedFieldOrder[helperPtr->fieldOffsets[i]] == -1); // we haven't seen this field offset yet. + sortedFieldOrder[helperPtr->fieldOffsets[i]] = i; + } + + // Set the layoutSizes (includes holes from alignment of the fields.) + int lastField = -1; + for (unsigned i = 0; i < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT; i++) + { + int ordinal = sortedFieldOrder[i]; + if (ordinal == -1) + { + continue; + } + + if (lastField == -1) + { + lastField = ordinal; + continue; + } + + helperPtr->fieldLayoutSizes[lastField] = helperPtr->fieldOffsets[ordinal] - helperPtr->fieldOffsets[lastField]; + + lastField = ordinal; + } + // Now the last field + _ASSERTE(lastField != -1); // if lastField==-1, then the struct has no fields! + helperPtr->fieldLayoutSizes[lastField] = helperPtr->structSize - helperPtr->fieldOffsets[lastField]; + + // Calculate the eightbytes and their types. + unsigned int accumulatedSizeForEightByte = 0; + unsigned int lastEightByteOffset = 0; + unsigned int currentEightByte = 0; + + for (unsigned i = 0; i < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT; i++) + { + int ordinal = sortedFieldOrder[i]; + if (ordinal == -1) + { + continue; + } + + if ((accumulatedSizeForEightByte + helperPtr->fieldLayoutSizes[ordinal]) > SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES) + { + // Save data for this eightbyte. + helperPtr->eightByteSizes[currentEightByte] = accumulatedSizeForEightByte; + helperPtr->eightByteOffsets[currentEightByte] = lastEightByteOffset; + + // Set up for next eightbyte. 
+ currentEightByte++; + _ASSERTE(currentEightByte < CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS); + + lastEightByteOffset = helperPtr->fieldOffsets[ordinal]; + accumulatedSizeForEightByte = 0; + } + + accumulatedSizeForEightByte += helperPtr->fieldLayoutSizes[ordinal]; + + _ASSERTE(helperPtr->fieldClassifications[ordinal] != SystemVClassificationTypeMemory); + + if (helperPtr->eightByteClassifications[currentEightByte] == helperPtr->fieldClassifications[ordinal]) + { + // Do nothing. The eight-byte is already classified. + } + else if (helperPtr->eightByteClassifications[currentEightByte] == SystemVClassificationTypeNoClass) + { + helperPtr->eightByteClassifications[currentEightByte] = helperPtr->fieldClassifications[ordinal]; + } + else if ((helperPtr->eightByteClassifications[currentEightByte] == SystemVClassificationTypeInteger) || + (helperPtr->fieldClassifications[ordinal] == SystemVClassificationTypeInteger)) + { + _ASSERTE(helperPtr->fieldClassifications[ordinal] != SystemVClassificationTypeIntegerReference); + helperPtr->eightByteClassifications[currentEightByte] = SystemVClassificationTypeInteger; + } + else if ((helperPtr->eightByteClassifications[currentEightByte] == SystemVClassificationTypeIntegerReference) || + (helperPtr->fieldClassifications[ordinal] == SystemVClassificationTypeIntegerReference)) + { + helperPtr->eightByteClassifications[currentEightByte] = SystemVClassificationTypeIntegerReference; + } + else + { + helperPtr->eightByteClassifications[currentEightByte] = SystemVClassificationTypeSSE; + } + } + + helperPtr->eightByteCount = currentEightByte + 1; + helperPtr->eightByteSizes[currentEightByte] = accumulatedSizeForEightByte; + helperPtr->eightByteOffsets[currentEightByte] = lastEightByteOffset; + _ASSERTE(helperPtr->eightByteCount <= CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS); + +#ifdef _DEBUG + LOG((LF_JIT, LL_EVERYTHING, " ----\n")); + LOG((LF_JIT, LL_EVERYTHING, " **** Number EightBytes: %d\n", helperPtr->eightByteCount)); + for (unsigned i = 0; i < helperPtr->eightByteCount; i++) + { + LOG((LF_JIT, LL_EVERYTHING, " **** eightByte %d -- classType: %s, eightByteOffset: %d, eightByteSize: %d\n", + i, GetSystemVClassificationTypeName(helperPtr->eightByteClassifications[i]), helperPtr->eightByteOffsets[i], helperPtr->eightByteSizes[i])); + } +#endif // _DEBUG + } + + return true; +} + +// Returns 'true' if the struct is passed in registers, 'false' otherwise. +bool MethodTable::ClassifyEightBytesForNativeStruct(SystemVStructRegisterPassingHelperPtr helperPtr, unsigned int nestingLevel, unsigned int startOffsetOfStruct) +{ + CONTRACTL + { + THROWS; + GC_TRIGGERS; + SO_TOLERANT; + MODE_ANY; + } + CONTRACTL_END; + +#ifdef DACCESS_COMPILE + // No register classification for this case. + return false; +#else // DACCESS_COMPILE + + if (!HasLayout()) + { + return false; + } + + const FieldMarshaler *pFieldMarshaler = GetLayoutInfo()->GetFieldMarshalers(); + UINT numIntroducedFields = GetLayoutInfo()->GetNumCTMFields(); + + // No fields. + if (numIntroducedFields == 0) + { + return false; + } + + // No struct register passing with explicit layout. There may be cases where explicit layout may be still + // eligible for register struct passing, but it is hard to tell the real intent. Make it simple and just + // unconditionally disable register struct passing for explicit layout. 
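+ // For example (illustration only): a struct declared with LayoutKind.Explicit always takes the early-out below + // and is passed on the stack, even when its fields would otherwise classify cleanly into eightbytes.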
+ if (GetClass()->HasExplicitFieldOffsetLayout()) + { + LOG((LF_JIT, LL_EVERYTHING, "%*s**** ClassifyEightBytesForNativeStruct: struct %s has explicit layout; will not be enregistered\n", + nestingLevel * 5, "", this->GetDebugClassName())); + return false; + } +#ifdef _DEBUG + LOG((LF_JIT, LL_EVERYTHING, "%*s**** Classify for native struct %s (%p), startOffset %d, total struct size %d\n", + nestingLevel * 5, "", this->GetDebugClassName(), this, startOffsetOfStruct, helperPtr->structSize)); + int fieldNum = -1; +#endif // _DEBUG + + while (numIntroducedFields--) + { +#ifdef _DEBUG + ++fieldNum; +#endif // _DEBUG + + FieldDesc *pField = pFieldMarshaler->GetFieldDesc(); + CorElementType fieldType = pField->GetFieldType(); + + // Invalid field type. + if (fieldType == ELEMENT_TYPE_END) + { + return false; + } + + DWORD fieldOffset = pFieldMarshaler->GetExternalOffset(); + unsigned normalizedFieldOffset = fieldOffset + startOffsetOfStruct; + + unsigned int fieldNativeSize = pFieldMarshaler->NativeSize(); + if (fieldNativeSize > SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES) + { + // Pass on stack in this case. + return false; + } + + _ASSERTE(fieldNativeSize != (unsigned int)-1); + + // The field can't span past the end of the struct. + if ((normalizedFieldOffset + fieldNativeSize) > helperPtr->structSize) + { + _ASSERTE(false && "Invalid native struct size. The size of fields and overall size don't agree"); + return false; + } + + SystemVClassificationType fieldClassificationType = SystemVClassificationTypeUnknown; + +#ifdef _DEBUG + LPCUTF8 fieldName; + pField->GetName_NoThrow(&fieldName); +#endif // _DEBUG + + // Some NStruct Field Types have extra information and require special handling + NStructFieldType cls = pFieldMarshaler->GetNStructFieldType(); + if (cls == NFT_FIXEDCHARARRAYANSI) + { + fieldClassificationType = SystemVClassificationTypeInteger; + } + else if (cls == NFT_FIXEDARRAY) + { + VARTYPE vtElement = ((FieldMarshaler_FixedArray*)pFieldMarshaler)->GetElementVT(); + switch (vtElement) + { + case VT_EMPTY: + case VT_NULL: + case VT_BOOL: + case VT_I1: + case VT_I2: + case VT_I4: + case VT_I8: + case VT_UI1: + case VT_UI2: + case VT_UI4: + case VT_UI8: + case VT_PTR: + case VT_INT: + case VT_UINT: + case VT_LPSTR: + case VT_LPWSTR: + fieldClassificationType = SystemVClassificationTypeInteger; + break; + case VT_R4: + case VT_R8: + fieldClassificationType = SystemVClassificationTypeSSE; + break; + case VT_DECIMAL: + case VT_DATE: + case VT_BSTR: + case VT_UNKNOWN: + case VT_DISPATCH: + case VT_SAFEARRAY: + case VT_ERROR: + case VT_HRESULT: + case VT_CARRAY: + case VT_USERDEFINED: + case VT_RECORD: + case VT_FILETIME: + case VT_BLOB: + case VT_STREAM: + case VT_STORAGE: + case VT_STREAMED_OBJECT: + case VT_STORED_OBJECT: + case VT_BLOB_OBJECT: + case VT_CF: + case VT_CLSID: + default: + // Not supported. + return false; + } + } +#ifdef FEATURE_COMINTEROP + else if (cls == NFT_INTERFACE) + { + // COMInterop not supported for CORECLR. + _ASSERTE(false && "COMInterop not supported for CORECLR."); + return false; + } +#ifdef FEATURE_CLASSIC_COMINTEROP + else if (cls == NFT_SAFEARRAY) + { + // COMInterop not supported for CORECLR. 
+ _ASSERTE(false && "COMInterop not supported for CORECLR."); + return false; + } +#endif // FEATURE_CLASSIC_COMINTEROP +#endif // FEATURE_COMINTEROP + else if (cls == NFT_NESTEDLAYOUTCLASS) + { + MethodTable* pFieldMT = ((FieldMarshaler_NestedLayoutClass*)pFieldMarshaler)->GetMethodTable(); + + bool inEmbeddedStructPrev = helperPtr->inEmbeddedStruct; + helperPtr->inEmbeddedStruct = true; + bool structRet = pFieldMT->ClassifyEightBytesForNativeStruct(helperPtr, nestingLevel + 1, normalizedFieldOffset); + helperPtr->inEmbeddedStruct = inEmbeddedStructPrev; + + if (!structRet) + { + // If the nested struct says not to enregister, there's no need to continue analyzing at this level. Just return do not enregister. + return false; + } + + continue; + } + else if (cls == NFT_NESTEDVALUECLASS) + { + MethodTable* pFieldMT = ((FieldMarshaler_NestedValueClass*)pFieldMarshaler)->GetMethodTable(); + + bool inEmbeddedStructPrev = helperPtr->inEmbeddedStruct; + helperPtr->inEmbeddedStruct = true; + bool structRet = pFieldMT->ClassifyEightBytesForNativeStruct(helperPtr, nestingLevel + 1, normalizedFieldOffset); + helperPtr->inEmbeddedStruct = inEmbeddedStructPrev; + + if (!structRet) + { + // If the nested struct says not to enregister, there's no need to continue analyzing at this level. Just return do not enregister. + return false; + } + + continue; + } + else if (cls == NFT_COPY1) + { + // The following CorElementTypes are the only ones handled with FieldMarshaler_Copy1. + switch (fieldType) + { + case ELEMENT_TYPE_I1: + fieldClassificationType = SystemVClassificationTypeInteger; + break; + + case ELEMENT_TYPE_U1: + fieldClassificationType = SystemVClassificationTypeInteger; + break; + + default: + // Invalid entry. + return false; // Pass on stack. + } + } + else if (cls == NFT_COPY2) + { + // The following CorElementTypes are the only ones handled with FieldMarshaler_Copy2. + switch (fieldType) + { + case ELEMENT_TYPE_CHAR: + fieldClassificationType = SystemVClassificationTypeInteger; + break; + + case ELEMENT_TYPE_I2: + fieldClassificationType = SystemVClassificationTypeInteger; + break; + + case ELEMENT_TYPE_U2: + fieldClassificationType = SystemVClassificationTypeInteger; + break; + + default: + // Invalid entry. + return false; // Pass on stack. + } + } + else if (cls == NFT_COPY4) + { + // The following CorElementTypes are the only ones handled with FieldMarshaler_Copy4. + switch (fieldType) + { + // At this point, ELEMENT_TYPE_I must be 4 bytes long. Same for ELEMENT_TYPE_U. + case ELEMENT_TYPE_I: + case ELEMENT_TYPE_I4: + fieldClassificationType = SystemVClassificationTypeInteger; + break; + + case ELEMENT_TYPE_U: + case ELEMENT_TYPE_U4: + fieldClassificationType = SystemVClassificationTypeInteger; + break; + + case ELEMENT_TYPE_R4: + fieldClassificationType = SystemVClassificationTypeSSE; + break; + + case ELEMENT_TYPE_PTR: + fieldClassificationType = SystemVClassificationTypeInteger; + break; + + default: + // Invalid entry. + return false; // Pass on stack. + } + } + else if (cls == NFT_COPY8) + { + // The following CorElementTypes are the only ones handled with FieldMarshaler_Copy8. + switch (fieldType) + { + // At this point, ELEMENT_TYPE_I must be 8 bytes long. Same for ELEMENT_TYPE_U. 
+ case ELEMENT_TYPE_I: + case ELEMENT_TYPE_I8: + fieldClassificationType = SystemVClassificationTypeInteger; + break; + + case ELEMENT_TYPE_U: + case ELEMENT_TYPE_U8: + fieldClassificationType = SystemVClassificationTypeInteger; + break; + + case ELEMENT_TYPE_R8: + fieldClassificationType = SystemVClassificationTypeSSE; + break; + + case ELEMENT_TYPE_PTR: + fieldClassificationType = SystemVClassificationTypeInteger; + break; + + default: + // Invalid entry. + return false; // Pass on stack. + } + } + else if (cls == NFT_FIXEDSTRINGUNI) + { + fieldClassificationType = SystemVClassificationTypeInteger; + } + else if (cls == NFT_FIXEDSTRINGANSI) + { + fieldClassificationType = SystemVClassificationTypeInteger; + } + else + { + // All other NStruct Field Types which do not require special handling. + switch (cls) + { +#ifdef FEATURE_COMINTEROP + case NFT_BSTR: + // COMInterop not supported for CORECLR. + _ASSERTE(false && "COMInterop not supported for CORECLR."); + return false; + case NFT_HSTRING: + // COMInterop not supported for CORECLR. + _ASSERTE(false && "COMInterop not supported for CORECLR."); + return false; +#endif // FEATURE_COMINTEROP + case NFT_STRINGUNI: + fieldClassificationType = SystemVClassificationTypeInteger; + break; + case NFT_STRINGANSI: + fieldClassificationType = SystemVClassificationTypeInteger; + break; + case NFT_DELEGATE: + return false; +#ifdef FEATURE_COMINTEROP + case NFT_VARIANT: + _ASSERTE(false && "COMInterop not supported for CORECLR."); + return false; +#endif // FEATURE_COMINTEROP + case NFT_ANSICHAR: + fieldClassificationType = SystemVClassificationTypeInteger; + break; + case NFT_WINBOOL: + fieldClassificationType = SystemVClassificationTypeInteger; + break; + case NFT_CBOOL: + fieldClassificationType = SystemVClassificationTypeInteger; + break; + case NFT_DECIMAL: + return false; + case NFT_DATE: + return false; +#ifdef FEATURE_COMINTEROP + case NFT_VARIANTBOOL: + _ASSERTE(false && "COMInterop not supported for CORECLR."); + return false; + case NFT_CURRENCY: + _ASSERTE(false && "COMInterop not supported for CORECLR."); + return false; +#endif // FEATURE_COMINTEROP + case NFT_ILLEGAL: + return false; + case NFT_SAFEHANDLE: + return false; + case NFT_CRITICALHANDLE: + return false; + default: + return false; + } + } + + if ((normalizedFieldOffset % fieldNativeSize) != 0) + { + // The spec requires that struct values on the stack from register passed fields expect + // those fields to be at their natural alignment. + + LOG((LF_JIT, LL_EVERYTHING, " %*sxxxx Native Field %d %s: offset %d (normalized %d), native size %d not at natural alignment; not enregistering struct\n", + nestingLevel * 5, "", fieldNum, fieldName, fieldOffset, normalizedFieldOffset, fieldNativeSize)); + return false; + } + + if ((int)normalizedFieldOffset <= helperPtr->largestFieldOffset) + { + // Find the field corresponding to this offset and update the size if needed. + // We assume that either it matches the offset of a previously seen field, or + // it is an out-of-order offset (the VM does give us structs in non-increasing + // offset order sometimes) that doesn't overlap any other field.
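+ // For example (illustration only): if fields arrive at offsets 8, 0, 4, the offset-0 and offset-4 fields are seen + // after largestFieldOffset is already 8; the loop below then either matches them to a previously recorded offset + // (a union member) or verifies that they do not overlap any recorded field.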
+ + int i; + for (i = helperPtr->currentUniqueOffsetField - 1; i >= 0; i--) + { + if (helperPtr->fieldOffsets[i] == normalizedFieldOffset) + { + if (fieldNativeSize > helperPtr->fieldSizes[i]) + { + helperPtr->fieldSizes[i] = fieldNativeSize; + } + + helperPtr->fieldClassifications[i] = ReClassifyField(helperPtr->fieldClassifications[i], fieldClassificationType); + + LOG((LF_JIT, LL_EVERYTHING, " %*sxxxx Native Field %d %s: offset %d (normalized %d), native size %d, union with uniqueOffsetField %d, field type classification %s, reclassified field to %s\n", + nestingLevel * 5, "", fieldNum, fieldName, fieldOffset, normalizedFieldOffset, fieldNativeSize, i, + GetSystemVClassificationTypeName(fieldClassificationType), + GetSystemVClassificationTypeName(helperPtr->fieldClassifications[i]))); + + break; + } + // Make sure the field doesn't start in the middle of another field. + _ASSERTE((normalizedFieldOffset < helperPtr->fieldOffsets[i]) || + (normalizedFieldOffset >= helperPtr->fieldOffsets[i] + helperPtr->fieldSizes[i])); + } + + if (i >= 0) + { + // The proper size of the union set of fields has been set above; continue to the next field. + continue; + } + } + else + { + helperPtr->largestFieldOffset = (int)normalizedFieldOffset; + } + + // Set the data for a new field. + + // The new field classification must not have been initialized yet. + _ASSERTE(helperPtr->fieldClassifications[helperPtr->currentUniqueOffsetField] == SystemVClassificationTypeNoClass); + + // There are only a few field classifications that are allowed. + _ASSERTE((fieldClassificationType == SystemVClassificationTypeInteger) || + (fieldClassificationType == SystemVClassificationTypeIntegerReference) || + (fieldClassificationType == SystemVClassificationTypeSSE)); + + helperPtr->fieldClassifications[helperPtr->currentUniqueOffsetField] = fieldClassificationType; + helperPtr->fieldSizes[helperPtr->currentUniqueOffsetField] = fieldNativeSize; + helperPtr->fieldOffsets[helperPtr->currentUniqueOffsetField] = normalizedFieldOffset; + + LOG((LF_JIT, LL_EVERYTHING, " %*s**** Native Field %d %s: offset %d (normalized %d), size %d, currentUniqueOffsetField %d, field type classification %s, chosen field classification %s\n", + nestingLevel * 5, "", fieldNum, fieldName, fieldOffset, normalizedFieldOffset, fieldNativeSize, helperPtr->currentUniqueOffsetField, + GetSystemVClassificationTypeName(fieldClassificationType), + GetSystemVClassificationTypeName(helperPtr->fieldClassifications[helperPtr->currentUniqueOffsetField]))); + + helperPtr->currentUniqueOffsetField++; + ((BYTE*&)pFieldMarshaler) += MAXFIELDMARSHALERSIZE; + _ASSERTE(helperPtr->currentUniqueOffsetField < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT); + + } // end per-field for loop + + if (!helperPtr->inEmbeddedStruct) + { + _ASSERTE(nestingLevel == 0); + + // We're at the top level of the recursion, and we're done looking at the fields. + // Now sort the fields by offset and set the output data. + + int sortedFieldOrder[SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT]; + for (unsigned i = 0; i < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT; i++) + { + sortedFieldOrder[i] = -1; + } + + for (unsigned i = 0; i < helperPtr->currentUniqueOffsetField; i++) + { + _ASSERTE(helperPtr->fieldOffsets[i] < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT); + _ASSERTE(sortedFieldOrder[helperPtr->fieldOffsets[i]] == -1); // we haven't seen this field offset yet. 
+ sortedFieldOrder[helperPtr->fieldOffsets[i]] = i; + } + + // Set the layoutSizes (includes holes from alignment of the fields.) + int lastField = -1; + for (unsigned i = 0; i < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT; i++) + { + int ordinal = sortedFieldOrder[i]; + if (ordinal == -1) + { + continue; + } + + if (lastField == -1) + { + lastField = ordinal; + continue; + } + + helperPtr->fieldLayoutSizes[lastField] = helperPtr->fieldOffsets[ordinal] - helperPtr->fieldOffsets[lastField]; + + lastField = ordinal; + } + // Now the last field + _ASSERTE(lastField != -1); // if lastField==-1, then the struct has no fields! + helperPtr->fieldLayoutSizes[lastField] = helperPtr->structSize - helperPtr->fieldOffsets[lastField]; + + // Calculate the eightbytes and their types. + unsigned int accumulatedSizeForEightByte = 0; + unsigned int lastEightByteOffset = 0; + unsigned int currentEightByte = 0; + + for (unsigned i = 0; i < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT; i++) + { + int ordinal = sortedFieldOrder[i]; + if (ordinal == -1) + { + continue; + } + + if ((accumulatedSizeForEightByte + helperPtr->fieldLayoutSizes[ordinal]) > SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES) + { + // Save data for this eightbyte. + helperPtr->eightByteSizes[currentEightByte] = accumulatedSizeForEightByte; + helperPtr->eightByteOffsets[currentEightByte] = lastEightByteOffset; + + // Set up for next eightbyte. + currentEightByte++; + _ASSERTE(currentEightByte < CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS); + + lastEightByteOffset = helperPtr->fieldOffsets[ordinal]; + accumulatedSizeForEightByte = 0; + } + + accumulatedSizeForEightByte += helperPtr->fieldLayoutSizes[ordinal]; + + _ASSERTE(helperPtr->fieldClassifications[ordinal] != SystemVClassificationTypeMemory); + + if (helperPtr->eightByteClassifications[currentEightByte] == helperPtr->fieldClassifications[ordinal]) + { + // Do nothing. The eight-byte is already classified. 
+ } + else if (helperPtr->eightByteClassifications[currentEightByte] == SystemVClassificationTypeNoClass) + { + helperPtr->eightByteClassifications[currentEightByte] = helperPtr->fieldClassifications[ordinal]; + } + else if ((helperPtr->eightByteClassifications[currentEightByte] == SystemVClassificationTypeInteger) || + (helperPtr->fieldClassifications[ordinal] == SystemVClassificationTypeInteger)) + { + _ASSERTE(helperPtr->fieldClassifications[ordinal] != SystemVClassificationTypeIntegerReference); + helperPtr->eightByteClassifications[currentEightByte] = SystemVClassificationTypeInteger; + } + else if ((helperPtr->eightByteClassifications[currentEightByte] == SystemVClassificationTypeIntegerReference) || + (helperPtr->fieldClassifications[ordinal] == SystemVClassificationTypeIntegerReference)) + { + helperPtr->eightByteClassifications[currentEightByte] = SystemVClassificationTypeIntegerReference; + } + else + { + helperPtr->eightByteClassifications[currentEightByte] = SystemVClassificationTypeSSE; + } + } + + helperPtr->eightByteCount = currentEightByte + 1; + helperPtr->eightByteSizes[currentEightByte] = accumulatedSizeForEightByte; + helperPtr->eightByteOffsets[currentEightByte] = lastEightByteOffset; + _ASSERTE(helperPtr->eightByteCount <= CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS); + +#ifdef _DEBUG + LOG((LF_JIT, LL_EVERYTHING, " ----\n")); + LOG((LF_JIT, LL_EVERYTHING, " **** Number EightBytes: %d\n", helperPtr->eightByteCount)); + for (unsigned i = 0; i < helperPtr->eightByteCount; i++) + { + LOG((LF_JIT, LL_EVERYTHING, " **** eightByte %d -- classType: %s, eightByteOffset: %d, eightByteSize: %d\n", + i, GetSystemVClassificationTypeName(helperPtr->eightByteClassifications[i]), helperPtr->eightByteOffsets[i], helperPtr->eightByteSizes[i])); + } +#endif // _DEBUG + } + + return true; +#endif // DACCESS_COMPILE +} + +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF) + #if !defined(DACCESS_COMPILE) && !defined(CROSSGEN_COMPILE) //========================================================================================== void MethodTable::AllocateRegularStaticBoxes() @@ -2643,7 +3556,7 @@ void MethodTable::DoRunClassInitThrowing() } description = ".cctor lock"; -#if _DEBUG +#ifdef _DEBUG description = GetDebugClassName(); #endif diff --git a/src/vm/methodtable.h b/src/vm/methodtable.h index 8e6a59b6b3..e4aecf3140 100644 --- a/src/vm/methodtable.h +++ b/src/vm/methodtable.h @@ -53,7 +53,6 @@ class FCallMethodDesc; class EEClass; class EnCFieldDesc; class FieldDesc; -class FieldMarshaler; class JIT_TrialAlloc; struct LayoutRawFieldInfo; class MetaSig; @@ -80,6 +79,7 @@ class ComCallWrapperTemplate; #ifdef FEATURE_COMINTEROP_UNMANAGED_ACTIVATION class ClassFactoryBase; #endif // FEATURE_COMINTEROP_UNMANAGED_ACTIVATION +class ArgDestination; //============================================================================ // This is the in-memory structure of a class and it will evolve. 
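The if/else ladder that closes ClassifyEightBytes above is the complete rule for merging two classifications that land in the same eightbyte. Restated as a free-standing function (a sketch with illustrative names, not the CoreCLR definitions):

enum Class { NoClass, Integer, IntegerReference, SSE };

// Merge one more field's classification into the running classification of
// the eightbyte that contains it, mirroring the ladder above.
Class Merge(Class eightByte, Class field)
{
    if (field == eightByte)
        return eightByte;           // already classified the same way
    if (eightByte == NoClass)
        return field;               // first field seen for this eightbyte
    if (eightByte == Integer || field == Integer)
        return Integer;             // integer beats SSE (the code asserts a reference never merges with a plain integer)
    if (eightByte == IntegerReference || field == IntegerReference)
        return IntegerReference;    // a GC reference keeps the eightbyte integer-like
    return SSE;                     // only SSE classifications remain
}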
@@ -625,6 +625,112 @@ public: typedef DPTR(MethodTableWriteableData) PTR_MethodTableWriteableData; typedef DPTR(MethodTableWriteableData const) PTR_Const_MethodTableWriteableData; +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF +inline +SystemVClassificationType CorInfoType2UnixAmd64Classification(CorElementType eeType) +{ + static const SystemVClassificationType toSystemVAmd64ClassificationTypeMap[] = { + SystemVClassificationTypeUnknown, // ELEMENT_TYPE_END + SystemVClassificationTypeUnknown, // ELEMENT_TYPE_VOID + SystemVClassificationTypeInteger, // ELEMENT_TYPE_BOOLEAN + SystemVClassificationTypeInteger, // ELEMENT_TYPE_CHAR + SystemVClassificationTypeInteger, // ELEMENT_TYPE_I1 + SystemVClassificationTypeInteger, // ELEMENT_TYPE_U1 + SystemVClassificationTypeInteger, // ELEMENT_TYPE_I2 + SystemVClassificationTypeInteger, // ELEMENT_TYPE_U2 + SystemVClassificationTypeInteger, // ELEMENT_TYPE_I4 + SystemVClassificationTypeInteger, // ELEMENT_TYPE_U4 + SystemVClassificationTypeInteger, // ELEMENT_TYPE_I8 + SystemVClassificationTypeInteger, // ELEMENT_TYPE_U8 + SystemVClassificationTypeSSE, // ELEMENT_TYPE_R4 + SystemVClassificationTypeSSE, // ELEMENT_TYPE_R8 + SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_STRING + SystemVClassificationTypeInteger, // ELEMENT_TYPE_PTR + SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_BYREF + SystemVClassificationTypeStruct, // ELEMENT_TYPE_VALUETYPE + SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_CLASS + SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_VAR - (type variable) + SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_ARRAY + SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_GENERICINST + SystemVClassificationTypeStruct, // ELEMENT_TYPE_TYPEDBYREF + SystemVClassificationTypeUnknown, // ELEMENT_TYPE_VALUEARRAY_UNSUPPORTED + SystemVClassificationTypeInteger, // ELEMENT_TYPE_I + SystemVClassificationTypeInteger, // ELEMENT_TYPE_U + SystemVClassificationTypeUnknown, // ELEMENT_TYPE_R_UNSUPPORTED + + // put the correct type when we know our implementation + SystemVClassificationTypeInteger, // ELEMENT_TYPE_FNPTR + SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_OBJECT + SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_SZARRAY + SystemVClassificationTypeIntegerReference, // ELEMENT_TYPE_MVAR + + SystemVClassificationTypeUnknown, // ELEMENT_TYPE_CMOD_REQD + SystemVClassificationTypeUnknown, // ELEMENT_TYPE_CMOD_OPT + SystemVClassificationTypeUnknown, // ELEMENT_TYPE_INTERNAL + }; + + _ASSERTE(sizeof(toSystemVAmd64ClassificationTypeMap) == ELEMENT_TYPE_MAX); + _ASSERTE(eeType < (CorElementType) sizeof(toSystemVAmd64ClassificationTypeMap)); + // spot check of the map + _ASSERTE((SystemVClassificationType)toSystemVAmd64ClassificationTypeMap[ELEMENT_TYPE_I4] == SystemVClassificationTypeInteger); + _ASSERTE((SystemVClassificationType)toSystemVAmd64ClassificationTypeMap[ELEMENT_TYPE_PTR] == SystemVClassificationTypeInteger); + _ASSERTE((SystemVClassificationType)toSystemVAmd64ClassificationTypeMap[ELEMENT_TYPE_TYPEDBYREF] == SystemVClassificationTypeStruct); + + return (((int)eeType) < ELEMENT_TYPE_MAX) ? (toSystemVAmd64ClassificationTypeMap[eeType]) : SystemVClassificationTypeUnknown; +}; + +#define SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES 8 // Size of an eightbyte in bytes. 
+#define SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT 16 // Maximum number of fields in struct passed in registers + +struct SystemVStructRegisterPassingHelper +{ + SystemVStructRegisterPassingHelper(unsigned int totalStructSize) : + structSize(totalStructSize), + eightByteCount(0), + inEmbeddedStruct(false), + currentUniqueOffsetField(0), + largestFieldOffset(-1) + { + for (int i = 0; i < CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS; i++) + { + eightByteClassifications[i] = SystemVClassificationTypeNoClass; + eightByteSizes[i] = 0; + eightByteOffsets[i] = 0; + } + + // Initialize the work arrays + for (int i = 0; i < SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT; i++) + { + fieldClassifications[i] = SystemVClassificationTypeNoClass; + fieldSizes[i] = 0; + fieldLayoutSizes[i] = 0; + fieldOffsets[i] = 0; + } + } + + // Input state. + unsigned int structSize; + + // These fields are the output; these are what is computed by the classification algorithm. + unsigned int eightByteCount; + SystemVClassificationType eightByteClassifications[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS]; + unsigned int eightByteSizes[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS]; + unsigned int eightByteOffsets[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS]; + + // Helper members to track state. + bool inEmbeddedStruct; + unsigned int currentUniqueOffsetField; // A virtual field that could encompass many overlapping fields. + int largestFieldOffset; + SystemVClassificationType fieldClassifications[SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT]; + unsigned int fieldSizes[SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT]; + unsigned int fieldLayoutSizes[SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT]; + unsigned int fieldOffsets[SYSTEMV_MAX_NUM_FIELDS_IN_REGISTER_PASSED_STRUCT]; +}; + +typedef DPTR(SystemVStructRegisterPassingHelper) SystemVStructRegisterPassingHelperPtr; + +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF + //=============================================================================================== // // GC data appears before the beginning of the MethodTable @@ -941,6 +1047,16 @@ public: // during object construction. void CheckRunClassInitAsIfConstructingThrowing(); +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF) + // Helper function for ClassifyEightBytes + static SystemVClassificationType ReClassifyField(SystemVClassificationType originalClassification, SystemVClassificationType newFieldClassification); + + // Builds the internal data structures and classifies struct eightbytes for Amd System V calling convention. 
+ bool ClassifyEightBytes(SystemVStructRegisterPassingHelperPtr helperPtr, unsigned int nestingLevel, unsigned int startOffsetOfStruct); + bool ClassifyEightBytesForNativeStruct(SystemVStructRegisterPassingHelperPtr helperPtr, unsigned int nestingLevel, unsigned int startOffsetOfStruct); + +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF) + // Copy m_dwFlags from another method table void CopyFlags(MethodTable * pOldMT) { @@ -1929,7 +2045,7 @@ public: SetFlag(enum_flag_HasPreciseInitCctors); } -#ifdef FEATURE_HFA +#if defined(FEATURE_HFA) inline bool IsHFA() { LIMITED_METHOD_CONTRACT; @@ -1941,6 +2057,23 @@ public: LIMITED_METHOD_CONTRACT; SetFlag(enum_flag_IsHFA); } +#endif // FEATURE_HFA + +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF) + inline bool IsRegPassedStruct() + { + LIMITED_METHOD_CONTRACT; + return !!GetFlag(enum_flag_IsRegStructPassed); + } + + inline void SetRegPassedStruct() + { + LIMITED_METHOD_CONTRACT; + SetFlag(enum_flag_IsRegStructPassed); + } +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF) + +#ifdef FEATURE_HFA CorElementType GetHFAType(); @@ -2642,6 +2775,7 @@ public: OBJECTREF FastBox(void** data); #ifndef DACCESS_COMPILE BOOL UnBoxInto(void *dest, OBJECTREF src); + BOOL UnBoxIntoArg(ArgDestination *argDest, OBJECTREF src); void UnBoxIntoUnchecked(void *dest, OBJECTREF src); #endif @@ -3775,7 +3909,19 @@ private: enum_flag_HasDefaultCtor = 0x00000200, enum_flag_HasPreciseInitCctors = 0x00000400, // Do we need to run class constructors at allocation time? (Not perf important, could be moved to EEClass +#if defined(FEATURE_HFA) +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF) +#error Can't define both FEATURE_HFA and FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF +#endif enum_flag_IsHFA = 0x00000800, // This type is an HFA (Homogenous Floating-point Aggregate) +#endif // FEATURE_HFA + +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF) +#if defined(FEATURE_HFA) +#error Can't define both FEATURE_HFA and FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF +#endif + enum_flag_IsRegStructPassed = 0x00000800, // This type is a System V register passed struct. +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF // In a perfect world we would fill these flags using other flags that we already have // which have a constant value for something which has a component size. diff --git a/src/vm/methodtable.inl b/src/vm/methodtable.inl index a993556db6..aa07eea9d1 100644 --- a/src/vm/methodtable.inl +++ b/src/vm/methodtable.inl @@ -1716,6 +1716,32 @@ inline BOOL MethodTable::UnBoxInto(void *dest, OBJECTREF src) } //========================================================================================== +// unbox src into argument, making sure src is of the correct type. 
+
+inline BOOL MethodTable::UnBoxIntoArg(ArgDestination *argDest, OBJECTREF src)
+{
+ CONTRACTL
+ {
+ NOTHROW;
+ GC_NOTRIGGER;
+ SO_TOLERANT;
+ MODE_COOPERATIVE;
+ }
+ CONTRACTL_END;
+
+ if (Nullable::IsNullableType(TypeHandle(this)))
+ return Nullable::UnBoxIntoArgNoGC(argDest, src, this);
+ else
+ {
+ if (src == NULL || src->GetMethodTable() != this)
+ return FALSE;
+
+ CopyValueClassArg(argDest, src->UnBox(), this, src->GetAppDomain(), 0);
+ }
+ return TRUE;
+}
+
+//==========================================================================================
 // unbox src into dest, No checks are done
 
 inline void MethodTable::UnBoxIntoUnchecked(void *dest, OBJECTREF src)
 
diff --git a/src/vm/methodtablebuilder.cpp b/src/vm/methodtablebuilder.cpp
index e1d2dbb2e5..0e3cb45675 100644
--- a/src/vm/methodtablebuilder.cpp
+++ b/src/vm/methodtablebuilder.cpp
@@ -1897,8 +1897,23 @@ MethodTableBuilder::BuildMethodTableThrowing(
 #ifdef FEATURE_HFA
         CheckForHFA(pByValueClassCache);
 #endif
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+#ifdef FEATURE_HFA
+#error Can't have FEATURE_HFA and FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF defined at the same time.
+#endif // FEATURE_HFA
+ SystemVAmd64CheckForPassStructInRegister();
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
 }
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+#ifdef FEATURE_HFA
+#error Can't have FEATURE_HFA and FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF defined at the same time.
+#endif // FEATURE_HFA
+ if (HasLayout())
+ {
+ SystemVAmd64CheckForPassNativeStructInRegister();
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
 #ifdef FEATURE_HFA
 if (HasLayout())
 {
@@ -8429,6 +8444,93 @@ DWORD MethodTableBuilder::GetFieldSize(FieldDesc *pFD)
 return (1 << (DWORD)(DWORD_PTR&)(pFD->m_pMTOfEnclosingClass));
 }
 
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+// Checks whether the struct is enregisterable.
+void MethodTableBuilder::SystemVAmd64CheckForPassStructInRegister()
+{
+ STANDARD_VM_CONTRACT;
+
+ // This method should be called for valuetypes only
+ _ASSERTE(IsValueClass());
+
+ TypeHandle th(GetHalfBakedMethodTable());
+
+ if (th.IsTypeDesc())
+ {
+ // Not an enregisterable managed structure.
+ return;
+ }
+
+ DWORD totalStructSize = bmtFP->NumInstanceFieldBytes;
+
+ // If the total size of the fields is bigger than CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS,
+ // the struct is passed on the stack.
+ if (totalStructSize > CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS)
+ {
+ LOG((LF_JIT, LL_EVERYTHING, "**** SystemVAmd64CheckForPassStructInRegister: struct %s is too big to pass in registers (%d bytes)\n",
+ this->GetDebugClassName(), totalStructSize));
+ return;
+ }
+
+ // Iterate through the fields and make sure they meet requirements to pass in registers
+ SystemVStructRegisterPassingHelper helper((unsigned int)totalStructSize);
+
+ if (GetHalfBakedMethodTable()->ClassifyEightBytes(&helper, 0, 0))
+ {
+ // All the above tests passed. It's a register-passed struct!
+ GetHalfBakedMethodTable()->SetRegPassedStruct();
+
+ StoreEightByteClassification(&helper);
+ }
+}
+
+// Checks whether the struct is enregisterable.
+void MethodTableBuilder::SystemVAmd64CheckForPassNativeStructInRegister()
+{
+ STANDARD_VM_CONTRACT;
+ DWORD totalStructSize = 0;
+
+ // If not a native value type, return.
+ if (!IsValueClass())
+ {
+ return;
+ }
+
+ totalStructSize = GetLayoutInfo()->GetNativeSize();
+
+ // If the total size of the fields is bigger than CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS,
+ // the struct is passed on the stack.
+ if (totalStructSize > CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS)
+ {
+ LOG((LF_JIT, LL_EVERYTHING, "**** SystemVAmd64CheckForPassNativeStructInRegister: struct %s is too big to pass in registers (%d bytes)\n",
+ this->GetDebugClassName(), totalStructSize));
+ return;
+ }
+
+ _ASSERTE(HasLayout());
+
+ // Classify the native layout for this struct.
+
+ // Iterate through the fields and make sure they meet requirements to pass in registers
+ SystemVStructRegisterPassingHelper helper((unsigned int)totalStructSize);
+ if (GetHalfBakedMethodTable()->ClassifyEightBytesForNativeStruct(&helper, 0, 0))
+ {
+ GetLayoutInfo()->SetNativeStructPassedInRegisters();
+ }
+}
+
+// Store the eightbyte classification into the EEClass
+void MethodTableBuilder::StoreEightByteClassification(SystemVStructRegisterPassingHelper* helper)
+{
+ EEClass* eeClass = GetHalfBakedMethodTable()->GetClass();
+ LoaderAllocator* pAllocator = MethodTableBuilder::GetLoaderAllocator();
+ AllocMemTracker* pamTracker = MethodTableBuilder::GetMemTracker();
+ EnsureOptionalFieldsAreAllocated(eeClass, pamTracker, pAllocator->GetLowFrequencyHeap());
+ eeClass->SetEightByteClassification(helper->eightByteCount, helper->eightByteClassifications, helper->eightByteSizes);
+}
+
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+
 #ifdef FEATURE_HFA
 //---------------------------------------------------------------------------------------
 //
diff --git a/src/vm/methodtablebuilder.h b/src/vm/methodtablebuilder.h
index bc543c1bf8..10ba278535 100644
--- a/src/vm/methodtablebuilder.h
+++ b/src/vm/methodtablebuilder.h
@@ -2980,6 +2980,15 @@ private:
 VOID CheckForNativeHFA();
 
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+ // Checks whether the struct is enregisterable.
+ void SystemVAmd64CheckForPassStructInRegister();
+ void SystemVAmd64CheckForPassNativeStructInRegister();
+ // Store the eightbyte classification into the EEClass
+ void StoreEightByteClassification(SystemVStructRegisterPassingHelper* helper);
+
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING_ITF
+
 // this accesses the field size which is temporarily stored in m_pMTOfEnclosingClass
 // during class loading. Don't use any other time
 DWORD GetFieldSize(FieldDesc *pFD);
diff --git a/src/vm/object.cpp b/src/vm/object.cpp
index 3b07a12543..25a7109905 100644
--- a/src/vm/object.cpp
+++ b/src/vm/object.cpp
@@ -24,6 +24,7 @@
 #endif
 #include "field.h"
 #include "gcscan.h"
+#include "argdestination.h"
 
 #ifdef FEATURE_COMPRESSEDSTACK
 void* CompressedStackObject::GetUnmanagedCompressedStack()
@@ -1498,6 +1499,31 @@ void CopyValueClassChecked(void* dest, void* src, MethodTable *pMT, AppDomain *p
 EX_END_CATCH(SwallowAllExceptions);
 CopyValueClassUnchecked(dest,src,pMT);
 }
+
+// Copy value class into the argument specified by the argDest, performing an appdomain check first.
+// The destOffset is nonzero when copying values into Nullable<T>, it is the offset +// of the T value inside of the Nullable<T> +void CopyValueClassArgChecked(ArgDestination *argDest, void* src, MethodTable *pMT, AppDomain *pDomain, int destOffset) +{ + STATIC_CONTRACT_DEBUG_ONLY; + STATIC_CONTRACT_NOTHROW; + STATIC_CONTRACT_GC_NOTRIGGER; + STATIC_CONTRACT_FORBID_FAULT; + STATIC_CONTRACT_MODE_COOPERATIVE; + + DEBUG_ONLY_FUNCTION; + + FAULT_NOT_FATAL(); + EX_TRY + { + Object::AssignValueTypeAppDomain(pMT, src, pDomain); + } + EX_CATCH + { + } + EX_END_CATCH(SwallowAllExceptions); + CopyValueClassArgUnchecked(argDest, src, pMT, destOffset); +} #endif void STDCALL CopyValueClassUnchecked(void* dest, void* src, MethodTable *pMT) @@ -1563,6 +1589,51 @@ void STDCALL CopyValueClassUnchecked(void* dest, void* src, MethodTable *pMT) } } +// Copy value class into the argument specified by the argDest. +// The destOffset is nonzero when copying values into Nullable<T>, it is the offset +// of the T value inside of the Nullable<T> +void STDCALL CopyValueClassArgUnchecked(ArgDestination *argDest, void* src, MethodTable *pMT, int destOffset) +{ + STATIC_CONTRACT_NOTHROW; + STATIC_CONTRACT_GC_NOTRIGGER; + STATIC_CONTRACT_FORBID_FAULT; + STATIC_CONTRACT_MODE_COOPERATIVE; + +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + + if (argDest->IsStructPassedInRegs()) + { + argDest->CopyStructToRegisters(src, pMT->GetNumInstanceFieldBytes(), destOffset); + return; + } + +#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING + // destOffset is only valid for Nullable<T> passed in registers + _ASSERTE(destOffset == 0); + + CopyValueClassUnchecked(argDest->GetDestinationAddress(), src, pMT); +} + +// Initialize the value class argument to zeros +void InitValueClassArg(ArgDestination *argDest, MethodTable *pMT) +{ + STATIC_CONTRACT_NOTHROW; + STATIC_CONTRACT_GC_NOTRIGGER; + STATIC_CONTRACT_FORBID_FAULT; + STATIC_CONTRACT_MODE_COOPERATIVE; + +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + + if (argDest->IsStructPassedInRegs()) + { + argDest->ZeroStructInRegisters(pMT->GetNumInstanceFieldBytes()); + return; + } + +#endif + InitValueClass(argDest->GetDestinationAddress(), pMT); +} + #if defined (VERIFY_HEAP) #include "dbginterface.h" @@ -3245,7 +3316,7 @@ BOOL Nullable::UnBox(void* destPtr, OBJECTREF boxedVal, MethodTable* destMT) if (boxedVal == NULL) { - // logicall we are doing *dest->HasValueAddr(destMT) = false; + // Logically we are doing *dest->HasValueAddr(destMT) = false; // We zero out the whole structure becasue it may contain GC references // and these need to be initialized to zero. (could optimize in the non-GC case) InitValueClass(destPtr, destMT); @@ -3302,7 +3373,7 @@ BOOL Nullable::UnBoxNoGC(void* destPtr, OBJECTREF boxedVal, MethodTable* destMT) if (boxedVal == NULL) { - // logicall we are doing *dest->HasValueAddr(destMT) = false; + // Logically we are doing *dest->HasValueAddr(destMT) = false; // We zero out the whole structure becasue it may contain GC references // and these need to be initialized to zero. (could optimize in the non-GC case) InitValueClass(destPtr, destMT); @@ -3328,6 +3399,64 @@ BOOL Nullable::UnBoxNoGC(void* destPtr, OBJECTREF boxedVal, MethodTable* destMT) } //=============================================================================== +// Special Logic to unbox a boxed T as a nullable<T> into an argument +// specified by the argDest. 
+// Does not handle type equivalence (may conservatively return FALSE)
+BOOL Nullable::UnBoxIntoArgNoGC(ArgDestination *argDest, OBJECTREF boxedVal, MethodTable* destMT)
+{
+ CONTRACTL
+ {
+ NOTHROW;
+ GC_NOTRIGGER;
+ MODE_COOPERATIVE;
+ SO_TOLERANT;
+ }
+ CONTRACTL_END;
+
+#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (argDest->IsStructPassedInRegs())
+ {
+ // We should only get here if we are unboxing a T as a Nullable<T>
+ _ASSERTE(IsNullableType(destMT));
+
+ // We better have a concrete instantiation, or our field offset asserts are not useful
+ _ASSERTE(!destMT->ContainsGenericVariables());
+
+ if (boxedVal == NULL)
+ {
+ // Logically we are doing *dest->HasValueAddr(destMT) = false;
+ // We zero out the whole structure because it may contain GC references
+ // and these need to be initialized to zero. (could optimize in the non-GC case)
+ InitValueClassArg(argDest, destMT);
+ }
+ else
+ {
+ if (!IsNullableForTypeNoGC(destMT, boxedVal->GetMethodTable()))
+ {
+ // For safety's sake, also allow true nullables to be unboxed normally.
+ // This should not happen normally, but we want to be robust.
+ if (destMT == boxedVal->GetMethodTable())
+ {
+ CopyValueClassArg(argDest, boxedVal->GetData(), destMT, boxedVal->GetAppDomain(), 0);
+ return TRUE;
+ }
+ return FALSE;
+ }
+
+ Nullable* dest = (Nullable*)argDest->GetStructGenRegDestinationAddress();
+ *dest->HasValueAddr(destMT) = true;
+ int destOffset = (BYTE*)dest->ValueAddr(destMT) - (BYTE*)dest;
+ CopyValueClassArg(argDest, boxedVal->UnBox(), boxedVal->GetMethodTable(), boxedVal->GetAppDomain(), destOffset);
+ }
+ return TRUE;
+ }
+
+#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+ return UnBoxNoGC(argDest->GetDestinationAddress(), boxedVal, destMT);
+}
+
+//===============================================================================
 // Special Logic to unbox a boxed T as a nullable<T>
 // Does not do any type checks.
 void Nullable::UnBoxNoCheck(void* destPtr, OBJECTREF boxedVal, MethodTable* destMT)
@@ -3350,7 +3479,7 @@ void Nullable::UnBoxNoCheck(void* destPtr, OBJECTREF boxedVal, MethodTable* dest
 
     if (boxedVal == NULL) 
     {
-        // logicall we are doing *dest->HasValueAddr(destMT) = false;
+        // Logically we are doing *dest->HasValueAddr(destMT) = false;
         // We zero out the whole structure becasue it may contain GC references 
         // and these need to be initialized to zero. 
(could optimize in the non-GC case) InitValueClass(destPtr, destMT); diff --git a/src/vm/object.h b/src/vm/object.h index abf15fa591..5808e6c0eb 100644 --- a/src/vm/object.h +++ b/src/vm/object.h @@ -94,6 +94,8 @@ class CtxStaticData; class DomainAssembly; class AssemblyNative; class WaitHandleNative; +class ArgDestination; + struct RCW; #if CHECK_APP_DOMAIN_LEAKS @@ -702,6 +704,7 @@ inline void ClearObjectReference(OBJECTREF* dst) // CopyValueClass sets a value class field void STDCALL CopyValueClassUnchecked(void* dest, void* src, MethodTable *pMT); +void STDCALL CopyValueClassArgUnchecked(ArgDestination *argDest, void* src, MethodTable *pMT, int destOffset); inline void InitValueClass(void *dest, MethodTable *pMT) { @@ -709,18 +712,24 @@ inline void InitValueClass(void *dest, MethodTable *pMT) ZeroMemoryInGCHeap(dest, pMT->GetNumInstanceFieldBytes()); } +// Initialize value class argument +void InitValueClassArg(ArgDestination *argDest, MethodTable *pMT); + #if CHECK_APP_DOMAIN_LEAKS void SetObjectReferenceChecked(OBJECTREF *dst,OBJECTREF ref, AppDomain *pAppDomain); void CopyValueClassChecked(void* dest, void* src, MethodTable *pMT, AppDomain *pAppDomain); +void CopyValueClassArgChecked(ArgDestination *argDest, void* src, MethodTable *pMT, AppDomain *pAppDomain, int destOffset); #define SetObjectReference(_d,_r,_a) SetObjectReferenceChecked(_d, _r, _a) #define CopyValueClass(_d,_s,_m,_a) CopyValueClassChecked(_d,_s,_m,_a) +#define CopyValueClassArg(_d,_s,_m,_a,_o) CopyValueClassArgChecked(_d,_s,_m,_a,_o) #else #define SetObjectReference(_d,_r,_a) SetObjectReferenceUnchecked(_d, _r) #define CopyValueClass(_d,_s,_m,_a) CopyValueClassUnchecked(_d,_s,_m) +#define CopyValueClassArg(_d,_s,_m,_a,_o) CopyValueClassArgUnchecked(_d,_s,_m,_o) #endif @@ -4649,6 +4658,7 @@ public: static OBJECTREF Box(void* src, MethodTable* nullable); static BOOL UnBox(void* dest, OBJECTREF boxedVal, MethodTable* destMT); static BOOL UnBoxNoGC(void* dest, OBJECTREF boxedVal, MethodTable* destMT); + static BOOL UnBoxIntoArgNoGC(ArgDestination *argDest, OBJECTREF boxedVal, MethodTable* destMT); static void UnBoxNoCheck(void* dest, OBJECTREF boxedVal, MethodTable* destMT); static OBJECTREF BoxedNullableNull(TypeHandle nullableType) { return 0; } diff --git a/src/vm/reflectioninvocation.cpp b/src/vm/reflectioninvocation.cpp index 777b120ad4..d3a3125ed0 100644 --- a/src/vm/reflectioninvocation.cpp +++ b/src/vm/reflectioninvocation.cpp @@ -34,6 +34,7 @@ #endif #include "dbginterface.h" +#include "argdestination.h" // these flags are defined in XXXInfo.cs and only those that are used are replicated here #define INVOCATION_FLAGS_UNKNOWN 0x00000000 @@ -1578,7 +1579,7 @@ FCIMPL4(Object*, RuntimeMethodHandle::InvokeMethod, TypeHandle th = gc.pSig->GetArgumentAt(i); - int ofs = argit.GetNextOffset(); + int ofs = argit.GetNextOffset(); _ASSERTE(ofs != TransitionBlock::InvalidOffset); #ifdef CALLDESCR_REGTYPEMAP @@ -1590,16 +1591,22 @@ FCIMPL4(Object*, RuntimeMethodHandle::InvokeMethod, // least one such argument we point the call worker at the floating point area of the frame (we leave // it null otherwise since the worker can perform a useful optimization if it knows no floating point // registers need to be set up). 
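The hunk below replaces the old "negative offset means floating point register" test with TransitionBlock::HasFloatRegister: once a single struct argument can occupy both a general-purpose and an XMM register, the sign of one offset can no longer encode whether the FP area is needed. A rough sketch of the distinction, where the descriptor type and its field name are assumptions for illustration, not the real ArgLocDesc layout:

struct ArgLocDescSketch
{
    int m_cFloatReg; // assumed: count of eightbytes of this argument placed in XMM registers
};

// Sketch of the new test: when a struct-in-registers descriptor is present,
// the FP area is needed whenever any eightbyte went to an XMM register;
// otherwise fall back to the legacy negative-offset encoding.
bool HasFloatRegisterSketch(int ofs, const ArgLocDescSketch* pLoc)
{
    if (pLoc != nullptr)
        return pLoc->m_cFloatReg > 0;
    return ofs < 0;
}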
- if ((ofs < 0) && (callDescrData.pFloatArgumentRegisters == NULL)) + + if (TransitionBlock::HasFloatRegister(ofs, argit.GetArgLocDescForStructInRegs()) && + (callDescrData.pFloatArgumentRegisters == NULL)) + { callDescrData.pFloatArgumentRegisters = (FloatArgumentRegisters*) (pTransitionBlock + - TransitionBlock::GetOffsetOfFloatArgumentRegisters()); + TransitionBlock::GetOffsetOfFloatArgumentRegisters()); + } #endif UINT structSize = argit.GetArgSize(); bool needsStackCopy = false; - PVOID pArgDst = pTransitionBlock + ofs; + // A boxed Nullable<T> is represented as boxed T. So to pass a Nullable<T> by reference, + // we have to create a Nullable<T> on stack, copy the T into it, then pass it to the callee and + // after returning from the call, copy the T out of the Nullable<T> back to the boxed T. TypeHandle nullableType = NullableTypeOfByref(th); if (!nullableType.IsNull()) { th = nullableType; @@ -1607,17 +1614,21 @@ FCIMPL4(Object*, RuntimeMethodHandle::InvokeMethod, needsStackCopy = true; } #ifdef ENREGISTERED_PARAMTYPE_MAXSIZE - else - if (argit.IsArgPassedByRef()) { + else if (argit.IsArgPassedByRef()) + { needsStackCopy = true; } #endif + ArgDestination argDest(pTransitionBlock, ofs, argit.GetArgLocDescForStructInRegs()); + if(needsStackCopy) { MethodTable * pMT = th.GetMethodTable(); _ASSERTE(pMT && pMT->IsValueType()); + PVOID pArgDst = argDest.GetDestinationAddress(); + PVOID pStackCopy = _alloca(structSize); *(PVOID *)pArgDst = pStackCopy; pArgDst = pStackCopy; @@ -1632,9 +1643,12 @@ FCIMPL4(Object*, RuntimeMethodHandle::InvokeMethod, { pValueClasses = new (_alloca(sizeof(ValueClassInfo))) ValueClassInfo(pStackCopy, pMT, pValueClasses); } + + // We need a new ArgDestination that points to the stack copy + argDest = ArgDestination(pStackCopy, 0, NULL); } - InvokeUtil::CopyArg(th, &(gc.args->m_Array[i]), pArgDst); + InvokeUtil::CopyArg(th, &(gc.args->m_Array[i]), &argDest); } ENDFORBIDGC(); diff --git a/src/vm/siginfo.cpp b/src/vm/siginfo.cpp index 25fe157784..ec023e9d0b 100644 --- a/src/vm/siginfo.cpp +++ b/src/vm/siginfo.cpp @@ -25,6 +25,7 @@ #include "sigbuilder.h" #include "../md/compiler/custattr.h" #include <corhlprpriv.h> +#include "argdestination.h" /*******************************************************************/ const CorTypeInfo::CorTypeInfoEntry CorTypeInfo::info[ELEMENT_TYPE_MAX] = @@ -4976,11 +4977,28 @@ void ReportPointersFromValueType(promote_func *fn, ScanContext *sc, PTR_MethodTa } while (cur >= last); } +void ReportPointersFromValueTypeArg(promote_func *fn, ScanContext *sc, PTR_MethodTable pMT, ArgDestination *pSrc) +{ + WRAPPER_NO_CONTRACT; + + if (!pMT->ContainsPointers()) + return; +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + if (pSrc->IsStructPassedInRegs()) + { + pSrc->ReportPointersFromStructInRegisters(fn, sc, pMT->GetNumInstanceFieldBytes()); + return; + } +#endif // UNIX_AMD64_ABI && FEATURE_UNIX_AMD64_STRUCT_PASSING + + ReportPointersFromValueType(fn, sc, pMT, pSrc->GetDestinationAddress()); +} + //------------------------------------------------------------------ // Perform type-specific GC promotion on the value (based upon the // last type retrieved by NextArg()). 
//------------------------------------------------------------------ -VOID MetaSig::GcScanRoots(PTR_VOID pValue, +VOID MetaSig::GcScanRoots(ArgDestination *pValue, promote_func *fn, ScanContext* sc, promote_carefully_func *fnc) @@ -4997,7 +5015,7 @@ VOID MetaSig::GcScanRoots(PTR_VOID pValue, CONTRACTL_END - PTR_PTR_Object pArgPtr = (PTR_PTR_Object)pValue; + PTR_PTR_Object pArgPtr = (PTR_PTR_Object)pValue->GetDestinationAddress(); if (fnc == NULL) fnc = &PromoteCarefully; @@ -5083,7 +5101,7 @@ VOID MetaSig::GcScanRoots(PTR_VOID pValue, } #endif // ENREGISTERED_PARAMTYPE_MAXSIZE - ReportPointersFromValueType(fn, sc, pMT, pArgPtr); + ReportPointersFromValueTypeArg(fn, sc, pMT, pValue); } break; diff --git a/src/vm/siginfo.hpp b/src/vm/siginfo.hpp index 06d3b66a24..586802b1b1 100644 --- a/src/vm/siginfo.hpp +++ b/src/vm/siginfo.hpp @@ -50,6 +50,7 @@ unsigned GetSizeForCorElementType(CorElementType etyp); const ElementTypeInfo* GetElementTypeInfo(CorElementType etyp); class SigBuilder; +class ArgDestination; typedef const struct HardCodedMetaSig *LPHARDCODEDMETASIG; @@ -841,7 +842,7 @@ class MetaSig // Perform type-specific GC promotion on the value (based upon the // last type retrieved by NextArg()). //------------------------------------------------------------------ - VOID GcScanRoots(PTR_VOID pValue, promote_func *fn, + VOID GcScanRoots(ArgDestination *pValue, promote_func *fn, ScanContext* sc, promote_carefully_func *fnc = NULL); //------------------------------------------------------------------ @@ -888,7 +889,7 @@ class MetaSig BOOL IsReturnTypeVoid() const; - enum RETURNTYPE {RETOBJ, RETBYREF, RETNONOBJ}; + enum RETURNTYPE {RETOBJ, RETBYREF, RETNONOBJ, RETVALUETYPE}; CorElementType GetReturnTypeNormalized(TypeHandle * pthValueType = NULL) const; diff --git a/src/vm/stackbuildersink.cpp b/src/vm/stackbuildersink.cpp index bcd8d62f50..5d6aa7bb15 100644 --- a/src/vm/stackbuildersink.cpp +++ b/src/vm/stackbuildersink.cpp @@ -404,13 +404,16 @@ void CallDescrWithObjectArray(OBJECTREF& pServer, #endif #ifdef CALLDESCR_FPARGREGS - // Under CALLDESCR_FPARGREGS -ve offsets indicate arguments in floating point registers. If we have at + // Under CALLDESCR_FPARGREGS we can have arguments in floating point registers. If we have at // least one such argument we point the call worker at the floating point area of the frame (we leave // it null otherwise since the worker can perform a useful optimization if it knows no floating point // registers need to be set up). 
- if (TransitionBlock::IsFloatArgumentRegisterOffset(ofs) && (pFloatArgumentRegisters == NULL)) + if (TransitionBlock::HasFloatRegister(ofs, argit.GetArgLocDescForStructInRegs()) && + (pFloatArgumentRegisters == NULL)) + { pFloatArgumentRegisters = (FloatArgumentRegisters*)(pTransitionBlock + TransitionBlock::GetOffsetOfFloatArgumentRegisters()); + } #endif if (argit.GetArgType() == ELEMENT_TYPE_BYREF) diff --git a/src/vm/threads.cpp b/src/vm/threads.cpp index 065c396929..5e4c05f514 100644 --- a/src/vm/threads.cpp +++ b/src/vm/threads.cpp @@ -2242,6 +2242,9 @@ Thread::Thread() #endif m_pAllLoggedTypes = NULL; +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + m_pHijackReturnTypeClass = NULL; +#endif } diff --git a/src/vm/threads.h b/src/vm/threads.h index 0ab550f741..da94c0e2ce 100644 --- a/src/vm/threads.h +++ b/src/vm/threads.h @@ -689,6 +689,9 @@ void InitThreadManager(); EXTERN_C void __stdcall OnHijackObjectTripThread(); // hijacked JIT code is returning an objectref EXTERN_C void __stdcall OnHijackInteriorPointerTripThread(); // hijacked JIT code is returning a byref EXTERN_C void __stdcall OnHijackScalarTripThread(); // hijacked JIT code is returning a non-objectref, non-FP +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING +EXTERN_C void __stdcall OnHijackStructInRegsTripThread(); // hijacked JIT code is returning a struct in registers +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING #ifdef _TARGET_X86_ EXTERN_C void __stdcall OnHijackFloatingPointTripThread(); // hijacked JIT code is returning an FP value @@ -1017,6 +1020,9 @@ typedef DWORD (*AppropriateWaitFunc) (void *args, DWORD timeout, DWORD option); EXTERN_C void STDCALL OnHijackObjectWorker(HijackArgs * pArgs); EXTERN_C void STDCALL OnHijackInteriorPointerWorker(HijackArgs * pArgs); EXTERN_C void STDCALL OnHijackScalarWorker(HijackArgs * pArgs); +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING +EXTERN_C void STDCALL OnHijackStructInRegsWorker(HijackArgs * pArgs); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING #endif // FEATURE_HIJACK // This is the code we pass around for Thread.Interrupt, mainly for assertions @@ -1067,7 +1073,9 @@ class Thread: public IUnknown friend void STDCALL OnHijackObjectWorker(HijackArgs *pArgs); friend void STDCALL OnHijackInteriorPointerWorker(HijackArgs *pArgs); friend void STDCALL OnHijackScalarWorker(HijackArgs *pArgs); - +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + friend void STDCALL OnHijackStructInRegsWorker(HijackArgs *pArgs); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING #ifdef PLATFORM_UNIX friend void PALAPI HandleGCSuspensionForInterruptedThread(CONTEXT *interruptedContext); #endif // PLATFORM_UNIX @@ -5553,6 +5561,24 @@ public: _ASSERTE(pAllLoggedTypes != NULL ? 
m_pAllLoggedTypes == NULL : TRUE);
 m_pAllLoggedTypes = pAllLoggedTypes;
 }
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+private:
+ EEClass* m_pHijackReturnTypeClass;
+public:
+ EEClass* GetHijackReturnTypeClass()
+ {
+ LIMITED_METHOD_CONTRACT;
+
+ return m_pHijackReturnTypeClass;
+ }
+
+ void SetHijackReturnTypeClass(EEClass* pClass)
+ {
+ LIMITED_METHOD_CONTRACT;
+
+ m_pHijackReturnTypeClass = pClass;
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
 };
 
 // End of class Thread
 
diff --git a/src/vm/threadsuspend.cpp b/src/vm/threadsuspend.cpp
index 10ea699faa..5d414192c4 100644
--- a/src/vm/threadsuspend.cpp
+++ b/src/vm/threadsuspend.cpp
@@ -7260,7 +7260,7 @@ void STDCALL OnHijackInteriorPointerWorker(HijackArgs * pArgs)
 GC_ON_TRANSITIONS (GCOnTransition);
 }
 #endif
- pArgs->ReturnValue = (size_t)ptr;
+ *(size_t*)&pArgs->ReturnValue = (size_t)ptr;
 }
 GCPROTECT_END(); // trashes or here!
@@ -7327,6 +7327,90 @@ void STDCALL OnHijackScalarWorker(HijackArgs * pArgs)
 #endif
 }
 
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+// A hijacked method is returning a struct in registers to its caller.
+// The struct can possibly contain object references that we have to
+// protect.
+void STDCALL OnHijackStructInRegsWorker(HijackArgs * pArgs)
+{
+ CONTRACTL {
+ THROWS;
+ GC_TRIGGERS;
+ SO_TOLERANT;
+ } CONTRACTL_END;
+
+#ifdef HIJACK_NONINTERRUPTIBLE_THREADS
+ Thread *thread = GetThread();
+
+ EEClass* eeClass = thread->GetHijackReturnTypeClass();
+
+ OBJECTREF oref[CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS];
+ int orefCount = 0;
+ for (int i = 0; i < eeClass->GetNumberEightBytes(); i++)
+ {
+ if (eeClass->GetEightByteClassification(i) == SystemVClassificationTypeIntegerReference)
+ {
+ oref[orefCount++] = ObjectToOBJECTREF(*(Object **) &pArgs->ReturnValue[i]);
+ }
+ }
+
+#ifdef FEATURE_STACK_PROBE
+ if (GetEEPolicy()->GetActionOnFailure(FAIL_StackOverflow) == eRudeUnloadAppDomain)
+ {
+ RetailStackProbe(ADJUST_PROBE(DEFAULT_ENTRY_PROBE_AMOUNT), thread);
+ }
+#endif
+
+ CONTRACT_VIOLATION(SOToleranceViolation);
+
+ thread->ResetThreadState(Thread::TS_Hijacked);
+
+ // Fix up our caller's stack, so it can resume from the hijack correctly
+ pArgs->ReturnAddress = (size_t)thread->m_pvHJRetAddr;
+
+ // Build a frame so that stack crawling can proceed from here back to where
+ // we will resume execution.
+ FrameWithCookie<HijackFrame> frame((void *)pArgs->ReturnAddress, thread, pArgs);
+
+ GCPROTECT_ARRAY_BEGIN(oref[0], orefCount)
+ {
+#ifdef _DEBUG
+ BOOL GCOnTransition = FALSE;
+ if (g_pConfig->FastGCStressLevel()) {
+ GCOnTransition = GC_ON_TRANSITIONS (FALSE);
+ }
+#endif
+
+#ifdef TIME_SUSPEND
+ g_SuspendStatistics.cntHijackTrap++;
+#endif
+
+ CommonTripThread();
+#ifdef _DEBUG
+ if (g_pConfig->FastGCStressLevel()) {
+ GC_ON_TRANSITIONS (GCOnTransition);
+ }
+#endif
+
+ // Update the references in the returned struct
+ orefCount = 0;
+ for (int i = 0; i < eeClass->GetNumberEightBytes(); i++)
+ {
+ if (eeClass->GetEightByteClassification(i) == SystemVClassificationTypeIntegerReference)
+ {
+ *((OBJECTREF *) &pArgs->ReturnValue[i]) = oref[orefCount++];
+ }
+ }
+ }
+ GCPROTECT_END();
+
+ frame.Pop();
+#else
+ PORTABILITY_ASSERT("OnHijackStructInRegsWorker not implemented on this platform.");
+#endif
+}
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
 #ifndef PLATFORM_UNIX
 // Get the ExecutionState for the specified SwitchIn thread.
Note that this is @@ -7806,11 +7890,19 @@ BOOL Thread::HandledJITCase(BOOL ForTaskSwitchIn) else #endif // _TARGET_X86_ { - MetaSig::RETURNTYPE type = esb.m_pFD->ReturnsObject(); + MethodTable* pMT = NULL; + MetaSig::RETURNTYPE type = esb.m_pFD->ReturnsObject(INDEBUG_COMMA(false) &pMT); if (type == MetaSig::RETOBJ) pvHijackAddr = OnHijackObjectTripThread; else if (type == MetaSig::RETBYREF) pvHijackAddr = OnHijackInteriorPointerTripThread; +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + else if (type == MetaSig::RETVALUETYPE) + { + pThread->SetHijackReturnTypeClass(pMT->GetClass()); + pvHijackAddr = OnHijackStructInRegsTripThread; + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING } } @@ -8354,7 +8446,8 @@ void PALAPI HandleGCSuspensionForInterruptedThread(CONTEXT *interruptedContext) // Hijack the return address to point to the appropriate routine based on the method's return type. void *pvHijackAddr = OnHijackScalarTripThread; MethodDesc *pMethodDesc = codeInfo.GetMethodDesc(); - MetaSig::RETURNTYPE type = pMethodDesc->ReturnsObject(); + MethodTable* pMT = NULL; + MetaSig::RETURNTYPE type = pMethodDesc->ReturnsObject(INDEBUG_COMMA(false) &pMT); if (type == MetaSig::RETOBJ) { pvHijackAddr = OnHijackObjectTripThread; @@ -8363,6 +8456,13 @@ void PALAPI HandleGCSuspensionForInterruptedThread(CONTEXT *interruptedContext) { pvHijackAddr = OnHijackInteriorPointerTripThread; } +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + else if (type == MetaSig::RETVALUETYPE) + { + pThread->SetHijackReturnTypeClass(pMT->GetClass()); + pvHijackAddr = OnHijackStructInRegsTripThread; + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING pThread->HijackThread(pvHijackAddr, &executionState); } diff --git a/tests/src/JIT/SIMD/project.lock.json b/tests/src/JIT/SIMD/project.lock.json index 5a0680001c..6cf037e202 100644 --- a/tests/src/JIT/SIMD/project.lock.json +++ b/tests/src/JIT/SIMD/project.lock.json @@ -242,7 +242,10 @@ "ref/MonoTouch10/_._", "ref/net46/System.Console.dll", "ref/xamarinios10/_._", - "ref/xamarinmac20/_._" + "ref/xamarinmac20/_._", + "ru/System.Console.xml", + "zh-hans/System.Console.xml", + "zh-hant/System.Console.xml" ] }, "System.Diagnostics.Debug/4.0.10": { |
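Taken together, the classification pieces in this change behave like the following standalone sketch (illustrative types only, and fields are assumed not to straddle an eightbyte): a 16-byte struct { double d; int a; int b; } gets eightbyte 0 classified SSE and eightbyte 1 classified Integer, so it would be passed in one XMM register and one general-purpose register.

#include <cstdio>

enum Class { NoClass, Integer, SSE };
struct Field { unsigned offset, size; Class cls; };

int main()
{
    // struct { double d; int a; int b; } -- 16 bytes, which is exactly the CLR
    // limit for register passing (CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS).
    const Field fields[] = { { 0, 8, SSE }, { 8, 4, Integer }, { 12, 4, Integer } };
    Class eightBytes[2] = { NoClass, NoClass };

    for (const Field& f : fields)
    {
        Class& eb = eightBytes[f.offset / 8];   // the eightbyte holding this field
        if (eb == NoClass)
            eb = f.cls;                         // first field of the eightbyte
        else if (eb == Integer || f.cls == Integer)
            eb = Integer;                       // integer dominates SSE
    }

    for (int i = 0; i < 2; i++)
        printf("eightbyte %d -> %s\n", i, eightBytes[i] == SSE ? "SSE (XMM)" : "INTEGER (GPR)");
    return 0;
}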