diff options
-rw-r--r-- | src/pal/inc/unixasmmacrosarm64.inc | 34 | ||||
-rw-r--r-- | src/vm/argdestination.h | 21 | ||||
-rw-r--r-- | src/vm/arm64/CallDescrWorkerARM64.asm | 8 | ||||
-rw-r--r-- | src/vm/arm64/asmconstants.h | 3 | ||||
-rw-r--r-- | src/vm/arm64/asmhelpers.S | 24 | ||||
-rw-r--r-- | src/vm/arm64/asmhelpers.asm | 29 | ||||
-rw-r--r-- | src/vm/arm64/asmmacros.h | 18 | ||||
-rw-r--r-- | src/vm/arm64/calldescrworkerarm64.S | 8 | ||||
-rw-r--r-- | src/vm/arm64/cgencpu.h | 12 | ||||
-rw-r--r-- | src/vm/callingconvention.h | 7 |
10 files changed, 83 insertions, 81 deletions
diff --git a/src/pal/inc/unixasmmacrosarm64.inc b/src/pal/inc/unixasmmacrosarm64.inc index f99efbe45d..6db1f242f3 100644 --- a/src/pal/inc/unixasmmacrosarm64.inc +++ b/src/pal/inc/unixasmmacrosarm64.inc @@ -137,14 +137,14 @@ C_FUNC(\Name\()_End): // ArgumentRegisters::x2 // ArgumentRegisters::x1 // ArgumentRegisters::x0 -// FloatRegisters::d7 -// FloatRegisters::d6 -// FloatRegisters::d5 -// FloatRegisters::d4 -// FloatRegisters::d3 -// FloatRegisters::d2 -// FloatRegisters::d1 -// FloatRegisters::d0 +// FloatRegisters::q7 +// FloatRegisters::q6 +// FloatRegisters::q5 +// FloatRegisters::q4 +// FloatRegisters::q3 +// FloatRegisters::q2 +// FloatRegisters::q1 +// FloatRegisters::q0 .macro PROLOG_WITH_TRANSITION_BLOCK extraLocals = 0, SaveFPArgs = 1 __PWTB_FloatArgumentRegisters = \extraLocals @@ -200,13 +200,13 @@ C_FUNC(\Name\()_End): .endm -// Reserve 64 bytes of memory before calling SAVE_FLOAT_ARGUMENT_REGISTERS +// Reserve 128 bytes of memory before calling SAVE_FLOAT_ARGUMENT_REGISTERS .macro SAVE_FLOAT_ARGUMENT_REGISTERS reg, ofs - stp d0, d1, [\reg, #(\ofs)] - stp d2, d3, [\reg, #(\ofs + 16)] - stp d4, d5, [\reg, #(\ofs + 32)] - stp d6, d7, [\reg, #(\ofs + 48)] + stp q0, q1, [\reg, #(\ofs)] + stp q2, q3, [\reg, #(\ofs + 32)] + stp q4, q5, [\reg, #(\ofs + 64)] + stp q6, q7, [\reg, #(\ofs + 96)] .endm @@ -222,10 +222,10 @@ C_FUNC(\Name\()_End): .macro RESTORE_FLOAT_ARGUMENT_REGISTERS reg, ofs - ldp d0, d1, [\reg, #(\ofs)] - ldp d2, d3, [\reg, #(\ofs + 16)] - ldp d4, d5, [\reg, #(\ofs + 32)] - ldp d6, d7, [\reg, #(\ofs + 48)] + ldp q0, q1, [\reg, #(\ofs)] + ldp q2, q3, [\reg, #(\ofs + 32)] + ldp q4, q5, [\reg, #(\ofs + 64)] + ldp q6, q7, [\reg, #(\ofs + 96)] .endm diff --git a/src/vm/argdestination.h b/src/vm/argdestination.h index 04968a1aff..439761bec2 100644 --- a/src/vm/argdestination.h +++ b/src/vm/argdestination.h @@ -65,20 +65,17 @@ public: int floatRegCount = m_argLocDescForStructInRegs->m_cFloatReg; bool typeFloat = m_argLocDescForStructInRegs->m_isSinglePrecision; - void* dest = this->GetDestinationAddress(); + UINT64* dest = (UINT64*) this->GetDestinationAddress(); - if (typeFloat) + for (int i = 0; i < floatRegCount; ++i) { - for (int i = 0; i < floatRegCount; ++i) - { - // Copy 4 bytes on 8 bytes alignment - *((UINT64*)dest + i) = *((UINT32*)src + i); - } - } - else - { - // We can just do a memcpy. - memcpyNoGCRefs(dest, src, fieldBytes); + // Copy 4 or 8 bytes from src. + UINT64 val = typeFloat ? *((UINT32*)src + i) : *((UINT64*)src + i); + // Always store 8 bytes + *(dest++) = val; + // For now, always zero the next 8 bytes. + // (When HVAs are supported we will get the next 8 bytes from src.) + *(dest++) = 0; } } diff --git a/src/vm/arm64/CallDescrWorkerARM64.asm b/src/vm/arm64/CallDescrWorkerARM64.asm index 65c7db6f3f..fe277ceb62 100644 --- a/src/vm/arm64/CallDescrWorkerARM64.asm +++ b/src/vm/arm64/CallDescrWorkerARM64.asm @@ -56,10 +56,10 @@ Ldonestack ;; given in x9. ldr x9, [x19,#CallDescrData__pFloatArgumentRegisters] cbz x9, LNoFloatingPoint - ldp d0, d1, [x9] - ldp d2, d3, [x9, #16] - ldp d4, d5, [x9, #32] - ldp d6, d7, [x9, #48] + ldp q0, q1, [x9] + ldp q2, q3, [x9, #32] + ldp q4, q5, [x9, #64] + ldp q6, q7, [x9, #96] LNoFloatingPoint ;; Copy [pArgumentRegisters, ..., pArgumentRegisters + 56] diff --git a/src/vm/arm64/asmconstants.h b/src/vm/arm64/asmconstants.h index 7d0a9f734b..1acc1b46d7 100644 --- a/src/vm/arm64/asmconstants.h +++ b/src/vm/arm64/asmconstants.h @@ -58,7 +58,8 @@ ASMCONSTANTS_C_ASSERT(AppDomain__m_dwId == offsetof(AppDomain, m_dwId)); #define SIZEOF__ArgumentRegisters 0x40 ASMCONSTANTS_C_ASSERT(SIZEOF__ArgumentRegisters == sizeof(ArgumentRegisters)) -#define SIZEOF__FloatArgumentRegisters 0x40 +// There are 8 128-bit registers in FloatArgumentRegisters +#define SIZEOF__FloatArgumentRegisters 0x80 ASMCONSTANTS_C_ASSERT(SIZEOF__FloatArgumentRegisters == sizeof(FloatArgumentRegisters)) #define CallDescrData__pSrc 0x00 diff --git a/src/vm/arm64/asmhelpers.S b/src/vm/arm64/asmhelpers.S index c0baa92456..ed48d66e58 100644 --- a/src/vm/arm64/asmhelpers.S +++ b/src/vm/arm64/asmhelpers.S @@ -121,18 +121,18 @@ LEAF_END HelperMethodFrameRestoreState, _TEXT // The call in ndirect import precode points to this function. NESTED_ENTRY NDirectImportThunk, _TEXT, NoHandler - PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -160 + PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -224 SAVE_ARGUMENT_REGISTERS sp, 16 - SAVE_FLOAT_ARGUMENT_REGISTERS sp, 88 + SAVE_FLOAT_ARGUMENT_REGISTERS sp, 96 mov x0, x12 bl NDirectImportWorker mov x12, x0 // pop the stack and restore original register state - RESTORE_FLOAT_ARGUMENT_REGISTERS sp, 88 + RESTORE_FLOAT_ARGUMENT_REGISTERS sp, 96 RESTORE_ARGUMENT_REGISTERS sp, 16 - EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, 160 + EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, 224 // If we got back from NDirectImportWorker, the MD has been successfully // linked. Proceed to execute the original DLL call. @@ -493,9 +493,9 @@ WRITE_BARRIER_END JIT_WriteBarrier NESTED_ENTRY VirtualMethodFixupStub, _TEXT, NoHandler // Save arguments and return address - PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -160 + PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -224 SAVE_ARGUMENT_REGISTERS sp, 16 - SAVE_FLOAT_ARGUMENT_REGISTERS sp, 88 + SAVE_FLOAT_ARGUMENT_REGISTERS sp, 96 // Refer to ZapImportVirtualThunk::Save // for details on this. @@ -512,8 +512,8 @@ NESTED_ENTRY VirtualMethodFixupStub, _TEXT, NoHandler // pop the stack and restore original register state RESTORE_ARGUMENT_REGISTERS sp, 16 - RESTORE_FLOAT_ARGUMENT_REGISTERS sp, 88 - EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, 160 + RESTORE_FLOAT_ARGUMENT_REGISTERS sp, 96 + EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, 224 PATCH_LABEL VirtualMethodFixupPatchLabel @@ -723,9 +723,9 @@ COMToCLRDispatchHelper_RegSetup NESTED_ENTRY TheUMEntryPrestub, _TEXT, UnhandledExceptionHandlerUnix // Save arguments and return address - PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -160 + PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -224 SAVE_ARGUMENT_REGISTERS sp, 16 - SAVE_FLOAT_ARGUMENT_REGISTERS sp, 88 + SAVE_FLOAT_ARGUMENT_REGISTERS sp, 96 mov x0, x12 bl C_FUNC(TheUMEntryPrestubWorker) @@ -735,8 +735,8 @@ NESTED_ENTRY TheUMEntryPrestub, _TEXT, UnhandledExceptionHandlerUnix // pop the stack and restore original register state RESTORE_ARGUMENT_REGISTERS sp, 16 - RESTORE_FLOAT_ARGUMENT_REGISTERS sp, 88 - EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, 160 + RESTORE_FLOAT_ARGUMENT_REGISTERS sp, 96 + EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, 224 // and tailcall to the actual method EPILOG_BRANCH_REG x12 diff --git a/src/vm/arm64/asmhelpers.asm b/src/vm/arm64/asmhelpers.asm index 37efbaeccd..7d8aad3e48 100644 --- a/src/vm/arm64/asmhelpers.asm +++ b/src/vm/arm64/asmhelpers.asm @@ -184,18 +184,18 @@ Done ; The call in ndirect import precode points to this function. NESTED_ENTRY NDirectImportThunk - PROLOG_SAVE_REG_PAIR fp, lr, #-160! + PROLOG_SAVE_REG_PAIR fp, lr, #-224! SAVE_ARGUMENT_REGISTERS sp, 16 - SAVE_FLOAT_ARGUMENT_REGISTERS sp, 88 + SAVE_FLOAT_ARGUMENT_REGISTERS sp, 96 mov x0, x12 bl NDirectImportWorker mov x12, x0 ; pop the stack and restore original register state - RESTORE_FLOAT_ARGUMENT_REGISTERS sp, 88 + RESTORE_FLOAT_ARGUMENT_REGISTERS sp, 96 RESTORE_ARGUMENT_REGISTERS sp, 16 - EPILOG_RESTORE_REG_PAIR fp, lr, #160! + EPILOG_RESTORE_REG_PAIR fp, lr, #224! ; If we got back from NDirectImportWorker, the MD has been successfully ; linked. Proceed to execute the original DLL call. @@ -538,9 +538,9 @@ Exit NESTED_ENTRY VirtualMethodFixupStub ; Save arguments and return address - PROLOG_SAVE_REG_PAIR fp, lr, #-160! + PROLOG_SAVE_REG_PAIR fp, lr, #-224! SAVE_ARGUMENT_REGISTERS sp, 16 - SAVE_FLOAT_ARGUMENT_REGISTERS sp, 88 + SAVE_FLOAT_ARGUMENT_REGISTERS sp, 96 ; Refer to ZapImportVirtualThunk::Save ; for details on this. @@ -557,8 +557,8 @@ Exit ; pop the stack and restore original register state RESTORE_ARGUMENT_REGISTERS sp, 16 - RESTORE_FLOAT_ARGUMENT_REGISTERS sp, 88 - EPILOG_RESTORE_REG_PAIR fp, lr, #160! + RESTORE_FLOAT_ARGUMENT_REGISTERS sp, 96 + EPILOG_RESTORE_REG_PAIR fp, lr, #224! PATCH_LABEL VirtualMethodFixupPatchLabel @@ -862,7 +862,10 @@ COMToCLRDispatchHelper_StackLoop COMToCLRDispatchHelper_RegSetup - RESTORE_FLOAT_ARGUMENT_REGISTERS x1, -1 * GenericComCallStub_FrameOffset + ; We need an aligned offset for restoring float args, so do the subtraction into + ; a scratch register + sub x5, x1, GenericComCallStub_FrameOffset + RESTORE_FLOAT_ARGUMENT_REGISTERS x5, 0 mov lr, x2 mov x12, x3 @@ -892,9 +895,9 @@ COMToCLRDispatchHelper_RegSetup NESTED_ENTRY TheUMEntryPrestub,,UMEntryPrestubUnwindFrameChainHandler ; Save arguments and return address - PROLOG_SAVE_REG_PAIR fp, lr, #-160! + PROLOG_SAVE_REG_PAIR fp, lr, #-224! SAVE_ARGUMENT_REGISTERS sp, 16 - SAVE_FLOAT_ARGUMENT_REGISTERS sp, 88 + SAVE_FLOAT_ARGUMENT_REGISTERS sp, 96 mov x0, x12 bl TheUMEntryPrestubWorker @@ -904,8 +907,8 @@ COMToCLRDispatchHelper_RegSetup ; pop the stack and restore original register state RESTORE_ARGUMENT_REGISTERS sp, 16 - RESTORE_FLOAT_ARGUMENT_REGISTERS sp, 88 - EPILOG_RESTORE_REG_PAIR fp, lr, #160! + RESTORE_FLOAT_ARGUMENT_REGISTERS sp, 96 + EPILOG_RESTORE_REG_PAIR fp, lr, #224! ; and tailcall to the actual method EPILOG_BRANCH_REG x12 diff --git a/src/vm/arm64/asmmacros.h b/src/vm/arm64/asmmacros.h index 291fcf8e70..5c6195b405 100644 --- a/src/vm/arm64/asmmacros.h +++ b/src/vm/arm64/asmmacros.h @@ -183,7 +183,7 @@ __PWTB_SAVE_ARGUMENT_REGISTERS_OFFSET SETA 0 MEND -; Reserve 64 bytes of memory before calling SAVE_FLOAT_ARGUMENT_REGISTERS +; Reserve 128 bytes of memory before calling SAVE_FLOAT_ARGUMENT_REGISTERS MACRO SAVE_FLOAT_ARGUMENT_REGISTERS $reg, $offset @@ -195,10 +195,10 @@ __PWTB_SAVE_FLOAT_ARGUMENT_REGISTERS_OFFSET SETA $offset __PWTB_SAVE_FLOAT_ARGUMENT_REGISTERS_OFFSET SETA 0 ENDIF - stp d0, d1, [$reg, #(__PWTB_SAVE_FLOAT_ARGUMENT_REGISTERS_OFFSET)] - stp d2, d3, [$reg, #(__PWTB_SAVE_FLOAT_ARGUMENT_REGISTERS_OFFSET + 16)] - stp d4, d5, [$reg, #(__PWTB_SAVE_FLOAT_ARGUMENT_REGISTERS_OFFSET + 32)] - stp d6, d7, [$reg, #(__PWTB_SAVE_FLOAT_ARGUMENT_REGISTERS_OFFSET + 48)] + stp q0, q1, [$reg, #(__PWTB_SAVE_FLOAT_ARGUMENT_REGISTERS_OFFSET)] + stp q2, q3, [$reg, #(__PWTB_SAVE_FLOAT_ARGUMENT_REGISTERS_OFFSET + 32)] + stp q4, q5, [$reg, #(__PWTB_SAVE_FLOAT_ARGUMENT_REGISTERS_OFFSET + 64)] + stp q6, q7, [$reg, #(__PWTB_SAVE_FLOAT_ARGUMENT_REGISTERS_OFFSET + 96)] MEND MACRO @@ -231,10 +231,10 @@ __PWTB_RESTORE_FLOAT_ARGUMENT_REGISTERS_OFFSET SETA $offset __PWTB_RESTORE_FLOAT_ARGUMENT_REGISTERS_OFFSET SETA 0 ENDIF - ldp d0, d1, [$reg, #(__PWTB_RESTORE_FLOAT_ARGUMENT_REGISTERS_OFFSET)] - ldp d2, d3, [$reg, #(__PWTB_RESTORE_FLOAT_ARGUMENT_REGISTERS_OFFSET + 16)] - ldp d4, d5, [$reg, #(__PWTB_RESTORE_FLOAT_ARGUMENT_REGISTERS_OFFSET + 32)] - ldp d6, d7, [$reg, #(__PWTB_RESTORE_FLOAT_ARGUMENT_REGISTERS_OFFSET + 48)] + ldp q0, q1, [$reg, #(__PWTB_RESTORE_FLOAT_ARGUMENT_REGISTERS_OFFSET)] + ldp q2, q3, [$reg, #(__PWTB_RESTORE_FLOAT_ARGUMENT_REGISTERS_OFFSET + 32)] + ldp q4, q5, [$reg, #(__PWTB_RESTORE_FLOAT_ARGUMENT_REGISTERS_OFFSET + 64)] + ldp q6, q7, [$reg, #(__PWTB_RESTORE_FLOAT_ARGUMENT_REGISTERS_OFFSET + 96)] MEND ; ------------------------------------------------------------------ diff --git a/src/vm/arm64/calldescrworkerarm64.S b/src/vm/arm64/calldescrworkerarm64.S index c3ce06aa72..f987d402dd 100644 --- a/src/vm/arm64/calldescrworkerarm64.S +++ b/src/vm/arm64/calldescrworkerarm64.S @@ -48,10 +48,10 @@ LOCAL_LABEL(donestack): // given in x8. ldr x9, [x19,#CallDescrData__pFloatArgumentRegisters] cbz x9, LOCAL_LABEL(NoFloatingPoint) - ldp d0, d1, [x9] - ldp d2, d3, [x9, #16] - ldp d4, d5, [x9, #32] - ldp d6, d7, [x9, #48] + ldp q0, q1, [x9] + ldp q2, q3, [x9, #32] + ldp q4, q5, [x9, #64] + ldp q6, q7, [x9, #96] LOCAL_LABEL(NoFloatingPoint): // Copy [pArgumentRegisters, ..., pArgumentRegisters + 56] diff --git a/src/vm/arm64/cgencpu.h b/src/vm/arm64/cgencpu.h index a297a84e17..fd1fbafe96 100644 --- a/src/vm/arm64/cgencpu.h +++ b/src/vm/arm64/cgencpu.h @@ -100,7 +100,7 @@ static_assert(((STACK_ELEM_SIZE & (STACK_ELEM_SIZE-1)) == 0), "STACK_ELEM_SIZE m //********************************************************************** //-------------------------------------------------------------------- -// This represents the callee saved (non-volatile) registers saved as +// This represents the callee saved (non-volatile) integer registers saved as // of a FramedMethodFrame. //-------------------------------------------------------------------- typedef DPTR(struct CalleeSavedRegisters) PTR_CalleeSavedRegisters; @@ -111,7 +111,7 @@ struct CalleeSavedRegisters { }; //-------------------------------------------------------------------- -// This represents the arguments that are stored in volatile registers. +// This represents the arguments that are stored in volatile integer registers. // This should not overlap the CalleeSavedRegisters since those are already // saved separately and it would be wasteful to save the same register twice. // If we do use a non-volatile register as an argument, then the ArgIterator @@ -138,10 +138,10 @@ typedef DPTR(struct FloatArgumentRegisters) PTR_FloatArgumentRegisters; struct FloatArgumentRegisters { // armV8 supports 32 floating point registers. Each register is 128bits long. // It can be accessed as 128-bit value or 64-bit value(d0-d31) or as 32-bit value (s0-s31) - // or as 16-bit value or as 8-bit values. C# only has two builtin floating datatypes float(32-bit) and - // double(64-bit). It does not have a quad-precision floating point.So therefore it does not make sense to - // store full 128-bit values in Frame when the upper 64 bit will not contain any values. - double d[8]; // d0-d7 + // or as 16-bit value or as 8-bit values. + // Although C# only has two builtin floating datatypes float(32-bit) and double(64-bit), + // HW Intrinsics support using the full 128-bit value for passing Vectors. + NEON128 q[8]; // q0-q7 }; diff --git a/src/vm/callingconvention.h b/src/vm/callingconvention.h index e70e31f3a7..eaabaa42b2 100644 --- a/src/vm/callingconvention.h +++ b/src/vm/callingconvention.h @@ -583,8 +583,8 @@ public: if (TransitionBlock::IsFloatArgumentRegisterOffset(argOffset)) { - // Dividing by 8 as size of each register in FloatArgumentRegisters is 8 bytes. - pLoc->m_idxFloatReg = (argOffset - TransitionBlock::GetOffsetOfFloatArgumentRegisters()) / 8; + // Dividing by 16 as size of each register in FloatArgumentRegisters is 16 bytes. + pLoc->m_idxFloatReg = (argOffset - TransitionBlock::GetOffsetOfFloatArgumentRegisters()) / 16; if (!m_argTypeHandle.IsNull() && m_argTypeHandle.IsHFA()) { @@ -1332,7 +1332,8 @@ int ArgIteratorTemplate<ARGITERATOR_BASE>::GetNextOffset() { if (cFPRegs + m_idxFPReg <= 8) { - int argOfs = TransitionBlock::GetOffsetOfFloatArgumentRegisters() + m_idxFPReg * 8; + // Each floating point register in the argument area is 16 bytes. + int argOfs = TransitionBlock::GetOffsetOfFloatArgumentRegisters() + m_idxFPReg * 16; m_idxFPReg += cFPRegs; return argOfs; } |