diff options
-rw-r--r-- | src/vm/comdelegate.cpp | 123 | ||||
-rw-r--r-- | src/vm/comdelegate.h | 1 | ||||
-rw-r--r-- | src/vm/i386/stublinkerx86.cpp | 75 | ||||
-rw-r--r-- | src/vm/i386/stublinkerx86.h | 3 |
4 files changed, 182 insertions, 20 deletions
diff --git a/src/vm/comdelegate.cpp b/src/vm/comdelegate.cpp index 7ee85b6136..9d92a7c6e5 100644 --- a/src/vm/comdelegate.cpp +++ b/src/vm/comdelegate.cpp @@ -237,6 +237,22 @@ int GetNormalizedArgumentSlotIndex(UINT16 offset) return index; } + +// Node of a directed graph where nodes represent registers / stack slots +// and edges represent moves of data. +struct ShuffleGraphNode +{ + static const UINT16 NoNode = 0xffff; + // Previous node (represents source of data for the register / stack of the current node) + UINT16 prev; + // Offset of the register / stack slot + UINT16 ofs; + // Set to true for nodes that are source of data for a destination node + UINT8 isSource; + // Nodes that are marked are either already processed or don't participate in the shuffling + UINT8 isMarked; +}; + #endif // UNIX_AMD64_ABI VOID GenerateShuffleArray(MethodDesc* pInvoke, MethodDesc *pTargetMeth, SArray<ShuffleEntry> * pShuffleEntryArray) @@ -458,6 +474,7 @@ VOID GenerateShuffleArray(MethodDesc* pInvoke, MethodDesc *pTargetMeth, SArray<S _ASSERTE(!iteratorDst.HasNextOfs()); } + #if defined(UNIX_AMD64_ABI) // The Unix AMD64 ABI can cause a struct to be passed on stack for the source and in registers for the destination. // That can cause some arguments that are passed on stack for the destination to be passed in registers in the source. @@ -465,39 +482,107 @@ VOID GenerateShuffleArray(MethodDesc* pInvoke, MethodDesc *pTargetMeth, SArray<S // void fn(int, int, int, int, int, struct {int, double}, double, double, double, double, double, double, double, double, double, double) // For this signature, the shuffle needs to move slots as follows (please note the "forward" movement of xmm registers): // RDI->RSI, RDX->RCX, R8->RDX, R9->R8, stack[0]->R9, xmm0->xmm1, xmm1->xmm2, ... xmm6->xmm7, xmm7->stack[0], stack[1]->xmm0, stack[2]->stack[1], stack[3]->stack[2] - // To prevent overwriting of slots before they are moved, we need to sort the move operations. 
+ // To prevent overwriting of slots before they are moved, we need to perform the shuffling in correct order + + NewArrayHolder<ShuffleGraphNode> pGraphNodes = new ShuffleGraphNode[argSlots]; + + // Initialize the graph array + for (unsigned int i = 0; i < argSlots; i++) + { + pGraphNodes[i].prev = ShuffleGraphNode::NoNode; + pGraphNodes[i].isMarked = true; + pGraphNodes[i].isSource = false; + } + + // Build the directed graph representing register and stack slot shuffling. + // The links are directed from destination to source. + // During the build also set isSource flag for nodes that are sources of data. + // The ones that don't have the isSource flag set are beginnings of non-cyclic + // segments of the graph. + for (unsigned int i = 0; i < pShuffleEntryArray->GetCount(); i++) + { + ShuffleEntry entry = (*pShuffleEntryArray)[i]; + + int srcIndex = GetNormalizedArgumentSlotIndex(entry.srcofs); + int dstIndex = GetNormalizedArgumentSlotIndex(entry.dstofs); + + // Unmark the node to indicate that it was not processed yet + pGraphNodes[srcIndex].isMarked = false; + // The node contains a register / stack slot that is a source from which we move data to a destination one + pGraphNodes[srcIndex].isSource = true; + pGraphNodes[srcIndex].ofs = entry.srcofs; + + // Unmark the node to indicate that it was not processed yet + pGraphNodes[dstIndex].isMarked = false; + // Link to the previous node in the graph (source of data for the current node) + pGraphNodes[dstIndex].prev = srcIndex; + pGraphNodes[dstIndex].ofs = entry.dstofs; + } - NewArrayHolder<bool> filledSlots = new bool[argSlots]; + // Now that we've built the graph, clear the array, we will regenerate it from the graph ensuring a proper order of shuffling + pShuffleEntryArray->Clear(); - bool reordered; - do + // Add all non-cyclic subgraphs to the target shuffle array and mark their nodes as visited + for (unsigned int startIndex = 0; startIndex < argSlots; startIndex++) { - reordered = false; + unsigned int 
index = startIndex; - for (int i = 0; i < argSlots; i++) + if (!pGraphNodes[index].isMarked && !pGraphNodes[index].isSource) { - filledSlots[i] = false; + // This node is not a source, that means it is an end of shuffle chain + // Generate shuffle array entries for all nodes in the chain in a correct + // order. + UINT16 dstOfs = ShuffleEntry::SENTINEL; + + do + { + pGraphNodes[index].isMarked = true; + if (dstOfs != ShuffleEntry::SENTINEL) + { + entry.srcofs = pGraphNodes[index].ofs; + entry.dstofs = dstOfs; + pShuffleEntryArray->Append(entry); + } + + dstOfs = pGraphNodes[index].ofs; + index = pGraphNodes[index].prev; + } + while (index != ShuffleGraphNode::NoNode); } - for (unsigned int i = 0; i < pShuffleEntryArray->GetCount(); i++) + } + + // Process all cycles in the graph + for (unsigned int startIndex = 0; startIndex < argSlots; startIndex++) + { + unsigned int index = startIndex; + + if (!pGraphNodes[index].isMarked) { - entry = (*pShuffleEntryArray)[i]; + // This node is part of a new cycle as all non-cyclic parts of the graphs were already visited - // If the slot that we are moving the argument to was filled in already, we need to move this entry in front - // of the entry that filled it in. 
- if (filledSlots[GetNormalizedArgumentSlotIndex(entry.srcofs)]) + // Move the first node register / stack slot to a helper reg + UINT16 dstOfs = ShuffleEntry::HELPERREG; + + do { - unsigned int j; - for (j = i; (*pShuffleEntryArray)[j].dstofs != entry.srcofs; j--) - (*pShuffleEntryArray)[j] = (*pShuffleEntryArray)[j - 1]; + pGraphNodes[index].isMarked = true; - (*pShuffleEntryArray)[j] = entry; - reordered = true; + entry.srcofs = pGraphNodes[index].ofs; + entry.dstofs = dstOfs; + pShuffleEntryArray->Append(entry); + + dstOfs = pGraphNodes[index].ofs; + index = pGraphNodes[index].prev; } + while (index != startIndex); - filledSlots[GetNormalizedArgumentSlotIndex(entry.dstofs)] = true; + // Move helper reg to the last node register / stack slot + entry.srcofs = ShuffleEntry::HELPERREG; + entry.dstofs = dstOfs; + pShuffleEntryArray->Append(entry); } } - while (reordered); + #endif // UNIX_AMD64_ABI entry.srcofs = ShuffleEntry::SENTINEL; diff --git a/src/vm/comdelegate.h b/src/vm/comdelegate.h index 3d5ec8e30e..330f1f8886 100644 --- a/src/vm/comdelegate.h +++ b/src/vm/comdelegate.h @@ -194,6 +194,7 @@ struct ShuffleEntry OFSMASK = 0x7fff, // Mask to get stack offset OFSREGMASK = 0x1fff, // Mask to get register index SENTINEL = 0xffff, // Indicates end of shuffle array + HELPERREG = 0xcfff, // Use a helper register as source or destination (used to handle cycles in the shuffling) }; #if defined(_TARGET_AMD64_) && !defined(UNIX_AMD64_ABI) diff --git a/src/vm/i386/stublinkerx86.cpp b/src/vm/i386/stublinkerx86.cpp index 7020e40e51..983fc3f36a 100644 --- a/src/vm/i386/stublinkerx86.cpp +++ b/src/vm/i386/stublinkerx86.cpp @@ -1739,6 +1739,49 @@ VOID StubLinkerCPU::X64EmitMovSSToMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs X64EmitMovXmmWorker(0xF3, 0x11, Xmmreg, baseReg, ofs); } +VOID StubLinkerCPU::X64EmitMovqRegXmm(X86Reg reg, X86Reg Xmmreg) +{ + STANDARD_VM_CONTRACT; + X64EmitMovqWorker(0x7e, Xmmreg, reg); +} + +VOID StubLinkerCPU::X64EmitMovqXmmReg(X86Reg Xmmreg, 
X86Reg reg) +{ + STANDARD_VM_CONTRACT; + X64EmitMovqWorker(0x6e, Xmmreg, reg); +} + +//----------------------------------------------------------------------------- +// Helper method for emitting movq between xmm and general purpose register +//----------------------------------------------------------------------------- +VOID StubLinkerCPU::X64EmitMovqWorker(BYTE opcode, X86Reg Xmmreg, X86Reg reg) +{ + BYTE codeBuffer[10]; + unsigned int nBytes = 0; + codeBuffer[nBytes++] = 0x66; + BYTE rex = REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT; + if (reg >= kR8) + { + rex |= REX_MODRM_RM_EXT; + reg = X86RegFromAMD64Reg(reg); + } + if (Xmmreg >= kXMM8) + { + rex |= REX_MODRM_REG_EXT; + Xmmreg = X86RegFromAMD64Reg(Xmmreg); + } + codeBuffer[nBytes++] = rex; + codeBuffer[nBytes++] = 0x0f; + codeBuffer[nBytes++] = opcode; + BYTE modrm = static_cast<BYTE>((Xmmreg << 3) | reg); + codeBuffer[nBytes++] = 0xC0|modrm; + + _ASSERTE(nBytes <= _countof(codeBuffer)); + + // Lastly, emit the encoded bytes + EmitBytes(codeBuffer, nBytes); +} + //--------------------------------------------------------------- // Helper method for emitting of XMM from/to memory moves //--------------------------------------------------------------- @@ -3800,7 +3843,37 @@ VOID StubLinkerCPU::EmitShuffleThunk(ShuffleEntry *pShuffleEntryArray) #ifdef UNIX_AMD64_ABI for (ShuffleEntry* pEntry = pShuffleEntryArray; pEntry->srcofs != ShuffleEntry::SENTINEL; pEntry++) { - if (pEntry->srcofs & ShuffleEntry::REGMASK) + if (pEntry->srcofs == ShuffleEntry::HELPERREG) + { + if (pEntry->dstofs & ShuffleEntry::REGMASK) + { + // movq dstReg, xmm8 + int dstRegIndex = pEntry->dstofs & ShuffleEntry::OFSREGMASK; + X64EmitMovqRegXmm(c_argRegs[dstRegIndex], (X86Reg)kXMM8); + } + else + { + // movsd [rax + dst], xmm8 + int dstOffset = (pEntry->dstofs + 1) * sizeof(void*); + X64EmitMovSDToMem((X86Reg)kXMM8, SCRATCH_REGISTER_X86REG, dstOffset); + } + } + else if (pEntry->dstofs == ShuffleEntry::HELPERREG) + { + if (pEntry->srcofs & 
ShuffleEntry::REGMASK) + { + // movq xmm8, srcReg + int srcRegIndex = pEntry->srcofs & ShuffleEntry::OFSREGMASK; + X64EmitMovqXmmReg((X86Reg)kXMM8, c_argRegs[srcRegIndex]); + } + else + { + // movsd xmm8, [rax + src] + int srcOffset = (pEntry->srcofs + 1) * sizeof(void*); + X64EmitMovSDFromMem((X86Reg)(kXMM8), SCRATCH_REGISTER_X86REG, srcOffset); + } + } + else if (pEntry->srcofs & ShuffleEntry::REGMASK) { // Source in a general purpose or float register, destination in the same kind of a register or on stack int srcRegIndex = pEntry->srcofs & ShuffleEntry::OFSREGMASK; diff --git a/src/vm/i386/stublinkerx86.h b/src/vm/i386/stublinkerx86.h index 3d79b51868..49731a8ce4 100644 --- a/src/vm/i386/stublinkerx86.h +++ b/src/vm/i386/stublinkerx86.h @@ -200,8 +200,11 @@ class StubLinkerCPU : public StubLinker VOID X64EmitMovSDToMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0); VOID X64EmitMovSSFromMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0); VOID X64EmitMovSSToMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0); + VOID X64EmitMovqRegXmm(X86Reg reg, X86Reg Xmmreg); + VOID X64EmitMovqXmmReg(X86Reg Xmmreg, X86Reg reg); VOID X64EmitMovXmmWorker(BYTE prefix, BYTE opcode, X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0); + VOID X64EmitMovqWorker(BYTE opcode, X86Reg Xmmreg, X86Reg reg); #endif VOID X86EmitZeroOutReg(X86Reg reg); |