summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJUNG DONG-HEON <dheon.jung@samsung.com>2020-05-28 11:15:47 +0900
committer이형주/Common Platform Lab(SR)/Staff Engineer/삼성전자 <leee.lee@samsung.com>2020-06-18 07:38:46 +0900
commit488be5d790020489f7f4dd7d43680f43b101dbd4 (patch)
treee051d18299dfa580fabdb1ca32863f517f0ab356
parent45278569f1e32a5e3ab52f1643ee5f0f3b1f00c9 (diff)
downloadcoreclr-488be5d790020489f7f4dd7d43680f43b101dbd4.tar.gz
coreclr-488be5d790020489f7f4dd7d43680f43b101dbd4.tar.bz2
coreclr-488be5d790020489f7f4dd7d43680f43b101dbd4.zip
Fix GenerateShuffleArray to support cyclic shuffles (dotnet/coreclr#26169)
* Fix GenerateShuffleArray to support cyclic shuffles The GenerateShuffleArray was not handling case when there was a cycle in the register / stack slots shuffle and it resulted in an infinite loop in this function. This issue is Unix Amd64 ABI specific. To fix that, this change reworks the algorithm completely. Besides fixing the issue, it has also better performance in some cases. To fix the cyclic shuffling, I needed an extra helper register. However, there was no available general purpose register available, so I had to use xmm8 for this purpose. * Remove special handling of the hang from ABI stress
-rw-r--r--src/vm/comdelegate.cpp123
-rw-r--r--src/vm/comdelegate.h1
-rw-r--r--src/vm/i386/stublinkerx86.cpp75
-rw-r--r--src/vm/i386/stublinkerx86.h3
4 files changed, 182 insertions, 20 deletions
diff --git a/src/vm/comdelegate.cpp b/src/vm/comdelegate.cpp
index 7ee85b6136..9d92a7c6e5 100644
--- a/src/vm/comdelegate.cpp
+++ b/src/vm/comdelegate.cpp
@@ -237,6 +237,22 @@ int GetNormalizedArgumentSlotIndex(UINT16 offset)
return index;
}
+
+// Node of a directed graph where nodes represent registers / stack slots
+// and edges represent moves of data.
+struct ShuffleGraphNode
+{
+ static const UINT16 NoNode = 0xffff;
+ // Previous node (represents source of data for the register / stack of the current node)
+ UINT16 prev;
+ // Offset of the register / stack slot
+ UINT16 ofs;
+ // Set to true for nodes that are source of data for a destination node
+ UINT8 isSource;
+ // Nodes that are marked are either already processed or don't participate in the shuffling
+ UINT8 isMarked;
+};
+
#endif // UNIX_AMD64_ABI
VOID GenerateShuffleArray(MethodDesc* pInvoke, MethodDesc *pTargetMeth, SArray<ShuffleEntry> * pShuffleEntryArray)
@@ -458,6 +474,7 @@ VOID GenerateShuffleArray(MethodDesc* pInvoke, MethodDesc *pTargetMeth, SArray<S
_ASSERTE(!iteratorDst.HasNextOfs());
}
+
#if defined(UNIX_AMD64_ABI)
// The Unix AMD64 ABI can cause a struct to be passed on stack for the source and in registers for the destination.
// That can cause some arguments that are passed on stack for the destination to be passed in registers in the source.
@@ -465,39 +482,107 @@ VOID GenerateShuffleArray(MethodDesc* pInvoke, MethodDesc *pTargetMeth, SArray<S
// void fn(int, int, int, int, int, struct {int, double}, double, double, double, double, double, double, double, double, double, double)
// For this signature, the shuffle needs to move slots as follows (please note the "forward" movement of xmm registers):
// RDI->RSI, RDX->RCX, R8->RDX, R9->R8, stack[0]->R9, xmm0->xmm1, xmm1->xmm2, ... xmm6->xmm7, xmm7->stack[0], stack[1]->xmm0, stack[2]->stack[1], stack[3]->stack[2]
- // To prevent overwriting of slots before they are moved, we need to sort the move operations.
+ // To prevent overwriting of slots before they are moved, we need to perform the shuffling in correct order
+
+ NewArrayHolder<ShuffleGraphNode> pGraphNodes = new ShuffleGraphNode[argSlots];
+
+ // Initialize the graph array
+ for (unsigned int i = 0; i < argSlots; i++)
+ {
+ pGraphNodes[i].prev = ShuffleGraphNode::NoNode;
+ pGraphNodes[i].isMarked = true;
+ pGraphNodes[i].isSource = false;
+ }
+
+ // Build the directed graph representing register and stack slot shuffling.
+ // The links are directed from destination to source.
+ // During the build also set isSource flag for nodes that are sources of data.
+ // The ones that don't have the isSource flag set are beginnings of non-cyclic
+ // segments of the graph.
+ for (unsigned int i = 0; i < pShuffleEntryArray->GetCount(); i++)
+ {
+ ShuffleEntry entry = (*pShuffleEntryArray)[i];
+
+ int srcIndex = GetNormalizedArgumentSlotIndex(entry.srcofs);
+ int dstIndex = GetNormalizedArgumentSlotIndex(entry.dstofs);
+
+ // Unmark the node to indicate that it was not processed yet
+ pGraphNodes[srcIndex].isMarked = false;
+ // The node contains a register / stack slot that is a source from which we move data to a destination one
+ pGraphNodes[srcIndex].isSource = true;
+ pGraphNodes[srcIndex].ofs = entry.srcofs;
+
+ // Unmark the node to indicate that it was not processed yet
+ pGraphNodes[dstIndex].isMarked = false;
+ // Link to the previous node in the graph (source of data for the current node)
+ pGraphNodes[dstIndex].prev = srcIndex;
+ pGraphNodes[dstIndex].ofs = entry.dstofs;
+ }
- NewArrayHolder<bool> filledSlots = new bool[argSlots];
+ // Now that we've built the graph, clear the array, we will regenerate it from the graph ensuring a proper order of shuffling
+ pShuffleEntryArray->Clear();
- bool reordered;
- do
+ // Add all non-cyclic subgraphs to the target shuffle array and mark their nodes as visited
+ for (unsigned int startIndex = 0; startIndex < argSlots; startIndex++)
{
- reordered = false;
+ unsigned int index = startIndex;
- for (int i = 0; i < argSlots; i++)
+ if (!pGraphNodes[index].isMarked && !pGraphNodes[index].isSource)
{
- filledSlots[i] = false;
+ // This node is not a source, that means it is an end of shuffle chain
+ // Generate shuffle array entries for all nodes in the chain in a correct
+ // order.
+ UINT16 dstOfs = ShuffleEntry::SENTINEL;
+
+ do
+ {
+ pGraphNodes[index].isMarked = true;
+ if (dstOfs != ShuffleEntry::SENTINEL)
+ {
+ entry.srcofs = pGraphNodes[index].ofs;
+ entry.dstofs = dstOfs;
+ pShuffleEntryArray->Append(entry);
+ }
+
+ dstOfs = pGraphNodes[index].ofs;
+ index = pGraphNodes[index].prev;
+ }
+ while (index != ShuffleGraphNode::NoNode);
}
- for (unsigned int i = 0; i < pShuffleEntryArray->GetCount(); i++)
+ }
+
+ // Process all cycles in the graph
+ for (unsigned int startIndex = 0; startIndex < argSlots; startIndex++)
+ {
+ unsigned int index = startIndex;
+
+ if (!pGraphNodes[index].isMarked)
{
- entry = (*pShuffleEntryArray)[i];
+ // This node is part of a new cycle as all non-cyclic parts of the graphs were already visited
- // If the slot that we are moving the argument to was filled in already, we need to move this entry in front
- // of the entry that filled it in.
- if (filledSlots[GetNormalizedArgumentSlotIndex(entry.srcofs)])
+ // Move the first node register / stack slot to a helper reg
+ UINT16 dstOfs = ShuffleEntry::HELPERREG;
+
+ do
{
- unsigned int j;
- for (j = i; (*pShuffleEntryArray)[j].dstofs != entry.srcofs; j--)
- (*pShuffleEntryArray)[j] = (*pShuffleEntryArray)[j - 1];
+ pGraphNodes[index].isMarked = true;
- (*pShuffleEntryArray)[j] = entry;
- reordered = true;
+ entry.srcofs = pGraphNodes[index].ofs;
+ entry.dstofs = dstOfs;
+ pShuffleEntryArray->Append(entry);
+
+ dstOfs = pGraphNodes[index].ofs;
+ index = pGraphNodes[index].prev;
}
+ while (index != startIndex);
- filledSlots[GetNormalizedArgumentSlotIndex(entry.dstofs)] = true;
+ // Move helper reg to the last node register / stack slot
+ entry.srcofs = ShuffleEntry::HELPERREG;
+ entry.dstofs = dstOfs;
+ pShuffleEntryArray->Append(entry);
}
}
- while (reordered);
+
#endif // UNIX_AMD64_ABI
entry.srcofs = ShuffleEntry::SENTINEL;
diff --git a/src/vm/comdelegate.h b/src/vm/comdelegate.h
index 3d5ec8e30e..330f1f8886 100644
--- a/src/vm/comdelegate.h
+++ b/src/vm/comdelegate.h
@@ -194,6 +194,7 @@ struct ShuffleEntry
OFSMASK = 0x7fff, // Mask to get stack offset
OFSREGMASK = 0x1fff, // Mask to get register index
SENTINEL = 0xffff, // Indicates end of shuffle array
+ HELPERREG = 0xcfff, // Use a helper register as source or destination (used to handle cycles in the shuffling)
};
#if defined(_TARGET_AMD64_) && !defined(UNIX_AMD64_ABI)
diff --git a/src/vm/i386/stublinkerx86.cpp b/src/vm/i386/stublinkerx86.cpp
index 7020e40e51..983fc3f36a 100644
--- a/src/vm/i386/stublinkerx86.cpp
+++ b/src/vm/i386/stublinkerx86.cpp
@@ -1739,6 +1739,49 @@ VOID StubLinkerCPU::X64EmitMovSSToMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs
X64EmitMovXmmWorker(0xF3, 0x11, Xmmreg, baseReg, ofs);
}
+VOID StubLinkerCPU::X64EmitMovqRegXmm(X86Reg reg, X86Reg Xmmreg)
+{
+ STANDARD_VM_CONTRACT;
+ X64EmitMovqWorker(0x7e, Xmmreg, reg);
+}
+
+VOID StubLinkerCPU::X64EmitMovqXmmReg(X86Reg Xmmreg, X86Reg reg)
+{
+ STANDARD_VM_CONTRACT;
+ X64EmitMovqWorker(0x6e, Xmmreg, reg);
+}
+
+//-----------------------------------------------------------------------------
+// Helper method for emitting movq between xmm and general purpose reqister
+//-----------------------------------------------------------------------------
+VOID StubLinkerCPU::X64EmitMovqWorker(BYTE opcode, X86Reg Xmmreg, X86Reg reg)
+{
+ BYTE codeBuffer[10];
+ unsigned int nBytes = 0;
+ codeBuffer[nBytes++] = 0x66;
+ BYTE rex = REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT;
+ if (reg >= kR8)
+ {
+ rex |= REX_MODRM_RM_EXT;
+ reg = X86RegFromAMD64Reg(reg);
+ }
+ if (Xmmreg >= kXMM8)
+ {
+ rex |= REX_MODRM_REG_EXT;
+ Xmmreg = X86RegFromAMD64Reg(Xmmreg);
+ }
+ codeBuffer[nBytes++] = rex;
+ codeBuffer[nBytes++] = 0x0f;
+ codeBuffer[nBytes++] = opcode;
+ BYTE modrm = static_cast<BYTE>((Xmmreg << 3) | reg);
+ codeBuffer[nBytes++] = 0xC0|modrm;
+
+ _ASSERTE(nBytes <= _countof(codeBuffer));
+
+ // Lastly, emit the encoded bytes
+ EmitBytes(codeBuffer, nBytes);
+}
+
//---------------------------------------------------------------
// Helper method for emitting of XMM from/to memory moves
//---------------------------------------------------------------
@@ -3800,7 +3843,37 @@ VOID StubLinkerCPU::EmitShuffleThunk(ShuffleEntry *pShuffleEntryArray)
#ifdef UNIX_AMD64_ABI
for (ShuffleEntry* pEntry = pShuffleEntryArray; pEntry->srcofs != ShuffleEntry::SENTINEL; pEntry++)
{
- if (pEntry->srcofs & ShuffleEntry::REGMASK)
+ if (pEntry->srcofs == ShuffleEntry::HELPERREG)
+ {
+ if (pEntry->dstofs & ShuffleEntry::REGMASK)
+ {
+ // movq dstReg, xmm8
+ int dstRegIndex = pEntry->dstofs & ShuffleEntry::OFSREGMASK;
+ X64EmitMovqRegXmm(c_argRegs[dstRegIndex], (X86Reg)kXMM8);
+ }
+ else
+ {
+ // movsd [rax + dst], xmm8
+ int dstOffset = (pEntry->dstofs + 1) * sizeof(void*);
+ X64EmitMovSDToMem((X86Reg)kXMM8, SCRATCH_REGISTER_X86REG, dstOffset);
+ }
+ }
+ else if (pEntry->dstofs == ShuffleEntry::HELPERREG)
+ {
+ if (pEntry->srcofs & ShuffleEntry::REGMASK)
+ {
+ // movq xmm8, srcReg
+ int srcRegIndex = pEntry->srcofs & ShuffleEntry::OFSREGMASK;
+ X64EmitMovqXmmReg((X86Reg)kXMM8, c_argRegs[srcRegIndex]);
+ }
+ else
+ {
+ // movsd xmm8, [rax + src]
+ int srcOffset = (pEntry->srcofs + 1) * sizeof(void*);
+ X64EmitMovSDFromMem((X86Reg)(kXMM8), SCRATCH_REGISTER_X86REG, srcOffset);
+ }
+ }
+ else if (pEntry->srcofs & ShuffleEntry::REGMASK)
{
// Source in a general purpose or float register, destination in the same kind of a register or on stack
int srcRegIndex = pEntry->srcofs & ShuffleEntry::OFSREGMASK;
diff --git a/src/vm/i386/stublinkerx86.h b/src/vm/i386/stublinkerx86.h
index 3d79b51868..49731a8ce4 100644
--- a/src/vm/i386/stublinkerx86.h
+++ b/src/vm/i386/stublinkerx86.h
@@ -200,8 +200,11 @@ class StubLinkerCPU : public StubLinker
VOID X64EmitMovSDToMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0);
VOID X64EmitMovSSFromMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0);
VOID X64EmitMovSSToMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0);
+ VOID X64EmitMovqRegXmm(X86Reg reg, X86Reg Xmmreg);
+ VOID X64EmitMovqXmmReg(X86Reg Xmmreg, X86Reg reg);
VOID X64EmitMovXmmWorker(BYTE prefix, BYTE opcode, X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0);
+ VOID X64EmitMovqWorker(BYTE opcode, X86Reg Xmmreg, X86Reg reg);
#endif
VOID X86EmitZeroOutReg(X86Reg reg);