summaryrefslogtreecommitdiff
path: root/src/jit
diff options
context:
space:
mode:
authorCarol Eidt <carol.eidt@microsoft.com>2016-12-07 11:28:30 -0800
committerGitHub <noreply@github.com>2016-12-07 11:28:30 -0800
commit4d35a3a3dba0a4d2fc17e7b31dab87d8ac9b81dc (patch)
tree7034056ab118728e0f37ea897d216ee6b9fe30c5 /src/jit
parent0d926c5949d4fbf541e9cefbcf627967f318d67b (diff)
parent0403e4d81f67a9abe61bf8637deff85b971381b7 (diff)
downloadcoreclr-4d35a3a3dba0a4d2fc17e7b31dab87d8ac9b81dc.tar.gz
coreclr-4d35a3a3dba0a4d2fc17e7b31dab87d8ac9b81dc.tar.bz2
coreclr-4d35a3a3dba0a4d2fc17e7b31dab87d8ac9b81dc.zip
Merge pull request #8482 from CarolEidt/Fix8220
Use only lower floats for Vector3 dot and equality
Diffstat (limited to 'src/jit')
-rw-r--r--src/jit/gentree.h3
-rw-r--r--src/jit/simd.cpp22
-rw-r--r--src/jit/simd.h15
-rw-r--r--src/jit/simdcodegenxarch.cpp58
4 files changed, 77 insertions, 21 deletions
diff --git a/src/jit/gentree.h b/src/jit/gentree.h
index 9be18b8bff..05a3a4a070 100644
--- a/src/jit/gentree.h
+++ b/src/jit/gentree.h
@@ -955,6 +955,9 @@ public:
// struct fields constituting a single call argument.
//----------------------------------------------------------------
+#define GTF_SIMD12_OP 0x80000000 // GT_SIMD -- Indicates that the operands need to be handled as SIMD12
+ // even if they have been retyped as SIMD16.
+//----------------------------------------------------------------
#define GTF_STMT_CMPADD 0x80000000 // GT_STMT -- added by compiler
#define GTF_STMT_HAS_CSE 0x40000000 // GT_STMT -- CSE def or use was substituted
diff --git a/src/jit/simd.cpp b/src/jit/simd.cpp
index 7b3896089f..39664c47bf 100644
--- a/src/jit/simd.cpp
+++ b/src/jit/simd.cpp
@@ -754,7 +754,7 @@ SIMDIntrinsicID Compiler::impSIMDLongRelOpEqual(CORINFO_CLASS_HANDLE typeHnd,
//
// Equality(v1, v2):
// tmp = (v1 == v2) i.e. compare for equality as if v1 and v2 are vector<int>
- // result = BitwiseAnd(t, shuffle(t, (2, 3, 1 0)))
+ // result = BitwiseAnd(tmp, shuffle(tmp, (2, 3, 0, 1)))
// Shuffle is meant to swap the comparison results of the low 32 bits and high 32 bits of respective long elements.
// Compare vector<long> as if they were vector<int> and assign the result to a temp
@@ -768,7 +768,7 @@ SIMDIntrinsicID Compiler::impSIMDLongRelOpEqual(CORINFO_CLASS_HANDLE typeHnd,
// op2 = Shuffle(tmp, 0xB1)
// IntrinsicId = BitwiseAnd
*pOp1 = gtNewOperNode(GT_COMMA, simdType, asg, tmp);
- *pOp2 = gtNewSIMDNode(simdType, gtNewLclvNode(lclNum, simdType), gtNewIconNode(SHUFFLE_ZWYX, TYP_INT),
+ *pOp2 = gtNewSIMDNode(simdType, gtNewLclvNode(lclNum, simdType), gtNewIconNode(SHUFFLE_ZWXY, TYP_INT),
SIMDIntrinsicShuffleSSE2, TYP_INT, size);
return SIMDIntrinsicBitwiseAnd;
}
@@ -2248,7 +2248,11 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode,
assert(op2->TypeGet() == simdType);
simdTree = gtNewSIMDNode(genActualType(callType), op1, op2, SIMDIntrinsicOpEquality, baseType, size);
- retVal = simdTree;
+ if (simdType == TYP_SIMD12)
+ {
+ simdTree->gtFlags |= GTF_SIMD12_OP;
+ }
+ retVal = simdTree;
}
break;
@@ -2259,7 +2263,11 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode,
op2 = impSIMDPopStack(simdType);
op1 = impSIMDPopStack(simdType, instMethod);
simdTree = gtNewSIMDNode(genActualType(callType), op1, op2, SIMDIntrinsicOpInEquality, baseType, size);
- retVal = simdTree;
+ if (simdType == TYP_SIMD12)
+ {
+ simdTree->gtFlags |= GTF_SIMD12_OP;
+ }
+ retVal = simdTree;
}
break;
@@ -2407,7 +2415,11 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode,
op1 = impSIMDPopStack(simdType, instMethod);
simdTree = gtNewSIMDNode(baseType, op1, op2, simdIntrinsicID, baseType, size);
- retVal = simdTree;
+ if (simdType == TYP_SIMD12)
+ {
+ simdTree->gtFlags |= GTF_SIMD12_OP;
+ }
+ retVal = simdTree;
}
break;
diff --git a/src/jit/simd.h b/src/jit/simd.h
index 807ccf90f1..c4a8866b07 100644
--- a/src/jit/simd.h
+++ b/src/jit/simd.h
@@ -32,11 +32,16 @@ struct SIMDIntrinsicInfo
#ifdef _TARGET_XARCH_
// SSE2 Shuffle control byte to shuffle vector <W, Z, Y, X>
// These correspond to shuffle immediate byte in shufps SSE2 instruction.
-#define SHUFFLE_XXXX 0x00
-#define SHUFFLE_ZWYX 0xB1
-#define SHUFFLE_WWYY 0xF5
-#define SHUFFLE_ZZXX 0xA0
-#endif // _TARGET_XARCH_
+#define SHUFFLE_XXXX 0x00 // 00 00 00 00
+#define SHUFFLE_XXWW 0x0F // 00 00 11 11
+#define SHUFFLE_XYZW 0x1B // 00 01 10 11
+#define SHUFFLE_YXYX 0x44 // 01 00 01 00
+#define SHUFFLE_YYZZ 0x5A // 01 01 10 10
+#define SHUFFLE_ZXXY 0x81 // 10 00 00 01
+#define SHUFFLE_ZWXY 0xB1 // 10 11 00 01
+#define SHUFFLE_WWYY 0xF5 // 11 11 01 01
+#define SHUFFLE_ZZXX 0xA0 // 10 10 00 00
+#endif // _TARGET_XARCH_
#endif // FEATURE_SIMD
diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp
index effb68bb92..5641ad1681 100644
--- a/src/jit/simdcodegenxarch.cpp
+++ b/src/jit/simdcodegenxarch.cpp
@@ -1199,11 +1199,26 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
// "all ones" where the number of ones is given by the vector byte size, not by the
// vector component count. So, for AVX registers we need to compare to 0xFFFFFFFF and
// for SSE registers we need to compare to 0x0000FFFF.
+ // The SIMD12 case is handled specially, because we can't rely on the upper bytes being
+ // zero, so we must compare only the lower 3 floats (hence the byte mask of 0xFFF).
// Note that -1 is used instead of 0xFFFFFFFF, on x64 emit doesn't correctly recognize
// that 0xFFFFFFFF can be encoded in a single byte and emits the longer 3DFFFFFFFF
// encoding instead of 83F8FF.
- getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg,
- emitActualTypeSize(simdType) == 32 ? -1 : 0x0000FFFF);
+ ssize_t mask;
+ if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0)
+ {
+ mask = 0x00000FFF;
+ getEmitter()->emitIns_R_I(INS_and, EA_4BYTE, intReg, mask);
+ }
+ else if (emitActualTypeSize(simdType) == 32)
+ {
+ mask = -1;
+ }
+ else
+ {
+ mask = 0x0000FFFF;
+ }
+ getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, mask);
}
if (targetReg != REG_NA)
@@ -1343,15 +1358,34 @@ void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
// DotProduct(v1, v2)
// Here v0 = targetReg, v1 = op1Reg, v2 = op2Reg and tmp = tmpReg1
- if (baseType == TYP_FLOAT)
+ if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0)
{
+ assert(baseType == TYP_FLOAT);
// v0 = v1 * v2
// tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its
// // position
- // tmp = shuffle(tmp, tmp, Shuffle(2,3,0,1)) // tmp = (2, 3, 0, 1)
+ // tmp = shuffle(tmp, tmp, SHUFFLE_ZXXY) // tmp = (2, 0, 0, 1) - don't really care what's in upper
+ // // bits
+ // v0 = v0 + tmp // v0 = (3+2, 0+2, 1+0, 0+1)
+ // tmp = shuffle(tmp, tmp, SHUFFLE_XXWW) // tmp = ( 1, 1, 2, 2)
+ // v0 = v0 + tmp // v0 = (1+2+3, 0+1+2, 0+1+2, 0+1+2)
+ //
+ inst_RV_RV(INS_mulps, targetReg, op2Reg);
+ inst_RV_RV(INS_movaps, tmpReg1, targetReg);
+ inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZXXY);
+ inst_RV_RV(INS_addps, targetReg, tmpReg1);
+ inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XXWW);
+ inst_RV_RV(INS_addps, targetReg, tmpReg1);
+ }
+ else if (baseType == TYP_FLOAT)
+ {
+ // v0 = v1 * v2
+ // tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its
+ // // position
+ // tmp = shuffle(tmp, tmp, SHUFFLE_ZWXY) // tmp = (2, 3, 0, 1)
// v0 = v0 + tmp // v0 = (3+2, 2+3, 1+0, 0+1)
// tmp = v0
- // tmp = shuffle(tmp, tmp, Shuffle(0,1,2,3)) // tmp = (0+1, 1+0, 2+3, 3+2)
+ // tmp = shuffle(tmp, tmp, SHUFFLE_XYZW) // tmp = (0+1, 1+0, 2+3, 3+2)
// v0 = v0 + tmp // v0 = (0+1+2+3, 0+1+2+3, 0+1+2+3, 0+1+2+3)
// // Essentially horizontal addition of all elements.
// // We could achieve the same using SSEv3 instruction
@@ -1359,10 +1393,10 @@ void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
//
inst_RV_RV(INS_mulps, targetReg, op2Reg);
inst_RV_RV(INS_movaps, tmpReg1, targetReg);
- inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, 0xb1);
+ inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZWXY);
inst_RV_RV(INS_addps, targetReg, tmpReg1);
inst_RV_RV(INS_movaps, tmpReg1, targetReg);
- inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, 0x1b);
+ inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XYZW);
inst_RV_RV(INS_addps, targetReg, tmpReg1);
}
else
@@ -1406,8 +1440,10 @@ void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
emitAttr emitSize = emitActualTypeSize(simdEvalType);
if (baseType == TYP_FLOAT)
{
- inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, 0xf1);
-
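+ // The high nibble of the dpps immediate selects which element products are summed, and the
+ // low nibble selects which result elements receive that sum. 0xf1 sums all four products;
+ // 0x71 (the SIMD12 case) sums only the lower three, since the upper float may not be zero.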
+ unsigned mask = ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) ? 0x71 : 0xf1;
+ inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, mask);
// dpps computes the dot product of the upper & lower halves of the 32-byte register.
// Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
// If this is TYP_SIMD32, we need to combine the lower & upper results.
@@ -1993,7 +2029,7 @@ void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
getEmitter()->emitIns_R_AR(ins_Load(TYP_DOUBLE), EA_8BYTE, targetReg, operandReg, 0);
// combine upper 4 bytes and lower 8 bytes in targetReg
- getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, 0x44);
+ getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX);
genProduceReg(treeNode);
}
@@ -2076,7 +2112,7 @@ void CodeGen::genLoadLclTypeSIMD12(GenTree* treeNode)
getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_DOUBLE, false), EA_8BYTE, targetReg, varNum, offs);
// combine upper 4 bytes and lower 8 bytes in targetReg
- getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, 0x44);
+ getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX);
genProduceReg(treeNode);
}