diff options
author | Mike Danes <onemihaid@hotmail.com> | 2017-09-17 12:49:10 +0300 |
---|---|---|
committer | Mike Danes <onemihaid@hotmail.com> | 2017-10-02 08:30:07 +0300 |
commit | 0b637a639e8b8972b0a17322cecb50bf0d99b7c6 (patch) | |
tree | d8911bef1bd51c97053e825f324d376efb2c82fd /src/jit | |
parent | 9b49593f5ca1b3168a4ff947cbfc3d52a97671cb (diff) | |
download | coreclr-0b637a639e8b8972b0a17322cecb50bf0d99b7c6.tar.gz coreclr-0b637a639e8b8972b0a17322cecb50bf0d99b7c6.tar.bz2 coreclr-0b637a639e8b8972b0a17322cecb50bf0d99b7c6.zip |
Simplify SIMD EQ/NE optimization
Diffstat (limited to 'src/jit')
-rw-r--r-- | src/jit/codegenxarch.cpp | 21 | ||||
-rw-r--r-- | src/jit/lower.cpp | 50 | ||||
-rw-r--r-- | src/jit/lowerxarch.cpp | 67 | ||||
-rw-r--r-- | src/jit/lsra.cpp | 2 | ||||
-rw-r--r-- | src/jit/lsraxarch.cpp | 51 | ||||
-rw-r--r-- | src/jit/simdcodegenxarch.cpp | 45 |
6 files changed, 85 insertions, 151 deletions
diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp index 58c6c10222..43327afc55 100644 --- a/src/jit/codegenxarch.cpp +++ b/src/jit/codegenxarch.cpp @@ -6282,27 +6282,6 @@ void CodeGen::genCompareInt(GenTreePtr treeNode) return; } -#ifdef FEATURE_SIMD - // If we have GT_JTRUE(GT_EQ/NE(GT_SIMD((in)Equality, v1, v2), true/false)), - // then we don't need to generate code for GT_EQ/GT_NE, since SIMD (in)Equality intrinsic - // would set or clear Zero flag. - if ((targetReg == REG_NA) && tree->OperIs(GT_EQ, GT_NE)) - { - // Is it a SIMD (in)Equality that doesn't need to materialize result into a register? - if ((op1->gtRegNum == REG_NA) && op1->IsSIMDEqualityOrInequality()) - { - // Must be comparing against true or false. - assert(op2->IsIntegralConst(0) || op2->IsIntegralConst(1)); - assert(op2->isContainedIntOrIImmed()); - - // In this case SIMD (in)Equality will set or clear - // Zero flag, based on which GT_JTRUE would generate - // the right conditional jump. - return; - } - } -#endif // FEATURE_SIMD - genConsumeOperands(tree); assert(!op1->isContainedIntOrIImmed()); diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp index d82676e540..0b9f9b22f7 100644 --- a/src/jit/lower.cpp +++ b/src/jit/lower.cpp @@ -5606,56 +5606,6 @@ void Lowering::ContainCheckJTrue(GenTreeOp* node) // The compare does not need to be generated into a register. GenTree* cmp = node->gtGetOp1(); cmp->gtLsraInfo.isNoRegCompare = true; - -#ifdef FEATURE_SIMD - assert(node->OperIs(GT_JTRUE)); - - // Say we have the following IR - // simdCompareResult = GT_SIMD((In)Equality, v1, v2) - // integerCompareResult = GT_EQ/NE(simdCompareResult, true/false) - // GT_JTRUE(integerCompareResult) - // - // In this case we don't need to generate code for GT_EQ_/NE, since SIMD (In)Equality - // intrinsic will set or clear the Zero flag. - genTreeOps cmpOper = cmp->OperGet(); - if (cmpOper == GT_EQ || cmpOper == GT_NE) - { - GenTree* cmpOp1 = cmp->gtGetOp1(); - GenTree* cmpOp2 = cmp->gtGetOp2(); - - if (cmpOp1->IsSIMDEqualityOrInequality() && (cmpOp2->IsIntegralConst(0) || cmpOp2->IsIntegralConst(1))) - { - // We always generate code for a SIMD equality comparison, though it produces no value. - // Neither the GT_JTRUE nor the immediate need to be evaluated. - MakeSrcContained(cmp, cmpOp2); - cmpOp1->gtLsraInfo.isNoRegCompare = true; - // We have to reverse compare oper in the following cases: - // 1) SIMD Equality: Sets Zero flag on equal otherwise clears it. - // Therefore, if compare oper is == or != against false(0), we will - // be checking opposite of what is required. - // - // 2) SIMD inEquality: Clears Zero flag on true otherwise sets it. - // Therefore, if compare oper is == or != against true(1), we will - // be checking opposite of what is required. - GenTreeSIMD* simdNode = cmpOp1->AsSIMD(); - if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) - { - if (cmpOp2->IsIntegralConst(0)) - { - cmp->SetOper(GenTree::ReverseRelop(cmpOper)); - } - } - else - { - assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality); - if (cmpOp2->IsIntegralConst(1)) - { - cmp->SetOper(GenTree::ReverseRelop(cmpOper)); - } - } - } - } -#endif // FEATURE_SIMD } #endif // !LEGACY_BACKEND diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index bb45279887..834cd34fda 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -794,6 +794,73 @@ void Lowering::LowerSIMD(GenTreeSIMD* simdNode) // the addr of SIMD vector with the given index. simdNode->gtOp1->gtFlags |= GTF_IND_REQ_ADDR_IN_REG; } + else if (simdNode->IsSIMDEqualityOrInequality()) + { + LIR::Use simdUse; + + if (BlockRange().TryGetUse(simdNode, &simdUse)) + { + // + // Try to transform JTRUE(EQ|NE(SIMD<OpEquality|OpInEquality>(x, y), 0|1)) into + // JCC(SIMD<OpEquality|OpInEquality>(x, y)). SIMD<OpEquality|OpInEquality>(x, y) + // is expected to set the Zero flag appropriately. + // All the involved nodes must form a continuous range, there's no other way to + // guarantee that condition flags aren't changed between the SIMD node and the JCC + // node. + // + + bool transformed = false; + GenTree* simdUser = simdUse.User(); + + if (simdUser->OperIs(GT_EQ, GT_NE) && simdUser->gtGetOp2()->IsCnsIntOrI() && + (simdNode->gtNext == simdUser->gtGetOp2()) && (simdUser->gtGetOp2()->gtNext == simdUser)) + { + ssize_t relopOp2Value = simdUser->gtGetOp2()->AsIntCon()->IconValue(); + + if ((relopOp2Value == 0) || (relopOp2Value == 1)) + { + GenTree* jtrue = simdUser->gtNext; + + if ((jtrue != nullptr) && jtrue->OperIs(GT_JTRUE) && (jtrue->gtGetOp1() == simdUser)) + { + if ((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) != simdUser->OperIs(GT_EQ)) + { + relopOp2Value ^= 1; + } + + jtrue->ChangeOper(GT_JCC); + GenTreeCC* jcc = jtrue->AsCC(); + jcc->gtFlags |= GTF_USE_FLAGS; + jcc->gtCondition = (relopOp2Value == 0) ? GT_NE : GT_EQ; + + BlockRange().Remove(simdUser->gtGetOp2()); + BlockRange().Remove(simdUser); + transformed = true; + } + } + } + + if (!transformed) + { + // + // The code generated for SIMD SIMD<OpEquality|OpInEquality>(x, y) nodes sets + // the Zero flag like integer compares do so we can simply use SETCC<EQ|NE> + // to produce the desired result. This avoids the need for subsequent phases + // to have to handle 2 cases (set flags/set destination register). + // + + genTreeOps condition = (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? GT_EQ : GT_NE; + GenTreeCC* setcc = new (comp, GT_SETCC) GenTreeCC(GT_SETCC, condition, simdNode->TypeGet()); + setcc->gtFlags |= GTF_USE_FLAGS; + BlockRange().InsertAfter(simdNode, setcc); + simdUse.ReplaceWith(comp, setcc); + } + } + + simdNode->gtFlags |= GTF_SET_FLAGS; + simdNode->SetUnusedValue(); + simdNode->gtLsraInfo.isNoRegCompare = true; + } #endif ContainCheckSIMD(simdNode); } diff --git a/src/jit/lsra.cpp b/src/jit/lsra.cpp index db6e699134..1ace2f0da6 100644 --- a/src/jit/lsra.cpp +++ b/src/jit/lsra.cpp @@ -4842,7 +4842,7 @@ void LinearScan::buildIntervals() TreeNodeInfoInit(node); // If the node produces an unused value, mark it as a local def-use - if (node->IsValue() && node->IsUnusedValue()) + if (node->IsValue() && node->IsUnusedValue() && !node->gtLsraInfo.isNoRegCompare) { node->gtLsraInfo.isLocalDefUse = true; node->gtLsraInfo.dstCount = 0; diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp index 8ac8e235b5..4fff111506 100644 --- a/src/jit/lsraxarch.cpp +++ b/src/jit/lsraxarch.cpp @@ -254,20 +254,6 @@ void LinearScan::TreeNodeInfoInit(GenTree* tree) GenTree* cmp = tree->gtGetOp1(); assert(cmp->gtLsraInfo.dstCount == 0); - -#ifdef FEATURE_SIMD - GenTree* cmpOp1 = cmp->gtGetOp1(); - GenTree* cmpOp2 = cmp->gtGetOp2(); - if (cmpOp1->IsSIMDEqualityOrInequality() && cmpOp2->isContained()) - { - genTreeOps cmpOper = cmp->OperGet(); - - // We always generate code for a SIMD equality comparison, but the compare itself produces no value. - // Neither the SIMD node nor the immediate need to be evaluated into a register. - assert(cmpOp1->gtLsraInfo.dstCount == 0); - assert(cmpOp2->gtLsraInfo.dstCount == 0); - } -#endif // FEATURE_SIMD } break; @@ -2136,41 +2122,26 @@ void LinearScan::TreeNodeInfoInitSIMD(GenTreeSIMD* simdTree) case SIMDIntrinsicOpEquality: case SIMDIntrinsicOpInEquality: - - // On SSE4/AVX, we can generate optimal code for (in)equality - // against zero using ptest. We can safely do this optimization - // for integral vectors but not for floating-point for the reason - // that we have +0.0 and -0.0 and +0.0 == -0.0 if (simdTree->gtGetOp2()->isContained()) { + // If the second operand is contained then ContainCheckSIMD has determined + // that PTEST can be used. We only need a single source register and no + // internal registers. info->srcCount = 1; } else { - info->srcCount = 2; - // Need one SIMD register as scratch. - // See genSIMDIntrinsicRelOp() for details on code sequence generated and - // the need for one scratch register. - // - // Note these intrinsics produce a BOOL result, hence internal float - // registers reserved are guaranteed to be different from target - // integer register without explicitly specifying. + // Can't use PTEST so we need 2 source registers, 1 internal SIMD register + // (to hold the result of PCMPEQD or other similar SIMD compare instruction) + // and one internal INT register (to hold the result of PMOVMSKB). + info->srcCount = 2; info->internalFloatCount = 1; info->setInternalCandidates(this, allSIMDRegs()); + info->internalIntCount = 1; + info->addInternalCandidates(this, allRegs(TYP_INT)); } - if (info->isNoRegCompare) - { - info->dstCount = 0; - // Codegen of SIMD (in)Equality uses target integer reg only for setting flags. - // A target reg is not needed on AVX when comparing against Vector Zero. - // In all other cases we need to reserve an int type internal register if we - // don't have a target register on the compare. - if (!compiler->canUseAVX() || !simdTree->gtGetOp2()->IsIntegralConstVector(0)) - { - info->internalIntCount = 1; - info->addInternalCandidates(this, allRegs(TYP_INT)); - } - } + // These SIMD nodes only set the condition flags. + info->dstCount = 0; break; case SIMDIntrinsicDotProduct: diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp index b7c9b29088..429cff0bb3 100644 --- a/src/jit/simdcodegenxarch.cpp +++ b/src/jit/simdcodegenxarch.cpp @@ -2049,6 +2049,9 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode) case SIMDIntrinsicOpEquality: case SIMDIntrinsicOpInEquality: { + // We're only setting condition flags, if a 0/1 value is desired then Lowering should have inserted a SETCC. + assert(targetReg == REG_NA); + var_types simdType = op1->TypeGet(); // TODO-1stClassStructs: Temporary to minimize asmDiffs if (simdType == TYP_DOUBLE) @@ -2064,9 +2067,9 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode) } // On SSE4/AVX, we can generate optimal code for (in)equality against zero using ptest. - if ((compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4) && op2->IsIntegralConstVector(0)) + if (op2->isContained()) { - assert(op2->isContained()); + assert((compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4) && op2->IsIntegralConstVector(0)); inst_RV_RV(INS_ptest, op1->gtRegNum, op1->gtRegNum, simdType, emitActualTypeSize(simdType)); } else @@ -2103,22 +2106,7 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode) inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType)); } - regNumber intReg; - if (targetReg == REG_NA) - { - // If we are not materializing result into a register, - // we would have reserved an int type internal register. - intReg = simdNode->GetSingleTempReg(RBM_ALLINT); - } - else - { - // We can use targetReg for setting flags. - intReg = targetReg; - - // Must have not reserved any int type internal registers. - assert(simdNode->AvailableTempRegCount(RBM_ALLINT) == 0); - } - + regNumber intReg = simdNode->GetSingleTempReg(RBM_ALLINT); inst_RV_RV(INS_pmovmskb, intReg, tmpReg1, simdType, emitActualTypeSize(simdType)); // There's no pmovmskw/pmovmskd/pmovmskq but they're not needed anyway. Vector compare // instructions produce "all ones"/"all zeroes" components and pmovmskb extracts a @@ -2147,27 +2135,6 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode) } getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, mask); } - - if (targetReg != REG_NA) - { - // If we need to materialize result into a register, targetReg needs to - // be set to 1 on true and zero on false. - // Equality: - // cmp targetReg, 0xFFFFFFFF or 0xFFFF - // sete targetReg - // movzx targetReg, targetReg - // - // InEquality: - // cmp targetReg, 0xFFFFFFFF or 0xFFFF - // setne targetReg - // movzx targetReg, targetReg - // - assert(simdNode->TypeGet() == TYP_INT); - inst_RV((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? INS_sete : INS_setne, targetReg, - TYP_INT, EA_1BYTE); - // Set the higher bytes to 0 - inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE)); - } } break; |