Diffstat (limited to 'src/jit/simdcodegenxarch.cpp')
-rw-r--r--   src/jit/simdcodegenxarch.cpp   698
1 file changed, 446 insertions(+), 252 deletions(-)
diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp
index 702f967aad..ec933fd5d7 100644
--- a/src/jit/simdcodegenxarch.cpp
+++ b/src/jit/simdcodegenxarch.cpp
@@ -17,7 +17,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 
 #ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator.
 
-#ifdef _TARGET_AMD64_
+#ifdef _TARGET_XARCH_
 #include "emit.h"
 #include "codegen.h"
 #include "sideeffects.h"
@@ -62,7 +62,7 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
         // AVX supports broadcast instructions to populate YMM reg with a single float/double value from memory.
         // AVX2 supports broadcast instructions to populate YMM reg with a single value from memory or mm reg.
         // If we decide to use AVX2 only, we can remove this assert.
-        if ((compiler->opts.eeFlags & CORJIT_FLG_USE_AVX2) == 0)
+        if (!compiler->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_USE_AVX2))
         {
             assert(baseType == TYP_FLOAT || baseType == TYP_DOUBLE);
         }
@@ -205,12 +205,9 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
            {
                 result = INS_pmullw;
             }
-            else if (compiler->canUseAVX())
+            else if ((baseType == TYP_INT) && (compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4))
             {
-                if (baseType == TYP_INT)
-                {
-                    result = INS_pmulld;
-                }
+                result = INS_pmulld;
             }
             break;
 
@@ -300,7 +297,8 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
             {
                 result = INS_pcmpeqb;
             }
-            else if (compiler->canUseAVX() && (baseType == TYP_ULONG || baseType == TYP_LONG))
+            else if ((baseType == TYP_ULONG || baseType == TYP_LONG) &&
+                     (compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4))
             {
                 result = INS_pcmpeqq;
             }
@@ -359,7 +357,7 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
             {
                 result = INS_pcmpgtb;
             }
-            else if (compiler->canUseAVX() && (baseType == TYP_LONG))
+            else if ((baseType == TYP_LONG) && (compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4))
             {
                 result = INS_pcmpgtq;
             }
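The `InstructionSet_SSE3_4` checks introduced above reflect hardware reality: pmulld, pcmpeqq and pcmpgtq first appear in SSE4.1/SSE4.2, so on plain SSE2 the table must return no instruction. Without pmulld, a packed 32-bit multiply costs a multi-instruction dance; roughly the following, sketched as standalone SSE2 intrinsics (illustrative helper, not JIT code):

    #include <emmintrin.h> // SSE2

    // SSE2 fallback for a 32-bit lane multiply (what pmulld does in one
    // instruction on SSE4.1+). Illustrative only.
    static __m128i mullo_epi32_sse2(__m128i a, __m128i b)
    {
        __m128i prod02 = _mm_mul_epu32(a, b); // 64-bit products of lanes 0 and 2
        __m128i prod13 = _mm_mul_epu32(_mm_srli_si128(a, 4),
                                       _mm_srli_si128(b, 4)); // lanes 1 and 3
        // Keep the low 32 bits of each product and re-interleave them.
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(prod02, _MM_SHUFFLE(0, 0, 2, 0)),
                                  _mm_shuffle_epi32(prod13, _MM_SHUFFLE(0, 0, 2, 0)));
    }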
@@ -464,7 +462,8 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
 //    to target mm reg, zeroing out the upper bits if and only if specified.
 //
 // Arguments:
-//    type             the type of value to be moved
+//    targetType       the target type
+//    baseType         the base type of value to be moved
 //    targetReg        the target reg
 //    srcReg           the src reg
 //    moveType         action to be performed on target upper bits
@@ -475,10 +474,10 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
 // Notes:
 //    This is currently only supported for floating point types.
 //
-void CodeGen::genSIMDScalarMove(var_types type, regNumber targetReg, regNumber srcReg, SIMDScalarMoveType moveType)
+void CodeGen::genSIMDScalarMove(
+    var_types targetType, var_types baseType, regNumber targetReg, regNumber srcReg, SIMDScalarMoveType moveType)
 {
-    var_types targetType = compiler->getSIMDVectorType();
-    assert(varTypeIsFloating(type));
+    assert(varTypeIsFloating(baseType));
 #ifdef FEATURE_AVX_SUPPORT
     if (compiler->getSIMDInstructionSet() == InstructionSet_AVX)
     {
@@ -487,17 +486,17 @@ void CodeGen::genSIMDScalarMove(var_types type, regNumber targetReg, regNumber s
             case SMT_PreserveUpper:
                 if (srcReg != targetReg)
                 {
-                    instruction ins = ins_Store(type);
+                    instruction ins = ins_Store(baseType);
                     if (getEmitter()->IsThreeOperandMoveAVXInstruction(ins))
                     {
                         // In general, when we use a three-operands move instruction, we want to merge the src with
                         // itself. This is an exception in that we actually want the "merge" behavior, so we must
                         // specify it with all 3 operands.
-                        inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(targetType));
+                        inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(baseType));
                     }
                     else
                     {
-                        inst_RV_RV(ins, targetReg, srcReg, targetType, emitTypeSize(targetType));
+                        inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
                     }
                 }
                 break;
@@ -516,9 +515,9 @@ void CodeGen::genSIMDScalarMove(var_types type, regNumber targetReg, regNumber s
             case SMT_ZeroInitUpper_SrcHasUpperZeros:
                 if (srcReg != targetReg)
                 {
-                    instruction ins = ins_Copy(type);
+                    instruction ins = ins_Copy(baseType);
                     assert(!getEmitter()->IsThreeOperandMoveAVXInstruction(ins));
-                    inst_RV_RV(ins, targetReg, srcReg, targetType, emitTypeSize(targetType));
+                    inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
                 }
                 break;
 
@@ -536,7 +535,7 @@ void CodeGen::genSIMDScalarMove(var_types type, regNumber targetReg, regNumber s
             case SMT_PreserveUpper:
                 if (srcReg != targetReg)
                 {
-                    inst_RV_RV(ins_Store(type), targetReg, srcReg, targetType, emitTypeSize(targetType));
+                    inst_RV_RV(ins_Store(baseType), targetReg, srcReg, baseType, emitTypeSize(baseType));
                 }
                 break;
 
@@ -545,22 +544,22 @@ void CodeGen::genSIMDScalarMove(var_types type, regNumber targetReg, regNumber s
                 {
                     // There is no guarantee that upper bits of op1Reg are zero.
                     // We achieve this by using left logical shift 12-bytes and right logical shift 12 bytes.
-                    instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, type);
+                    instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
                     getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
-                    ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, type);
+                    ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
                     getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
                 }
                 else
                 {
                     genSIMDZero(targetType, TYP_FLOAT, targetReg);
-                    inst_RV_RV(ins_Store(type), targetReg, srcReg);
+                    inst_RV_RV(ins_Store(baseType), targetReg, srcReg);
                 }
                 break;
 
             case SMT_ZeroInitUpper_SrcHasUpperZeros:
                 if (srcReg != targetReg)
                 {
-                    inst_RV_RV(ins_Copy(type), targetReg, srcReg, targetType, emitTypeSize(targetType));
+                    inst_RV_RV(ins_Copy(baseType), targetReg, srcReg, baseType, emitTypeSize(baseType));
                 }
                 break;
 
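The SMT_ZeroInitUpper branch above clears the upper bits without needing a spare zeroed register: it shifts the whole 16-byte register left by 12 bytes and then right by 12 bytes, leaving only the low float. In standalone intrinsics the same trick looks roughly like this (helper name invented, not JIT code):

    #include <emmintrin.h>

    // Zero bytes 4..15 of a register in place, keeping only the low float.
    static __m128 zero_upper96(__m128 v)
    {
        __m128i x = _mm_castps_si128(v);
        x = _mm_slli_si128(x, 12); // low 4 bytes -> bytes 12..15, rest zeroed
        x = _mm_srli_si128(x, 12); // back to bytes 0..3; bytes 4..15 now zero
        return _mm_castsi128_ps(x);
    }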
@@ -676,7 +675,7 @@ void CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode)
             SIMDScalarMoveType moveType =
                 op1->IsCnsFltOrDbl() || op1->isMemoryOp() ? SMT_ZeroInitUpper_SrcHasUpperZeros : SMT_ZeroInitUpper;
-            genSIMDScalarMove(TYP_FLOAT, targetReg, op1Reg, moveType);
+            genSIMDScalarMove(targetType, TYP_FLOAT, targetReg, op1Reg, moveType);
 
             if (size == 8)
             {
@@ -786,7 +785,7 @@ void CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode)
         {
             getEmitter()->emitIns_R_I(insLeftShift, EA_16BYTE, vectorReg, baseTypeSize);
         }
-        genSIMDScalarMove(baseType, vectorReg, operandReg, SMT_PreserveUpper);
+        genSIMDScalarMove(targetType, baseType, vectorReg, operandReg, SMT_PreserveUpper);
 
         offset += baseTypeSize;
     }
@@ -1033,11 +1032,10 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
 //
 void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
 {
-    GenTree*  op1        = simdNode->gtGetOp1();
-    GenTree*  op2        = simdNode->gtGetOp2();
-    var_types baseType   = simdNode->gtSIMDBaseType;
-    regNumber targetReg  = simdNode->gtRegNum;
-    assert(targetReg != REG_NA);
+    GenTree*       op1        = simdNode->gtGetOp1();
+    GenTree*       op2        = simdNode->gtGetOp2();
+    var_types      baseType   = simdNode->gtSIMDBaseType;
+    regNumber      targetReg  = simdNode->gtRegNum;
     var_types      targetType = simdNode->TypeGet();
     InstructionSet iset       = compiler->getSIMDInstructionSet();
 
@@ -1051,8 +1049,16 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
         case SIMDIntrinsicEqual:
         case SIMDIntrinsicGreaterThan:
         {
-            // SSE2: vector<(u)long> relation op should be implemented in terms of TYP_INT comparison operations
-            assert(((iset == InstructionSet_AVX) || (baseType != TYP_LONG)) && (baseType != TYP_ULONG));
+            assert(targetReg != REG_NA);
+
+#ifdef DEBUG
+            // SSE2: vector<(u)long> relational op should be implemented in terms of
+            // TYP_INT comparison operations
+            if (baseType == TYP_LONG || baseType == TYP_ULONG)
+            {
+                assert(iset >= InstructionSet_SSE3_4);
+            }
+#endif
 
             // Greater-than: Floating point vectors use "<" with swapped operands
             if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGreaterThan)
@@ -1093,6 +1099,8 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
         case SIMDIntrinsicLessThan:
         case SIMDIntrinsicLessThanOrEqual:
         {
+            assert(targetReg != REG_NA);
+
             // Int vectors use ">" and ">=" with swapped operands
             assert(varTypeIsFloating(baseType));
 
@@ -1115,17 +1123,6 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
         case SIMDIntrinsicOpEquality:
         case SIMDIntrinsicOpInEquality:
         {
-            assert(genIsValidIntReg(targetReg));
-
-            // We need two additional XMM register as scratch
-            assert(simdNode->gtRsvdRegs != RBM_NONE);
-            assert(genCountBits(simdNode->gtRsvdRegs) == 2);
-
-            regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
-            regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
-            tmpRegsMask &= ~tmpReg1Mask;
-            regNumber tmpReg1     = genRegNumFromMask(tmpReg1Mask);
-            regNumber tmpReg2     = genRegNumFromMask(tmpRegsMask);
 
             var_types simdType = op1->TypeGet();
             // TODO-1stClassStructs: Temporary to minimize asmDiffs
             if (simdType == TYP_DOUBLE)
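The "swapped operands" trick the comments above describe is easiest to see in intrinsic form: SSE encodes the less-than predicate directly, so greater-than is emitted as the mirrored compare. A minimal sketch (illustrative helper, not JIT code):

    #include <xmmintrin.h>

    // a > b for float vectors, emitted as "b < a".
    static __m128 cmpgt_via_swapped_lt(__m128 a, __m128 b)
    {
        return _mm_cmplt_ps(b, a);
    }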
@@ -1140,96 +1137,111 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
                 simdType = TYP_SIMD16;
             }
 
-            // tmpReg1 = (op1Reg == op2Reg)
-            // Call this value of tmpReg1 as 'compResult' for further reference below.
-            regNumber otherReg = op2Reg;
-            if (tmpReg1 != op2Reg)
+            // On SSE4/AVX, we can generate optimal code for (in)equality against zero using ptest.
+            if ((compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4) && op2->IsIntegralConstVector(0))
             {
-                if (tmpReg1 != op1Reg)
-                {
-                    inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType));
-                }
+                assert(op2->isContained());
+                inst_RV_RV(INS_ptest, op1->gtRegNum, op1->gtRegNum, simdType, emitActualTypeSize(simdType));
             }
             else
             {
-                otherReg = op1Reg;
-            }
+                // We need one additional SIMD register to store the result of the SIMD compare.
+                regNumber tmpReg1 = genRegNumFromMask(simdNode->gtRsvdRegs & RBM_ALLFLOAT);
 
-            // For all integer types we can use TYP_INT comparison.
-            unsigned    ival = 0;
-            instruction ins =
-                getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival);
+                // tmpReg1 = (op1Reg == op2Reg)
+                // Call this value of tmpReg1 as 'compResult' for further reference below.
+                regNumber otherReg = op2Reg;
+                if (tmpReg1 != op2Reg)
+                {
+                    if (tmpReg1 != op1Reg)
+                    {
+                        inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType));
+                    }
+                }
+                else
+                {
+                    otherReg = op1Reg;
+                }
 
-            if (varTypeIsFloating(baseType))
-            {
-                getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival);
-            }
-            else
-            {
-                inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
+                // For all integer types we can use TYP_INT comparison.
+                unsigned    ival = 0;
+                instruction ins =
+                    getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival);
+
+                if (varTypeIsFloating(baseType))
+                {
+                    getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival);
+                }
+                else
+                {
+                    inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
+                }
+
+                regNumber intReg;
+                if (targetReg == REG_NA)
+                {
+                    // If we are not materializing result into a register,
+                    // we would have reserved an int type internal register.
+                    intReg = genRegNumFromMask(simdNode->gtRsvdRegs & RBM_ALLINT);
+                }
+                else
+                {
+                    // We can use targetReg for setting flags.
+                    intReg = targetReg;
+
+                    // Must have not reserved any int type internal registers.
+                    assert(genCountBits(simdNode->gtRsvdRegs & RBM_ALLINT) == 0);
+                }
+
+                inst_RV_RV(INS_pmovmskb, intReg, tmpReg1, simdType, emitActualTypeSize(simdType));
+                // There's no pmovmskw/pmovmskd/pmovmskq but they're not needed anyway. Vector compare
+                // instructions produce "all ones"/"all zeroes" components and pmovmskb extracts a
+                // subset of each component's ones/zeroes. In the end we need to know if the result is
+                // "all ones" where the number of ones is given by the vector byte size, not by the
+                // vector component count. So, for AVX registers we need to compare to 0xFFFFFFFF and
+                // for SSE registers we need to compare to 0x0000FFFF.
+                // The SIMD12 case is handled specially, because we can't rely on the upper bytes being
+                // zero, so we must compare only the lower 3 floats (hence the byte mask of 0xFFF).
+                // Note that -1 is used instead of 0xFFFFFFFF, as on x64 the emitter doesn't correctly
+                // recognize that 0xFFFFFFFF can be encoded in a single byte and emits the longer
+                // 3DFFFFFFFF encoding instead of 83F8FF.
+                ssize_t mask;
+                if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0)
+                {
+                    mask = 0x00000FFF;
+                    getEmitter()->emitIns_R_I(INS_and, EA_4BYTE, intReg, mask);
+                }
+                else if (emitActualTypeSize(simdType) == 32)
+                {
+                    mask = -1;
+                }
+                else
+                {
+                    mask = 0x0000FFFF;
+                }
+                getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, mask);
             }
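Two sketches of what the new code above emits, as standalone intrinsics (helper names invented). First the ptest special case for comparisons against Vector&lt;T&gt;.Zero, then the pmovmskb reduction whose expected mask is 0xFFFF for 16-byte registers; a 32-byte AVX register yields a 32-bit mask (hence the -1 compare), and Vector3 first masks down to the low 12 bytes (hence 0xFFF):

    #include <emmintrin.h>
    #include <smmintrin.h> // SSE4.1 for ptest

    // SSE4+/AVX: no compare needed against zero; ptest sets ZF iff (v & v) == 0.
    static bool is_zero_vector(__m128i v)
    {
        return _mm_testz_si128(v, v) != 0;
    }

    // SSE2: the compare leaves each lane all-ones/all-zeroes, pmovmskb gathers
    // one bit per byte, and "all lanes equal" means all 16 bits set.
    static bool all_lanes_equal(__m128 a, __m128 b)
    {
        __m128i eq = _mm_castps_si128(_mm_cmpeq_ps(a, b));
        return _mm_movemask_epi8(eq) == 0xFFFF;
    }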
-            // If we have 32 bytes, start by anding the two 16-byte halves to get a 16-byte result.
-            if (compiler->canUseAVX() && (simdType == TYP_SIMD32))
+            if (targetReg != REG_NA)
             {
-                // Reduce tmpReg1 from 256-bits to 128-bits bitwise-ANDing the lower and upper 128-bits
+                // If we need to materialize result into a register, targetReg needs to
+                // be set to 1 on true and zero on false.
+                // Equality:
+                //    cmp targetReg, 0xFFFFFFFF or 0xFFFF
+                //    sete targetReg
+                //    movzx targetReg, targetReg
                 //
-                // Generated code sequence
-                // - vextractf128 tmpReg2, tmpReg1, 0x01
-                //       tmpReg2[128..255] <- 0
-                //       tmpReg2[0..127]   <- tmpReg1[128..255]
-                // - vandps tmpReg1, tempReg2
-                //       This will zero-out upper portion of tmpReg1 and
-                //       lower portion of tmpReg1 is and of upper and lower 128-bit comparison result.
-                getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
-                inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
-            }
-            // Next, if we have more than 8 bytes, AND the two 8-byte halves to get an 8-byte result.
-            if (simdType != TYP_SIMD8)
-            {
-                // tmpReg2 = Shuffle(tmpReg1, (1,0,3,2))
-                // Note: vpshufd is a 128-bit only instruction. Therefore, explicitly pass EA_16BYTE
-                getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x4E);
-
-                // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
+                // InEquality:
+                //    cmp targetReg, 0xFFFFFFFF or 0xFFFF
+                //    setne targetReg
+                //    movzx targetReg, targetReg
                 //
-                // Note that what we have computed is as follows at this point:
-                // tmpReg1[0] = compResult[0] & compResult[2]
-                // tmpReg1[1] = compResult[1] & compResult[3]
-                inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
+                assert(simdNode->TypeGet() == TYP_INT);
+                inst_RV((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? INS_sete : INS_setne, targetReg,
+                        TYP_INT, EA_1BYTE);
+                // Set the higher bytes to 0
+                inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
             }
-            // At this point, we have either reduced the result to 8 bytes: tmpReg1[0] and tmpReg1[1],
-            // OR we have a Vector2 (TYP_SIMD8) in tmpReg1, which has only those two fields.
-
-            // tmpReg2 = Shuffle(tmpReg1, (0,0,0,1))
-            // tmpReg2[0] = compResult[1] & compResult[3]
-            getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x1);
-
-            // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
-            // That is tmpReg1[0] = compResult[0] & compResult[1] & compResult[2] & compResult[3]
-            inst_RV_RV(INS_pand, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType)); // ??? INS_andps??
-
-            // targetReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
-            // (Note that for mov_xmm2i, the int register is always in the reg2 position.)
-            inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);
-
-            // Since we need to compute a bool result, targetReg needs to be set to 1 on true and zero on false.
-            // Equality:
-            //    cmp targetReg, 0xFFFFFFFF
-            //    sete targetReg
-            //    movzx targetReg, targetReg
-            //
-            // InEquality:
-            //    cmp targetReg, 0xFFFFFFFF
-            //    setne targetReg
-            //    movzx targetReg, targetReg
-            //
-            getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, targetReg, 0xFFFFFFFF);
-            inst_RV((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? INS_sete : INS_setne, targetReg,
-                    TYP_INT, EA_1BYTE);
-            assert(simdNode->TypeGet() == TYP_INT);
-            // Set the higher bytes to 0
-            inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
         }
         break;
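For reference, the reduction the deleted block performed — two shuffle-and-AND rounds followed by a move of one lane to an integer register — corresponds roughly to this intrinsics sketch (illustrative only; the new pmovmskb form replaces it):

    #include <emmintrin.h>

    // Old-style SSE2 all-lanes reduction of a compare result.
    static int reduce_compare_sse2(__m128i cmp)
    {
        __m128i t = _mm_shuffle_epi32(cmp, 0x4E); // (1,0,3,2): swap 8-byte halves
        cmp = _mm_and_si128(cmp, t);              // lane0 &= lane2, lane1 &= lane3
        t = _mm_shuffle_epi32(cmp, 0x01);         // bring lane 1 down to lane 0
        cmp = _mm_and_si128(cmp, t);              // lane0 &= lane1
        return _mm_cvtsi128_si32(cmp);            // -1 iff every lane compared equal
    }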
@@ -1267,45 +1279,68 @@ void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
     regNumber targetReg = simdNode->gtRegNum;
     assert(targetReg != REG_NA);
 
-    // DotProduct is only supported on floating point types.
     var_types targetType = simdNode->TypeGet();
     assert(targetType == baseType);
-    assert(varTypeIsFloating(baseType));
 
     genConsumeOperands(simdNode);
-    regNumber op1Reg = op1->gtRegNum;
-    regNumber op2Reg = op2->gtRegNum;
+    regNumber op1Reg  = op1->gtRegNum;
+    regNumber op2Reg  = op2->gtRegNum;
+    regNumber tmpReg1 = REG_NA;
+    regNumber tmpReg2 = REG_NA;
 
-    regNumber tmpReg = REG_NA;
-    // For SSE, or AVX with 32-byte vectors, we need an additional Xmm register as scratch.
-    // However, it must be distinct from targetReg, so we request two from the register allocator.
-    // Note that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
-    if ((compiler->getSIMDInstructionSet() == InstructionSet_SSE2) || (simdEvalType == TYP_SIMD32))
+    InstructionSet iset = compiler->getSIMDInstructionSet();
+
+    // Dot product intrinsic is supported only on float/double vectors
+    // and 32-byte int vectors on AVX.
+    //
+    // Float/Double Vectors:
+    // For SSE, or AVX with 32-byte vectors, we need one additional Xmm register
+    // different from targetReg as scratch. Note that if this is a TYP_SIMD16 or
+    // smaller on AVX, then we don't need a tmpReg.
+    //
+    // 32-byte integer vector on AVX: we need two additional Xmm registers
+    // different from targetReg as scratch.
+    //
+    // 16-byte integer vector on SSE4: we need one additional Xmm register
+    // different from targetReg as scratch.
+    if (varTypeIsFloating(baseType))
     {
-        assert(simdNode->gtRsvdRegs != RBM_NONE);
-        assert(genCountBits(simdNode->gtRsvdRegs) == 2);
+        if ((compiler->getSIMDInstructionSet() == InstructionSet_SSE2) || (simdEvalType == TYP_SIMD32))
+        {
+            assert(simdNode->gtRsvdRegs != RBM_NONE);
+            assert(genCountBits(simdNode->gtRsvdRegs) == 1);
 
-        regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
-        regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
-        tmpRegsMask &= ~tmpReg1Mask;
-        regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
-        regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
+            tmpReg1 = genRegNumFromMask(simdNode->gtRsvdRegs);
+            assert(tmpReg1 != REG_NA);
+            assert(tmpReg1 != targetReg);
+        }
+        else
+        {
+            assert(simdNode->gtRsvdRegs == RBM_NONE);
+        }
+    }
+    else
+    {
+        assert(baseType == TYP_INT);
+        assert(iset >= InstructionSet_SSE3_4);
 
-        // Choose any register different from targetReg as tmpReg
-        if (tmpReg1 != targetReg)
+        if (iset == InstructionSet_SSE3_4)
         {
-            tmpReg = tmpReg1;
+            // Must have reserved 1 scratch register.
+            assert(genCountBits(simdNode->gtRsvdRegs) == 1);
+            tmpReg1 = genRegNumFromMask(simdNode->gtRsvdRegs);
         }
         else
         {
-            assert(targetReg != tmpReg2);
-            tmpReg = tmpReg2;
+            // Must have reserved 2 scratch registers.
+            assert(genCountBits(simdNode->gtRsvdRegs) == 2);
+            regMaskTP tmpRegMask = genFindLowestBit(simdNode->gtRsvdRegs);
+            tmpReg1              = genRegNumFromMask(tmpRegMask);
+            tmpReg2              = genRegNumFromMask(simdNode->gtRsvdRegs & ~tmpRegMask);
         }
-        assert(tmpReg != REG_NA);
-        assert(tmpReg != targetReg);
     }
 
-    if (compiler->getSIMDInstructionSet() == InstructionSet_SSE2)
+    if (iset == InstructionSet_SSE2)
     {
         // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg
         if (op1Reg == targetReg)
         {
@@ -1323,96 +1358,187 @@ void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
         }
 
         // DotProduct(v1, v2)
-        // Here v0 = targetReg, v1 = op1Reg, v2 = op2Reg and tmp = tmpReg
-        if (baseType == TYP_FLOAT)
+        // Here v0 = targetReg, v1 = op1Reg, v2 = op2Reg and tmp = tmpReg1
+        if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0)
+        {
+            assert(baseType == TYP_FLOAT);
+            // v0 = v1 * v2
+            // tmp = v0                              // v0  = (3, 2, 1, 0) - each element is given by its
+            //                                       //       position
+            // tmp = shuffle(tmp, tmp, SHUFFLE_ZXXY) // tmp = (2, 0, 0, 1) - don't really care what's in upper
+            //                                       //       bits
+            // v0 = v0 + tmp                         // v0  = (3+2, 0+2, 1+0, 0+1)
+            // tmp = shuffle(tmp, tmp, SHUFFLE_XXWW) // tmp = ( 1,  1,  2,  2)
+            // v0 = v0 + tmp                         // v0  = (1+2+3, 0+1+2, 0+1+2, 0+1+2)
+            //
+            inst_RV_RV(INS_mulps, targetReg, op2Reg);
+            inst_RV_RV(INS_movaps, tmpReg1, targetReg);
+            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZXXY);
+            inst_RV_RV(INS_addps, targetReg, tmpReg1);
+            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XXWW);
+            inst_RV_RV(INS_addps, targetReg, tmpReg1);
+        }
+        else if (baseType == TYP_FLOAT)
         {
             // v0 = v1 * v2
             // tmp = v0                              // v0  = (3, 2, 1, 0) - each element is given by its
             //                                       //       position
-            // tmp = shuffle(tmp, tmp, Shuffle(2,3,0,1)) // tmp = (2, 3, 0, 1)
+            // tmp = shuffle(tmp, tmp, SHUFFLE_ZWXY) // tmp = (2, 3, 0, 1)
             // v0 = v0 + tmp                         // v0  = (3+2, 2+3, 1+0, 0+1)
             // tmp = v0
-            // tmp = shuffle(tmp, tmp, Shuffle(0,1,2,3)) // tmp = (0+1, 1+0, 2+3, 3+2)
+            // tmp = shuffle(tmp, tmp, SHUFFLE_XYZW) // tmp = (0+1, 1+0, 2+3, 3+2)
             // v0 = v0 + tmp                         // v0  = (0+1+2+3, 0+1+2+3, 0+1+2+3, 0+1+2+3)
             //
             // Essentially horizontal addition of all elements.
            // We could achieve the same using SSEv3 instruction
             // HADDPS.
             //
             inst_RV_RV(INS_mulps, targetReg, op2Reg);
-            inst_RV_RV(INS_movaps, tmpReg, targetReg);
-            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg, tmpReg, 0xb1);
-            inst_RV_RV(INS_addps, targetReg, tmpReg);
-            inst_RV_RV(INS_movaps, tmpReg, targetReg);
-            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg, tmpReg, 0x1b);
-            inst_RV_RV(INS_addps, targetReg, tmpReg);
+            inst_RV_RV(INS_movaps, tmpReg1, targetReg);
+            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZWXY);
+            inst_RV_RV(INS_addps, targetReg, tmpReg1);
+            inst_RV_RV(INS_movaps, tmpReg1, targetReg);
+            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XYZW);
+            inst_RV_RV(INS_addps, targetReg, tmpReg1);
         }
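The SSE2 float path just above, restated as standalone intrinsics; SHUFFLE_ZWXY and SHUFFLE_XYZW are evidently the named forms of the old 0xb1 and 0x1b immediates (sketch, not JIT code):

    #include <xmmintrin.h>

    // Two swap-and-add rounds leave the full horizontal sum in every lane.
    static float dot4_sse2(__m128 v1, __m128 v2)
    {
        __m128 v0 = _mm_mul_ps(v1, v2);
        __m128 t  = _mm_shuffle_ps(v0, v0, 0xB1); // (2,3,0,1): pairwise swap
        v0 = _mm_add_ps(v0, t);                   // lanes: 0+1, 1+0, 2+3, 3+2
        t  = _mm_shuffle_ps(v0, v0, 0x1B);        // (0,1,2,3): full reverse
        v0 = _mm_add_ps(v0, t);                   // every lane: 0+1+2+3
        return _mm_cvtss_f32(v0);
    }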
-        else if (baseType == TYP_DOUBLE)
+        else
         {
+            assert(baseType == TYP_DOUBLE);
+
             // v0 = v1 * v2
             // tmp = v0                              // v0  = (1, 0) - each element is given by its position
             // tmp = shuffle(tmp, tmp, Shuffle(0,1)) // tmp = (0, 1)
             // v0 = v0 + tmp                         // v0  = (1+0, 0+1)
             inst_RV_RV(INS_mulpd, targetReg, op2Reg);
-            inst_RV_RV(INS_movaps, tmpReg, targetReg);
-            inst_RV_RV_IV(INS_shufpd, EA_16BYTE, tmpReg, tmpReg, 0x01);
-            inst_RV_RV(INS_addpd, targetReg, tmpReg);
-        }
-        else
-        {
-            unreached();
+            inst_RV_RV(INS_movaps, tmpReg1, targetReg);
+            inst_RV_RV_IV(INS_shufpd, EA_16BYTE, tmpReg1, tmpReg1, 0x01);
+            inst_RV_RV(INS_addpd, targetReg, tmpReg1);
         }
     }
     else
     {
-        // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg.
-        // Note that this is a duplicate of the code above for SSE, but in the AVX case we can eventually
-        // use the 3-op form, so that we can avoid these copies.
-        // TODO-CQ: Add inst_RV_RV_RV_IV().
-        if (op1Reg == targetReg)
-        {
-            // Best case
-            // nothing to do, we have registers in the right place
-        }
-        else if (op2Reg == targetReg)
+        assert(iset >= InstructionSet_SSE3_4);
+
+        if (varTypeIsFloating(baseType))
         {
-            op2Reg = op1Reg;
+            // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg.
+            // Note that this is a duplicate of the code above for SSE, but in the AVX case we can eventually
+            // use the 3-op form, so that we can avoid these copies.
+            // TODO-CQ: Add inst_RV_RV_RV_IV().
+            if (op1Reg == targetReg)
+            {
+                // Best case
+                // nothing to do, we have registers in the right place
+            }
+            else if (op2Reg == targetReg)
+            {
+                op2Reg = op1Reg;
+            }
+            else
+            {
+                inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
+            }
+
+            emitAttr emitSize = emitActualTypeSize(simdEvalType);
+            if (baseType == TYP_FLOAT)
+            {
+                // dpps computes the dot product of the upper & lower halves of the 32-byte register.
+                // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
+                unsigned mask = ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) ? 0x71 : 0xf1;
+                inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, mask);
+                // If this is TYP_SIMD32, we need to combine the lower & upper results.
+                if (simdEvalType == TYP_SIMD32)
+                {
+                    getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01);
+                    inst_RV_RV(INS_addps, targetReg, tmpReg1, targetType, emitTypeSize(targetType));
+                }
+            }
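On the dpps immediates chosen above: the high nibble selects which input lanes enter the multiply-add, the low nibble selects which result lanes receive the sum — hence 0x71 for Vector3 (lanes 0..2) and 0xf1 for Vector4. A hedged sketch in intrinsic form (helper names invented):

    #include <smmintrin.h> // SSE4.1

    static float dot3_dpps(__m128 a, __m128 b)
    {
        return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x71)); // use lanes 0..2, write lane 0
    }

    static float dot4_dpps(__m128 a, __m128 b)
    {
        return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xF1)); // use lanes 0..3, write lane 0
    }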
+            else if (baseType == TYP_DOUBLE)
+            {
+                if (simdEvalType == TYP_SIMD32)
+                {
+                    // targetReg = targetReg * op2Reg
+                    // targetReg = vhaddpd(targetReg, targetReg) ; horizontal sum of lower & upper halves
+                    // tmpReg    = vextractf128(targetReg, 1)    ; moves the upper sum into tmpReg
+                    // targetReg = targetReg + tmpReg1
+                    inst_RV_RV(INS_mulpd, targetReg, op2Reg, simdEvalType, emitActualTypeSize(simdType));
+                    inst_RV_RV(INS_haddpd, targetReg, targetReg, simdEvalType, emitActualTypeSize(simdType));
+                    getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01);
+                    inst_RV_RV(INS_addpd, targetReg, tmpReg1, targetType, emitTypeSize(targetType));
+                }
+                else
+                {
+                    // On AVX, we have no 16-byte vectors of double. Note that, if we did, we could use
+                    // dppd directly.
+                    assert(iset == InstructionSet_SSE3_4);
+                    inst_RV_RV_IV(INS_dppd, emitSize, targetReg, op2Reg, 0x31);
+                }
+            }
         }
         else
         {
-            inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
-        }
+            // Dot product of 32-byte int vector on SSE4/AVX.
+            assert(baseType == TYP_INT);
+            assert(simdEvalType == TYP_SIMD16 || simdEvalType == TYP_SIMD32);
+
+#ifdef DEBUG
+            // SSE4: We need 1 scratch register.
+            // AVX2: We need 2 scratch registers.
+            if (simdEvalType == TYP_SIMD16)
+            {
+                assert(tmpReg1 != REG_NA);
+            }
+            else
+            {
+                assert(tmpReg1 != REG_NA);
+                assert(tmpReg2 != REG_NA);
+            }
+#endif
+
+            // tmpReg1 = op1 * op2
+            if (iset == InstructionSet_AVX)
+            {
+                // On AVX, take advantage of the 3 operand form of pmulld.
+                inst_RV_RV_RV(INS_pmulld, tmpReg1, op1Reg, op2Reg, emitTypeSize(simdEvalType));
+            }
+            else
+            {
+                inst_RV_RV(ins_Copy(simdEvalType), tmpReg1, op1Reg, simdEvalType);
+                inst_RV_RV(INS_pmulld, tmpReg1, op2Reg, simdEvalType);
+            }
 
-        emitAttr emitSize = emitActualTypeSize(simdEvalType);
-        if (baseType == TYP_FLOAT)
-        {
-            // dpps computes the dot product of the upper & lower halves of the 32-byte register.
-            // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
-            inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, 0xf1);
-            // If this is TYP_SIMD32, we need to combine the lower & upper results.
             if (simdEvalType == TYP_SIMD32)
             {
-                getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, targetReg, 0x01);
-                inst_RV_RV(INS_addps, targetReg, tmpReg, targetType, emitTypeSize(targetType));
+                // tmpReg2[127..0] = upper 128 bits of tmpReg1
+                getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
+
+                // tmpReg1[127..0] = tmpReg1[127..0] + tmpReg2[127..0]
+                // This will compute
+                //    tmpReg1[0] = op1[0]*op2[0] + op1[4]*op2[4]
+                //    tmpReg1[1] = op1[1]*op2[1] + op1[5]*op2[5]
+                //    tmpReg1[2] = op1[2]*op2[2] + op1[6]*op2[6]
+                //    tmpReg1[3] = op1[3]*op2[3] + op1[7]*op2[7]
+                inst_RV_RV(INS_paddd, tmpReg1, tmpReg2, TYP_SIMD16, EA_16BYTE);
             }
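The integer tail that follows, in intrinsic form: after pmulld (and, for 32-byte vectors, the vextractf128/paddd fold down to 128 bits), two phaddd rounds collapse the four 32-bit products into lane 0 before the move to an integer register. Sketch of the 16-byte case (helper name invented):

    #include <tmmintrin.h> // SSSE3: phaddd
    #include <smmintrin.h> // SSE4.1: pmulld

    static int dot4_epi32(__m128i a, __m128i b)
    {
        __m128i prod = _mm_mullo_epi32(a, b); // pmulld
        prod = _mm_hadd_epi32(prod, prod);    // lanes: 0+1, 2+3, 0+1, 2+3
        prod = _mm_hadd_epi32(prod, prod);    // lane 0: 0+1+2+3
        return _mm_cvtsi128_si32(prod);       // mov_xmm2i
    }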
-            }
-            else if (baseType == TYP_DOUBLE)
-            {
-                // On AVX, we have no 16-byte vectors of double. Note that, if we did, we could use
-                // dppd directly.
-                assert(simdType == TYP_SIMD32);
-
-                // targetReg = targetReg * op2Reg
-                // targetReg = vhaddpd(targetReg, targetReg) ; horizontal sum of lower & upper halves
-                // tmpReg    = vextractf128(targetReg, 1)    ; moves the upper sum into tmpReg
-                // targetReg = targetReg + tmpReg
-                inst_RV_RV(INS_mulpd, targetReg, op2Reg, simdEvalType, emitActualTypeSize(simdType));
-                inst_RV_RV(INS_haddpd, targetReg, targetReg, simdEvalType, emitActualTypeSize(simdType));
-                getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, targetReg, 0x01);
-                inst_RV_RV(INS_addpd, targetReg, tmpReg, targetType, emitTypeSize(targetType));
-            }
-            else
-            {
-                unreached();
+
+            // This horizontal add will compute
+            //
+            // TYP_SIMD16:
+            //    tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[1]*op2[1]
+            //    tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[3]*op2[3]
+            //
+            // TYP_SIMD32:
+            //    tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[4]*op2[4] + op1[1]*op2[1] + op1[5]*op2[5]
+            //    tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[6]*op2[6] + op1[3]*op2[3] + op1[7]*op2[7]
+            inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE);
+
+            // DotProduct(op1, op2) = tmpReg1[0] = tmpReg1[0] + tmpReg1[1]
+            inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE);
+
+            // targetReg = integer result from tmpReg1
+            // (Note that for mov_xmm2i, the int register is always in the reg2 position)
+            inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);
         }
     }
 
@@ -1456,6 +1582,59 @@ void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
     genConsumeOperands(simdNode);
     regNumber srcReg = op1->gtRegNum;
 
+    // Optimize the case where op1 is in memory and we are accessing the ith element.
+    if (op1->isMemoryOp())
+    {
+        assert(op1->isContained());
+
+        regNumber baseReg;
+        regNumber indexReg;
+        int       offset = 0;
+
+        if (op1->OperGet() == GT_LCL_FLD)
+        {
+            // There are three parts to the total offset here:
+            // {offset of local} + {offset of SIMD Vector field} + {offset of element within SIMD vector}.
+            bool     isEBPbased;
+            unsigned varNum = op1->gtLclVarCommon.gtLclNum;
+            offset += compiler->lvaFrameAddress(varNum, &isEBPbased);
+            offset += op1->gtLclFld.gtLclOffs;
+
+            baseReg = (isEBPbased) ? REG_EBP : REG_ESP;
+        }
+        else
+        {
+            // Require GT_IND addr to be not contained.
+            assert(op1->OperGet() == GT_IND);
+
+            GenTree* addr = op1->AsIndir()->Addr();
+            assert(!addr->isContained());
+            baseReg = addr->gtRegNum;
+        }
+
+        if (op2->isContainedIntOrIImmed())
+        {
+            indexReg = REG_NA;
+            offset += (int)op2->AsIntConCommon()->IconValue() * genTypeSize(baseType);
+        }
+        else
+        {
+            indexReg = op2->gtRegNum;
+            assert(genIsValidIntReg(indexReg));
+        }
+
+        // Now, load the desired element.
+        getEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load
+                                    emitTypeSize(baseType),           // Of the vector baseType
+                                    targetReg,                        // To targetReg
+                                    baseReg,                          // Base Reg
+                                    indexReg,                         // Indexed
+                                    genTypeSize(baseType),            // by the size of the baseType
+                                    offset);
+        genProduceReg(simdNode);
+        return;
+    }
+
     // SSE2 doesn't have an instruction to implement this intrinsic if the index is not a constant.
     // For the non-constant case, we will use the SIMD temp location to store the vector, and
     // then load the desired element.
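The emitIns_R_ARX call above folds the whole element access into one load with a base + index * scale + displacement addressing mode. In C terms the address arithmetic is roughly the following (hypothetical helper; all parameter names are invented for illustration):

    // effective address = frame base + local offset + field offset + i * sizeof(element)
    static float get_element(const char* frameBase, int lclOffs, int fldOffs, int i)
    {
        const float* vec = (const float*)(frameBase + lclOffs + fldOffs);
        return vec[i]; // one mov with [base + index*4 + disp] addressing
    }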
@@ -1839,26 +2018,9 @@ void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
 
     // Need an additional Xmm register to read upper 4 bytes, which is different from targetReg
     assert(treeNode->gtRsvdRegs != RBM_NONE);
-    assert(genCountBits(treeNode->gtRsvdRegs) == 2);
-
-    regNumber tmpReg      = REG_NA;
-    regMaskTP tmpRegsMask = treeNode->gtRsvdRegs;
-    regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
-    tmpRegsMask &= ~tmpReg1Mask;
-    regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
-    regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
+    assert(genCountBits(treeNode->gtRsvdRegs) == 1);
 
-    // Choose any register different from targetReg as tmpReg
-    if (tmpReg1 != targetReg)
-    {
-        tmpReg = tmpReg1;
-    }
-    else
-    {
-        assert(targetReg != tmpReg2);
-        tmpReg = tmpReg2;
-    }
-    assert(tmpReg != REG_NA);
+    regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
     assert(tmpReg != targetReg);
 
     // Load upper 4 bytes in tmpReg
@@ -1868,7 +2030,7 @@ void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
     getEmitter()->emitIns_R_AR(ins_Load(TYP_DOUBLE), EA_8BYTE, targetReg, operandReg, 0);
 
     // combine upper 4 bytes and lower 8 bytes in targetReg
-    getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, 0x44);
+    getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX);
 
     genProduceReg(treeNode);
 }
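The two-part Vector3 load above, as standalone intrinsics: an 8-byte read into the low half, a 4-byte read of the third float, then shufps with 0x44 (evidently what the SHUFFLE_YXYX constant names) to merge them. Sketch, assuming a plain 12-byte float triple in memory (helper name invented):

    #include <emmintrin.h>
    #include <xmmintrin.h>

    static __m128 load_simd12(const float* p)
    {
        __m128 lo = _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)p)); // 8-byte read, upper zeroed
        __m128 hi = _mm_load_ss(p + 2);                                   // 4-byte read
        return _mm_shuffle_ps(lo, hi, 0x44); // (1,0,1,0): {p[0], p[1], p[2], 0}
    }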
@@ -1912,9 +2074,9 @@ void CodeGen::genStoreLclFldTypeSIMD12(GenTree* treeNode)
 }
 
 //-----------------------------------------------------------------------------
-// genLoadLclFldTypeSIMD12: load a TYP_SIMD12 (i.e. Vector3) type field.
-// Since Vector3 is not a hardware supported write size, it is performed
-// as two reads: 8 byte followed by 4-byte.
+// genLoadLclTypeSIMD12: load a TYP_SIMD12 (i.e. Vector3) type field.
+// Since Vector3 is not a hardware supported read size, it is performed
+// as two reads: 4 byte followed by 8 byte.
 //
 // Arguments:
 //    treeNode - tree node that is attempting to load TYP_SIMD12 field
@@ -1922,37 +2084,26 @@ void CodeGen::genStoreLclFldTypeSIMD12(GenTree* treeNode)
 // Return Value:
 //    None.
 //
-void CodeGen::genLoadLclFldTypeSIMD12(GenTree* treeNode)
+void CodeGen::genLoadLclTypeSIMD12(GenTree* treeNode)
 {
-    assert(treeNode->OperGet() == GT_LCL_FLD);
+    assert((treeNode->OperGet() == GT_LCL_FLD) || (treeNode->OperGet() == GT_LCL_VAR));
 
     regNumber targetReg = treeNode->gtRegNum;
-    unsigned  offs      = treeNode->gtLclFld.gtLclOffs;
+    unsigned  offs      = 0;
     unsigned  varNum    = treeNode->gtLclVarCommon.gtLclNum;
     assert(varNum < compiler->lvaCount);
 
-    // Need an additional Xmm register to read upper 4 bytes
-    assert(treeNode->gtRsvdRegs != RBM_NONE);
-    assert(genCountBits(treeNode->gtRsvdRegs) == 2);
-
-    regNumber tmpReg      = REG_NA;
-    regMaskTP tmpRegsMask = treeNode->gtRsvdRegs;
-    regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
-    tmpRegsMask &= ~tmpReg1Mask;
-    regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
-    regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
-
-    // Choose any register different from targetReg as tmpReg
-    if (tmpReg1 != targetReg)
+    if (treeNode->OperGet() == GT_LCL_FLD)
     {
-        tmpReg = tmpReg1;
+        offs = treeNode->gtLclFld.gtLclOffs;
     }
-    else
-    {
-        assert(targetReg != tmpReg2);
-        tmpReg = tmpReg2;
-    }
-    assert(tmpReg != REG_NA);
+
+    // Need an additional Xmm register that is different from
+    // targetReg to read upper 4 bytes.
+    assert(treeNode->gtRsvdRegs != RBM_NONE);
+    assert(genCountBits(treeNode->gtRsvdRegs) == 1);
+
+    regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
     assert(tmpReg != targetReg);
 
     // Read upper 4 bytes to tmpReg
@@ -1962,11 +2113,54 @@ void CodeGen::genLoadLclFldTypeSIMD12(GenTree* treeNode)
     getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_DOUBLE, false), EA_8BYTE, targetReg, varNum, offs);
 
     // combine upper 4 bytes and lower 8 bytes in targetReg
-    getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, 0x44);
+    getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX);
 
     genProduceReg(treeNode);
 }
 
+#ifdef _TARGET_X86_
+
+//-----------------------------------------------------------------------------
+// genPutArgStkSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
+// Since Vector3 is not a hardware supported write size, it is performed
+// as two stores: 8 byte followed by 4-byte.
+//
+// Arguments:
+//    treeNode - tree node that is attempting to store TYP_SIMD12 field
+//
+// Return Value:
+//    None.
+//
+void CodeGen::genPutArgStkSIMD12(GenTree* treeNode)
+{
+    assert(treeNode->OperGet() == GT_PUTARG_STK);
+
+    GenTreePtr op1 = treeNode->gtOp.gtOp1;
+    assert(!op1->isContained());
+    regNumber operandReg = genConsumeReg(op1);
+
+    // Need an additional Xmm register to extract upper 4 bytes from data.
+    assert(treeNode->gtRsvdRegs != RBM_NONE);
+    assert(genCountBits(treeNode->gtRsvdRegs) == 1);
+    regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
+
+    // Subtract from ESP; create space for argument.
+    // TODO-CQ: use 'push' instead?
+    inst_RV_IV(INS_sub, REG_SPBASE, 12, EA_PTRSIZE);
+    genStackLevel += 12;
+
+    // 8-byte write
+    getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, REG_SPBASE, 0);
+
+    // Extract upper 4-bytes from data
+    getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);
+
+    // 4-byte write
+    getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, REG_SPBASE, 8);
+}
+
+#endif // _TARGET_X86_
+
 //-----------------------------------------------------------------------------
 // genSIMDIntrinsicUpperSave: save the upper half of a TYP_SIMD32 vector to
 //                            the given register, if any, or to memory.
@@ -2139,5 +2333,5 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
 }
 
 #endif // FEATURE_SIMD
-#endif //_TARGET_AMD64_
+#endif //_TARGET_XARCH_
 #endif // !LEGACY_BACKEND
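The new genPutArgStkSIMD12 sequence, restated as standalone intrinsics: one 8-byte store of lanes 0-1, then pshufd with 0x02 to bring lane 2 down for the trailing 4-byte store. Sketch of the general 12-byte spill (the JIT's version writes to [esp] right after the sub; helper name invented):

    #include <emmintrin.h>
    #include <xmmintrin.h>

    static void store_simd12(float* dst, __m128 v)
    {
        _mm_storel_pi((__m64*)dst, v);                            // lanes 0..1, 8 bytes
        __m128i t = _mm_shuffle_epi32(_mm_castps_si128(v), 0x02); // lane 2 -> lane 0
        _mm_store_ss(dst + 2, _mm_castsi128_ps(t));               // 4 bytes
    }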