Diffstat (limited to 'src/jit/simdcodegenxarch.cpp')
-rw-r--r--  src/jit/simdcodegenxarch.cpp | 698
1 file changed, 446 insertions(+), 252 deletions(-)
diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp
index 702f967aad..ec933fd5d7 100644
--- a/src/jit/simdcodegenxarch.cpp
+++ b/src/jit/simdcodegenxarch.cpp
@@ -17,7 +17,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator.
-#ifdef _TARGET_AMD64_
+#ifdef _TARGET_XARCH_
#include "emit.h"
#include "codegen.h"
#include "sideeffects.h"
@@ -62,7 +62,7 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
// AVX supports broadcast instructions to populate YMM reg with a single float/double value from memory.
// AVX2 supports broadcast instructions to populate YMM reg with a single value from memory or mm reg.
// If we decide to use AVX2 only, we can remove this assert.
- if ((compiler->opts.eeFlags & CORJIT_FLG_USE_AVX2) == 0)
+ if (!compiler->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_USE_AVX2))
{
assert(baseType == TYP_FLOAT || baseType == TYP_DOUBLE);
}
@@ -205,12 +205,9 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
{
result = INS_pmullw;
}
- else if (compiler->canUseAVX())
+ else if ((baseType == TYP_INT) && (compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4))
{
- if (baseType == TYP_INT)
- {
- result = INS_pmulld;
- }
+ result = INS_pmulld;
}
break;
@@ -300,7 +297,8 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
{
result = INS_pcmpeqb;
}
- else if (compiler->canUseAVX() && (baseType == TYP_ULONG || baseType == TYP_LONG))
+ else if ((baseType == TYP_ULONG || baseType == TYP_LONG) &&
+ (compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4))
{
result = INS_pcmpeqq;
}
@@ -359,7 +357,7 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
{
result = INS_pcmpgtb;
}
- else if (compiler->canUseAVX() && (baseType == TYP_LONG))
+ else if ((baseType == TYP_LONG) && (compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4))
{
result = INS_pcmpgtq;
}
@@ -464,7 +462,8 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
// to target mm reg, zeroing out the upper bits if and only if specified.
//
// Arguments:
-// type the type of value to be moved
+// targetType the target type
+// baseType the base type of value to be moved
// targetReg the target reg
// srcReg the src reg
// moveType action to be performed on target upper bits
@@ -475,10 +474,10 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
// Notes:
// This is currently only supported for floating point types.
//
-void CodeGen::genSIMDScalarMove(var_types type, regNumber targetReg, regNumber srcReg, SIMDScalarMoveType moveType)
+void CodeGen::genSIMDScalarMove(
+ var_types targetType, var_types baseType, regNumber targetReg, regNumber srcReg, SIMDScalarMoveType moveType)
{
- var_types targetType = compiler->getSIMDVectorType();
- assert(varTypeIsFloating(type));
+ assert(varTypeIsFloating(baseType));
#ifdef FEATURE_AVX_SUPPORT
if (compiler->getSIMDInstructionSet() == InstructionSet_AVX)
{
@@ -487,17 +486,17 @@ void CodeGen::genSIMDScalarMove(var_types type, regNumber targetReg, regNumber s
case SMT_PreserveUpper:
if (srcReg != targetReg)
{
- instruction ins = ins_Store(type);
+ instruction ins = ins_Store(baseType);
if (getEmitter()->IsThreeOperandMoveAVXInstruction(ins))
{
// In general, when we use a three-operands move instruction, we want to merge the src with
// itself. This is an exception in that we actually want the "merge" behavior, so we must
// specify it with all 3 operands.
- inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(targetType));
+ inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(baseType));
}
else
{
- inst_RV_RV(ins, targetReg, srcReg, targetType, emitTypeSize(targetType));
+ inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
}
}
break;
@@ -516,9 +515,9 @@ void CodeGen::genSIMDScalarMove(var_types type, regNumber targetReg, regNumber s
case SMT_ZeroInitUpper_SrcHasUpperZeros:
if (srcReg != targetReg)
{
- instruction ins = ins_Copy(type);
+ instruction ins = ins_Copy(baseType);
assert(!getEmitter()->IsThreeOperandMoveAVXInstruction(ins));
- inst_RV_RV(ins, targetReg, srcReg, targetType, emitTypeSize(targetType));
+ inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
}
break;
@@ -536,7 +535,7 @@ void CodeGen::genSIMDScalarMove(var_types type, regNumber targetReg, regNumber s
case SMT_PreserveUpper:
if (srcReg != targetReg)
{
- inst_RV_RV(ins_Store(type), targetReg, srcReg, targetType, emitTypeSize(targetType));
+ inst_RV_RV(ins_Store(baseType), targetReg, srcReg, baseType, emitTypeSize(baseType));
}
break;
@@ -545,22 +544,22 @@ void CodeGen::genSIMDScalarMove(var_types type, regNumber targetReg, regNumber s
{
// There is no guarantee that upper bits of op1Reg are zero.
// We achieve this by using left logical shift 12-bytes and right logical shift 12 bytes.
- instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, type);
+ instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
- ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, type);
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
}
else
{
genSIMDZero(targetType, TYP_FLOAT, targetReg);
- inst_RV_RV(ins_Store(type), targetReg, srcReg);
+ inst_RV_RV(ins_Store(baseType), targetReg, srcReg);
}
break;
case SMT_ZeroInitUpper_SrcHasUpperZeros:
if (srcReg != targetReg)
{
- inst_RV_RV(ins_Copy(type), targetReg, srcReg, targetType, emitTypeSize(targetType));
+ inst_RV_RV(ins_Copy(baseType), targetReg, srcReg, baseType, emitTypeSize(baseType));
}
break;
@@ -676,7 +675,7 @@ void CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode)
SIMDScalarMoveType moveType =
op1->IsCnsFltOrDbl() || op1->isMemoryOp() ? SMT_ZeroInitUpper_SrcHasUpperZeros : SMT_ZeroInitUpper;
- genSIMDScalarMove(TYP_FLOAT, targetReg, op1Reg, moveType);
+ genSIMDScalarMove(targetType, TYP_FLOAT, targetReg, op1Reg, moveType);
if (size == 8)
{
@@ -786,7 +785,7 @@ void CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode)
{
getEmitter()->emitIns_R_I(insLeftShift, EA_16BYTE, vectorReg, baseTypeSize);
}
- genSIMDScalarMove(baseType, vectorReg, operandReg, SMT_PreserveUpper);
+ genSIMDScalarMove(targetType, baseType, vectorReg, operandReg, SMT_PreserveUpper);
offset += baseTypeSize;
}
@@ -1033,11 +1032,10 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
//
void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
{
- GenTree* op1 = simdNode->gtGetOp1();
- GenTree* op2 = simdNode->gtGetOp2();
- var_types baseType = simdNode->gtSIMDBaseType;
- regNumber targetReg = simdNode->gtRegNum;
- assert(targetReg != REG_NA);
+ GenTree* op1 = simdNode->gtGetOp1();
+ GenTree* op2 = simdNode->gtGetOp2();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ regNumber targetReg = simdNode->gtRegNum;
var_types targetType = simdNode->TypeGet();
InstructionSet iset = compiler->getSIMDInstructionSet();
@@ -1051,8 +1049,16 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
case SIMDIntrinsicEqual:
case SIMDIntrinsicGreaterThan:
{
- // SSE2: vector<(u)long> relation op should be implemented in terms of TYP_INT comparison operations
- assert(((iset == InstructionSet_AVX) || (baseType != TYP_LONG)) && (baseType != TYP_ULONG));
+ assert(targetReg != REG_NA);
+
+#ifdef DEBUG
+ // SSE2: vector<(u)long> relational op should be implemented in terms of
+ // TYP_INT comparison operations
+ if (baseType == TYP_LONG || baseType == TYP_ULONG)
+ {
+ assert(iset >= InstructionSet_SSE3_4);
+ }
+#endif
// Greater-than: Floating point vectors use "<" with swapped operands
if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGreaterThan)
@@ -1093,6 +1099,8 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
case SIMDIntrinsicLessThan:
case SIMDIntrinsicLessThanOrEqual:
{
+ assert(targetReg != REG_NA);
+
// Int vectors use ">" and ">=" with swapped operands
assert(varTypeIsFloating(baseType));
@@ -1115,17 +1123,6 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
case SIMDIntrinsicOpEquality:
case SIMDIntrinsicOpInEquality:
{
- assert(genIsValidIntReg(targetReg));
-
- // We need two additional XMM register as scratch
- assert(simdNode->gtRsvdRegs != RBM_NONE);
- assert(genCountBits(simdNode->gtRsvdRegs) == 2);
-
- regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
- regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
- tmpRegsMask &= ~tmpReg1Mask;
- regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
- regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
var_types simdType = op1->TypeGet();
// TODO-1stClassStructs: Temporary to minimize asmDiffs
if (simdType == TYP_DOUBLE)
@@ -1140,96 +1137,111 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
simdType = TYP_SIMD16;
}
- // tmpReg1 = (op1Reg == op2Reg)
- // Call this value of tmpReg1 as 'compResult' for further reference below.
- regNumber otherReg = op2Reg;
- if (tmpReg1 != op2Reg)
+ // On SSE4/AVX, we can generate optimal code for (in)equality against zero using ptest.
+ if ((compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4) && op2->IsIntegralConstVector(0))
{
- if (tmpReg1 != op1Reg)
- {
- inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType));
- }
+ assert(op2->isContained());
+ inst_RV_RV(INS_ptest, op1->gtRegNum, op1->gtRegNum, simdType, emitActualTypeSize(simdType));
}
else
{
- otherReg = op1Reg;
- }
+ // We need one additional SIMD register to store the result of the SIMD compare.
+ regNumber tmpReg1 = genRegNumFromMask(simdNode->gtRsvdRegs & RBM_ALLFLOAT);
- // For all integer types we can use TYP_INT comparison.
- unsigned ival = 0;
- instruction ins =
- getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival);
+ // tmpReg1 = (op1Reg == op2Reg)
+ // Call this value of tmpReg1 as 'compResult' for further reference below.
+ regNumber otherReg = op2Reg;
+ if (tmpReg1 != op2Reg)
+ {
+ if (tmpReg1 != op1Reg)
+ {
+ inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType));
+ }
+ }
+ else
+ {
+ otherReg = op1Reg;
+ }
- if (varTypeIsFloating(baseType))
- {
- getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival);
- }
- else
- {
- inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
+ // For all integer types we can use TYP_INT comparison.
+ unsigned ival = 0;
+ instruction ins =
+ getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival);
+
+ if (varTypeIsFloating(baseType))
+ {
+ getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival);
+ }
+ else
+ {
+ inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
+ }
+
+ regNumber intReg;
+ if (targetReg == REG_NA)
+ {
+ // If we are not materializing result into a register,
+ // we would have reserved an int type internal register.
+ intReg = genRegNumFromMask(simdNode->gtRsvdRegs & RBM_ALLINT);
+ }
+ else
+ {
+ // We can use targetReg for setting flags.
+ intReg = targetReg;
+
+ // Must not have reserved any int type internal registers.
+ assert(genCountBits(simdNode->gtRsvdRegs & RBM_ALLINT) == 0);
+ }
+
+ inst_RV_RV(INS_pmovmskb, intReg, tmpReg1, simdType, emitActualTypeSize(simdType));
+ // There's no pmovmskw/pmovmskd/pmovmskq but they're not needed anyway. Vector compare
+ // instructions produce "all ones"/"all zeroes" components and pmovmskb extracts a
+ // subset of each component's ones/zeroes. In the end we need to know if the result is
+ // "all ones" where the number of ones is given by the vector byte size, not by the
+ // vector component count. So, for AVX registers we need to compare to 0xFFFFFFFF and
+ // for SSE registers we need to compare to 0x0000FFFF.
+ // The SIMD12 case is handled specially, because we can't rely on the upper bytes being
+ // zero, so we must compare only the lower 3 floats (hence the byte mask of 0xFFF).
+ // Note that -1 is used instead of 0xFFFFFFFF because on x64 the emitter doesn't correctly
+ // recognize that 0xFFFFFFFF can be encoded in a single byte and emits the longer 3DFFFFFFFF
+ // encoding instead of 83F8FF.
+ ssize_t mask;
+ if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0)
+ {
+ mask = 0x00000FFF;
+ getEmitter()->emitIns_R_I(INS_and, EA_4BYTE, intReg, mask);
+ }
+ else if (emitActualTypeSize(simdType) == 32)
+ {
+ mask = -1;
+ }
+ else
+ {
+ mask = 0x0000FFFF;
+ }
+ getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, mask);
}
- // If we have 32 bytes, start by anding the two 16-byte halves to get a 16-byte result.
- if (compiler->canUseAVX() && (simdType == TYP_SIMD32))
+ if (targetReg != REG_NA)
{
- // Reduce tmpReg1 from 256-bits to 128-bits bitwise-Anding the lower and uppper 128-bits
+ // If we need to materialize result into a register, targetReg needs to
+ // be set to 1 on true and zero on false.
+ // Equality:
+ // cmp targetReg, 0xFFFFFFFF or 0xFFFF
+ // sete targetReg
+ // movzx targetReg, targetReg
//
- // Generated code sequence
- // - vextractf128 tmpReg2, tmpReg1, 0x01
- // tmpReg2[128..255] <- 0
- // tmpReg2[0..127] <- tmpReg1[128..255]
- // - vandps tmpReg1, tempReg2
- // This will zero-out upper portion of tmpReg1 and
- // lower portion of tmpReg1 is and of upper and lower 128-bit comparison result.
- getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
- inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
- }
- // Next, if we have more than 8 bytes, and the two 8-byte halves to get a 8-byte result.
- if (simdType != TYP_SIMD8)
- {
- // tmpReg2 = Shuffle(tmpReg1, (1,0,3,2))
- // Note: vpshufd is a 128-bit only instruction. Therefore, explicitly pass EA_16BYTE
- getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x4E);
-
- // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
+ // InEquality:
+ // cmp targetReg, 0xFFFFFFFF or 0xFFFF
+ // setne targetReg
+ // movzx targetReg, targetReg
//
- // Note that what we have computed is as follows at this point:
- // tmpReg1[0] = compResult[0] & compResult[2]
- // tmpReg1[1] = compResult[1] & compResult[3]
- inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
+ assert(simdNode->TypeGet() == TYP_INT);
+ inst_RV((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? INS_sete : INS_setne, targetReg,
+ TYP_INT, EA_1BYTE);
+ // Set the higher bytes to 0
+ inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
}
- // At this point, we have either reduced the result to 8 bytes: tmpReg1[0] and tmpReg1[1],
- // OR we have a Vector2 (TYP_SIMD8) in tmpReg1, which has only those two fields.
-
- // tmpReg2 = Shuffle(tmpReg1, (0,0,0,1))
- // tmpReg2[0] = compResult[1] & compResult[3]
- getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x1);
-
- // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
- // That is tmpReg1[0] = compResult[0] & compResult[1] & compResult[2] & compResult[3]
- inst_RV_RV(INS_pand, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType)); // ??? INS_andps??
-
- // targetReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
- // (Note that for mov_xmm2i, the int register is always in the reg2 position.
- inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);
-
- // Since we need to compute a bool result, targetReg needs to be set to 1 on true and zero on false.
- // Equality:
- // cmp targetReg, 0xFFFFFFFF
- // sete targetReg
- // movzx targetReg, targetReg
- //
- // InEquality:
- // cmp targetReg, 0xFFFFFFFF
- // setne targetReg
- // movzx targetReg, targetReg
- //
- getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, targetReg, 0xFFFFFFFF);
- inst_RV((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? INS_sete : INS_setne, targetReg, TYP_INT,
- EA_1BYTE);
- assert(simdNode->TypeGet() == TYP_INT);
- // Set the higher bytes to 0
- inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
}
break;
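Not part of the commit, but the mask values discussed in the comments above (0x0000FFFF for a 16-byte SSE register, -1/0xFFFFFFFF for a 32-byte AVX register, and the 0xFFF and-mask for TYP_SIMD12) can be sanity-checked with a small scalar model of pmovmskb; the helper names below are made up for illustration only.

#include <cstdint>
#include <cstdio>

// Scalar model of pmovmskb: collect the sign bit of each byte into an integer mask.
static uint32_t Pmovmskb(const uint8_t* bytes, int count)
{
    uint32_t mask = 0;
    for (int i = 0; i < count; i++)
    {
        mask |= (uint32_t)(bytes[i] >> 7) << i;
    }
    return mask;
}

int main()
{
    // A vector compare produces all-ones bytes (0xFF) in components that compare equal.
    uint8_t sse[16]; // 16-byte (SSE) register: "all equal" mask is 0x0000FFFF
    uint8_t avx[32]; // 32-byte (AVX) register: "all equal" mask is 0xFFFFFFFF (-1)
    for (uint8_t& b : sse) b = 0xFF;
    for (uint8_t& b : avx) b = 0xFF;

    printf("SSE all-equal mask: 0x%08X\n", (unsigned)Pmovmskb(sse, 16)); // 0x0000FFFF
    printf("AVX all-equal mask: 0x%08X\n", (unsigned)Pmovmskb(avx, 32)); // 0xFFFFFFFF

    // Vector3 (TYP_SIMD12): only the lower 12 bytes are meaningful, hence the 0xFFF 'and'.
    printf("SIMD12 byte mask:   0x%08X\n", (unsigned)Pmovmskb(sse, 12)); // 0x00000FFF
    return 0;
}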
@@ -1267,45 +1279,68 @@ void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
regNumber targetReg = simdNode->gtRegNum;
assert(targetReg != REG_NA);
- // DotProduct is only supported on floating point types.
var_types targetType = simdNode->TypeGet();
assert(targetType == baseType);
- assert(varTypeIsFloating(baseType));
genConsumeOperands(simdNode);
- regNumber op1Reg = op1->gtRegNum;
- regNumber op2Reg = op2->gtRegNum;
+ regNumber op1Reg = op1->gtRegNum;
+ regNumber op2Reg = op2->gtRegNum;
+ regNumber tmpReg1 = REG_NA;
+ regNumber tmpReg2 = REG_NA;
- regNumber tmpReg = REG_NA;
- // For SSE, or AVX with 32-byte vectors, we need an additional Xmm register as scratch.
- // However, it must be distinct from targetReg, so we request two from the register allocator.
- // Note that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
- if ((compiler->getSIMDInstructionSet() == InstructionSet_SSE2) || (simdEvalType == TYP_SIMD32))
+ InstructionSet iset = compiler->getSIMDInstructionSet();
+
+ // Dot product intrinsic is supported only on float/double vectors,
+ // 16-byte int vectors on SSE4 and 32-byte int vectors on AVX.
+ //
+ // Float/Double Vectors:
+ // For SSE, or AVX with 32-byte vectors, we need one additional Xmm register
+ // different from targetReg as scratch. Note that if this is a TYP_SIMD16 or
+ // smaller on AVX, then we don't need a tmpReg.
+ //
+ // 32-byte integer vector on AVX: we need two additional Xmm registers
+ // different from targetReg as scratch.
+ //
+ // 16-byte integer vector on SSE4: we need one additional Xmm register
+ // different from targetReg as scratch.
+ if (varTypeIsFloating(baseType))
{
- assert(simdNode->gtRsvdRegs != RBM_NONE);
- assert(genCountBits(simdNode->gtRsvdRegs) == 2);
+ if ((compiler->getSIMDInstructionSet() == InstructionSet_SSE2) || (simdEvalType == TYP_SIMD32))
+ {
+ assert(simdNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(simdNode->gtRsvdRegs) == 1);
- regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
- regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
- tmpRegsMask &= ~tmpReg1Mask;
- regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
- regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
+ tmpReg1 = genRegNumFromMask(simdNode->gtRsvdRegs);
+ assert(tmpReg1 != REG_NA);
+ assert(tmpReg1 != targetReg);
+ }
+ else
+ {
+ assert(simdNode->gtRsvdRegs == RBM_NONE);
+ }
+ }
+ else
+ {
+ assert(baseType == TYP_INT);
+ assert(iset >= InstructionSet_SSE3_4);
- // Choose any register different from targetReg as tmpReg
- if (tmpReg1 != targetReg)
+ if (iset == InstructionSet_SSE3_4)
{
- tmpReg = tmpReg1;
+ // Must have reserved 1 scratch register.
+ assert(genCountBits(simdNode->gtRsvdRegs) == 1);
+ tmpReg1 = genRegNumFromMask(simdNode->gtRsvdRegs);
}
else
{
- assert(targetReg != tmpReg2);
- tmpReg = tmpReg2;
+ // Must have reserved 2 scratch registers.
+ assert(genCountBits(simdNode->gtRsvdRegs) == 2);
+ regMaskTP tmpRegMask = genFindLowestBit(simdNode->gtRsvdRegs);
+ tmpReg1 = genRegNumFromMask(tmpRegMask);
+ tmpReg2 = genRegNumFromMask(simdNode->gtRsvdRegs & ~tmpRegMask);
}
- assert(tmpReg != REG_NA);
- assert(tmpReg != targetReg);
}
- if (compiler->getSIMDInstructionSet() == InstructionSet_SSE2)
+ if (iset == InstructionSet_SSE2)
{
// We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg
if (op1Reg == targetReg)
@@ -1323,96 +1358,187 @@ void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
}
// DotProduct(v1, v2)
- // Here v0 = targetReg, v1 = op1Reg, v2 = op2Reg and tmp = tmpReg
- if (baseType == TYP_FLOAT)
+ // Here v0 = targetReg, v1 = op1Reg, v2 = op2Reg and tmp = tmpReg1
+ if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0)
+ {
+ assert(baseType == TYP_FLOAT);
+ // v0 = v1 * v2
+ // tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its
+ // // position
+ // tmp = shuffle(tmp, tmp, SHUFFLE_ZXXY) // tmp = (2, 0, 0, 1) - don't really care what's in upper
+ // // bits
+ // v0 = v0 + tmp // v0 = (3+2, 0+2, 1+0, 0+1)
+ // tmp = shuffle(tmp, tmp, SHUFFLE_XXWW) // tmp = ( 1, 1, 2, 2)
+ // v0 = v0 + tmp // v0 = (1+2+3, 0+1+2, 0+1+2, 0+1+2)
+ //
+ inst_RV_RV(INS_mulps, targetReg, op2Reg);
+ inst_RV_RV(INS_movaps, tmpReg1, targetReg);
+ inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZXXY);
+ inst_RV_RV(INS_addps, targetReg, tmpReg1);
+ inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XXWW);
+ inst_RV_RV(INS_addps, targetReg, tmpReg1);
+ }
+ else if (baseType == TYP_FLOAT)
{
// v0 = v1 * v2
// tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its
// // position
- // tmp = shuffle(tmp, tmp, Shuffle(2,3,0,1)) // tmp = (2, 3, 0, 1)
+ // tmp = shuffle(tmp, tmp, SHUFFLE_ZWXY) // tmp = (2, 3, 0, 1)
// v0 = v0 + tmp // v0 = (3+2, 2+3, 1+0, 0+1)
// tmp = v0
- // tmp = shuffle(tmp, tmp, Shuffle(0,1,2,3)) // tmp = (0+1, 1+0, 2+3, 3+2)
+ // tmp = shuffle(tmp, tmp, SHUFFLE_XYZW) // tmp = (0+1, 1+0, 2+3, 3+2)
// v0 = v0 + tmp // v0 = (0+1+2+3, 0+1+2+3, 0+1+2+3, 0+1+2+3)
// // Essentially horizontal addition of all elements.
// // We could achieve the same using SSEv3 instruction
// // HADDPS.
//
inst_RV_RV(INS_mulps, targetReg, op2Reg);
- inst_RV_RV(INS_movaps, tmpReg, targetReg);
- inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg, tmpReg, 0xb1);
- inst_RV_RV(INS_addps, targetReg, tmpReg);
- inst_RV_RV(INS_movaps, tmpReg, targetReg);
- inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg, tmpReg, 0x1b);
- inst_RV_RV(INS_addps, targetReg, tmpReg);
+ inst_RV_RV(INS_movaps, tmpReg1, targetReg);
+ inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZWXY);
+ inst_RV_RV(INS_addps, targetReg, tmpReg1);
+ inst_RV_RV(INS_movaps, tmpReg1, targetReg);
+ inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XYZW);
+ inst_RV_RV(INS_addps, targetReg, tmpReg1);
}
- else if (baseType == TYP_DOUBLE)
+ else
{
+ assert(baseType == TYP_DOUBLE);
+
// v0 = v1 * v2
// tmp = v0 // v0 = (1, 0) - each element is given by its position
// tmp = shuffle(tmp, tmp, Shuffle(0,1)) // tmp = (0, 1)
// v0 = v0 + tmp // v0 = (1+0, 0+1)
inst_RV_RV(INS_mulpd, targetReg, op2Reg);
- inst_RV_RV(INS_movaps, tmpReg, targetReg);
- inst_RV_RV_IV(INS_shufpd, EA_16BYTE, tmpReg, tmpReg, 0x01);
- inst_RV_RV(INS_addpd, targetReg, tmpReg);
- }
- else
- {
- unreached();
+ inst_RV_RV(INS_movaps, tmpReg1, targetReg);
+ inst_RV_RV_IV(INS_shufpd, EA_16BYTE, tmpReg1, tmpReg1, 0x01);
+ inst_RV_RV(INS_addpd, targetReg, tmpReg1);
}
}
else
{
- // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg.
- // Note that this is a duplicate of the code above for SSE, but in the AVX case we can eventually
- // use the 3-op form, so that we can avoid these copies.
- // TODO-CQ: Add inst_RV_RV_RV_IV().
- if (op1Reg == targetReg)
- {
- // Best case
- // nothing to do, we have registers in the right place
- }
- else if (op2Reg == targetReg)
+ assert(iset >= InstructionSet_SSE3_4);
+
+ if (varTypeIsFloating(baseType))
{
- op2Reg = op1Reg;
+ // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg.
+ // Note that this is a duplicate of the code above for SSE, but in the AVX case we can eventually
+ // use the 3-op form, so that we can avoid these copies.
+ // TODO-CQ: Add inst_RV_RV_RV_IV().
+ if (op1Reg == targetReg)
+ {
+ // Best case
+ // nothing to do, we have registers in the right place
+ }
+ else if (op2Reg == targetReg)
+ {
+ op2Reg = op1Reg;
+ }
+ else
+ {
+ inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
+ }
+
+ emitAttr emitSize = emitActualTypeSize(simdEvalType);
+ if (baseType == TYP_FLOAT)
+ {
+ // dpps computes the dot product of the upper & lower halves of the 32-byte register.
+ // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
+ unsigned mask = ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) ? 0x71 : 0xf1;
+ inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, mask);
+ // If this is TYP_SIMD32, we need to combine the lower & upper results.
+ if (simdEvalType == TYP_SIMD32)
+ {
+ getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01);
+ inst_RV_RV(INS_addps, targetReg, tmpReg1, targetType, emitTypeSize(targetType));
+ }
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ if (simdEvalType == TYP_SIMD32)
+ {
+ // targetReg = targetReg * op2Reg
+ // targetReg = vhaddpd(targetReg, targetReg) ; horizontal sum of lower & upper halves
+ // tmpReg1 = vextractf128(targetReg, 1) ; Moves the upper sum into tmpReg1
+ // targetReg = targetReg + tmpReg1
+ inst_RV_RV(INS_mulpd, targetReg, op2Reg, simdEvalType, emitActualTypeSize(simdType));
+ inst_RV_RV(INS_haddpd, targetReg, targetReg, simdEvalType, emitActualTypeSize(simdType));
+ getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01);
+ inst_RV_RV(INS_addpd, targetReg, tmpReg1, targetType, emitTypeSize(targetType));
+ }
+ else
+ {
+ // On AVX, we have no 16-byte vectors of double. Note that, if we did, we could use
+ // dppd directly.
+ assert(iset == InstructionSet_SSE3_4);
+ inst_RV_RV_IV(INS_dppd, emitSize, targetReg, op2Reg, 0x31);
+ }
+ }
}
else
{
- inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
- }
+ // Dot product of 32-byte int vector on SSE4/AVX.
+ assert(baseType == TYP_INT);
+ assert(simdEvalType == TYP_SIMD16 || simdEvalType == TYP_SIMD32);
+
+#ifdef DEBUG
+ // SSE4: We need 1 scratch register.
+ // AVX2: We need 2 scratch registers.
+ if (simdEvalType == TYP_SIMD16)
+ {
+ assert(tmpReg1 != REG_NA);
+ }
+ else
+ {
+ assert(tmpReg1 != REG_NA);
+ assert(tmpReg2 != REG_NA);
+ }
+#endif
+
+ // tmpReg1 = op1 * op2
+ if (iset == InstructionSet_AVX)
+ {
+ // On AVX, take advantage of the 3-operand form of pmulld
+ inst_RV_RV_RV(INS_pmulld, tmpReg1, op1Reg, op2Reg, emitTypeSize(simdEvalType));
+ }
+ else
+ {
+ inst_RV_RV(ins_Copy(simdEvalType), tmpReg1, op1Reg, simdEvalType);
+ inst_RV_RV(INS_pmulld, tmpReg1, op2Reg, simdEvalType);
+ }
- emitAttr emitSize = emitActualTypeSize(simdEvalType);
- if (baseType == TYP_FLOAT)
- {
- // dpps computes the dot product of the upper & lower halves of the 32-byte register.
- // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
- inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, 0xf1);
- // If this is TYP_SIMD32, we need to combine the lower & upper results.
if (simdEvalType == TYP_SIMD32)
{
- getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, targetReg, 0x01);
- inst_RV_RV(INS_addps, targetReg, tmpReg, targetType, emitTypeSize(targetType));
+ // tmpReg2[127..0] = Upper 128-bits of tmpReg1
+ getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
+
+ // tmpReg1[127..0] = tmpReg1[127..0] + tmpReg2[127..0]
+ // This will compute
+ // tmpReg1[0] = op1[0]*op2[0] + op1[4]*op2[4]
+ // tmpReg1[1] = op1[1]*op2[1] + op1[5]*op2[5]
+ // tmpReg1[2] = op1[2]*op2[2] + op1[6]*op2[6]
+ // tmpReg1[3] = op1[3]*op2[3] + op1[7]*op2[7]
+ inst_RV_RV(INS_paddd, tmpReg1, tmpReg2, TYP_SIMD16, EA_16BYTE);
}
- }
- else if (baseType == TYP_DOUBLE)
- {
- // On AVX, we have no 16-byte vectors of double. Note that, if we did, we could use
- // dppd directly.
- assert(simdType == TYP_SIMD32);
-
- // targetReg = targetReg * op2Reg
- // targetReg = vhaddpd(targetReg, targetReg) ; horizontal sum of lower & upper halves
- // tmpReg = vextractf128(targetReg, 1) ; Moves the upper sum into tempReg
- // targetReg = targetReg + tmpReg
- inst_RV_RV(INS_mulpd, targetReg, op2Reg, simdEvalType, emitActualTypeSize(simdType));
- inst_RV_RV(INS_haddpd, targetReg, targetReg, simdEvalType, emitActualTypeSize(simdType));
- getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, targetReg, 0x01);
- inst_RV_RV(INS_addpd, targetReg, tmpReg, targetType, emitTypeSize(targetType));
- }
- else
- {
- unreached();
+
+ // This horizontal add will compute
+ //
+ // TYP_SIMD16:
+ // tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[1]*op2[1]
+ // tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[3]*op2[3]
+ //
+ // TYP_SIMD32:
+ // tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[4]*op2[4] + op1[1]*op2[1] + op1[5]*op2[5]
+ // tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[6]*op2[6] + op1[3]*op2[3] + op1[7]*op2[7]
+ inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE);
+
+ // DotProduct(op1, op2) = tmpReg1[0] = tmpReg1[0] + tmpReg1[1]
+ inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE);
+
+ // TargetReg = integer result from tmpReg1
+ // (Note that for mov_xmm2i, the int register is always in the reg2 position)
+ inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);
}
}
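A hedged aside, not JIT code: the integer dot product reduction above (pmulld, a paddd fold of the upper 128-bit half for TYP_SIMD32, then two phaddd passes) can be modelled with plain scalar arithmetic. The function below is illustrative only.

#include <cstdint>
#include <cstdio>

// Scalar model of the SSE4/AVX Vector<int> dot product reduction: element-wise multiply,
// fold the upper half (TYP_SIMD32 only), then two horizontal-add passes.
int32_t DotProductInt(const int32_t* a, const int32_t* b, int count) // count = 4 or 8
{
    int32_t prod[8] = {};
    for (int i = 0; i < count; i++)
    {
        prod[i] = a[i] * b[i]; // pmulld
    }

    int32_t lane[4];
    for (int i = 0; i < 4; i++)
    {
        // vextractf128 + paddd for TYP_SIMD32; the lower lanes are used as-is for TYP_SIMD16.
        lane[i] = (count == 8) ? prod[i] + prod[i + 4] : prod[i];
    }

    // First phaddd: lane0+lane1 and lane2+lane3.
    int32_t h0 = lane[0] + lane[1];
    int32_t h1 = lane[2] + lane[3];
    // Second phaddd: element 0 holds the full sum, which mov_xmm2i then moves to targetReg.
    return h0 + h1;
}

int main()
{
    int32_t a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    int32_t b[8] = {1, 1, 1, 1, 1, 1, 1, 1};
    printf("SIMD16 dot: %d\n", DotProductInt(a, b, 4)); // 10
    printf("SIMD32 dot: %d\n", DotProductInt(a, b, 8)); // 36
    return 0;
}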
@@ -1456,6 +1582,59 @@ void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
genConsumeOperands(simdNode);
regNumber srcReg = op1->gtRegNum;
+ // Optimize the case of op1 is in memory and trying to access ith element.
+ if (op1->isMemoryOp())
+ {
+ assert(op1->isContained());
+
+ regNumber baseReg;
+ regNumber indexReg;
+ int offset = 0;
+
+ if (op1->OperGet() == GT_LCL_FLD)
+ {
+ // There are three parts to the total offset here:
+ // {offset of local} + {offset of SIMD Vector field} + {offset of element within SIMD vector}.
+ bool isEBPbased;
+ unsigned varNum = op1->gtLclVarCommon.gtLclNum;
+ offset += compiler->lvaFrameAddress(varNum, &isEBPbased);
+ offset += op1->gtLclFld.gtLclOffs;
+
+ baseReg = (isEBPbased) ? REG_EBP : REG_ESP;
+ }
+ else
+ {
+ // Require GT_IND addr to be not contained.
+ assert(op1->OperGet() == GT_IND);
+
+ GenTree* addr = op1->AsIndir()->Addr();
+ assert(!addr->isContained());
+ baseReg = addr->gtRegNum;
+ }
+
+ if (op2->isContainedIntOrIImmed())
+ {
+ indexReg = REG_NA;
+ offset += (int)op2->AsIntConCommon()->IconValue() * genTypeSize(baseType);
+ }
+ else
+ {
+ indexReg = op2->gtRegNum;
+ assert(genIsValidIntReg(indexReg));
+ }
+
+ // Now, load the desired element.
+ getEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load
+ emitTypeSize(baseType), // Of the vector baseType
+ targetReg, // To targetReg
+ baseReg, // Base Reg
+ indexReg, // Indexed
+ genTypeSize(baseType), // by the size of the baseType
+ offset);
+ genProduceReg(simdNode);
+ return;
+ }
+
// SSE2 doesn't have an instruction to implement this intrinsic if the index is not a constant.
// For the non-constant case, we will use the SIMD temp location to store the vector, and
// then load the desired element.
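An illustrative aside (not part of the diff): the addressing math of the contained-memory case above boils down to base + index*elementSize + displacement, where the displacement accumulates the frame offset, the GT_LCL_FLD offset, and any constant element index. A tiny sketch with hypothetical names:

#include <cstddef>
#include <cstdio>

// Hypothetical helper mirroring the displacement computed for the contained
// GT_LCL_FLD / GT_IND cases of Vector<T>[i]; illustrative only.
ptrdiff_t GetItemDisplacement(ptrdiff_t frameOffset, // lvaFrameAddress() result, 0 for GT_IND
                              ptrdiff_t lclOffs,     // gtLclOffs, 0 for GT_IND
                              ptrdiff_t constIndex,  // folded in only when op2 is a contained constant
                              size_t    elemSize)    // genTypeSize(baseType)
{
    return frameOffset + lclOffs + constIndex * (ptrdiff_t)elemSize;
}

int main()
{
    // e.g. a Vector4 field at +16 inside a local at [ebp-0x40], constant element index 2:
    ptrdiff_t disp = GetItemDisplacement(-0x40, 16, 2, sizeof(float));
    printf("load [ebp + disp], disp = %td\n", disp); // -40
    return 0;
}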
@@ -1839,26 +2018,9 @@ void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
// Need an additional Xmm register to read upper 4 bytes, which is different from targetReg
assert(treeNode->gtRsvdRegs != RBM_NONE);
- assert(genCountBits(treeNode->gtRsvdRegs) == 2);
-
- regNumber tmpReg = REG_NA;
- regMaskTP tmpRegsMask = treeNode->gtRsvdRegs;
- regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
- tmpRegsMask &= ~tmpReg1Mask;
- regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
- regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
+ assert(genCountBits(treeNode->gtRsvdRegs) == 1);
- // Choose any register different from targetReg as tmpReg
- if (tmpReg1 != targetReg)
- {
- tmpReg = tmpReg1;
- }
- else
- {
- assert(targetReg != tmpReg2);
- tmpReg = tmpReg2;
- }
- assert(tmpReg != REG_NA);
+ regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
assert(tmpReg != targetReg);
// Load upper 4 bytes in tmpReg
@@ -1868,7 +2030,7 @@ void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
getEmitter()->emitIns_R_AR(ins_Load(TYP_DOUBLE), EA_8BYTE, targetReg, operandReg, 0);
// combine upper 4 bytes and lower 8 bytes in targetReg
- getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, 0x44);
+ getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX);
genProduceReg(treeNode);
}
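As an aside (not in the commit): the SHUFFLE_YXYX (0x44) combine used above merges the 8-byte x/y read with the 4-byte z read. A scalar model of the sequence, with made-up names:

#include <cstring>
#include <cstdio>

// Scalar model of the two-part TYP_SIMD12 load: an 8-byte read of x/y, a 4-byte read of z,
// then shufps targetReg, tmpReg, SHUFFLE_YXYX (0x44) to combine them into one XMM value.
struct Xmm { float f[4]; };

Xmm LoadSimd12(const float* mem)
{
    Xmm target = {};
    Xmm tmp    = {};
    std::memcpy(target.f, mem, 8);  // movsd target, [mem]   ; lower 8 bytes (x, y)
    std::memcpy(tmp.f, mem + 2, 4); // movss tmp, [mem + 8]  ; upper 4 bytes (z)

    // shufps target, tmp, 0x44: result = { target[0], target[1], tmp[0], tmp[1] }
    Xmm result = { { target.f[0], target.f[1], tmp.f[0], tmp.f[1] } };
    return result;
}

int main()
{
    float v3[3] = { 1.0f, 2.0f, 3.0f };
    Xmm r = LoadSimd12(v3);
    printf("%g %g %g (upper lane %g)\n", r.f[0], r.f[1], r.f[2], r.f[3]); // 1 2 3 (upper lane 0)
    return 0;
}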
@@ -1912,9 +2074,9 @@ void CodeGen::genStoreLclFldTypeSIMD12(GenTree* treeNode)
}
//-----------------------------------------------------------------------------
-// genLoadLclFldTypeSIMD12: load a TYP_SIMD12 (i.e. Vector3) type field.
-// Since Vector3 is not a hardware supported write size, it is performed
-// as two reads: 8 byte followed by 4-byte.
+// genLoadLclTypeSIMD12: load a TYP_SIMD12 (i.e. Vector3) type field.
+// Since Vector3 is not a hardware supported read size, it is performed
+// as two reads: 4 byte followed by 8 byte.
//
// Arguments:
// treeNode - tree node that is attempting to load TYP_SIMD12 field
@@ -1922,37 +2084,26 @@ void CodeGen::genStoreLclFldTypeSIMD12(GenTree* treeNode)
// Return Value:
// None.
//
-void CodeGen::genLoadLclFldTypeSIMD12(GenTree* treeNode)
+void CodeGen::genLoadLclTypeSIMD12(GenTree* treeNode)
{
- assert(treeNode->OperGet() == GT_LCL_FLD);
+ assert((treeNode->OperGet() == GT_LCL_FLD) || (treeNode->OperGet() == GT_LCL_VAR));
regNumber targetReg = treeNode->gtRegNum;
- unsigned offs = treeNode->gtLclFld.gtLclOffs;
+ unsigned offs = 0;
unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
assert(varNum < compiler->lvaCount);
- // Need an addtional Xmm register to read upper 4 bytes
- assert(treeNode->gtRsvdRegs != RBM_NONE);
- assert(genCountBits(treeNode->gtRsvdRegs) == 2);
-
- regNumber tmpReg = REG_NA;
- regMaskTP tmpRegsMask = treeNode->gtRsvdRegs;
- regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
- tmpRegsMask &= ~tmpReg1Mask;
- regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
- regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
-
- // Choose any register different from targetReg as tmpReg
- if (tmpReg1 != targetReg)
+ if (treeNode->OperGet() == GT_LCL_FLD)
{
- tmpReg = tmpReg1;
+ offs = treeNode->gtLclFld.gtLclOffs;
}
- else
- {
- assert(targetReg != tmpReg2);
- tmpReg = tmpReg2;
- }
- assert(tmpReg != REG_NA);
+
+ // Need an additional Xmm register that is different from
+ // targetReg to read upper 4 bytes.
+ assert(treeNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(treeNode->gtRsvdRegs) == 1);
+
+ regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
assert(tmpReg != targetReg);
// Read upper 4 bytes to tmpReg
@@ -1962,11 +2113,54 @@ void CodeGen::genLoadLclFldTypeSIMD12(GenTree* treeNode)
getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_DOUBLE, false), EA_8BYTE, targetReg, varNum, offs);
// combine upper 4 bytes and lower 8 bytes in targetReg
- getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, 0x44);
+ getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX);
genProduceReg(treeNode);
}
+#ifdef _TARGET_X86_
+
+//-----------------------------------------------------------------------------
+// genPutArgStkSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
+// Since Vector3 is not a hardware supported write size, it is performed
+// as two stores: 8 byte followed by 4-byte.
+//
+// Arguments:
+// treeNode - tree node that is attempting to store TYP_SIMD12 field
+//
+// Return Value:
+// None.
+//
+void CodeGen::genPutArgStkSIMD12(GenTree* treeNode)
+{
+ assert(treeNode->OperGet() == GT_PUTARG_STK);
+
+ GenTreePtr op1 = treeNode->gtOp.gtOp1;
+ assert(!op1->isContained());
+ regNumber operandReg = genConsumeReg(op1);
+
+ // Need an additional Xmm register to extract upper 4 bytes from data.
+ assert(treeNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(treeNode->gtRsvdRegs) == 1);
+ regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
+
+ // Subtract from ESP; create space for argument.
+ // TODO-CQ: use 'push' instead?
+ inst_RV_IV(INS_sub, REG_SPBASE, 12, EA_PTRSIZE);
+ genStackLevel += 12;
+
+ // 8-byte write
+ getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, REG_SPBASE, 0);
+
+ // Extract upper 4-bytes from data
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);
+
+ // 4-byte write
+ getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, REG_SPBASE, 8);
+}
+
+#endif // _TARGET_X86_
+
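Again only as an illustration, not part of the change: the x86 PutArgStk sequence above (8-byte store of x/y, pshufd with immediate 0x02 to bring z into lane 0, then a 4-byte store) modelled with scalar code and hypothetical names:

#include <cstring>
#include <cstdio>

// Scalar model of the x86 TYP_SIMD12 outgoing-argument store: reserve 12 bytes, write x/y as
// one 8-byte store, use pshufd imm 0x02 to move z into lane 0, then write it as a 4-byte store.
void PutArgStkSimd12(const float xmm[4], unsigned char* stackSlot /* 12 bytes */)
{
    std::memcpy(stackSlot, xmm, 8);          // movsd [esp], xmm      ; x, y

    float tmp[4];
    for (int i = 0; i < 4; i++)
    {
        tmp[i] = xmm[(0x02 >> (2 * i)) & 3]; // pshufd tmp, xmm, 0x02 ; tmp[0] = xmm[2]
    }
    std::memcpy(stackSlot + 8, &tmp[0], 4);  // movss [esp+8], tmp    ; z
}

int main()
{
    float v[4] = { 1.0f, 2.0f, 3.0f, 0.0f };
    unsigned char slot[12];
    PutArgStkSimd12(v, slot);

    float out[3];
    std::memcpy(out, slot, 12);
    printf("%g %g %g\n", out[0], out[1], out[2]); // 1 2 3
    return 0;
}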
//-----------------------------------------------------------------------------
// genSIMDIntrinsicUpperSave: save the upper half of a TYP_SIMD32 vector to
// the given register, if any, or to memory.
@@ -2139,5 +2333,5 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
}
#endif // FEATURE_SIMD
-#endif //_TARGET_AMD64_
+#endif //_TARGET_XARCH_
#endif // !LEGACY_BACKEND