Diffstat (limited to 'src/jit/simdcodegenxarch.cpp')
 -rw-r--r--  src/jit/simdcodegenxarch.cpp  1992
 1 file changed, 1992 insertions, 0 deletions
diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp
new file mode 100644
index 0000000000..8ea039f47f
--- /dev/null
+++ b/src/jit/simdcodegenxarch.cpp
@@ -0,0 +1,1992 @@
+//
+// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE file in the project root for full license information.
+//
+
+/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XX XX
+XX Amd64 SIMD Code Generator XX
+XX XX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+*/
+#include "jitpch.h"
+#ifdef _MSC_VER
+#pragma hdrstop
+#endif
+
+#ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator.
+
+#ifdef _TARGET_AMD64_
+#include "emit.h"
+#include "codegen.h"
+#include "lower.h"
+#include "gcinfo.h"
+#include "gcinfoencoder.h"
+
+#ifdef FEATURE_SIMD
+
+// Instruction immediates
+
+// Insertps:
+// - bits 6 and 7 of the immediate indicate which source item to select (0..3)
+// - bits 4 and 5 of the immediate indicate which target item to insert into (0..3)
+// - bits 0 to 3 of the immediate indicate which target item to zero
+#define INSERTPS_SOURCE_SELECT(i) ((i) << 6)
+#define INSERTPS_TARGET_SELECT(i) ((i) << 4)
+#define INSERTPS_ZERO(i)          (1 << (i))
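+// For example, (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3)) == 0x0E,
+// which copies source element 0 into target element 0 and zeroes target elements 1 through 3.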
+
+// getOpForSIMDIntrinsic: return the opcode for the given SIMD Intrinsic
+//
+// Arguments:
+// intrinsicId - SIMD intrinsic Id
+// baseType - Base type of the SIMD vector
+//    ival        - Out param. Any immediate byte operand that needs to be passed to the SSE2 opcode
+//
+// Return Value:
+//    Instruction (op) to be used, and *ival is set if the instruction requires an immediate operand.
+//
+instruction
+CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId,
+ var_types baseType,
+ unsigned *ival /*=nullptr*/)
+{
+ // Minimal required instruction set is SSE2.
+ assert(compiler->canUseSSE2());
+
+ instruction result = INS_invalid;
+ switch(intrinsicId)
+ {
+ case SIMDIntrinsicInit:
+ if (compiler->canUseAVX())
+ {
+ // AVX supports broadcast instructions to populate YMM reg with a single float/double value from memory.
+            // AVX2 supports broadcast instructions to populate YMM reg with a single value from memory or xmm reg.
+ // If we decide to use AVX2 only, we can remove this assert.
+ if ((compiler->opts.eeFlags & CORJIT_FLG_USE_AVX2) == 0)
+ {
+ assert(baseType == TYP_FLOAT || baseType == TYP_DOUBLE);
+ }
+ switch (baseType)
+ {
+ case TYP_FLOAT: result = INS_vbroadcastss; break;
+ case TYP_DOUBLE: result = INS_vbroadcastsd; break;
+ case TYP_ULONG: __fallthrough;
+ case TYP_LONG: result = INS_vpbroadcastq; break;
+ case TYP_UINT: __fallthrough;
+ case TYP_INT: result = INS_vpbroadcastd; break;
+ case TYP_CHAR: __fallthrough;
+ case TYP_SHORT: result = INS_vpbroadcastw; break;
+ case TYP_UBYTE: __fallthrough;
+ case TYP_BYTE: result = INS_vpbroadcastb; break;
+ default: unreached();
+ }
+ break;
+ }
+ // For SSE, SIMDIntrinsicInit uses the same instruction as the SIMDIntrinsicShuffleSSE2 intrinsic.
+ __fallthrough;
+ case SIMDIntrinsicShuffleSSE2:
+ if (baseType == TYP_FLOAT)
+ {
+ result = INS_shufps;
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ result = INS_shufpd;
+ }
+ else if (baseType == TYP_INT || baseType == TYP_UINT)
+ {
+ result = INS_pshufd;
+ }
+ else if (baseType == TYP_LONG || baseType == TYP_ULONG)
+ {
+            // We don't have a separate SSE2 instruction and will
+ // use the instruction meant for doubles since it is
+ // of the same size as a long.
+ result = INS_shufpd;
+ }
+ break;
+
+ case SIMDIntrinsicSqrt:
+ if (baseType == TYP_FLOAT)
+ {
+ result = INS_sqrtps;
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ result = INS_sqrtpd;
+ }
+ else
+ {
+ unreached();
+ }
+ break;
+
+ case SIMDIntrinsicAdd:
+ if (baseType == TYP_FLOAT)
+ {
+ result = INS_addps;
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ result = INS_addpd;
+ }
+ else if (baseType == TYP_INT || baseType == TYP_UINT)
+ {
+ result = INS_paddd;
+ }
+ else if (baseType == TYP_CHAR || baseType == TYP_SHORT)
+ {
+ result = INS_paddw;
+ }
+ else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
+ {
+ result = INS_paddb;
+ }
+ else if (baseType == TYP_LONG || baseType == TYP_ULONG)
+ {
+ result = INS_paddq;
+ }
+ break;
+
+ case SIMDIntrinsicSub:
+ if (baseType == TYP_FLOAT)
+ {
+ result = INS_subps;
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ result = INS_subpd;
+ }
+ else if (baseType == TYP_INT || baseType == TYP_UINT)
+ {
+ result = INS_psubd;
+ }
+ else if (baseType == TYP_CHAR || baseType == TYP_SHORT)
+ {
+ result = INS_psubw;
+ }
+ else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
+ {
+ result = INS_psubb;
+ }
+ else if (baseType == TYP_LONG || baseType == TYP_ULONG)
+ {
+ result = INS_psubq;
+ }
+ break;
+
+ case SIMDIntrinsicMul:
+ if (baseType == TYP_FLOAT)
+ {
+ result = INS_mulps;
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ result = INS_mulpd;
+ }
+ else if (baseType == TYP_SHORT)
+ {
+ result = INS_pmullw;
+ }
+ else if (compiler->canUseAVX())
+ {
+ if (baseType == TYP_INT)
+ {
+ result = INS_pmulld;
+ }
+ }
+ break;
+
+ case SIMDIntrinsicDiv:
+ if (baseType == TYP_FLOAT)
+ {
+ result = INS_divps;
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ result = INS_divpd;
+ }
+ else
+ {
+ unreached();
+ }
+ break;
+
+ case SIMDIntrinsicMin:
+ if (baseType == TYP_FLOAT)
+ {
+ result = INS_minps;
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ result = INS_minpd;
+ }
+ else if (baseType == TYP_UBYTE)
+ {
+ result = INS_pminub;
+ }
+ else if (baseType == TYP_SHORT)
+ {
+ result = INS_pminsw;
+ }
+ else
+ {
+ unreached();
+ }
+ break;
+
+ case SIMDIntrinsicMax:
+ if (baseType == TYP_FLOAT)
+ {
+ result = INS_maxps;
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ result = INS_maxpd;
+ }
+ else if (baseType == TYP_UBYTE)
+ {
+ result = INS_pmaxub;
+ }
+ else if (baseType == TYP_SHORT)
+ {
+ result = INS_pmaxsw;
+ }
+ else
+ {
+ unreached();
+ }
+ break;
+
+ case SIMDIntrinsicEqual:
+ if (baseType == TYP_FLOAT)
+ {
+ result = INS_cmpps;
+ assert(ival != nullptr);
+ *ival = 0;
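+            // (For cmpps/cmppd the comparison predicate immediate is 0 = equal, 1 = less-than, 2 = less-than-or-equal.)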
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ result = INS_cmppd;
+ assert(ival != nullptr);
+ *ival = 0;
+ }
+ else if (baseType == TYP_INT || baseType == TYP_UINT)
+ {
+ result = INS_pcmpeqd;
+ }
+ else if (baseType == TYP_CHAR || baseType == TYP_SHORT)
+ {
+ result = INS_pcmpeqw;
+ }
+ else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
+ {
+ result = INS_pcmpeqb;
+ }
+ else if (compiler->canUseAVX() && (baseType == TYP_ULONG || baseType == TYP_LONG))
+ {
+ result = INS_pcmpeqq;
+ }
+ break;
+
+ case SIMDIntrinsicLessThan:
+ // Packed integers use > with swapped operands
+ assert(baseType != TYP_INT);
+
+ if (baseType == TYP_FLOAT)
+ {
+ result = INS_cmpps;
+ assert(ival != nullptr);
+ *ival = 1;
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ result = INS_cmppd;
+ assert(ival != nullptr);
+ *ival = 1;
+ }
+ break;
+
+ case SIMDIntrinsicLessThanOrEqual:
+        // Packed integers use (a == b) || (b > a) in place of a <= b.
+ assert(baseType != TYP_INT);
+
+ if (baseType == TYP_FLOAT)
+ {
+ result = INS_cmpps;
+ assert(ival != nullptr);
+ *ival = 2;
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ result = INS_cmppd;
+ assert(ival != nullptr);
+ *ival = 2;
+ }
+ break;
+
+ case SIMDIntrinsicGreaterThan:
+ // Packed float/double use < with swapped operands
+ assert(!varTypeIsFloating(baseType));
+
+ // SSE2 supports only signed >
+ if (baseType == TYP_INT)
+ {
+ result = INS_pcmpgtd;
+ }
+ else if (baseType == TYP_SHORT)
+ {
+ result = INS_pcmpgtw;
+ }
+ else if (baseType == TYP_BYTE)
+ {
+ result = INS_pcmpgtb;
+ }
+ else if (compiler->canUseAVX() && (baseType == TYP_LONG))
+ {
+ result = INS_pcmpgtq;
+ }
+ break;
+
+ case SIMDIntrinsicBitwiseAnd:
+ if (baseType == TYP_FLOAT)
+ {
+ result = INS_andps;
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ result = INS_andpd;
+ }
+ else if (varTypeIsIntegral(baseType))
+ {
+ result = INS_pand;
+ }
+ break;
+
+ case SIMDIntrinsicBitwiseAndNot:
+ if (baseType == TYP_FLOAT)
+ {
+ result = INS_andnps;
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ result = INS_andnpd;
+ }
+        else if (varTypeIsIntegral(baseType))
+        {
+            result = INS_pandn;
+        }
+ break;
+
+ case SIMDIntrinsicBitwiseOr:
+ if (baseType == TYP_FLOAT)
+ {
+ result = INS_orps;
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ result = INS_orpd;
+ }
+ else if (varTypeIsIntegral(baseType))
+ {
+ result = INS_por;
+ }
+ break;
+
+ case SIMDIntrinsicBitwiseXor:
+ if (baseType == TYP_FLOAT)
+ {
+ result = INS_xorps;
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ result = INS_xorpd;
+ }
+ else if (varTypeIsIntegral(baseType))
+ {
+ result = INS_pxor;
+ }
+ break;
+
+ case SIMDIntrinsicCast:
+ result = INS_movaps;
+ break;
+
+ case SIMDIntrinsicShiftLeftInternal:
+ // base type doesn't matter since the entire vector is shifted left
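+        // (pslldq/psrldq shift the full 128-bit register by an immediate number of bytes.)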
+ result = INS_pslldq;
+ break;
+
+ case SIMDIntrinsicShiftRightInternal:
+ // base type doesn't matter since the entire vector is shifted right
+ result = INS_psrldq;
+ break;
+
+ case SIMDIntrinsicUpperSave:
+ result = INS_vextractf128;
+ break;
+
+ case SIMDIntrinsicUpperRestore:
+ result = INS_insertps;
+ break;
+
+ default:
+ assert(!"Unsupported SIMD intrinsic");
+ unreached();
+ }
+
+ noway_assert(result != INS_invalid);
+ return result;
+}
+
+// genSIMDScalarMove: Generate code to move a value of type "type" from src xmm reg
+// to target xmm reg, zeroing out the upper bits if and only if specified.
+//
+// Arguments:
+// type the type of value to be moved
+// targetReg the target reg
+// srcReg the src reg
+// zeroInit true if the upper bits of targetReg should be zero'd
+//
+// Return Value:
+// None
+//
+// Notes:
+// This is currently only supported for floating point types.
+//
+void
+CodeGen::genSIMDScalarMove(var_types type, regNumber targetReg, regNumber srcReg, bool zeroInit)
+{
+ var_types targetType = compiler->getSIMDVectorType();
+ assert(varTypeIsFloating(type));
+#ifdef FEATURE_AVX_SUPPORT
+ if (compiler->getSIMDInstructionSet() == InstructionSet_AVX)
+ {
+ if (zeroInit)
+ {
+ // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want.
+            // The insertps immediate selects which fields of the lower 128 bits are copied and which are
+            // zero'd, so we choose to zero all but the lowest element.
+ unsigned int insertpsImm = (INSERTPS_TARGET_SELECT(0)|INSERTPS_ZERO(1)|INSERTPS_ZERO(2)|INSERTPS_ZERO(3));
+ inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm);
+ }
+ else if (srcReg != targetReg)
+ {
+ instruction ins = ins_Store(type);
+ if (getEmitter()->IsThreeOperandMoveAVXInstruction(ins))
+ {
+                // In general, when we use a three-operand move instruction, we merge the src with itself.
+                // This is an exception in that we actually want the "merge" behavior (preserving the
+                // upper bits of targetReg), so we must specify it with all 3 operands.
+ inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(targetType));
+ }
+ else
+ {
+ inst_RV_RV(ins, targetReg, srcReg, targetType, emitTypeSize(targetType));
+ }
+ }
+ }
+ else
+#endif // FEATURE_AVX_SUPPORT
+ {
+ // SSE
+ if (zeroInit)
+ {
+ if (srcReg == targetReg)
+ {
+                // There is no guarantee that the upper bits of srcReg are zero.
+                // We zero them by logically shifting left by 12 bytes and then logically shifting right by 12 bytes.
+ instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, type);
+ getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, type);
+ getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
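+                // Note: the hard-coded 12-byte shift count preserves only the low 4 bytes, so this path
+                // assumes a 4-byte (TYP_FLOAT) element.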
+ }
+ else
+ {
+ instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseXor, type);
+ inst_RV_RV(ins, targetReg, targetReg, targetType, emitTypeSize(targetType));
+ inst_RV_RV(ins_Store(type), targetReg, srcReg);
+ }
+ }
+ else if (srcReg != targetReg)
+ {
+ inst_RV_RV(ins_Store(type), targetReg, srcReg, targetType, emitTypeSize(targetType));
+ }
+ }
+}
+
+//------------------------------------------------------------------------
+// genSIMDIntrinsicInit: Generate code for SIMD Intrinsic Initialize.
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Return Value:
+// None.
+//
+void
+CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode)
+{
+ assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInit);
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ regNumber targetReg = simdNode->gtRegNum;
+ assert(targetReg != REG_NA);
+ var_types targetType = simdNode->TypeGet();
+ InstructionSet iset = compiler->getSIMDInstructionSet();
+ unsigned size = simdNode->gtSIMDSize;
+
+ // Should never see small int base type vectors except for zero initialization.
+ noway_assert(!varTypeIsSmallInt(baseType) || op1->IsZero());
+
+ instruction ins = INS_invalid;
+ if (op1->isContained())
+ {
+ if (op1->IsZero())
+ {
+ // pxor reg, reg
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseXor, baseType);
+ inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
+ }
+ else if ((baseType == TYP_INT && op1->IsCnsIntOrI() && op1->AsIntConCommon()->IconValue() == 0xffffffff) ||
+ (baseType == TYP_LONG && op1->IsCnsIntOrI() && op1->AsIntConCommon()->IconValue() == 0xffffffffffffffffLL))
+ {
+ // case of initializing elements of vector with all 1's
+ // generate pcmpeqd reg, reg
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_INT);
+ inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
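+            // (pcmpeqd with the same source and destination compares each lane with itself, setting
+            // every bit of the register to 1.)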
+ }
+#ifdef FEATURE_AVX_SUPPORT
+ else
+ {
+ assert(iset == InstructionSet_AVX);
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicInit, baseType);
+ if (op1->IsCnsFltOrDbl())
+ {
+ getEmitter()->emitInsBinary(ins, emitTypeSize(targetType), simdNode, op1);
+ }
+ else if (op1->OperIsLocalAddr())
+ {
+ unsigned offset = (op1->OperGet() == GT_LCL_FLD_ADDR) ? op1->gtLclFld.gtLclOffs : 0;
+ getEmitter()->emitIns_R_S(ins, emitTypeSize(targetType), targetReg, op1->gtLclVarCommon.gtLclNum, offset);
+ }
+ else
+ {
+ unreached();
+ }
+ }
+#endif // FEATURE_AVX_SUPPORT
+ }
+ else if (iset == InstructionSet_AVX && ((size == 32) || (size == 16)))
+ {
+ regNumber srcReg = genConsumeReg(op1);
+ if (baseType == TYP_INT || baseType == TYP_UINT ||
+ baseType == TYP_LONG || baseType == TYP_ULONG)
+ {
+ ins = ins_CopyIntToFloat(baseType, TYP_FLOAT);
+ assert(ins != INS_invalid);
+ inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
+ srcReg = targetReg;
+ }
+
+ ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
+ getEmitter()->emitIns_R_R(ins, emitActualTypeSize(targetType), targetReg, srcReg);
+ }
+ else
+ {
+ // If we reach here, op1 is not contained and we are using SSE or it is a SubRegisterSIMDType.
+ // In either case we are going to use the SSE2 shuffle instruction.
+
+ regNumber op1Reg = genConsumeReg(op1);
+ unsigned shuffleControl = 0;
+
+ if (compiler->isSubRegisterSIMDType(simdNode))
+ {
+ assert(baseType == TYP_FLOAT);
+
+            // We cannot assume that the upper bits of op1Reg or targetReg are zero.
+ // Therefore we need to explicitly zero out upper bits. This is
+ // essential for the shuffle operation performed below.
+ //
+ // If op1 is a float/double constant, we would have loaded it from
+ // data section using movss/sd. Similarly if op1 is a memory op we
+ // would have loaded it using movss/sd. Movss/sd when loading a xmm reg
+ // from memory would zero-out upper bits. In these cases we can
+ // avoid explicitly zero'ing out targetReg.
+ bool zeroInitRequired = !(op1->IsCnsFltOrDbl() || op1->isMemoryOp());
+ genSIMDScalarMove(TYP_FLOAT, targetReg, op1Reg, zeroInitRequired);
+
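+            // The shuffle control replicates element 0 across the lower lanes while keeping the zero'd
+            // upper lanes: 0x50 yields (x, x, 0, 0) for Vector2 and 0x40 yields (x, x, x, 0) for Vector3.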
+ if (size == 8)
+ {
+ shuffleControl = 0x50;
+ }
+ else if (size == 12)
+ {
+ shuffleControl = 0x40;
+ }
+ else
+ {
+ noway_assert(!"Unexpected size for SIMD type");
+ }
+ }
+ else // Vector<T>
+ {
+ if (op1Reg != targetReg)
+ {
+ if (varTypeIsFloating(baseType))
+ {
+ ins = ins_Copy(targetType);
+ }
+ else if (baseType == TYP_INT || baseType == TYP_UINT ||
+ baseType == TYP_LONG || baseType == TYP_ULONG)
+ {
+ ins = ins_CopyIntToFloat(baseType, TYP_FLOAT);
+ }
+
+ assert(ins != INS_invalid);
+ inst_RV_RV(ins, targetReg, op1Reg, baseType, emitTypeSize(baseType));
+ }
+ }
+
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicShuffleSSE2, baseType);
+ getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, targetReg, shuffleControl);
+ }
+
+ genProduceReg(simdNode);
+}
+
+//-------------------------------------------------------------------------------------------
+// genSIMDIntrinsicInitN: Generate code for SIMD Intrinsic Initialize for the form that takes
+// a number of arguments equal to the length of the Vector.
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Return Value:
+// None.
+//
+void
+CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode)
+{
+ assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInitN);
+
+ // Right now this intrinsic is supported only on TYP_FLOAT vectors
+ var_types baseType = simdNode->gtSIMDBaseType;
+ noway_assert(baseType == TYP_FLOAT);
+
+ regNumber targetReg = simdNode->gtRegNum;
+ assert(targetReg != REG_NA);
+
+ var_types targetType = simdNode->TypeGet();
+
+    // Note that we cannot use targetReg until we have consumed all the source operands. Therefore,
+    // we need an internal register to stitch together all the values into a single vector
+    // in an XMM reg.
+ assert(simdNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(simdNode->gtRsvdRegs) == 1);
+ regNumber vectorReg = genRegNumFromMask(simdNode->gtRsvdRegs);
+
+    // Zero out vectorReg if we are constructing a vector whose size is not equal to the SIMD vector size.
+    // For example, for Vector4f we don't need to zero it when using SSE2.
+ if (compiler->isSubRegisterSIMDType(simdNode))
+ {
+ instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseXor, baseType);
+ inst_RV_RV(ins, vectorReg, vectorReg, targetType, emitActualTypeSize(targetType));
+ }
+
+ unsigned int offset = 0;
+ unsigned int baseTypeSize = genTypeSize(baseType);
+ instruction insLeftShift = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+ for (GenTree* list = simdNode->gtGetOp1(); list != nullptr; list = list->gtGetOp2())
+ {
+ assert(list->OperGet() == GT_LIST);
+ GenTree* listItem = list->gtGetOp1();
+ assert(listItem->TypeGet() == baseType);
+ assert(!listItem->isContained());
+
+ // The list will have init values in the reverse order. This allows us
+ // to efficiently stitch together a vector as follows:
+ // vectorReg = (vectorReg << offset)
+ // VectorReg[0] = listItemReg
+ //
+ // Use genSIMDScalarMove with zeroInit of false in order to ensure that the upper
+ // bits of vectorReg are not modified.
+ regNumber listItemReg = genConsumeReg(listItem);
+ if (offset != 0)
+ {
+ getEmitter()->emitIns_R_I(insLeftShift, EA_16BYTE, vectorReg, baseTypeSize);
+ }
+        genSIMDScalarMove(baseType, vectorReg, listItemReg, false /* do not zeroInit */);
+
+ offset += baseTypeSize;
+ }
+
+ noway_assert(offset == simdNode->gtSIMDSize);
+
+ // Load the initialized value.
+ if (targetReg != vectorReg)
+ {
+ inst_RV_RV(ins_Copy(targetType), targetReg, vectorReg, targetType, emitActualTypeSize(targetType));
+ }
+ genProduceReg(simdNode);
+}
+
+//----------------------------------------------------------------------------------
+// genSIMDIntrinsicUnOp: Generate code for SIMD Intrinsic unary operations like sqrt.
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Return Value:
+// None.
+//
+void
+CodeGen::genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode)
+{
+ assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSqrt || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicCast);
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ regNumber targetReg = simdNode->gtRegNum;
+ assert(targetReg != REG_NA);
+ var_types targetType = simdNode->TypeGet();
+
+ regNumber op1Reg = genConsumeReg(op1);
+ instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
+ if (simdNode->gtSIMDIntrinsicID != SIMDIntrinsicCast || targetReg != op1Reg)
+ {
+ inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
+ }
+ genProduceReg(simdNode);
+}
+
+//--------------------------------------------------------------------------------
+// genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations
+// add, sub, mul, bit-wise And, AndNot and Or.
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Return Value:
+// None.
+//
+void
+CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
+{
+ assert( simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAdd ||
+ simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSub ||
+ simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul ||
+ simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv ||
+ simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAnd ||
+ simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAndNot ||
+ simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseOr ||
+ simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseXor ||
+ simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMin ||
+ simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMax
+ );
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ GenTree* op2 = simdNode->gtGetOp2();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ regNumber targetReg = simdNode->gtRegNum;
+ assert(targetReg != REG_NA);
+ var_types targetType = simdNode->TypeGet();
+ InstructionSet iset = compiler->getSIMDInstructionSet();
+
+ regNumber op1Reg = genConsumeReg(op1);
+ regNumber op2Reg = genConsumeReg(op2);
+ regNumber otherReg = op2Reg;
+
+ // Vector<Int>.Mul:
+ // SSE2 doesn't have an instruction to perform this operation directly
+ // whereas SSE4.1 does (pmulld). This is special cased and computed
+ // as follows.
+ if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul &&
+ baseType == TYP_INT &&
+ iset == InstructionSet_SSE2)
+ {
+        // We need two additional xmm registers as temps.
+ assert(simdNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(simdNode->gtRsvdRegs) == 2);
+
+ regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
+ regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
+ tmpRegsMask &= ~tmpReg1Mask;
+ regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
+ regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
+
+ // tmpReg1 = op1 >> 4-bytes
+ inst_RV_RV(INS_movaps, tmpReg1, op1Reg, targetType, emitActualTypeSize(targetType));
+ getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), tmpReg1, 4);
+
+ // tmpReg2 = op2 >> 4-bytes
+ inst_RV_RV(INS_movaps, tmpReg2, op2Reg);
+ getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), tmpReg2, 4);
+
+ // tmpReg1 = unsigned double word multiply of tmpReg1 and tmpReg2. Essentially
+ // tmpReg1[63:0] = op1[1] * op2[1]
+        //    tmpReg1[127:64] = op1[3] * op2[3]
+ inst_RV_RV(INS_pmuludq, tmpReg1, tmpReg2, targetType, emitActualTypeSize(targetType));
+
+ // targetReg[63:0] = op1[0] * op2[0]
+ // targetReg[127:64] = op1[2] * op2[2]
+ if (op2Reg == targetReg)
+ {
+ otherReg = op1Reg;
+ }
+ else if (op1Reg != targetReg)
+ {
+ inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
+ }
+ inst_RV_RV(INS_pmuludq, targetReg, otherReg, targetType, emitActualTypeSize(targetType));
+
+ // Extract first and third double word results from tmpReg1
+ // tmpReg2 = shuffle(0,0,2,0) of tmpReg1
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg2, tmpReg1, 0x08);
+
+ // Extract first and third double word results from targetReg
+ // tmpReg1 = shuffle(0,0,2,0) of targetReg
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg1, targetReg, 0x08);
+
+ // pack the results into a single vector
+ inst_RV_RV(INS_movaps, targetReg, tmpReg1, targetType, emitActualTypeSize(targetType));
+ inst_RV_RV(INS_punpckldq, targetReg, tmpReg2, targetType, emitActualTypeSize(targetType));
+ }
+ else
+ {
+ instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
+
+        // Currently AVX doesn't support integer operations.
+        // If the instruction is INS_cvtsi2ss or INS_cvtsi2sd, we won't use the AVX three-operand form.
+ if (op1Reg != targetReg &&
+ compiler->canUseAVX() &&
+ !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) &&
+ getEmitter()->IsThreeOperandAVXInstruction(ins))
+ {
+ inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType));
+ }
+ else
+ {
+ if (op2Reg == targetReg)
+ {
+ otherReg = op1Reg;
+ }
+ else if (op1Reg != targetReg)
+ {
+ inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
+ }
+
+ inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType));
+ }
+ }
+
+ // Vector2/3 div: since the top-most elements will be zero, we end up
+    // performing 0/0, which is a NaN. Therefore, post division we need to set the
+ // top-most elements to zero. This is achieved by left logical shift followed
+ // by right logical shift of targetReg.
+ if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv && (simdNode->gtSIMDSize < 16))
+ {
+ // These are 16 byte operations, so we subtract from 16 bytes, not the vector register length.
+ unsigned shiftCount = 16 - simdNode->gtSIMDSize;
+ assert(shiftCount != 0);
+ instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+ getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+ getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
+ }
+
+ genProduceReg(simdNode);
+}
+
+//--------------------------------------------------------------------------------
+// genSIMDIntrinsicRelOp: Generate code for a SIMD Intrinsic relational operator
+// <, <=, >, >= and ==
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Return Value:
+// None.
+//
+void
+CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
+{
+ GenTree* op1 = simdNode->gtGetOp1();
+ GenTree* op2 = simdNode->gtGetOp2();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ regNumber targetReg = simdNode->gtRegNum;
+ assert(targetReg != REG_NA);
+ var_types targetType = simdNode->TypeGet();
+ InstructionSet iset = compiler->getSIMDInstructionSet();
+
+ regNumber op1Reg = genConsumeReg(op1);
+ regNumber op2Reg = genConsumeReg(op2);
+ regNumber otherReg = op2Reg;
+
+ switch(simdNode->gtSIMDIntrinsicID)
+ {
+ case SIMDIntrinsicEqual:
+ case SIMDIntrinsicGreaterThan:
+ {
+ // SSE2: vector<(u)long> relation op should be implemented in terms of TYP_INT comparison operations
+ assert(((iset == InstructionSet_AVX) || (baseType != TYP_LONG)) &&
+ (baseType != TYP_ULONG));
+
+ // Greater-than: Floating point vectors use "<" with swapped operands
+ if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGreaterThan)
+ {
+ assert(!varTypeIsFloating(baseType));
+ }
+
+ unsigned ival = 0;
+ instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival);
+
+ // targetReg = op1reg > op2reg
+ // Therefore, we can optimize if op1Reg == targetReg
+ otherReg = op2Reg;
+ if (op1Reg != targetReg)
+ {
+ if (op2Reg == targetReg)
+ {
+ assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicEqual);
+ otherReg = op1Reg;
+ }
+ else
+ {
+ inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
+ }
+ }
+
+ if (varTypeIsFloating(baseType))
+ {
+ getEmitter()->emitIns_R_R_I(ins, emitTypeSize(baseType), targetReg, otherReg, ival);
+ }
+ else
+ {
+ inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType));
+ }
+ }
+ break;
+
+ case SIMDIntrinsicLessThan:
+ case SIMDIntrinsicLessThanOrEqual:
+ {
+ // Int vectors use ">" and ">=" with swapped operands
+ assert(varTypeIsFloating(baseType));
+
+ // Get the instruction opcode for compare operation
+ unsigned ival;
+ instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival);
+
+ // targetReg = op1reg RelOp op2reg
+            // Therefore, we can optimize if op1Reg == targetReg
+ if (op1Reg != targetReg)
+ {
+ inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
+ }
+
+ getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, op2Reg, ival);
+ }
+ break;
+
+ // (In)Equality that produces bool result instead of a bit vector
+ case SIMDIntrinsicOpEquality:
+ case SIMDIntrinsicOpInEquality:
+ {
+ assert(genIsValidIntReg(targetReg));
+
+        // We need two additional XMM registers as scratch
+ assert(simdNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(simdNode->gtRsvdRegs) == 2);
+
+ regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
+ regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
+ tmpRegsMask &= ~tmpReg1Mask;
+ regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
+ regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
+ var_types simdType = op1->TypeGet();
+
+ // Here we should consider TYP_SIMD12 operands as if they were TYP_SIMD16
+ // since both the operands will be in XMM registers.
+ if (simdType == TYP_SIMD12)
+ {
+ simdType = TYP_SIMD16;
+ }
+
+ // tmpReg1 = (op1Reg == op2Reg)
+            // We refer to this value of tmpReg1 as 'compResult' in the comments below.
+ regNumber otherReg = op2Reg;
+ if (tmpReg1 != op2Reg)
+ {
+ if (tmpReg1 != op1Reg)
+ {
+ inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType));
+ }
+ }
+ else
+ {
+ otherReg = op1Reg;
+ }
+
+ // For all integer types we can use TYP_INT comparison.
+ unsigned ival = 0;
+ instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival);
+
+ if (varTypeIsFloating(baseType))
+ {
+ getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival);
+ }
+ else
+ {
+ inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
+ }
+
+ // If we have 32 bytes, start by anding the two 16-byte halves to get a 16-byte result.
+ if (compiler->canUseAVX() && (simdType == TYP_SIMD32))
+ {
+                // Reduce tmpReg1 from 256 bits to 128 bits by bitwise-ANDing the lower and upper 128 bits
+ //
+ // Generated code sequence
+ // - vextractf128 tmpReg2, tmpReg1, 0x01
+ // tmpReg2[128..255] <- 0
+ // tmpReg2[0..127] <- tmpReg1[128..255]
+                //  - vandps tmpReg1, tmpReg2
+                // This will zero-out the upper portion of tmpReg1, and the
+                // lower portion of tmpReg1 is the AND of the upper and lower 128-bit comparison results.
+ getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
+ inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
+ }
+            // Next, if we have more than 8 bytes, AND the two 8-byte halves to get an 8-byte result.
+ if (simdType != TYP_DOUBLE)
+ {
+ // tmpReg2 = Shuffle(tmpReg1, (1,0,3,2))
+ // Note: vpshufd is a 128-bit only instruction. Therefore, explicitly pass EA_16BYTE
+ getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x4E);
+
+ // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
+ //
+ // Note that what we have computed is as follows at this point:
+ // tmpReg1[0] = compResult[0] & compResult[2]
+ // tmpReg1[1] = compResult[1] & compResult[3]
+ inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
+ }
+ // At this point, we have either reduced the result to 8 bytes: tmpReg1[0] and tmpReg1[1],
+            // OR we have a Vector2 (TYP_SIMD8, aka TYP_DOUBLE) in tmpReg1, which has only those two fields.
+
+ // tmpReg2 = Shuffle(tmpReg1, (0,0,0,1))
+ // tmpReg2[0] = compResult[1] & compResult[3]
+ getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x1);
+
+ // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
+ // That is tmpReg1[0] = compResult[0] & compResult[1] & compResult[2] & compResult[3]
+ inst_RV_RV(INS_pand, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType)); // ??? INS_andps??
+
+ // targetReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
+            // (Note that for mov_xmm2i, the int register is always in the reg2 position.)
+ inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);
+
+ // Since we need to compute a bool result, targetReg needs to be set to 1 on true and zero on false.
+ // Equality:
+ // cmp targetReg, 0xFFFFFFFF
+ // sete targetReg
+ // movzx targetReg, targetReg
+ //
+ // InEquality:
+ // cmp targetReg, 0xFFFFFFFF
+ // setne targetReg
+ // movzx targetReg, targetReg
+ //
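+            // (Each lane of the packed compare result is all-ones when the elements were equal, so after
+            // ANDing the lanes together the low 32 bits are 0xFFFFFFFF exactly when all elements matched.)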
+ getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, targetReg, 0xFFFFFFFF);
+ inst_RV((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? INS_sete : INS_setne, targetReg, TYP_INT, EA_1BYTE);
+ assert(simdNode->TypeGet() == TYP_INT);
+ // Set the higher bytes to 0
+ inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
+ }
+ break;
+
+ default:
+        noway_assert(!"Unimplemented SIMD relational operation.");
+ unreached();
+ }
+
+ genProduceReg(simdNode);
+}
+
+
+//--------------------------------------------------------------------------------
+// genSIMDIntrinsicDotProduct: Generate code for SIMD Intrinsic Dot Product.
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Return Value:
+// None.
+//
+void
+CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
+{
+ assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDotProduct);
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ GenTree* op2 = simdNode->gtGetOp2();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ var_types simdType = op1->TypeGet();
+ var_types simdEvalType = (simdType == TYP_SIMD12) ? TYP_SIMD16 : simdType;
+ regNumber targetReg = simdNode->gtRegNum;
+ assert(targetReg != REG_NA);
+
+ // DotProduct is only supported on floating point types.
+ var_types targetType = simdNode->TypeGet();
+ assert(targetType == baseType);
+ assert(varTypeIsFloating(baseType));
+
+ regNumber op1Reg = genConsumeReg(op1);
+ regNumber op2Reg = genConsumeReg(op2);
+
+ regNumber tmpReg = REG_NA;
+ // For SSE, or AVX with 32-byte vectors, we need an additional Xmm register as scratch.
+ // However, it must be distinct from targetReg, so we request two from the register allocator.
+ // Note that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
+ if ((compiler->getSIMDInstructionSet() == InstructionSet_SSE2) || (simdEvalType == TYP_SIMD32))
+ {
+ assert(simdNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(simdNode->gtRsvdRegs) == 2);
+
+ regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
+ regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
+ tmpRegsMask &= ~tmpReg1Mask;
+ regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
+ regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
+
+ // Choose any register different from targetReg as tmpReg
+ if (tmpReg1 != targetReg)
+ {
+ tmpReg = tmpReg1;
+ }
+ else
+ {
+ assert(targetReg != tmpReg2);
+ tmpReg = tmpReg2;
+ }
+ assert(tmpReg != REG_NA);
+ assert(tmpReg != targetReg);
+ }
+
+ if (compiler->getSIMDInstructionSet() == InstructionSet_SSE2)
+ {
+ // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg
+ if (op1Reg == targetReg)
+ {
+ // Best case
+ // nothing to do, we have registers in the right place
+ }
+ else if (op2Reg == targetReg)
+ {
+ op2Reg = op1Reg;
+ }
+ else
+ {
+ inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
+ }
+
+ // DotProduct(v1, v2)
+ // Here v0 = targetReg, v1 = op1Reg, v2 = op2Reg and tmp = tmpReg
+ if (baseType == TYP_FLOAT)
+ {
+ // v0 = v1 * v2
+ // tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its position
+ // tmp = shuffle(tmp, tmp, Shuffle(2,3,0,1)) // tmp = (2, 3, 0, 1)
+ // v0 = v0 + tmp // v0 = (3+2, 2+3, 1+0, 0+1)
+ // tmp = v0
+ // tmp = shuffle(tmp, tmp, Shuffle(0,1,2,3)) // tmp = (0+1, 1+0, 2+3, 3+2)
+ // v0 = v0 + tmp // v0 = (0+1+2+3, 0+1+2+3, 0+1+2+3, 0+1+2+3)
+            //                                            // Essentially horizontal addition of all elements.
+            //                                            // We could achieve the same using the SSE3 instruction HADDPS.
+ //
+ inst_RV_RV(INS_mulps, targetReg, op2Reg);
+ inst_RV_RV(INS_movaps, tmpReg, targetReg);
+ inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg, tmpReg, 0xb1);
+ inst_RV_RV(INS_addps, targetReg, tmpReg);
+ inst_RV_RV(INS_movaps, tmpReg, targetReg);
+ inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg, tmpReg, 0x1b);
+ inst_RV_RV(INS_addps, targetReg, tmpReg);
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ // v0 = v1 * v2
+ // tmp = v0 // v0 = (1, 0) - each element is given by its position
+ // tmp = shuffle(tmp, tmp, Shuffle(0,1)) // tmp = (0, 1)
+ // v0 = v0 + tmp // v0 = (1+0, 0+1)
+ inst_RV_RV(INS_mulpd, targetReg, op2Reg);
+ inst_RV_RV(INS_movaps, tmpReg, targetReg);
+ inst_RV_RV_IV(INS_shufpd, EA_16BYTE, tmpReg, tmpReg, 0x01);
+ inst_RV_RV(INS_addpd, targetReg, tmpReg);
+ }
+ else
+ {
+ unreached();
+ }
+ }
+ else
+ {
+ // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg.
+ // Note that this is a duplicate of the code above for SSE, but in the AVX case we can eventually
+ // use the 3-op form, so that we can avoid these copies.
+ // TODO-CQ: Add inst_RV_RV_RV_IV().
+ if (op1Reg == targetReg)
+ {
+ // Best case
+ // nothing to do, we have registers in the right place
+ }
+ else if (op2Reg == targetReg)
+ {
+ op2Reg = op1Reg;
+ }
+ else
+ {
+ inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
+ }
+
+ emitAttr emitSize = emitActualTypeSize(simdEvalType);
+ if (baseType == TYP_FLOAT)
+ {
+ // dpps computes the dot product of the upper & lower halves of the 32-byte register.
+ // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
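+            // The immediate 0xf1 selects all four elements for the multiply/add (high nibble 0xf) and
+            // writes the resulting sum only to element 0 of each 128-bit lane (low nibble 0x1).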
+ inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, 0xf1);
+ // If this is TYP_SIMD32, we need to combine the lower & upper results.
+ if (simdEvalType == TYP_SIMD32)
+ {
+ getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, targetReg, 0x01);
+ inst_RV_RV(INS_addps, targetReg, tmpReg, targetType, emitTypeSize(targetType));
+ }
+ }
+ else if (baseType == TYP_DOUBLE)
+ {
+ // On AVX, we have no 16-byte vectors of double. Note that, if we did, we could use
+ // dppd directly.
+ assert(simdType == TYP_SIMD32);
+
+ // targetReg = targetReg * op2Reg
+ // targetReg = vhaddpd(targetReg, targetReg) ; horizontal sum of lower & upper halves
+            // tmpReg = vextractf128(targetReg, 1)  ; Moves the upper sum into tmpReg
+ // targetReg = targetReg + tmpReg
+ inst_RV_RV(INS_mulpd, targetReg, op2Reg, simdEvalType, emitActualTypeSize(simdType));
+ inst_RV_RV(INS_haddpd, targetReg, targetReg, simdEvalType, emitActualTypeSize(simdType));
+ getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, targetReg, 0x01);
+ inst_RV_RV(INS_addpd, targetReg, tmpReg, targetType, emitTypeSize(targetType));
+ }
+ else
+ {
+ unreached();
+ }
+ }
+
+ genProduceReg(simdNode);
+}
+
+//------------------------------------------------------------------------------------
+// genSIMDIntrinsicGetItem: Generate code for SIMD Intrinsic get element at index i.
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Return Value:
+// None.
+//
+void
+CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
+{
+ assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem);
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ GenTree* op2 = simdNode->gtGetOp2();
+ var_types simdType = op1->TypeGet();
+ assert(varTypeIsSIMD(simdType) || simdType == TYP_DOUBLE);
+
+ // op1 of TYP_SIMD12 should be considered as TYP_SIMD16,
+ // since it is in XMM register.
+ if (simdType == TYP_SIMD12)
+ {
+ simdType = TYP_SIMD16;
+ }
+
+ var_types baseType = simdNode->gtSIMDBaseType;
+ regNumber targetReg = simdNode->gtRegNum;
+ assert(targetReg != REG_NA);
+ var_types targetType = simdNode->TypeGet();
+ assert(targetType == genActualType(baseType));
+
+ // GetItem has 2 operands:
+ // - the source of SIMD type (op1)
+ // - the index of the value to be returned.
+ regNumber srcReg = genConsumeReg(op1);
+
+ // SSE2 doesn't have an instruction to implement this intrinsic if the index is not a constant.
+    // For the non-constant case, we will use the SIMD temp location to store the vector, and
+    // then load the desired element.
+ // The range check will already have been performed, so at this point we know we have an index
+ // within the bounds of the vector.
+ if (!op2->IsCnsIntOrI())
+ {
+ unsigned simdInitTempVarNum = compiler->lvaSIMDInitTempVarNum;
+ noway_assert(simdInitTempVarNum != BAD_VAR_NUM);
+ bool isEBPbased;
+ unsigned offs = compiler->lvaFrameAddress(simdInitTempVarNum, &isEBPbased);
+ regNumber indexReg = genConsumeReg(op2);
+
+ // Store the vector to the temp location.
+ getEmitter()->emitIns_S_R(ins_Store(simdType, compiler->isSIMDTypeLocalAligned(simdInitTempVarNum)),
+ emitTypeSize(simdType), srcReg, simdInitTempVarNum, 0);
+
+ // Now, load the desired element.
+ getEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load
+ emitTypeSize(baseType), // Of the vector baseType
+ targetReg, // To targetReg
+ (isEBPbased) ? REG_EBP : REG_ESP, // Stack-based
+ indexReg, // Indexed
+ genTypeSize(baseType), // by the size of the baseType
+ offs);
+ genProduceReg(simdNode);
+ return;
+ }
+
+ noway_assert(op2->isContained());
+ int byteShiftCnt = (int) op2->gtIntCon.gtIconVal * genTypeSize(baseType);
+
+ // Generate the following sequence:
+ // 1) baseType is floating point
+ // movaps targetReg, srcReg
+ // psrldq targetReg, byteShiftCnt <-- not generated if accessing zero'th element
+ //
+ // 2) baseType is not floating point
+ // movaps tmpReg, srcReg <-- not generated if accessing zero'th element
+ // psrldq tmpReg, byteShiftCnt <-- not generated if accessing zero'th element
+ // mov_xmm2i targetReg, tmpReg
+ if (varTypeIsFloating(baseType))
+ {
+ if (targetReg != srcReg)
+ {
+ inst_RV_RV(ins_Copy(simdType), targetReg, srcReg, simdType, emitActualTypeSize(simdType));
+ }
+
+ if (byteShiftCnt != 0)
+ {
+ instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+ getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), targetReg, byteShiftCnt);
+ }
+ }
+ else
+ {
+ if (varTypeIsSmallInt(baseType))
+ {
+ // Note that pextrw extracts 16-bit value by index and zero extends it to 32-bits.
+ // In case of vector<short> we also need to sign extend the 16-bit value in targetReg
+ // Vector<byte> - index/2 will give the index of the 16-bit value to extract. Shift right
+ // by 8-bits if index is odd. In case of Vector<sbyte> also sign extend targetReg.
+ int index = (int) op2->gtIntCon.gtIconVal;
+ unsigned baseSize = genTypeSize(baseType);
+ if (baseSize == 1)
+ {
+ index /= 2;
+ }
+
+ getEmitter()->emitIns_R_R_I(INS_pextrw, emitTypeSize(TYP_INT), targetReg, srcReg, index);
+
+ bool ZeroOrSignExtnReqd = true;
+ if (baseSize == 1)
+ {
+ if ((op2->gtIntCon.gtIconVal % 2) == 1)
+ {
+                    // Right shift the extracted word by 8 bits if the index is odd, since we are extracting a byte-sized element.
+ inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, targetReg, 8);
+
+ // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_BYTE
+ ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
+ }
+ // else - we just need to zero/sign extend the byte since pextrw extracted 16-bits
+ }
+ else
+ {
+ // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_SHORT
+ assert(baseSize == 2);
+ ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
+ }
+
+ if (ZeroOrSignExtnReqd)
+ {
+ // Zero/sign extend the byte/short to 32-bits
+ inst_RV_RV(ins_Move_Extend(baseType, false), targetReg, targetReg, baseType, emitTypeSize(baseType));
+ }
+ }
+ else
+ {
+ // We need a temp xmm register if the baseType is not floating point and
+ // accessing non-zero'th element.
+ regNumber tmpReg = REG_NA;
+ instruction ins;
+
+ if (byteShiftCnt != 0)
+ {
+ assert(simdNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(simdNode->gtRsvdRegs) == 1);
+ tmpReg = genRegNumFromMask(simdNode->gtRsvdRegs);
+
+ if (tmpReg != srcReg)
+ {
+ inst_RV_RV(ins_Copy(simdType), tmpReg, srcReg, simdType, emitActualTypeSize(simdType));
+ }
+
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+ getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), tmpReg, byteShiftCnt);
+ }
+ else
+ {
+ assert(simdNode->gtRsvdRegs == RBM_NONE);
+ tmpReg = srcReg;
+ }
+
+ assert(tmpReg != REG_NA);
+ ins = ins_CopyFloatToInt(TYP_FLOAT, baseType);
+            // (Note that for mov_xmm2i, the int register is always in the reg2 position.)
+ inst_RV_RV(ins, tmpReg, targetReg, baseType);
+ }
+ }
+
+ genProduceReg(simdNode);
+}
+
+//------------------------------------------------------------------------------------
+// genSIMDIntrinsicSetItem: Generate code for SIMD Intrinsic set element at index i.
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Return Value:
+// None.
+//
+// TODO-CQ: Use SIMDIntrinsicShuffleSSE2 for the SSE2 case.
+//
+void
+CodeGen::genSIMDIntrinsicSetItem(GenTreeSIMD* simdNode)
+{
+ // Determine index based on intrinsic ID
+ int index = -1;
+ switch(simdNode->gtSIMDIntrinsicID)
+ {
+ case SIMDIntrinsicSetX:
+ index = 0;
+ break;
+ case SIMDIntrinsicSetY:
+ index = 1;
+ break;
+ case SIMDIntrinsicSetZ:
+ index = 2;
+ break;
+ case SIMDIntrinsicSetW:
+ index = 3;
+ break;
+
+ default:
+ unreached();
+ }
+ assert(index != -1);
+
+ // op1 is the SIMD vector
+ // op2 is the value to be set
+ GenTree* op1 = simdNode->gtGetOp1();
+ GenTree* op2 = simdNode->gtGetOp2();
+
+ var_types baseType = simdNode->gtSIMDBaseType;
+ regNumber targetReg = simdNode->gtRegNum;
+ assert(targetReg != REG_NA);
+ var_types targetType = simdNode->TypeGet();
+ assert(varTypeIsSIMD(targetType) || targetType == TYP_DOUBLE);
+
+    // The following assert must hold: this intrinsic is supported only on Vector2f/3f/4f right now.
+ noway_assert(baseType == TYP_FLOAT);
+ assert(op2->TypeGet() == baseType);
+ assert(simdNode->gtSIMDSize >= ((index + 1) * genTypeSize(baseType)));
+
+ regNumber op1Reg = genConsumeReg(op1);
+ regNumber op2Reg = genConsumeReg(op2);
+
+ // TODO-CQ: For AVX we don't need to do a copy because it supports 3 operands plus immediate.
+ if (targetReg != op1Reg)
+ {
+ inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
+ }
+
+ // Right now this intrinsic is supported only for float base type vectors.
+    // If in the future we need to support other base type vectors, the
+    // logic below needs modification.
+ noway_assert(baseType == TYP_FLOAT);
+
+ if (compiler->getSIMDInstructionSet() == InstructionSet_SSE2)
+ {
+ // We need one additional int register as scratch
+ assert(simdNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(simdNode->gtRsvdRegs) == 1);
+ regNumber tmpReg = genRegNumFromMask(simdNode->gtRsvdRegs);
+ assert(genIsValidIntReg(tmpReg));
+
+ // Move the value from xmm reg to an int reg
+ instruction ins = ins_CopyFloatToInt(TYP_FLOAT, TYP_INT);
+        // (Note that for mov_xmm2i, the int register is always in the reg2 position.)
+ inst_RV_RV(ins, op2Reg, tmpReg, baseType);
+
+ // First insert the lower 16-bits of tmpReg in targetReg at 2*index position
+ // since every float has two 16-bit words.
+ getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2*index);
+
+ // Logical right shift tmpReg by 16-bits and insert in targetReg at 2*index + 1 position
+ inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, tmpReg, 16);
+ getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2*index+1);
+ }
+ else
+ {
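+        // For example, SetY (index == 1) yields an immediate of 0x10: copy source element 0 into
+        // target element 1 without zeroing any other elements.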
+ unsigned int insertpsImm = (INSERTPS_SOURCE_SELECT(0)|INSERTPS_TARGET_SELECT(index));
+ inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, op2Reg, insertpsImm);
+ }
+
+ genProduceReg(simdNode);
+}
+
+//------------------------------------------------------------------------
+// genSIMDIntrinsicShuffleSSE2: Generate code for SIMD Intrinsic shuffle.
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Return Value:
+// None.
+//
+void
+CodeGen::genSIMDIntrinsicShuffleSSE2(GenTreeSIMD* simdNode)
+{
+ assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicShuffleSSE2);
+ noway_assert(compiler->getSIMDInstructionSet() == InstructionSet_SSE2);
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ GenTree* op2 = simdNode->gtGetOp2();
+ assert(op2->isContained());
+ assert(op2->IsCnsIntOrI());
+ int shuffleControl = (int) op2->AsIntConCommon()->IconValue();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ var_types targetType = simdNode->TypeGet();
+ regNumber targetReg = simdNode->gtRegNum;
+ assert(targetReg != REG_NA);
+
+ regNumber op1Reg = genConsumeReg(op1);
+ if (targetReg != op1Reg)
+ {
+ inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
+ }
+
+ instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
+ getEmitter()->emitIns_R_R_I(ins, emitTypeSize(baseType), targetReg, targetReg, shuffleControl);
+ genProduceReg(simdNode);
+}
+
+//-----------------------------------------------------------------------------
+// genStoreIndTypeSIMD12: store indirect a TYP_SIMD12 (i.e. Vector3) to memory.
+// Since Vector3 is not a hardware supported write size, it is performed
+// as two writes: 8 byte followed by 4-byte.
+//
+// Arguments:
+// treeNode - tree node that is attempting to store indirect
+//
+//
+// Return Value:
+// None.
+//
+void
+CodeGen::genStoreIndTypeSIMD12(GenTree* treeNode)
+{
+ assert(treeNode->OperGet() == GT_STOREIND);
+
+ GenTree* addr = treeNode->gtOp.gtOp1;
+ GenTree* data = treeNode->gtOp.gtOp2;
+
+ // addr and data should not be contained.
+ assert(!data->isContained());
+ assert(!addr->isContained());
+
+#ifdef DEBUG
+ // Should not require a write barrier
+ GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(treeNode, data);
+ assert(writeBarrierForm == GCInfo::WBF_NoBarrier);
+#endif
+
+    // Need an additional Xmm register to extract upper 4 bytes from data.
+ assert(treeNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(treeNode->gtRsvdRegs) == 1);
+ regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
+
+ bool reverseOps = ((treeNode->gtFlags & GTF_REVERSE_OPS) != 0);
+ if (!reverseOps)
+ {
+ genConsumeReg(addr);
+ genConsumeReg(data);
+ }
+ else
+ {
+ genConsumeReg(data);
+ genConsumeReg(addr);
+ }
+
+ // 8-byte write
+ getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, data->gtRegNum, addr->gtRegNum, 0);
+
+ // Extract upper 4-bytes from data
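+    // (pshufd with control 0x02 moves element 2, i.e. the upper 4 bytes of the Vector3, into element 0 of tmpReg.)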
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, data->gtRegNum, 0x02);
+
+ // 4-byte write
+ getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, addr->gtRegNum, 8);
+}
+
+//-----------------------------------------------------------------------------
+// genLoadIndTypeSIMD12: load indirect a TYP_SIMD12 (i.e. Vector3) value.
+//                        Since Vector3 is not a hardware supported read size, it is performed
+//                        as two loads: 8 byte followed by 4-byte.
+//
+// Arguments:
+// treeNode - tree node of GT_IND
+//
+//
+// Return Value:
+// None.
+//
+void
+CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
+{
+ assert(treeNode->OperGet() == GT_IND);
+
+ regNumber targetReg = treeNode->gtRegNum;
+ GenTreePtr op1 = treeNode->gtOp.gtOp1;
+ assert(!op1->isContained());
+ regNumber operandReg = genConsumeReg(op1);
+
+    // Need an additional Xmm register to read upper 4 bytes, which is different from targetReg
+ assert(treeNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(treeNode->gtRsvdRegs) == 2);
+
+ regNumber tmpReg = REG_NA;
+ regMaskTP tmpRegsMask = treeNode->gtRsvdRegs;
+ regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
+ tmpRegsMask &= ~tmpReg1Mask;
+ regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
+ regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
+
+ // Choose any register different from targetReg as tmpReg
+ if (tmpReg1 != targetReg)
+ {
+ tmpReg = tmpReg1;
+ }
+ else
+ {
+ assert(targetReg != tmpReg2);
+ tmpReg = tmpReg2;
+ }
+ assert(tmpReg != REG_NA);
+ assert(tmpReg != targetReg);
+
+ // Load upper 4 bytes in tmpReg
+ getEmitter()->emitIns_R_AR(ins_Load(TYP_FLOAT), EA_4BYTE, tmpReg, operandReg, 8);
+
+ // Load lower 8 bytes in targetReg
+ getEmitter()->emitIns_R_AR(ins_Load(TYP_DOUBLE), EA_8BYTE, targetReg, operandReg, 0);
+
+ // combine upper 4 bytes and lower 8 bytes in targetReg
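+    // (shufps with control 0x44 keeps elements 0-1 of targetReg and copies elements 0-1 of tmpReg into
+    // elements 2-3; the 4th element is a don't-care for TYP_SIMD12.)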
+ getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, 0x44);
+
+ genProduceReg(treeNode);
+}
+
+//-----------------------------------------------------------------------------
+// genStoreLclFldTypeSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
+// Since Vector3 is not a hardware supported write size, it is performed
+// as two stores: 8 byte followed by 4-byte.
+//
+// Arguments:
+// treeNode - tree node that is attempting to store TYP_SIMD12 field
+//
+// Return Value:
+// None.
+//
+void
+CodeGen::genStoreLclFldTypeSIMD12(GenTree* treeNode)
+{
+ assert(treeNode->OperGet() == GT_STORE_LCL_FLD);
+
+ unsigned offs = treeNode->gtLclFld.gtLclOffs;
+ unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
+ assert(varNum < compiler->lvaCount);
+
+ GenTreePtr op1 = treeNode->gtOp.gtOp1;
+ assert(!op1->isContained());
+ regNumber operandReg = genConsumeReg(op1);
+
+    // Need an additional Xmm register to extract upper 4 bytes from data.
+ assert(treeNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(treeNode->gtRsvdRegs) == 1);
+ regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
+
+ // store lower 8 bytes
+ getEmitter()->emitIns_S_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, varNum, offs);
+
+ // Extract upper 4-bytes from operandReg
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);
+
+ // Store upper 4 bytes
+ getEmitter()->emitIns_S_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, varNum, offs+8);
+}
+
+//-----------------------------------------------------------------------------
+// genLoadLclFldTypeSIMD12: load a TYP_SIMD12 (i.e. Vector3) type field.
+//                        Since Vector3 is not a hardware supported read size, it is performed
+// as two reads: 8 byte followed by 4-byte.
+//
+// Arguments:
+// treeNode - tree node that is attempting to load TYP_SIMD12 field
+//
+// Return Value:
+// None.
+//
+void
+CodeGen::genLoadLclFldTypeSIMD12(GenTree* treeNode)
+{
+ assert(treeNode->OperGet() == GT_LCL_FLD);
+
+ regNumber targetReg = treeNode->gtRegNum;
+ unsigned offs = treeNode->gtLclFld.gtLclOffs;
+ unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
+ assert(varNum < compiler->lvaCount);
+
+    // Need an additional Xmm register to read upper 4 bytes
+ assert(treeNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(treeNode->gtRsvdRegs) == 2);
+
+ regNumber tmpReg = REG_NA;
+ regMaskTP tmpRegsMask = treeNode->gtRsvdRegs;
+ regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
+ tmpRegsMask &= ~tmpReg1Mask;
+ regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
+ regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
+
+ // Choose any register different from targetReg as tmpReg
+ if (tmpReg1 != targetReg)
+ {
+ tmpReg = tmpReg1;
+ }
+ else
+ {
+ assert(targetReg != tmpReg2);
+ tmpReg = tmpReg2;
+ }
+ assert(tmpReg != REG_NA);
+ assert(tmpReg != targetReg);
+
+ // Read upper 4 bytes to tmpReg
+ getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_FLOAT, false), EA_4BYTE, tmpReg, varNum, offs+8);
+
+ // Read lower 8 bytes to targetReg
+ getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_DOUBLE, false), EA_8BYTE, targetReg, varNum, offs);
+
+ // combine upper 4 bytes and lower 8 bytes in targetReg
+ getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, 0x44);
+
+ genProduceReg(treeNode);
+}
+
+//-----------------------------------------------------------------------------
+// genSIMDIntrinsicUpperSave: save the upper half of a TYP_SIMD32 vector to
+// the given register, if any, or to memory.
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Return Value:
+// None.
+//
+// Notes:
+// The upper half of all AVX registers is volatile, even the callee-save registers.
+// When a 32-byte SIMD value is live across a call, the register allocator will use this intrinsic
+// to cause the upper half to be saved. It will first attempt to find another, unused, callee-save
+// register. If such a register cannot be found, it will save it to an available caller-save register.
+// In that case, this node will be marked GTF_SPILL, which will cause genProduceReg to save the 16 byte
+// value to the stack. (Note that if there are no caller-save registers available, the entire 32 byte
+// value will be spilled to the stack.)
+//
+void
+CodeGen::genSIMDIntrinsicUpperSave(GenTreeSIMD* simdNode)
+{
+ assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperSave);
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32);
+ regNumber targetReg = simdNode->gtRegNum;
+ regNumber op1Reg = genConsumeReg(op1);
+ assert(op1Reg != REG_NA);
+ assert(targetReg != REG_NA);
+ getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, targetReg, op1Reg, 0x01);
+
+ genProduceReg(simdNode);
+}
+
+//-----------------------------------------------------------------------------
+// genSIMDIntrinsicUpperRestore: Restore the upper half of a TYP_SIMD32 vector to
+// the given register, if any, or to memory.
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Return Value:
+// None.
+//
+// Notes:
+// For consistency with genSIMDIntrinsicUpperSave, and to ensure that lclVar nodes always
+// have their home register, this node has its targetReg on the lclVar child, and its source
+// on the simdNode.
+// Regarding spill, please see the note above on genSIMDIntrinsicUpperSave. If we have spilled
+// an upper-half to a caller save register, this node will be marked GTF_SPILLED. However, unlike
+// most spill scenarios, the saved tree will be different from the restored tree, but the spill
+// restore logic, which is triggered by the call to genConsumeReg, requires us to provide the
+// spilled tree (saveNode) in order to perform the reload. We can easily find that tree,
+// as it is in the spill descriptor for the register from which it was saved.
+//
+void
+CodeGen::genSIMDIntrinsicUpperRestore(GenTreeSIMD* simdNode)
+{
+ assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperRestore);
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32);
+ regNumber srcReg = simdNode->gtRegNum;
+ regNumber lclVarReg = genConsumeReg(op1);
+ unsigned varNum = op1->AsLclVarCommon()->gtLclNum;
+ assert(lclVarReg != REG_NA);
+ assert(srcReg != REG_NA);
+ if (simdNode->gtFlags & GTF_SPILLED)
+ {
+ GenTree* saveNode = regSet.rsSpillDesc[srcReg]->spillTree;
+ noway_assert(saveNode != nullptr && (saveNode->gtRegNum == srcReg));
+ genConsumeReg(saveNode);
+ }
+ getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, lclVarReg, srcReg, 0x01);
+}
+
+//------------------------------------------------------------------------
+// genSIMDIntrinsic: Generate code for a SIMD Intrinsic. This is the main
+// routine which in turn calls the appropriate genSIMDIntrinsicXXX() routine.
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Return Value:
+// None.
+//
+// Notes:
+// Currently, we only recognize SIMDVector<float> and SIMDVector<int>, and
+// a limited set of methods.
+//
+void
+CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
+{
+ // NYI for unsupported base types
+ if (simdNode->gtSIMDBaseType != TYP_INT &&
+ simdNode->gtSIMDBaseType != TYP_LONG &&
+ simdNode->gtSIMDBaseType != TYP_FLOAT &&
+ simdNode->gtSIMDBaseType != TYP_DOUBLE &&
+ simdNode->gtSIMDBaseType != TYP_CHAR &&
+ simdNode->gtSIMDBaseType != TYP_UBYTE &&
+ simdNode->gtSIMDBaseType != TYP_SHORT &&
+ simdNode->gtSIMDBaseType != TYP_BYTE &&
+ simdNode->gtSIMDBaseType != TYP_UINT &&
+ simdNode->gtSIMDBaseType != TYP_ULONG
+ )
+ {
+ noway_assert(!"SIMD intrinsic with unsupported base type.");
+ }
+
+ switch(simdNode->gtSIMDIntrinsicID)
+ {
+ case SIMDIntrinsicInit:
+ genSIMDIntrinsicInit(simdNode);
+ break;
+
+ case SIMDIntrinsicInitN:
+ genSIMDIntrinsicInitN(simdNode);
+ break;
+
+ case SIMDIntrinsicSqrt:
+ case SIMDIntrinsicCast:
+ genSIMDIntrinsicUnOp(simdNode);
+ break;
+
+ case SIMDIntrinsicAdd:
+ case SIMDIntrinsicSub:
+ case SIMDIntrinsicMul:
+ case SIMDIntrinsicDiv:
+ case SIMDIntrinsicBitwiseAnd:
+ case SIMDIntrinsicBitwiseAndNot:
+ case SIMDIntrinsicBitwiseOr:
+ case SIMDIntrinsicBitwiseXor:
+ case SIMDIntrinsicMin:
+ case SIMDIntrinsicMax:
+ genSIMDIntrinsicBinOp(simdNode);
+ break;
+
+ case SIMDIntrinsicOpEquality:
+ case SIMDIntrinsicOpInEquality:
+ case SIMDIntrinsicEqual:
+ case SIMDIntrinsicLessThan:
+ case SIMDIntrinsicGreaterThan:
+ case SIMDIntrinsicLessThanOrEqual:
+ case SIMDIntrinsicGreaterThanOrEqual:
+ genSIMDIntrinsicRelOp(simdNode);
+ break;
+
+ case SIMDIntrinsicDotProduct:
+ genSIMDIntrinsicDotProduct(simdNode);
+ break;
+
+ case SIMDIntrinsicGetItem:
+ genSIMDIntrinsicGetItem(simdNode);
+ break;
+
+ case SIMDIntrinsicShuffleSSE2:
+ genSIMDIntrinsicShuffleSSE2(simdNode);
+ break;
+
+ case SIMDIntrinsicSetX:
+ case SIMDIntrinsicSetY:
+ case SIMDIntrinsicSetZ:
+ case SIMDIntrinsicSetW:
+ genSIMDIntrinsicSetItem(simdNode);
+ break;
+
+ case SIMDIntrinsicUpperSave:
+ genSIMDIntrinsicUpperSave(simdNode);
+ break;
+ case SIMDIntrinsicUpperRestore:
+ genSIMDIntrinsicUpperRestore(simdNode);
+ break;
+
+ default:
+        noway_assert(!"Unimplemented SIMD intrinsic.");
+ unreached();
+ }
+}
+
+#endif // FEATURE_SIMD
+#endif //_TARGET_AMD64_
+#endif // !LEGACY_BACKEND