//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
//

/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX                                                                           XX
XX                        Amd64 SIMD Code Generator                          XX
XX                                                                           XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/

#include "jitpch.h"
#ifdef _MSC_VER
#pragma hdrstop
#endif

#ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator.

#ifdef _TARGET_AMD64_
#include "emit.h"
#include "codegen.h"
#include "lower.h"
#include "gcinfo.h"
#include "gcinfoencoder.h"

#ifdef FEATURE_SIMD

// Instruction immediates

// Insertps:
// - bits 6 and 7 of the immediate indicate which source item to select (0..3)
// - bits 4 and 5 of the immediate indicate which target item to insert into (0..3)
// - bits 0 to 3 of the immediate indicate which target item to zero
#define INSERTPS_SOURCE_SELECT(i)   (i<<6)
#define INSERTPS_TARGET_SELECT(i)   (i<<4)
#define INSERTPS_ZERO(i)            (1<<i)

// getOpForSIMDIntrinsic: return the instruction for the given SIMD intrinsic.
//
// Arguments:
//    intrinsicId  -  SIMD intrinsic Id
//    baseType     -  base type of the SIMD vector
//    ival         -  out param; set if the instruction requires an immediate operand
//
// Return Value:
//    The instruction to be used.
//
instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_types baseType, unsigned *ival /*=nullptr*/)
{
    // Minimal required instruction set is SSE2.
    assert(compiler->canUseSSE2());

    instruction result = INS_invalid;
    switch (intrinsicId)
    {
    case SIMDIntrinsicInit:
        if (compiler->canUseAVX())
        {
            // AVX supports broadcast instructions to populate a YMM reg with a single float/double value from memory.
            // AVX2 supports broadcast instructions to populate a YMM reg with a single value from memory or xmm reg.
            // If we decide to use AVX2 only, we can remove this assert.
            if ((compiler->opts.eeFlags & CORJIT_FLG_USE_AVX2) == 0)
            {
                assert(baseType == TYP_FLOAT || baseType == TYP_DOUBLE);
            }
            switch (baseType)
            {
            case TYP_FLOAT:   result = INS_vbroadcastss; break;
            case TYP_DOUBLE:  result = INS_vbroadcastsd; break;
            case TYP_ULONG:   __fallthrough;
            case TYP_LONG:    result = INS_vpbroadcastq; break;
            case TYP_UINT:    __fallthrough;
            case TYP_INT:     result = INS_vpbroadcastd; break;
            case TYP_CHAR:    __fallthrough;
            case TYP_SHORT:   result = INS_vpbroadcastw; break;
            case TYP_UBYTE:   __fallthrough;
            case TYP_BYTE:    result = INS_vpbroadcastb; break;
            default:          unreached();
            }
            break;
        }
        // For SSE, SIMDIntrinsicInit uses the same instruction as the SIMDIntrinsicShuffleSSE2 intrinsic.
        __fallthrough;

    case SIMDIntrinsicShuffleSSE2:
        if (baseType == TYP_FLOAT)
        {
            result = INS_shufps;
        }
        else if (baseType == TYP_DOUBLE)
        {
            result = INS_shufpd;
        }
        else if (baseType == TYP_INT || baseType == TYP_UINT)
        {
            result = INS_pshufd;
        }
        else if (baseType == TYP_LONG || baseType == TYP_ULONG)
        {
            // We don't have a separate SSE2 instruction and will
            // use the instruction meant for doubles since it is
            // of the same size as a long.
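            // For example, for SIMDIntrinsicInit of a Vector<long> the caller passes a
            // shuffle control of 0, so "shufpd reg, reg, 0" replicates the low 64-bit
            // element into both lanes - exactly what a dedicated 64-bit shuffle would do.
            // (Illustrative; the control byte itself is supplied by the caller.)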
            result = INS_shufpd;
        }
        break;

    case SIMDIntrinsicSqrt:
        if (baseType == TYP_FLOAT)       { result = INS_sqrtps; }
        else if (baseType == TYP_DOUBLE) { result = INS_sqrtpd; }
        else                             { unreached(); }
        break;

    case SIMDIntrinsicAdd:
        if (baseType == TYP_FLOAT)                               { result = INS_addps; }
        else if (baseType == TYP_DOUBLE)                         { result = INS_addpd; }
        else if (baseType == TYP_INT || baseType == TYP_UINT)    { result = INS_paddd; }
        else if (baseType == TYP_CHAR || baseType == TYP_SHORT)  { result = INS_paddw; }
        else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)  { result = INS_paddb; }
        else if (baseType == TYP_LONG || baseType == TYP_ULONG)  { result = INS_paddq; }
        break;

    case SIMDIntrinsicSub:
        if (baseType == TYP_FLOAT)                               { result = INS_subps; }
        else if (baseType == TYP_DOUBLE)                         { result = INS_subpd; }
        else if (baseType == TYP_INT || baseType == TYP_UINT)    { result = INS_psubd; }
        else if (baseType == TYP_CHAR || baseType == TYP_SHORT)  { result = INS_psubw; }
        else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)  { result = INS_psubb; }
        else if (baseType == TYP_LONG || baseType == TYP_ULONG)  { result = INS_psubq; }
        break;

    case SIMDIntrinsicMul:
        if (baseType == TYP_FLOAT)       { result = INS_mulps; }
        else if (baseType == TYP_DOUBLE) { result = INS_mulpd; }
        else if (baseType == TYP_SHORT)  { result = INS_pmullw; }
        else if (compiler->canUseAVX())
        {
            if (baseType == TYP_INT)
            {
                result = INS_pmulld;
            }
        }
        break;

    case SIMDIntrinsicDiv:
        if (baseType == TYP_FLOAT)       { result = INS_divps; }
        else if (baseType == TYP_DOUBLE) { result = INS_divpd; }
        else                             { unreached(); }
        break;

    case SIMDIntrinsicMin:
        if (baseType == TYP_FLOAT)       { result = INS_minps; }
        else if (baseType == TYP_DOUBLE) { result = INS_minpd; }
        else if (baseType == TYP_UBYTE)  { result = INS_pminub; }
        else if (baseType == TYP_SHORT)  { result = INS_pminsw; }
        else                             { unreached(); }
        break;

    case SIMDIntrinsicMax:
        if (baseType == TYP_FLOAT)       { result = INS_maxps; }
        else if (baseType == TYP_DOUBLE) { result = INS_maxpd; }
        else if (baseType == TYP_UBYTE)  { result = INS_pmaxub; }
        else if (baseType == TYP_SHORT)  { result = INS_pmaxsw; }
        else                             { unreached(); }
        break;

    case SIMDIntrinsicEqual:
        if (baseType == TYP_FLOAT)
        {
            result = INS_cmpps;
            assert(ival != nullptr);
            *ival = 0;
        }
        else if (baseType == TYP_DOUBLE)
        {
            result = INS_cmppd;
            assert(ival != nullptr);
            *ival = 0;
        }
        else if (baseType == TYP_INT || baseType == TYP_UINT)    { result = INS_pcmpeqd; }
        else if (baseType == TYP_CHAR || baseType == TYP_SHORT)  { result = INS_pcmpeqw; }
        else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)  { result = INS_pcmpeqb; }
        else if (compiler->canUseAVX() && (baseType == TYP_ULONG || baseType == TYP_LONG))
        {
            result = INS_pcmpeqq;
        }
        break;

    case SIMDIntrinsicLessThan:
        // Packed integers use > with swapped operands
        assert(baseType != TYP_INT);

        if (baseType == TYP_FLOAT)
        {
            result = INS_cmpps;
            assert(ival != nullptr);
            *ival = 1;
        }
        else if (baseType == TYP_DOUBLE)
        {
            result = INS_cmppd;
            assert(ival != nullptr);
            *ival = 1;
        }
        break;

    case SIMDIntrinsicLessThanOrEqual:
        // Packed integers use (a == b) || (b > a) in place of a <= b.
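        // E.g. for a = (1, 5), b = (2, 5): (a == b) = (0, ~0) and (b > a) = (~0, 0);
        // OR'ing gives (~0, ~0), i.e. all-ones lanes exactly where a <= b holds.
        // (Illustrative values; the rewrite itself is done during lowering.)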
        assert(baseType != TYP_INT);

        if (baseType == TYP_FLOAT)
        {
            result = INS_cmpps;
            assert(ival != nullptr);
            *ival = 2;
        }
        else if (baseType == TYP_DOUBLE)
        {
            result = INS_cmppd;
            assert(ival != nullptr);
            *ival = 2;
        }
        break;

    case SIMDIntrinsicGreaterThan:
        // Packed float/double use < with swapped operands
        assert(!varTypeIsFloating(baseType));

        // SSE2 supports only signed >
        if (baseType == TYP_INT)        { result = INS_pcmpgtd; }
        else if (baseType == TYP_SHORT) { result = INS_pcmpgtw; }
        else if (baseType == TYP_BYTE)  { result = INS_pcmpgtb; }
        else if (compiler->canUseAVX() && (baseType == TYP_LONG))
        {
            result = INS_pcmpgtq;
        }
        break;

    case SIMDIntrinsicBitwiseAnd:
        if (baseType == TYP_FLOAT)            { result = INS_andps; }
        else if (baseType == TYP_DOUBLE)      { result = INS_andpd; }
        else if (varTypeIsIntegral(baseType)) { result = INS_pand; }
        break;

    case SIMDIntrinsicBitwiseAndNot:
        if (baseType == TYP_FLOAT)            { result = INS_andnps; }
        else if (baseType == TYP_DOUBLE)      { result = INS_andnpd; }
        else if (varTypeIsIntegral(baseType)) { result = INS_pandn; }
        break;

    case SIMDIntrinsicBitwiseOr:
        if (baseType == TYP_FLOAT)            { result = INS_orps; }
        else if (baseType == TYP_DOUBLE)      { result = INS_orpd; }
        else if (varTypeIsIntegral(baseType)) { result = INS_por; }
        break;

    case SIMDIntrinsicBitwiseXor:
        if (baseType == TYP_FLOAT)            { result = INS_xorps; }
        else if (baseType == TYP_DOUBLE)      { result = INS_xorpd; }
        else if (varTypeIsIntegral(baseType)) { result = INS_pxor; }
        break;

    case SIMDIntrinsicCast:
        result = INS_movaps;
        break;

    case SIMDIntrinsicShiftLeftInternal:
        // base type doesn't matter since the entire vector is shifted left
        result = INS_pslldq;
        break;

    case SIMDIntrinsicShiftRightInternal:
        // base type doesn't matter since the entire vector is shifted right
        result = INS_psrldq;
        break;

    case SIMDIntrinsicUpperSave:
        result = INS_vextractf128;
        break;

    case SIMDIntrinsicUpperRestore:
        result = INS_insertps;
        break;

    default:
        assert(!"Unsupported SIMD intrinsic");
        unreached();
    }

    noway_assert(result != INS_invalid);
    return result;
}

// genSIMDScalarMove: Generate code to move a value of type "type" from src mm reg
// to target mm reg, zeroing out the upper bits if and only if specified.
//
// Arguments:
//    type       the type of value to be moved
//    targetReg  the target reg
//    srcReg     the src reg
//    zeroInit   true if the upper bits of targetReg should be zero'd
//
// Return Value:
//    None
//
// Notes:
//    This is currently only supported for floating point types.
//
void CodeGen::genSIMDScalarMove(var_types type, regNumber targetReg, regNumber srcReg, bool zeroInit)
{
    var_types targetType = compiler->getSIMDVectorType();
    assert(varTypeIsFloating(type));
#ifdef FEATURE_AVX_SUPPORT
    if (compiler->getSIMDInstructionSet() == InstructionSet_AVX)
    {
        if (zeroInit)
        {
            // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want.
            // The insertpsImm selects which fields are copied and zero'd of the lower 128 bits, so we choose
            // to zero all but the lower bits.
            unsigned int insertpsImm = (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3));
            inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm);
        }
        else if (srcReg != targetReg)
        {
            instruction ins = ins_Store(type);
            if (getEmitter()->IsThreeOperandMoveAVXInstruction(ins))
            {
                // In general, when we use a three-operands move instruction, we want to merge the src with itself.
                // This is an exception in that we actually want the "merge" behavior, so we must specify it with
                // all 3 operands.
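                // E.g. "vmovss xmmTgt, xmmTgt, xmmSrc" copies the low float from xmmSrc
                // while preserving bits 32..127 of xmmTgt; the usual idiom of repeating
                // the source ("vmovss xmmTgt, xmmSrc, xmmSrc") would not preserve them.
                // (Illustrative encoding.)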
                inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(targetType));
            }
            else
            {
                inst_RV_RV(ins, targetReg, srcReg, targetType, emitTypeSize(targetType));
            }
        }
    }
    else
#endif // FEATURE_AVX_SUPPORT
    {
        // SSE

        if (zeroInit)
        {
            if (srcReg == targetReg)
            {
                // There is no guarantee that the upper bits of op1Reg are zero.
                // We zero them by doing a left logical shift of 12 bytes
                // followed by a right logical shift of 12 bytes.
                instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, type);
                getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
                ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, type);
                getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
            }
            else
            {
                instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseXor, type);
                inst_RV_RV(ins, targetReg, targetReg, targetType, emitTypeSize(targetType));
                inst_RV_RV(ins_Store(type), targetReg, srcReg);
            }
        }
        else if (srcReg != targetReg)
        {
            inst_RV_RV(ins_Store(type), targetReg, srcReg, targetType, emitTypeSize(targetType));
        }
    }
}

//------------------------------------------------------------------------
// genSIMDIntrinsicInit: Generate code for SIMD Intrinsic Initialize.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
//
void CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode)
{
    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInit);

    GenTree* op1 = simdNode->gtGetOp1();
    var_types baseType = simdNode->gtSIMDBaseType;
    regNumber targetReg = simdNode->gtRegNum;
    assert(targetReg != REG_NA);
    var_types targetType = simdNode->TypeGet();
    InstructionSet iset = compiler->getSIMDInstructionSet();
    unsigned size = simdNode->gtSIMDSize;

    // Should never see small int base type vectors except for zero initialization.
    noway_assert(!varTypeIsSmallInt(baseType) || op1->IsZero());

    instruction ins = INS_invalid;
    if (op1->isContained())
    {
        if (op1->IsZero())
        {
            // pxor reg, reg
            ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseXor, baseType);
            inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
        }
        else if ((baseType == TYP_INT && op1->IsCnsIntOrI() && op1->AsIntConCommon()->IconValue() == 0xffffffff) ||
                 (baseType == TYP_LONG && op1->IsCnsIntOrI() && op1->AsIntConCommon()->IconValue() == 0xffffffffffffffffLL))
        {
            // case of initializing elements of vector with all 1's
            // generate pcmpeqd reg, reg
            ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_INT);
            inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
        }
#ifdef FEATURE_AVX_SUPPORT
        else
        {
            assert(iset == InstructionSet_AVX);
            ins = getOpForSIMDIntrinsic(SIMDIntrinsicInit, baseType);
            if (op1->IsCnsFltOrDbl())
            {
                getEmitter()->emitInsBinary(ins, emitTypeSize(targetType), simdNode, op1);
            }
            else if (op1->OperIsLocalAddr())
            {
                unsigned offset = (op1->OperGet() == GT_LCL_FLD_ADDR) ?
                    op1->gtLclFld.gtLclOffs : 0;
                getEmitter()->emitIns_R_S(ins, emitTypeSize(targetType), targetReg, op1->gtLclVarCommon.gtLclNum, offset);
            }
            else
            {
                unreached();
            }
        }
#endif // FEATURE_AVX_SUPPORT
    }
    else if (iset == InstructionSet_AVX && ((size == 32) || (size == 16)))
    {
        regNumber srcReg = genConsumeReg(op1);
        if (baseType == TYP_INT || baseType == TYP_UINT ||
            baseType == TYP_LONG || baseType == TYP_ULONG)
        {
            ins = ins_CopyIntToFloat(baseType, TYP_FLOAT);
            assert(ins != INS_invalid);
            inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
            srcReg = targetReg;
        }

        ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
        getEmitter()->emitIns_R_R(ins, emitActualTypeSize(targetType), targetReg, srcReg);
    }
    else
    {
        // If we reach here, op1 is not contained and we are using SSE or it is a SubRegisterSIMDType.
        // In either case we are going to use the SSE2 shuffle instruction.
        regNumber op1Reg = genConsumeReg(op1);
        unsigned shuffleControl = 0;

        if (compiler->isSubRegisterSIMDType(simdNode))
        {
            assert(baseType == TYP_FLOAT);

            // We cannot assume that the upper bits of op1Reg or targetReg are zero.
            // Therefore we need to explicitly zero out the upper bits. This is
            // essential for the shuffle operation performed below.
            //
            // If op1 is a float/double constant, we would have loaded it from the
            // data section using movss/sd. Similarly, if op1 is a memory op we
            // would have loaded it using movss/sd. Movss/sd, when loading an xmm reg
            // from memory, zeroes out the upper bits. In these cases we can
            // avoid explicitly zero'ing out targetReg.
            bool zeroInitRequired = !(op1->IsCnsFltOrDbl() || op1->isMemoryOp());
            genSIMDScalarMove(TYP_FLOAT, targetReg, op1Reg, zeroInitRequired);

            if (size == 8)
            {
                shuffleControl = 0x50;
            }
            else if (size == 12)
            {
                shuffleControl = 0x40;
            }
            else
            {
                noway_assert(!"Unexpected size for SIMD type");
            }
        }
        else // Vector<T>
        {
            if (op1Reg != targetReg)
            {
                if (varTypeIsFloating(baseType))
                {
                    ins = ins_Copy(targetType);
                }
                else if (baseType == TYP_INT || baseType == TYP_UINT ||
                         baseType == TYP_LONG || baseType == TYP_ULONG)
                {
                    ins = ins_CopyIntToFloat(baseType, TYP_FLOAT);
                }

                assert(ins != INS_invalid);
                inst_RV_RV(ins, targetReg, op1Reg, baseType, emitTypeSize(baseType));
            }
        }

        ins = getOpForSIMDIntrinsic(SIMDIntrinsicShuffleSSE2, baseType);
        getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, targetReg, shuffleControl);
    }

    genProduceReg(simdNode);
}

//-------------------------------------------------------------------------------------------
// genSIMDIntrinsicInitN: Generate code for SIMD Intrinsic Initialize for the form that takes
//                        a number of arguments equal to the length of the Vector.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
//
void CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode)
{
    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInitN);

    // Right now this intrinsic is supported only on TYP_FLOAT vectors
    var_types baseType = simdNode->gtSIMDBaseType;
    noway_assert(baseType == TYP_FLOAT);

    regNumber targetReg = simdNode->gtRegNum;
    assert(targetReg != REG_NA);
    var_types targetType = simdNode->TypeGet();

    // Note that we cannot use targetReg until all the source operands have been
    // consumed. Therefore we need an internal register to stitch together all
    // the values into a single vector in an XMM reg.
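    // For example (illustrative, not emitted verbatim), Vector4(x, y, z, w) arrives
    // as the list (w, z, y, x) and is stitched together roughly as:
    //    movss  vectorReg, wReg      ; vectorReg = (w, ?, ?, ?)
    //    pslldq vectorReg, 4         ; shift one float left
    //    movss  vectorReg, zReg      ; vectorReg = (z, w, ?, ?)
    //    ... and so on for y and x.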
    assert(simdNode->gtRsvdRegs != RBM_NONE);
    assert(genCountBits(simdNode->gtRsvdRegs) == 1);
    regNumber vectorReg = genRegNumFromMask(simdNode->gtRsvdRegs);

    // Zero out vectorReg if we are constructing a vector whose size is not equal to the SIMD vector size.
    // For example in case of Vector4f we don't need to zero when using SSE2.
    if (compiler->isSubRegisterSIMDType(simdNode))
    {
        instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseXor, baseType);
        inst_RV_RV(ins, vectorReg, vectorReg, targetType, emitActualTypeSize(targetType));
    }

    unsigned int offset = 0;
    unsigned int baseTypeSize = genTypeSize(baseType);
    instruction insLeftShift = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);

    for (GenTree* list = simdNode->gtGetOp1(); list != nullptr; list = list->gtGetOp2())
    {
        assert(list->OperGet() == GT_LIST);
        GenTree* listItem = list->gtGetOp1();
        assert(listItem->TypeGet() == baseType);
        assert(!listItem->isContained());

        // The list will have the init values in reverse order. This allows us
        // to efficiently stitch together a vector as follows:
        // vectorReg = (vectorReg << offset)
        // vectorReg[0] = listItemReg
        //
        // Use genSIMDScalarMove with zeroInit of false in order to ensure that the upper
        // bits of vectorReg are not modified.
        regNumber listItemReg = genConsumeReg(listItem);
        if (offset != 0)
        {
            getEmitter()->emitIns_R_I(insLeftShift, EA_16BYTE, vectorReg, baseTypeSize);
        }
        genSIMDScalarMove(baseType, vectorReg, listItemReg, false /* do not zeroInit */);

        offset += baseTypeSize;
    }

    noway_assert(offset == simdNode->gtSIMDSize);

    // Load the initialized value.
    if (targetReg != vectorReg)
    {
        inst_RV_RV(ins_Copy(targetType), targetReg, vectorReg, targetType, emitActualTypeSize(targetType));
    }
    genProduceReg(simdNode);
}

//----------------------------------------------------------------------------------
// genSIMDIntrinsicUnOp: Generate code for SIMD Intrinsic unary operations like sqrt.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
//
void CodeGen::genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode)
{
    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSqrt ||
           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicCast);

    GenTree* op1 = simdNode->gtGetOp1();
    var_types baseType = simdNode->gtSIMDBaseType;
    regNumber targetReg = simdNode->gtRegNum;
    assert(targetReg != REG_NA);
    var_types targetType = simdNode->TypeGet();

    regNumber op1Reg = genConsumeReg(op1);
    instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
    if (simdNode->gtSIMDIntrinsicID != SIMDIntrinsicCast || targetReg != op1Reg)
    {
        inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
    }
    genProduceReg(simdNode);
}

//--------------------------------------------------------------------------------
// genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations
// add, sub, mul, bit-wise And, AndNot and Or.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
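//
// Notes:
//    Vector<int> multiply on plain SSE2 has no pmulld; the body below emulates it
//    with two pmuludq multiplies plus pshufd/punpckldq to recombine the even/odd
//    products (see the step-by-step comments in the code).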
//
void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
{
    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAdd ||
           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSub ||
           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul ||
           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv ||
           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAnd ||
           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAndNot ||
           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseOr ||
           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseXor ||
           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMin ||
           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMax);

    GenTree* op1 = simdNode->gtGetOp1();
    GenTree* op2 = simdNode->gtGetOp2();
    var_types baseType = simdNode->gtSIMDBaseType;
    regNumber targetReg = simdNode->gtRegNum;
    assert(targetReg != REG_NA);
    var_types targetType = simdNode->TypeGet();
    InstructionSet iset = compiler->getSIMDInstructionSet();

    regNumber op1Reg = genConsumeReg(op1);
    regNumber op2Reg = genConsumeReg(op2);
    regNumber otherReg = op2Reg;

    // Vector<Int>.Mul:
    // SSE2 doesn't have an instruction to perform this operation directly
    // whereas SSE4.1 does (pmulld). This is special cased and computed
    // as follows.
    if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul &&
        baseType == TYP_INT &&
        iset == InstructionSet_SSE2)
    {
        // We need two additional xmm registers as temps.
        assert(simdNode->gtRsvdRegs != RBM_NONE);
        assert(genCountBits(simdNode->gtRsvdRegs) == 2);

        regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
        regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
        tmpRegsMask &= ~tmpReg1Mask;
        regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
        regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);

        // tmpReg1 = op1 >> 4-bytes
        inst_RV_RV(INS_movaps, tmpReg1, op1Reg, targetType, emitActualTypeSize(targetType));
        getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), tmpReg1, 4);

        // tmpReg2 = op2 >> 4-bytes
        inst_RV_RV(INS_movaps, tmpReg2, op2Reg);
        getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), tmpReg2, 4);

        // tmpReg1 = unsigned double word multiply of tmpReg1 and tmpReg2. Essentially
        // tmpReg1[63:0]   = op1[1] * op2[1]
        // tmpReg1[127:64] = op1[3] * op2[3]
        inst_RV_RV(INS_pmuludq, tmpReg1, tmpReg2, targetType, emitActualTypeSize(targetType));

        // targetReg[63:0]   = op1[0] * op2[0]
        // targetReg[127:64] = op1[2] * op2[2]
        if (op2Reg == targetReg)
        {
            otherReg = op1Reg;
        }
        else if (op1Reg != targetReg)
        {
            inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
        }
        inst_RV_RV(INS_pmuludq, targetReg, otherReg, targetType, emitActualTypeSize(targetType));

        // Extract first and third double word results from tmpReg1
        // tmpReg2 = shuffle(0,0,2,0) of tmpReg1
        getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg2, tmpReg1, 0x08);

        // Extract first and third double word results from targetReg
        // tmpReg1 = shuffle(0,0,2,0) of targetReg
        getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg1, targetReg, 0x08);

        // pack the results into a single vector
        inst_RV_RV(INS_movaps, targetReg, tmpReg1, targetType, emitActualTypeSize(targetType));
        inst_RV_RV(INS_punpckldq, targetReg, tmpReg2, targetType, emitActualTypeSize(targetType));
    }
    else
    {
        instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);

        // Currently AVX doesn't support integer vectors, so if the ins is INS_cvtsi2ss or
        // INS_cvtsi2sd, we won't use the AVX three-operand form.
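        // E.g. in the three-operand form "vaddps ymm0, ymm1, ymm2" the destination is
        // written without clobbering either source, so the "copy op1 into targetReg
        // first" sequence below is only needed for two-operand SSE encodings.
        // (Illustrative instruction choice.)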
        if (op1Reg != targetReg &&
            compiler->canUseAVX() &&
            !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) &&
            getEmitter()->IsThreeOperandAVXInstruction(ins))
        {
            inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType));
        }
        else
        {
            if (op2Reg == targetReg)
            {
                otherReg = op1Reg;
            }
            else if (op1Reg != targetReg)
            {
                inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
            }

            inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType));
        }
    }

    // Vector2/3 div: since the top-most elements will be zero, we end up
    // performing 0/0 which is NaN. Therefore, post division we need to set the
    // top-most elements to zero. This is achieved by a left logical shift followed
    // by a right logical shift of targetReg.
    if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv && (simdNode->gtSIMDSize < 16))
    {
        // These are 16 byte operations, so we subtract from 16 bytes, not the vector register length.
        unsigned shiftCount = 16 - simdNode->gtSIMDSize;
        assert(shiftCount != 0);
        instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
        getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
        ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
        getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
    }

    genProduceReg(simdNode);
}

//--------------------------------------------------------------------------------
// genSIMDIntrinsicRelOp: Generate code for a SIMD Intrinsic relational operator
// <, <=, >, >= and ==
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
//
void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
{
    GenTree* op1 = simdNode->gtGetOp1();
    GenTree* op2 = simdNode->gtGetOp2();
    var_types baseType = simdNode->gtSIMDBaseType;
    regNumber targetReg = simdNode->gtRegNum;
    assert(targetReg != REG_NA);
    var_types targetType = simdNode->TypeGet();
    InstructionSet iset = compiler->getSIMDInstructionSet();

    regNumber op1Reg = genConsumeReg(op1);
    regNumber op2Reg = genConsumeReg(op2);
    regNumber otherReg = op2Reg;

    switch (simdNode->gtSIMDIntrinsicID)
    {
    case SIMDIntrinsicEqual:
    case SIMDIntrinsicGreaterThan:
        {
            // SSE2: vector<(u)long> relational ops should be implemented in terms of TYP_INT comparison operations
            assert(((iset == InstructionSet_AVX) || (baseType != TYP_LONG)) && (baseType != TYP_ULONG));

            // Greater-than: Floating point vectors use "<" with swapped operands
            if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGreaterThan)
            {
                assert(!varTypeIsFloating(baseType));
            }

            unsigned ival = 0;
            instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival);

            // targetReg = op1reg > op2reg
            // Therefore, we can optimize if op1Reg == targetReg
            otherReg = op2Reg;
            if (op1Reg != targetReg)
            {
                if (op2Reg == targetReg)
                {
                    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicEqual);
                    otherReg = op1Reg;
                }
                else
                {
                    inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
                }
            }

            if (varTypeIsFloating(baseType))
            {
                getEmitter()->emitIns_R_R_I(ins, emitTypeSize(baseType), targetReg, otherReg, ival);
            }
            else
            {
                inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType));
            }
        }
        break;

    case SIMDIntrinsicLessThan:
    case SIMDIntrinsicLessThanOrEqual:
        {
            // Int vectors use ">" and ">=" with swapped operands
            assert(varTypeIsFloating(baseType));

            // Get the instruction opcode for compare operation
            unsigned ival;
            instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival);

            // targetReg = op1reg RelOp op2reg
            // Therefore, we can optimize if op1Reg == targetReg
            if (op1Reg != targetReg)
            {
                inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
            }

            getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, op2Reg, ival);
        }
        break;

    // (In)Equality that produces a bool result instead of a bit vector
    case SIMDIntrinsicOpEquality:
    case SIMDIntrinsicOpInEquality:
        {
            assert(genIsValidIntReg(targetReg));

            // We need two additional XMM registers as scratch
            assert(simdNode->gtRsvdRegs != RBM_NONE);
            assert(genCountBits(simdNode->gtRsvdRegs) == 2);

            regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
            regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
            tmpRegsMask &= ~tmpReg1Mask;
            regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
            regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
            var_types simdType = op1->TypeGet();

            // Here we should consider TYP_SIMD12 operands as if they were TYP_SIMD16
            // since both the operands will be in XMM registers.
            if (simdType == TYP_SIMD12)
            {
                simdType = TYP_SIMD16;
            }

            // tmpReg1 = (op1Reg == op2Reg)
            // Call this value of tmpReg1 'compResult' for further reference below.
            regNumber otherReg = op2Reg;
            if (tmpReg1 != op2Reg)
            {
                if (tmpReg1 != op1Reg)
                {
                    inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType));
                }
            }
            else
            {
                otherReg = op1Reg;
            }

            // For all integer types we can use TYP_INT comparison.
            unsigned ival = 0;
            instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival);

            if (varTypeIsFloating(baseType))
            {
                getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival);
            }
            else
            {
                inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
            }

            // If we have 32 bytes, start by ANDing the two 16-byte halves to get a 16-byte result.
            if (compiler->canUseAVX() && (simdType == TYP_SIMD32))
            {
                // Reduce tmpReg1 from 256 bits to 128 bits by bitwise-ANDing the lower and upper 128 bits
                //
                // Generated code sequence
                // - vextractf128 tmpReg2, tmpReg1, 0x01
                //       tmpReg2[128..255] <- 0
                //       tmpReg2[0..127]   <- tmpReg1[128..255]
                // - vandps tmpReg1, tmpReg2
                //       This will zero out the upper portion of tmpReg1, and the lower portion
                //       of tmpReg1 is the AND of the upper and lower 128-bit comparison results.
                getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
                inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
            }

            // Next, if we have more than 8 bytes, AND the two 8-byte halves to get an 8-byte result.
            if (simdType != TYP_DOUBLE)
            {
                // tmpReg2 = Shuffle(tmpReg1, (1,0,3,2))
                // Note: vpshufd is a 128-bit only instruction. Therefore, explicitly pass EA_16BYTE
                getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x4E);

                // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
                //
                // Note that what we have computed is as follows at this point:
                // tmpReg1[0] = compResult[0] & compResult[2]
                // tmpReg1[1] = compResult[1] & compResult[3]
                inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
            }

            // At this point, we have either reduced the result to 8 bytes: tmpReg1[0] and tmpReg1[1],
            // OR we have a Vector2 (TYP_SIMD8 aka TYP_DOUBLE) in tmpReg1, which has only those two fields.
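            // Worked example (illustrative): for Vector4 operands (1, 2, 3, 4) and
            // (1, 2, 0, 4), compResult = (~0, ~0, 0, ~0). The 8-byte reduction above
            // leaves tmpReg1[0] = ~0 & 0 = 0 and tmpReg1[1] = ~0 & ~0 = ~0, so the
            // final shuffle/AND below yields 0 and the equality test correctly fails.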
            // tmpReg2 = Shuffle(tmpReg1, (0,0,0,1))
            // tmpReg2[0] = compResult[1] & compResult[3]
            getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x1);

            // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
            // That is tmpReg1[0] = compResult[0] & compResult[1] & compResult[2] & compResult[3]
            inst_RV_RV(INS_pand, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType)); // ??? INS_andps??

            // targetReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
            // (Note that for mov_xmm2i, the int register is always in the reg2 position.)
            inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);

            // Since we need to compute a bool result, targetReg needs to be set to 1 on true and zero on false.
            // Equality:
            //   cmp targetReg, 0xFFFFFFFF
            //   sete targetReg
            //   movzx targetReg, targetReg
            //
            // InEquality:
            //   cmp targetReg, 0xFFFFFFFF
            //   setne targetReg
            //   movzx targetReg, targetReg
            //
            getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, targetReg, 0xFFFFFFFF);
            inst_RV((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? INS_sete : INS_setne, targetReg, TYP_INT, EA_1BYTE);
            assert(simdNode->TypeGet() == TYP_INT);
            // Set the higher bytes to 0
            inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
        }
        break;

    default:
        noway_assert(!"Unimplemented SIMD relational operation.");
        unreached();
    }

    genProduceReg(simdNode);
}

//--------------------------------------------------------------------------------
// genSIMDIntrinsicDotProduct: Generate code for SIMD Intrinsic Dot Product.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
//
void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
{
    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDotProduct);

    GenTree* op1 = simdNode->gtGetOp1();
    GenTree* op2 = simdNode->gtGetOp2();
    var_types baseType = simdNode->gtSIMDBaseType;
    var_types simdType = op1->TypeGet();
    var_types simdEvalType = (simdType == TYP_SIMD12) ? TYP_SIMD16 : simdType;
    regNumber targetReg = simdNode->gtRegNum;
    assert(targetReg != REG_NA);

    // DotProduct is only supported on floating point types.
    var_types targetType = simdNode->TypeGet();
    assert(targetType == baseType);
    assert(varTypeIsFloating(baseType));

    regNumber op1Reg = genConsumeReg(op1);
    regNumber op2Reg = genConsumeReg(op2);
    regNumber tmpReg = REG_NA;

    // For SSE, or AVX with 32-byte vectors, we need an additional Xmm register as scratch.
    // However, it must be distinct from targetReg, so we request two from the register allocator.
    // Note that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
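    // (For example, the SSE2 float path below needs tmpReg for its shufps/addps
    // horizontal-add reduction, and the AVX TYP_SIMD32 path uses it to hold the
    // upper 128-bit half extracted by vextractf128.)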
    if ((compiler->getSIMDInstructionSet() == InstructionSet_SSE2) || (simdEvalType == TYP_SIMD32))
    {
        assert(simdNode->gtRsvdRegs != RBM_NONE);
        assert(genCountBits(simdNode->gtRsvdRegs) == 2);

        regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
        regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
        tmpRegsMask &= ~tmpReg1Mask;
        regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
        regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);

        // Choose any register different from targetReg as tmpReg
        if (tmpReg1 != targetReg)
        {
            tmpReg = tmpReg1;
        }
        else
        {
            assert(targetReg != tmpReg2);
            tmpReg = tmpReg2;
        }
        assert(tmpReg != REG_NA);
        assert(tmpReg != targetReg);
    }

    if (compiler->getSIMDInstructionSet() == InstructionSet_SSE2)
    {
        // We avoid a reg move if either op1Reg == targetReg or op2Reg == targetReg
        if (op1Reg == targetReg)
        {
            // Best case
            // nothing to do, we have registers in the right place
        }
        else if (op2Reg == targetReg)
        {
            op2Reg = op1Reg;
        }
        else
        {
            inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
        }

        // DotProduct(v1, v2)
        // Here v0 = targetReg, v1 = op1Reg, v2 = op2Reg and tmp = tmpReg
        if (baseType == TYP_FLOAT)
        {
            // v0 = v1 * v2
            // tmp = v0                                     // v0  = (3, 2, 1, 0) - each element is given by its position
            // tmp = shuffle(tmp, tmp, Shuffle(2,3,0,1))    // tmp = (2, 3, 0, 1)
            // v0 = v0 + tmp                                // v0  = (3+2, 2+3, 1+0, 0+1)
            // tmp = v0
            // tmp = shuffle(tmp, tmp, Shuffle(0,1,2,3))    // tmp = (0+1, 1+0, 2+3, 3+2)
            // v0 = v0 + tmp                                // v0  = (0+1+2+3, 0+1+2+3, 0+1+2+3, 0+1+2+3)
            //
            // Essentially a horizontal addition of all elements.
            //
            // We could achieve the same using the SSE3 instruction HADDPS.
            //
            inst_RV_RV(INS_mulps, targetReg, op2Reg);
            inst_RV_RV(INS_movaps, tmpReg, targetReg);
            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg, tmpReg, 0xb1);
            inst_RV_RV(INS_addps, targetReg, tmpReg);
            inst_RV_RV(INS_movaps, tmpReg, targetReg);
            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg, tmpReg, 0x1b);
            inst_RV_RV(INS_addps, targetReg, tmpReg);
        }
        else if (baseType == TYP_DOUBLE)
        {
            // v0 = v1 * v2
            // tmp = v0                                     // v0  = (1, 0) - each element is given by its position
            // tmp = shuffle(tmp, tmp, Shuffle(0,1))        // tmp = (0, 1)
            // v0 = v0 + tmp                                // v0  = (1+0, 0+1)
            inst_RV_RV(INS_mulpd, targetReg, op2Reg);
            inst_RV_RV(INS_movaps, tmpReg, targetReg);
            inst_RV_RV_IV(INS_shufpd, EA_16BYTE, tmpReg, tmpReg, 0x01);
            inst_RV_RV(INS_addpd, targetReg, tmpReg);
        }
        else
        {
            unreached();
        }
    }
    else
    {
        // We avoid a reg move if either op1Reg == targetReg or op2Reg == targetReg.
        // Note that this is a duplicate of the code above for SSE, but in the AVX case we can eventually
        // use the 3-op form, so that we can avoid these copies.
        // TODO-CQ: Add inst_RV_RV_RV_IV().
        if (op1Reg == targetReg)
        {
            // Best case
            // nothing to do, we have registers in the right place
        }
        else if (op2Reg == targetReg)
        {
            op2Reg = op1Reg;
        }
        else
        {
            inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
        }

        emitAttr emitSize = emitActualTypeSize(simdEvalType);
        if (baseType == TYP_FLOAT)
        {
            // dpps computes the dot product of the upper & lower halves of the 32-byte register.
            // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
            inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, 0xf1);
            // If this is TYP_SIMD32, we need to combine the lower & upper results.
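            // (In the 0xf1 immediate, the high nibble 0xf selects all four lanes for
            // the multiply and the low nibble 0x1 stores the sum only into lane 0 -
            // an illustrative decoding of the dpps control byte.)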
            if (simdEvalType == TYP_SIMD32)
            {
                getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, targetReg, 0x01);
                inst_RV_RV(INS_addps, targetReg, tmpReg, targetType, emitTypeSize(targetType));
            }
        }
        else if (baseType == TYP_DOUBLE)
        {
            // On AVX, we have no 16-byte vectors of double. Note that, if we did, we could use
            // dppd directly.
            assert(simdType == TYP_SIMD32);

            // targetReg = targetReg * op2Reg
            // targetReg = vhaddpd(targetReg, targetReg)    ; horizontal sum of lower & upper halves
            // tmpReg    = vextractf128(targetReg, 1)       ; moves the upper sum into tmpReg
            // targetReg = targetReg + tmpReg
            inst_RV_RV(INS_mulpd, targetReg, op2Reg, simdEvalType, emitActualTypeSize(simdType));
            inst_RV_RV(INS_haddpd, targetReg, targetReg, simdEvalType, emitActualTypeSize(simdType));
            getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, targetReg, 0x01);
            inst_RV_RV(INS_addpd, targetReg, tmpReg, targetType, emitTypeSize(targetType));
        }
        else
        {
            unreached();
        }
    }

    genProduceReg(simdNode);
}

//------------------------------------------------------------------------------------
// genSIMDIntrinsicGetItem: Generate code for SIMD Intrinsic get element at index i.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
//
void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
{
    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem);

    GenTree* op1 = simdNode->gtGetOp1();
    GenTree* op2 = simdNode->gtGetOp2();
    var_types simdType = op1->TypeGet();
    assert(varTypeIsSIMD(simdType) || simdType == TYP_DOUBLE);

    // op1 of TYP_SIMD12 should be considered as TYP_SIMD16,
    // since it is in an XMM register.
    if (simdType == TYP_SIMD12)
    {
        simdType = TYP_SIMD16;
    }

    var_types baseType = simdNode->gtSIMDBaseType;
    regNumber targetReg = simdNode->gtRegNum;
    assert(targetReg != REG_NA);
    var_types targetType = simdNode->TypeGet();
    assert(targetType == genActualType(baseType));

    // GetItem has 2 operands:
    // - the source of SIMD type (op1)
    // - the index of the value to be returned.
    regNumber srcReg = genConsumeReg(op1);

    // SSE2 doesn't have an instruction to implement this intrinsic if the index is not a constant.
    // For the non-constant case, we will use the SIMD temp location to store the vector, and
    // then load the desired element.
    // The range check will already have been performed, so at this point we know we have an index
    // within the bounds of the vector.
    if (!op2->IsCnsIntOrI())
    {
        unsigned simdInitTempVarNum = compiler->lvaSIMDInitTempVarNum;
        noway_assert(simdInitTempVarNum != BAD_VAR_NUM);
        bool isEBPbased;
        unsigned offs = compiler->lvaFrameAddress(simdInitTempVarNum, &isEBPbased);
        regNumber indexReg = genConsumeReg(op2);

        // Store the vector to the temp location.
        getEmitter()->emitIns_S_R(ins_Store(simdType, compiler->isSIMDTypeLocalAligned(simdInitTempVarNum)),
                                  emitTypeSize(simdType), srcReg, simdInitTempVarNum, 0);

        // Now, load the desired element.
        getEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false),  // Load
                                    emitTypeSize(baseType),            // Of the vector baseType
                                    targetReg,                         // To targetReg
                                    (isEBPbased) ?
                                        REG_EBP : REG_ESP,             // Stack-based
                                    indexReg,                          // Indexed
                                    genTypeSize(baseType),             // by the size of the baseType
                                    offs);
        genProduceReg(simdNode);
        return;
    }

    noway_assert(op2->isContained());
    int byteShiftCnt = (int) op2->gtIntCon.gtIconVal * genTypeSize(baseType);

    // Generate the following sequence:
    // 1) baseType is floating point
    //   movaps targetReg, srcReg
    //   psrldq targetReg, byteShiftCnt  <-- not generated if accessing zero'th element
    //
    // 2) baseType is not floating point
    //   movaps tmpReg, srcReg           <-- not generated if accessing zero'th element
    //   psrldq tmpReg, byteShiftCnt     <-- not generated if accessing zero'th element
    //   mov_xmm2i targetReg, tmpReg
    if (varTypeIsFloating(baseType))
    {
        if (targetReg != srcReg)
        {
            inst_RV_RV(ins_Copy(simdType), targetReg, srcReg, simdType, emitActualTypeSize(simdType));
        }

        if (byteShiftCnt != 0)
        {
            instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
            getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), targetReg, byteShiftCnt);
        }
    }
    else
    {
        if (varTypeIsSmallInt(baseType))
        {
            // Note that pextrw extracts a 16-bit value by index and zero extends it to 32-bits.
            // In case of Vector<short> we also need to sign extend the 16-bit value in targetReg.
            // For Vector<byte>/Vector<sbyte>, index/2 will give the index of the 16-bit value to
            // extract. Shift right by 8-bits if the index is odd. In case of Vector<sbyte>, also
            // sign extend targetReg.
            int index = (int) op2->gtIntCon.gtIconVal;
            unsigned baseSize = genTypeSize(baseType);
            if (baseSize == 1)
            {
                index /= 2;
            }

            getEmitter()->emitIns_R_R_I(INS_pextrw, emitTypeSize(TYP_INT), targetReg, srcReg, index);

            bool ZeroOrSignExtnReqd = true;
            if (baseSize == 1)
            {
                if ((op2->gtIntCon.gtIconVal % 2) == 1)
                {
                    // Right shift the extracted word by 8-bits if the index is odd,
                    // since we are extracting a byte sized element.
                    inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, targetReg, 8);

                    // Since pextrw zero extends to 32-bits, we need sign extension in case of TYP_BYTE
                    ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
                }
                // else - we just need to zero/sign extend the byte since pextrw extracted 16-bits
            }
            else
            {
                // Since pextrw zero extends to 32-bits, we need sign extension in case of TYP_SHORT
                assert(baseSize == 2);
                ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
            }

            if (ZeroOrSignExtnReqd)
            {
                // Zero/sign extend the byte/short to 32-bits
                inst_RV_RV(ins_Move_Extend(baseType, false), targetReg, targetReg, baseType, emitTypeSize(baseType));
            }
        }
        else
        {
            // We need a temp xmm register if the baseType is not floating point and
            // we are accessing a non-zero'th element.
            regNumber tmpReg = REG_NA;
            instruction ins;

            if (byteShiftCnt != 0)
            {
                assert(simdNode->gtRsvdRegs != RBM_NONE);
                assert(genCountBits(simdNode->gtRsvdRegs) == 1);

                tmpReg = genRegNumFromMask(simdNode->gtRsvdRegs);
                if (tmpReg != srcReg)
                {
                    inst_RV_RV(ins_Copy(simdType), tmpReg, srcReg, simdType, emitActualTypeSize(simdType));
                }

                ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
                getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), tmpReg, byteShiftCnt);
            }
            else
            {
                assert(simdNode->gtRsvdRegs == RBM_NONE);
                tmpReg = srcReg;
            }

            assert(tmpReg != REG_NA);
            ins = ins_CopyFloatToInt(TYP_FLOAT, baseType);
            // (Note that for mov_xmm2i, the int register is always in the reg2 position.)
            inst_RV_RV(ins, tmpReg, targetReg, baseType);
        }
    }

    genProduceReg(simdNode);
}

//------------------------------------------------------------------------------------
// genSIMDIntrinsicSetItem: Generate code for SIMD Intrinsic set element at index i.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
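//
// Notes:
//    On SSE2 the float to be inserted is first moved to a GPR and then inserted
//    16 bits at a time via pinsrw; e.g. for SetY (index 1) the two pinsrw
//    immediates are 2*1 and 2*1+1, covering the two 16-bit halves of element 1.
//    With SSE4.1/AVX a single insertps does the job.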
//
// TODO-CQ: Use SIMDIntrinsicShuffleSSE2 for the SSE2 case.
//
void CodeGen::genSIMDIntrinsicSetItem(GenTreeSIMD* simdNode)
{
    // Determine index based on intrinsic ID
    int index = -1;
    switch (simdNode->gtSIMDIntrinsicID)
    {
    case SIMDIntrinsicSetX:
        index = 0;
        break;
    case SIMDIntrinsicSetY:
        index = 1;
        break;
    case SIMDIntrinsicSetZ:
        index = 2;
        break;
    case SIMDIntrinsicSetW:
        index = 3;
        break;

    default:
        unreached();
    }
    assert(index != -1);

    // op1 is the SIMD vector
    // op2 is the value to be set
    GenTree* op1 = simdNode->gtGetOp1();
    GenTree* op2 = simdNode->gtGetOp2();

    var_types baseType = simdNode->gtSIMDBaseType;
    regNumber targetReg = simdNode->gtRegNum;
    assert(targetReg != REG_NA);
    var_types targetType = simdNode->TypeGet();
    assert(varTypeIsSIMD(targetType) || targetType == TYP_DOUBLE);

    // the following assert must hold.
    // supported only on vector2f/3f/4f right now
    noway_assert(baseType == TYP_FLOAT);
    assert(op2->TypeGet() == baseType);
    assert(simdNode->gtSIMDSize >= ((index + 1) * genTypeSize(baseType)));

    regNumber op1Reg = genConsumeReg(op1);
    regNumber op2Reg = genConsumeReg(op2);

    // TODO-CQ: For AVX we don't need to do a copy because it supports 3 operands plus immediate.
    if (targetReg != op1Reg)
    {
        inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
    }

    // Right now this intrinsic is supported only for float base type vectors.
    // If in future we need to support other base type vectors, the below
    // logic needs modification.
    noway_assert(baseType == TYP_FLOAT);
    if (compiler->getSIMDInstructionSet() == InstructionSet_SSE2)
    {
        // We need one additional int register as scratch
        assert(simdNode->gtRsvdRegs != RBM_NONE);
        assert(genCountBits(simdNode->gtRsvdRegs) == 1);
        regNumber tmpReg = genRegNumFromMask(simdNode->gtRsvdRegs);
        assert(genIsValidIntReg(tmpReg));

        // Move the value from xmm reg to an int reg
        instruction ins = ins_CopyFloatToInt(TYP_FLOAT, TYP_INT);
        // (Note that for mov_xmm2i, the int register is always in the reg2 position.)
        inst_RV_RV(ins, op2Reg, tmpReg, baseType);

        // First insert the lower 16-bits of tmpReg in targetReg at 2*index position
        // since every float has two 16-bit words.
        getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2*index);

        // Logical right shift tmpReg by 16-bits and insert in targetReg at 2*index + 1 position
        inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, tmpReg, 16);
        getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2*index+1);
    }
    else
    {
        unsigned int insertpsImm = (INSERTPS_SOURCE_SELECT(0) | INSERTPS_TARGET_SELECT(index));
        inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, op2Reg, insertpsImm);
    }

    genProduceReg(simdNode);
}

//------------------------------------------------------------------------
// genSIMDIntrinsicShuffleSSE2: Generate code for SIMD Intrinsic shuffle.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
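//
// Notes:
//    The contained op2 supplies the shuffle control byte; e.g. 0x1B (fields (0,1,2,3)
//    read from high bits to low) reverses the four elements of a Vector4f.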
//
void CodeGen::genSIMDIntrinsicShuffleSSE2(GenTreeSIMD* simdNode)
{
    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicShuffleSSE2);
    noway_assert(compiler->getSIMDInstructionSet() == InstructionSet_SSE2);

    GenTree* op1 = simdNode->gtGetOp1();
    GenTree* op2 = simdNode->gtGetOp2();
    assert(op2->isContained());
    assert(op2->IsCnsIntOrI());
    int shuffleControl = (int) op2->AsIntConCommon()->IconValue();
    var_types baseType = simdNode->gtSIMDBaseType;
    var_types targetType = simdNode->TypeGet();
    regNumber targetReg = simdNode->gtRegNum;
    assert(targetReg != REG_NA);

    regNumber op1Reg = genConsumeReg(op1);
    if (targetReg != op1Reg)
    {
        inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
    }

    instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
    getEmitter()->emitIns_R_R_I(ins, emitTypeSize(baseType), targetReg, targetReg, shuffleControl);

    genProduceReg(simdNode);
}

//-----------------------------------------------------------------------------
// genStoreIndTypeSIMD12: store indirect a TYP_SIMD12 (i.e. Vector3) to memory.
// Since Vector3 is not a hardware supported write size, it is performed
// as two writes: 8 byte followed by 4-byte.
//
// Arguments:
//    treeNode - tree node that is attempting to store indirect
//
// Return Value:
//    None.
//
void CodeGen::genStoreIndTypeSIMD12(GenTree* treeNode)
{
    assert(treeNode->OperGet() == GT_STOREIND);

    GenTree* addr = treeNode->gtOp.gtOp1;
    GenTree* data = treeNode->gtOp.gtOp2;

    // addr and data should not be contained.
    assert(!data->isContained());
    assert(!addr->isContained());

#ifdef DEBUG
    // Should not require a write barrier
    GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(treeNode, data);
    assert(writeBarrierForm == GCInfo::WBF_NoBarrier);
#endif

    // Need an additional Xmm register to extract upper 4 bytes from data.
    assert(treeNode->gtRsvdRegs != RBM_NONE);
    assert(genCountBits(treeNode->gtRsvdRegs) == 1);
    regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);

    bool reverseOps = ((treeNode->gtFlags & GTF_REVERSE_OPS) != 0);
    if (!reverseOps)
    {
        genConsumeReg(addr);
        genConsumeReg(data);
    }
    else
    {
        genConsumeReg(data);
        genConsumeReg(addr);
    }

    // 8-byte write
    getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, data->gtRegNum, addr->gtRegNum, 0);

    // Extract upper 4-bytes from data
    getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, data->gtRegNum, 0x02);

    // 4-byte write
    getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, addr->gtRegNum, 8);
}

//-----------------------------------------------------------------------------
// genLoadIndTypeSIMD12: load indirect a TYP_SIMD12 (i.e. Vector3) value.
// Since Vector3 is not a hardware supported read size, it is performed
// as two loads: 8 byte followed by 4-byte.
//
// Arguments:
//    treeNode - tree node of GT_IND
//
// Return Value:
//    None.
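//
// Notes:
//    The two loads are recombined with "shufps targetReg, tmpReg, 0x44": control 0x44
//    selects (t0, t1, s0, s1), so the 8 loaded bytes land in elements 0-1 and the
//    4-byte value in element 2 (element 3 is a don't-care). (Illustrative decoding.)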
//
void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
{
    assert(treeNode->OperGet() == GT_IND);

    regNumber targetReg = treeNode->gtRegNum;
    GenTreePtr op1 = treeNode->gtOp.gtOp1;
    assert(!op1->isContained());
    regNumber operandReg = genConsumeReg(op1);

    // Need an additional Xmm register to read upper 4 bytes, which is different from targetReg
    assert(treeNode->gtRsvdRegs != RBM_NONE);
    assert(genCountBits(treeNode->gtRsvdRegs) == 2);

    regNumber tmpReg = REG_NA;
    regMaskTP tmpRegsMask = treeNode->gtRsvdRegs;
    regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
    tmpRegsMask &= ~tmpReg1Mask;
    regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
    regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);

    // Choose any register different from targetReg as tmpReg
    if (tmpReg1 != targetReg)
    {
        tmpReg = tmpReg1;
    }
    else
    {
        assert(targetReg != tmpReg2);
        tmpReg = tmpReg2;
    }
    assert(tmpReg != REG_NA);
    assert(tmpReg != targetReg);

    // Load upper 4 bytes in tmpReg
    getEmitter()->emitIns_R_AR(ins_Load(TYP_FLOAT), EA_4BYTE, tmpReg, operandReg, 8);

    // Load lower 8 bytes in targetReg
    getEmitter()->emitIns_R_AR(ins_Load(TYP_DOUBLE), EA_8BYTE, targetReg, operandReg, 0);

    // combine upper 4 bytes and lower 8 bytes in targetReg
    getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, 0x44);

    genProduceReg(treeNode);
}

//-----------------------------------------------------------------------------
// genStoreLclFldTypeSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
// Since Vector3 is not a hardware supported write size, it is performed
// as two stores: 8 byte followed by 4-byte.
//
// Arguments:
//    treeNode - tree node that is attempting to store TYP_SIMD12 field
//
// Return Value:
//    None.
//
void CodeGen::genStoreLclFldTypeSIMD12(GenTree* treeNode)
{
    assert(treeNode->OperGet() == GT_STORE_LCL_FLD);

    unsigned offs = treeNode->gtLclFld.gtLclOffs;
    unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
    assert(varNum < compiler->lvaCount);

    GenTreePtr op1 = treeNode->gtOp.gtOp1;
    assert(!op1->isContained());
    regNumber operandReg = genConsumeReg(op1);

    // Need an additional Xmm register to extract upper 4 bytes from data.
    assert(treeNode->gtRsvdRegs != RBM_NONE);
    assert(genCountBits(treeNode->gtRsvdRegs) == 1);
    regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);

    // store lower 8 bytes
    getEmitter()->emitIns_S_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, varNum, offs);

    // Extract upper 4-bytes from operandReg
    getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);

    // Store upper 4 bytes
    getEmitter()->emitIns_S_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, varNum, offs+8);
}

//-----------------------------------------------------------------------------
// genLoadLclFldTypeSIMD12: load a TYP_SIMD12 (i.e. Vector3) type field.
// Since Vector3 is not a hardware supported read size, it is performed
// as two reads: 8 byte followed by 4-byte.
//
// Arguments:
//    treeNode - tree node that is attempting to load TYP_SIMD12 field
//
// Return Value:
//    None.
//
void CodeGen::genLoadLclFldTypeSIMD12(GenTree* treeNode)
{
    assert(treeNode->OperGet() == GT_LCL_FLD);

    regNumber targetReg = treeNode->gtRegNum;
    unsigned offs = treeNode->gtLclFld.gtLclOffs;
    unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
    assert(varNum < compiler->lvaCount);

    // Need an additional Xmm register to read upper 4 bytes
    assert(treeNode->gtRsvdRegs != RBM_NONE);
    assert(genCountBits(treeNode->gtRsvdRegs) == 2);

    regNumber tmpReg = REG_NA;
    regMaskTP tmpRegsMask = treeNode->gtRsvdRegs;
    regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
    tmpRegsMask &= ~tmpReg1Mask;
    regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
    regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);

    // Choose any register different from targetReg as tmpReg
    if (tmpReg1 != targetReg)
    {
        tmpReg = tmpReg1;
    }
    else
    {
        assert(targetReg != tmpReg2);
        tmpReg = tmpReg2;
    }
    assert(tmpReg != REG_NA);
    assert(tmpReg != targetReg);

    // Read upper 4 bytes to tmpReg
    getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_FLOAT, false), EA_4BYTE, tmpReg, varNum, offs+8);

    // Read lower 8 bytes to targetReg
    getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_DOUBLE, false), EA_8BYTE, targetReg, varNum, offs);

    // combine upper 4 bytes and lower 8 bytes in targetReg
    getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, 0x44);

    genProduceReg(treeNode);
}

//-----------------------------------------------------------------------------
// genSIMDIntrinsicUpperSave: save the upper half of a TYP_SIMD32 vector to
//                            the given register, if any, or to memory.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
//
// Notes:
//    The upper half of all AVX registers is volatile, even the callee-save registers.
//    When a 32-byte SIMD value is live across a call, the register allocator will use this intrinsic
//    to cause the upper half to be saved. It will first attempt to find another, unused, callee-save
//    register. If such a register cannot be found, it will save it to an available caller-save register.
//    In that case, this node will be marked GTF_SPILL, which will cause genProduceReg to save the 16 byte
//    value to the stack. (Note that if there are no caller-save registers available, the entire 32 byte
//    value will be spilled to the stack.)
//
void CodeGen::genSIMDIntrinsicUpperSave(GenTreeSIMD* simdNode)
{
    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperSave);

    GenTree* op1 = simdNode->gtGetOp1();
    assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32);
    regNumber targetReg = simdNode->gtRegNum;
    regNumber op1Reg = genConsumeReg(op1);
    assert(op1Reg != REG_NA);
    assert(targetReg != REG_NA);
    getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, targetReg, op1Reg, 0x01);

    genProduceReg(simdNode);
}

//-----------------------------------------------------------------------------
// genSIMDIntrinsicUpperRestore: Restore the upper half of a TYP_SIMD32 vector to
//                               the given register, if any, or to memory.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
//
// Notes:
//    For consistency with genSIMDIntrinsicUpperSave, and to ensure that lclVar nodes always
//    have their home register, this node has its targetReg on the lclVar child, and its source
//    on the simdNode.
//    Regarding spill, please see the note above on genSIMDIntrinsicUpperSave. If we have spilled
//    an upper-half to a caller save register, this node will be marked GTF_SPILLED.
//    However, unlike most spill scenarios, the saved tree will be different from the restored tree,
//    but the spill restore logic, which is triggered by the call to genConsumeReg, requires us to
//    provide the spilled tree (saveNode) in order to perform the reload. We can easily find that
//    tree, as it is in the spill descriptor for the register from which it was saved.
//
void CodeGen::genSIMDIntrinsicUpperRestore(GenTreeSIMD* simdNode)
{
    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperRestore);

    GenTree* op1 = simdNode->gtGetOp1();
    assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32);
    regNumber srcReg = simdNode->gtRegNum;
    regNumber lclVarReg = genConsumeReg(op1);
    unsigned varNum = op1->AsLclVarCommon()->gtLclNum;
    assert(lclVarReg != REG_NA);
    assert(srcReg != REG_NA);
    if (simdNode->gtFlags & GTF_SPILLED)
    {
        GenTree* saveNode = regSet.rsSpillDesc[srcReg]->spillTree;
        noway_assert(saveNode != nullptr && (saveNode->gtRegNum == srcReg));
        genConsumeReg(saveNode);
    }
    getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, lclVarReg, srcReg, 0x01);
}

//------------------------------------------------------------------------
// genSIMDIntrinsic: Generate code for a SIMD Intrinsic. This is the main
// routine which in turn calls the appropriate genSIMDIntrinsicXXX() routine.
//
// Arguments:
//    simdNode - The GT_SIMD node
//
// Return Value:
//    None.
//
// Notes:
//    Currently, we only recognize SIMDVector<float> and SIMDVector<int>, and
//    a limited set of methods.
//
void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
{
    // NYI for unsupported base types
    if (simdNode->gtSIMDBaseType != TYP_INT &&
        simdNode->gtSIMDBaseType != TYP_LONG &&
        simdNode->gtSIMDBaseType != TYP_FLOAT &&
        simdNode->gtSIMDBaseType != TYP_DOUBLE &&
        simdNode->gtSIMDBaseType != TYP_CHAR &&
        simdNode->gtSIMDBaseType != TYP_UBYTE &&
        simdNode->gtSIMDBaseType != TYP_SHORT &&
        simdNode->gtSIMDBaseType != TYP_BYTE &&
        simdNode->gtSIMDBaseType != TYP_UINT &&
        simdNode->gtSIMDBaseType != TYP_ULONG)
    {
        noway_assert(!"SIMD intrinsic with unsupported base type.");
    }

    switch (simdNode->gtSIMDIntrinsicID)
    {
    case SIMDIntrinsicInit:
        genSIMDIntrinsicInit(simdNode);
        break;

    case SIMDIntrinsicInitN:
        genSIMDIntrinsicInitN(simdNode);
        break;

    case SIMDIntrinsicSqrt:
    case SIMDIntrinsicCast:
        genSIMDIntrinsicUnOp(simdNode);
        break;

    case SIMDIntrinsicAdd:
    case SIMDIntrinsicSub:
    case SIMDIntrinsicMul:
    case SIMDIntrinsicDiv:
    case SIMDIntrinsicBitwiseAnd:
    case SIMDIntrinsicBitwiseAndNot:
    case SIMDIntrinsicBitwiseOr:
    case SIMDIntrinsicBitwiseXor:
    case SIMDIntrinsicMin:
    case SIMDIntrinsicMax:
        genSIMDIntrinsicBinOp(simdNode);
        break;

    case SIMDIntrinsicOpEquality:
    case SIMDIntrinsicOpInEquality:
    case SIMDIntrinsicEqual:
    case SIMDIntrinsicLessThan:
    case SIMDIntrinsicGreaterThan:
    case SIMDIntrinsicLessThanOrEqual:
    case SIMDIntrinsicGreaterThanOrEqual:
        genSIMDIntrinsicRelOp(simdNode);
        break;

    case SIMDIntrinsicDotProduct:
        genSIMDIntrinsicDotProduct(simdNode);
        break;

    case SIMDIntrinsicGetItem:
        genSIMDIntrinsicGetItem(simdNode);
        break;

    case SIMDIntrinsicShuffleSSE2:
        genSIMDIntrinsicShuffleSSE2(simdNode);
        break;

    case SIMDIntrinsicSetX:
    case SIMDIntrinsicSetY:
    case SIMDIntrinsicSetZ:
    case SIMDIntrinsicSetW:
        genSIMDIntrinsicSetItem(simdNode);
        break;

    case SIMDIntrinsicUpperSave:
        genSIMDIntrinsicUpperSave(simdNode);
        break;

    case SIMDIntrinsicUpperRestore:
        genSIMDIntrinsicUpperRestore(simdNode);
        break;

    default:
        noway_assert(!"Unimplemented SIMD intrinsic.");
        unreached();
    }
}

#endif // FEATURE_SIMD
#endif // _TARGET_AMD64_
#endif // !LEGACY_BACKEND