Author:    Bruce Forstall <brucefo@microsoft.com>  2017-05-19 10:24:18 -0700
Committer: GitHub <noreply@github.com>  2017-05-19 10:24:18 -0700
Commit:    7a755987c3ceee405a1457df4bff9ac6f4218eda
Tree:      ddb6ca75e8e08cd7481a9f9431c971c6cbab259f
Parents:   61722c9b0b1eb4d49432b8ba77934d5a1dcfb600
           965d5eea7f9b2aac3e6bd5a2f0061b926b7c5f8c
Merge pull request #10662 from helloguo/VectorConversion
Add JIT intrinsics support for vector conversion on AMD64 and x86
-rw-r--r--  src/jit/codegenlinear.h                      11
-rw-r--r--  src/jit/emitfmtsxarch.h                       2
-rw-r--r--  src/jit/emitxarch.cpp                       141
-rw-r--r--  src/jit/emitxarch.h                           2
-rw-r--r--  src/jit/instrsxarch.h                        40
-rw-r--r--  src/jit/lsraxarch.cpp                        84
-rw-r--r--  src/jit/simd.cpp                             59
-rw-r--r--  src/jit/simd.h                                4
-rw-r--r--  src/jit/simdcodegenxarch.cpp                861
-rw-r--r--  src/jit/simdintrinsiclist.h                  23
-rw-r--r--  tests/src/JIT/SIMD/VectorConvert.cs          15
-rw-r--r--  tests/src/JIT/SIMD/VectorConvert_r.csproj     3
-rw-r--r--  tests/src/JIT/SIMD/VectorConvert_ro.csproj    3
13 files changed, 1200 insertions(+), 48 deletions(-)
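
For orientation: the change maps the System.Numerics Vector<T> conversion, Narrow, and Widen operations onto SSE2/SSE4/AVX instruction sequences. Per element, the conversions follow ordinary C truncating semantics; a minimal scalar model (illustrative names, not code from this change):

    #include <cstdint>

    float   i32_to_f32(int32_t v) { return (float)v;   } // cvtdq2ps, per element
    int32_t f32_to_i32(float v)   { return (int32_t)v; } // cvttps2dq, truncating
    double  i64_to_f64(int64_t v) { return (double)v;  } // cvtsi2sd, one element at a time
    int64_t f64_to_i64(double v)  { return (int64_t)v; } // cvttsd2si, one element at a time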
diff --git a/src/jit/codegenlinear.h b/src/jit/codegenlinear.h
index 3bd0eacf0d..5cead6d66a 100644
--- a/src/jit/codegenlinear.h
+++ b/src/jit/codegenlinear.h
@@ -80,6 +80,17 @@ void genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode);
void genSIMDIntrinsicShuffleSSE2(GenTreeSIMD* simdNode);
void genSIMDIntrinsicUpperSave(GenTreeSIMD* simdNode);
void genSIMDIntrinsicUpperRestore(GenTreeSIMD* simdNode);
+void genSIMDLo64BitConvert(SIMDIntrinsicID intrinsicID,
+ var_types simdType,
+ var_types baseType,
+ regNumber tmpReg,
+ regNumber tmpIntReg,
+ regNumber targetReg);
+void genSIMDIntrinsic32BitConvert(GenTreeSIMD* simdNode);
+void genSIMDIntrinsic64BitConvert(GenTreeSIMD* simdNode);
+void genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode);
+void genSIMDExtractUpperHalf(GenTreeSIMD* simdNode, regNumber srcReg, regNumber tgtReg);
+void genSIMDIntrinsicWiden(GenTreeSIMD* simdNode);
void genSIMDIntrinsic(GenTreeSIMD* simdNode);
void genSIMDCheck(GenTree* treeNode);
diff --git a/src/jit/emitfmtsxarch.h b/src/jit/emitfmtsxarch.h
index 49afcb5c8b..6d15fcf22f 100644
--- a/src/jit/emitfmtsxarch.h
+++ b/src/jit/emitfmtsxarch.h
@@ -109,7 +109,7 @@ IF_DEF(RRW_RRW, IS_R1_RW|IS_R2_RW, NONE) // r/w reg , r/w reg2
IF_DEF(RRW_RRW_CNS, IS_R1_RW|IS_R2_RW, SCNS) // r/w reg , r/w reg2 , const
IF_DEF(RWR_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD, NONE) // write reg , read reg2 , read reg3
-
+IF_DEF(RWR_RRD_RRD_CNS, IS_R1_WR|IS_R2_RD|IS_R3_RD, SCNS) // write reg , read reg2 , read reg3, const
//----------------------------------------------------------------------------
// The following formats are used for direct addresses (e.g. static data members)
//----------------------------------------------------------------------------
diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp
index 86140696c6..9f43202cc7 100644
--- a/src/jit/emitxarch.cpp
+++ b/src/jit/emitxarch.cpp
@@ -94,7 +94,10 @@ bool emitter::IsThreeOperandBinaryAVXInstruction(instruction ins)
ins == INS_vinsertf128 || ins == INS_punpckldq || ins == INS_phaddd || ins == INS_pminub ||
ins == INS_pminsw || ins == INS_pminsb || ins == INS_pminsd || ins == INS_pminuw || ins == INS_pminud ||
ins == INS_pmaxub || ins == INS_pmaxsw || ins == INS_pmaxsb || ins == INS_pmaxsd || ins == INS_pmaxuw ||
- ins == INS_pmaxud);
+ ins == INS_pmaxud || ins == INS_vinserti128 || ins == INS_punpckhbw || ins == INS_punpcklbw ||
+ ins == INS_punpckhqdq || ins == INS_punpcklqdq || ins == INS_punpckhwd || ins == INS_punpcklwd ||
+ ins == INS_punpckhdq || ins == INS_packssdw || ins == INS_packsswb || ins == INS_packuswb ||
+ ins == INS_packusdw || ins == INS_vperm2i128);
}
// Returns true if the AVX instruction is a move operator that requires 3 operands.
@@ -105,8 +108,8 @@ bool emitter::IsThreeOperandBinaryAVXInstruction(instruction ins)
// to indicate whether a 3-operand instruction.
bool emitter::IsThreeOperandMoveAVXInstruction(instruction ins)
{
- return IsAVXInstruction(ins) &&
- (ins == INS_movlpd || ins == INS_movlps || ins == INS_movhpd || ins == INS_movhps || ins == INS_movss);
+ return IsAVXInstruction(ins) && (ins == INS_movlpd || ins == INS_movlps || ins == INS_movhpd || ins == INS_movhps ||
+ ins == INS_movss || ins == INS_movlhps);
}
// ------------------------------------------------------------------------------
@@ -206,6 +209,14 @@ emitter::code_t emitter::AddVexPrefix(instruction ins, code_t code, emitAttr attr)
// Returns true if this instruction, for the given EA_SIZE(attr), will require a REX.W prefix
bool TakesRexWPrefix(instruction ins, emitAttr attr)
{
+ // The current AVX encoding implementation has no way to distinguish the register size
+ // specification (128 vs. 256 bits) from the operand size specification (32 vs. 64 bits).
+ // Where both are required, the instruction must be created with the register size attribute
+ // (EA_16BYTE or EA_32BYTE), and here we must special-case such instructions by opcode.
+ if (ins == INS_vpermq)
+ {
+ return true;
+ }
#ifdef _TARGET_AMD64_
// movsx should always sign extend out to 8 bytes just because we don't track
// whether the dest should be 4 bytes or 8 bytes (attr indicates the size
@@ -342,7 +353,6 @@ unsigned RegEncoding(regNumber reg)
// AVX: specific bits within VEX prefix need to be set in bit-inverted form.
emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)
{
-#ifdef _TARGET_AMD64_
if (UseAVX() && IsAVXInstruction(ins))
{
// W-bit is available only in 3-byte VEX prefix that starts with byte C4.
@@ -351,7 +361,7 @@ emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)
// W-bit is the only bit that is added in non bit-inverted form.
return code | 0x00008000000000ULL;
}
-
+#ifdef _TARGET_AMD64_
return code | 0x4800000000ULL;
#else
assert(!"UNREACHED");
@@ -3810,6 +3820,40 @@ void emitter::emitIns_R_R_R(instruction ins, emitAttr attr, regNumber targetReg,
emitCurIGsize += sz;
}
+/**********************************************************************************
+* emitIns_R_R_R_I: Add an instruction with three register operands and an immediate.
+*
+* Arguments:
+* ins - the instruction to add
+* attr - the emitter attribute for the instruction
+* targetReg - the target (destination) register
+* reg1 - the first source register
+* reg2 - the second source register
+* ival - the immediate value
+*/
+
+void emitter::emitIns_R_R_R_I(
+ instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, int ival)
+{
+ assert(IsSSEOrAVXInstruction(ins));
+ assert(IsThreeOperandAVXInstruction(ins));
+ // Currently the VEX prefix is only emitted in its three-byte form.
+ // size = vex + opcode + ModR/M + 1-byte-cns = 3 + 1 + 1 + 1 = 6
+ // TODO-XArch-CQ: In the future we should add a function that can compute the size of any AVX instruction.
+ UNATIVE_OFFSET sz = 6;
+
+ instrDesc* id = emitNewInstrCns(attr, ival);
+ id->idIns(ins);
+ id->idInsFmt(IF_RWR_RRD_RRD_CNS);
+ id->idReg1(targetReg);
+ id->idReg2(reg1);
+ id->idReg3(reg2);
+
+ id->idCodeSize(sz);
+ dispIns(id);
+ emitCurIGsize += sz;
+}
+
#endif
/*****************************************************************************
*
@@ -7001,6 +7045,15 @@ void emitter::emitDispIns(
printf("%s, ", emitRegName(id->idReg2(), attr));
printf("%s", emitRegName(id->idReg3(), attr));
break;
+ case IF_RWR_RRD_RRD_CNS:
+ assert(IsAVXInstruction(ins));
+ assert(IsThreeOperandAVXInstruction(ins));
+ printf("%s, ", emitRegName(id->idReg1(), attr));
+ printf("%s, ", emitRegName(id->idReg2(), attr));
+ printf("%s, ", emitRegName(id->idReg3(), attr));
+ val = emitGetInsSC(id);
+ goto PRINT_CONSTANT;
+ break;
#endif
case IF_RRW_RRW_CNS:
printf("%s,", emitRegName(id->idReg1(), attr));
@@ -9520,7 +9573,34 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id)
assert(id->idGCref() == GCT_NONE);
assert(valInByte);
- assert(ins == INS_psrldq || ins == INS_pslldq);
+ // The left and right shifts use the same encoding, and are distinguished by the Reg/Opcode field.
+ regNumber regOpcode;
+ switch (ins)
+ {
+ case INS_psrldq:
+ regOpcode = (regNumber)3;
+ break;
+ case INS_pslldq:
+ regOpcode = (regNumber)7;
+ break;
+ case INS_psrld:
+ case INS_psrlw:
+ case INS_psrlq:
+ regOpcode = (regNumber)2;
+ break;
+ case INS_pslld:
+ case INS_psllw:
+ case INS_psllq:
+ regOpcode = (regNumber)6;
+ break;
+ case INS_psrad:
+ regOpcode = (regNumber)4;
+ break;
+ default:
+ assert(!"Invalid instruction for SSE2 instruction of the form: opcode reg, immed8");
+ regOpcode = REG_NA;
+ break;
+ }
// Get the 'base' opcode.
code = insCodeMI(ins);
@@ -9534,14 +9614,6 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id)
code = insEncodeReg3456(ins, reg, size, code);
}
- // In case of psrldq
- // Reg/Opcode = 3
- // R/M = reg1
- //
- // In case of pslldq
- // Reg/Opcode = 7
- // R/M = reg1
- regNumber regOpcode = (regNumber)((ins == INS_psrldq) ? 3 : 7);
unsigned regcode = (insEncodeReg345(ins, regOpcode, size, &code) | insEncodeReg012(ins, reg, size, &code)) << 8;
// Output the REX prefix
@@ -10665,6 +10737,11 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
dst = emitOutputRRR(dst, id);
sz = emitSizeOfInsDsc(id);
break;
+ case IF_RWR_RRD_RRD_CNS:
+ dst = emitOutputRRR(dst, id);
+ sz = emitSizeOfInsDsc(id);
+ dst += emitOutputByte(dst, emitGetInsSC(id));
+ break;
#endif
case IF_RRW_RRW_CNS:
@@ -10696,6 +10773,11 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
assert(code & 0x00FF0000);
#ifdef FEATURE_AVX_SUPPORT
+ if (TakesRexWPrefix(ins, size))
+ {
+ code = AddRexWPrefix(ins, code);
+ }
+
if (TakesVexPrefix(ins))
{
if (IsThreeOperandBinaryAVXInstruction(ins))
@@ -10724,11 +10806,16 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
if (Is4ByteAVXInstruction(ins))
{
- // We just need to output the last byte of the opcode.
assert((code & 0xFF) == 0);
- assert((code & 0xFF00) != 0xC000);
- dst += emitOutputByte(dst, (code >> 8) & 0xFF);
- code = 0;
+ if ((code & 0xFF00) == 0xC000)
+ {
+ dst += emitOutputWord(dst, code | regcode);
+ }
+ else
+ {
+ dst += emitOutputByte(dst, (code >> 8) & 0xFF);
+ dst += emitOutputByte(dst, 0xC0 | (regcode >> 8));
+ }
}
else if (code & 0xFF000000)
{
@@ -10738,27 +10825,25 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
if (Is4ByteSSE4Instruction(ins))
{
dst += emitOutputWord(dst, code);
- code = 0;
+ dst += emitOutputByte(dst, 0xC0 | (regcode >> 8));
+ }
+ else
+ {
+ assert((code & 0xFF00) == 0xC000);
+ dst += emitOutputWord(dst, code | regcode);
}
}
else if (code & 0x00FF0000)
{
dst += emitOutputByte(dst, code >> 16);
code &= 0x0000FFFF;
- }
-
- // Note that regcode is shifted by 8-bits above to align with RM byte.
- if (code != 0)
- {
assert((code & 0xFF00) == 0xC000);
dst += emitOutputWord(dst, code | regcode);
}
else
{
- // This case occurs for SSE4/AVX instructions.
- // Note that regcode is left shifted by 8-bits.
- assert(Is4ByteAVXInstruction(ins) || Is4ByteSSE4Instruction(ins));
- dst += emitOutputByte(dst, 0xC0 | (regcode >> 8));
+ assert((code & 0xFF00) == 0xC000);
+ dst += emitOutputWord(dst, code | regcode);
}
dst += emitOutputByte(dst, emitGetInsSC(id));
diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h
index faeba7d942..d439f7e7f2 100644
--- a/src/jit/emitxarch.h
+++ b/src/jit/emitxarch.h
@@ -360,6 +360,8 @@ void emitIns_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int ival);
#ifdef FEATURE_AVX_SUPPORT
void emitIns_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3);
+
+void emitIns_R_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3, int ival);
#endif
void emitIns_S(instruction ins, emitAttr attr, int varx, int offs);
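
A sketch of how the new overload is used (mirroring the vperm2i128 call added to simdcodegenxarch.cpp later in this diff):

    // dst = the low 128-bit halves of op1Reg and op2Reg packed together
    // (imm 0x20 selects src1[127:0] for the low lane and src2[127:0] for the high lane).
    getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, EA_32BYTE, tmpReg2, op1Reg, op2Reg, 0x20);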
diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h
index 729bece554..095277064e 100644
--- a/src/jit/instrsxarch.h
+++ b/src/jit/instrsxarch.h
@@ -200,10 +200,13 @@ INST3( movapd, "movapd" , 0, IUM_WR, 0, 0, PCKDBL(0x29), BAD_CODE, PCKDBL(0x28))
INST3( movaps, "movaps" , 0, IUM_WR, 0, 0, PCKFLT(0x29), BAD_CODE, PCKFLT(0x28))
INST3( movupd, "movupd" , 0, IUM_WR, 0, 0, PCKDBL(0x11), BAD_CODE, PCKDBL(0x10))
INST3( movups, "movups" , 0, IUM_WR, 0, 0, PCKFLT(0x11), BAD_CODE, PCKFLT(0x10))
+INST3( movlhps, "movlhps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x16))
INST3( shufps, "shufps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0xC6))
INST3( shufpd, "shufpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xC6))
-
+
+INST3( punpckhdq, "punpckhdq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x6A))
+
// SSE 2 arith
INST3( addps, "addps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x58)) // Add packed singles
INST3( addss, "addss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEFLT(0x58)) // Add scalar singles
@@ -289,8 +292,19 @@ INST3( pand, "pand" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xDB)) // Packed bit-wise AND of two xmm regs
INST3( pandn, "pandn" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xDF)) // Packed bit-wise AND NOT of two xmm regs
INST3( por, "por" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xEB)) // Packed bit-wise OR of two xmm regs
INST3( pxor, "pxor" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xEF)) // Packed bit-wise XOR of two xmm regs
+
+// Note that the shift immediates share the same encoding between left and right-shift, and are distinguished by the Reg/Opcode,
+// which is handled in emitxarch.cpp.
INST3( psrldq, "psrldq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Shift right logical of xmm reg by given number of bytes
INST3( pslldq, "pslldq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Shift left logical of xmm reg by given number of bytes
+INST3( psllq, "psllq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Packed shift left logical of 64-bit integers
+INST3( psrlq, "psrlq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Packed shift right logical of 64-bit integers
+INST3( pslld, "pslld" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), BAD_CODE ) // Packed shift left logical of 32-bit integers
+INST3( psrld, "psrld" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), BAD_CODE ) // Packed shift right logical of 32-bit integers
+INST3( psllw, "psllw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), BAD_CODE ) // Packed shift left logical of 16-bit integers
+INST3( psrlw, "psrlw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), BAD_CODE ) // Packed shift right logical of 16-bit integers
+INST3( psrad, "psrad" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), BAD_CODE ) // Packed shift right arithmetic of 32-bit integers
+
INST3( pmaxub, "pmaxub" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xDE)) // packed maximum unsigned bytes
INST3( pminub, "pminub" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xDA)) // packed minimum unsigned bytes
INST3( pmaxsw, "pmaxsw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xEE)) // packed maximum signed words
@@ -306,14 +320,24 @@ INST3( pshufd, "pshufd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x70))
INST3( pextrw, "pextrw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xC5)) // Extract 16-bit value into a r32 with zero extended to 32-bits
INST3( pinsrw, "pinsrw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xC4)) // packed insert word
+INST3( punpckhbw, "punpckhbw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x68)) // Packed logical (unsigned) widen ubyte to ushort (hi)
+INST3( punpcklbw, "punpcklbw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x60)) // Packed logical (unsigned) widen ubyte to ushort (lo)
+INST3( punpckhqdq, "punpckhqdq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x6D)) // Packed logical (unsigned) widen uint to ulong (hi)
+INST3( punpcklqdq, "punpcklqdq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x6C)) // Packed logical (unsigned) widen uint to ulong (lo)
+INST3( punpckhwd, "punpckhwd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x69)) // Packed logical (unsigned) widen ushort to uint (hi)
+INST3( punpcklwd, "punpcklwd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x61)) // Packed logical (unsigned) widen ushort to uint (lo)
+
+INST3( packssdw, "packssdw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x6B)) // Pack (narrow) int to short with saturation
+INST3( packsswb, "packsswb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x63)) // Pack (narrow) short to byte with saturation
+INST3( packuswb, "packuswb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x67)) // Pack (narrow) short to unsigned byte with saturation
#endif // !LEGACY_BACKEND
INST3(LAST_SSE2_INSTRUCTION, "LAST_SSE2_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
#ifndef LEGACY_BACKEND
INST3(FIRST_SSE4_INSTRUCTION, "FIRST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
// enum name FP updmode rf wf MR MI RM
-INST3( dpps, "dpps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x40)) // Packed bit-wise AND NOT of two xmm regs
-INST3( dppd, "dppd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x41)) // Packed bit-wise AND NOT of two xmm regs
+INST3( dpps, "dpps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x40)) // Packed dot product of two float vector regs
+INST3( dppd, "dppd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x41)) // Packed dot product of two double vector regs
INST3( insertps, "insertps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x21)) // Insert packed single precision float value
INST3( pcmpeqq, "pcmpeqq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x29)) // Packed compare 64-bit integers for equality
INST3( pcmpgtq, "pcmpgtq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x37)) // Packed compare 64-bit integers for equality
@@ -331,6 +355,11 @@ INST3( pmaxsb, "pmaxsb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3C)) // packed maximum signed bytes
INST3( pmaxsd, "pmaxsd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3D)) // packed maximum 32-bit signed integers
INST3( pmaxuw, "pmaxuw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3E)) // packed maximum 16-bit unsigned integers
INST3( pmaxud, "pmaxud" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3F)) // packed maximum 32-bit unsigned integers
+INST3( pmovsxbw, "pmovsxbw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x20)) // Packed sign extend byte to short
+INST3( pmovsxwd, "pmovsxwd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x23)) // Packed sign extend short to int
+INST3( pmovsxdq, "pmovsxdq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x25)) // Packed sign extend int to long
+INST3( packusdw, "packusdw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x2B)) // Pack (narrow) int to unsigned short with saturation
+
INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
@@ -342,9 +371,12 @@ INST3( vpbroadcastw, "pbroadcastw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x79)) // Broadcast int16 value from reg/memory to entire ymm register
INST3( vpbroadcastd, "pbroadcastd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x58)) // Broadcast int32 value from reg/memory to entire ymm register
INST3( vpbroadcastq, "pbroadcastq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x59)) // Broadcast int64 value from reg/memory to entire ymm register
INST3( vextractf128, "extractf128" , 0, IUM_WR, 0, 0, SSE3A(0x19), BAD_CODE, BAD_CODE) // Extract 128-bit packed floating point values
+INST3( vextracti128, "extracti128" , 0, IUM_WR, 0, 0, SSE3A(0x39), BAD_CODE, BAD_CODE) // Extract 128-bit packed integer values
INST3( vinsertf128, "insertf128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x18)) // Insert 128-bit packed floating point values
+INST3( vinserti128, "inserti128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x38)) // Insert 128-bit packed integer values
INST3( vzeroupper, "zeroupper" , 0, IUM_WR, 0, 0, 0xC577F8, BAD_CODE, BAD_CODE) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix)
-
+INST3( vperm2i128, "perm2i128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x46)) // Permute 128-bit halves of input register
+INST3( vpermq, "permq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x00)) // Permute 64-bit elements of input register
INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
#endif // !LEGACY_BACKEND
// enum name FP updmode rf wf R/M,R/M[reg] R/M,icon
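
For reference, the shared shift-by-immediate opcodes above are disambiguated by the ModR/M Reg/Opcode field (values per the Intel SDM, matching the regOpcode switch added to emitOutputRI in emitxarch.cpp):

    66 0F 71 /2 ib  psrlw     66 0F 71 /6 ib  psllw
    66 0F 72 /2 ib  psrld     66 0F 72 /4 ib  psrad     66 0F 72 /6 ib  pslld
    66 0F 73 /2 ib  psrlq     66 0F 73 /3 ib  psrldq    66 0F 73 /6 ib  psllq
    66 0F 73 /7 ib  pslldq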
diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp
index 002e3d803f..987ac724ea 100644
--- a/src/jit/lsraxarch.cpp
+++ b/src/jit/lsraxarch.cpp
@@ -2676,6 +2676,90 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
info->srcCount = 1;
break;
+ case SIMDIntrinsicConvertToSingle:
+ info->srcCount = 1;
+ if (simdTree->gtSIMDBaseType == TYP_UINT)
+ {
+ // We need an internal register different from targetReg.
+ info->isInternalRegDelayFree = true;
+ info->internalIntCount = 1;
+ info->internalFloatCount = 2;
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs() | lsra->allRegs(TYP_INT));
+ }
+ break;
+
+ case SIMDIntrinsicConvertToUInt32:
+ case SIMDIntrinsicConvertToInt32:
+ info->srcCount = 1;
+ break;
+
+ case SIMDIntrinsicWidenLo:
+ case SIMDIntrinsicWidenHi:
+ info->srcCount = 1;
+ if (varTypeIsIntegral(simdTree->gtSIMDBaseType))
+ {
+ // We need an internal register different from targetReg.
+ info->isInternalRegDelayFree = true;
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ }
+ break;
+
+ case SIMDIntrinsicConvertToInt64:
+ case SIMDIntrinsicConvertToUInt64:
+ // We need an internal register different from targetReg.
+ info->isInternalRegDelayFree = true;
+ info->srcCount = 1;
+ info->internalIntCount = 1;
+ if (comp->getSIMDInstructionSet() == InstructionSet_AVX)
+ {
+ info->internalFloatCount = 2;
+ }
+ else
+ {
+ info->internalFloatCount = 1;
+ }
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs() | lsra->allRegs(TYP_INT));
+ break;
+
+ case SIMDIntrinsicConvertToDouble:
+ // We need an internal register different from targetReg.
+ info->isInternalRegDelayFree = true;
+ info->srcCount = 1;
+ info->internalIntCount = 1;
+#ifdef _TARGET_X86_
+ if (simdTree->gtSIMDBaseType == TYP_LONG)
+ {
+ info->internalFloatCount = 3;
+ }
+ else
+#endif
+ if ((comp->getSIMDInstructionSet() == InstructionSet_AVX) || (simdTree->gtSIMDBaseType == TYP_ULONG))
+ {
+ info->internalFloatCount = 2;
+ }
+ else
+ {
+ info->internalFloatCount = 1;
+ }
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs() | lsra->allRegs(TYP_INT));
+ break;
+
+ case SIMDIntrinsicNarrow:
+ // We need an internal register different from targetReg.
+ info->isInternalRegDelayFree = true;
+ info->srcCount = 2;
+ if ((comp->getSIMDInstructionSet() == InstructionSet_AVX) && (simdTree->gtSIMDBaseType != TYP_DOUBLE))
+ {
+ info->internalFloatCount = 2;
+ }
+ else
+ {
+ info->internalFloatCount = 1;
+ }
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ break;
+
case SIMDIntrinsicShuffleSSE2:
info->srcCount = 2;
// Second operand is an integer constant and marked as contained.
diff --git a/src/jit/simd.cpp b/src/jit/simd.cpp
index 4ba7832cca..bbb9a57cc4 100644
--- a/src/jit/simd.cpp
+++ b/src/jit/simd.cpp
@@ -2609,6 +2609,10 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode,
// Unary operators that take and return a Vector.
case SIMDIntrinsicCast:
+ case SIMDIntrinsicConvertToSingle:
+ case SIMDIntrinsicConvertToDouble:
+ case SIMDIntrinsicConvertToInt32:
+ case SIMDIntrinsicConvertToUInt32:
{
op1 = impSIMDPopStack(simdType, instMethod);
@@ -2617,6 +2621,61 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode,
}
break;
+ case SIMDIntrinsicConvertToInt64:
+ case SIMDIntrinsicConvertToUInt64:
+ {
+#ifdef _TARGET_AMD64_
+ op1 = impSIMDPopStack(simdType, instMethod);
+
+ simdTree = gtNewSIMDNode(simdType, op1, nullptr, simdIntrinsicID, baseType, size);
+ retVal = simdTree;
+#else
+ JITDUMP("SIMD Conversion to Int64/UInt64 is not supported on this platform\n");
+ return nullptr;
+#endif
+ }
+ break;
+
+ case SIMDIntrinsicNarrow:
+ {
+ assert(!instMethod);
+ op2 = impSIMDPopStack(simdType);
+ op1 = impSIMDPopStack(simdType);
+ // op1 and op2 are two input Vector<T>.
+ simdTree = gtNewSIMDNode(simdType, op1, op2, simdIntrinsicID, baseType, size);
+ retVal = simdTree;
+ }
+ break;
+
+ case SIMDIntrinsicWiden:
+ {
+ GenTree* dstAddrHi = impSIMDPopStack(TYP_BYREF);
+ GenTree* dstAddrLo = impSIMDPopStack(TYP_BYREF);
+ op1 = impSIMDPopStack(simdType);
+ GenTree* dupOp1 = fgInsertCommaFormTemp(&op1, gtGetStructHandleForSIMD(simdType, baseType));
+
+ // Widen the lower half and assign it to dstAddrLo.
+ simdTree = gtNewSIMDNode(simdType, op1, nullptr, SIMDIntrinsicWidenLo, baseType, size);
+ GenTree* loDest =
+ new (this, GT_BLK) GenTreeBlk(GT_BLK, simdType, dstAddrLo, getSIMDTypeSizeInBytes(clsHnd));
+ GenTree* loAsg = gtNewBlkOpNode(loDest, simdTree, getSIMDTypeSizeInBytes(clsHnd),
+ false, // not volatile
+ true); // copyBlock
+ loAsg->gtFlags |= ((simdTree->gtFlags | dstAddrLo->gtFlags) & GTF_ALL_EFFECT);
+
+ // Widen the upper half and assign it to dstAddrHi.
+ simdTree = gtNewSIMDNode(simdType, dupOp1, nullptr, SIMDIntrinsicWidenHi, baseType, size);
+ GenTree* hiDest =
+ new (this, GT_BLK) GenTreeBlk(GT_BLK, simdType, dstAddrHi, getSIMDTypeSizeInBytes(clsHnd));
+ GenTree* hiAsg = gtNewBlkOpNode(hiDest, simdTree, getSIMDTypeSizeInBytes(clsHnd),
+ false, // not volatile
+ true); // copyBlock
+ hiAsg->gtFlags |= ((simdTree->gtFlags | dstAddrHi->gtFlags) & GTF_ALL_EFFECT);
+
+ retVal = gtNewOperNode(GT_COMMA, simdType, loAsg, hiAsg);
+ }
+ break;
+
case SIMDIntrinsicHWAccel:
{
GenTreeIntCon* intConstTree = new (this, GT_CNS_INT) GenTreeIntCon(TYP_INT, 1);
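
The SIMDIntrinsicWiden import above builds, in effect, the following tree shape (a sketch; Widen returns its two results through byref out-parameters):

    COMMA
      +-- ASG(BLK(dstAddrLo), SIMD WidenLo(op1))
      +-- ASG(BLK(dstAddrHi), SIMD WidenHi(dupOp1))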
diff --git a/src/jit/simd.h b/src/jit/simd.h
index c4a8866b07..ff522fd52f 100644
--- a/src/jit/simd.h
+++ b/src/jit/simd.h
@@ -33,12 +33,16 @@ struct SIMDIntrinsicInfo
// SSE2 Shuffle control byte to shuffle vector <W, Z, Y, X>
// These correspond to shuffle immediate byte in shufps SSE2 instruction.
#define SHUFFLE_XXXX 0x00 // 00 00 00 00
+#define SHUFFLE_XXZX 0x08 // 00 00 10 00
#define SHUFFLE_XXWW 0x0F // 00 00 11 11
#define SHUFFLE_XYZW 0x1B // 00 01 10 11
#define SHUFFLE_YXYX 0x44 // 01 00 01 00
+#define SHUFFLE_YWXZ 0x72 // 01 11 00 10
#define SHUFFLE_YYZZ 0x5A // 01 01 10 10
+#define SHUFFLE_ZXXX 0x80 // 10 00 00 00
#define SHUFFLE_ZXXY 0x81 // 10 00 00 01
#define SHUFFLE_ZWXY 0xB1 // 10 11 00 01
+#define SHUFFLE_WYZX 0xD8 // 11 01 10 00
#define SHUFFLE_WWYY 0xF5 // 11 11 01 01
#define SHUFFLE_ZZXX 0xA0 // 10 10 00 00
#endif
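
The new SHUFFLE_* values follow the pshufd/shufps immediate convention, where each destination element selects a source element through two bits of the immediate. A sketch of the selection rule:

    // dst[i] = src[(imm >> (2 * i)) & 3]; e.g. SHUFFLE_WYZX (0xD8) yields
    // dst = { src[0], src[2], src[1], src[3] }, deinterleaving even/odd elements.
    static inline void pshufd_model(int dst[4], const int src[4], unsigned imm)
    {
        for (int i = 0; i < 4; i++)
        {
            dst[i] = src[(imm >> (2 * i)) & 3];
        }
    }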
diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp
index 940ba5f9e8..a28c6527e2 100644
--- a/src/jit/simdcodegenxarch.cpp
+++ b/src/jit/simdcodegenxarch.cpp
@@ -487,14 +487,151 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_types baseType,
result = INS_movaps;
break;
+ case SIMDIntrinsicConvertToSingle:
+ result = INS_cvtdq2ps;
+ break;
+
+ case SIMDIntrinsicConvertToDouble:
+ assert(baseType == TYP_LONG);
+ result = INS_cvtsi2sd;
+ break;
+
+ case SIMDIntrinsicConvertToInt32:
+ case SIMDIntrinsicConvertToUInt32:
+ assert(baseType == TYP_FLOAT);
+ result = INS_cvttps2dq;
+ break;
+
+ case SIMDIntrinsicConvertToInt64:
+ case SIMDIntrinsicConvertToUInt64:
+ assert(baseType == TYP_DOUBLE);
+ result = INS_cvttsd2si;
+ break;
+
+ case SIMDIntrinsicNarrow:
+ // Note that for the integer types the caller must sign- or zero-extend the narrowed
+ // value within each source element, since these pack instructions saturate.
+ switch (baseType)
+ {
+ case TYP_INT:
+ case TYP_UINT:
+ if (compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4)
+ {
+ result = INS_packusdw;
+ }
+ else
+ {
+ result = INS_packssdw;
+ }
+ break;
+ case TYP_SHORT:
+ case TYP_CHAR:
+ result = INS_packuswb;
+ break;
+ default:
+ assert(!"Invalid baseType for SIMDIntrinsicNarrow");
+ result = INS_invalid;
+ break;
+ }
+ break;
+
+ case SIMDIntrinsicWidenLo:
+ // Some of these have multiple instruction implementations, with one instruction to widen the lo half,
+ // and another to widen the hi half.
+ switch (baseType)
+ {
+ case TYP_FLOAT:
+ result = INS_cvtps2pd;
+ break;
+ case TYP_INT:
+ case TYP_UINT:
+ result = INS_punpckldq;
+ break;
+ case TYP_SHORT:
+ case TYP_CHAR:
+ result = INS_punpcklwd;
+ break;
+ case TYP_BYTE:
+ case TYP_UBYTE:
+ result = INS_punpcklbw;
+ break;
+ default:
+ assert(!"Invalid baseType for SIMDIntrinsicWidenLo");
+ result = INS_invalid;
+ break;
+ }
+ break;
+
+ case SIMDIntrinsicWidenHi:
+ switch (baseType)
+ {
+ case TYP_FLOAT:
+ // For this case, we actually use the same instruction.
+ result = INS_cvtps2pd;
+ break;
+ case TYP_INT:
+ case TYP_UINT:
+ result = INS_punpckhdq;
+ break;
+ case TYP_SHORT:
+ case TYP_CHAR:
+ result = INS_punpckhwd;
+ break;
+ case TYP_BYTE:
+ case TYP_UBYTE:
+ result = INS_punpckhbw;
+ break;
+ default:
+ assert(!"Invalid baseType for SIMDIntrinsicWidenHi");
+ result = INS_invalid;
+ break;
+ }
+ break;
+
case SIMDIntrinsicShiftLeftInternal:
- // base type doesn't matter since the entire vector is shifted left
- result = INS_pslldq;
+ switch (baseType)
+ {
+ case TYP_SIMD16:
+ // For SSE2 the entire vector is shifted; for AVX2 each 16-byte lane is shifted.
+ result = INS_pslldq;
+ break;
+ case TYP_UINT:
+ case TYP_INT:
+ result = INS_pslld;
+ break;
+ case TYP_SHORT:
+ case TYP_CHAR:
+ case TYP_USHORT:
+ result = INS_psllw;
+ break;
+ default:
+ assert(!"Invalid baseType for SIMDIntrinsicShiftLeftInternal");
+ result = INS_invalid;
+ break;
+ }
break;
case SIMDIntrinsicShiftRightInternal:
- // base type doesn't matter since the entire vector is shifted right
- result = INS_psrldq;
+ switch (baseType)
+ {
+ case TYP_SIMD16:
+ // For SSE2 the entire vector is shifted; for AVX2 each 16-byte lane is shifted.
+ result = INS_psrldq;
+ break;
+ case TYP_UINT:
+ case TYP_INT:
+ result = INS_psrld;
+ break;
+ case TYP_SHORT:
+ case TYP_CHAR:
+ case TYP_USHORT:
+ result = INS_psrlw;
+ break;
+ default:
+ assert(!"Invalid baseType for SIMDIntrinsicShiftRightInternal");
+ result = INS_invalid;
+ break;
+ }
break;
case SIMDIntrinsicUpperSave:
@@ -600,9 +737,9 @@ void CodeGen::genSIMDScalarMove(
{
// There is no guarantee that upper bits of op1Reg are zero.
// We achieve this by using left logical shift 12-bytes and right logical shift 12 bytes.
- instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+ instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
- ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
}
else
@@ -700,7 +837,7 @@ void CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode)
ins = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT);
inst_RV_RV(ins, tmpReg, op1hiReg, TYP_INT, emitTypeSize(TYP_INT));
- ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
getEmitter()->emitIns_R_I(ins, EA_16BYTE, tmpReg, 4); // shift left by 4 bytes
ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseOr, baseType);
@@ -871,7 +1008,7 @@ void CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode)
}
unsigned int baseTypeSize = genTypeSize(baseType);
- instruction insLeftShift = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+ instruction insLeftShift = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
// We will first consume the list items in execution (left to right) order,
// and record the registers.
@@ -947,6 +1084,681 @@ void CodeGen::genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode)
genProduceReg(simdNode);
}
+//----------------------------------------------------------------------------------
+// genSIMDIntrinsic32BitConvert: Generate code for 32-bit SIMD Convert (int/uint <-> float)
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Return Value:
+// None.
+//
+void CodeGen::genSIMDIntrinsic32BitConvert(GenTreeSIMD* simdNode)
+{
+ SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID;
+ assert((intrinsicID == SIMDIntrinsicConvertToSingle) || (intrinsicID == SIMDIntrinsicConvertToInt32) ||
+ (intrinsicID == SIMDIntrinsicConvertToUInt32));
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ regNumber targetReg = simdNode->gtRegNum;
+ assert(targetReg != REG_NA);
+ var_types targetType = simdNode->TypeGet();
+
+ regNumber op1Reg = genConsumeReg(op1);
+ instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
+ if (intrinsicID == SIMDIntrinsicConvertToSingle && baseType == TYP_UINT)
+ {
+ regNumber tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT);
+ regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+ regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+ assert(tmpReg != op1Reg && tmpReg2 != op1Reg);
+
+ // We will generate the following:
+ // vmovdqu tmpReg2, op1Reg (copy the src and put it into tmpReg2)
+ // vmovdqu targetReg, op1Reg (copy the src and put it into targetReg)
+ // vpsrld targetReg, 16 (get upper 16 bits of src and put it into targetReg)
+ // vpslld tmpReg2, 16
+ // vpsrld tmpReg2, 16 (get lower 16 bits of src and put it into tmpReg2)
+ // mov tmpIntReg, 0x5300000053000000
+ // vmovd tmpReg, tmpIntReg
+ // vpbroadcastd tmpReg, tmpReg (build mask for converting upper 16 bits of src)
+ // vorps targetReg, tmpReg
+ // vsubps targetReg, tmpReg (convert upper 16 bits of src and put it into targetReg)
+ // vcvtdq2ps tmpReg2, tmpReg2 (convert lower 16 bits of src and put it into tmpReg2)
+ // vaddps targetReg, tmpReg2 (add upper 16 bits and lower 16 bits)
+ inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(targetType));
+ if (targetReg != op1Reg)
+ {
+ inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(targetType));
+ }
+
+ // prepare upper 16 bits
+ getEmitter()->emitIns_R_I(INS_psrld, emitActualTypeSize(targetType), targetReg, 16);
+
+ // prepare lower 16 bits
+ getEmitter()->emitIns_R_I(INS_pslld, emitActualTypeSize(targetType), tmpReg2, 16);
+ getEmitter()->emitIns_R_I(INS_psrld, emitActualTypeSize(targetType), tmpReg2, 16);
+
+// prepare mask
+#ifdef _TARGET_AMD64_
+ getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X5300000053000000);
+ inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
+#else
+ if (compiler->getSIMDInstructionSet() == InstructionSet_AVX)
+ {
+ getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X53000000);
+ inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
+ }
+ else
+ {
+ getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X00005300);
+ inst_RV_RV(INS_pxor, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
+ getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 1);
+ getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 3);
+ }
+#endif
+ if (compiler->getSIMDInstructionSet() == InstructionSet_AVX)
+ {
+ inst_RV_RV(INS_vpbroadcastd, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
+ }
+ else
+ {
+ inst_RV_RV(INS_movlhps, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
+ }
+
+ // convert upper 16 bits
+ inst_RV_RV(INS_orps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
+ inst_RV_RV(INS_subps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
+
+ // convert lower 16 bits
+ inst_RV_RV(ins, tmpReg2, tmpReg2, targetType, emitActualTypeSize(targetType));
+
+ // add lower 16 bits and upper 16 bits
+ inst_RV_RV(INS_addps, targetReg, tmpReg2, targetType, emitActualTypeSize(targetType));
+ }
+ else
+ {
+ inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
+ }
+ genProduceReg(simdNode);
+}
+
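
A scalar sketch of the TYP_UINT case above: 0x53000000 is the bit pattern of 2^39 as a float, so OR-ing a 16-bit value into its mantissa and subtracting 2^39 recovers that value scaled by 2^16 exactly (helper name is illustrative):

    #include <cstdint>
    #include <cstring>

    static float cvt_u32_to_f32(uint32_t u)
    {
        uint32_t hi = u >> 16;            // psrld targetReg, 16
        uint32_t lo = u & 0xFFFF;         // pslld 16 then psrld 16 on tmpReg2
        uint32_t bits = 0x53000000u | hi; // orps with the broadcast mask
        float fhi;
        std::memcpy(&fhi, &bits, sizeof(fhi));
        fhi -= 549755813888.0f;           // subps: subtract 2^39, leaving hi * 2^16 exactly
        return fhi + (float)(int32_t)lo;  // cvtdq2ps is exact for 16-bit values; addps rounds once
    }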
+//----------------------------------------------------------------------------------
+// genSIMDLo64BitConvert: Generate code to convert the lowest 64-bit element (long <--> double)
+//
+// Arguments:
+// intrinsicID the SIMD intrinsic ID
+// simdType the SIMD node type
+// baseType the base type of value to be converted
+// tmpReg the tmp reg
+// tmpIntReg the tmp integer reg
+// targetReg the target reg
+//
+// Return Value:
+// None.
+//
+void CodeGen::genSIMDLo64BitConvert(SIMDIntrinsicID intrinsicID,
+ var_types simdType,
+ var_types baseType,
+ regNumber tmpReg,
+ regNumber tmpIntReg,
+ regNumber targetReg)
+{
+ instruction ins = getOpForSIMDIntrinsic(intrinsicID, baseType);
+ if (intrinsicID == SIMDIntrinsicConvertToDouble)
+ {
+ // Note that for mov_xmm2i, the int register is always in the reg2 position
+ inst_RV_RV(INS_mov_xmm2i, tmpReg, tmpIntReg, TYP_LONG);
+ inst_RV_RV(ins, targetReg, tmpIntReg, baseType, emitActualTypeSize(baseType));
+ }
+ else
+ {
+ inst_RV_RV(ins, tmpIntReg, tmpReg, baseType, emitActualTypeSize(baseType));
+ inst_RV_RV(INS_mov_i2xmm, targetReg, tmpIntReg, TYP_LONG);
+ }
+}
+
+//----------------------------------------------------------------------------------
+// genSIMDIntrinsic64BitConvert: Generate code for 64-bit SIMD Convert (long/ulong <-> double)
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Notes:
+// There are no instructions for converting to/from 64-bit integers, so for these we
+// do the conversion an element at a time.
+//
+void CodeGen::genSIMDIntrinsic64BitConvert(GenTreeSIMD* simdNode)
+{
+ SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID;
+ assert((intrinsicID == SIMDIntrinsicConvertToDouble) || (intrinsicID == SIMDIntrinsicConvertToInt64) ||
+ (intrinsicID == SIMDIntrinsicConvertToUInt64));
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ regNumber targetReg = simdNode->gtRegNum;
+ assert(targetReg != REG_NA);
+ var_types simdType = simdNode->TypeGet();
+ regNumber op1Reg = genConsumeReg(op1);
+ regNumber tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT);
+ regNumber tmpReg;
+ regNumber tmpReg2;
+ regNumber tmpReg3;
+ InstructionSet iset = compiler->getSIMDInstructionSet();
+
+#ifdef _TARGET_X86_
+ if (baseType == TYP_LONG)
+ {
+ tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+ tmpReg2 = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+ tmpReg3 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+ assert(tmpReg != op1Reg && tmpReg2 != op1Reg && tmpReg3 != op1Reg);
+ }
+ else
+#endif
+ if (iset == InstructionSet_AVX || (baseType == TYP_ULONG))
+ {
+ tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+ tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+ tmpReg3 = REG_NA;
+ assert(tmpReg != op1Reg && tmpReg2 != op1Reg);
+ }
+ else
+ {
+ tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+ assert(tmpReg != op1Reg);
+ tmpReg2 = REG_NA;
+ tmpReg3 = REG_NA;
+ }
+
+ if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_ULONG))
+ {
+ // We will generate the following
+ // vmovdqu tmpReg2, op1Reg (copy the src and put it into tmpReg2)
+ // vmovdqu targetReg, op1Reg (copy the src and put it into targetReg)
+ // vpsrlq targetReg, 32 (get upper 32 bits of src and put it into targetReg)
+ // vpsllq tmpReg2, 32
+ // vpsrlq tmpReg2, 32 (get lower 32 bits of src and put it into tmpReg2)
+ // mov tmpIntReg, 0x4530000000000000
+ // vmovd tmpReg, tmpIntReg
+ // vpbroadcastq tmpReg, tmpReg (build mask for upper 32 bits of src)
+ // vorpd targetReg, tmpReg
+ // vsubpd targetReg, tmpReg (convert upper 32 bits of src and put it into targetReg)
+ // mov tmpIntReg, 0x4330000000000000
+ // vmovd tmpReg, tmpIntReg
+ // vpbroadcastq tmpReg, tmpReg (build mask for lower 32 bits of src)
+ // vorpd tmpReg2, tmpReg
+ // vsubpd tmpReg2, tmpReg (convert lower 32 bits of src and put it into tmpReg2)
+ // vaddpd targetReg, tmpReg2 (add upper 32 bits and lower 32 bits together)
+ inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType));
+ if (targetReg != op1Reg)
+ {
+ inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(simdType));
+ }
+
+ // prepare upper 32 bits
+ getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32);
+
+ // prepare lower 32 bits
+ getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32);
+ getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32);
+
+// prepare mask for converting upper 32 bits
+#ifdef _TARGET_AMD64_
+ getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X4530000000000000);
+ inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
+#else
+ getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X45300000);
+ inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
+ getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
+#endif
+ if (iset == InstructionSet_AVX)
+ {
+ inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+ else
+ {
+ inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+
+ // convert upper 32 bits
+ inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+
+// prepare mask for converting lower 32 bits
+#ifdef _TARGET_AMD64_
+ getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X4330000000000000);
+ inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
+#else
+ getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X43300000);
+ inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
+ getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
+#endif
+ if (iset == InstructionSet_AVX)
+ {
+ inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+ else
+ {
+ inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+
+ // convert lower 32 bits
+ inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+ inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+
+ // add lower 32 bits and upper 32 bits
+ inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType));
+ }
+ else if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_LONG))
+ {
+#ifdef _TARGET_AMD64_
+ instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
+ instruction leftShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
+
+ if (iset == InstructionSet_AVX)
+ {
+ // Extract the high 128 bits
+ getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01);
+
+ // Put v[3] (the high-order element) in tmpReg2 and convert it.
+ inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+ getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
+ genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2);
+
+ // Shift the resulting 64-bits left.
+ getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
+
+ // Convert v[2], in the lo bits of tmpReg.
+ // For the convert to double, the convert preserves the upper bits in tmpReg2.
+ // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits.
+ genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg2);
+ }
+
+ // Put v[1] in tmpReg.
+ inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType));
+ getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
+
+ // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it.
+ genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg);
+
+ // Shift the resulting 64-bits left.
+ getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
+
+ // Convert the lo 64-bits into targetReg
+ genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, tmpReg);
+
+ // Merge or copy the results (only at this point are we done with op1Reg).
+ if (tmpReg != targetReg)
+ {
+ inst_RV_RV(INS_movaps, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+
+ if (iset == InstructionSet_AVX)
+ {
+ getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg2, 0x01);
+ }
+#else
+ // get the sign bit and put it in tmpReg3
+ inst_RV_RV(INS_movdqu, tmpReg3, op1Reg, baseType, emitActualTypeSize(simdType));
+ getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg3, 63);
+ getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg3, 63);
+
+ // get the absolute value of src and put it into tmpReg2 and targetReg
+ inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType));
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(simdType), tmpReg, op1Reg, SHUFFLE_WWYY);
+ getEmitter()->emitIns_R_I(INS_psrad, emitActualTypeSize(simdType), tmpReg, 32);
+ inst_RV_RV(INS_pxor, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType));
+ inst_RV_RV(INS_psubq, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType));
+ inst_RV_RV(INS_movdqu, targetReg, tmpReg2, baseType, emitActualTypeSize(simdType));
+
+ // prepare upper 32 bits
+ getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32);
+
+ // prepare lower 32 bits
+ getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32);
+ getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32);
+
+ // prepare mask for converting upper 32 bits
+ getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X45300000);
+ inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
+ getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
+
+ if (iset == InstructionSet_AVX)
+ {
+ inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+ else
+ {
+ inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+
+ // convert upper 32 bits
+ inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+
+ // prepare mask for converting lower 32 bits
+ getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X43300000);
+ inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
+ getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
+
+ if (iset == InstructionSet_AVX)
+ {
+ inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+ else
+ {
+ inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+
+ // convert lower 32 bits
+ inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+ inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+
+ // add lower 32 bits and upper 32 bits
+ inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType));
+
+ // add sign bit
+ inst_RV_RV(INS_por, targetReg, tmpReg3, simdType, emitActualTypeSize(simdType));
+#endif
+ }
+ else
+ {
+ instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
+ instruction leftShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
+
+ if (iset == InstructionSet_AVX)
+ {
+ // Extract the high 128 bits
+ getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, op1Reg, 0x01);
+
+ // Put v[3] (the high-order element) in tmpReg2 and convert it.
+ inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+ getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
+ genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2);
+
+ // Shift the resulting 64-bits left.
+ getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
+
+ // Convert v[2], in the lo bits of tmpReg.
+ // For the convert to double, the convert preserves the upper bits in tmpReg2.
+ // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits.
+ genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg);
+ inst_RV_RV(INS_por, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+
+ // Put v[1] in tmpReg.
+ inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType));
+ getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
+
+ // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it.
+ genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg);
+
+ // Shift the resulting 64-bits left.
+ getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
+
+ // Convert the lo 64-bits into targetReg
+ genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, targetReg);
+
+ // Merge or copy the results (only at this point are we done with op1Reg).
+ assert(tmpReg != targetReg);
+ inst_RV_RV(INS_por, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ if (iset == InstructionSet_AVX)
+ {
+ getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, targetReg, tmpReg2, 0x01);
+ }
+ }
+ genProduceReg(simdNode);
+}
+
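
A scalar sketch of the (ConvertToDouble, TYP_ULONG) case above: 0x4530000000000000 and 0x4330000000000000 are the bit patterns of 2^84 and 2^52; OR-ing a 32-bit half into the mantissa and subtracting the magic recovers it exactly at the right scale (helper name is illustrative):

    #include <cstdint>
    #include <cstring>

    static double cvt_u64_to_f64(uint64_t u)
    {
        uint64_t hbits = 0x4530000000000000ull | (u >> 32);         // psrlq 32; orpd
        uint64_t lbits = 0x4330000000000000ull | (u & 0xFFFFFFFFu); // psllq/psrlq 32; orpd
        double dhi, dlo;
        std::memcpy(&dhi, &hbits, sizeof(dhi));
        std::memcpy(&dlo, &lbits, sizeof(dlo));
        dhi -= 19342813113834066795298816.0; // subpd: 2^84, leaving (u >> 32) * 2^32 exactly
        dlo -= 4503599627370496.0;           // subpd: 2^52, leaving the low 32 bits exactly
        return dhi + dlo;                    // addpd rounds once
    }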
+//--------------------------------------------------------------------------------
+// genSIMDExtractUpperHalf: Generate code to extract the upper half of a SIMD register
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Notes:
+// This is used for the WidenHi intrinsic to extract the upper half.
+// On SSE*, this is 8 bytes, and on AVX2 it is 16 bytes.
+//
+void CodeGen::genSIMDExtractUpperHalf(GenTreeSIMD* simdNode, regNumber srcReg, regNumber tgtReg)
+{
+ var_types simdType = simdNode->TypeGet();
+ emitAttr emitSize = emitActualTypeSize(simdType);
+ if (compiler->getSIMDInstructionSet() == InstructionSet_AVX)
+ {
+ instruction extractIns = varTypeIsFloating(simdNode->gtSIMDBaseType) ? INS_vextractf128 : INS_vextracti128;
+ getEmitter()->emitIns_R_R_I(extractIns, EA_32BYTE, tgtReg, srcReg, 0x01);
+ }
+ else
+ {
+ instruction shiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
+ if (tgtReg != srcReg)
+ {
+ inst_RV_RV(ins_Copy(simdType), tgtReg, srcReg, simdType, emitSize);
+ }
+ getEmitter()->emitIns_R_I(shiftIns, emitSize, tgtReg, 8);
+ }
+}
+
+//--------------------------------------------------------------------------------
+// genSIMDIntrinsicWiden: Generate code for SIMD Intrinsic Widen operations
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Notes:
+// The Widen intrinsics are broken into separate intrinsics for the two results.
+//
+void CodeGen::genSIMDIntrinsicWiden(GenTreeSIMD* simdNode)
+{
+ assert((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenLo) ||
+ (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi));
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ regNumber targetReg = simdNode->gtRegNum;
+ assert(targetReg != REG_NA);
+ var_types simdType = simdNode->TypeGet();
+ InstructionSet iset = compiler->getSIMDInstructionSet();
+
+ genConsumeOperands(simdNode);
+ regNumber op1Reg = op1->gtRegNum;
+ regNumber srcReg = op1Reg;
+ emitAttr emitSize = emitActualTypeSize(simdType);
+ instruction widenIns = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
+
+ if (baseType == TYP_FLOAT)
+ {
+ if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)
+ {
+ genSIMDExtractUpperHalf(simdNode, srcReg, targetReg);
+ srcReg = targetReg;
+ }
+ inst_RV_RV(widenIns, targetReg, srcReg, simdType);
+ }
+ else
+ {
+ // We will generate the following on AVX:
+ // vpermq targetReg, op1Reg, 0xd4|0xe8
+ // vpxor tmpReg, tmpReg
+ // vpcmpgt[b|w|d] tmpReg, targetReg (if basetype is signed)
+ // vpunpck[l|h][bw|wd|dq] targetReg, tmpReg
+ regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+ assert(tmpReg != op1Reg);
+
+ if (iset == InstructionSet_AVX)
+ {
+ // permute op1Reg and put it into targetReg
+ unsigned ival = 0xd4;
+ if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)
+ {
+ ival = 0xe8;
+ }
+ getEmitter()->emitIns_R_R_I(INS_vpermq, emitSize, targetReg, op1Reg, ival);
+ }
+ else if (targetReg != op1Reg)
+ {
+ inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize);
+ }
+
+ genSIMDZero(simdType, baseType, tmpReg);
+ if (!varTypeIsUnsigned(baseType))
+ {
+ instruction compareIns = getOpForSIMDIntrinsic(SIMDIntrinsicGreaterThan, baseType);
+ inst_RV_RV(compareIns, tmpReg, targetReg, simdType, emitSize);
+ }
+ inst_RV_RV(widenIns, targetReg, tmpReg, simdType);
+ }
+ genProduceReg(simdNode);
+}
+
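
A scalar sketch of the integer widen above: interleaving each element with a pcmpgt-generated mask (all ones when the element is negative, zero otherwise) is exactly sign extension, and interleaving with zero is zero extension:

    #include <cstdint>

    static int32_t widen_i16(int16_t v)
    {
        uint16_t mask = (v < 0) ? 0xFFFF : 0x0000;              // pcmpgtw zeroReg, src
        return (int32_t)(((uint32_t)mask << 16) | (uint16_t)v); // punpcklwd src, mask
    }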
+//--------------------------------------------------------------------------------
+// genSIMDIntrinsicNarrow: Generate code for SIMD Intrinsic Narrow operations
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Notes:
+// This intrinsic takes two arguments. The first operand is narrowed to produce the
+// lower elements of the results, and the second operand produces the high elements.
+//
+void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode)
+{
+ assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicNarrow);
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ GenTree* op2 = simdNode->gtGetOp2();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ regNumber targetReg = simdNode->gtRegNum;
+ assert(targetReg != REG_NA);
+ var_types simdType = simdNode->TypeGet();
+ emitAttr emitSize = emitTypeSize(simdType);
+ InstructionSet iset = compiler->getSIMDInstructionSet();
+
+ genConsumeOperands(simdNode);
+ regNumber op1Reg = op1->gtRegNum;
+ regNumber op2Reg = op2->gtRegNum;
+ if (baseType == TYP_DOUBLE)
+ {
+ regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+
+ inst_RV_RV(INS_cvtpd2ps, targetReg, op1Reg, simdType);
+ inst_RV_RV(INS_cvtpd2ps, tmpReg, op2Reg, simdType);
+ // Now insert the high-order result (in tmpReg) into the upper half of targetReg.
+ if (compiler->canUseAVX())
+ {
+ getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg, 0x01);
+ }
+ else
+ {
+ inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, tmpReg, SHUFFLE_YXYX);
+ }
+ }
+ else if (varTypeIsLong(baseType))
+ {
+ if (iset == InstructionSet_AVX)
+ {
+ // We have 8 long elements, 0-3 in op1Reg, 4-7 in op2Reg.
+ // We will generate the following:
+ // vextracti128 tmpReg, op1Reg, 1 (extract elements 2 and 3 into tmpReg)
+ // vextracti128 tmpReg2, op2Reg, 1 (extract elements 6 and 7 into tmpReg2)
+ // vinserti128 tmpReg, tmpReg2, 1 (insert elements 6 and 7 into the high half of tmpReg)
+ // mov tmpReg2, op1Reg
+ // vinserti128 tmpReg2, op2Reg, 1 (insert elements 4 and 5 into the high half of tmpReg2)
+ // pshufd tmpReg, tmpReg, XXZX ( - - 7L 6L - - 3L 2L) in tmpReg
+ // pshufd tgtReg, tmpReg2, XXZX ( - - 5L 4L - - 1L 0L) in tgtReg
+ // punpcklqdq tgtReg, tmpReg
+ regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+ regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+ getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01);
+ getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg2, op2Reg, 0x01);
+ getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg, tmpReg2, 0x01);
+ inst_RV_RV(ins_Copy(simdType), tmpReg2, op1Reg, simdType, emitSize);
+ getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg2, op2Reg, 0x01);
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, tmpReg, SHUFFLE_XXZX);
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, tmpReg2, SHUFFLE_XXZX);
+ inst_RV_RV_RV(INS_punpcklqdq, targetReg, targetReg, tmpReg, emitSize);
+ }
+ else
+ {
+ // We will generate the following:
+ // pshufd targetReg, op1Reg, ZXXX (extract the low 32-bits into the upper two 32-bit elements)
+ // psrldq targetReg, 8 (shift them right to get zeros in the high elements)
+ // pshufd tmpReg, op2Reg, XXZX (same as above, but extract into the lower two 32-bit elements)
+ // pslldq tmpReg, 8 (now shift these left to get zeros in the low elements)
+ // por targetReg, tmpReg
+ regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+ instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
+ instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
+ emitAttr emitSize = emitTypeSize(simdType);
+
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, op1Reg, SHUFFLE_ZXXX);
+ getEmitter()->emitIns_R_I(shiftRightIns, emitSize, targetReg, 8);
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, op2Reg, SHUFFLE_XXZX);
+ getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, 8);
+ inst_RV_RV(INS_por, targetReg, tmpReg, simdType);
+ }
+ }
+ else
+ {
+ // We will generate the following:
+ // mov targetReg, op1Reg
+ // mov tmpReg, op2Reg
+ // psll? targetReg, shiftCount
+ // psrl? targetReg, shiftCount
+ // psll? tmpReg, shiftCount
+ // psrl? tmpReg, shiftCount
+ // <pack> targetReg, tmpReg
+ // Where shiftCount is the bit-size of the target baseType (i.e. half the bit-size of the source
+ // baseType), and <pack> is the appropriate instruction to pack the result (note that we have to
+ // truncate to get CLR type semantics; otherwise the pack would saturate).
+ //
+ int shiftCount = genTypeSize(baseType) * (BITS_IN_BYTE / 2);
+ instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
+ instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+ instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+
+ if (iset == InstructionSet_AVX)
+ {
+ regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+ regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+
+ // The AVX instructions generally operate on 128-bit "lanes", so we have to permute the
+ // inputs so that 'tmpReg2' has the low 128-bit halves of the two inputs, and
+ // 'tmpReg' has the high 128-bit halves.
+ getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg2, op1Reg, op2Reg, 0x20);
+ getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg, op1Reg, op2Reg, 0x31);
+ getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg2, shiftCount);
+ getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg2, shiftCount);
+ getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount);
+ getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg, shiftCount);
+ inst_RV_RV_RV(ins, targetReg, tmpReg2, tmpReg, emitActualTypeSize(simdType));
+ }
+ else
+ {
+ regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+
+ inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize);
+ inst_RV_RV(ins_Copy(simdType), tmpReg, op2Reg, simdType, emitSize);
+
+ instruction tmpShiftRight = shiftRightIns;
+ if ((baseType == TYP_INT || baseType == TYP_UINT) && iset == InstructionSet_SSE2)
+ {
+ // The SSE2 pack for (u)int saturates to the signed 16-bit range, so shift right
+ // arithmetically: the sign-extended low word is always in range and packs to the
+ // exact truncated bits.
+ tmpShiftRight = INS_psrad;
+ }
+
+ getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, targetReg, shiftCount);
+ getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, targetReg, shiftCount);
+ getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount);
+ getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, tmpReg, shiftCount);
+ inst_RV_RV(ins, targetReg, tmpReg, simdType);
+ }
+ }
+ genProduceReg(simdNode);
+}
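
For reference, the SSE2 long-to-int sequence above maps directly onto compiler intrinsics. A minimal stand-alone sketch, not part of the patch (the function name and use of <emmintrin.h> are illustrative); the AVX path differs only in how the 128-bit lanes are marshalled:

    #include <emmintrin.h> // SSE2
    // Narrow two vectors of 2 x int64 into one vector of 4 x int32, truncating.
    static __m128i narrow_long_to_int(__m128i lo, __m128i hi)
    {
        // pshufd ZXXX: low dwords of each 64-bit lane -> elements 2 and 3
        __m128i t = _mm_shuffle_epi32(lo, _MM_SHUFFLE(2, 0, 0, 0));
        t = _mm_srli_si128(t, 8); // psrldq: down to elements 0,1; top zeroed
        // pshufd XXZX: low dwords of each 64-bit lane -> elements 0 and 1
        __m128i u = _mm_shuffle_epi32(hi, _MM_SHUFFLE(0, 0, 2, 0));
        u = _mm_slli_si128(u, 8); // pslldq: up to elements 2,3; bottom zeroed
        return _mm_or_si128(t, u); // por: combine both halves
    }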
+
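The truncate-then-pack idiom in the general case can likewise be sketched with intrinsics; shown here for int -> short under SSE2, where the arithmetic right shift keeps packssdw from saturating (names are illustrative, not from the patch):

    #include <emmintrin.h> // SSE2
    // Narrow two vectors of 4 x int32 into one vector of 8 x int16 with CLR
    // (truncating) semantics. shiftCount = 16, the bit-size of the target type.
    static __m128i narrow_int_to_short(__m128i lo, __m128i hi)
    {
        lo = _mm_srai_epi32(_mm_slli_epi32(lo, 16), 16); // psll/psra: sign-extended low 16 bits
        hi = _mm_srai_epi32(_mm_slli_epi32(hi, 16), 16);
        return _mm_packs_epi32(lo, hi); // packssdw: values are in range, so no saturation
    }
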
//--------------------------------------------------------------------------------
// genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations
// add, sub, mul, bit-wise And, AndNot and Or.
@@ -1076,7 +1888,7 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
// Extract first and third double word results from tmpReg
// tmpReg = shuffle(0,0,2,0) of tmpReg
- getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg, tmpReg, 0x08);
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg, tmpReg, SHUFFLE_XXZX);
// targetReg[63:0] = op1[0] * op2[0]
// targetReg[127:64] = op1[2] * op2[2]
@@ -1085,7 +1897,7 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
// Extract first and third double word results from targetReg
// targetReg = shuffle(0,0,2,0) of targetReg
- getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), targetReg, targetReg, 0x08);
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), targetReg, targetReg, SHUFFLE_XXZX);
// pack the results into a single vector
inst_RV_RV(INS_punpckldq, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
@@ -1125,9 +1937,9 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
// These are 16 byte operations, so we subtract from 16 bytes, not the vector register length.
unsigned shiftCount = 16 - simdNode->gtSIMDSize;
assert(shiftCount != 0);
- instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+ instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
- ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
}
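
The shift pair above zeroes the upper (16 - gtSIMDSize) bytes of a sub-16-byte vector. A hedged intrinsics sketch for the Vector3 case, assuming shiftCount = 16 - 12 = 4 (function name is illustrative):

    #include <emmintrin.h> // SSE2
    // Clear the unused top 4 bytes of a 12-byte Vector3 payload.
    static __m128 zero_upper_vector3(__m128 v)
    {
        __m128i vi = _mm_castps_si128(v);
        vi = _mm_slli_si128(vi, 4); // pslldq: drop the top 4 bytes, zero-fill the bottom
        vi = _mm_srli_si128(vi, 4); // psrldq: shift back down, zero-fill the top
        return _mm_castsi128_ps(vi);
    }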
@@ -1834,7 +2646,7 @@ void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
if (byteShiftCnt != 0)
{
- instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+ instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), targetReg, byteShiftCnt);
}
}
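
GetItem uses the same byte-shift instruction to move the requested element into position 0 before reading it out; roughly as in this illustrative sketch (the index is fixed because psrldq takes an immediate):

    #include <emmintrin.h> // SSE2
    // Extract element 2 of a 4 x int32 vector: byteShiftCnt = 2 * sizeof(int32) = 8.
    static int get_item_2(__m128i v)
    {
        v = _mm_srli_si128(v, 8);    // psrldq: element 2 moves down to element 0
        return _mm_cvtsi128_si32(v); // movd: low element to a GPR
    }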
@@ -1904,7 +2716,7 @@ void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
inst_RV_RV(ins_Copy(simdType), tmpReg, srcReg, simdType, emitActualTypeSize(simdType));
}
- ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), tmpReg, byteShiftCnt);
}
else
@@ -2390,6 +3202,27 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
genSIMDIntrinsicUnOp(simdNode);
break;
+ case SIMDIntrinsicConvertToSingle:
+ case SIMDIntrinsicConvertToInt32:
+ case SIMDIntrinsicConvertToUInt32:
+ genSIMDIntrinsic32BitConvert(simdNode);
+ break;
+
+ case SIMDIntrinsicConvertToDouble:
+ case SIMDIntrinsicConvertToInt64:
+ case SIMDIntrinsicConvertToUInt64:
+ genSIMDIntrinsic64BitConvert(simdNode);
+ break;
+
+ case SIMDIntrinsicWidenLo:
+ case SIMDIntrinsicWidenHi:
+ genSIMDIntrinsicWiden(simdNode);
+ break;
+
+ case SIMDIntrinsicNarrow:
+ genSIMDIntrinsicNarrow(simdNode);
+ break;
+
case SIMDIntrinsicAdd:
case SIMDIntrinsicSub:
case SIMDIntrinsicMul:
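
Of the conversions dispatched above, the signed 32-bit ones correspond to single instructions; a minimal sketch (the unsigned variants need extra fix-up sequences, since pre-AVX-512 x86 has no direct uint32 <-> float vector conversion):

    #include <emmintrin.h> // SSE2
    static __m128  convert_to_single(__m128i v) { return _mm_cvtepi32_ps(v); }  // cvtdq2ps
    static __m128i convert_to_int32(__m128 v)   { return _mm_cvttps_epi32(v); } // cvttps2dq: truncates, matching C# casts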
diff --git a/src/jit/simdintrinsiclist.h b/src/jit/simdintrinsiclist.h
index 0160582892..2eb4df38ca 100644
--- a/src/jit/simdintrinsiclist.h
+++ b/src/jit/simdintrinsiclist.h
@@ -119,6 +119,23 @@ SIMD_INTRINSIC("ConditionalSelect", false, Select,
// Cast
SIMD_INTRINSIC("op_Explicit", false, Cast, "Cast", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_CHAR, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
+// Convert int/uint to single
+SIMD_INTRINSIC("ConvertToSingle", false, ConvertToSingle, "ConvertToSingle", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_INT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+// Convert long/ulong to double
+SIMD_INTRINSIC("ConvertToDouble", false, ConvertToDouble, "ConvertToDouble", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_LONG, TYP_ULONG, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+// Convert single to int
+SIMD_INTRINSIC("ConvertToInt32", false, ConvertToInt32, "ConvertToInt32", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+// Convert single to uint
+SIMD_INTRINSIC("ConvertToUInt32", false, ConvertToUInt32, "ConvertToUInt32", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+// Convert double to long
+SIMD_INTRINSIC("ConvertToInt64", false, ConvertToInt64, "ConvertToInt64", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_DOUBLE, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+// Convert double to ulong
+SIMD_INTRINSIC("ConvertToUInt64", false, ConvertToUInt64, "ConvertToUInt64", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_DOUBLE, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+// Narrow two input Vector<T>s to a single Vector<U>, where U has half the bit-width of T. The return value's lower elements come from src1, and the upper elements from src2.
+SIMD_INTRINSIC("Narrow", false, Narrow, "Narrow", TYP_STRUCT, 2, {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF}, {TYP_INT, TYP_DOUBLE, TYP_LONG, TYP_CHAR, TYP_SHORT, TYP_UINT, TYP_ULONG, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+// Widen one input Vector<T> to two Vector<U>s, where U has twice the bit-width of T: dest1 receives the widened lower half of the elements of src, and dest2 the widened upper half.
+SIMD_INTRINSIC("Widen", false, Widen, "Widen", TYP_VOID, 3, {TYP_STRUCT, TYP_BYREF, TYP_BYREF}, {TYP_INT, TYP_FLOAT, TYP_CHAR, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+
// Miscellaneous
SIMD_INTRINSIC("get_IsHardwareAccelerated", false, HWAccel, "HWAccel", TYP_BOOL, 0, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
@@ -134,7 +151,11 @@ SIMD_INTRINSIC("ShiftRightInternal", false, ShiftRightInternal,
SIMD_INTRINSIC("UpperSave", false, UpperSave, "UpperSave Internal", TYP_STRUCT, 2, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
SIMD_INTRINSIC("UpperRestore", false, UpperRestore, "UpperRestore Internal", TYP_STRUCT, 2, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-SIMD_INTRINSIC(nullptr, false, Invalid, "Invalid", TYP_UNDEF, 0, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+// Internal intrinsics for Widen
+SIMD_INTRINSIC("WidenHi", false, WidenHi, "WidenHi", TYP_VOID, 2, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_CHAR, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+SIMD_INTRINSIC("WidenLo", false, WidenLo, "WidenLo", TYP_VOID, 2, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_CHAR, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+
+SIMD_INTRINSIC(nullptr, false, Invalid, "Invalid", TYP_UNDEF, 0, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
#undef SIMD_INTRINSIC
#else //_TARGET_XARCH_
diff --git a/tests/src/JIT/SIMD/VectorConvert.cs b/tests/src/JIT/SIMD/VectorConvert.cs
index 6c65b22d4d..c2e4eb1206 100644
--- a/tests/src/JIT/SIMD/VectorConvert.cs
+++ b/tests/src/JIT/SIMD/VectorConvert.cs
@@ -559,6 +559,21 @@ partial class VectorTest
returnVal = Fail;
}
}
+
+ JitLog jitLog = new JitLog();
+ if (!jitLog.Check("System.Numerics.Vector:ConvertToInt32(struct):struct")) returnVal = Fail;
+ if (!jitLog.Check("System.Numerics.Vector:ConvertToUInt32(struct):struct")) returnVal = Fail;
+ if (!jitLog.Check("System.Numerics.Vector:ConvertToSingle(struct):struct")) returnVal = Fail;
+ // Note: SIMD Conversion to Int64/UInt64 is not supported on x86
+#if !BIT32
+ if (!jitLog.Check("System.Numerics.Vector:ConvertToInt64(struct):struct")) returnVal = Fail;
+ if (!jitLog.Check("System.Numerics.Vector:ConvertToUInt64(struct):struct")) returnVal = Fail;
+#endif // !BIT32
+ if (!jitLog.Check("System.Numerics.Vector:ConvertToDouble(struct):struct")) returnVal = Fail;
+ if (!jitLog.Check("System.Numerics.Vector:Narrow(struct,struct):struct")) returnVal = Fail;
+ if (!jitLog.Check("System.Numerics.Vector:Widen(struct,byref,byref)")) returnVal = Fail;
+ jitLog.Dispose();
+
return returnVal;
}
}
diff --git a/tests/src/JIT/SIMD/VectorConvert_r.csproj b/tests/src/JIT/SIMD/VectorConvert_r.csproj
index 01231e2729..db6fb24920 100644
--- a/tests/src/JIT/SIMD/VectorConvert_r.csproj
+++ b/tests/src/JIT/SIMD/VectorConvert_r.csproj
@@ -14,6 +14,9 @@
<SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
<NuGetPackageImportStamp>7a9bfb7d</NuGetPackageImportStamp>
</PropertyGroup>
+ <PropertyGroup Condition="'$(BuildArch)' == 'x86'">
+ <DefineConstants>BIT32;$(DefineConstants)</DefineConstants>
+ </PropertyGroup>
<!-- Default configurations to help VS understand the configurations -->
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' " />
diff --git a/tests/src/JIT/SIMD/VectorConvert_ro.csproj b/tests/src/JIT/SIMD/VectorConvert_ro.csproj
index f751b88ed1..82206efb9d 100644
--- a/tests/src/JIT/SIMD/VectorConvert_ro.csproj
+++ b/tests/src/JIT/SIMD/VectorConvert_ro.csproj
@@ -14,6 +14,9 @@
<SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
<NuGetPackageImportStamp>7a9bfb7d</NuGetPackageImportStamp>
</PropertyGroup>
+ <PropertyGroup Condition="'$(BuildArch)' == 'x86'">
+ <DefineConstants>BIT32;$(DefineConstants)</DefineConstants>
+ </PropertyGroup>
<!-- Default configurations to help VS understand the configurations -->
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' " />