diff options
Diffstat (limited to 'src/jit/simd.cpp')
-rw-r--r-- | src/jit/simd.cpp | 137 |
1 file changed, 87 insertions, 50 deletions
diff --git a/src/jit/simd.cpp b/src/jit/simd.cpp index 1f0c867b55..39664c47bf 100644 --- a/src/jit/simd.cpp +++ b/src/jit/simd.cpp @@ -77,10 +77,10 @@ int Compiler::getSIMDVectorLength(CORINFO_CLASS_HANDLE typeHnd) // int Compiler::getSIMDTypeAlignment(var_types simdType) { -#ifdef _TARGET_AMD64_ +#ifdef _TARGET_XARCH_ // Fixed length vectors have the following alignment preference - // Vector2/3 = 8 byte alignment - // Vector4 = 16-byte alignment + // Vector2 = 8 byte alignment + // Vector3/4 = 16-byte alignment unsigned size = genTypeSize(simdType); // preferred alignment for SSE2 128-bit vectors is 16-bytes @@ -88,13 +88,16 @@ int Compiler::getSIMDTypeAlignment(var_types simdType) { return 8; } - - // As per Intel manual, AVX vectors preferred alignment is 32-bytes but on Amd64 - // RSP/EBP is aligned at 16-bytes, therefore to align SIMD types at 32-bytes we need even - // RSP/EBP to be 32-byte aligned. It is not clear whether additional stack space used in - // aligning stack is worth the benefit and for now will use 16-byte alignment for AVX - // 256-bit vectors with unaligned load/stores to/from memory. - return 16; + else if (size <= 16) + { + assert((size == 12) || (size == 16)); + return 16; + } + else + { + assert(size == 32); + return 32; + } #else assert(!"getSIMDTypeAlignment() unimplemented on target arch"); unreached(); @@ -391,7 +394,6 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in CORINFO_CLASS_HANDLE typeHnd = *inOutTypeHnd; *baseType = getBaseTypeAndSizeOfSIMDType(typeHnd, sizeBytes); - bool isHWAcceleratedIntrinsic = false; if (typeHnd == SIMDVectorHandle) { // All of the supported intrinsics on this static class take a first argument that's a vector, @@ -424,6 +426,16 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in return nullptr; } +#ifdef _TARGET_X86_ + // NYI: support LONG type SIMD intrinsics. Need support in long decomposition. 
+ // (Don't use NYI fallback mechanism; just call the function.) + if ((*baseType == TYP_LONG) || (*baseType == TYP_ULONG)) + { + JITDUMP("NYI: x86 long base type SIMD intrinsics\n"); + return nullptr; + } +#endif // _TARGET_X86_ + // account for implicit "this" arg *argCount = sig->numArgs; if (sig->hasThis()) @@ -525,7 +537,8 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in // We don't check anything in that case. if (!isThisPtr || !isNewObj) { - GenTreePtr arg = impStackTop(stackIndex).val; + GenTreePtr arg = impStackTop(stackIndex).val; + var_types argType = arg->TypeGet(); var_types expectedArgType; if (argIndex < fixedArgCnt) @@ -540,6 +553,7 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in { // The type of the argument will be genActualType(*baseType). expectedArgType = genActualType(*baseType); + argType = genActualType(argType); } } else @@ -547,7 +561,6 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in expectedArgType = *baseType; } - var_types argType = arg->TypeGet(); if (!isThisPtr && argType == TYP_I_IMPL) { // The reference implementation has a constructor that takes a pointer. @@ -715,7 +728,7 @@ GenTreeSIMD* Compiler::impSIMDGetFixed(var_types simdType, var_types baseType, u return simdTree; } -#ifdef _TARGET_AMD64_ +#ifdef _TARGET_XARCH_ // impSIMDLongRelOpEqual: transforms operands and returns the SIMD intrinsic to be applied on // transformed operands to obtain == comparison result. // @@ -741,7 +754,7 @@ SIMDIntrinsicID Compiler::impSIMDLongRelOpEqual(CORINFO_CLASS_HANDLE typeHnd, // // Equality(v1, v2): // tmp = (v1 == v2) i.e. compare for equality as if v1 and v2 are vector<int> - // result = BitwiseAnd(t, shuffle(t, (2, 3, 1 0))) + // result = BitwiseAnd(t, shuffle(t, (2, 3, 0, 1))) // Shuffle is meant to swap the comparison results of low-32-bits and high 32-bits of respective long elements. 
// Compare vector<long> as if they were vector<int> and assign the result to a temp @@ -755,7 +768,7 @@ SIMDIntrinsicID Compiler::impSIMDLongRelOpEqual(CORINFO_CLASS_HANDLE typeHnd, // op2 = Shuffle(tmp, 0xB1) // IntrinsicId = BitwiseAnd *pOp1 = gtNewOperNode(GT_COMMA, simdType, asg, tmp); - *pOp2 = gtNewSIMDNode(simdType, gtNewLclvNode(lclNum, simdType), gtNewIconNode(SHUFFLE_ZWYX, TYP_INT), + *pOp2 = gtNewSIMDNode(simdType, gtNewLclvNode(lclNum, simdType), gtNewIconNode(SHUFFLE_ZWXY, TYP_INT), SIMDIntrinsicShuffleSSE2, TYP_INT, size); return SIMDIntrinsicBitwiseAnd; } @@ -971,7 +984,7 @@ SIMDIntrinsicID Compiler::impSIMDIntegralRelOpGreaterThanOrEqual( return SIMDIntrinsicBitwiseOr; } -#endif //_TARGET_AMD64_ +#endif // _TARGET_XARCH_ // Transforms operands and returns the SIMD intrinsic to be applied on // transformed operands to obtain given relop result. @@ -999,7 +1012,7 @@ SIMDIntrinsicID Compiler::impSIMDRelOp(SIMDIntrinsicID relOpIntrinsicId, assert(isRelOpSIMDIntrinsic(relOpIntrinsicId)); -#ifdef _TARGET_AMD64_ +#ifdef _TARGET_XARCH_ SIMDIntrinsicID intrinsicID = relOpIntrinsicId; var_types baseType = *inOutBaseType; @@ -1076,7 +1089,7 @@ SIMDIntrinsicID Compiler::impSIMDRelOp(SIMDIntrinsicID relOpIntrinsicId, // // We need to treat op1 and op2 as signed for comparison purpose after // the transformation. - ssize_t constVal = 0; + __int64 constVal = 0; switch (baseType) { case TYP_UBYTE: @@ -1105,9 +1118,19 @@ SIMDIntrinsicID Compiler::impSIMDRelOp(SIMDIntrinsicID relOpIntrinsicId, if (intrinsicID != SIMDIntrinsicEqual) { // For constructing const vector use either long or int base type. - var_types tempBaseType = (baseType == TYP_ULONG) ? 
TYP_LONG : TYP_INT; - GenTree* initVal = gtNewIconNode(constVal); - initVal->gtType = tempBaseType; + var_types tempBaseType; + GenTree* initVal; + if (baseType == TYP_ULONG) + { + tempBaseType = TYP_LONG; + initVal = gtNewLconNode(constVal); + } + else + { + tempBaseType = TYP_INT; + initVal = gtNewIconNode((ssize_t)constVal); + } + initVal->gtType = tempBaseType; GenTree* constVector = gtNewSIMDNode(simdType, initVal, nullptr, SIMDIntrinsicInit, tempBaseType, size); // Assign constVector to a temp, since we intend to use it more than once @@ -1127,10 +1150,10 @@ SIMDIntrinsicID Compiler::impSIMDRelOp(SIMDIntrinsicID relOpIntrinsicId, } return intrinsicID; -#else +#else // !_TARGET_XARCH_ assert(!"impSIMDRelOp() unimplemented on target arch"); unreached(); -#endif //_TARGET_AMD64_ +#endif // !_TARGET_XARCH_ } // Creates a GT_SIMD tree for Select operation @@ -1210,7 +1233,7 @@ GenTreePtr Compiler::impSIMDMinMax(SIMDIntrinsicID intrinsicId, var_types simdType = op1->TypeGet(); assert(op2->TypeGet() == simdType); -#ifdef _TARGET_AMD64_ +#ifdef _TARGET_XARCH_ // SSE2 has direct support for float/double/signed word/unsigned byte. 
// For other integer types we compute min/max as follows // @@ -1347,10 +1370,10 @@ GenTreePtr Compiler::impSIMDMinMax(SIMDIntrinsicID intrinsicId, assert(simdTree != nullptr); return simdTree; -#else +#else // !_TARGET_XARCH_ assert(!"impSIMDMinMax() unimplemented on target arch"); unreached(); -#endif //_TARGET_AMD64_ +#endif // !_TARGET_XARCH_ } //------------------------------------------------------------------------ @@ -1791,6 +1814,8 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode, int length = getSIMDVectorLength(clsHnd); GenTreeIntCon* intConstTree = new (this, GT_CNS_INT) GenTreeIntCon(TYP_INT, length); retVal = intConstTree; + + intConstTree->gtFlags |= GTF_ICON_SIMD_COUNT; } break; @@ -2223,7 +2248,11 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode, assert(op2->TypeGet() == simdType); simdTree = gtNewSIMDNode(genActualType(callType), op1, op2, SIMDIntrinsicOpEquality, baseType, size); - retVal = simdTree; + if (simdType == TYP_SIMD12) + { + simdTree->gtFlags |= GTF_SIMD12_OP; + } + retVal = simdTree; } break; @@ -2234,7 +2263,11 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode, op2 = impSIMDPopStack(simdType); op1 = impSIMDPopStack(simdType, instMethod); simdTree = gtNewSIMDNode(genActualType(callType), op1, op2, SIMDIntrinsicOpInEquality, baseType, size); - retVal = simdTree; + if (simdType == TYP_SIMD12) + { + simdTree->gtFlags |= GTF_SIMD12_OP; + } + retVal = simdTree; } break; @@ -2262,7 +2295,7 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode, case SIMDIntrinsicBitwiseOr: case SIMDIntrinsicBitwiseXor: { -#if defined(_TARGET_AMD64_) && defined(DEBUG) +#if defined(_TARGET_XARCH_) && defined(DEBUG) // check for the cases where we don't support intrinsics. // This check should be done before we make modifications to type stack. 
// Note that this is more of a double safety check for robustness since @@ -2290,7 +2323,7 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode, return nullptr; } } -#endif //_TARGET_AMD64_ && DEBUG +#endif // _TARGET_XARCH_ && DEBUG // op1 is the first operand; if instance method, op1 is "this" arg // op2 is the second operand @@ -2331,9 +2364,9 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode, { // op1 is a SIMD variable that is "this" arg // op2 is an index of TYP_INT - op2 = impSIMDPopStack(TYP_INT); - op1 = impSIMDPopStack(simdType, instMethod); - unsigned int vectorLength = getSIMDVectorLength(size, baseType); + op2 = impSIMDPopStack(TYP_INT); + op1 = impSIMDPopStack(simdType, instMethod); + int vectorLength = getSIMDVectorLength(size, baseType); if (!op2->IsCnsIntOrI() || op2->AsIntCon()->gtIconVal >= vectorLength) { // We need to bounds-check the length of the vector. @@ -2366,15 +2399,15 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode, case SIMDIntrinsicDotProduct: { -#if defined(_TARGET_AMD64_) && defined(DEBUG) - // Right now dot product is supported only for float vectors. - // See SIMDIntrinsicList.h for supported base types for this intrinsic. - if (!varTypeIsFloating(baseType)) +#if defined(_TARGET_XARCH_) + // Right now dot product is supported only for float/double vectors and + // int vectors on SSE4/AVX. + if (!varTypeIsFloating(baseType) && + !(baseType == TYP_INT && getSIMDInstructionSet() >= InstructionSet_SSE3_4)) { - assert(!"Dot product on integer type vectors not supported"); return nullptr; } -#endif //_TARGET_AMD64_ && DEBUG +#endif // _TARGET_XARCH_ // op1 is a SIMD variable that is the first source and also "this" arg. // op2 is a SIMD variable which is the second source. 
@@ -2382,13 +2415,17 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode, op1 = impSIMDPopStack(simdType, instMethod); simdTree = gtNewSIMDNode(baseType, op1, op2, simdIntrinsicID, baseType, size); - retVal = simdTree; + if (simdType == TYP_SIMD12) + { + simdTree->gtFlags |= GTF_SIMD12_OP; + } + retVal = simdTree; } break; case SIMDIntrinsicSqrt: { -#if defined(_TARGET_AMD64_) && defined(DEBUG) +#if defined(_TARGET_XARCH_) && defined(DEBUG) // SSE/AVX doesn't support sqrt on integer type vectors and hence // should never be seen as an intrinsic here. See SIMDIntrinsicList.h // for supported base types for this intrinsic. @@ -2397,7 +2434,7 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode, assert(!"Sqrt not supported on integer vectors\n"); return nullptr; } -#endif // _TARGET_AMD64_ && DEBUG +#endif // _TARGET_XARCH_ && DEBUG op1 = impSIMDPopStack(simdType); @@ -2409,7 +2446,7 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode, { op1 = impSIMDPopStack(simdType); -#ifdef _TARGET_AMD64_ +#ifdef _TARGET_XARCH_ if (varTypeIsFloating(baseType)) { // Abs(vf) = vf & new SIMDVector<float>(0x7fffffff); @@ -2448,10 +2485,10 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode, unreached(); } -#else //!_TARGET_AMD64_ - assert(!"Abs intrinsic on non-Amd64 target not implemented"); +#else // !_TARGET_XARCH_ + assert(!"Abs intrinsic on non-xarch target not implemented"); unreached(); -#endif //!_TARGET_AMD64_ +#endif // !_TARGET_XARCH_ } break; @@ -2524,15 +2561,15 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode, return nullptr; } -#ifdef _TARGET_AMD64_ - // Amd64: also indicate that we use floating point registers. +#ifdef _TARGET_XARCH_ + // XArch: also indicate that we use floating point registers. // The need for setting this here is that a method may not have SIMD // type lclvars, but might be exercising SIMD intrinsics on fields of // SIMD type. // // e.g. 
public Vector<float> ComplexVecFloat::sqabs() { return this.r * this.r + this.i * this.i; } compFloatingPointUsed = true; -#endif +#endif // _TARGET_XARCH_ // At this point, we have a tree that we are going to store into a destination. // TODO-1stClassStructs: This should be a simple store or assignment, and should not require |