1 files changed, 87 insertions, 50 deletions
diff --git a/src/jit/simd.cpp b/src/jit/simd.cpp
index 1f0c867b55..39664c47bf 100644
--- a/src/jit/simd.cpp
+++ b/src/jit/simd.cpp
@@ -77,10 +77,10 @@ int Compiler::getSIMDVectorLength(CORINFO_CLASS_HANDLE typeHnd)
 //
 int Compiler::getSIMDTypeAlignment(var_types simdType)
 {
-#ifdef _TARGET_AMD64_
+#ifdef _TARGET_XARCH_
     // Fixed length vectors have the following alignment preference
-    // Vector2/3 = 8 byte alignment
-    // Vector4 = 16-byte alignment
+    // Vector2   = 8-byte alignment
+    // Vector3/4 = 16-byte alignment
     unsigned size = genTypeSize(simdType);
 
     // preferred alignment for SSE2 128-bit vectors is 16-bytes
@@ -88,13 +88,16 @@ int Compiler::getSIMDTypeAlignment(var_types simdType)
     {
         return 8;
     }
-
-    // As per Intel manual, AVX vectors preferred alignment is 32-bytes but on Amd64
-    // RSP/EBP is aligned at 16-bytes, therefore to align SIMD types at 32-bytes we need even
-    // RSP/EBP to be 32-byte aligned. It is not clear whether additional stack space used in
-    // aligning stack is worth the benefit and for now will use 16-byte alignment for AVX
-    // 256-bit vectors with unaligned load/stores to/from memory.
-    return 16;
+    else if (size <= 16)
+    {
+        assert((size == 12) || (size == 16));
+        return 16;
+    }
+    else
+    {
+        assert(size == 32);
+        return 32;
+    }
 #else
     assert(!"getSIMDTypeAlignment() unimplemented on target arch");
     unreached();
@@ -391,7 +394,6 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in
     CORINFO_CLASS_HANDLE typeHnd = *inOutTypeHnd;
     *baseType                    = getBaseTypeAndSizeOfSIMDType(typeHnd, sizeBytes);
 
-    bool isHWAcceleratedIntrinsic = false;
     if (typeHnd == SIMDVectorHandle)
     {
         // All of the supported intrinsics on this static class take a first argument that's a vector,
@@ -424,6 +426,16 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in
         return nullptr;
     }
 
+#ifdef _TARGET_X86_
+    // NYI: support LONG type SIMD intrinsics. Need support in long decomposition.
+    // (Don't use NYI fallback mechanism; just call the function.)
+    if ((*baseType == TYP_LONG) || (*baseType == TYP_ULONG))
+    {
+        JITDUMP("NYI: x86 long base type SIMD intrinsics\n");
+        return nullptr;
+    }
+#endif // _TARGET_X86_
+
     // account for implicit "this" arg
     *argCount = sig->numArgs;
     if (sig->hasThis())
@@ -525,7 +537,8 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in
                 // We don't check anything in that case.
                 if (!isThisPtr || !isNewObj)
                 {
-                    GenTreePtr arg = impStackTop(stackIndex).val;
+                    GenTreePtr arg     = impStackTop(stackIndex).val;
+                    var_types  argType = arg->TypeGet();
 
                     var_types expectedArgType;
                     if (argIndex < fixedArgCnt)
@@ -540,6 +553,7 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in
                         {
                             // The type of the argument will be genActualType(*baseType).
                             expectedArgType = genActualType(*baseType);
+                            argType         = genActualType(argType);
                         }
                     }
                     else
@@ -547,7 +561,6 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in
                         expectedArgType = *baseType;
                     }
 
-                    var_types argType = arg->TypeGet();
                     if (!isThisPtr && argType == TYP_I_IMPL)
                     {
                         // The reference implementation has a constructor that takes a pointer.
@@ -715,7 +728,7 @@ GenTreeSIMD* Compiler::impSIMDGetFixed(var_types simdType, var_types baseType, u
     return simdTree;
 }
 
-#ifdef _TARGET_AMD64_
+#ifdef _TARGET_XARCH_
 // impSIMDLongRelOpEqual: transforms operands and returns the SIMD intrinsic to be applied on
 // transformed operands to obtain == comparison result.
 //
@@ -741,7 +754,7 @@ SIMDIntrinsicID Compiler::impSIMDLongRelOpEqual(CORINFO_CLASS_HANDLE typeHnd,
     //
     // Equality(v1, v2):
     // tmp = (v1 == v2) i.e. compare for equality as if v1 and v2 are vector<int>
-    // result = BitwiseAnd(t, shuffle(t, (2, 3, 1 0)))
+    // result = BitwiseAnd(tmp, shuffle(tmp, (2, 3, 0, 1)))
     // Shuffle is meant to swap the comparison results of low-32-bits and high 32-bits of respective long elements.
 
     // Compare vector<long> as if they were vector<int> and assign the result to a temp
@@ -755,7 +768,7 @@ SIMDIntrinsicID Compiler::impSIMDLongRelOpEqual(CORINFO_CLASS_HANDLE typeHnd,
     // op2 = Shuffle(tmp, 0xB1)
     // IntrinsicId = BitwiseAnd
     *pOp1 = gtNewOperNode(GT_COMMA, simdType, asg, tmp);
-    *pOp2 = gtNewSIMDNode(simdType, gtNewLclvNode(lclNum, simdType), gtNewIconNode(SHUFFLE_ZWYX, TYP_INT),
+    *pOp2 = gtNewSIMDNode(simdType, gtNewLclvNode(lclNum, simdType), gtNewIconNode(SHUFFLE_ZWXY, TYP_INT),
                           SIMDIntrinsicShuffleSSE2, TYP_INT, size);
     return SIMDIntrinsicBitwiseAnd;
 }
@@ -971,7 +984,7 @@ SIMDIntrinsicID Compiler::impSIMDIntegralRelOpGreaterThanOrEqual(
 
     return SIMDIntrinsicBitwiseOr;
 }
-#endif //_TARGET_AMD64_
+#endif // _TARGET_XARCH_
 
 // Transforms operands and returns the SIMD intrinsic to be applied on
 // transformed operands to obtain given relop result.
@@ -999,7 +1012,7 @@ SIMDIntrinsicID Compiler::impSIMDRelOp(SIMDIntrinsicID      relOpIntrinsicId,
 
     assert(isRelOpSIMDIntrinsic(relOpIntrinsicId));
 
-#ifdef _TARGET_AMD64_
+#ifdef _TARGET_XARCH_
     SIMDIntrinsicID intrinsicID = relOpIntrinsicId;
     var_types       baseType    = *inOutBaseType;
 
@@ -1076,7 +1089,7 @@ SIMDIntrinsicID Compiler::impSIMDRelOp(SIMDIntrinsicID      relOpIntrinsicId,
             //
             // We need to treat op1 and op2 as signed for comparison purpose after
             // the transformation.
-            ssize_t constVal = 0;
+            __int64 constVal = 0;
             switch (baseType)
             {
                 case TYP_UBYTE:
@@ -1105,9 +1118,19 @@ SIMDIntrinsicID Compiler::impSIMDRelOp(SIMDIntrinsicID      relOpIntrinsicId,
             if (intrinsicID != SIMDIntrinsicEqual)
             {
                 // For constructing const vector use either long or int base type.
-                var_types tempBaseType = (baseType == TYP_ULONG) ? TYP_LONG : TYP_INT;
-                GenTree*  initVal      = gtNewIconNode(constVal);
-                initVal->gtType        = tempBaseType;
+                var_types tempBaseType;
+                GenTree*  initVal;
+                if (baseType == TYP_ULONG)
+                {
+                    tempBaseType = TYP_LONG;
+                    initVal      = gtNewLconNode(constVal);
+                }
+                else
+                {
+                    tempBaseType = TYP_INT;
+                    initVal      = gtNewIconNode((ssize_t)constVal);
+                }
+                initVal->gtType      = tempBaseType;
                 GenTree* constVector = gtNewSIMDNode(simdType, initVal, nullptr, SIMDIntrinsicInit, tempBaseType, size);
 
                 // Assign constVector to a temp, since we intend to use it more than once
@@ -1127,10 +1150,10 @@ SIMDIntrinsicID Compiler::impSIMDRelOp(SIMDIntrinsicID      relOpIntrinsicId,
     }
 
     return intrinsicID;
-#else
+#else  // !_TARGET_XARCH_
     assert(!"impSIMDRelOp() unimplemented on target arch");
     unreached();
-#endif //_TARGET_AMD64_
+#endif // !_TARGET_XARCH_
 }
 
 // Creates a GT_SIMD tree for Select operation
@@ -1210,7 +1233,7 @@ GenTreePtr Compiler::impSIMDMinMax(SIMDIntrinsicID      intrinsicId,
     var_types simdType = op1->TypeGet();
     assert(op2->TypeGet() == simdType);
 
-#ifdef _TARGET_AMD64_
+#ifdef _TARGET_XARCH_
     // SSE2 has direct support for float/double/signed word/unsigned byte.
     // For other integer types we compute min/max as follows
     //
@@ -1347,10 +1370,10 @@ GenTreePtr Compiler::impSIMDMinMax(SIMDIntrinsicID      intrinsicId,
 
     assert(simdTree != nullptr);
     return simdTree;
-#else
+#else  // !_TARGET_XARCH_
     assert(!"impSIMDMinMax() unimplemented on target arch");
     unreached();
-#endif //_TARGET_AMD64_
+#endif // !_TARGET_XARCH_
 }
 
 //------------------------------------------------------------------------
@@ -1791,6 +1814,8 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
             int            length       = getSIMDVectorLength(clsHnd);
             GenTreeIntCon* intConstTree = new (this, GT_CNS_INT) GenTreeIntCon(TYP_INT, length);
             retVal                      = intConstTree;
+
+            intConstTree->gtFlags |= GTF_ICON_SIMD_COUNT;
         }
         break;
 
@@ -2223,7 +2248,11 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
             assert(op2->TypeGet() == simdType);
 
             simdTree = gtNewSIMDNode(genActualType(callType), op1, op2, SIMDIntrinsicOpEquality, baseType, size);
-            retVal   = simdTree;
+            if (simdType == TYP_SIMD12)
+            {
+                simdTree->gtFlags |= GTF_SIMD12_OP;
+            }
+            retVal = simdTree;
         }
         break;
 
@@ -2234,7 +2263,11 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
             op2      = impSIMDPopStack(simdType);
             op1      = impSIMDPopStack(simdType, instMethod);
             simdTree = gtNewSIMDNode(genActualType(callType), op1, op2, SIMDIntrinsicOpInEquality, baseType, size);
-            retVal   = simdTree;
+            if (simdType == TYP_SIMD12)
+            {
+                simdTree->gtFlags |= GTF_SIMD12_OP;
+            }
+            retVal = simdTree;
         }
         break;
 
@@ -2262,7 +2295,7 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
         case SIMDIntrinsicBitwiseOr:
         case SIMDIntrinsicBitwiseXor:
         {
-#if defined(_TARGET_AMD64_) && defined(DEBUG)
+#if defined(_TARGET_XARCH_) && defined(DEBUG)
             // check for the cases where we don't support intrinsics.
             // This check should be done before we make modifications to type stack.
             // Note that this is more of a double safety check for robustness since
@@ -2290,7 +2323,7 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
                     return nullptr;
                 }
             }
-#endif //_TARGET_AMD64_ && DEBUG
+#endif // _TARGET_XARCH_ && DEBUG
 
             // op1 is the first operand; if instance method, op1 is "this" arg
             // op2 is the second operand
@@ -2331,9 +2364,9 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
         {
             // op1 is a SIMD variable that is "this" arg
             // op2 is an index of TYP_INT
-            op2                       = impSIMDPopStack(TYP_INT);
-            op1                       = impSIMDPopStack(simdType, instMethod);
-            unsigned int vectorLength = getSIMDVectorLength(size, baseType);
+            op2              = impSIMDPopStack(TYP_INT);
+            op1              = impSIMDPopStack(simdType, instMethod);
+            int vectorLength = getSIMDVectorLength(size, baseType);
             if (!op2->IsCnsIntOrI() || op2->AsIntCon()->gtIconVal >= vectorLength)
             {
                 // We need to bounds-check the length of the vector.
@@ -2366,15 +2399,15 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
 
         case SIMDIntrinsicDotProduct:
         {
-#if defined(_TARGET_AMD64_) && defined(DEBUG)
-            // Right now dot product is supported only for float vectors.
-            // See SIMDIntrinsicList.h for supported base types for this intrinsic.
-            if (!varTypeIsFloating(baseType))
+#if defined(_TARGET_XARCH_)
+            // Right now dot product is supported only for float/double vectors and
+            // int vectors on SSE4/AVX.
+            if (!varTypeIsFloating(baseType) &&
+                !(baseType == TYP_INT && getSIMDInstructionSet() >= InstructionSet_SSE3_4))
             {
-                assert(!"Dot product on integer type vectors not supported");
                 return nullptr;
             }
-#endif //_TARGET_AMD64_ && DEBUG
+#endif // _TARGET_XARCH_
 
             // op1 is a SIMD variable that is the first source and also "this" arg.
             // op2 is a SIMD variable which is the second source.
@@ -2382,13 +2415,17 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
             op1 = impSIMDPopStack(simdType, instMethod);
 
             simdTree = gtNewSIMDNode(baseType, op1, op2, simdIntrinsicID, baseType, size);
-            retVal   = simdTree;
+            if (simdType == TYP_SIMD12)
+            {
+                simdTree->gtFlags |= GTF_SIMD12_OP;
+            }
+            retVal = simdTree;
         }
         break;
 
         case SIMDIntrinsicSqrt:
         {
-#if defined(_TARGET_AMD64_) && defined(DEBUG)
+#if defined(_TARGET_XARCH_) && defined(DEBUG)
             // SSE/AVX doesn't support sqrt on integer type vectors and hence
             // should never be seen as an intrinsic here. See SIMDIntrinsicList.h
             // for supported base types for this intrinsic.
@@ -2397,7 +2434,7 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
                 assert(!"Sqrt not supported on integer vectors\n");
                 return nullptr;
             }
-#endif // _TARGET_AMD64_ && DEBUG
+#endif // _TARGET_XARCH_ && DEBUG
 
             op1 = impSIMDPopStack(simdType);
 
@@ -2409,7 +2446,7 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
         {
             op1 = impSIMDPopStack(simdType);
 
-#ifdef _TARGET_AMD64_
+#ifdef _TARGET_XARCH_
             if (varTypeIsFloating(baseType))
             {
                 // Abs(vf) = vf & new SIMDVector<float>(0x7fffffff);
@@ -2448,10 +2485,10 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
                 unreached();
             }
 
-#else //!_TARGET_AMD64_
-            assert(!"Abs intrinsic on non-Amd64 target not implemented");
+#else // !_TARGET_XARCH_
+            assert(!"Abs intrinsic on non-xarch target not implemented");
             unreached();
-#endif //!_TARGET_AMD64_
+#endif // !_TARGET_XARCH_
         }
         break;
 
@@ -2524,15 +2561,15 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
             return nullptr;
     }
 
-#ifdef _TARGET_AMD64_
-    // Amd64: also indicate that we use floating point registers.
+#ifdef _TARGET_XARCH_
+    // XArch: also indicate that we use floating point registers.
     // The need for setting this here is that a method may not have SIMD
     // type lclvars, but might be exercising SIMD intrinsics on fields of
     // SIMD type.
     //
     // e.g.  public Vector<float> ComplexVecFloat::sqabs() { return this.r * this.r + this.i * this.i; }
     compFloatingPointUsed = true;
-#endif
+#endif // _TARGET_XARCH_
 
     // At this point, we have a tree that we are going to store into a destination.
     // TODO-1stClassStructs: This should be a simple store or assignment, and should not require