ARM64: Fix for Multiplication with Overflow Check

For 4 byte integer multiplication, JIT emits a bad-code which is valid only for 8 byte (i8) multiplication. For the fix, I use ```smull```(signed)/```umull```(unsigned) instructions that contain 8 byte results from 4 byte by 4 byte multiplication. So only one multiplication is needed instead of two for this case. By simply shifting the results, we could get the upper results that is used to detect overflow. Similar transform is made for the unsigned case. Lower is also changed to reserve a register for overflow check. Before smulh w10, w8, w9 --> Incorrect use: smulh is for obtaining the upper bits [127:64] of x8 * x9 mul w8, w8, w9 cmp x10, x8, ASR #63 After smull x8, x8, x9 --> x8 = w8 * w9 lsr x10, x8, #32 --> shift the upper bit of x8 to get sign bit cmp w10, w8, ASR #31 --> check sign bit
author: Kyungwoo Lee <kyulee@microsoft.com> 2016-03-01 09:33:13 -0800
committer: Kyungwoo Lee <kyulee@microsoft.com> 2016-03-02 10:13:22 -0800
commit: 445a810039d0801d099fd5ba543876456b248c47 (patch)
tree: 2294b740b801df48e6b16b6f1f9e8120a3fadc89
parent: 9c8484734621f36d2e4113644e56109cb76192e3 (diff)
download: coreclr-445a810039d0801d099fd5ba543876456b248c47.tar.gz
coreclr-445a810039d0801d099fd5ba543876456b248c47.tar.bz2
coreclr-445a810039d0801d099fd5ba543876456b248c47.zip
3 files changed, 50 insertions, 34 deletions
diff --git a/src/jit/emitarm64.cpp b/src/jit/emitarm64.cpp
index 2b203499b4..d817b75c07 100644
--- a/src/jit/emitarm64.cpp
+++ b/src/jit/emitarm64.cpp
@@ -10824,7 +10824,6 @@ regNumber emitter::emitInsTernary(instruction ins, emitAttr attr, GenTree* dst,
     }
     bool isMulOverflow = false;
     bool isUnsignedMul = false;
-    instruction ins2 = INS_invalid;
     regNumber extraReg = REG_NA;
     if (dst->gtOverflowEx())
     {
@@ -10840,7 +10839,6 @@ regNumber emitter::emitInsTernary(instruction ins, emitAttr attr, GenTree* dst,
         {
             isMulOverflow = true;
             isUnsignedMul = ((dst->gtFlags & GTF_UNSIGNED) != 0);
-            ins2 = isUnsignedMul ? INS_umulh : INS_smulh;
             assert(intConst == nullptr);   // overflow format doesn't support an int constant operand
         }
         else
@@ -10856,43 +10854,66 @@ regNumber emitter::emitInsTernary(instruction ins, emitAttr attr, GenTree* dst,
     {
         if (isMulOverflow)
         {
+            // Make sure that we have an internal register
+            assert(genCountBits(dst->gtRsvdRegs) == 2);
+
+            // There will be two bits set in tmpRegsMask.
+            // Remove the bit for 'dst->gtRegNum' from 'tmpRegsMask'
+            regMaskTP tmpRegsMask = dst->gtRsvdRegs & ~genRegMask(dst->gtRegNum);
+            assert(tmpRegsMask != RBM_NONE);
+            regMaskTP tmpRegMask = genFindLowestBit(tmpRegsMask);   // set tmpRegMsk to a one-bit mask
+            extraReg = genRegNumFromMask(tmpRegMask);               // set tmpReg from that mask
+
             if (isUnsignedMul)
             {
-                assert(genCountBits(dst->gtRsvdRegs) == 1);
-                extraReg = genRegNumFromMask(dst->gtRsvdRegs);
+                if (attr == EA_4BYTE)
+                {
+                    // Compute 8 byte results from 4 byte by 4 byte multiplication.
+                    emitIns_R_R_R(INS_umull, EA_8BYTE, dst->gtRegNum, src1->gtRegNum, src2->gtRegNum);
 
-                // Compute the high result
-                emitIns_R_R_R(ins2, attr, extraReg, src1->gtRegNum, src2->gtRegNum);
+                    // Get the high result by shifting dst.
+                    emitIns_R_R_I(INS_lsr, EA_8BYTE, extraReg, dst->gtRegNum, 32);
+                }
+                else
+                {
+                    assert(attr == EA_8BYTE);
+                    // Compute the high result.
+                    emitIns_R_R_R(INS_umulh, attr, extraReg, src1->gtRegNum, src2->gtRegNum);
 
-                emitIns_R_I(INS_cmp, EA_8BYTE, extraReg, 0);
-                codeGen->genCheckOverflow(dst);
+                    // Now multiply without skewing the high result.
+                    emitIns_R_R_R(ins, attr, dst->gtRegNum, src1->gtRegNum, src2->gtRegNum);
+                }
 
-                // Now multiply without skewing the high result if no overflow.
-                emitIns_R_R_R(ins, attr, dst->gtRegNum, src1->gtRegNum, src2->gtRegNum);
+                // zero-sign bit comparision to detect overflow.
+                emitIns_R_I(INS_cmp, attr, extraReg, 0);
             }
             else
             {
-                // Make sure that we have an internal register
-                assert(genCountBits(dst->gtRsvdRegs) == 2);
-
-                // There will be two bits set in tmpRegsMask.
-                // Remove the bit for 'dst->gtRegNum' from 'tmpRegsMask'
-                regMaskTP tmpRegsMask = dst->gtRsvdRegs & ~genRegMask(dst->gtRegNum);
-                regMaskTP tmpRegMask = genFindLowestBit(tmpRegsMask);   // set tmpRegMsk to a one-bit mask
-                extraReg = genRegNumFromMask(tmpRegMask);               // set tmpReg from that mask
+                int bitShift = 0;
+                if (attr == EA_4BYTE)
+                {
+                    // Compute 8 byte results from 4 byte by 4 byte multiplication.
+                    emitIns_R_R_R(INS_smull, EA_8BYTE, dst->gtRegNum, src1->gtRegNum, src2->gtRegNum);
 
-                // Make sure the two registers are not the same.
-                assert(extraReg != dst->gtRegNum);
+                    // Get the high result by shifting dst.
+                    emitIns_R_R_I(INS_lsr, EA_8BYTE, extraReg, dst->gtRegNum, 32);
 
-                // Save the high result in a temporary register
-                emitIns_R_R_R(ins2, attr, extraReg, src1->gtRegNum, src2->gtRegNum);
+                    bitShift = 31;
+                }
+                else
+                {
+                    assert(attr == EA_8BYTE);
+                    // Save the high result in a temporary register.
+                    emitIns_R_R_R(INS_smulh, attr, extraReg, src1->gtRegNum, src2->gtRegNum);
 
-                // Now multiply without skewing the high result.
-                emitIns_R_R_R(ins, attr, dst->gtRegNum, src1->gtRegNum, src2->gtRegNum);
+                    // Now multiply without skewing the high result.
+                    emitIns_R_R_R(ins, attr, dst->gtRegNum, src1->gtRegNum, src2->gtRegNum);
 
-                emitIns_R_R_I(INS_cmp, EA_8BYTE, extraReg, dst->gtRegNum, 63, INS_OPTS_ASR);
+                    bitShift = 63;
+                }
 
-                codeGen->genCheckOverflow(dst);
+                // Sign bit comparision to detect overflow.
+                emitIns_R_R_I(INS_cmp, attr, extraReg, dst->gtRegNum, bitShift, INS_OPTS_ASR);
             }
         }
         else
@@ -10902,7 +10923,7 @@ regNumber emitter::emitInsTernary(instruction ins, emitAttr attr, GenTree* dst,
         }
     }
 
-    if (dst->gtOverflowEx() && !isMulOverflow)
+    if (dst->gtOverflowEx())
     {
         assert(!varTypeIsFloating(dst));
         codeGen->genCheckOverflow(dst);
diff --git a/src/jit/lowerarm64.cpp b/src/jit/lowerarm64.cpp
index 294b9c2d68..87bc04ecab 100644
--- a/src/jit/lowerarm64.cpp
+++ b/src/jit/lowerarm64.cpp
@@ -347,12 +347,7 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
             break;
    
         case GT_MUL:
-            if ((tree->gtFlags & GTF_UNSIGNED) != 0)
-            {
-                // unsigned mul should only need one register
-                info->internalIntCount = 1;
-            }
-            else if (tree->gtOverflow())
+            if (tree->gtOverflow())
             {
                 // Need a register different from target reg to check
                 // for signed overflow.
diff --git a/tests/arm64/Tests.lst b/tests/arm64/Tests.lst
index e32b91ce46..56a6daea7b 100644
--- a/tests/arm64/Tests.lst
+++ b/tests/arm64/Tests.lst
@@ -5247,7 +5247,7 @@ WorkingDir=JIT\Regression\CLR-x86-JIT\V1-M09.5-PDC\b32093\b32093
 MaxAllowedDurationSeconds=600
 HostStyle=Any
 Expected=100
-Categories=JIT;EXPECTED_FAIL;NEED_TRIAGE
+Categories=JIT;EXPECTED_PASS
 [lclfldrem_cs_ro.exe_2875]
 RelativePath=JIT\Directed\coverage\oldtests\lclfldrem_cs_ro\lclfldrem_cs_ro.exe
 WorkingDir=JIT\Directed\coverage\oldtests\lclfldrem_cs_ro
author	Kyungwoo Lee <kyulee@microsoft.com>	2016-03-01 09:33:13 -0800
committer	Kyungwoo Lee <kyulee@microsoft.com>	2016-03-02 10:13:22 -0800
commit	445a810039d0801d099fd5ba543876456b248c47 (patch)
tree	2294b740b801df48e6b16b6f1f9e8120a3fadc89
parent	9c8484734621f36d2e4113644e56109cb76192e3 (diff)
download	coreclr-445a810039d0801d099fd5ba543876456b248c47.tar.gz coreclr-445a810039d0801d099fd5ba543876456b248c47.tar.bz2 coreclr-445a810039d0801d099fd5ba543876456b248c47.zip