diff options
Diffstat (limited to 'src/jit')
-rwxr-xr-x[-rw-r--r--] | src/jit/codegen.h | 3 | ||||
-rwxr-xr-x[-rw-r--r--] | src/jit/codegencommon.cpp | 27 | ||||
-rwxr-xr-x[-rw-r--r--] | src/jit/codegenxarch.cpp | 70 | ||||
-rwxr-xr-x[-rw-r--r--] | src/jit/ee_il_dll.cpp | 0 | ||||
-rwxr-xr-x[-rw-r--r--] | src/jit/lower.cpp | 16 |
5 files changed, 110 insertions, 6 deletions
diff --git a/src/jit/codegen.h b/src/jit/codegen.h index 2d7bc65597..ce08d9fa65 100644..100755 --- a/src/jit/codegen.h +++ b/src/jit/codegen.h @@ -281,6 +281,9 @@ protected: RegState *regState); void genEnregisterIncomingStackArgs(); void genCheckUseBlockInit(); +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) && defined(FEATURE_SIMD) + void genClearStackVec3ArgUpperBits(); +#endif //FEATURE_UNIX_AMD64_STRUCT_PASSING && FEATURE_SIMD #if defined(_TARGET_ARM64_) bool genInstrWithConstant(instruction ins, diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp index 0ce079c6d3..42a404afbe 100644..100755 --- a/src/jit/codegencommon.cpp +++ b/src/jit/codegencommon.cpp @@ -3931,9 +3931,20 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, // RyuJit backend is making another implicit assumption that Vector3 type args when passed in // registers or on stack, the upper most 4-bytes will be zero. // - // TODO-64bit: assumptions 1 and 2 hold within RyuJIT generated code. It is not clear whether - // these assumptions hold when a Vector3 type arg is passed by native code. Example: PInvoke - // returning Vector3 type value or RPInvoke passing Vector3 type args. + // For P/Invoke return and Reverse P/Invoke argument passing, native compiler doesn't guarantee + // that upper 4-bytes of a Vector3 type struct are zero initialized and hence assumption 2 is + // invalid. + // + // RyuJIT x64 Windows: arguments are treated as passed by ref and hence read/written just 12 + // bytes. In case of Vector3 returns, Caller allocates a zero initialized Vector3 local and + // passes it as the retBuf arg and Callee method writes only 12 bytes to retBuf. For this reason, + // there is no need to clear upper 4-bytes of Vector3 type args. + // + // RyuJIT x64 Unix: arguments are treated as passed by value and read/written as if TYP_SIMD16. + // Vector3 return values are returned in two return registers and Caller assembles them into a + // single xmm reg. 
Hence RyuJIT explicitly generates code to clear upper 4-bytes of Vector3 + // type args in prolog and Vector3 type return value of a call + if (varDsc->lvType == TYP_SIMD12) { regType = TYP_DOUBLE; @@ -8519,6 +8530,16 @@ void CodeGen::genFnProlog() genPrologPadForReJit(); getEmitter()->emitMarkPrologEnd(); } + +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) && defined(FEATURE_SIMD) + // The unused bits of Vector3 arguments must be cleared + // since native compiler doesn't initialize the upper bits to zeros. + // + // TODO-Cleanup: This logic can be implemented in + // genFnPrologCalleeRegArgs() for argument registers and + // genEnregisterIncomingStackArgs() for stack arguments. + genClearStackVec3ArgUpperBits(); +#endif //FEATURE_UNIX_AMD64_STRUCT_PASSING && FEATURE_SIMD /*----------------------------------------------------------------------------- * Take care of register arguments first diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp index b35268e225..25c9f7ce4f 100644..100755 --- a/src/jit/codegenxarch.cpp +++ b/src/jit/codegenxarch.cpp @@ -3680,6 +3680,61 @@ void CodeGen::genStructPutArgRepMovs(GenTreePutArgStk* putArgNode, unsigned base genConsumePutStructArgStk(putArgNode, REG_RDI, REG_RSI, REG_RCX, baseVarNum); instGen(INS_r_movsb); } + +//------------------------------------------------------------------------ +// If any Vector3 args are on stack and they are not pass-by-ref, the upper 32 bits +// must be cleared to zeroes. The native compiler doesn't clear the upper bits +// and there is no way to know if the caller is native or not. So, the upper +// 32 bits of Vector3 argument on stack are always cleared to zero. 
+#ifdef FEATURE_SIMD +void CodeGen::genClearStackVec3ArgUpperBits() +{ +#ifdef DEBUG + if (verbose) + printf("*************** In genClearStackVec3ArgUpperBits()\n"); +#endif + + assert(compiler->compGeneratingProlog); + + unsigned varNum = 0; + + for (unsigned varNum = 0; varNum < compiler->info.compArgsCount; varNum++) + { + LclVarDsc* varDsc = &(compiler->lvaTable[varNum]); + assert(varDsc->lvIsParam); + + // Does var have simd12 type? + if (varDsc->lvType != TYP_SIMD12) + { + continue; + } + + if (!varDsc->lvIsRegArg) + { + // Clear the upper 32 bits by mov dword ptr [V_ARG_BASE+0xC], 0 + getEmitter()->emitIns_S_I( + ins_Store(TYP_INT), + EA_4BYTE, + varNum, + genTypeSize(TYP_FLOAT) * 3, + 0); + + } + else + { + // Assume that for x64 linux, an argument is fully in registers + // or fully on stack. + regNumber argReg = varDsc->GetOtherArgReg(); + + // Clear the upper 32 bits by two shift instructions. + // argReg = argReg << 96 + getEmitter()->emitIns_R_I(INS_pslldq, emitActualTypeSize(TYP_SIMD12), argReg, 12); + // argReg = argReg >> 96 + getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(TYP_SIMD12), argReg, 12); + } + } +} +#endif // FEATURE_SIMD #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING // Generate code for CpObj nodes wich copy structs that have interleaved @@ -5948,6 +6003,21 @@ void CodeGen::genCallInstruction(GenTreePtr node) inst_RV_RV(ins_Copy(regType), allocatedReg, returnReg, regType); } } + +#ifdef FEATURE_SIMD + // A Vector3 return value is stored in xmm0 and xmm1. + // RyuJIT assumes that the upper unused bits of xmm1 are cleared but + // the native compiler doesn't guarantee it. + if (returnType == TYP_SIMD12) + { + returnReg = retTypeDesc->GetABIReturnReg(1); + // Clear the upper 32 bits by two shift instructions. 
+ // returnReg = returnReg << 96 + // returnReg = returnReg >> 96 + getEmitter()->emitIns_R_I(INS_pslldq, emitActualTypeSize(TYP_SIMD12), returnReg, 12); + getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(TYP_SIMD12), returnReg, 12); + } +#endif // FEATURE_SIMD } else { diff --git a/src/jit/ee_il_dll.cpp b/src/jit/ee_il_dll.cpp index b97c10cca4..b97c10cca4 100644..100755 --- a/src/jit/ee_il_dll.cpp +++ b/src/jit/ee_il_dll.cpp diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp index 1df4f3bbcd..8238ef446f 100644..100755 --- a/src/jit/lower.cpp +++ b/src/jit/lower.cpp @@ -694,9 +694,19 @@ void Lowering::LowerNode(GenTreePtr* ppTree, Compiler::fgWalkData* data) // RyuJit backend is making another implicit assumption that Vector3 type args when passed in // registers or on stack, the upper most 4-bytes will be zero. // - // TODO-64bit: assumptions 1 and 2 hold within RyuJIT generated code. It is not clear whether - // these assumptions hold when a Vector3 type arg is passed by native code. Example: PInvoke - // returning Vector3 type value or RPInvoke passing Vector3 type args. + // For P/Invoke return and Reverse P/Invoke argument passing, native compiler doesn't guarantee + // that upper 4-bytes of a Vector3 type struct are zero initialized and hence assumption 2 is + // invalid. + // + // RyuJIT x64 Windows: arguments are treated as passed by ref and hence read/written just 12 + // bytes. In case of Vector3 returns, Caller allocates a zero initialized Vector3 local and + // passes it as the retBuf arg and Callee method writes only 12 bytes to retBuf. For this reason, + // there is no need to clear upper 4-bytes of Vector3 type args. + // + // RyuJIT x64 Unix: arguments are treated as passed by value and read/written as if TYP_SIMD16. + // Vector3 return values are returned in two return registers and Caller assembles them into a + // single xmm reg. 
Hence RyuJIT explicitly generates code to clear upper 4-bytes of Vector3 + // type args in prolog and Vector3 type return value of a call (*ppTree)->gtType = TYP_SIMD16; #else NYI("Lowering of TYP_SIMD12 locals"); |