diff options
author | Bruce Forstall <brucefo@microsoft.com> | 2019-02-07 15:19:06 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-02-07 15:19:06 -0800 |
commit | aa8e508302816656477d2ba4a1ec691dfb7af9b1 (patch) | |
tree | d0d22f5ed9d13d28ce5e61d3e692a039c0c4a5ce | |
parent | 34d50b056150ba9101ee977dded9cee3c663fab6 (diff) | |
parent | c4ac1bc112d927de89e00810ea0fff39dc92f9fe (diff) | |
download | coreclr-aa8e508302816656477d2ba4a1ec691dfb7af9b1.tar.gz coreclr-aa8e508302816656477d2ba4a1ec691dfb7af9b1.tar.bz2 coreclr-aa8e508302816656477d2ba4a1ec691dfb7af9b1.zip |
Merge pull request #22023 from BruceForstall/FixGSWithLocalloc
Fix ARM64 GS with localloc
-rw-r--r-- | src/jit/codegen.h | 18 | ||||
-rw-r--r-- | src/jit/codegenarm64.cpp | 601 | ||||
-rw-r--r-- | src/jit/codegencommon.cpp | 496 | ||||
-rw-r--r-- | src/jit/codegeninterface.h | 5 | ||||
-rw-r--r-- | src/jit/compiler.cpp | 28 | ||||
-rw-r--r-- | src/jit/compiler.h | 9 | ||||
-rw-r--r-- | src/jit/jitconfigvalues.h | 31 | ||||
-rw-r--r-- | src/jit/lclvars.cpp | 296 | ||||
-rw-r--r-- | src/jit/utils.cpp | 188 | ||||
-rw-r--r-- | src/jit/utils.h | 54 |
10 files changed, 1302 insertions, 424 deletions
diff --git a/src/jit/codegen.h b/src/jit/codegen.h index 4db09e4f06..c1fb7b451d 100644 --- a/src/jit/codegen.h +++ b/src/jit/codegen.h @@ -279,10 +279,6 @@ protected: void genEpilogRestoreReg(regNumber reg1, int spOffset, int spDelta, regNumber tmpReg, bool* pTmpRegIsZero); -#ifdef DEBUG - static void genCheckSPOffset(bool isRegsCountOdd, int spOffset, int slotSize); -#endif // DEBUG - // A simple struct to keep register pairs for prolog and epilog. struct RegPair { @@ -305,12 +301,8 @@ protected: static int genGetSlotSizeForRegsInMask(regMaskTP regsMask); - void genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, - int spDelta, - int spOffset DEBUGARG(bool isRegsToSaveCountOdd)); - void genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, - int spDelta, - int spOffset DEBUGARG(bool isRegsToRestoreCountOdd)); + void genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, int spOffset); + void genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, int spOffset); void genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowestCalleeSavedOffset, int spDelta); void genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, int lowestCalleeSavedOffset, int spDelta); @@ -515,6 +507,12 @@ protected: void genAmd64EmitterUnitTests(); #endif +#ifdef _TARGET_ARM64_ + virtual void SetSaveFpLrWithAllCalleeSavedRegisters(bool value); + virtual bool IsSaveFpLrWithAllCalleeSavedRegisters(); + bool genSaveFpLrWithAllCalleeSavedRegisters; +#endif // _TARGET_ARM64_ + //------------------------------------------------------------------------- // // End prolog/epilog generation diff --git a/src/jit/codegenarm64.cpp b/src/jit/codegenarm64.cpp index 1e71256389..a9caf107a1 100644 --- a/src/jit/codegenarm64.cpp +++ b/src/jit/codegenarm64.cpp @@ -392,33 +392,6 @@ void CodeGen::genEpilogRestoreReg(regNumber reg1, int spOffset, int spDelta, reg } } -#ifdef DEBUG -//------------------------------------------------------------------------ -// 
genCheckSPOffset: Check Stack Pointer(SP) offset value, -// it must be 8 to account for alignment for the odd count -// or it must be 0 for the even count. -// -// Arguments: -// isRegsCountOdd - true if number of registers to save/restore is odd; -// spOffset - stack pointer offset value; -// slotSize - stack slot size in bytes. -// -// static -void CodeGen::genCheckSPOffset(bool isRegsCountOdd, int spOffset, int slotSize) -{ - if (isRegsCountOdd) - { - // The offset must be 8 to account for alignment for the odd count. - assert(spOffset == slotSize); - } - else - { - // The offset must be 0 for the even count. - assert(spOffset == 0); - } -} -#endif // DEBUG - //------------------------------------------------------------------------ // genBuildRegPairsStack: Build a stack of register pairs for prolog/epilog save/restore for the given mask. // The first register pair will contain the lowest register. Register pairs will combine neighbor @@ -454,15 +427,24 @@ void CodeGen::genBuildRegPairsStack(regMaskTP regsMask, ArrayStack<RegPair>* reg regNumber reg2 = genRegNumFromMask(reg2Mask); if (reg2 == REG_NEXT(reg1)) { - // Both registers must have the same type to be saved as pair. - if (genIsValidFloatReg(reg1) == genIsValidFloatReg(reg2)) + // The JIT doesn't allow saving pair (R28,FP), even though the + // save_regp register pair unwind code specification allows it. + // The JIT always saves (FP,LR) as a pair, and uses the save_fplr + // unwind code. This only comes up in stress mode scenarios + // where callee-saved registers are not allocated completely + // from lowest-to-highest, without gaps. + if (reg1 != REG_R28) { - isPairSave = true; + // Both registers must have the same type to be saved as pair. 
+ if (genIsValidFloatReg(reg1) == genIsValidFloatReg(reg2)) + { + isPairSave = true; - regsMask &= ~reg2Mask; - regsCount -= 1; + regsMask &= ~reg2Mask; + regsCount -= 1; - regStack->Push(RegPair(reg1, reg2)); + regStack->Push(RegPair(reg1, reg2)); + } } } } @@ -531,7 +513,7 @@ void CodeGen::genSetUseSaveNextPairs(ArrayStack<RegPair>* regStack) // static int CodeGen::genGetSlotSizeForRegsInMask(regMaskTP regsMask) { - assert((regsMask & (RBM_CALLEE_SAVED | RBM_LR)) == regsMask); // Do not expect anything else. + assert((regsMask & (RBM_CALLEE_SAVED | RBM_FP | RBM_LR)) == regsMask); // Do not expect anything else. static_assert_no_msg(REGSIZE_BYTES == FPSAVE_REGSIZE_BYTES); return REGSIZE_BYTES; @@ -544,21 +526,11 @@ int CodeGen::genGetSlotSizeForRegsInMask(regMaskTP regsMask) // regsMask - a mask of registers for prolog generation; // spDelta - if non-zero, the amount to add to SP before the first register save (or together with it); // spOffset - the offset from SP that is the beginning of the callee-saved register area; -// isRegsToSaveCountOdd - (DEBUG only) true if number of registers to save is odd. // -void CodeGen::genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, - int spDelta, - int spOffset DEBUGARG(bool isRegsToSaveCountOdd)) +void CodeGen::genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, int spOffset) { const int slotSize = genGetSlotSizeForRegsInMask(regsMask); -#ifdef DEBUG - if (spDelta != 0) // The first store change SP offset, check its value before. 
- { - genCheckSPOffset(isRegsToSaveCountOdd, spOffset, slotSize); - } -#endif // DEBUG - ArrayStack<RegPair> regStack(compiler->getAllocator(CMK_Codegen)); genBuildRegPairsStack(regsMask, &regStack); @@ -586,14 +558,23 @@ void CodeGen::genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, //------------------------------------------------------------------------ // genSaveCalleeSavedRegistersHelp: Save the callee-saved registers in 'regsToSaveMask' to the stack frame -// in the function or funclet prolog. The save set does not contain FP, since that is -// guaranteed to be saved separately, so we can set up chaining. We can only use the instructions -// that are allowed by the unwind codes. Integer registers are stored at lower addresses, -// FP/SIMD registers are stored at higher addresses. The caller ensures that +// in the function or funclet prolog. Registers are saved in register number order from low addresses +// to high addresses. This means that integer registers are saved at lower addresses than floatint-point/SIMD +// registers. However, when genSaveFpLrWithAllCalleeSavedRegisters is true, the integer registers are stored +// at higher addresses than floating-point/SIMD registers, that is, the relative order of these two classes +// is reveresed. This is done to put the saved frame pointer very high in the frame, for simplicity. +// +// TODO: We could always put integer registers at the higher addresses, if desired, to remove this special +// case. It would cause many asm diffs when first implemented. +// +// If establishing frame pointer chaining, it must be done after saving the callee-saved registers. +// +// We can only use the instructions that are allowed by the unwind codes. The caller ensures that // there is enough space on the frame to store these registers, and that the store instructions -// we need to use (STR or STP) are encodable with the stack-pointer immediate offsets we need to -// use. 
The caller can tell us to fold in a stack pointer adjustment, which we will do with the first instruction. Note -// that the stack pointer adjustment must be by a multiple of 16 to preserve the invariant that the +// we need to use (STR or STP) are encodable with the stack-pointer immediate offsets we need to use. +// +// The caller can tell us to fold in a stack pointer adjustment, which we will do with the first instruction. +// Note that the stack pointer adjustment must be by a multiple of 16 to preserve the invariant that the // stack pointer is always 16 byte aligned. If we are saving an odd number of callee-saved // registers, though, we will have an empty aligment slot somewhere. It turns out we will put // it below (at a lower address) the callee-saved registers, as that is currently how we @@ -609,7 +590,7 @@ void CodeGen::genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, // zero). // // Notes: -// the save set can contain LR in which case LR is saved along with the other callee-saved registers. +// The save set can contain LR in which case LR is saved along with the other callee-saved registers. // But currently Jit doesn't use frames without frame pointer on arm64. // void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowestCalleeSavedOffset, int spDelta) @@ -628,18 +609,33 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe } assert((spDelta % 16) == 0); - assert((regsToSaveMask & RBM_FP) == 0); // We never save FP here. - // We also save LR, even though it is not in RBM_CALLEE_SAVED. - assert(regsToSaveCount <= genCountBits(RBM_CALLEE_SAVED | RBM_LR)); + // We also can save FP and LR, even though they are not in RBM_CALLEE_SAVED. 
+ assert(regsToSaveCount <= genCountBits(RBM_CALLEE_SAVED | RBM_FP | RBM_LR)); -#ifdef DEBUG - bool isRegsToSaveCountOdd = ((regsToSaveCount % 2) != 0); -#endif // DEBUG + if (genSaveFpLrWithAllCalleeSavedRegisters) + { + // TODO: always save int regs higher than float, to be consistent? + regMaskTP maskSaveRegsFloat = regsToSaveMask & RBM_ALLFLOAT; + regMaskTP maskSaveRegsInt = regsToSaveMask & ~maskSaveRegsFloat; - int spOffset = lowestCalleeSavedOffset; // this is the offset *after* we change SP. + if (maskSaveRegsFloat != RBM_NONE) + { + genSaveCalleeSavedRegisterGroup(maskSaveRegsFloat, spDelta, lowestCalleeSavedOffset); + spDelta = 0; + lowestCalleeSavedOffset += genCountBits(maskSaveRegsFloat) * FPSAVE_REGSIZE_BYTES; + } - genSaveCalleeSavedRegisterGroup(regsToSaveMask, spDelta, spOffset DEBUGARG(isRegsToSaveCountOdd)); + if (maskSaveRegsInt != RBM_NONE) + { + genSaveCalleeSavedRegisterGroup(maskSaveRegsInt, spDelta, lowestCalleeSavedOffset); + // No need to update spDelta, lowestCalleeSavedOffset since they're not used after this. 
+ } + } + else + { + genSaveCalleeSavedRegisterGroup(regsToSaveMask, spDelta, lowestCalleeSavedOffset); + } } //------------------------------------------------------------------------ @@ -650,9 +646,7 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe // spDelta - if non-zero, the amount to add to SP after the last register restore (or together with it); // spOffset - the offset from SP that is the beginning of the callee-saved register area; // -void CodeGen::genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, - int spDelta, - int spOffset DEBUGARG(bool isRegsToRestoreCountOdd)) +void CodeGen::genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, int spOffset) { const int slotSize = genGetSlotSizeForRegsInMask(regsMask); @@ -685,13 +679,6 @@ void CodeGen::genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, genEpilogRestoreReg(regPair.reg1, spOffset, stackDelta, REG_IP1, nullptr); } } - -#ifdef DEBUG - if (stackDelta != 0) // The last restore (the first save) changes SP offset, check its value after. - { - genCheckSPOffset(isRegsToRestoreCountOdd, spOffset, slotSize); - } -#endif // DEBUG } //------------------------------------------------------------------------ @@ -741,21 +728,40 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in } assert((spDelta % 16) == 0); - assert((regsToRestoreMask & RBM_FP) == 0); // We never restore FP here. - // We also restore LR, even though it is not in RBM_CALLEE_SAVED. - assert(regsToRestoreCount <= genCountBits(RBM_CALLEE_SAVED | RBM_LR)); + // We also can restore FP and LR, even though they are not in RBM_CALLEE_SAVED. + assert(regsToRestoreCount <= genCountBits(RBM_CALLEE_SAVED | RBM_FP | RBM_LR)); -#ifdef DEBUG - bool isRegsToRestoreCountOdd = ((regsToRestoreCount % 2) != 0); -#endif // DEBUG + // Point past the end, to start. We predecrement to find the offset to load from. 
+ static_assert_no_msg(REGSIZE_BYTES == FPSAVE_REGSIZE_BYTES); + int spOffset = lowestCalleeSavedOffset + regsToRestoreCount * REGSIZE_BYTES; + + if (genSaveFpLrWithAllCalleeSavedRegisters) + { + // TODO: always save int regs higher than float, to be consistent? + regMaskTP maskRestoreRegsFloat = regsToRestoreMask & RBM_ALLFLOAT; + regMaskTP maskRestoreRegsInt = regsToRestoreMask & ~maskRestoreRegsFloat; - assert(REGSIZE_BYTES == FPSAVE_REGSIZE_BYTES); - int spOffset = lowestCalleeSavedOffset + regsToRestoreCount * REGSIZE_BYTES; // Point past the end, to start. We - // predecrement to find the offset to - // load from. + // Restore in the opposite order of saving. + + if (maskRestoreRegsInt != RBM_NONE) + { + int spIntDelta = (maskRestoreRegsFloat != RBM_NONE) ? 0 : spDelta; // should we delay the SP adjustment? + genRestoreCalleeSavedRegisterGroup(maskRestoreRegsInt, spIntDelta, spOffset); + spOffset -= genCountBits(maskRestoreRegsInt) * REGSIZE_BYTES; + } - genRestoreCalleeSavedRegisterGroup(regsToRestoreMask, spDelta, spOffset DEBUGARG(isRegsToRestoreCountOdd)); + if (maskRestoreRegsFloat != RBM_NONE) + { + // If there is any spDelta, it must be used here. + genRestoreCalleeSavedRegisterGroup(maskRestoreRegsFloat, spDelta, spOffset); + // No need to update spOffset since it's not used after this. + } + } + else + { + genRestoreCalleeSavedRegisterGroup(regsToRestoreMask, spDelta, spOffset); + } } // clang-format off @@ -780,16 +786,17 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in * * Frame type 1: * For #outsz == 0 and #framesz <= 512: - * stp fp,lr,[sp,-#framesz]! ; establish the frame, save FP/LR + * stp fp,lr,[sp,-#framesz]! 
; establish the frame (predecrement by #framesz), save FP/LR * stp x19,x20,[sp,#xxx] ; save callee-saved registers, as necessary * * The funclet frame is thus: * * | | * |-----------------------| - * | incoming | - * | arguments | + * | incoming arguments | * +=======================+ <---- Caller's SP + * | Varargs regs space | // Only for varargs main functions; 64 bytes + * |-----------------------| * |Callee saved registers | // multiple of 8 bytes * |-----------------------| * | PSP slot | // 8 bytes (omitted in CoreRT ABI) @@ -798,9 +805,9 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in * |-----------------------| * | Saved FP, LR | // 16 bytes * |-----------------------| <---- Ambient SP - * | | | - * ~ | Stack grows ~ - * | | downward | + * | | | + * ~ | Stack grows ~ + * | | downward | * V * * Frame type 2: @@ -813,9 +820,10 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in * * | | * |-----------------------| - * | incoming | - * | arguments | + * | incoming arguments | * +=======================+ <---- Caller's SP + * | Varargs regs space | // Only for varargs main functions; 64 bytes + * |-----------------------| * |Callee saved registers | // multiple of 8 bytes * |-----------------------| * | PSP slot | // 8 bytes (omitted in CoreRT ABI) @@ -826,14 +834,15 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in * |-----------------------| * | Outgoing arg space | // multiple of 8 bytes * |-----------------------| <---- Ambient SP - * | | | - * ~ | Stack grows ~ - * | | downward | + * | | | + * ~ | Stack grows ~ + * | | downward | * V * * Frame type 3: * For #framesz > 512: - * stp fp,lr,[sp,- (#framesz - #outsz)]! ; establish the frame, save FP/LR: note that it is guaranteed here that (#framesz - #outsz) <= 168 + * stp fp,lr,[sp,- (#framesz - #outsz)]! 
; establish the frame, save FP/LR + * ; note that it is guaranteed here that (#framesz - #outsz) <= 240 * stp x19,x20,[sp,#xxx] ; save callee-saved registers, as necessary * sub sp,sp,#outsz ; create space for outgoing argument space * @@ -841,9 +850,10 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in * * | | * |-----------------------| - * | incoming | - * | arguments | + * | incoming arguments | * +=======================+ <---- Caller's SP + * | Varargs regs space | // Only for varargs main functions; 64 bytes + * |-----------------------| * |Callee saved registers | // multiple of 8 bytes * |-----------------------| * | PSP slot | // 8 bytes (omitted in CoreRT ABI) @@ -856,24 +866,123 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in * |-----------------------| * | Outgoing arg space | // multiple of 8 bytes * |-----------------------| <---- Ambient SP - * | | | - * ~ | Stack grows ~ - * | | downward | + * | | | + * ~ | Stack grows ~ + * | | downward | * V * * Both #1 and #2 only change SP once. That means that there will be a maximum of one alignment slot needed. For the general case, #3, * it is possible that we will need to add alignment to both changes to SP, leading to 16 bytes of alignment. Remember that the stack - * pointer needs to be 16 byte aligned at all times. The size of the PSP slot plus callee-saved registers space is a maximum of 168 bytes: - * (1 PSP slot + 12 integer registers + 8 FP/SIMD registers) * 8 bytes. The outgoing argument size, however, can be very large, if we call a - * function that takes a large number of arguments (note that we currently use the same outgoing argument space size in the funclet as for the main - * function, even if the funclet doesn't have any calls, or has a much smaller, or larger, maximum number of outgoing arguments for any call). 
- * In that case, we need to 16-byte align the initial change to SP, before saving off the callee-saved registers and establishing the PSPsym, - * so we can use the limited immediate offset encodings we have available, before doing another 16-byte aligned SP adjustment to create the - * outgoing argument space. Both changes to SP might need to add alignment padding. + * pointer needs to be 16 byte aligned at all times. The size of the PSP slot plus callee-saved registers space is a maximum of 240 bytes: + * + * FP,LR registers + * 10 int callee-saved register x19-x28 + * 8 float callee-saved registers v8-v15 + * 8 saved integer argument registers x0-x7, if varargs function + * 1 PSP slot + * 1 alignment slot + * == 30 slots * 8 bytes = 240 bytes. + * + * The outgoing argument size, however, can be very large, if we call a function that takes a large number of + * arguments (note that we currently use the same outgoing argument space size in the funclet as for the main + * function, even if the funclet doesn't have any calls, or has a much smaller, or larger, maximum number of + * outgoing arguments for any call). In that case, we need to 16-byte align the initial change to SP, before + * saving off the callee-saved registers and establishing the PSPsym, so we can use the limited immediate offset + * encodings we have available, before doing another 16-byte aligned SP adjustment to create the outgoing argument + * space. Both changes to SP might need to add alignment padding. + * + * In addition to the above "standard" frames, we also need to support a frame where the saved FP/LR are at the + * highest addresses. This is to match the frame layout (specifically, callee-saved registers including FP/LR + * and the PSPSym) that is used in the main function when a GS cookie is required due to the use of localloc. + * (Note that localloc cannot be used in a funclet.) 
In these variants, not only has the position of FP/LR + * changed, but where the alignment padding is placed has also changed. + * + * Frame type 4 (variant of frame types 1 and 2): + * For #framesz <= 512: + * sub sp,sp,#framesz ; establish the frame + * stp x19,x20,[sp,#xxx] ; save callee-saved registers, as necessary + * stp fp,lr,[sp,#yyy] ; save FP/LR. + * ; write PSPSym + * + * The "#framesz <= 512" condition ensures that after we've established the frame, we can use "stp" with its + * maximum allowed offset (504) to save the callee-saved register at the highest address. + * + * We use "sub" instead of folding it into the next instruction as a predecrement, as we need to write PSPSym + * at the bottom of the stack, and there might also be an alignment padding slot. + * + * The funclet frame is thus: + * + * | | + * |-----------------------| + * | incoming arguments | + * +=======================+ <---- Caller's SP + * | Varargs regs space | // Only for varargs main functions; 64 bytes + * |-----------------------| + * | Saved LR | // 8 bytes + * |-----------------------| + * | Saved FP | // 8 bytes + * |-----------------------| + * |Callee saved registers | // multiple of 8 bytes + * |-----------------------| + * | PSP slot | // 8 bytes (omitted in CoreRT ABI) + * |-----------------------| + * ~ alignment padding ~ // To make the whole frame 16 byte aligned. + * |-----------------------| + * | Outgoing arg space | // multiple of 8 bytes (optional; if #outsz > 0) + * |-----------------------| <---- Ambient SP + * | | | + * ~ | Stack grows ~ + * | | downward | + * V + * + * Frame type 5 (variant of frame type 3): + * For #framesz > 512: + * sub sp,sp,(#framesz - #outsz) ; establish part of the frame. Note that it is guaranteed here that (#framesz - #outsz) <= 240 + * stp x19,x20,[sp,#xxx] ; save callee-saved registers, as necessary + * stp fp,lr,[sp,#yyy] ; save FP/LR. 
+ * sub sp,sp,#outsz ; create space for outgoing argument space + * ; write PSPSym + * + * For large frames with "#framesz > 512", we must do one SP adjustment first, after which we can save callee-saved + * registers with up to the maximum "stp" offset of 504. Then, we can establish the rest of the frame (namely, the + * space for the outgoing argument space). + * + * The funclet frame is thus: + * + * | | + * |-----------------------| + * | incoming arguments | + * +=======================+ <---- Caller's SP + * | Varargs regs space | // Only for varargs main functions; 64 bytes + * |-----------------------| + * | Saved LR | // 8 bytes + * |-----------------------| + * | Saved FP | // 8 bytes + * |-----------------------| + * |Callee saved registers | // multiple of 8 bytes + * |-----------------------| + * | PSP slot | // 8 bytes (omitted in CoreRT ABI) + * |-----------------------| + * ~ alignment padding ~ // To make the first SP subtraction 16 byte aligned + * |-----------------------| + * ~ alignment padding ~ // To make the whole frame 16 byte aligned (specifically, to 16-byte align the outgoing argument space). + * |-----------------------| + * | Outgoing arg space | // multiple of 8 bytes + * |-----------------------| <---- Ambient SP + * | | | + * ~ | Stack grows ~ + * | | downward | + * V + * + * Note that in this case we might have 16 bytes of alignment that is adjacent. This is because we are doing 2 SP + * subtractions, and each one must be aligned up to 16 bytes. * * Note that in all cases, the PSPSym is in exactly the same position with respect to Caller-SP, and that location is the same relative to Caller-SP * as in the main function. * + * Funclets do not have varargs arguments. However, because the PSPSym must exist at the same offset from Caller-SP as in the main function, we + * must add buffer space for the saved varargs argument registers here, if the main function did the same. 
+ * * ; After this header, fill the PSP slot, for use by the VM (it gets reported with the GC info), or by code generation of nested filters. * ; This is not part of the "OS prolog"; it has no associated unwind data, and is not reversed in the funclet epilog. * @@ -919,27 +1028,6 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in * ldp fp,lr,[sp],#framesz * ret lr * - * The funclet frame is thus: - * - * | | - * |-----------------------| - * | incoming | - * | arguments | - * +=======================+ <---- Caller's SP - * |Callee saved registers | // multiple of 8 bytes - * |-----------------------| - * | PSP slot | // 8 bytes (omitted in CoreRT ABI) - * |-----------------------| - * | Saved FP, LR | // 16 bytes - * |-----------------------| - * ~ alignment padding ~ // To make the whole frame 16 byte aligned. - * |-----------------------| - * | Outgoing arg space | // multiple of 8 bytes - * |-----------------------| <---- Ambient SP - * | | | - * ~ | Stack grows ~ - * | | downward | - * V */ // clang-format on @@ -982,14 +1070,14 @@ void CodeGen::genFuncletProlog(BasicBlock* block) maskArgRegsLiveIn = RBM_R0; } - int lowestCalleeSavedOffset = genFuncletInfo.fiSP_to_CalleeSave_delta; - if (genFuncletInfo.fiFrameType == 1) { getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, genFuncletInfo.fiSpDelta1, INS_OPTS_PRE_INDEX); compiler->unwindSaveRegPairPreindexed(REG_FP, REG_LR, genFuncletInfo.fiSpDelta1); + maskSaveRegsInt &= ~(RBM_LR | RBM_FP); // We've saved these now + assert(genFuncletInfo.fiSpDelta2 == 0); assert(genFuncletInfo.fiSP_to_FPLR_save_delta == 0); } @@ -1007,21 +1095,40 @@ void CodeGen::genFuncletProlog(BasicBlock* block) getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, genFuncletInfo.fiSP_to_FPLR_save_delta); compiler->unwindSaveRegPair(REG_FP, REG_LR, genFuncletInfo.fiSP_to_FPLR_save_delta); + + maskSaveRegsInt &= ~(RBM_LR | RBM_FP); // We've saved these now } 
- else + else if (genFuncletInfo.fiFrameType == 3) { - assert(genFuncletInfo.fiFrameType == 3); getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, genFuncletInfo.fiSpDelta1, INS_OPTS_PRE_INDEX); compiler->unwindSaveRegPairPreindexed(REG_FP, REG_LR, genFuncletInfo.fiSpDelta1); - lowestCalleeSavedOffset += genFuncletInfo.fiSpDelta2; // We haven't done the second adjustment of SP yet. + maskSaveRegsInt &= ~(RBM_LR | RBM_FP); // We've saved these now + } + else if (genFuncletInfo.fiFrameType == 4) + { + // fiFrameType==4 constraints: + assert(genFuncletInfo.fiSpDelta1 < 0); + assert(genFuncletInfo.fiSpDelta1 >= -512); + + // generate sub SP,SP,imm + genStackPointerAdjustment(genFuncletInfo.fiSpDelta1, REG_NA, nullptr); + + assert(genFuncletInfo.fiSpDelta2 == 0); + } + else + { + assert(genFuncletInfo.fiFrameType == 5); + + // Nothing to do here; the first SP adjustment will be done by saving the callee-saved registers. } - maskSaveRegsInt &= ~(RBM_LR | RBM_FP); // We've saved these now + int lowestCalleeSavedOffset = genFuncletInfo.fiSP_to_CalleeSave_delta + + genFuncletInfo.fiSpDelta2; // We haven't done the second adjustment of SP yet (if any) genSaveCalleeSavedRegistersHelp(maskSaveRegsInt | maskSaveRegsFloat, lowestCalleeSavedOffset, 0); - if (genFuncletInfo.fiFrameType == 3) + if ((genFuncletInfo.fiFrameType == 3) || (genFuncletInfo.fiFrameType == 5)) { // Note that genFuncletInfo.fiSpDelta2 is always a negative value assert(genFuncletInfo.fiSpDelta2 < 0); @@ -1033,43 +1140,42 @@ void CodeGen::genFuncletProlog(BasicBlock* block) // This is the end of the OS-reported prolog for purposes of unwinding compiler->unwindEndProlog(); - // If there is no PSPSym (CoreRT ABI), we are done. 
- if (compiler->lvaPSPSym == BAD_VAR_NUM) - { - return; - } - - if (isFilter) - { - // This is the first block of a filter - // Note that register x1 = CallerSP of the containing function - // X1 is overwritten by the first Load (new callerSP) - // X2 is scratch when we have a large constant offset - - // Load the CallerSP of the main function (stored in the PSP of the dynamically containing funclet or function) - genInstrWithConstant(ins_Load(TYP_I_IMPL), EA_PTRSIZE, REG_R1, REG_R1, - genFuncletInfo.fiCallerSP_to_PSP_slot_delta, REG_R2, false); - regSet.verifyRegUsed(REG_R1); - - // Store the PSP value (aka CallerSP) - genInstrWithConstant(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_R1, REG_SPBASE, - genFuncletInfo.fiSP_to_PSP_slot_delta, REG_R2, false); - - // re-establish the frame pointer - genInstrWithConstant(INS_add, EA_PTRSIZE, REG_FPBASE, REG_R1, genFuncletInfo.fiFunction_CallerSP_to_FP_delta, - REG_R2, false); - } - else // This is a non-filter funclet + // If there is no PSPSym (CoreRT ABI), we are done. Otherwise, we need to set up the PSPSym in the functlet frame. 
+ if (compiler->lvaPSPSym != BAD_VAR_NUM) { - // X3 is scratch, X2 can also become scratch + if (isFilter) + { + // This is the first block of a filter + // Note that register x1 = CallerSP of the containing function + // X1 is overwritten by the first Load (new callerSP) + // X2 is scratch when we have a large constant offset + + // Load the CallerSP of the main function (stored in the PSP of the dynamically containing funclet or + // function) + genInstrWithConstant(ins_Load(TYP_I_IMPL), EA_PTRSIZE, REG_R1, REG_R1, + genFuncletInfo.fiCallerSP_to_PSP_slot_delta, REG_R2, false); + regSet.verifyRegUsed(REG_R1); + + // Store the PSP value (aka CallerSP) + genInstrWithConstant(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_R1, REG_SPBASE, + genFuncletInfo.fiSP_to_PSP_slot_delta, REG_R2, false); + + // re-establish the frame pointer + genInstrWithConstant(INS_add, EA_PTRSIZE, REG_FPBASE, REG_R1, + genFuncletInfo.fiFunction_CallerSP_to_FP_delta, REG_R2, false); + } + else // This is a non-filter funclet + { + // X3 is scratch, X2 can also become scratch - // compute the CallerSP, given the frame pointer. x3 is scratch. - genInstrWithConstant(INS_add, EA_PTRSIZE, REG_R3, REG_FPBASE, -genFuncletInfo.fiFunction_CallerSP_to_FP_delta, - REG_R2, false); - regSet.verifyRegUsed(REG_R3); + // compute the CallerSP, given the frame pointer. x3 is scratch. 
+ genInstrWithConstant(INS_add, EA_PTRSIZE, REG_R3, REG_FPBASE, + -genFuncletInfo.fiFunction_CallerSP_to_FP_delta, REG_R2, false); + regSet.verifyRegUsed(REG_R3); - genInstrWithConstant(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_R3, REG_SPBASE, - genFuncletInfo.fiSP_to_PSP_slot_delta, REG_R2, false); + genInstrWithConstant(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_R3, REG_SPBASE, + genFuncletInfo.fiSP_to_PSP_slot_delta, REG_R2, false); + } } } @@ -1103,22 +1209,21 @@ void CodeGen::genFuncletEpilog() assert((maskRestoreRegsInt & RBM_LR) != 0); assert((maskRestoreRegsInt & RBM_FP) != 0); - maskRestoreRegsInt &= ~(RBM_LR | RBM_FP); // We restore FP/LR at the end - - int lowestCalleeSavedOffset = genFuncletInfo.fiSP_to_CalleeSave_delta; - - if (genFuncletInfo.fiFrameType == 3) + if ((genFuncletInfo.fiFrameType == 3) || (genFuncletInfo.fiFrameType == 5)) { // Note that genFuncletInfo.fiSpDelta2 is always a negative value assert(genFuncletInfo.fiSpDelta2 < 0); // generate add SP,SP,imm genStackPointerAdjustment(-genFuncletInfo.fiSpDelta2, REG_R2, nullptr); - - lowestCalleeSavedOffset += genFuncletInfo.fiSpDelta2; } regMaskTP regsToRestoreMask = maskRestoreRegsInt | maskRestoreRegsFloat; + if ((genFuncletInfo.fiFrameType == 1) || (genFuncletInfo.fiFrameType == 2) || (genFuncletInfo.fiFrameType == 3)) + { + regsToRestoreMask &= ~(RBM_LR | RBM_FP); // We restore FP/LR at the end + } + int lowestCalleeSavedOffset = genFuncletInfo.fiSP_to_CalleeSave_delta + genFuncletInfo.fiSpDelta2; genRestoreCalleeSavedRegistersHelp(regsToRestoreMask, lowestCalleeSavedOffset, 0); if (genFuncletInfo.fiFrameType == 1) @@ -1145,14 +1250,34 @@ void CodeGen::genFuncletEpilog() assert(genFuncletInfo.fiSpDelta2 == 0); } - else + else if (genFuncletInfo.fiFrameType == 3) { - assert(genFuncletInfo.fiFrameType == 3); - getEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, -genFuncletInfo.fiSpDelta1, INS_OPTS_POST_INDEX); compiler->unwindSaveRegPairPreindexed(REG_FP, REG_LR, 
genFuncletInfo.fiSpDelta1); } + else if (genFuncletInfo.fiFrameType == 4) + { + // fiFrameType==4 constraints: + assert(genFuncletInfo.fiSpDelta1 < 0); + assert(genFuncletInfo.fiSpDelta1 >= -512); + + // generate add SP,SP,imm + genStackPointerAdjustment(-genFuncletInfo.fiSpDelta1, REG_NA, nullptr); + + assert(genFuncletInfo.fiSpDelta2 == 0); + } + else + { + assert(genFuncletInfo.fiFrameType == 5); + // Same work as fiFrameType==4, but different asserts. + + assert(genFuncletInfo.fiSpDelta1 < 0); + assert(genFuncletInfo.fiSpDelta1 >= -240); + + // generate add SP,SP,imm + genStackPointerAdjustment(-genFuncletInfo.fiSpDelta1, REG_NA, nullptr); + } inst_RV(INS_ret, REG_LR, TYP_I_IMPL); compiler->unwindReturn(REG_LR); @@ -1176,8 +1301,9 @@ void CodeGen::genCaptureFuncletPrologEpilogInfo() return; assert(isFramePointerUsed()); - assert(compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT); // The frame size and offsets must be - // finalized + + // The frame size and offsets must be finalized + assert(compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT); genFuncletInfo.fiFunction_CallerSP_to_FP_delta = genCallerSPtoFPdelta(); @@ -1207,27 +1333,44 @@ void CodeGen::genCaptureFuncletPrologEpilogInfo() int SP_to_PSP_slot_delta; int CallerSP_to_PSP_slot_delta; + unsigned funcletFrameSize = saveRegsPlusPSPSize + compiler->lvaOutgoingArgSpaceSize; + unsigned funcletFrameSizeAligned = roundUp(funcletFrameSize, STACK_ALIGN); + assert(funcletFrameSizeAligned <= maxFuncletFrameSizeAligned); + + unsigned funcletFrameAlignmentPad = funcletFrameSizeAligned - funcletFrameSize; + assert((funcletFrameAlignmentPad == 0) || (funcletFrameAlignmentPad == REGSIZE_BYTES)); + if (maxFuncletFrameSizeAligned <= 512) { - unsigned funcletFrameSize = saveRegsPlusPSPSize + compiler->lvaOutgoingArgSpaceSize; - unsigned funcletFrameSizeAligned = roundUp(funcletFrameSize, STACK_ALIGN); - assert(funcletFrameSizeAligned <= maxFuncletFrameSizeAligned); - - unsigned 
funcletFrameAlignmentPad = funcletFrameSizeAligned - funcletFrameSize; - assert((funcletFrameAlignmentPad == 0) || (funcletFrameAlignmentPad == REGSIZE_BYTES)); + if (genSaveFpLrWithAllCalleeSavedRegisters) + { + SP_to_FPLR_save_delta = funcletFrameSizeAligned - (2 /* FP, LR */ * REGSIZE_BYTES); + if (compiler->info.compIsVarArgs) + { + SP_to_FPLR_save_delta -= MAX_REG_ARG * REGSIZE_BYTES; + } - SP_to_FPLR_save_delta = compiler->lvaOutgoingArgSpaceSize; - SP_to_PSP_slot_delta = SP_to_FPLR_save_delta + 2 /* FP, LR */ * REGSIZE_BYTES + funcletFrameAlignmentPad; - CallerSP_to_PSP_slot_delta = -(int)(saveRegsPlusPSPSize - 2 /* FP, LR */ * REGSIZE_BYTES); + SP_to_PSP_slot_delta = compiler->lvaOutgoingArgSpaceSize + funcletFrameAlignmentPad; + CallerSP_to_PSP_slot_delta = -(int)saveRegsPlusPSPSize; - if (compiler->lvaOutgoingArgSpaceSize == 0) - { - genFuncletInfo.fiFrameType = 1; + genFuncletInfo.fiFrameType = 4; } else { - genFuncletInfo.fiFrameType = 2; + SP_to_FPLR_save_delta = compiler->lvaOutgoingArgSpaceSize; + SP_to_PSP_slot_delta = SP_to_FPLR_save_delta + 2 /* FP, LR */ * REGSIZE_BYTES + funcletFrameAlignmentPad; + CallerSP_to_PSP_slot_delta = -(int)(saveRegsPlusPSPSize - 2 /* FP, LR */ * REGSIZE_BYTES); + + if (compiler->lvaOutgoingArgSpaceSize == 0) + { + genFuncletInfo.fiFrameType = 1; + } + else + { + genFuncletInfo.fiFrameType = 2; + } } + genFuncletInfo.fiSpDelta1 = -(int)funcletFrameSizeAligned; genFuncletInfo.fiSpDelta2 = 0; @@ -1238,14 +1381,32 @@ void CodeGen::genCaptureFuncletPrologEpilogInfo() unsigned saveRegsPlusPSPAlignmentPad = saveRegsPlusPSPSizeAligned - saveRegsPlusPSPSize; assert((saveRegsPlusPSPAlignmentPad == 0) || (saveRegsPlusPSPAlignmentPad == REGSIZE_BYTES)); - SP_to_FPLR_save_delta = outgoingArgSpaceAligned; - SP_to_PSP_slot_delta = SP_to_FPLR_save_delta + 2 /* FP, LR */ * REGSIZE_BYTES + saveRegsPlusPSPAlignmentPad; - CallerSP_to_PSP_slot_delta = - -(int)(saveRegsPlusPSPSizeAligned - 2 /* FP, LR */ * REGSIZE_BYTES - 
saveRegsPlusPSPAlignmentPad); + if (genSaveFpLrWithAllCalleeSavedRegisters) + { + SP_to_FPLR_save_delta = funcletFrameSizeAligned - (2 /* FP, LR */ * REGSIZE_BYTES); + if (compiler->info.compIsVarArgs) + { + SP_to_FPLR_save_delta -= MAX_REG_ARG * REGSIZE_BYTES; + } + + SP_to_PSP_slot_delta = + compiler->lvaOutgoingArgSpaceSize + funcletFrameAlignmentPad + saveRegsPlusPSPAlignmentPad; + CallerSP_to_PSP_slot_delta = -(int)saveRegsPlusPSPSize; + + genFuncletInfo.fiFrameType = 5; + } + else + { + SP_to_FPLR_save_delta = outgoingArgSpaceAligned; + SP_to_PSP_slot_delta = SP_to_FPLR_save_delta + 2 /* FP, LR */ * REGSIZE_BYTES + saveRegsPlusPSPAlignmentPad; + CallerSP_to_PSP_slot_delta = + -(int)(saveRegsPlusPSPSizeAligned - 2 /* FP, LR */ * REGSIZE_BYTES - saveRegsPlusPSPAlignmentPad); + + genFuncletInfo.fiFrameType = 3; + } - genFuncletInfo.fiFrameType = 3; - genFuncletInfo.fiSpDelta1 = -(int)saveRegsPlusPSPSizeAligned; - genFuncletInfo.fiSpDelta2 = -(int)outgoingArgSpaceAligned; + genFuncletInfo.fiSpDelta1 = -(int)saveRegsPlusPSPSizeAligned; + genFuncletInfo.fiSpDelta2 = -(int)outgoingArgSpaceAligned; assert(genFuncletInfo.fiSpDelta1 + genFuncletInfo.fiSpDelta2 == -(int)maxFuncletFrameSizeAligned); } @@ -3506,12 +3667,27 @@ void CodeGen::genCodeForJumpCompare(GenTreeOp* tree) } } +//--------------------------------------------------------------------- +// genSPtoFPdelta - return offset from the stack pointer (Initial-SP) to the frame pointer. The frame pointer +// will point to the saved frame pointer slot (i.e., there will be frame pointer chaining). +// int CodeGenInterface::genSPtoFPdelta() { - int delta; + assert(isFramePointerUsed()); + int delta = -1; // initialization to illegal value - // We place the saved frame pointer immediately above the outgoing argument space. 
- delta = (int)compiler->lvaOutgoingArgSpaceSize; + if (IsSaveFpLrWithAllCalleeSavedRegisters()) + { + // The saved frame pointer is at the top of the frame, just beneath the saved varargs register space and the + // saved LR. + delta = genTotalFrameSize() - (compiler->info.compIsVarArgs ? MAX_REG_ARG * REGSIZE_BYTES : 0) - + 2 /* FP, LR */ * REGSIZE_BYTES; + } + else + { + // We place the saved frame pointer immediately above the outgoing argument space. + delta = (int)compiler->lvaOutgoingArgSpaceSize; + } assert(delta >= 0); return delta; @@ -3574,6 +3750,25 @@ int CodeGenInterface::genCallerSPtoInitialSPdelta() return callerSPtoSPdelta; } +//--------------------------------------------------------------------- +// SetSaveFpLrWithAllCalleeSavedRegisters - Set the variable that indicates if FP/LR registers +// are stored with the rest of the callee-saved registers. +// +void CodeGen::SetSaveFpLrWithAllCalleeSavedRegisters(bool value) +{ + JITDUMP("Setting genSaveFpLrWithAllCalleeSavedRegisters to %s\n", dspBool(value)); + genSaveFpLrWithAllCalleeSavedRegisters = value; +} + +//--------------------------------------------------------------------- +// IsSaveFpLrWithAllCalleeSavedRegisters - Return the value that indicates where FP/LR registers +// are stored in the prolog. +// +bool CodeGen::IsSaveFpLrWithAllCalleeSavedRegisters() +{ + return genSaveFpLrWithAllCalleeSavedRegisters; +} + /***************************************************************************** * Emit a call to a helper function. 
* diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp index 214fd35c8d..1ff2871ee7 100644 --- a/src/jit/codegencommon.cpp +++ b/src/jit/codegencommon.cpp @@ -165,6 +165,10 @@ CodeGen::CodeGen(Compiler* theCompiler) : CodeGenInterface(theCompiler) genInterruptibleUsed = false; genCurDispOffset = (unsigned)-1; #endif + +#ifdef _TARGET_ARM64_ + genSaveFpLrWithAllCalleeSavedRegisters = false; +#endif // _TARGET_ARM64_ } void CodeGenInterface::genMarkTreeInReg(GenTree* tree, regNumber reg) @@ -4827,11 +4831,12 @@ void CodeGen::genPushCalleeSavedRegisters() } #elif defined(_TARGET_ARM64_) // See the document "ARM64 JIT Frame Layout" and/or "ARM64 Exception Data" for more details or requirements and - // options. Case numbers in comments here refer to this document. + // options. Case numbers in comments here refer to this document. See also Compiler::lvaAssignFrameOffsets() + // for pictures of the general frame layouts, and CodeGen::genFuncletProlog() implementations (per architecture) + // for pictures of the funclet frame layouts. // // For most frames, generate, e.g.: - // stp fp, lr, [sp,-0x80]! // predecrement SP with full frame size, and store FP/LR pair. Store pair - // // ensures stack stays aligned. + // stp fp, lr, [sp,-0x80]! // predecrement SP with full frame size, and store FP/LR pair. // stp r19, r20, [sp, 0x60] // store at positive offset from SP established above, into callee-saved area // // at top of frame (highest addresses). // stp r21, r22, [sp, 0x70] @@ -4843,8 +4848,67 @@ void CodeGen::genPushCalleeSavedRegisters() // 3. General-purpose registers are 8 bytes, floating-point registers are 16 bytes, but FP/SIMD registers only // preserve their lower 8 bytes, by calling convention. // 4. For frames with varargs, we spill the integer register arguments to the stack, so all the arguments are - // consecutive. + // consecutive, and at the top of the frame. // 5. 
We allocate the frame here; no further changes to SP are allowed (except in the body, for localloc). + // + // For functions with GS and localloc, we change the frame so the frame pointer and LR are saved at the top + // of the frame, just under the varargs registers (if any). Note that the funclet frames must follow the same + // rule, and both main frame and funclet frames (if any) must put PSPSym in the same offset from Caller-SP. + // Since this frame type is relatively rare, we force using it via stress modes, for additional coverage. + // + // The frames look like the following (simplified to only include components that matter for establishing the + // frames). See also Compiler::lvaAssignFrameOffsets(). + // + // Frames with FP, LR saved at bottom of frame (above outgoing argument space): + // + // | | + // |-----------------------| + // | incoming arguments | + // +=======================+ <---- Caller's SP + // | Varargs regs space | // Only for varargs functions; 64 bytes + // |-----------------------| + // |Callee saved registers | // not including FP/LR; multiple of 8 bytes + // |-----------------------| + // | PSP slot | // 8 bytes (omitted in CoreRT ABI) + // |-----------------------| + // | locals, temps, etc. 
| + // |-----------------------| + // | Saved LR | // 8 bytes + // |-----------------------| + // | Saved FP | // 8 bytes + // |-----------------------| + // | Outgoing arg space | // multiple of 8 bytes; if required (i.e., #outsz != 0) + // |-----------------------| <---- Ambient SP + // | | | + // ~ | Stack grows ~ + // | | downward | + // V + // + // Frames with FP, LR saved at top of frame (below saved varargs incoming arguments): + // + // | | + // |-----------------------| + // | incoming arguments | + // +=======================+ <---- Caller's SP + // | Varargs regs space | // Only for varargs functions; 64 bytes + // |-----------------------| + // | Saved LR | // 8 bytes + // |-----------------------| + // | Saved FP | // 8 bytes + // |-----------------------| + // |Callee saved registers | // not including FP/LR; multiple of 8 bytes + // |-----------------------| + // | PSP slot | // 8 bytes (omitted in CoreRT ABI) + // |-----------------------| + // | locals, temps, etc. | + // |-----------------------| + // | Outgoing arg space | // multiple of 8 bytes; if required (i.e., #outsz != 0) + // |-----------------------| <---- Ambient SP + // | | | + // ~ | Stack grows ~ + // | | downward | + // V + // int totalFrameSize = genTotalFrameSize(); @@ -4853,10 +4917,25 @@ void CodeGen::genPushCalleeSavedRegisters() regMaskTP maskSaveRegsFloat = rsPushRegs & RBM_ALLFLOAT; regMaskTP maskSaveRegsInt = rsPushRegs & ~maskSaveRegsFloat; - int frameType = 0; // This number is arbitrary, is defined below, and corresponds to one of the frame styles we - // generate based on various sizes. 
- int calleeSaveSPDelta = 0; - int calleeSaveSPDeltaUnaligned = 0; +#ifdef DEBUG + if (verbose) + { + printf("Save float regs: "); + dspRegMask(maskSaveRegsFloat); + printf("\n"); + printf("Save int regs: "); + dspRegMask(maskSaveRegsInt); + printf("\n"); + } +#endif // DEBUG + + // The frameType number is arbitrary, is defined below, and corresponds to one of the frame styles we + // generate based on various sizes. + int frameType = 0; + + // The amount to subtract from SP before starting to store the callee-saved registers. It might be folded into the + // first save instruction as a "predecrement" amount, if possible. + int calleeSaveSPDelta = 0; if (isFramePointerUsed()) { @@ -4865,7 +4944,27 @@ void CodeGen::genPushCalleeSavedRegisters() assert((maskSaveRegsInt & RBM_FP) != 0); assert((maskSaveRegsInt & RBM_LR) != 0); - if ((compiler->lvaOutgoingArgSpaceSize == 0) && (totalFrameSize < 512)) + // If we need to generate a GS cookie, we need to make sure the saved frame pointer and return address + // (FP and LR) are protected from buffer overrun by the GS cookie. If FP/LR are at the lowest addresses, + // then they are safe, since they are lower than any unsafe buffers. And the GS cookie we add will + // protect our caller's frame. If we have a localloc, however, that is dynamically placed lower than our + // saved FP/LR. In that case, we save FP/LR along with the rest of the callee-saved registers, above + // the GS cookie. + // + // After the frame is allocated, the frame pointer is established, pointing at the saved frame pointer to + // create a frame pointer chain. + // + // Do we need another frame pointer register to get good code quality in the case of having the frame pointer + // point high in the frame, so we can take advantage of arm64's preference for positive offsets? C++ native + // code dedicates callee-saved x19 to this, so generates: + // mov x19, sp + // in the prolog, then uses x19 for local var accesses. 
Given that this case is so rare, we currently do + // not do this. That means that negative offsets from FP might need to use the reserved register to form + // the local variable offset for an addressing mode. + + // TODO-ARM64-Bug?: should this be "totalFrameSize <= 512"? + if (((compiler->lvaOutgoingArgSpaceSize == 0) && (totalFrameSize < 512)) && + !genSaveFpLrWithAllCalleeSavedRegisters) { // Case #1. // @@ -4873,12 +4972,15 @@ void CodeGen::genPushCalleeSavedRegisters() // stp fp,lr,[sp,#-framesz]! // // The (totalFrameSize < 512) condition ensures that both the predecrement - // and the postincrement of SP can occur with STP. + // and the postincrement of SP can occur with STP. // // After saving callee-saved registers, we establish the frame pointer with: // mov fp,sp // We do this *after* saving callee-saved registers, so the prolog/epilog unwind codes mostly match. + JITDUMP("Frame type 1. #outsz=0; #framesz=%d; LclFrameSize=%d\n", totalFrameSize, + compiler->compLclFrameSize); + frameType = 1; getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, -totalFrameSize, @@ -4892,55 +4994,77 @@ void CodeGen::genPushCalleeSavedRegisters() { // Case #2. // - // Generate: - // sub sp,sp,#framesz - // stp fp,lr,[sp,#outsz] // note that by necessity, #outsz <= #framesz - 16, so #outsz <= 496. - // - // The (totalFrameSize <= 512) condition ensures the callee-saved registers can all be saved using STP with - // signed offset encoding. + // The (totalFrameSize <= 512) condition ensures the callee-saved registers can all be saved using STP + // with signed offset encoding. // // After saving callee-saved registers, we establish the frame pointer with: // add fp,sp,#outsz // We do this *after* saving callee-saved registers, so the prolog/epilog unwind codes mostly match. - frameType = 2; + if (genSaveFpLrWithAllCalleeSavedRegisters) + { + JITDUMP("Frame type 4 (save FP/LR at top). 
#outsz=%d; #framesz=%d; LclFrameSize=%d\n", + unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); - assert(compiler->lvaOutgoingArgSpaceSize + 2 * REGSIZE_BYTES <= (unsigned)totalFrameSize); + frameType = 4; - getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, totalFrameSize); - compiler->unwindAllocStack(totalFrameSize); + // The frame will be allocated below, when the callee-saved registers are saved. This might mean a + // separate SUB instruction or the SP adjustment might be folded in to the first STP if there is + // no outgoing argument space AND no local frame space, that is, if the only thing the frame does + // is save callee-saved registers (and possibly varargs argument registers). + calleeSaveSPDelta = totalFrameSize; - getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, - compiler->lvaOutgoingArgSpaceSize); - compiler->unwindSaveRegPair(REG_FP, REG_LR, compiler->lvaOutgoingArgSpaceSize); + offset = (int)compiler->compLclFrameSize; + } + else + { + JITDUMP("Frame type 2 (save FP/LR at bottom). #outsz=%d; #framesz=%d; LclFrameSize=%d\n", + unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); - maskSaveRegsInt &= ~(RBM_FP | RBM_LR); // We've already saved FP/LR - offset = (int)compiler->compLclFrameSize + 2 * REGSIZE_BYTES; // 2 for FP/LR + frameType = 2; + + // Generate: + // sub sp,sp,#framesz + // stp fp,lr,[sp,#outsz] // note that by necessity, #outsz <= #framesz - 16, so #outsz <= 496. 
+ + getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, totalFrameSize); + compiler->unwindAllocStack(totalFrameSize); + + assert(compiler->lvaOutgoingArgSpaceSize + 2 * REGSIZE_BYTES <= (unsigned)totalFrameSize); + + getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, + compiler->lvaOutgoingArgSpaceSize); + compiler->unwindSaveRegPair(REG_FP, REG_LR, compiler->lvaOutgoingArgSpaceSize); + + maskSaveRegsInt &= ~(RBM_FP | RBM_LR); // We've already saved FP/LR + offset = (int)compiler->compLclFrameSize + 2 * REGSIZE_BYTES; // 2 for FP/LR + } } else { // Case 5 or 6. // - // First, the callee-saved registers will be saved, and the callee-saved register code must use pre-index - // to subtract from SP as the first instruction. It must also leave space for varargs registers to be - // stored. For example: + // First, the callee-saved registers will be saved, and the callee-saved register code must use + // pre-index to subtract from SP as the first instruction. It must also leave space for varargs + // registers to be stored. For example: // stp r19,r20,[sp,#-96]! // stp d8,d9,[sp,#16] // ... save varargs incoming integer registers ... // Note that all SP alterations must be 16-byte aligned. We have already calculated any alignment to be - // lower on the stack than the callee-saved registers (see lvaAlignFrame() for how we calculate alignment). - // So, if there is an odd number of callee-saved registers, we use (for example, with just one saved - // register): + // lower on the stack than the callee-saved registers (see lvaAlignFrame() for how we calculate + // alignment). So, if there is an odd number of callee-saved registers, we use (for example, with just + // one saved register): // sub sp,sp,#16 // str r19,[sp,#8] // This is one additional instruction, but it centralizes the aligned space. 
Otherwise, it might be // possible to have two 8-byte alignment padding words, one below the callee-saved registers, and one // above them. If that is preferable, we could implement it. - // Note that any varargs saved space will always be 16-byte aligned, since there are 8 argument registers. + // + // Note that any varargs saved space will always be 16-byte aligned, since there are 8 argument + // registers. // // Then, define #remainingFrameSz = #framesz - (callee-saved size + varargs space + possible alignment - // padding from above). - // Note that #remainingFrameSz must not be zero, since we still need to save FP,SP. + // padding from above). Note that #remainingFrameSz must not be zero, since we still need to save FP,SP. // // Generate: // sub sp,sp,#remainingFrameSz @@ -4952,10 +5076,10 @@ void CodeGen::genPushCalleeSavedRegisters() // stp fp,lr,[sp,#outsz] // add fp,sp,#outsz // - // However, we need to handle the case where #outsz is larger than the constant signed offset encoding can - // handle. And, once again, we might need to deal with #outsz that is not aligned to 16-bytes (i.e., - // STACK_ALIGN). So, in the case of large #outsz we will have an additional SP adjustment, using one of the - // following sequences: + // However, we need to handle the case where #outsz is larger than the constant signed offset encoding + // can handle. And, once again, we might need to deal with #outsz that is not aligned to 16-bytes (i.e., + // STACK_ALIGN). So, in the case of large #outsz we will have an additional SP adjustment, using one of + // the following sequences: // // Define #remainingFrameSz2 = #remainingFrameSz - #outsz. // @@ -4966,9 +5090,9 @@ void CodeGen::genPushCalleeSavedRegisters() // // Or: // - // sub sp,sp,roundUp(#remainingFrameSz2,16) // if #remainingFrameSz2 is not 16-byte aligned (it is - // // always guaranteed to be 8 byte aligned). 
- // stp fp,lr,[sp,#8] // it will always be #8 in the unaligned case + // sub sp,sp,roundUp(#remainingFrameSz2,16) // if #remainingFrameSz2 is not 16-byte aligned (it is + // // always guaranteed to be 8 byte aligned). + // stp fp,lr,[sp,#8] // it will always be #8 in the unaligned case // add fp,sp,#8 // sub sp,sp,#outsz - #8 // @@ -4976,21 +5100,47 @@ void CodeGen::genPushCalleeSavedRegisters() // mov rX, #outsz - #8 // maybe multiple instructions // sub sp,sp,rX // ) + // + // Note that even if we align the SP alterations, that does not imply that we are creating empty alignment + // slots. In fact, we are not; any empty alignment slots were calculated in + // Compiler::lvaAssignFrameOffsets() and its callees. - frameType = 3; + int calleeSaveSPDeltaUnaligned = totalFrameSize - compiler->compLclFrameSize; + if (genSaveFpLrWithAllCalleeSavedRegisters) + { + JITDUMP("Frame type 5 (save FP/LR at top). #outsz=%d; #framesz=%d; LclFrameSize=%d\n", + unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); + + // This case is much simpler, because we allocate space for the callee-saved register area, including + // FP/LR. Note the SP adjustment might be SUB or be folded into the first store as a predecrement. + // Then, we use a single SUB to establish the rest of the frame. We need to be careful about where + // to establish the frame pointer, as there is a limit of 2040 bytes offset from SP to FP in the + // unwind codes when FP is established. + frameType = 5; + } + else + { + JITDUMP("Frame type 3 (save FP/LR at bottom). #outsz=%d; #framesz=%d; LclFrameSize=%d\n", + unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); + + frameType = 3; + + calleeSaveSPDeltaUnaligned -= 2 * REGSIZE_BYTES; // 2 for FP, LR which we'll save later. + + // We'll take care of these later, but callee-saved regs code shouldn't see them. 
+ maskSaveRegsInt &= ~(RBM_FP | RBM_LR); + } - calleeSaveSPDeltaUnaligned = - totalFrameSize - compiler->compLclFrameSize - 2 * REGSIZE_BYTES; // 2 for FP, LR which we'll save later. assert(calleeSaveSPDeltaUnaligned >= 0); assert((calleeSaveSPDeltaUnaligned % 8) == 0); // It better at least be 8 byte aligned. calleeSaveSPDelta = AlignUp((UINT)calleeSaveSPDeltaUnaligned, STACK_ALIGN); offset = calleeSaveSPDelta - calleeSaveSPDeltaUnaligned; - assert((offset == 0) || (offset == REGSIZE_BYTES)); // At most one alignment slot between SP and where we - // store the callee-saved registers. - // We'll take care of these later, but callee-saved regs code shouldn't see them. - maskSaveRegsInt &= ~(RBM_FP | RBM_LR); + JITDUMP(" calleeSaveSPDelta=%d, offset=%d\n", calleeSaveSPDelta, offset); + + // At most one alignment slot between SP and where we store the callee-saved registers. + assert((offset == 0) || (offset == REGSIZE_BYTES)); } } else @@ -4999,8 +5149,8 @@ void CodeGen::genPushCalleeSavedRegisters() assert((maskSaveRegsInt & RBM_FP) == 0); assert((maskSaveRegsInt & RBM_LR) != 0); - // Note that there is no pre-indexed save_lrpair unwind code variant, so we can't allocate the frame using 'stp' - // if we only have one callee-saved register plus LR to save. + // Note that there is no pre-indexed save_lrpair unwind code variant, so we can't allocate the frame using + // 'stp' if we only have one callee-saved register plus LR to save. 
NYI("Frame without frame pointer"); offset = 0; @@ -5008,6 +5158,7 @@ void CodeGen::genPushCalleeSavedRegisters() assert(frameType != 0); + JITDUMP(" offset=%d, calleeSaveSPDelta=%d\n", offset, calleeSaveSPDelta); genSaveCalleeSavedRegistersHelp(maskSaveRegsInt | maskSaveRegsFloat, offset, -calleeSaveSPDelta); offset += genCountBits(maskSaveRegsInt | maskSaveRegsFloat) * REGSIZE_BYTES; @@ -5018,6 +5169,8 @@ void CodeGen::genPushCalleeSavedRegisters() if (compiler->info.compIsVarArgs) { + JITDUMP(" compIsVarArgs=true\n"); + // There are 8 general-purpose registers to home, thus 'offset' must be 16-byte aligned here. assert((offset % 16) == 0); for (regNumber reg1 = REG_ARG_FIRST; reg1 < REG_ARG_LAST; reg1 = REG_NEXT(REG_NEXT(reg1))) @@ -5030,18 +5183,27 @@ void CodeGen::genPushCalleeSavedRegisters() } } + // By default, we'll establish the frame pointer chain. (Note that currently frames without FP are NYI.) + bool establishFramePointer = true; + + // If we do establish the frame pointer, what is the amount we add to SP to do so? 
+ unsigned offsetSpToSavedFp = 0; + if (frameType == 1) { - getEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_FPBASE, REG_SPBASE); - compiler->unwindSetFrameReg(REG_FPBASE, 0); + assert(!genSaveFpLrWithAllCalleeSavedRegisters); + assert(offsetSpToSavedFp == 0); } else if (frameType == 2) { - getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, REG_FPBASE, REG_SPBASE, compiler->lvaOutgoingArgSpaceSize); - compiler->unwindSetFrameReg(REG_FPBASE, compiler->lvaOutgoingArgSpaceSize); + assert(!genSaveFpLrWithAllCalleeSavedRegisters); + + offsetSpToSavedFp = compiler->lvaOutgoingArgSpaceSize; } else if (frameType == 3) { + assert(!genSaveFpLrWithAllCalleeSavedRegisters); + int remainingFrameSz = totalFrameSize - calleeSaveSPDelta; assert(remainingFrameSz > 0); assert((remainingFrameSz % 16) == 0); // this is guaranteed to be 16-byte aligned because each component -- @@ -5057,19 +5219,28 @@ void CodeGen::genPushCalleeSavedRegisters() int alignmentAdjustment2 = spAdjustment2 - spAdjustment2Unaligned; assert((alignmentAdjustment2 == 0) || (alignmentAdjustment2 == 8)); + JITDUMP(" spAdjustment2=%d\n", spAdjustment2); + genPrologSaveRegPair(REG_FP, REG_LR, alignmentAdjustment2, -spAdjustment2, false, initReg, pInitRegZeroed); offset += spAdjustment2; - // Now subtract off the #outsz (or the rest of the #outsz if it was unaligned, and the above "sub" included - // some of it) + // Now subtract off the #outsz (or the rest of the #outsz if it was unaligned, and the above "sub" + // included some of it) int spAdjustment3 = compiler->lvaOutgoingArgSpaceSize - alignmentAdjustment2; assert(spAdjustment3 > 0); assert((spAdjustment3 % 16) == 0); - getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, REG_FPBASE, REG_SPBASE, alignmentAdjustment2); - compiler->unwindSetFrameReg(REG_FPBASE, alignmentAdjustment2); + JITDUMP(" alignmentAdjustment2=%d\n", alignmentAdjustment2); + genEstablishFramePointer(alignmentAdjustment2, /* reportUnwindData */ true); + + // We just established the frame 
pointer chain; don't do it again. + establishFramePointer = false; + JITDUMP(" spAdjustment3=%d\n", spAdjustment3); + + // TODO-ARM64-CQ: we're reporting this SUB SP in the unwind info. Do we need to, since we've already + // established the frame pointer? genStackPointerAdjustment(-spAdjustment3, initReg, pInitRegZeroed); offset += spAdjustment3; } @@ -5079,10 +5250,49 @@ void CodeGen::genPushCalleeSavedRegisters() pInitRegZeroed); offset += remainingFrameSz; - getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, REG_FPBASE, REG_SPBASE, compiler->lvaOutgoingArgSpaceSize); - compiler->unwindSetFrameReg(REG_FPBASE, compiler->lvaOutgoingArgSpaceSize); + offsetSpToSavedFp = compiler->lvaOutgoingArgSpaceSize; } } + else if (frameType == 4) + { + assert(genSaveFpLrWithAllCalleeSavedRegisters); + offsetSpToSavedFp = calleeSaveSPDelta - (compiler->info.compIsVarArgs ? MAX_REG_ARG * REGSIZE_BYTES : 0) - + 2 * REGSIZE_BYTES; // -2 for FP, LR + } + else if (frameType == 5) + { + assert(genSaveFpLrWithAllCalleeSavedRegisters); + + offsetSpToSavedFp = calleeSaveSPDelta - (compiler->info.compIsVarArgs ? MAX_REG_ARG * REGSIZE_BYTES : 0) - + 2 * REGSIZE_BYTES; // -2 for FP, LR + JITDUMP(" offsetSpToSavedFp=%d\n", offsetSpToSavedFp); + genEstablishFramePointer(offsetSpToSavedFp, /* reportUnwindData */ true); + + // We just established the frame pointer chain; don't do it again. + establishFramePointer = false; + + int remainingFrameSz = totalFrameSize - calleeSaveSPDelta; + assert(remainingFrameSz > 0); + assert((remainingFrameSz % 16) == 0); // this is guaranteed to be 16-byte aligned because each component -- + // totalFrameSize and calleeSaveSPDelta -- is 16-byte aligned. + + JITDUMP(" remainingFrameSz=%d\n", remainingFrameSz); + + // TODO-ARM64-CQ: we're reporting this SUB SP in the unwind info. Do we need to, since we've already + // established the frame pointer? 
+ genStackPointerAdjustment(-remainingFrameSz, initReg, pInitRegZeroed); + offset += remainingFrameSz; + } + else + { + unreached(); + } + + if (establishFramePointer) + { + JITDUMP(" offsetSpToSavedFp=%d\n", offsetSpToSavedFp); + genEstablishFramePointer(offsetSpToSavedFp, /* reportUnwindData */ true); + } assert(offset == totalFrameSize); @@ -5499,16 +5709,20 @@ void CodeGen::genPopCalleeSavedRegistersAndFreeLclFrame(bool jmpEpilog) int totalFrameSize = genTotalFrameSize(); - int calleeSaveSPOffset; // This will be the starting place for restoring the callee-saved registers, in decreasing - // order. - int frameType = 0; // An indicator of what type of frame we are popping. - int calleeSaveSPDelta = 0; - int calleeSaveSPDeltaUnaligned = 0; + int calleeSaveSPOffset = 0; // This will be the starting place for restoring the callee-saved registers, in + // decreasing order. + int frameType = 0; // An indicator of what type of frame we are popping. + int calleeSaveSPDelta = 0; // Amount to add to SP after callee-saved registers have been restored. if (isFramePointerUsed()) { - if ((compiler->lvaOutgoingArgSpaceSize == 0) && (totalFrameSize < 512)) + // TODO-ARM64-Bug?: should this be "totalFrameSize <= 512"? + if ((compiler->lvaOutgoingArgSpaceSize == 0) && (totalFrameSize < 512) && + !genSaveFpLrWithAllCalleeSavedRegisters) { + JITDUMP("Frame type 1. #outsz=0; #framesz=%d; localloc? %s\n", totalFrameSize, + dspBool(compiler->compLocallocUsed)); + frameType = 1; if (compiler->compLocallocUsed) { @@ -5520,38 +5734,64 @@ void CodeGen::genPopCalleeSavedRegistersAndFreeLclFrame(bool jmpEpilog) regsToRestoreMask &= ~(RBM_FP | RBM_LR); // We'll restore FP/LR at the end, and post-index SP. - // Compute callee save SP offset which is at the top of local frame while the FP/LR is saved at the bottom - // of stack. + // Compute callee save SP offset which is at the top of local frame while the FP/LR is saved at the + // bottom of stack. 
calleeSaveSPOffset = compiler->compLclFrameSize + 2 * REGSIZE_BYTES; } else if (totalFrameSize <= 512) { - frameType = 2; if (compiler->compLocallocUsed) { // Restore sp from fp - // sub sp, fp, #outsz - getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, - compiler->lvaOutgoingArgSpaceSize); - compiler->unwindSetFrameReg(REG_FPBASE, compiler->lvaOutgoingArgSpaceSize); + // sub sp, fp, #outsz // Uses #outsz if FP/LR stored at bottom + int SPtoFPdelta = genSPtoFPdelta(); + getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, SPtoFPdelta); + compiler->unwindSetFrameReg(REG_FPBASE, SPtoFPdelta); } - regsToRestoreMask &= ~(RBM_FP | RBM_LR); // We'll restore FP/LR at the end, and post-index SP. + if (genSaveFpLrWithAllCalleeSavedRegisters) + { + JITDUMP("Frame type 4 (save FP/LR at top). #outsz=%d; #framesz=%d; localloc? %s\n", + unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, + dspBool(compiler->compLocallocUsed)); - // Compute callee save SP offset which is at the top of local frame while the FP/LR is saved at the bottom - // of stack. - calleeSaveSPOffset = compiler->compLclFrameSize + 2 * REGSIZE_BYTES; + frameType = 4; + + calleeSaveSPOffset = compiler->compLclFrameSize; + + // Remove the frame after we're done restoring the callee-saved registers. + calleeSaveSPDelta = totalFrameSize; + } + else + { + JITDUMP("Frame type 2 (save FP/LR at bottom). #outsz=%d; #framesz=%d; localloc? %s\n", + unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, + dspBool(compiler->compLocallocUsed)); + + frameType = 2; + + regsToRestoreMask &= ~(RBM_FP | RBM_LR); // We'll restore FP/LR at the end, and post-index SP. + + // Compute callee save SP offset which is at the top of local frame while the FP/LR is saved at the + // bottom of stack. + calleeSaveSPOffset = compiler->compLclFrameSize + 2 * REGSIZE_BYTES; + } } - else + else if (!genSaveFpLrWithAllCalleeSavedRegisters) { + JITDUMP("Frame type 3 (save FP/LR at bottom). 
#outsz=%d; #framesz=%d; localloc? %s\n", + unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, dspBool(compiler->compLocallocUsed)); + frameType = 3; - calleeSaveSPDeltaUnaligned = totalFrameSize - compiler->compLclFrameSize - - 2 * REGSIZE_BYTES; // 2 for FP, LR which we'll restore later. + int calleeSaveSPDeltaUnaligned = totalFrameSize - compiler->compLclFrameSize - + 2 * REGSIZE_BYTES; // 2 for FP, LR which we'll restore later. assert(calleeSaveSPDeltaUnaligned >= 0); assert((calleeSaveSPDeltaUnaligned % 8) == 0); // It better at least be 8 byte aligned. calleeSaveSPDelta = AlignUp((UINT)calleeSaveSPDeltaUnaligned, STACK_ALIGN); + JITDUMP(" calleeSaveSPDelta=%d\n", calleeSaveSPDelta); + regsToRestoreMask &= ~(RBM_FP | RBM_LR); // We'll restore FP/LR at the end, and (hopefully) post-index SP. int remainingFrameSz = totalFrameSize - calleeSaveSPDelta; @@ -5569,8 +5809,8 @@ void CodeGen::genPopCalleeSavedRegistersAndFreeLclFrame(bool jmpEpilog) if (compiler->compLocallocUsed) { - // Restore sp from fp. No need to update sp after this since we've set up fp before adjusting sp in - // prolog. + // Restore sp from fp. No need to update sp after this since we've set up fp before adjusting sp + // in prolog. 
// sub sp, fp, #alignmentAdjustment2 getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, alignmentAdjustment2); compiler->unwindSetFrameReg(REG_FPBASE, alignmentAdjustment2); @@ -5583,23 +5823,29 @@ void CodeGen::genPopCalleeSavedRegistersAndFreeLclFrame(bool jmpEpilog) int spAdjustment3 = compiler->lvaOutgoingArgSpaceSize - alignmentAdjustment2; assert(spAdjustment3 > 0); assert((spAdjustment3 % 16) == 0); + + JITDUMP(" spAdjustment3=%d\n", spAdjustment3); + genStackPointerAdjustment(spAdjustment3, REG_IP0, nullptr); } // Generate: // ldp fp,lr,[sp] // add sp,sp,#remainingFrameSz + + JITDUMP(" alignmentAdjustment2=%d\n", alignmentAdjustment2); genEpilogRestoreRegPair(REG_FP, REG_LR, alignmentAdjustment2, spAdjustment2, false, REG_IP1, nullptr); } else { if (compiler->compLocallocUsed) { - // Restore sp from fp + // Restore sp from fp; here that's #outsz from SP // sub sp, fp, #outsz - getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, - compiler->lvaOutgoingArgSpaceSize); - compiler->unwindSetFrameReg(REG_FPBASE, compiler->lvaOutgoingArgSpaceSize); + int SPtoFPdelta = genSPtoFPdelta(); + assert(SPtoFPdelta == compiler->lvaOutgoingArgSpaceSize); + getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, SPtoFPdelta); + compiler->unwindSetFrameReg(REG_FPBASE, SPtoFPdelta); } // Generate: @@ -5607,6 +5853,8 @@ void CodeGen::genPopCalleeSavedRegistersAndFreeLclFrame(bool jmpEpilog) // add sp,sp,#remainingFrameSz ; might need to load this constant in a scratch register if // ; it's large + JITDUMP(" remainingFrameSz=%d\n", remainingFrameSz); + genEpilogRestoreRegPair(REG_FP, REG_LR, compiler->lvaOutgoingArgSpaceSize, remainingFrameSz, false, REG_IP1, nullptr); } @@ -5617,6 +5865,32 @@ void CodeGen::genPopCalleeSavedRegistersAndFreeLclFrame(bool jmpEpilog) calleeSaveSPOffset = calleeSaveSPDelta - calleeSaveSPDeltaUnaligned; assert((calleeSaveSPOffset == 0) || (calleeSaveSPOffset == REGSIZE_BYTES)); } + else + { 
+ JITDUMP("Frame type 5 (save FP/LR at top). #outsz=%d; #framesz=%d; localloc? %s\n", + unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, dspBool(compiler->compLocallocUsed)); + + frameType = 5; + + int calleeSaveSPDeltaUnaligned = totalFrameSize - compiler->compLclFrameSize; + assert(calleeSaveSPDeltaUnaligned >= 0); + assert((calleeSaveSPDeltaUnaligned % 8) == 0); // It better at least be 8 byte aligned. + calleeSaveSPDelta = AlignUp((UINT)calleeSaveSPDeltaUnaligned, STACK_ALIGN); + + calleeSaveSPOffset = calleeSaveSPDelta - calleeSaveSPDeltaUnaligned; + assert((calleeSaveSPOffset == 0) || (calleeSaveSPOffset == REGSIZE_BYTES)); + + // Restore sp from fp: + // sub sp, fp, #sp-to-fp-delta + // This is the same whether there is localloc or not. Note that we don't need to do anything to remove the + // "remainingFrameSz" to reverse the SUB of that amount in the prolog. The unwind codes won't match. + + int offsetSpToSavedFp = calleeSaveSPDelta - + (compiler->info.compIsVarArgs ? MAX_REG_ARG * REGSIZE_BYTES : 0) - + 2 * REGSIZE_BYTES; // -2 for FP, LR + getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, offsetSpToSavedFp); + compiler->unwindSetFrameReg(REG_FPBASE, offsetSpToSavedFp); + } } else { @@ -5625,6 +5899,7 @@ void CodeGen::genPopCalleeSavedRegistersAndFreeLclFrame(bool jmpEpilog) calleeSaveSPOffset = 0; } + JITDUMP(" calleeSaveSPOffset=%d, calleeSaveSPDelta=%d\n", calleeSaveSPOffset, calleeSaveSPDelta); genRestoreCalleeSavedRegistersHelp(regsToRestoreMask, calleeSaveSPOffset, calleeSaveSPDelta); if (frameType == 1) @@ -5653,6 +5928,14 @@ void CodeGen::genPopCalleeSavedRegistersAndFreeLclFrame(bool jmpEpilog) { // Nothing to do after restoring callee-saved registers. } + else if (frameType == 4) + { + // Nothing to do after restoring callee-saved registers. + } + else if (frameType == 5) + { + // Nothing to do after restoring callee-saved registers. 
+ } else { unreached(); @@ -5858,8 +6141,8 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, regMask = genFindLowestBit(availMask); rZero2 = genRegNumFromMask(regMask); availMask &= ~regMask; - assert((genRegMask(rZero2) & intRegState.rsCalleeRegArgMaskLiveIn) == - 0); // rZero2 is not a live incoming argument reg + assert((genRegMask(rZero2) & intRegState.rsCalleeRegArgMaskLiveIn) == 0); // rZero2 is not a live incoming + // argument reg // We pick the next lowest register number for rAddr noway_assert(availMask != RBM_NONE); @@ -5918,8 +6201,8 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, if (useLoop) { noway_assert(uCntSlots >= 2); - assert((genRegMask(rCnt) & intRegState.rsCalleeRegArgMaskLiveIn) == - 0); // rCnt is not a live incoming argument reg + assert((genRegMask(rCnt) & intRegState.rsCalleeRegArgMaskLiveIn) == 0); // rCnt is not a live incoming + // argument reg instGen_Set_Reg_To_Imm(EA_PTRSIZE, rCnt, (ssize_t)uCntSlots / 2); } @@ -6053,8 +6336,8 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, } else if (genInitStkLclCnt > 0) { - assert((genRegMask(initReg) & intRegState.rsCalleeRegArgMaskLiveIn) == - 0); // initReg is not a live incoming argument reg + assert((genRegMask(initReg) & intRegState.rsCalleeRegArgMaskLiveIn) == 0); // initReg is not a live incoming + // argument reg /* Initialize any lvMustInit vars on the stack */ @@ -7349,6 +7632,22 @@ void CodeGen::genEstablishFramePointer(int delta, bool reportUnwindData) compiler->unwindPadding(); } +#elif defined(_TARGET_ARM64_) + + if (delta == 0) + { + getEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_FPBASE, REG_SPBASE); + } + else + { + getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, REG_FPBASE, REG_SPBASE, delta); + } + + if (reportUnwindData) + { + compiler->unwindSetFrameReg(REG_FPBASE, delta); + } + #else NYI("establish frame pointer"); #endif @@ -9034,8 +9333,8 @@ void 
CodeGen::genCaptureFuncletPrologEpilogInfo() if (compiler->ehAnyFunclets()) { assert(isFramePointerUsed()); - assert(compiler->lvaDoneFrameLayout == - Compiler::FINAL_FRAME_LAYOUT); // The frame size and offsets must be finalized + assert(compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT); // The frame size and offsets must be + // finalized // Frame pointer doesn't point at the end, it points at the pushed r11. So, instead // of adding the number of callee-saved regs to CallerSP, we add 1 for lr and 1 for r11 @@ -9080,10 +9379,11 @@ void CodeGen::genCaptureFuncletPrologEpilogInfo() printf(" PSP slot SP offset: %d\n", genFuncletInfo.fiPSP_slot_SP_offset); printf(" PSP slot Caller SP offset: %d\n", genFuncletInfo.fiPSP_slot_CallerSP_offset); - if (PSP_slot_CallerSP_offset != - compiler->lvaGetCallerSPRelativeOffset(compiler->lvaPSPSym)) // for debugging + if (PSP_slot_CallerSP_offset != compiler->lvaGetCallerSPRelativeOffset(compiler->lvaPSPSym)) + { printf("lvaGetCallerSPRelativeOffset(lvaPSPSym): %d\n", compiler->lvaGetCallerSPRelativeOffset(compiler->lvaPSPSym)); + } } #endif // DEBUG @@ -9299,8 +9599,8 @@ void CodeGen::genCaptureFuncletPrologEpilogInfo() // because we're not going to allocate the same size frame as the parent. 
assert(isFramePointerUsed()); - assert(compiler->lvaDoneFrameLayout == - Compiler::FINAL_FRAME_LAYOUT); // The frame size and offsets must be finalized + assert(compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT); // The frame size and offsets must be + // finalized assert(compiler->compCalleeFPRegsSavedMask != (regMaskTP)-1); // The float registers to be preserved is finalized // Even though lvaToInitialSPRelativeOffset() depends on compLclFrameSize, diff --git a/src/jit/codegeninterface.h b/src/jit/codegeninterface.h index 34d2a4fce9..c5d9ec6a69 100644 --- a/src/jit/codegeninterface.h +++ b/src/jit/codegeninterface.h @@ -176,6 +176,11 @@ public: int genSPtoFPdelta(); int genTotalFrameSize(); +#ifdef _TARGET_ARM64_ + virtual void SetSaveFpLrWithAllCalleeSavedRegisters(bool value) = 0; + virtual bool IsSaveFpLrWithAllCalleeSavedRegisters() = 0; +#endif // _TARGET_ARM64_ + regNumber genGetThisArgReg(GenTreeCall* call) const; #ifdef _TARGET_XARCH_ diff --git a/src/jit/compiler.cpp b/src/jit/compiler.cpp index bb2ce7dedd..6123fb126a 100644 --- a/src/jit/compiler.cpp +++ b/src/jit/compiler.cpp @@ -49,6 +49,10 @@ AssemblyNamesList2* Compiler::s_pAltJitExcludeAssembliesList = nullpt // static bool Compiler::s_pJitDisasmIncludeAssembliesListInitialized = false; AssemblyNamesList2* Compiler::s_pJitDisasmIncludeAssembliesList = nullptr; + +// static +bool Compiler::s_pJitFunctionFileInitialized = false; +MethodSet* Compiler::s_pJitMethodSet = nullptr; #endif // DEBUG /***************************************************************************** @@ -3207,6 +3211,11 @@ void Compiler::compInitOptions(JitFlags* jitFlags) opts.compJitELTHookEnabled = false; #endif // PROFILING_SUPPORTED +#if defined(_TARGET_ARM64_) + // 0 is default: use the appropriate frame type based on the function. 
+ opts.compJitSaveFpLrWithCalleeSavedRegisters = 0; +#endif // defined(_TARGET_ARM64_) + #ifdef DEBUG opts.dspInstrs = false; opts.dspEmit = false; @@ -3418,6 +3427,18 @@ void Compiler::compInitOptions(JitFlags* jitFlags) memset(compActiveStressModes, 0, sizeof(compActiveStressModes)); + // Read function list, if not already read, and there exists such a list. + if (!s_pJitFunctionFileInitialized) + { + const wchar_t* functionFileName = JitConfig.JitFunctionFile(); + if (functionFileName != nullptr) + { + s_pJitMethodSet = + new (HostAllocator::getHostAllocator()) MethodSet(functionFileName, HostAllocator::getHostAllocator()); + } + s_pJitFunctionFileInitialized = true; + } + #endif // DEBUG //------------------------------------------------------------------------- @@ -3673,6 +3694,13 @@ void Compiler::compInitOptions(JitFlags* jitFlags) } #endif // UNIX_AMD64_ABI #endif + +#if defined(DEBUG) && defined(_TARGET_ARM64_) + if ((s_pJitMethodSet == nullptr) || s_pJitMethodSet->IsActiveMethod(info.compFullName, info.compMethodHash())) + { + opts.compJitSaveFpLrWithCalleeSavedRegisters = JitConfig.JitSaveFpLrWithCalleeSavedRegisters(); + } +#endif // defined(DEBUG) && defined(_TARGET_ARM64_) } #ifdef DEBUG diff --git a/src/jit/compiler.h b/src/jit/compiler.h index 940893c0b7..2940bc9d77 100644 --- a/src/jit/compiler.h +++ b/src/jit/compiler.h @@ -8486,6 +8486,12 @@ public: bool compTailCallLoopOpt; #endif +#if defined(_TARGET_ARM64_) + // Decision about whether to save FP/LR registers with callee-saved registers (see + // COMPlus_JitSaveFpLrWithCalleeSavedRegisters).
+ int compJitSaveFpLrWithCalleeSavedRegisters; +#endif // defined(_TARGET_ARM64_) + #ifdef ARM_SOFTFP static const bool compUseSoftFP = true; #else // !ARM_SOFTFP @@ -8503,6 +8509,9 @@ public: #ifdef DEBUG static bool s_pJitDisasmIncludeAssembliesListInitialized; static AssemblyNamesList2* s_pJitDisasmIncludeAssembliesList; + + static bool s_pJitFunctionFileInitialized; + static MethodSet* s_pJitMethodSet; #endif // DEBUG #ifdef DEBUG diff --git a/src/jit/jitconfigvalues.h b/src/jit/jitconfigvalues.h index 328ea0c360..842d729111 100644 --- a/src/jit/jitconfigvalues.h +++ b/src/jit/jitconfigvalues.h @@ -371,6 +371,37 @@ CONFIG_INTEGER(JitGuardedDevirtualizationGuessUniqueInterface, W("JitGuardedDevi CONFIG_INTEGER(JitGuardedDevirtualizationGuessBestClass, W("JitGuardedDevirtualizationGuessBestClass"), 1) #endif // DEBUG +#if defined(DEBUG) +// JitFunctionFile: Name of a file that contains a list of functions. If the currently compiled function is in the +// file, certain other JIT config variables will be active. If the currently compiled function is not in the file, +// the specific JIT config variables will not be active. +// +// Functions are approximately in the format output by JitFunctionTrace, e.g.: +// +// System.CLRConfig:GetBoolValue(ref,byref):bool (MethodHash=3c54d35e) +// -- use the MethodHash, not the function name +// +// System.CLRConfig:GetBoolValue(ref,byref):bool +// -- use just the name +// +// Lines with leading ";" "#" or "//" are ignored. +// +// If this is unset, then the JIT config values have their normal behavior. 
+// +CONFIG_STRING(JitFunctionFile, W("JitFunctionFile")) +#endif // DEBUG + +#if defined(DEBUG) +#if defined(_TARGET_ARM64_) +// JitSaveFpLrWithCalleeSavedRegisters: +// 0: use default frame type decision +// 1: disable frames that save FP/LR registers with the callee-saved registers (at the top of the frame) +// 2: force all frames to use the frame types that save FP/LR registers with the callee-saved registers (at the top +// of the frame) +CONFIG_INTEGER(JitSaveFpLrWithCalleeSavedRegisters, W("JitSaveFpLrWithCalleeSavedRegisters"), 0) +#endif // defined(_TARGET_ARM64_) +#endif // DEBUG + #undef CONFIG_INTEGER #undef CONFIG_STRING #undef CONFIG_METHODSET diff --git a/src/jit/lclvars.cpp b/src/jit/lclvars.cpp index c9029443b8..473ade919d 100644 --- a/src/jit/lclvars.cpp +++ b/src/jit/lclvars.cpp @@ -4319,28 +4319,28 @@ unsigned Compiler::lvaGetMaxSpillTempSize() * * The frame is laid out as follows for x86: * - * ESP frames + * ESP frames * - * | | - * |-----------------------| - * | incoming | - * | arguments | - * |-----------------------| <---- Virtual '0' - * | return address | + * | | + * |-----------------------| + * | incoming | + * | arguments | + * |-----------------------| <---- Virtual '0' + * | return address | * +=======================+ - * |Callee saved registers | - * |-----------------------| - * | Temps | - * |-----------------------| - * | Variables | + * |Callee saved registers | + * |-----------------------| + * | Temps | + * |-----------------------| + * | Variables | * |-----------------------| <---- Ambient ESP - * | Arguments for the | - * ~ next function ~ - * | | - * | | | - * | | Stack grows | - * | downward - * V + * | Arguments for the | + * ~ next function ~ + * | | + * | | | + * | | Stack grows | + * | downward + * V * * * EBP frames @@ -4349,13 +4349,13 @@ unsigned Compiler::lvaGetMaxSpillTempSize() * |-----------------------| * | incoming | * | arguments | - * |-----------------------| <---- Virtual '0' - * | return address | + * 
|-----------------------| <---- Virtual '0' + * | return address | * +=======================+ * | incoming EBP | * |-----------------------| <---- EBP - * |Callee saved registers | - * |-----------------------| + * |Callee saved registers | + * |-----------------------| * | security object | * |-----------------------| * | ParamTypeArg | @@ -4385,39 +4385,39 @@ unsigned Compiler::lvaGetMaxSpillTempSize() * * The frame is laid out as follows for x64: * - * RSP frames - * | | - * |-----------------------| - * | incoming | - * | arguments | - * |-----------------------| - * | 4 fixed incoming | - * | argument slots | + * RSP frames + * | | + * |-----------------------| + * | incoming | + * | arguments | + * |-----------------------| + * | 4 fixed incoming | + * | argument slots | * |-----------------------| <---- Caller's SP & Virtual '0' - * | return address | + * | return address | * +=======================+ - * | Callee saved Int regs | + * | Callee saved Int regs | * ------------------------- * | Padding | <---- this padding (0 or 8 bytes) is to ensure flt registers are saved at a mem location aligned at 16-bytes * | | so that we can save 128-bit callee saved xmm regs using performant "movaps" instruction instead of "movups" * ------------------------- * | Callee saved Flt regs | <----- entire 128-bits of callee saved xmm registers are stored here - * |-----------------------| - * | Temps | - * |-----------------------| - * | Variables | * |-----------------------| - * | Arguments for the | - * ~ next function ~ - * | | - * |-----------------------| - * | 4 fixed outgoing | - * | argument slots | + * | Temps | + * |-----------------------| + * | Variables | + * |-----------------------| + * | Arguments for the | + * ~ next function ~ + * | | + * |-----------------------| + * | 4 fixed outgoing | + * | argument slots | * |-----------------------| <---- Ambient RSP - * | | | - * ~ | Stack grows ~ - * | | downward | - * V + * | | | + * ~ | Stack grows ~ + * | | 
downward | + * V * * * RBP frames @@ -4425,30 +4425,30 @@ unsigned Compiler::lvaGetMaxSpillTempSize() * |-----------------------| * | incoming | * | arguments | - * |-----------------------| - * | 4 fixed incoming | - * | argument slots | + * |-----------------------| + * | 4 fixed incoming | + * | argument slots | * |-----------------------| <---- Caller's SP & Virtual '0' - * | return address | + * | return address | * +=======================+ - * | Callee saved Int regs | + * | Callee saved Int regs | * ------------------------- - * | Padding | + * | Padding | * ------------------------- - * | Callee saved Flt regs | - * |-----------------------| + * | Callee saved Flt regs | + * |-----------------------| * | security object | * |-----------------------| * | ParamTypeArg | * |-----------------------| * | | - * | | + * | | * ~ Variables ~ - * | | + * | | * | | * |-----------------------| * | Temps | - * |-----------------------| + * |-----------------------| * | | * ~ localloc ~ // not in frames with EH * | | @@ -4456,31 +4456,31 @@ unsigned Compiler::lvaGetMaxSpillTempSize() * | PSPSym | // only in frames with EH (thus no localloc) * | | * |-----------------------| <---- RBP in localloc frames (max 240 bytes from Initial-SP) - * | Arguments for the | - * ~ next function ~ - * | | - * |-----------------------| - * | 4 fixed outgoing | - * | argument slots | + * | Arguments for the | + * ~ next function ~ + * | | + * |-----------------------| + * | 4 fixed outgoing | + * | argument slots | * |-----------------------| <---- Ambient RSP (before localloc, this is Initial-SP) - * | | | - * ~ | Stack grows ~ - * | | downward | + * | | | + * ~ | Stack grows ~ + * | | downward | * V * * * The frame is laid out as follows for ARM (this is a general picture; details may differ for different conditions): * - * SP frames - * | | - * |-----------------------| - * | incoming | - * | arguments | + * SP frames + * | | + * |-----------------------| + * | incoming | + * | 
arguments | * +=======================+ <---- Caller's SP - * | Pre-spill registers | + * | Pre-spill registers | * |-----------------------| <---- Virtual '0' - * |Callee saved registers | - * |-----------------------| + * |Callee saved registers | + * |-----------------------| * ~ possible double align ~ * |-----------------------| * | security object | @@ -4501,13 +4501,13 @@ unsigned Compiler::lvaGetMaxSpillTempSize() * |-----------------------| * ~ possible double align ~ * |-----------------------| - * | Arguments for the | - * ~ next function ~ - * | | + * | Arguments for the | + * ~ next function ~ + * | | * |-----------------------| <---- Ambient SP - * | | | - * ~ | Stack grows ~ - * | | downward | + * | | | + * ~ | Stack grows ~ + * | | downward | * V * * @@ -4517,10 +4517,10 @@ unsigned Compiler::lvaGetMaxSpillTempSize() * | incoming | * | arguments | * +=======================+ <---- Caller's SP - * | Pre-spill registers | + * | Pre-spill registers | * |-----------------------| <---- Virtual '0' - * |Callee saved registers | - * |-----------------------| + * |Callee saved registers | + * |-----------------------| * | PSPSym | // Only for frames with EH, which means FP-based frames * |-----------------------| * ~ possible double align ~ @@ -4545,13 +4545,13 @@ unsigned Compiler::lvaGetMaxSpillTempSize() * |-----------------------| * | localloc | * |-----------------------| - * | Arguments for the | - * ~ next function ~ - * | | + * | Arguments for the | + * ~ next function ~ + * | | * |-----------------------| <---- Ambient SP - * | | | - * ~ | Stack grows ~ - * | | downward | + * | | | + * ~ | Stack grows ~ + * | | downward | * V * * @@ -4560,17 +4560,17 @@ unsigned Compiler::lvaGetMaxSpillTempSize() * NOTE: SP must be 16-byte aligned, so there may be alignment slots in the frame. * We will often save and establish a frame pointer to create better ETW stack walks. 
* - * SP frames - * | | - * |-----------------------| - * | incoming | - * | arguments | + * SP frames + * | | + * |-----------------------| + * | incoming | + * | arguments | * +=======================+ <---- Caller's SP * | homed | // this is only needed if reg argument need to be homed, e.g., for varargs - * | register arguments | + * | register arguments | * |-----------------------| <---- Virtual '0' * |Callee saved registers | - * | except fp/lr | + * | except fp/lr | * |-----------------------| * | security object | * |-----------------------| @@ -4591,13 +4591,13 @@ unsigned Compiler::lvaGetMaxSpillTempSize() * | Saved LR | * |-----------------------| * | Saved FP | <---- Frame pointer - * |-----------------------| + * |-----------------------| * | Stack arguments for | * | the next function | * |-----------------------| <---- SP - * | | | - * ~ | Stack grows ~ - * | | downward | + * | | | + * ~ | Stack grows ~ + * | | downward | * V * * @@ -4608,10 +4608,10 @@ unsigned Compiler::lvaGetMaxSpillTempSize() * | arguments | * +=======================+ <---- Caller's SP * | optional homed | // this is only needed if reg argument need to be homed, e.g., for varargs - * | register arguments | - * |-----------------------| <---- Virtual '0' + * | register arguments | + * |-----------------------| <---- Virtual '0' * |Callee saved registers | - * | except fp/lr | + * | except fp/lr | * |-----------------------| * | PSPSym | // Only for frames with EH, which requires FP-based frames * |-----------------------| @@ -4640,9 +4640,53 @@ unsigned Compiler::lvaGetMaxSpillTempSize() * | Stack arguments for | * | the next function | * |-----------------------| <---- Ambient SP - * | | | - * ~ | Stack grows ~ - * | | downward | + * | | | + * ~ | Stack grows ~ + * | | downward | + * V + * + * + * FP (R29 / x29) frames where FP/LR are stored at the top of the frame (frames requiring GS that have localloc) + * | | + * |-----------------------| + * | incoming | + * | arguments | 
+ * +=======================+ <---- Caller's SP + * | optional homed | // this is only needed if reg argument need to be homed, e.g., for varargs + * | register arguments | + * |-----------------------| <---- Virtual '0' + * | Saved LR | + * |-----------------------| + * | Saved FP | <---- Frame pointer + * |-----------------------| + * |Callee saved registers | + * |-----------------------| + * | PSPSym | // Only for frames with EH, which requires FP-based frames + * |-----------------------| + * | security object | + * |-----------------------| + * | ParamTypeArg | + * |-----------------------| + * | possible GS cookie | + * |-----------------------| + * | Variables | + * |-----------------------| + * | possible GS cookie | + * |-----------------------| + * | Temps | + * |-----------------------| + * | Stub Argument Var | + * |-----------------------| + * |Inlined PInvoke Frame V| + * |-----------------------| + * ~ localloc ~ + * |-----------------------| + * | Stack arguments for | + * | the next function | + * |-----------------------| <---- Ambient SP + * | | | + * ~ | Stack grows ~ + * | | downward | * V * * @@ -5576,6 +5620,30 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() codeGen->setFramePointerUsed(codeGen->isFramePointerRequired()); } +#ifdef _TARGET_ARM64_ + // Decide where to save FP and LR registers. We store FP/LR registers at the bottom of the frame if there is + // a frame pointer used (so we get positive offsets from the frame pointer to access locals), but not if we + // need a GS cookie AND localloc is used, since we need the GS cookie to protect the saved return value, + // and also the saved frame pointer. See CodeGen::genPushCalleeSavedRegisters() for more details about the + // frame types. Since saving FP/LR at high addresses is a relatively rare case, force using it during stress. + // (It should be legal to use these frame types for every frame). 
+ + if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 0) + { + // Default configuration + codeGen->SetSaveFpLrWithAllCalleeSavedRegisters((getNeedsGSSecurityCookie() && compLocallocUsed) || + compStressCompile(STRESS_GENERIC_VARN, 20)); + } + else if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 1) + { + codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(false); // Disable using new frames + } + else if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 2) + { + codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); // Force using new frames + } +#endif // _TARGET_ARM64_ + #ifdef _TARGET_XARCH_ // On x86/amd64, the return address has already been pushed by the call instruction in the caller. stkOffs -= TARGET_POINTER_SIZE; // return address; @@ -5618,15 +5686,16 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() stkOffs -= initialStkOffs; } - if (isFramePointerUsed()) + if (codeGen->IsSaveFpLrWithAllCalleeSavedRegisters() || + !isFramePointerUsed()) // Note that currently we always have a frame pointer { - // Subtract off FP and LR. - assert(compCalleeRegsPushed >= 2); - stkOffs -= (compCalleeRegsPushed - 2) * REGSIZE_BYTES; + stkOffs -= compCalleeRegsPushed * REGSIZE_BYTES; } else { - stkOffs -= compCalleeRegsPushed * REGSIZE_BYTES; + // Subtract off FP and LR. + assert(compCalleeRegsPushed >= 2); + stkOffs -= (compCalleeRegsPushed - 2) * REGSIZE_BYTES; } #else // !_TARGET_ARM64_ @@ -6207,7 +6276,8 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() #endif // FEATURE_EH_FUNCLETS && defined(_TARGET_AMD64_) #ifdef _TARGET_ARM64_ - if (isFramePointerUsed()) + if (!codeGen->IsSaveFpLrWithAllCalleeSavedRegisters() && + isFramePointerUsed()) // Note that currently we always have a frame pointer { // Create space for saving FP and LR. 
+ stkOffs -= 2 * REGSIZE_BYTES; diff --git a/src/jit/utils.cpp b/src/jit/utils.cpp index ae7dd60ad4..2010678242 100644 --- a/src/jit/utils.cpp +++ b/src/jit/utils.cpp @@ -1557,6 +1557,194 @@ bool AssemblyNamesList2::IsInList(const char* assemblyName) return false; } +//============================================================================= +// MethodSet +//============================================================================= + +MethodSet::MethodSet(const wchar_t* filename, HostAllocator alloc) : m_pInfos(nullptr), m_alloc(alloc) +{ + FILE* methodSetFile = _wfopen(filename, W("r")); + if (methodSetFile == nullptr) + { + return; + } + + MethodInfo* lastInfo = m_pInfos; + char buffer[1024]; + + while (true) + { + // Get next line + if (fgets(buffer, sizeof(buffer), methodSetFile) == nullptr) + { + break; + } + + // Ignore lines starting with leading ";" "#" "//". + if ((0 == _strnicmp(buffer, ";", 1)) || (0 == _strnicmp(buffer, "#", 1)) || (0 == _strnicmp(buffer, "//", 2))) + { + continue; + } + + // Remove trailing newline, if any. + char* p = strpbrk(buffer, "\r\n"); + if (p != nullptr) + { + *p = '\0'; + } + + char* methodName; + unsigned methodHash = 0; + + // Parse the line. Very simple. One of: + // + // <method-name> + // <method-name><whitespace>(MethodHash=<hash>) + + const char methodHashPattern[] = " (MethodHash="; + p = strstr(buffer, methodHashPattern); + if (p == nullptr) + { + // Just use it without the hash. + methodName = _strdup(buffer); + } + else + { + // There's a method hash; use that. + + // First, get the method name. + char* p2 = p; + *p = '\0'; + + // Null terminate method at first whitespace. (Don't have any leading whitespace!) + p = strpbrk(buffer, " \t"); + if (p != nullptr) + { + *p = '\0'; + } + methodName = _strdup(buffer); + + // Now get the method hash. + p2 += strlen(methodHashPattern); + char* p3 = strchr(p2, ')'); + if (p3 == nullptr) + { + // Malformed line: no closing parenthesis.
+ JITDUMP("Couldn't parse: %s\n", p2); + // We can still just use the method name. + } + else + { + // Convert the closing parenthesis to null. + *p3 = '\0'; + + // Now parse it as hex. + int count = sscanf_s(p2, "%x", &methodHash); + if (count != 1) + { + JITDUMP("Couldn't parse: %s\n", p2); + // Still, use the method name. + } + } + } + + MethodInfo* newInfo = new (m_alloc) MethodInfo(methodName, methodHash); + if (m_pInfos == nullptr) + { + m_pInfos = lastInfo = newInfo; + } + else + { + lastInfo->m_next = newInfo; + lastInfo = newInfo; + } + } + + if (m_pInfos == nullptr) + { + JITDUMP("No methods read from %ws\n", filename); + } + else + { + JITDUMP("Methods read from %ws:\n", filename); + + int methodCount = 0; + for (MethodInfo* pInfo = m_pInfos; pInfo != nullptr; pInfo = pInfo->m_next) + { + JITDUMP(" %s (MethodHash: %x)\n", pInfo->m_MethodName, pInfo->m_MethodHash); + ++methodCount; + } + + if (methodCount > 100) + { + JITDUMP("Warning: high method count (%d) for MethodSet with linear search lookups might be slow\n", + methodCount); + } + } +} + +MethodSet::~MethodSet() +{ + for (MethodInfo* pInfo = m_pInfos; pInfo != nullptr; /**/) + { + MethodInfo* cur = pInfo; + pInfo = pInfo->m_next; + + m_alloc.deallocate(cur->m_MethodName); + m_alloc.deallocate(cur); + } +} + +// TODO: make this more like JitConfigValues::MethodSet::contains()? +bool MethodSet::IsInSet(const char* methodName) +{ + for (MethodInfo* pInfo = m_pInfos; pInfo != nullptr; pInfo = pInfo->m_next) + { + if (_stricmp(pInfo->m_MethodName, methodName) == 0) + { + return true; + } + } + + return false; +} + +bool MethodSet::IsInSet(int methodHash) +{ + for (MethodInfo* pInfo = m_pInfos; pInfo != nullptr; pInfo = pInfo->m_next) + { + if (pInfo->m_MethodHash == methodHash) + { + return true; + } + } + + return false; +} + +bool MethodSet::IsActiveMethod(const char* methodName, int methodHash) +{ + if (methodHash != 0) + { + // Use the method hash.
+ if (IsInSet(methodHash)) + { + JITDUMP("Method active in MethodSet (hash match): %s Hash: %x\n", methodName, methodHash); + return true; + } + } + + // Else, fall back and use the method name. + assert(methodName != nullptr); + if (IsInSet(methodName)) + { + JITDUMP("Method active in MethodSet (name match): %s Hash: %x\n", methodName, methodHash); + return true; + } + + return false; +} + #ifdef FEATURE_JIT_METHOD_PERF CycleCount::CycleCount() : cps(CycleTimer::CyclesPerSecond()) { diff --git a/src/jit/utils.h b/src/jit/utils.h index ec3b0e3e32..1bc3daf8b3 100644 --- a/src/jit/utils.h +++ b/src/jit/utils.h @@ -550,6 +550,60 @@ public: } }; +// MethodSet: Manage a list of methods that is read from a file. +// +// Methods are approximately in the format output by JitFunctionTrace, e.g.: +// +// System.CLRConfig:GetBoolValue(ref,byref):bool (MethodHash=3c54d35e) +// -- use the MethodHash, not the method name +// +// System.CLRConfig:GetBoolValue(ref,byref):bool +// -- use just the name +// +// Method names should not have any leading whitespace. +// +// TODO: Should this be more related to JitConfigValues::MethodSet? +// +class MethodSet +{ + // TODO: use a hash table? or two: one on hash value, one on function name + struct MethodInfo + { + char* m_MethodName; + int m_MethodHash; + MethodInfo* m_next; + + MethodInfo(char* methodName, int methodHash) + : m_MethodName(methodName), m_MethodHash(methodHash), m_next(nullptr) + { + } + }; + + MethodInfo* m_pInfos; // List of function info + HostAllocator m_alloc; // HostAllocator to use in this class + +public: + // Take a Unicode string with the filename containing a list of function names, parse it, and store it. + MethodSet(const wchar_t* filename, HostAllocator alloc); + + ~MethodSet(); + + // Return 'true' if 'functionName' (in UTF-8 format) is in the stored set of assembly names. 
+ bool IsInSet(const char* functionName); + + // Return 'true' if 'functionHash' is in the stored set of method hashes. + bool IsInSet(int functionHash); + + // Return 'true' if this method is active. Prefer non-zero methodHash for check over (non-null) methodName. + bool IsActiveMethod(const char* methodName, int methodHash); + + // Return 'true' if the method set is empty. + bool IsEmpty() + { + return m_pInfos == nullptr; + } +}; + #ifdef FEATURE_JIT_METHOD_PERF // When Start() is called time is noted and when ElapsedTime // is called we know how much time was spent in msecs. |