diff options
Diffstat (limited to 'src/jit/codegenarm64.cpp')
-rw-r--r-- | src/jit/codegenarm64.cpp | 8687 |
1 files changed, 8687 insertions, 0 deletions
diff --git a/src/jit/codegenarm64.cpp b/src/jit/codegenarm64.cpp
new file mode 100644
index 0000000000..71b238308d
--- /dev/null
+++ b/src/jit/codegenarm64.cpp
@@ -0,0 +1,8687 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
//

/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX                                                                           XX
XX                           Arm64 Code Generator                            XX
XX                                                                           XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/
#include "jitpch.h"
#ifdef _MSC_VER
#pragma hdrstop
#endif

#ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator

#ifdef _TARGET_ARM64_
#include "emit.h"
#include "codegen.h"
#include "lower.h"
#include "gcinfo.h"
#include "gcinfoencoder.h"

/*
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX                                                                           XX
XX                           Prolog / Epilog                                 XX
XX                                                                           XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/

//------------------------------------------------------------------------
// genStackPointerAdjustment: add a specified constant value to the stack pointer in either the prolog
// or the epilog. The unwind codes for the generated instructions are produced. An available temporary
// register is required to be specified, in case the constant is too large to encode in an "add"
// instruction (or "sub" instruction if we choose to use one), such that we need to load the constant
// into a register first, before using it.
//
// Arguments:
//    spDelta       - the value to add to SP (can be negative)
//    tmpReg        - an available temporary register
//    pTmpRegIsZero - If we use tmpReg, and pTmpRegIsZero is non-null, we set *pTmpRegIsZero to 'false'.
//                    Otherwise, we don't touch it.
//
// Return Value:
//    None.

void CodeGen::genStackPointerAdjustment(ssize_t spDelta, regNumber tmpReg, bool* pTmpRegIsZero)
{
    unsigned unwindSpDelta;

    if (emitter::emitIns_valid_imm_for_add(spDelta, EA_8BYTE))
    {
        // The delta fits in the ADD immediate field; a negative immediate is
        // handled by the emitter (it becomes a SUB).
        getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, spDelta);

        // NOTE(review): abs() here takes an 'int'; spDelta is ssize_t, so a delta
        // outside int range would be silently truncated — presumably frame sizes
        // never get that large, but confirm. TODO: verify against frame-size limits.
        unwindSpDelta = (unsigned)abs(spDelta);
    }
    else
    {
        // Delta is too large for an immediate: materialize it in tmpReg first,
        // then do a register-form ADD/SUB on SP.
        bool adjustmentIsNegative = (spDelta < 0);
        spDelta = abs(spDelta);
        instGen_Set_Reg_To_Imm(EA_PTRSIZE, tmpReg, spDelta);
        if (pTmpRegIsZero != nullptr)
        {
            // tmpReg no longer holds zero; tell the caller so it doesn't reuse it as a zero source.
            *pTmpRegIsZero = false;
        }
        // The constant-load instruction(s) are not themselves described by unwind codes.
        compiler->unwindPadding();

        getEmitter()->emitIns_R_R_R(adjustmentIsNegative ? INS_sub : INS_add, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, tmpReg);

        unwindSpDelta = (unsigned)spDelta;
    }

    // spDelta is negative in the prolog, positive in the epilog, but we always tell the unwind codes the positive value.
    compiler->unwindAllocStack(unwindSpDelta);
}

//------------------------------------------------------------------------
// genPrologSaveRegPair: Save a pair of general-purpose or floating-point/SIMD registers in a function or funclet prolog.
// If possible, we use pre-indexed addressing to adjust SP and store the registers with a single instruction.
// The caller must ensure that we can use the STP instruction, and that spOffset will be in the legal range for that instruction.
//
// Arguments:
//    reg1                     - First register of pair to save.
//    reg2                     - Second register of pair to save.
//    spOffset                 - The offset from SP to store reg1 (must be positive or zero).
//    spDelta                  - If non-zero, the amount to add to SP before the register saves (must be negative or zero).
//    lastSavedWasPreviousPair - True if the last prolog instruction was to save the previous register pair. This allows us to
//                               emit the "save_next" unwind code.
//    tmpReg                   - An available temporary register. Needed for the case of large frames.
//    pTmpRegIsZero            - If we use tmpReg, and pTmpRegIsZero is non-null, we set *pTmpRegIsZero to 'false'.
//                               Otherwise, we don't touch it.
//
// Return Value:
//    None.

void CodeGen::genPrologSaveRegPair(regNumber reg1,
                                   regNumber reg2,
                                   int       spOffset,
                                   int       spDelta,
                                   bool      lastSavedWasPreviousPair,
                                   regNumber tmpReg,
                                   bool*     pTmpRegIsZero)
{
    assert(spOffset >= 0);
    assert(spDelta <= 0);
    assert((spDelta % 16) == 0); // SP changes must be 16-byte aligned
    assert(genIsValidFloatReg(reg1) == genIsValidFloatReg(reg2)); // registers must be both general-purpose, or both FP/SIMD

    bool needToSaveRegs = true;
    if (spDelta != 0)
    {
        if ((spOffset == 0) && (spDelta >= -512))
        {
            // We can use pre-indexed addressing.
            // stp REG, REG + 1, [SP, #spDelta]!
            // 64-bit STP offset range: -512 to 504, multiple of 8.
            getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, reg1, reg2, REG_SPBASE, spDelta, INS_OPTS_PRE_INDEX);
            compiler->unwindSaveRegPairPreindexed(reg1, reg2, spDelta);

            needToSaveRegs = false;
        }
        else
        {
            // We need to do SP adjustment separately from the store; we can't fold in a pre-indexed addressing
            // and the non-zero offset.
            genStackPointerAdjustment(spDelta, tmpReg, pTmpRegIsZero);
        }
    }

    if (needToSaveRegs)
    {
        // stp REG, REG + 1, [SP, #offset]
        // 64-bit STP offset range: -512 to 504, multiple of 8.
        assert(spOffset <= 504);
        getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, reg1, reg2, REG_SPBASE, spOffset);

        if (lastSavedWasPreviousPair)
        {
            // This works as long as we've only been saving pairs, in order, and we've saved the
            // previous one just before this one.
            compiler->unwindSaveNext();
        }
        else
        {
            compiler->unwindSaveRegPair(reg1, reg2, spOffset);
        }
    }
}

//------------------------------------------------------------------------
// genPrologSaveReg: Like genPrologSaveRegPair, but for a single register. Save a single general-purpose or floating-point/SIMD register
// in a function or funclet prolog. Note that if we wish to change SP (i.e., spDelta != 0), then spOffset must be 8. This is because
// otherwise we would create an alignment hole above the saved register, not below it, which we currently don't support. This restriction
// could be loosened if the callers change to handle it (and this function changes to support using pre-indexed STR addressing).
// The caller must ensure that we can use the STR instruction, and that spOffset will be in the legal range for that instruction.
//
// Arguments:
//    reg1          - Register to save.
//    spOffset      - The offset from SP to store reg1 (must be positive or zero).
//    spDelta       - If non-zero, the amount to add to SP before the register saves (must be negative or zero).
//    tmpReg        - An available temporary register. Needed for the case of large frames.
//    pTmpRegIsZero - If we use tmpReg, and pTmpRegIsZero is non-null, we set *pTmpRegIsZero to 'false'.
//                    Otherwise, we don't touch it.
//
// Return Value:
//    None.

void CodeGen::genPrologSaveReg(regNumber reg1,
                               int       spOffset,
                               int       spDelta,
                               regNumber tmpReg,
                               bool*     pTmpRegIsZero)
{
    assert(spOffset >= 0);
    assert(spDelta <= 0);
    assert((spDelta % 16) == 0); // SP changes must be 16-byte aligned

    if (spDelta != 0)
    {
        // If saving a single callee-save register, and we need to change SP, the offset cannot be zero.
        // It must be 8 to account for alignment.
        assert(spOffset != 0);
        assert(spOffset == REGSIZE_BYTES);

        genStackPointerAdjustment(spDelta, tmpReg, pTmpRegIsZero);
    }

    // str REG, [SP, #offset]
    // 64-bit STR offset range: 0 to 32760, multiple of 8.
    getEmitter()->emitIns_R_R_I(INS_str, EA_PTRSIZE, reg1, REG_SPBASE, spOffset);
    compiler->unwindSaveReg(reg1, spOffset);
}

//------------------------------------------------------------------------
// genEpilogRestoreRegPair: This is the opposite of genPrologSaveRegPair(), run in the epilog instead of the prolog.
// The stack pointer adjustment, if requested, is done after the register restore, using post-index addressing.
// The caller must ensure that we can use the LDP instruction, and that spOffset will be in the legal range for that instruction.
//
// Arguments:
//    reg1          - First register of pair to restore.
//    reg2          - Second register of pair to restore.
//    spOffset      - The offset from SP to load reg1 (must be positive or zero).
//    spDelta       - If non-zero, the amount to add to SP after the register restores (must be positive or zero).
//    tmpReg        - An available temporary register. Needed for the case of large frames.
//    pTmpRegIsZero - If we use tmpReg, and pTmpRegIsZero is non-null, we set *pTmpRegIsZero to 'false'.
//                    Otherwise, we don't touch it.
//
// Return Value:
//    None.

void CodeGen::genEpilogRestoreRegPair(regNumber reg1,
                                      regNumber reg2,
                                      int       spOffset,
                                      int       spDelta,
                                      regNumber tmpReg,
                                      bool*     pTmpRegIsZero)
{
    assert(spOffset >= 0);
    assert(spDelta >= 0);
    assert((spDelta % 16) == 0); // SP changes must be 16-byte aligned

    if (spDelta != 0)
    {
        if ((spOffset == 0) && (spDelta <= 504))
        {
            // Fold the SP change into this instruction.
            // ldp reg1, reg2, [SP], #spDelta
            getEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, reg1, reg2, REG_SPBASE, spDelta, INS_OPTS_POST_INDEX);
            // Unwind codes describe the prolog operation being un-done, hence the
            // "preindexed save" form with the negated delta.
            compiler->unwindSaveRegPairPreindexed(reg1, reg2, -spDelta);
        }
        else
        {
            // Can't fold in the SP change; need to use a separate ADD instruction.

            // ldp reg1, reg2, [SP, #offset]
            getEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, reg1, reg2, REG_SPBASE, spOffset);
            compiler->unwindSaveRegPair(reg1, reg2, spOffset);

            genStackPointerAdjustment(spDelta, tmpReg, pTmpRegIsZero);
        }
    }
    else
    {
        // ldp reg1, reg2, [SP, #offset]
        getEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, reg1, reg2, REG_SPBASE, spOffset);
        compiler->unwindSaveRegPair(reg1, reg2, spOffset);
    }
}

//------------------------------------------------------------------------
// genEpilogRestoreReg: The opposite of genPrologSaveReg(), run in the epilog instead of the prolog.
//
// Arguments:
//    reg1          - Register to restore.
//    spOffset      - The offset from SP to restore reg1 (must be positive or zero).
//    spDelta       - If non-zero, the amount to add to SP after the register restores (must be positive or zero).
//    tmpReg        - An available temporary register. Needed for the case of large frames.
//    pTmpRegIsZero - If we use tmpReg, and pTmpRegIsZero is non-null, we set *pTmpRegIsZero to 'false'.
//                    Otherwise, we don't touch it.
//
// Return Value:
//    None.

void CodeGen::genEpilogRestoreReg(regNumber reg1,
                                  int       spOffset,
                                  int       spDelta,
                                  regNumber tmpReg,
                                  bool*     pTmpRegIsZero)
{
    assert(spOffset >= 0);
    assert(spDelta >= 0);
    assert((spDelta % 16) == 0); // SP changes must be 16-byte aligned

    // ldr reg1, [SP, #offset]
    getEmitter()->emitIns_R_R_I(INS_ldr, EA_PTRSIZE, reg1, REG_SPBASE, spOffset);
    compiler->unwindSaveReg(reg1, spOffset);

    if (spDelta != 0)
    {
        // The SP adjustment follows the restore (opposite order from the prolog).
        assert(spOffset != 0);
        genStackPointerAdjustment(spDelta, tmpReg, pTmpRegIsZero);
    }
}

//------------------------------------------------------------------------
// genSaveCalleeSavedRegistersHelp: Save the callee-saved registers in 'regsToSaveMask' to the stack frame
// in the function or funclet prolog. The save set does not contain FP, since that is
// guaranteed to be saved separately, so we can set up chaining. We can only use the instructions
// that are allowed by the unwind codes. Integer registers are stored at lower addresses,
// FP/SIMD registers are stored at higher addresses. There are no gaps. The caller ensures that
// there is enough space on the frame to store these registers, and that the store instructions
// we need to use (STR or STP) are encodable with the stack-pointer immediate offsets we need to
// use. Note that the save set can contain LR if this is a frame without a frame pointer, in
// which case LR is saved along with the other callee-saved registers. The caller can tell us
// to fold in a stack pointer adjustment, which we will do with the first instruction. Note that
// the stack pointer adjustment must be by a multiple of 16 to preserve the invariant that the
// stack pointer is always 16 byte aligned. If we are saving an odd number of callee-saved
// registers, though, we will have an empty alignment slot somewhere. It turns out we will put
// it below (at a lower address) the callee-saved registers, as that is currently how we
// do frame layout. This means that the first stack offset will be 8 and the stack pointer
// adjustment must be done by a SUB, and not folded in to a pre-indexed store.
//
// Arguments:
//    regsToSaveMask          - The mask of callee-saved registers to save. If empty, this function does nothing.
//    lowestCalleeSavedOffset - The offset from SP that is the beginning of the callee-saved register area. Note that
//                              if non-zero spDelta, then this is the offset of the first save *after* that
//                              SP adjustment.
//    spDelta                 - If non-zero, the amount to add to SP before the register saves (must be negative or zero).
//
// Return Value:
//    None.
+ +void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, + int lowestCalleeSavedOffset, + int spDelta) +{ + unsigned regsToSaveCount = genCountBits(regsToSaveMask); + if (regsToSaveCount == 0) + { + return; + } + + assert(spDelta <= 0); + assert((spDelta % 16) == 0); + assert((regsToSaveMask & RBM_FP) == 0); // we never save FP here + assert(regsToSaveCount <= genCountBits(RBM_CALLEE_SAVED | RBM_LR)); // We also save LR, even though it is not in RBM_CALLEE_SAVED. + + regMaskTP maskSaveRegsFloat = regsToSaveMask & RBM_ALLFLOAT; + regMaskTP maskSaveRegsInt = regsToSaveMask & ~maskSaveRegsFloat; + + int spOffset = lowestCalleeSavedOffset; // this is the offset *after* we change SP. + + if (maskSaveRegsInt != RBM_NONE) + { + // Save the integer registers + + unsigned intRegsToSaveCount = genCountBits(maskSaveRegsInt); + bool lastSavedWasPair = false; + + while (maskSaveRegsInt != RBM_NONE) + { + regMaskTP reg1Mask = genFindLowestBit(maskSaveRegsInt); + regNumber reg1 = genRegNumFromMask(reg1Mask); + maskSaveRegsInt &= ~reg1Mask; + + if (intRegsToSaveCount >= 2) + { + // We can use a STP instruction. + + regMaskTP reg2Mask = genFindLowestBit(maskSaveRegsInt); + regNumber reg2 = genRegNumFromMask(reg2Mask); + assert((reg2 == REG_NEXT(reg1)) || (reg2 == REG_LR)); + maskSaveRegsInt &= ~reg2Mask; + + genPrologSaveRegPair(reg1, reg2, spOffset, spDelta, lastSavedWasPair, REG_IP0, nullptr); + + // TODO-ARM64-CQ: this code works in the prolog, but it's a bit weird to think about "next" when generating this epilog, to + // get the codes to match. Turn this off until that is better understood. + // lastSavedWasPair = true; + + intRegsToSaveCount -= 2; + spOffset += 2 * REGSIZE_BYTES; + } + else + { + // No register pair; we use a STR instruction. 
+ + assert(intRegsToSaveCount == 1); // this will be the last store we do + + genPrologSaveReg(reg1, spOffset, spDelta, REG_IP0, nullptr); + + lastSavedWasPair = false; + + intRegsToSaveCount -= 1; + spOffset += REGSIZE_BYTES; + } + + spDelta = 0; // We've now changed SP already, if necessary; don't do it again. + } + + assert(intRegsToSaveCount == 0); + } + + if (maskSaveRegsFloat != RBM_NONE) + { + // Save the floating-point/SIMD registers + + unsigned floatRegsToSaveCount = genCountBits(maskSaveRegsFloat); + bool lastSavedWasPair = false; + + while (maskSaveRegsFloat != RBM_NONE) + { + regMaskTP reg1Mask = genFindLowestBit(maskSaveRegsFloat); + regNumber reg1 = genRegNumFromMask(reg1Mask); + maskSaveRegsFloat &= ~reg1Mask; + + if (floatRegsToSaveCount >= 2) + { + // We can use a STP instruction. + + regMaskTP reg2Mask = genFindLowestBit(maskSaveRegsFloat); + regNumber reg2 = genRegNumFromMask(reg2Mask); + assert(reg2 == REG_NEXT(reg1)); + maskSaveRegsFloat &= ~reg2Mask; + + genPrologSaveRegPair(reg1, reg2, spOffset, spDelta, lastSavedWasPair, REG_IP0, nullptr); + + // TODO-ARM64-CQ: this code works in the prolog, but it's a bit weird to think about "next" when generating this epilog, to + // get the codes to match. Turn this off until that is better understood. + // lastSavedWasPair = true; + + floatRegsToSaveCount -= 2; + spOffset += 2 * FPSAVE_REGSIZE_BYTES; + } + else + { + // No register pair; we use a STR instruction. + + assert(floatRegsToSaveCount == 1); + + genPrologSaveReg(reg1, spOffset, spDelta, REG_IP0, nullptr); + + lastSavedWasPair = false; + + floatRegsToSaveCount -= 1; + spOffset += FPSAVE_REGSIZE_BYTES; + } + + spDelta = 0; // We've now changed SP already, if necessary; don't do it again. 
+ } + + assert(floatRegsToSaveCount == 0); + } +} + + +//------------------------------------------------------------------------ +// genRestoreCalleeSavedRegistersHelp: Restore the callee-saved registers in 'regsToRestoreMask' from the stack frame +// in the function or funclet epilog. This exactly reverses the actions of genSaveCalleeSavedRegistersHelp(). +// +// Arguments: +// regsToRestoreMask - The mask of callee-saved registers to restore. If empty, this function does nothing. +// lowestCalleeSavedOffset - The offset from SP that is the beginning of the callee-saved register area. +// spDelta - If non-zero, the amount to add to SP after the register restores (must be positive or zero). +// +// Here's an example restore sequence: +// ldp x27, x28, [sp,#96] +// ldp x25, x26, [sp,#80] +// ldp x23, x24, [sp,#64] +// ldp x21, x22, [sp,#48] +// ldp x19, x20, [sp,#32] +// +// For the case of non-zero spDelta, we assume the base of the callee-save registers to restore is at SP, and +// the last restore adjusts SP by the specified amount. For example: +// ldp x27, x28, [sp,#64] +// ldp x25, x26, [sp,#48] +// ldp x23, x24, [sp,#32] +// ldp x21, x22, [sp,#16] +// ldp x19, x20, [sp], #80 +// +// Note you call the unwind functions specifying the prolog operation that is being un-done. So, for example, when generating +// a post-indexed load, you call the unwind function for specifying the corresponding preindexed store. +// +// Return Value: +// None. + +void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, + int lowestCalleeSavedOffset, + int spDelta) +{ + unsigned regsToRestoreCount = genCountBits(regsToRestoreMask); + if (regsToRestoreCount == 0) + { + return; + } + + assert(spDelta >= 0); + assert((spDelta % 16) == 0); + assert((regsToRestoreMask & RBM_FP) == 0); // we never restore FP here + assert(regsToRestoreCount <= genCountBits(RBM_CALLEE_SAVED | RBM_LR)); // We also save LR, even though it is not in RBM_CALLEE_SAVED. 
+ + regMaskTP maskRestoreRegsFloat = regsToRestoreMask & RBM_ALLFLOAT; + regMaskTP maskRestoreRegsInt = regsToRestoreMask & ~maskRestoreRegsFloat; + + assert(REGSIZE_BYTES == FPSAVE_REGSIZE_BYTES); + int spOffset = lowestCalleeSavedOffset + regsToRestoreCount * REGSIZE_BYTES; // Point past the end, to start. We predecrement to find the offset to load from. + + // We want to restore in the opposite order we saved, so the unwind codes match. Be careful to handle odd numbers of + // callee-saved registers properly. + + if (maskRestoreRegsFloat != RBM_NONE) + { + // Restore the floating-point/SIMD registers + + unsigned floatRegsToRestoreCount = genCountBits(maskRestoreRegsFloat); + + while (maskRestoreRegsFloat != RBM_NONE) + { + if ((floatRegsToRestoreCount % 2) == 0) + { + assert(floatRegsToRestoreCount >= 2); + + regMaskTP reg2Mask = genFindHighestBit(maskRestoreRegsFloat); + regNumber reg2 = genRegNumFromMask(reg2Mask); + maskRestoreRegsFloat &= ~reg2Mask; + + regMaskTP reg1Mask = genFindHighestBit(maskRestoreRegsFloat); + regNumber reg1 = genRegNumFromMask(reg1Mask); + maskRestoreRegsFloat &= ~reg1Mask; + + spOffset -= 2 * FPSAVE_REGSIZE_BYTES; + + // Is this the last restore instruction? And have we've been told to adjust SP? + bool thisIsTheLastRestoreInstruction = (floatRegsToRestoreCount == 2) && (maskRestoreRegsInt == RBM_NONE); + genEpilogRestoreRegPair(reg1, reg2, spOffset, thisIsTheLastRestoreInstruction ? spDelta : 0, REG_IP0, nullptr); + + floatRegsToRestoreCount -= 2; + } + else + { + // We do the odd register first when restoring, last when saving. + assert((floatRegsToRestoreCount % 2) == 1); + + regMaskTP reg1Mask = genFindHighestBit(maskRestoreRegsFloat); + regNumber reg1 = genRegNumFromMask(reg1Mask); + maskRestoreRegsFloat &= ~reg1Mask; + + spOffset -= FPSAVE_REGSIZE_BYTES; + + // Is this the last restore instruction? And have we've been told to adjust SP? 
+ bool thisIsTheLastRestoreInstruction = (floatRegsToRestoreCount == 1) && (maskRestoreRegsInt == RBM_NONE); + genEpilogRestoreReg(reg1, spOffset, thisIsTheLastRestoreInstruction ? spDelta : 0, REG_IP0, nullptr); + + floatRegsToRestoreCount -= 1; + } + } + + assert(floatRegsToRestoreCount == 0); + } + + if (maskRestoreRegsInt != RBM_NONE) + { + // Restore the integer registers + + unsigned intRegsToRestoreCount = genCountBits(maskRestoreRegsInt); + + while (maskRestoreRegsInt != RBM_NONE) + { + if ((intRegsToRestoreCount % 2) == 0) + { + assert(intRegsToRestoreCount >= 2); + + regMaskTP reg2Mask = genFindHighestBit(maskRestoreRegsInt); + regNumber reg2 = genRegNumFromMask(reg2Mask); + maskRestoreRegsInt &= ~reg2Mask; + + regMaskTP reg1Mask = genFindHighestBit(maskRestoreRegsInt); + regNumber reg1 = genRegNumFromMask(reg1Mask); + maskRestoreRegsInt &= ~reg1Mask; + + spOffset -= 2 * REGSIZE_BYTES; + + // Is this the last restore instruction? And have we've been told to adjust SP? + bool thisIsTheLastRestoreInstruction = (intRegsToRestoreCount == 2); + genEpilogRestoreRegPair(reg1, reg2, spOffset, thisIsTheLastRestoreInstruction ? spDelta : 0, REG_IP0, nullptr); + + intRegsToRestoreCount -= 2; + } + else + { + // We do the odd register first when restoring, last when saving. + assert((intRegsToRestoreCount % 2) == 1); + + regMaskTP reg1Mask = genFindHighestBit(maskRestoreRegsInt); + regNumber reg1 = genRegNumFromMask(reg1Mask); + maskRestoreRegsInt &= ~reg1Mask; + + spOffset -= REGSIZE_BYTES; + + // Is this the last restore instruction? And have we've been told to adjust SP? + bool thisIsTheLastRestoreInstruction = (intRegsToRestoreCount == 1); + genEpilogRestoreReg(reg1, spOffset, thisIsTheLastRestoreInstruction ? spDelta : 0, REG_IP0, nullptr); + + intRegsToRestoreCount -= 1; + } + } + + assert(intRegsToRestoreCount == 0); + } +} + + +/***************************************************************************** + * + * Generates code for an EH funclet prolog. 
+ * + * Funclets have the following incoming arguments: + * + * catch: x0 = the exception object that was caught (see GT_CATCH_ARG) + * filter: x0 = the exception object to filter (see GT_CATCH_ARG), x1 = CallerSP of the containing function + * finally/fault: none + * + * Funclets set the following registers on exit: + * + * catch: x0 = the address at which execution should resume (see BBJ_EHCATCHRET) + * filter: x0 = non-zero if the handler should handle the exception, zero otherwise (see GT_RETFILT) + * finally/fault: none + * + * The ARM64 funclet prolog sequence is one of the following (Note: #framesz is total funclet frame size, + * including everything; #outsz is outgoing argument space. #framesz must be a multiple of 16): + * + * Frame type 1: + * For #outsz == 0 and #framesz <= 512: + * stp fp,lr,[sp,-#framesz]! ; establish the frame, save FP/LR + * stp x19,x20,[sp,#xxx] ; save callee-saved registers, as necessary + * + * The funclet frame is thus: + * + * | | + * |-----------------------| + * | incoming | + * | arguments | + * +=======================+ <---- Caller's SP + * |Callee saved registers | // multiple of 8 bytes + * |-----------------------| + * | PSP slot | // 8 bytes + * |-----------------------| + * ~ alignment padding ~ // To make the whole frame 16 byte aligned. + * |-----------------------| + * | Saved FP, LR | // 16 bytes + * |-----------------------| <---- Ambient SP + * | | | + * ~ | Stack grows ~ + * | | downward | + * V + * + * Frame type 2: + * For #outsz != 0 and #framesz <= 512: + * sub sp,sp,#framesz ; establish the frame + * stp fp,lr,[sp,#outsz] ; save FP/LR. 
+ * stp x19,x20,[sp,#xxx] ; save callee-saved registers, as necessary + * + * The funclet frame is thus: + * + * | | + * |-----------------------| + * | incoming | + * | arguments | + * +=======================+ <---- Caller's SP + * |Callee saved registers | // multiple of 8 bytes + * |-----------------------| + * | PSP slot | // 8 bytes + * |-----------------------| + * ~ alignment padding ~ // To make the whole frame 16 byte aligned. + * |-----------------------| + * | Saved FP, LR | // 16 bytes + * |-----------------------| + * | Outgoing arg space | // multiple of 8 bytes + * |-----------------------| <---- Ambient SP + * | | | + * ~ | Stack grows ~ + * | | downward | + * V + * + * Frame type 3: + * For #framesz > 512: + * stp fp,lr,[sp,- (#framesz - #outsz)]! ; establish the frame, save FP/LR: note that it is guaranteed here that (#framesz - #outsz) <= 168 + * stp x19,x20,[sp,#xxx] ; save callee-saved registers, as necessary + * sub sp,sp,#outsz ; create space for outgoing argument space + * + * The funclet frame is thus: + * + * | | + * |-----------------------| + * | incoming | + * | arguments | + * +=======================+ <---- Caller's SP + * |Callee saved registers | // multiple of 8 bytes + * |-----------------------| + * | PSP slot | // 8 bytes + * |-----------------------| + * ~ alignment padding ~ // To make the first SP subtraction 16 byte aligned + * |-----------------------| + * | Saved FP, LR | // 16 bytes + * |-----------------------| + * ~ alignment padding ~ // To make the whole frame 16 byte aligned (specifically, to 16-byte align the outgoing argument space). + * |-----------------------| + * | Outgoing arg space | // multiple of 8 bytes + * |-----------------------| <---- Ambient SP + * | | | + * ~ | Stack grows ~ + * | | downward | + * V + * + * Both #1 and #2 only change SP once. That means that there will be a maximum of one alignment slot needed. 
For the general case, #3, + * it is possible that we will need to add alignment to both changes to SP, leading to 16 bytes of alignment. Remember that the stack + * pointer needs to be 16 byte aligned at all times. The size of the PSP slot plus callee-saved registers space is a maximum of 168 bytes: + * (1 PSP slot + 12 integer registers + 8 FP/SIMD registers) * 8 bytes. The outgoing argument size, however, can be very large, if we call a + * function that takes a large number of arguments (note that we currently use the same outgoing argument space size in the funclet as for the main + * function, even if the funclet doesn't have any calls, or has a much smaller, or larger, maximum number of outgoing arguments for any call). + * In that case, we need to 16-byte align the initial change to SP, before saving off the callee-saved registers and establishing the PSPsym, + * so we can use the limited immediate offset encodings we have available, before doing another 16-byte aligned SP adjustment to create the + * outgoing argument space. Both changes to SP might need to add alignment padding. + * + * Note that in all cases, the PSPSym is in exactly the same position with respect to Caller-SP, and that location is the same relative to Caller-SP + * as in the main function. + * + * ; After this header, fill the PSP slot, for use by the VM (it gets reported with the GC info), or by code generation of nested filters. + * ; This is not part of the "OS prolog"; it has no associated unwind data, and is not reversed in the funclet epilog. + * + * if (this is a filter funclet) + * { + * // x1 on entry to a filter funclet is CallerSP of the containing function: + * // either the main function, or the funclet for a handler that this filter is dynamically nested within. + * // Note that a filter can be dynamically nested within a funclet even if it is not statically within + * // a funclet. 
Consider: + * // + * // try { + * // try { + * // throw new Exception(); + * // } catch(Exception) { + * // throw new Exception(); // The exception thrown here ... + * // } + * // } filter { // ... will be processed here, while the "catch" funclet frame is still on the stack + * // } filter-handler { + * // } + * // + * // Because of this, we need a PSP in the main function anytime a filter funclet doesn't know whether the enclosing frame will + * // be a funclet or main function. We won't know any time there is a filter protecting nested EH. To simplify, we just always + * // create a main function PSP for any function with a filter. + * + * ldr x1, [x1, #CallerSP_to_PSP_slot_delta] ; Load the CallerSP of the main function (stored in the PSP of the dynamically containing funclet or function) + * str x1, [sp, #SP_to_PSP_slot_delta] ; store the PSP + * add fp, x1, #Function_CallerSP_to_FP_delta ; re-establish the frame pointer + * } + * else + * { + * // This is NOT a filter funclet. The VM re-establishes the frame pointer on entry. + * // TODO-ARM64-CQ: if VM set x1 to CallerSP on entry, like for filters, we could save an instruction. + * + * add x3, fp, #Function_FP_to_CallerSP_delta ; compute the CallerSP, given the frame pointer. x3 is scratch. + * str x3, [sp, #SP_to_PSP_slot_delta] ; store the PSP + * } + * + * An example epilog sequence is then: + * + * add sp,sp,#outsz ; if any outgoing argument space + * ... ; restore callee-saved registers + * ldp x19,x20,[sp,#xxx] + * ldp fp,lr,[sp],#framesz + * ret lr + * + * The funclet frame is thus: + * + * | | + * |-----------------------| + * | incoming | + * | arguments | + * +=======================+ <---- Caller's SP + * |Callee saved registers | // multiple of 8 bytes + * |-----------------------| + * | PSP slot | // 8 bytes + * |-----------------------| + * | Saved FP, LR | // 16 bytes + * |-----------------------| + * ~ alignment padding ~ // To make the whole frame 16 byte aligned. 
+ * |-----------------------| + * | Outgoing arg space | // multiple of 8 bytes + * |-----------------------| <---- Ambient SP + * | | | + * ~ | Stack grows ~ + * | | downward | + * V + */ + +void CodeGen::genFuncletProlog(BasicBlock* block) +{ +#ifdef DEBUG + if (verbose) + printf("*************** In genFuncletProlog()\n"); +#endif + + assert(block != NULL); + assert(block->bbFlags && BBF_FUNCLET_BEG); + + ScopedSetVariable<bool> _setGeneratingProlog(&compiler->compGeneratingProlog, true); + + gcInfo.gcResetForBB(); + + compiler->unwindBegProlog(); + + regMaskTP maskSaveRegsFloat = genFuncletInfo.fiSaveRegs & RBM_ALLFLOAT; + regMaskTP maskSaveRegsInt = genFuncletInfo.fiSaveRegs & ~maskSaveRegsFloat; + + // Funclets must always save LR and FP, since when we have funclets we must have an FP frame. + assert((maskSaveRegsInt & RBM_LR) != 0); + assert((maskSaveRegsInt & RBM_FP) != 0); + + bool isFilter = (block->bbCatchTyp == BBCT_FILTER); + + regMaskTP maskArgRegsLiveIn; + if (isFilter) + { + maskArgRegsLiveIn = RBM_R0 | RBM_R1; + } + else if ((block->bbCatchTyp == BBCT_FINALLY) || (block->bbCatchTyp == BBCT_FAULT)) + { + maskArgRegsLiveIn = RBM_NONE; + } + else + { + maskArgRegsLiveIn = RBM_R0; + } + + int lowestCalleeSavedOffset = genFuncletInfo.fiSP_to_CalleeSave_delta; + + if (genFuncletInfo.fiFrameType == 1) + { + getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, genFuncletInfo.fiSpDelta1, INS_OPTS_PRE_INDEX); + compiler->unwindSaveRegPairPreindexed(REG_FP, REG_LR, genFuncletInfo.fiSpDelta1); + + assert(genFuncletInfo.fiSpDelta2 == 0); + assert(genFuncletInfo.fiSP_to_FPLR_save_delta == 0); + } + else if (genFuncletInfo.fiFrameType == 2) + { + getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, -genFuncletInfo.fiSpDelta1); + compiler->unwindAllocStack(-genFuncletInfo.fiSpDelta1); + + assert(genFuncletInfo.fiSpDelta2 == 0); + + getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, 
genFuncletInfo.fiSP_to_FPLR_save_delta); + compiler->unwindSaveRegPair(REG_FP, REG_LR, genFuncletInfo.fiSP_to_FPLR_save_delta); + } + else + { + assert(genFuncletInfo.fiFrameType == 3); + + getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, genFuncletInfo.fiSpDelta1, INS_OPTS_PRE_INDEX); + compiler->unwindSaveRegPairPreindexed(REG_FP, REG_LR, genFuncletInfo.fiSpDelta1); + + lowestCalleeSavedOffset += genFuncletInfo.fiSpDelta2; // We haven't done the second adjustment of SP yet. + } + maskSaveRegsInt &= ~(RBM_LR | RBM_FP); // We've saved these now + + genSaveCalleeSavedRegistersHelp(maskSaveRegsInt | maskSaveRegsFloat, lowestCalleeSavedOffset, 0); + + if (genFuncletInfo.fiFrameType == 3) + { + assert(genFuncletInfo.fiSpDelta2 != 0); + getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, -genFuncletInfo.fiSpDelta2); + compiler->unwindAllocStack(-genFuncletInfo.fiSpDelta2); + } + + // This is the end of the OS-reported prolog for purposes of unwinding + compiler->unwindEndProlog(); + + if (isFilter) + { + // This is the first block of a filter + + getEmitter()->emitIns_R_R_I(ins_Load(TYP_I_IMPL), EA_PTRSIZE, REG_R1, REG_R1, genFuncletInfo.fiCallerSP_to_PSP_slot_delta); + regTracker.rsTrackRegTrash(REG_R1); + getEmitter()->emitIns_R_R_I(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_R1, REG_SPBASE, genFuncletInfo.fiSP_to_PSP_slot_delta); + getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, REG_FPBASE, REG_R1, genFuncletInfo.fiFunction_CallerSP_to_FP_delta); + } + else + { + // This is a non-filter funclet + getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, REG_R3, REG_FPBASE, -genFuncletInfo.fiFunction_CallerSP_to_FP_delta); + regTracker.rsTrackRegTrash(REG_R3); + getEmitter()->emitIns_R_R_I(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_R3, REG_SPBASE, genFuncletInfo.fiSP_to_PSP_slot_delta); + } +} + + +/***************************************************************************** + * + * Generates code for an EH funclet epilog. 
+ */ + +void CodeGen::genFuncletEpilog() +{ +#ifdef DEBUG + if (verbose) + printf("*************** In genFuncletEpilog()\n"); +#endif + + ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true); + + bool unwindStarted = false; + + if (!unwindStarted) + { + // We can delay this until we know we'll generate an unwindable instruction, if necessary. + compiler->unwindBegEpilog(); + unwindStarted = true; + } + + regMaskTP maskRestoreRegsFloat = genFuncletInfo.fiSaveRegs & RBM_ALLFLOAT; + regMaskTP maskRestoreRegsInt = genFuncletInfo.fiSaveRegs & ~maskRestoreRegsFloat; + + // Funclets must always save LR and FP, since when we have funclets we must have an FP frame. + assert((maskRestoreRegsInt & RBM_LR) != 0); + assert((maskRestoreRegsInt & RBM_FP) != 0); + + maskRestoreRegsInt &= ~(RBM_LR | RBM_FP); // We restore FP/LR at the end + + int lowestCalleeSavedOffset = genFuncletInfo.fiSP_to_CalleeSave_delta; + + if (genFuncletInfo.fiFrameType == 3) + { + assert(genFuncletInfo.fiSpDelta2 != 0); + getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, -genFuncletInfo.fiSpDelta2); + compiler->unwindAllocStack(-genFuncletInfo.fiSpDelta2); + + lowestCalleeSavedOffset += genFuncletInfo.fiSpDelta2; + } + + regMaskTP regsToRestoreMask = maskRestoreRegsInt | maskRestoreRegsFloat; + genRestoreCalleeSavedRegistersHelp(regsToRestoreMask, lowestCalleeSavedOffset, 0); + + if (genFuncletInfo.fiFrameType == 1) + { + getEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, -genFuncletInfo.fiSpDelta1, INS_OPTS_POST_INDEX); + compiler->unwindSaveRegPairPreindexed(REG_FP, REG_LR, genFuncletInfo.fiSpDelta1); + + assert(genFuncletInfo.fiSpDelta2 == 0); + assert(genFuncletInfo.fiSP_to_FPLR_save_delta == 0); + } + else if (genFuncletInfo.fiFrameType == 2) + { + getEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, genFuncletInfo.fiSP_to_FPLR_save_delta); + compiler->unwindSaveRegPair(REG_FP, REG_LR, 
genFuncletInfo.fiSP_to_FPLR_save_delta); + + getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, -genFuncletInfo.fiSpDelta1); + compiler->unwindAllocStack(-genFuncletInfo.fiSpDelta1); + + assert(genFuncletInfo.fiSpDelta2 == 0); + } + else + { + assert(genFuncletInfo.fiFrameType == 3); + + getEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, -genFuncletInfo.fiSpDelta1, INS_OPTS_POST_INDEX); + compiler->unwindSaveRegPairPreindexed(REG_FP, REG_LR, genFuncletInfo.fiSpDelta1); + } + + inst_RV(INS_ret, REG_LR, TYP_I_IMPL); + compiler->unwindReturn(REG_LR); + + compiler->unwindEndEpilog(); +} + + +/***************************************************************************** + * + * Capture the information used to generate the funclet prologs and epilogs. + * Note that all funclet prologs are identical, and all funclet epilogs are + * identical (per type: filters are identical, and non-filters are identical). + * Thus, we compute the data used for these just once. + * + * See genFuncletProlog() for more information about the prolog/epilog sequences. 
+ */ + +void CodeGen::genCaptureFuncletPrologEpilogInfo() +{ + if (!compiler->ehAnyFunclets()) + return; + + assert(isFramePointerUsed()); + assert(compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT); // The frame size and offsets must be finalized + + genFuncletInfo.fiFunction_CallerSP_to_FP_delta = genCallerSPtoFPdelta(); + + regMaskTP rsMaskSaveRegs = regSet.rsMaskCalleeSaved; + assert((rsMaskSaveRegs & RBM_LR) != 0); + assert((rsMaskSaveRegs & RBM_FP) != 0); + + unsigned saveRegsCount = genCountBits(rsMaskSaveRegs); + unsigned saveRegsPlusPSPSize = saveRegsCount * REGSIZE_BYTES + /* PSPSym */ REGSIZE_BYTES; + unsigned saveRegsPlusPSPSizeAligned = (unsigned)roundUp(saveRegsPlusPSPSize, STACK_ALIGN); + + assert(compiler->lvaOutgoingArgSpaceSize % REGSIZE_BYTES == 0); + unsigned outgoingArgSpaceAligned = (unsigned)roundUp(compiler->lvaOutgoingArgSpaceSize, STACK_ALIGN); + + unsigned maxFuncletFrameSizeAligned = saveRegsPlusPSPSizeAligned + outgoingArgSpaceAligned; + assert((maxFuncletFrameSizeAligned % STACK_ALIGN) == 0); + + int SP_to_FPLR_save_delta; + int SP_to_PSP_slot_delta; + int CallerSP_to_PSP_slot_delta; + + if (maxFuncletFrameSizeAligned <= 512) + { + unsigned funcletFrameSize = saveRegsPlusPSPSize + compiler->lvaOutgoingArgSpaceSize; + unsigned funcletFrameSizeAligned = (unsigned)roundUp(funcletFrameSize, STACK_ALIGN); + assert(funcletFrameSizeAligned <= maxFuncletFrameSizeAligned); + + unsigned funcletFrameAlignmentPad = funcletFrameSizeAligned - funcletFrameSize; + assert((funcletFrameAlignmentPad == 0) || (funcletFrameAlignmentPad == REGSIZE_BYTES)); + + SP_to_FPLR_save_delta = compiler->lvaOutgoingArgSpaceSize; + SP_to_PSP_slot_delta = SP_to_FPLR_save_delta + 2 /* FP, LR */ * REGSIZE_BYTES + funcletFrameAlignmentPad; + CallerSP_to_PSP_slot_delta = -(int)(saveRegsPlusPSPSize - 2 /* FP, LR */ * REGSIZE_BYTES); + + if (compiler->lvaOutgoingArgSpaceSize == 0) + { + genFuncletInfo.fiFrameType = 1; + } + else + { + genFuncletInfo.fiFrameType = 
2; + } + genFuncletInfo.fiSpDelta1 = -(int)funcletFrameSizeAligned; + genFuncletInfo.fiSpDelta2 = 0; + + assert(genFuncletInfo.fiSpDelta1 + genFuncletInfo.fiSpDelta2 == -(int)funcletFrameSizeAligned); + } + else + { + unsigned saveRegsPlusPSPAlignmentPad = saveRegsPlusPSPSizeAligned - saveRegsPlusPSPSize; + assert((saveRegsPlusPSPAlignmentPad == 0) || (saveRegsPlusPSPAlignmentPad == REGSIZE_BYTES)); + + SP_to_FPLR_save_delta = outgoingArgSpaceAligned; + SP_to_PSP_slot_delta = SP_to_FPLR_save_delta + 2 /* FP, LR */ * REGSIZE_BYTES + saveRegsPlusPSPAlignmentPad; + CallerSP_to_PSP_slot_delta = -(int)(saveRegsPlusPSPSizeAligned - 2 /* FP, LR */ * REGSIZE_BYTES - saveRegsPlusPSPAlignmentPad); + + genFuncletInfo.fiFrameType = 3; + genFuncletInfo.fiSpDelta1 = -(int)saveRegsPlusPSPSizeAligned; + genFuncletInfo.fiSpDelta2 = -(int)outgoingArgSpaceAligned; + + assert(genFuncletInfo.fiSpDelta1 + genFuncletInfo.fiSpDelta2 == -(int)maxFuncletFrameSizeAligned); + } + + /* Now save it for future use */ + + genFuncletInfo.fiSaveRegs = rsMaskSaveRegs; + genFuncletInfo.fiSP_to_FPLR_save_delta = SP_to_FPLR_save_delta; + genFuncletInfo.fiSP_to_PSP_slot_delta = SP_to_PSP_slot_delta; + genFuncletInfo.fiSP_to_CalleeSave_delta = SP_to_PSP_slot_delta + REGSIZE_BYTES; + genFuncletInfo.fiCallerSP_to_PSP_slot_delta = CallerSP_to_PSP_slot_delta; + +#ifdef DEBUG + if (verbose) + { + printf("\n"); + printf("Funclet prolog / epilog info\n"); + printf(" Save regs: "); dspRegMask(genFuncletInfo.fiSaveRegs); printf("\n"); + printf(" Function CallerSP-to-FP delta: %d\n", genFuncletInfo.fiFunction_CallerSP_to_FP_delta); + printf(" SP to FP/LR save location delta: %d\n", genFuncletInfo.fiSP_to_FPLR_save_delta); + printf(" SP to PSP slot delta: %d\n", genFuncletInfo.fiSP_to_PSP_slot_delta); + printf(" SP to callee-saved area delta: %d\n", genFuncletInfo.fiSP_to_CalleeSave_delta); + printf(" Caller SP to PSP slot delta: %d\n", genFuncletInfo.fiCallerSP_to_PSP_slot_delta); + printf(" Frame type: %d\n", 
genFuncletInfo.fiFrameType); + printf(" SP delta 1: %d\n", genFuncletInfo.fiSpDelta1); + printf(" SP delta 2: %d\n", genFuncletInfo.fiSpDelta2); + + if (CallerSP_to_PSP_slot_delta != compiler->lvaGetCallerSPRelativeOffset(compiler->lvaPSPSym)) // for debugging + { + printf("lvaGetCallerSPRelativeOffset(lvaPSPSym): %d\n", compiler->lvaGetCallerSPRelativeOffset(compiler->lvaPSPSym)); + } + } +#endif // DEBUG + + assert(genFuncletInfo.fiSP_to_FPLR_save_delta >= 0); + assert(genFuncletInfo.fiSP_to_PSP_slot_delta >= 0); + assert(genFuncletInfo.fiSP_to_CalleeSave_delta >= 0); + assert(genFuncletInfo.fiCallerSP_to_PSP_slot_delta <= 0); + assert(compiler->lvaPSPSym != BAD_VAR_NUM); + assert(genFuncletInfo.fiCallerSP_to_PSP_slot_delta == compiler->lvaGetCallerSPRelativeOffset(compiler->lvaPSPSym)); // same offset used in main function and funclet! +} + +/* +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XX XX +XX End Prolog / Epilog XX +XX XX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +*/ + +// Get the register assigned to the given node + +regNumber CodeGenInterface::genGetAssignedReg(GenTreePtr tree) +{ + return tree->gtRegNum; +} + +//------------------------------------------------------------------------ +// genSpillVar: Spill a local variable +// +// Arguments: +// tree - the lclVar node for the variable being spilled +// +// Return Value: +// None. 
//
// Assumptions:
//    The lclVar must be a register candidate (lvRegCandidate)

void CodeGen::genSpillVar(GenTreePtr tree)
{
    unsigned varNum = tree->gtLclVarCommon.gtLclNum;
    LclVarDsc * varDsc = &(compiler->lvaTable[varNum]);

    assert(varDsc->lvIsRegCandidate());

    // We don't actually need to spill if it is already living in memory
    // (a def will be written to its new location anyway, so only non-def
    // nodes whose value currently lives in a register need a store).
    bool needsSpill = ((tree->gtFlags & GTF_VAR_DEF) == 0 && varDsc->lvIsInReg());
    if (needsSpill)
    {
        var_types lclTyp = varDsc->TypeGet();
        if (varDsc->lvNormalizeOnStore())
            lclTyp = genActualType(lclTyp);
        emitAttr size = emitTypeSize(lclTyp);

        // Temporarily treat a GT_REG_VAR as a GT_LCL_VAR so the store helpers
        // emit a memory-form store; restored below.
        bool restoreRegVar = false;
        if (tree->gtOper == GT_REG_VAR)
        {
            tree->SetOper(GT_LCL_VAR);
            restoreRegVar = true;
        }

        // mask off the flag to generate the right spill code, then bring it back
        tree->gtFlags &= ~GTF_REG_VAL;

        instruction storeIns = ins_Store(tree->TypeGet(), compiler->isSIMDTypeLocalAligned(varNum));

        if (varTypeIsMultiReg(tree))
        {
            // Multi-reg case: store each half separately; the second store is at
            // a 4-byte offset from the first.
            assert(varDsc->lvRegNum == genRegPairLo(tree->gtRegPair));
            assert(varDsc->lvOtherReg == genRegPairHi(tree->gtRegPair));
            regNumber regLo = genRegPairLo(tree->gtRegPair);
            regNumber regHi = genRegPairHi(tree->gtRegPair);
            inst_TT_RV(storeIns, tree, regLo);
            inst_TT_RV(storeIns, tree, regHi, 4);
        }
        else
        {
            assert(varDsc->lvRegNum == tree->gtRegNum);
            inst_TT_RV(storeIns, tree, tree->gtRegNum, 0, size);
        }
        tree->gtFlags |= GTF_REG_VAL;

        if (restoreRegVar)
        {
            tree->SetOper(GT_REG_VAR);
        }

        // The register copy is now dead: update liveness and drop the register
        // from the GC-pointer register sets.
        genUpdateRegLife(varDsc, /*isBorn*/ false, /*isDying*/ true DEBUGARG(tree));
        gcInfo.gcMarkRegSetNpt(varDsc->lvRegMask());

        // If this is a tracked GC pointer local, its stack home now holds the
        // live value, so it must join the "live GC vars on stack" set.
        if (VarSetOps::IsMember(compiler, gcInfo.gcTrkStkPtrLcls, varDsc->lvVarIndex))
        {
#ifdef DEBUG
            if (!VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex))
            {
                JITDUMP("\t\t\t\t\t\t\tVar V%02u becoming live\n", varNum);
            }
            else
            {
                JITDUMP("\t\t\t\t\t\t\tVar V%02u continuing live\n", varNum);
            }
#endif
            VarSetOps::AddElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex);
        }

    }

    tree->gtFlags &= ~GTF_SPILL;
    varDsc->lvRegNum = REG_STK;
    if (varTypeIsMultiReg(tree))
    {
        varDsc->lvOtherReg = REG_STK;
    }
}

// inline
// Record in the LclVarDsc which register now holds this local (node must be a
// scalar local or a GT_COPY of one).
void CodeGenInterface::genUpdateVarReg(LclVarDsc * varDsc, GenTreePtr tree)
{
    assert(tree->OperIsScalarLocal() || (tree->gtOper == GT_COPY));
    varDsc->lvRegNum = tree->gtRegNum;
}


/*****************************************************************************/
/*****************************************************************************/

/*****************************************************************************
 *
 *  Generate code that will set the given register to the integer constant.
 */

void CodeGen::genSetRegToIcon(regNumber reg,
                              ssize_t    val,
                              var_types  type,
                              insFlags   flags)
{
    // Reg cannot be a FP reg
    assert(!genIsValidFloatReg(reg));

    // The only TYP_REF constant that can come this path is a managed 'null' since it is not
    // relocatable. Other ref type constants (e.g. string objects) go through a different
    // code path.
    noway_assert(type != TYP_REF || val == 0);

    instGen_Set_Reg_To_Imm(emitActualTypeSize(type), reg, val, flags);
}


/*****************************************************************************
 *
 * Generate code to check that the GS cookie wasn't thrashed by a buffer
 * overrun. On ARM64 we always use REG_TMP_0 and REG_TMP_1 as temp registers
 * and this works fine in the case of tail calls
 * Implementation Note: pushReg = true, in case of tail calls.
 */
void CodeGen::genEmitGSCookieCheck(bool pushReg)
{
    noway_assert(compiler->gsGlobalSecurityCookieAddr || compiler->gsGlobalSecurityCookieVal);

    // Make sure that the return register is reported as live GC-ref so that any GC that kicks in while
    // executing GS cookie check will not collect the object pointed to by REG_INTRET (R0).
    if (!pushReg && (compiler->info.compRetType == TYP_REF))
        gcInfo.gcRegGCrefSetCur |= RBM_INTRET;

    regNumber regGSConst = REG_TMP_0;
    regNumber regGSValue = REG_TMP_1;

    if (compiler->gsGlobalSecurityCookieAddr == nullptr)
    {
        // load the GS cookie constant into a reg
        //
        genSetRegToIcon(regGSConst, compiler->gsGlobalSecurityCookieVal, TYP_I_IMPL);
    }
    else
    {
        // Ngen case - GS cookie constant needs to be accessed through an indirection.
        instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, regGSConst, (ssize_t)compiler->gsGlobalSecurityCookieAddr);
        getEmitter()->emitIns_R_AR(ins_Load(TYP_I_IMPL), EA_PTRSIZE, regGSConst, regGSConst, 0);
    }
    // Load this method's GS value from the stack frame
    getEmitter()->emitIns_R_S(ins_Load(TYP_I_IMPL), EA_PTRSIZE, regGSValue, compiler->lvaGSSecurityCookie, 0);
    // Compare with the GS cookie constant; fail fast on mismatch.
    getEmitter()->emitIns_R_R(INS_cmp, EA_PTRSIZE, regGSConst, regGSValue);

    BasicBlock *gsCheckBlk = genCreateTempLabel();
    inst_JMP(genJumpKindForOper(GT_EQ, true), gsCheckBlk);
    genEmitHelperCall(CORINFO_HELP_FAIL_FAST, 0, EA_UNKNOWN);
    genDefineTempLabel(gsCheckBlk);
}

/*****************************************************************************
 *
 *  Generate code for all the basic blocks in the function.
+ */ + +void CodeGen::genCodeForBBlist() +{ + unsigned varNum; + LclVarDsc * varDsc; + + unsigned savedStkLvl; + +#ifdef DEBUG + genInterruptibleUsed = true; + unsigned stmtNum = 0; + UINT64 totalCostEx = 0; + UINT64 totalCostSz = 0; + + // You have to be careful if you create basic blocks from now on + compiler->fgSafeBasicBlockCreation = false; + + // This stress mode is not comptible with fully interruptible GC + if (genInterruptible && compiler->opts.compStackCheckOnCall) + { + compiler->opts.compStackCheckOnCall = false; + } + + // This stress mode is not comptible with fully interruptible GC + if (genInterruptible && compiler->opts.compStackCheckOnRet) + { + compiler->opts.compStackCheckOnRet = false; + } +#endif // DEBUG + + // Prepare the blocks for exception handling codegen: mark the blocks that needs labels. + genPrepForEHCodegen(); + + assert(!compiler->fgFirstBBScratch || compiler->fgFirstBB == compiler->fgFirstBBScratch); // compiler->fgFirstBBScratch has to be first. + + /* Initialize the spill tracking logic */ + + regSet.rsSpillBeg(); + + /* Initialize the line# tracking logic */ + +#ifdef DEBUGGING_SUPPORT + if (compiler->opts.compScopeInfo) + { + siInit(); + } +#endif + + // The current implementation of switch tables requires the first block to have a label so it + // can generate offsets to the switch label targets. + // TODO-ARM64-CQ: remove this when switches have been re-implemented to not use this. + if (compiler->fgHasSwitch) + { + compiler->fgFirstBB->bbFlags |= BBF_JMP_TARGET; + } + + genPendingCallLabel = nullptr; + + /* Initialize the pointer tracking code */ + + gcInfo.gcRegPtrSetInit(); + gcInfo.gcVarPtrSetInit(); + + /* If any arguments live in registers, mark those regs as such */ + + for (varNum = 0, varDsc = compiler->lvaTable; + varNum < compiler->lvaCount; + varNum++ , varDsc++) + { + /* Is this variable a parameter assigned to a register? 
*/ + + if (!varDsc->lvIsParam || !varDsc->lvRegister) + continue; + + /* Is the argument live on entry to the method? */ + + if (!VarSetOps::IsMember(compiler, compiler->fgFirstBB->bbLiveIn, varDsc->lvVarIndex)) + continue; + + /* Is this a floating-point argument? */ + + if (varDsc->IsFloatRegType()) + continue; + + noway_assert(!varTypeIsFloating(varDsc->TypeGet())); + + /* Mark the register as holding the variable */ + + regTracker.rsTrackRegLclVar(varDsc->lvRegNum, varNum); + } + + unsigned finallyNesting = 0; + + // Make sure a set is allocated for compiler->compCurLife (in the long case), so we can set it to empty without + // allocation at the start of each basic block. + VarSetOps::AssignNoCopy(compiler, compiler->compCurLife, VarSetOps::MakeEmpty(compiler)); + + /*------------------------------------------------------------------------- + * + * Walk the basic blocks and generate code for each one + * + */ + + BasicBlock * block; + BasicBlock * lblk; /* previous block */ + + for (lblk = NULL, block = compiler->fgFirstBB; + block != NULL; + lblk = block, block = block->bbNext) + { +#ifdef DEBUG + if (compiler->verbose) + { + printf("\n=============== Generating "); + block->dspBlockHeader(compiler, true, true); + compiler->fgDispBBLiveness(block); + } +#endif // DEBUG + + /* Figure out which registers hold variables on entry to this block */ + + regSet.rsMaskVars = RBM_NONE; + gcInfo.gcRegGCrefSetCur = RBM_NONE; + gcInfo.gcRegByrefSetCur = RBM_NONE; + + compiler->m_pLinearScan->recordVarLocationsAtStartOfBB(block); + + genUpdateLife(block->bbLiveIn); + + // Even if liveness didn't change, we need to update the registers containing GC references. + // genUpdateLife will update the registers live due to liveness changes. But what about registers that didn't change? + // We cleared them out above. Maybe we should just not clear them out, but update the ones that change here. + // That would require handling the changes in recordVarLocationsAtStartOfBB(). 
+ + regMaskTP newLiveRegSet = RBM_NONE; + regMaskTP newRegGCrefSet = RBM_NONE; + regMaskTP newRegByrefSet = RBM_NONE; +#ifdef DEBUG + VARSET_TP VARSET_INIT_NOCOPY(removedGCVars, VarSetOps::MakeEmpty(compiler)); + VARSET_TP VARSET_INIT_NOCOPY(addedGCVars, VarSetOps::MakeEmpty(compiler)); +#endif + VARSET_ITER_INIT(compiler, iter, block->bbLiveIn, varIndex); + while (iter.NextElem(compiler, &varIndex)) + { + unsigned varNum = compiler->lvaTrackedToVarNum[varIndex]; + LclVarDsc* varDsc = &(compiler->lvaTable[varNum]); + + if (varDsc->lvIsInReg()) + { + newLiveRegSet |= varDsc->lvRegMask(); + if (varDsc->lvType == TYP_REF) + { + newRegGCrefSet |= varDsc->lvRegMask(); + } + else if (varDsc->lvType == TYP_BYREF) + { + newRegByrefSet |= varDsc->lvRegMask(); + } +#ifdef DEBUG + if (verbose && VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varIndex)) + { + VarSetOps::AddElemD(compiler, removedGCVars, varIndex); + } +#endif DEBUG + VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varIndex); + } + else if (compiler->lvaIsGCTracked(varDsc)) + { +#ifdef DEBUG + if (verbose && !VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varIndex)) + { + VarSetOps::AddElemD(compiler, addedGCVars, varIndex); + } +#endif DEBUG + VarSetOps::AddElemD(compiler, gcInfo.gcVarPtrSetCur, varIndex); + } + } + +#ifdef DEBUG + if (compiler->verbose) + { + printf("\t\t\t\t\t\t\tLive regs: "); + if (regSet.rsMaskVars == newLiveRegSet) + { + printf("(unchanged) "); + } + else + { + printRegMaskInt(regSet.rsMaskVars); + compiler->getEmitter()->emitDispRegSet(regSet.rsMaskVars); + printf(" => "); + } + printRegMaskInt(newLiveRegSet); + compiler->getEmitter()->emitDispRegSet(newLiveRegSet); + printf("\n"); + if (!VarSetOps::IsEmpty(compiler, addedGCVars)) + { + printf("\t\t\t\t\t\t\tAdded GCVars: "); + dumpConvertedVarSet(compiler, addedGCVars); + printf("\n"); + } + if (!VarSetOps::IsEmpty(compiler, removedGCVars)) + { + printf("\t\t\t\t\t\t\tRemoved GCVars: "); + 
dumpConvertedVarSet(compiler, removedGCVars); + printf("\n"); + } + } +#endif // DEBUG + + regSet.rsMaskVars = newLiveRegSet; + gcInfo.gcMarkRegSetGCref(newRegGCrefSet DEBUG_ARG(true)); + gcInfo.gcMarkRegSetByref(newRegByrefSet DEBUG_ARG(true)); + + /* Blocks with handlerGetsXcptnObj()==true use GT_CATCH_ARG to + represent the exception object (TYP_REF). + We mark REG_EXCEPTION_OBJECT as holding a GC object on entry + to the block, it will be the first thing evaluated + (thanks to GTF_ORDER_SIDEEFF). + */ + + if (handlerGetsXcptnObj(block->bbCatchTyp)) + { +#if JIT_FEATURE_SSA_SKIP_DEFS + GenTreePtr firstStmt = block->FirstNonPhiDef(); +#else + GenTreePtr firstStmt = block->bbTreeList; +#endif + if (firstStmt != NULL) + { + GenTreePtr firstTree = firstStmt->gtStmt.gtStmtExpr; + if (compiler->gtHasCatchArg(firstTree)) + { + gcInfo.gcMarkRegSetGCref(RBM_EXCEPTION_OBJECT); + } + } + } + + /* Start a new code output block */ + + genUpdateCurrentFunclet(block); + +#ifdef _TARGET_XARCH_ + if (genAlignLoops && block->bbFlags & BBF_LOOP_HEAD) + { + getEmitter()->emitLoopAlign(); + } +#endif + +#ifdef DEBUG + if (compiler->opts.dspCode) + printf("\n L_M%03u_BB%02u:\n", Compiler::s_compMethodsCount, block->bbNum); +#endif + + block->bbEmitCookie = NULL; + + if (block->bbFlags & (BBF_JMP_TARGET|BBF_HAS_LABEL)) + { + /* Mark a label and update the current set of live GC refs */ + + block->bbEmitCookie = getEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, + gcInfo.gcRegGCrefSetCur, + gcInfo.gcRegByrefSetCur, + FALSE); + } + + if (block == compiler->fgFirstColdBlock) + { +#ifdef DEBUG + if (compiler->verbose) + { + printf("\nThis is the start of the cold region of the method\n"); + } +#endif + // We should never have a block that falls through into the Cold section + noway_assert(!lblk->bbFallsThrough()); + + // We require the block that starts the Cold section to have a label + noway_assert(block->bbEmitCookie); + getEmitter()->emitSetFirstColdIGCookie(block->bbEmitCookie); + } + 
+ /* Both stacks are always empty on entry to a basic block */ + + genStackLevel = 0; + + savedStkLvl = genStackLevel; + + /* Tell everyone which basic block we're working on */ + + compiler->compCurBB = block; + +#ifdef DEBUGGING_SUPPORT + siBeginBlock(block); + + // BBF_INTERNAL blocks don't correspond to any single IL instruction. + if (compiler->opts.compDbgInfo && + (block->bbFlags & BBF_INTERNAL) && + !compiler->fgBBisScratch(block)) // If the block is the distinguished first scratch block, then no need to emit a NO_MAPPING entry, immediately after the prolog. + { + genIPmappingAdd((IL_OFFSETX) ICorDebugInfo::NO_MAPPING, true); + } + + bool firstMapping = true; +#endif // DEBUGGING_SUPPORT + + /*--------------------------------------------------------------------- + * + * Generate code for each statement-tree in the block + * + */ + + if (block->bbFlags & BBF_FUNCLET_BEG) + { + genReserveFuncletProlog(block); + } + + for (GenTreePtr stmt = block->FirstNonPhiDef(); stmt; stmt = stmt->gtNext) + { + noway_assert(stmt->gtOper == GT_STMT); + + if (stmt->AsStmt()->gtStmtIsEmbedded()) + continue; + + /* Get hold of the statement tree */ + GenTreePtr tree = stmt->gtStmt.gtStmtExpr; + +#if defined(DEBUGGING_SUPPORT) + + /* Do we have a new IL-offset ? 
*/ + + if (stmt->gtStmt.gtStmtILoffsx != BAD_IL_OFFSET) + { + /* Create and append a new IP-mapping entry */ + genIPmappingAdd(stmt->gtStmt.gtStmt.gtStmtILoffsx, firstMapping); + firstMapping = false; + } + +#endif // DEBUGGING_SUPPORT + +#ifdef DEBUG + noway_assert(stmt->gtStmt.gtStmtLastILoffs <= compiler->info.compILCodeSize || + stmt->gtStmt.gtStmtLastILoffs == BAD_IL_OFFSET); + + if (compiler->opts.dspCode && compiler->opts.dspInstrs && + stmt->gtStmt.gtStmtLastILoffs != BAD_IL_OFFSET) + { + while (genCurDispOffset <= stmt->gtStmt.gtStmtLastILoffs) + { + genCurDispOffset += + dumpSingleInstr(compiler->info.compCode, genCurDispOffset, "> "); + } + } + + stmtNum++; + if (compiler->verbose) + { + printf("\nGenerating BB%02u, stmt %u\t\t", block->bbNum, stmtNum); + printf("Holding variables: "); + dspRegMask(regSet.rsMaskVars); printf("\n\n"); + if (compiler->verboseTrees) + { + compiler->gtDispTree(compiler->opts.compDbgInfo ? stmt : tree); + printf("\n"); + } + } + totalCostEx += ((UINT64)stmt->gtCostEx * block->getBBWeight(compiler)); + totalCostSz += (UINT64) stmt->gtCostSz; +#endif // DEBUG + + // Traverse the tree in linear order, generating code for each node in the + // tree as we encounter it + + compiler->compCurLifeTree = NULL; + compiler->compCurStmt = stmt; + for (GenTreePtr treeNode = stmt->gtStmt.gtStmtList; + treeNode != NULL; + treeNode = treeNode->gtNext) + { + genCodeForTreeNode(treeNode); + if (treeNode->gtHasReg() && treeNode->gtLsraInfo.isLocalDefUse) + { + genConsumeReg(treeNode); + } + } + + regSet.rsSpillChk(); + +#ifdef DEBUG + /* Make sure we didn't bungle pointer register tracking */ + + regMaskTP ptrRegs = (gcInfo.gcRegGCrefSetCur|gcInfo.gcRegByrefSetCur); + regMaskTP nonVarPtrRegs = ptrRegs & ~regSet.rsMaskVars; + + // If return is a GC-type, clear it. Note that if a common + // epilog is generated (genReturnBB) it has a void return + // even though we might return a ref. 
We can't use the compRetType + // as the determiner because something we are tracking as a byref + // might be used as a return value of a int function (which is legal) + if (tree->gtOper == GT_RETURN && + (varTypeIsGC(compiler->info.compRetType) || + (tree->gtOp.gtOp1 != 0 && varTypeIsGC(tree->gtOp.gtOp1->TypeGet())))) + { + nonVarPtrRegs &= ~RBM_INTRET; + } + + // When profiling, the first statement in a catch block will be the + // harmless "inc" instruction (does not interfere with the exception + // object). + + if ((compiler->opts.eeFlags & CORJIT_FLG_BBINSTR) && + (stmt == block->bbTreeList) && + handlerGetsXcptnObj(block->bbCatchTyp)) + { + nonVarPtrRegs &= ~RBM_EXCEPTION_OBJECT; + } + + if (nonVarPtrRegs) + { + printf("Regset after tree="); + compiler->printTreeID(tree); + printf(" BB%02u gcr=", block->bbNum); + printRegMaskInt(gcInfo.gcRegGCrefSetCur & ~regSet.rsMaskVars); + compiler->getEmitter()->emitDispRegSet(gcInfo.gcRegGCrefSetCur & ~regSet.rsMaskVars); + printf(", byr="); + printRegMaskInt(gcInfo.gcRegByrefSetCur & ~regSet.rsMaskVars); + compiler->getEmitter()->emitDispRegSet(gcInfo.gcRegByrefSetCur & ~regSet.rsMaskVars); + printf(", regVars="); + printRegMaskInt(regSet.rsMaskVars); + compiler->getEmitter()->emitDispRegSet(regSet.rsMaskVars); + printf("\n"); + } + + noway_assert(nonVarPtrRegs == 0); + + for (GenTree * node = stmt->gtStmt.gtStmtList; node; node=node->gtNext) + { + assert(!(node->gtFlags & GTF_SPILL)); + } + +#endif // DEBUG + + noway_assert(stmt->gtOper == GT_STMT); + +#ifdef DEBUGGING_SUPPORT + genEnsureCodeEmitted(stmt->gtStmt.gtStmtILoffsx); +#endif + + } //-------- END-FOR each statement-tree of the current block --------- + +#if defined(DEBUG) && defined(_TARGET_ARM64_) + if (block->bbNext == nullptr) + { + // Unit testing of the ARM64 emitter: generate a bunch of instructions into the last block + // (it's as good as any, but better than the prolog, which can only be a single instruction + // group) then use 
COMPLUS_JitLateDisasm=* to see if the late disassembler + // thinks the instructions are the same as we do. + genArm64EmitterUnitTests(); + } +#endif // defined(DEBUG) && defined(_TARGET_ARM64_) + +#ifdef DEBUGGING_SUPPORT + + if (compiler->opts.compScopeInfo && (compiler->info.compVarScopesCount > 0)) + { + siEndBlock(block); + + /* Is this the last block, and are there any open scopes left ? */ + + bool isLastBlockProcessed = (block->bbNext == NULL); + if (block->isBBCallAlwaysPair()) + { + isLastBlockProcessed = (block->bbNext->bbNext == NULL); + } + + if (isLastBlockProcessed && siOpenScopeList.scNext) + { + /* This assert no longer holds, because we may insert a throw + block to demarcate the end of a try or finally region when they + are at the end of the method. It would be nice if we could fix + our code so that this throw block will no longer be necessary. */ + + //noway_assert(block->bbCodeOffsEnd != compiler->info.compILCodeSize); + + siCloseAllOpenScopes(); + } + } + +#endif // DEBUGGING_SUPPORT + + genStackLevel -= savedStkLvl; + +#ifdef DEBUG + // compCurLife should be equal to the liveOut set, except that we don't keep + // it up to date for vars that are not register candidates + // (it would be nice to have a xor set function) + + VARSET_TP VARSET_INIT_NOCOPY(extraLiveVars, VarSetOps::Diff(compiler, block->bbLiveOut, compiler->compCurLife)); + VarSetOps::UnionD(compiler, extraLiveVars, VarSetOps::Diff(compiler, compiler->compCurLife, block->bbLiveOut)); + VARSET_ITER_INIT(compiler, extraLiveVarIter, extraLiveVars, extraLiveVarIndex); + while (extraLiveVarIter.NextElem(compiler, &extraLiveVarIndex)) + { + unsigned varNum = compiler->lvaTrackedToVarNum[extraLiveVarIndex]; + LclVarDsc * varDsc = compiler->lvaTable + varNum; + assert(!varDsc->lvIsRegCandidate()); + } +#endif + + /* Both stacks should always be empty on exit from a basic block */ + + noway_assert(genStackLevel == 0); + +#if 0 + // On AMD64, we need to generate a NOP after a call that is 
the last instruction of the block, in several + // situations, to support proper exception handling semantics. This is mostly to ensure that when the stack + // walker computes an instruction pointer for a frame, that instruction pointer is in the correct EH region. + // The document "X64 and ARM ABIs.docx" has more details. The situations: + // 1. If the call instruction is in a different EH region as the instruction that follows it. + // 2. If the call immediately precedes an OS epilog. (Note that what the JIT or VM consider an epilog might + // be slightly different from what the OS considers an epilog, and it is the OS-reported epilog that matters here.) + // We handle case #1 here, and case #2 in the emitter. + if (getEmitter()->emitIsLastInsCall()) + { + // Ok, the last instruction generated is a call instruction. Do any of the other conditions hold? + // Note: we may be generating a few too many NOPs for the case of call preceding an epilog. Technically, + // if the next block is a BBJ_RETURN, an epilog will be generated, but there may be some instructions + // generated before the OS epilog starts, such as a GS cookie check. + if ((block->bbNext == nullptr) || + !BasicBlock::sameEHRegion(block, block->bbNext)) + { + // We only need the NOP if we're not going to generate any more code as part of the block end. + + switch (block->bbJumpKind) + { + case BBJ_ALWAYS: + case BBJ_THROW: + case BBJ_CALLFINALLY: + case BBJ_EHCATCHRET: + // We're going to generate more code below anyway, so no need for the NOP. + + case BBJ_RETURN: + case BBJ_EHFINALLYRET: + case BBJ_EHFILTERRET: + // These are the "epilog follows" case, handled in the emitter. + + break; + + case BBJ_NONE: + if (block->bbNext == nullptr) + { + // Call immediately before the end of the code; we should never get here . 
+ instGen(INS_BREAKPOINT); // This should never get executed
+ }
+ else
+ {
+ // We need the NOP
+ instGen(INS_nop);
+ }
+ break;
+
+ case BBJ_COND:
+ case BBJ_SWITCH:
+ // These can't have a call as the last instruction!
+
+ default:
+ noway_assert(!"Unexpected bbJumpKind");
+ break;
+ }
+ }
+ }
+#endif // 0
+
+ /* Do we need to generate a jump or return? */
+
+ switch (block->bbJumpKind)
+ {
+ case BBJ_ALWAYS:
+ inst_JMP(EJ_jmp, block->bbJumpDest);
+ break;
+
+ case BBJ_RETURN:
+ genExitCode(block);
+ break;
+
+ case BBJ_THROW:
+ // If we have a throw at the end of a function or funclet, we need to emit another instruction
+ // afterwards to help the OS unwinder determine the correct context during unwind.
+ // We insert an unexecuted breakpoint instruction in several situations
+ // following a throw instruction:
+ // 1. If the throw is the last instruction of the function or funclet. This helps
+ // the OS unwinder determine the correct context during an unwind from the
+ // thrown exception.
+ // 2. If this is the last block of the hot section.
+ // 3. If the subsequent block is a special throw block.
+ // 4. On AMD64, if the next block is in a different EH region.
+ if ((block->bbNext == NULL)
+ || (block->bbNext->bbFlags & BBF_FUNCLET_BEG)
+ || !BasicBlock::sameEHRegion(block, block->bbNext)
+ || (!isFramePointerUsed() && compiler->fgIsThrowHlpBlk(block->bbNext))
+ || block->bbNext == compiler->fgFirstColdBlock
+ )
+ {
+ instGen(INS_BREAKPOINT); // This should never get executed
+ }
+
+ break;
+
+ case BBJ_CALLFINALLY:
+
+ // Generate a call to the finally, like this:
+ // mov x0,qword ptr [fp + 10H] // Load x0 with PSPSym
+ // bl finally-funclet
+ // b finally-return // Only for non-retless finally calls
+ // The 'b' can be a NOP if we're going to the next block.
+ + getEmitter()->emitIns_R_S(ins_Load(TYP_I_IMPL), EA_PTRSIZE, REG_R0, compiler->lvaPSPSym, 0); + getEmitter()->emitIns_J(INS_bl_local, block->bbJumpDest); + + if (block->bbFlags & BBF_RETLESS_CALL) + { + // We have a retless call, and the last instruction generated was a call. + // If the next block is in a different EH region (or is the end of the code + // block), then we need to generate a breakpoint here (since it will never + // get executed) to get proper unwind behavior. + + if ((block->bbNext == nullptr) || + !BasicBlock::sameEHRegion(block, block->bbNext)) + { + instGen(INS_BREAKPOINT); // This should never get executed + } + } + else + { + // Because of the way the flowgraph is connected, the liveness info for this one instruction + // after the call is not (can not be) correct in cases where a variable has a last use in the + // handler. So turn off GC reporting for this single instruction. + getEmitter()->emitMakeRemainderNonInterruptible(); + + // Now go to where the finally funclet needs to return to. + if (block->bbNext->bbJumpDest == block->bbNext->bbNext) + { + // Fall-through. + // TODO-ARM64-CQ: Can we get rid of this instruction, and just have the call return directly + // to the next instruction? This would depend on stack walking from within the finally + // handler working without this instruction being in this special EH region. + instGen(INS_nop); + } + else + { + inst_JMP(EJ_jmp, block->bbNext->bbJumpDest); + } + } + + // The BBJ_ALWAYS is used because the BBJ_CALLFINALLY can't point to the + // jump target using bbJumpDest - that is already used to point + // to the finally block. So just skip past the BBJ_ALWAYS unless the + // block is RETLESS. 
+ if ( !(block->bbFlags & BBF_RETLESS_CALL) )
+ {
+ assert(block->isBBCallAlwaysPair());
+
+ lblk = block;
+ block = block->bbNext;
+ }
+ break;
+
+ case BBJ_EHCATCHRET:
+ getEmitter()->emitIns_R_L(INS_adr, EA_4BYTE_DSP_RELOC, block->bbJumpDest, REG_INTRET);
+
+ __fallthrough;
+
+ case BBJ_EHFINALLYRET:
+ case BBJ_EHFILTERRET:
+ genReserveFuncletEpilog(block);
+ break;
+
+ case BBJ_NONE:
+ case BBJ_COND:
+ case BBJ_SWITCH:
+ break;
+
+ default:
+ noway_assert(!"Unexpected bbJumpKind");
+ break;
+ }
+
+#ifdef DEBUG
+ compiler->compCurBB = 0;
+#endif
+
+ } //------------------ END-FOR each block of the method -------------------
+
+ /* Nothing is live at this point */
+ genUpdateLife(VarSetOps::MakeEmpty(compiler));
+
+ /* Finalize the spill tracking logic */
+
+ regSet.rsSpillEnd();
+
+ /* Finalize the temp tracking logic */
+
+ compiler->tmpEnd();
+
+#ifdef DEBUG
+ if (compiler->verbose)
+ {
+ printf("\n# ");
+ printf("totalCostEx = %6d, totalCostSz = %5d ",
+ totalCostEx, totalCostSz);
+ printf("%s\n", compiler->info.compFullName);
+ }
+#endif
+}
+
+// Return the child (op1 or op2) of 'tree' that was assigned the same register as 'tree' itself,
+// or NULL if neither child shares the destination register (or if 'tree' was assigned no
+// register at all). The remaining child is returned through the 'other' out-parameter;
+// 'other' is set to nullptr whenever NULL is returned.
+// TODO-Cleanup: move to CodeGenCommon.cpp
+GenTree *
+sameRegAsDst(GenTree *tree, GenTree *&other /*out*/)
+{
+ if (tree->gtRegNum == REG_NA)
+ {
+ other = nullptr;
+ return NULL;
+ }
+
+ GenTreePtr op1 = tree->gtOp.gtOp1;
+ GenTreePtr op2 = tree->gtOp.gtOp2;
+ if (op1->gtRegNum == tree->gtRegNum)
+ {
+ other = op2;
+ return op1;
+ }
+ if (op2->gtRegNum == tree->gtRegNum)
+ {
+ other = op1;
+ return op2;
+ }
+ else
+ {
+ other = nullptr;
+ return NULL;
+ }
+}
+
+// Move an immediate value into an integer register. Immediates that cannot be encoded in a
+// single mov are materialized with a mov followed by movk instructions (see below).
+
+void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size,
+ regNumber reg,
+ ssize_t imm,
+ insFlags flags)
+{
+ // reg cannot be a FP register
+ assert(!genIsValidFloatReg(reg));
+
+ if (!compiler->opts.compReloc)
+ {
+ size = EA_SIZE(size); // Strip any Reloc flags from size if we aren't doing relocs
+ } + + if (EA_IS_RELOC(size)) + { + NYI("Reloc constant"); + } + else if (imm == 0) + { + instGen_Set_Reg_To_Zero(size, reg, flags); + } + else + { + if (emitter::emitIns_valid_imm_for_mov(imm, size)) + { + getEmitter()->emitIns_R_I(INS_mov, size, reg, imm); + } + else + { + getEmitter()->emitIns_R_I(INS_mov, size, reg, (imm & 0xffff)); + getEmitter()->emitIns_R_I_I(INS_movk, size, reg, ((imm >> 16) & 0xffff), 16, INS_OPTS_LSL); + + if ((size == EA_8BYTE) && ((imm >> 32) != 0)) // Sometimes the upper 32 bits are zero and the first mov has zero-ed them + { + getEmitter()->emitIns_R_I_I(INS_movk, EA_8BYTE, reg, ((imm >> 32) & 0xffff), 32, INS_OPTS_LSL); + if ((imm >> 48) != 0) // Frequently the upper 16 bits are zero and the first mov has zero-ed them + { + getEmitter()->emitIns_R_I_I(INS_movk, EA_8BYTE, reg, ((imm >> 48) & 0xffff), 48, INS_OPTS_LSL); + } + } + } + // The caller may have requested that the flags be set on this mov (rarely/never) + if (flags == INS_FLAGS_SET) + { + getEmitter()->emitIns_R_I(INS_tst, size, reg, 0); + } + } + + regTracker.rsTrackRegIntCns(reg, imm); +} + +/*********************************************************************************** + * + * Generate code to set a register 'targetReg' of type 'targetType' to the constant + * specified by the constant (GT_CNS_INT or GT_CNS_DBL) in 'tree'. This does not call + * genProduceReg() on the target register. 
+ */
+void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTreePtr tree)
+{
+ switch (tree->gtOper)
+ {
+ case GT_CNS_INT:
+ {
+ // relocatable values tend to come down as a CNS_INT of native int type
+ // so the line between these two opcodes is kind of blurry
+ GenTreeIntConCommon* con = tree->AsIntConCommon();
+ ssize_t cnsVal = con->IconValue();
+
+ bool needReloc = compiler->opts.compReloc && tree->IsIconHandle();
+ if (needReloc)
+ {
+ instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, targetReg, cnsVal);
+ regTracker.rsTrackRegTrash(targetReg);
+ }
+ else
+ {
+ genSetRegToIcon(targetReg, cnsVal, targetType);
+ }
+ }
+ break;
+
+ case GT_CNS_DBL:
+ {
+ emitter *emit = getEmitter();
+ emitAttr size = emitTypeSize(tree);
+ GenTreeDblCon *dblConst = tree->AsDblCon();
+ double constValue = dblConst->gtDblCon.gtDconVal;
+
+ // Make sure we use "movi reg, 0x00" only for positive zero (0.0) and not for negative zero (-0.0)
+ // NOTE(review): the bit-level zero test below type-puns the double through a pointer cast,
+ // which violates strict aliasing; a memcpy into an __int64 would be safer -- confirm project policy.
+ if (*(__int64*)&constValue == 0)
+ {
+ // A faster/smaller way to generate 0.0
+ // We will just zero out the entire vector register for both float and double
+ emit->emitIns_R_I(INS_movi, EA_16BYTE, targetReg, 0x00, INS_OPTS_16B);
+ }
+ else if (emitter::emitIns_valid_imm_for_fmov(constValue))
+ {
+ // We can load the FP constant using the fmov FP-immediate for this constValue
+ emit->emitIns_R_F(INS_fmov, size, targetReg, constValue);
+ }
+ else
+ {
+ // We must load the FP constant from the constant pool
+ // Emit a data section constant for the float or double constant.
+ CORINFO_FIELD_HANDLE hnd = emit->emitFltOrDblConst(dblConst); + emit->emitIns_R_C(INS_ldr, size, targetReg, hnd, 0); + } + } + break; + + default: + unreached(); + } +} + + +// Generate code to get the high N bits of a N*N=2N bit multiplication result +void CodeGen::genCodeForMulHi(GenTreeOp* treeNode) +{ + assert(!(treeNode->gtFlags & GTF_UNSIGNED)); + assert(!treeNode->gtOverflowEx()); + +#if 0 + regNumber targetReg = treeNode->gtRegNum; + var_types targetType = treeNode->TypeGet(); + emitter *emit = getEmitter(); + emitAttr size = emitTypeSize(treeNode); + GenTree *op1 = treeNode->gtOp.gtOp1; + GenTree *op2 = treeNode->gtOp.gtOp2; + + // to get the high bits of the multiply, we are constrained to using the + // 1-op form: RDX:RAX = RAX * rm + // The 3-op form (Rx=Ry*Rz) does not support it. + + genConsumeOperands(treeNode->AsOp()); + + GenTree* regOp = op1; + GenTree* rmOp = op2; + + // Set rmOp to the contained memory operand (if any) + // + if (op1->isContained() || (!op2->isContained() && (op2->gtRegNum == targetReg))) + { + regOp = op2; + rmOp = op1; + } + assert(!regOp->isContained()); + + // Setup targetReg when neither of the source operands was a matching register + if (regOp->gtRegNum != targetReg) + { + inst_RV_RV(ins_Copy(targetType), targetReg, regOp->gtRegNum, targetType); + } + + emit->emitInsBinary(INS_imulEAX, size, treeNode, rmOp); + + // Move the result to the desired register, if necessary + if (targetReg != REG_RDX) + { + inst_RV_RV(INS_mov, targetReg, REG_RDX, targetType); + } +#else // !0 + NYI("genCodeForMulHi"); +#endif // !0 +} + +// generate code for a DIV or MOD operation +// +void CodeGen::genCodeForDivMod(GenTreeOp* treeNode) +{ + // unused on ARM64 +} + +// Generate code for ADD, SUB, MUL, DIV, UDIV, AND, OR and XOR +void CodeGen::genCodeForBinary(GenTree* treeNode) +{ + const genTreeOps oper = treeNode->OperGet(); + regNumber targetReg = treeNode->gtRegNum; + var_types targetType = treeNode->TypeGet(); + emitter *emit = 
getEmitter(); + + assert (oper == GT_ADD || + oper == GT_SUB || + oper == GT_MUL || + oper == GT_DIV || + oper == GT_UDIV || + oper == GT_AND || + oper == GT_OR || + oper == GT_XOR); + + GenTreePtr op1 = treeNode->gtGetOp1(); + GenTreePtr op2 = treeNode->gtGetOp2(); + instruction ins = genGetInsForOper(treeNode->OperGet(), targetType); + + // The arithmetic node must be sitting in a register (since it's not contained) + noway_assert(targetReg != REG_NA); + + genConsumeOperands(treeNode->AsOp()); + + regNumber r = emit->emitInsTernary(ins, emitTypeSize(treeNode), treeNode, op1, op2); + noway_assert(r == targetReg); + + genProduceReg(treeNode); +} + + +/***************************************************************************** + * + * Generate code for a single node in the tree. + * Preconditions: All operands have been evaluated + * + */ +void +CodeGen::genCodeForTreeNode(GenTreePtr treeNode) +{ + regNumber targetReg = treeNode->gtRegNum; + var_types targetType = treeNode->TypeGet(); + emitter *emit = getEmitter(); + +#ifdef DEBUG + if (compiler->verbose) + { + unsigned seqNum = treeNode->gtSeqNum; // Useful for setting a conditional break in Visual Studio + printf("Generating: "); + compiler->gtDispTree(treeNode, nullptr, nullptr, true); + } +#endif // DEBUG + + // Is this a node whose value is already in a register? LSRA denotes this by + // setting the GTF_REUSE_REG_VAL flag. + if (treeNode->IsReuseRegVal()) + { + // For now, this is only used for constant nodes. 
+ assert((treeNode->OperGet() == GT_CNS_INT) || (treeNode->OperGet() == GT_CNS_DBL)); + JITDUMP(" TreeNode is marked ReuseReg\n"); + return; + } + + // contained nodes are part of their parents for codegen purposes + // ex : immediates, most LEAs + if (treeNode->isContained()) + { + return; + } + + switch (treeNode->gtOper) + { + case GT_START_NONGC: + getEmitter()->emitMakeRemainderNonInterruptible(); + break; + + case GT_PROF_HOOK: + // We should be seeing this only if profiler hook is needed + noway_assert(compiler->compIsProfilerHookNeeded()); + +#ifdef PROFILING_SUPPORTED + // Right now this node is used only for tail calls. In future if + // we intend to use it for Enter or Leave hooks, add a data member + // to this node indicating the kind of profiler hook. For example, + // helper number can be used. + genProfilingLeaveCallback(CORINFO_HELP_PROF_FCN_TAILCALL); +#endif // PROFILING_SUPPORTED + break; + + case GT_LCLHEAP: + genLclHeap(treeNode); + break; + + case GT_CNS_INT: + case GT_CNS_DBL: + genSetRegToConst(targetReg, targetType, treeNode); + genProduceReg(treeNode); + break; + + case GT_NOT: + assert(!varTypeIsFloating(targetType)); + + __fallthrough; + + case GT_NEG: + { + instruction ins = genGetInsForOper(treeNode->OperGet(), targetType); + + // The arithmetic node must be sitting in a register (since it's not contained) + assert(!treeNode->isContained()); + // The dst can only be a register. + assert(targetReg != REG_NA); + + GenTreePtr operand = treeNode->gtGetOp1(); + assert(!operand->isContained()); + // The src must be a register. 
+ regNumber operandReg = genConsumeReg(operand);
+
+ getEmitter()->emitIns_R_R(ins, emitTypeSize(treeNode), targetReg, operandReg);
+ }
+ genProduceReg(treeNode);
+ break;
+
+ case GT_DIV:
+ case GT_UDIV:
+ if (varTypeIsFloating(targetType))
+ {
+ // Floating point divide never raises an exception
+ genCodeForBinary(treeNode);
+ }
+ else // integer divide operation
+ {
+ GenTreePtr divisorOp = treeNode->gtGetOp2();
+
+ // TODO-ARM64-CQ: Optimize a divide by power of 2 as we do for AMD64
+
+ if (divisorOp->IsZero())
+ {
+ // The divisor is statically zero, so this division always throws.
+ // NOTE(review): EJ_je is a conditional jump, but no compare has been emitted on this
+ // path, so the branch depends on whatever flags happen to be set; an unconditional
+ // jump to the throw helper looks intended -- confirm.
+ genJumpToThrowHlpBlk(EJ_je, Compiler::ACK_DIV_BY_ZERO);
+ // We don't need to generate the sdiv/udiv instruction
+ }
+ else
+ {
+ emitAttr cmpSize = EA_ATTR(genTypeSize(genActualType(treeNode->TypeGet())));
+ regNumber divisorReg = divisorOp->gtRegNum;
+
+ if (treeNode->gtOper == GT_DIV)
+ {
+ BasicBlock* sdivLabel = genCreateTempLabel();
+
+ // Two possible exceptions:
+ // (AnyVal / 0) => DivideByZeroException
+ // (MinInt / -1) => ArithmeticException
+ //
+ bool checkDividend = true;
+ // Do we have a contained immediate for the 'divisorOp'?
+ if (divisorOp->isContainedIntOrIImmed())
+ {
+ GenTreeIntConCommon* intConst = divisorOp->AsIntConCommon();
+ assert(intConst->IconValue() != 0); // already checked above by IsZero()
+ if (intConst->IconValue() != -1)
+ {
+ checkDividend = false; // We statically know that the divisor is not -1
+ }
+ }
+ else
+ {
+ // If the divisor is zero, throw a DivideByZeroException
+ emit->emitIns_R_I(INS_cmp, cmpSize, divisorReg, 0);
+ genJumpToThrowHlpBlk(EJ_je, Compiler::ACK_DIV_BY_ZERO);
+
+ // If the divisor is not -1, branch to 'sdivLabel'
+ emit->emitIns_R_I(INS_cmp, cmpSize, divisorReg, -1);
+ inst_JMP(genJumpKindForOper(GT_NE, true), sdivLabel);
+ // If control flow continues past here the 'divisorReg' is known to be -1
+ }
+
+ if (checkDividend)
+ {
+ regNumber dividendReg = treeNode->gtGetOp1()->gtRegNum;
+ // At this point the divisor is known to be -1
+ //
+ // Issue 'adds zr, dividendReg, dividendReg' instruction
+ // this will set the Z and V flags only when dividendReg is MinInt
+ // (Z alone is also set when dividendReg is 0, but then V is clear, so no throw occurs)
+ //
+ emit->emitIns_R_R_R(INS_adds, cmpSize, REG_ZR, dividendReg, dividendReg);
+ inst_JMP(genJumpKindForOper(GT_NE, true), sdivLabel); // goto sdiv if Z flag is clear
+ genJumpToThrowHlpBlk(EJ_jo, Compiler::ACK_ARITH_EXCPN); // if the V flag is set, throw ArithmeticException
+ }
+
+ genDefineTempLabel(sdivLabel);
+ genCodeForBinary(treeNode); // Generate the sdiv instruction
+ }
+ else // (treeNode->gtOper == GT_UDIV)
+ {
+ // Only one possible exception
+ // (AnyVal / 0) => DivideByZeroException
+ //
+ // Note that division by the constant 0 was already checked for above by the op2->IsZero() check
+ //
+ if (!divisorOp->isContainedIntOrIImmed())
+ {
+ emit->emitIns_R_I(INS_cmp, cmpSize, divisorReg, 0);
+ genJumpToThrowHlpBlk(EJ_je, Compiler::ACK_DIV_BY_ZERO);
+ }
+
+ genCodeForBinary(treeNode); // Generate the udiv instruction
+ }
+ }
+ }
+ break;
+
+ case GT_OR:
+ case GT_XOR:
+ case GT_AND:
+ assert(varTypeIsIntegralOrI(treeNode));
+ __fallthrough;
+ case GT_ADD:
+ 
case GT_SUB: + case GT_MUL: + genCodeForBinary(treeNode); + break; + + case GT_LSH: + case GT_RSH: + case GT_RSZ: + genCodeForShift(treeNode->gtGetOp1(), treeNode->gtGetOp2(), treeNode); + // genCodeForShift() calls genProduceReg() + break; + + case GT_CAST: + if (varTypeIsFloating(targetType) && varTypeIsFloating(treeNode->gtOp.gtOp1)) + { + // Casts float/double <--> double/float + genFloatToFloatCast(treeNode); + } + else if (varTypeIsFloating(treeNode->gtOp.gtOp1)) + { + // Casts float/double --> int32/int64 + genFloatToIntCast(treeNode); + } + else if (varTypeIsFloating(targetType)) + { + // Casts int32/uint32/int64/uint64 --> float/double + genIntToFloatCast(treeNode); + } + else + { + // Casts int <--> int + genIntToIntCast(treeNode); + } + // The per-case functions call genProduceReg() + break; + + case GT_LCL_VAR: + { + // lcl_vars are not defs + assert((treeNode->gtFlags & GTF_VAR_DEF) == 0); + + GenTreeLclVarCommon *lcl = treeNode->AsLclVarCommon(); + bool isRegCandidate = compiler->lvaTable[lcl->gtLclNum].lvIsRegCandidate(); + + if (isRegCandidate && !(treeNode->gtFlags & GTF_VAR_DEATH)) + { + assert((treeNode->InReg()) || (treeNode->gtFlags & GTF_SPILLED)); + } + + // If this is a register candidate that has been spilled, genConsumeReg() will + // reload it at the point of use. Otherwise, if it's not in a register, we load it here. + + if (!treeNode->InReg() && !(treeNode->gtFlags & GTF_SPILLED)) + { + assert(!isRegCandidate); + emit->emitIns_R_S(ins_Load(targetType, compiler->isSIMDTypeLocalAligned(lcl->gtLclNum)), + emitTypeSize(treeNode), targetReg, lcl->gtLclNum, 0); + genProduceReg(treeNode); + } + } + break; + + case GT_LCL_FLD_ADDR: + case GT_LCL_VAR_ADDR: + // Address of a local var. This by itself should never be allocated a register. + // If it is worth storing the address in a register then it should be cse'ed into + // a temp and that would be allocated a register. 
+ noway_assert(targetType == TYP_BYREF); + noway_assert(!treeNode->InReg()); + + inst_RV_TT(INS_lea, targetReg, treeNode, 0, EA_BYREF); + genProduceReg(treeNode); + break; + + case GT_LCL_FLD: + { + noway_assert(targetType != TYP_STRUCT); + noway_assert(targetReg != REG_NA); + + unsigned offs = treeNode->gtLclFld.gtLclOffs; + unsigned varNum = treeNode->gtLclVarCommon.gtLclNum; + assert(varNum < compiler->lvaCount); + + emit->emitIns_R_S(ins_Move_Extend(targetType, treeNode->InReg()), EA_8BYTE, targetReg, varNum, offs); + genProduceReg(treeNode); + } + break; + + case GT_STORE_LCL_FLD: + { + NYI_IF(varTypeIsFloating(targetType), "Code generation for FP field assignment"); + + noway_assert(targetType != TYP_STRUCT); + noway_assert(!treeNode->InReg()); + + unsigned offs = treeNode->gtLclFld.gtLclOffs; + unsigned varNum = treeNode->gtLclVarCommon.gtLclNum; + assert(varNum < compiler->lvaCount); + + GenTreePtr op1 = treeNode->gtOp.gtOp1; + genConsumeRegs(op1); + + emit->emitIns_R_S(ins_Store(targetType), emitTypeSize(targetType), op1->gtRegNum, varNum, offs); + } + break; + + case GT_STORE_LCL_VAR: + { + noway_assert(targetType != TYP_STRUCT); + + unsigned lclNum = treeNode->AsLclVarCommon()->gtLclNum; + LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]); + + // Ensure that lclVar nodes are typed correctly. + assert(!varDsc->lvNormalizeOnStore() || targetType == genActualType(varDsc->TypeGet())); + + GenTreePtr op1 = treeNode->gtOp.gtOp1; + genConsumeRegs(op1); + if (targetReg == REG_NA) + { + // stack store + emit->emitInsMov(ins_Store(targetType, compiler->isSIMDTypeLocalAligned(lclNum)), emitTypeSize(treeNode), treeNode); + varDsc->lvRegNum = REG_STK; + } + else // store into register (i.e move into register) + { + if (op1->isContained()) + { + // Currently, we assume that the contained source of a GT_STORE_LCL_VAR writing to a register + // must be a constant. However, in the future we might want to support a contained memory op. 
+ // This is a bit tricky because we have to decide it's contained before register allocation, + // and this would be a case where, once that's done, we need to mark that node as always + // requiring a register - which we always assume now anyway, but once we "optimize" that + // we'll have to take cases like this into account. + assert((op1->gtRegNum == REG_NA) && op1->OperIsConst()); + genSetRegToConst(targetReg, targetType, op1); + } + else if (op1->gtRegNum != targetReg) + { + // Setup targetReg when op1 is not a matching register + assert(op1->gtRegNum != REG_NA); + inst_RV_RV(ins_Copy(targetType), targetReg, op1->gtRegNum, targetType); + } + genProduceReg(treeNode); + } + } + break; + + case GT_RETFILT: + // A void GT_RETFILT is the end of a finally. For non-void filter returns we need to load the result in + // the return register, if it's not already there. The processing is the same as GT_RETURN. + if (targetType != TYP_VOID) + { + // For filters, the IL spec says the result is type int32. Further, the only specified legal values + // are 0 or 1, with the use of other values "undefined". + assert(targetType == TYP_INT); + } + + __fallthrough; + + case GT_RETURN: + { + GenTreePtr op1 = treeNode->gtOp.gtOp1; + if (targetType == TYP_VOID) + { + assert(op1 == nullptr); + } + else + { + assert(op1 != nullptr); + noway_assert(op1->gtRegNum != REG_NA); + + genConsumeReg(op1); + + regNumber retReg = varTypeIsFloating(treeNode) ? 
REG_FLOATRET : REG_INTRET; + + bool movRequired = (op1->gtRegNum != retReg); + + if (!movRequired) + { + if (op1->OperGet() == GT_LCL_VAR) + { + GenTreeLclVarCommon *lcl = op1->AsLclVarCommon(); + bool isRegCandidate = compiler->lvaTable[lcl->gtLclNum].lvIsRegCandidate(); + if (isRegCandidate && ((op1->gtFlags & GTF_SPILLED) == 0)) + { + assert(op1->InReg()); + + // We may need to generate a zero-extending mov instruction to load the value from this GT_LCL_VAR + + unsigned lclNum = lcl->gtLclNum; + LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]); + var_types op1Type = genActualType(op1->TypeGet()); + var_types lclType = genActualType(varDsc->TypeGet()); + + if (genTypeSize(op1Type) < genTypeSize(lclType)) + { + movRequired = true; + } + } + } + } + + if (movRequired) + { + emitAttr movSize = EA_ATTR(genTypeSize(targetType)); + getEmitter()->emitIns_R_R(INS_mov, movSize, retReg, op1->gtRegNum); + } + } + +#ifdef PROFILING_SUPPORTED + // There will be a single return block while generating profiler ELT callbacks. + // + // Reason for not materializing Leave callback as a GT_PROF_HOOK node after GT_RETURN: + // In flowgraph and other places assert that the last node of a block marked as + // GT_RETURN is either a GT_RETURN or GT_JMP or a tail call. It would be nice to + // maintain such an invariant irrespective of whether profiler hook needed or not. + // Also, there is not much to be gained by materializing it as an explicit node. 
+ if (compiler->compCurBB == compiler->genReturnBB) + { + genProfilingLeaveCallback(); + } +#endif + } + break; + + case GT_LEA: + { + // if we are here, it is the case where there is an LEA that cannot + // be folded into a parent instruction + GenTreeAddrMode *lea = treeNode->AsAddrMode(); + genLeaInstruction(lea); + } + // genLeaInstruction calls genProduceReg() + break; + + case GT_IND: + genConsumeAddress(treeNode->AsIndir()->Addr()); + emit->emitInsMov(ins_Load(targetType), emitTypeSize(treeNode), treeNode); + genProduceReg(treeNode); + break; + + case GT_MULHI: + genCodeForMulHi(treeNode->AsOp()); + genProduceReg(treeNode); + break; + + case GT_MOD: + case GT_UMOD: + // Integer MOD should have been morphed into a sequence of sub, mul, div in fgMorph. + // + // We shouldn't be seeing GT_MOD on float/double as it is morphed into a helper call by front-end. + noway_assert(!"Codegen for GT_MOD/GT_UMOD"); + break; + + case GT_MATH: + genMathIntrinsic(treeNode); + break; + +#ifdef FEATURE_SIMD + case GT_SIMD: + genSIMDIntrinsic(treeNode->AsSIMD()); + break; +#endif // FEATURE_SIMD + + case GT_CKFINITE: + genCkfinite(treeNode); + break; + + case GT_EQ: + case GT_NE: + case GT_LT: + case GT_LE: + case GT_GE: + case GT_GT: + { + // TODO-ARM64-CQ: Check if we can use the currently set flags. 
+ // TODO-ARM64-CQ: Check for the case where we can simply transfer the carry bit to a register + // (signed < or >= where targetReg != REG_NA) + + GenTreeOp* tree = treeNode->AsOp(); + GenTreePtr op1 = tree->gtOp1; + GenTreePtr op2 = tree->gtOp2; + var_types op1Type = op1->TypeGet(); + var_types op2Type = op2->TypeGet(); + + assert(!op1->isContainedMemoryOp()); + assert(!op2->isContainedMemoryOp()); + + genConsumeOperands(tree); + + emitAttr cmpSize = EA_UNKNOWN; + + if (varTypeIsFloating(op1Type)) + { + assert(varTypeIsFloating(op2Type)); + assert(!op1->isContained()); + assert(op1Type == op2Type); + cmpSize = EA_ATTR(genTypeSize(op1Type)); + + if (op2->IsZero()) + { + emit->emitIns_R_F(INS_fcmp, cmpSize, op1->gtRegNum, 0.0); + } + else + { + assert(!op2->isContained()); + emit->emitIns_R_R(INS_fcmp, cmpSize, op1->gtRegNum, op2->gtRegNum); + } + } + else + { + assert(!varTypeIsFloating(op2Type)); + // We don't support swapping op1 and op2 to generate cmp reg, imm + assert(!op1->isContainedIntOrIImmed()); + + // TODO-ARM64-CQ: the second register argument of a CMP can be sign/zero + // extended as part of the instruction (using "CMP (extended register)"). + // We should use that if possible, swapping operands + // (and reversing the condition) if necessary. + unsigned op1Size = genTypeSize(op1Type); + unsigned op2Size = genTypeSize(op2Type); + + if ((op1Size < 4) || (op1Size < op2Size)) + { + // We need to sign/zero extend op1 up to 32 or 64 bits. + instruction ins = ins_Move_Extend(op1Type, true); + inst_RV_RV(ins, op1->gtRegNum, op1->gtRegNum); + } + + if (!op2->isContainedIntOrIImmed()) + { + if ((op2Size < 4) || (op2Size < op1Size)) + { + // We need to sign/zero extend op2 up to 32 or 64 bits. 
+ instruction ins = ins_Move_Extend(op2Type, true); + inst_RV_RV(ins, op2->gtRegNum, op2->gtRegNum); + } + } + cmpSize = EA_4BYTE; + if ((op1Size == EA_8BYTE) || (op2Size == EA_8BYTE)) + { + cmpSize = EA_8BYTE; + } + + if (op2->isContainedIntOrIImmed()) + { + GenTreeIntConCommon* intConst = op2->AsIntConCommon(); + emit->emitIns_R_I(INS_cmp, cmpSize, op1->gtRegNum, intConst->IconValue()); + } + else + { + emit->emitIns_R_R(INS_cmp, cmpSize, op1->gtRegNum, op2->gtRegNum); + } + } + + // Are we evaluating this into a register? + if (targetReg != REG_NA) + { + genSetRegToCond(targetReg, tree); + genProduceReg(tree); + } + } + break; + + case GT_JTRUE: + { + GenTree *cmp = treeNode->gtOp.gtOp1->gtEffectiveVal(); + assert(cmp->OperIsCompare()); + assert(compiler->compCurBB->bbJumpKind == BBJ_COND); + + // Get the "jmpKind" using the gtOper kind + // Note that whether it is an unsigned cmp is governed by the GTF_UNSIGNED flags + + emitJumpKind jmpKind = genJumpKindForOper(cmp->gtOper, (cmp->gtFlags & GTF_UNSIGNED) != 0); + BasicBlock * jmpTarget = compiler->compCurBB->bbJumpDest; + + inst_JMP(jmpKind, jmpTarget); + } + break; + + case GT_RETURNTRAP: + { + // this is nothing but a conditional call to CORINFO_HELP_STOP_FOR_GC + // based on the contents of 'data' + + GenTree *data = treeNode->gtOp.gtOp1; + genConsumeRegs(data); + emit->emitIns_R_I(INS_cmp, EA_4BYTE, data->gtRegNum, 0); + + BasicBlock* skipLabel = genCreateTempLabel(); + + inst_JMP(genJumpKindForOper(GT_EQ, true), skipLabel); + // emit the call to the EE-helper that stops for GC (or other reasons) + + genEmitHelperCall(CORINFO_HELP_STOP_FOR_GC, 0, EA_UNKNOWN); + genDefineTempLabel(skipLabel); + } + break; + + case GT_STOREIND: + { + GenTree* data = treeNode->gtOp.gtOp2; + GenTree* addr = treeNode->gtOp.gtOp1; + GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(treeNode, data); + if (writeBarrierForm != GCInfo::WBF_NoBarrier) + { + // data and addr must be in registers. 
+ // Consume both registers so that any copies of interfering + // registers are taken care of. + genConsumeOperands(treeNode->AsOp()); + +#if NOGC_WRITE_BARRIERS + // At this point, we should not have any interference. + // That is, 'data' must not be in REG_WRITE_BARRIER_DST_BYREF, + // as that is where 'addr' must go. + noway_assert(data->gtRegNum != REG_WRITE_BARRIER_DST_BYREF); + + // 'addr' goes into x14 (REG_WRITE_BARRIER_DST_BYREF) + if (addr->gtRegNum != REG_WRITE_BARRIER_DST_BYREF) + { + inst_RV_RV(INS_mov, REG_WRITE_BARRIER_DST_BYREF, addr->gtRegNum, addr->TypeGet()); + } + + // 'data' goes into x15 (REG_WRITE_BARRIER) + if (data->gtRegNum != REG_WRITE_BARRIER) + { + inst_RV_RV(INS_mov, REG_WRITE_BARRIER, data->gtRegNum, data->TypeGet()); + } +#else + // At this point, we should not have any interference. + // That is, 'data' must not be in REG_ARG_0, + // as that is where 'addr' must go. + noway_assert(data->gtRegNum != REG_ARG_0); + + // addr goes in REG_ARG_0 + if (addr->gtRegNum != REG_ARG_0) + { + inst_RV_RV(INS_mov, REG_ARG_0, addr->gtRegNum, addr->TypeGet()); + } + + // data goes in REG_ARG_1 + if (data->gtRegNum != REG_ARG_1) + { + inst_RV_RV(INS_mov, REG_ARG_1, data->gtRegNum, data->TypeGet()); + } +#endif // NOGC_WRITE_BARRIERS + + genGCWriteBarrier(treeNode, writeBarrierForm); + } + else + { + bool reverseOps = ((treeNode->gtFlags & GTF_REVERSE_OPS) != 0); + bool dataIsUnary = false; + GenTree* nonRMWsrc = nullptr; + // We must consume the operands in the proper execution order, + // so that liveness is updated appropriately. 
+ if (!reverseOps) + { + genConsumeAddress(addr); + } + if (data->isContained() && !data->OperIsLeaf()) + { + dataIsUnary = (GenTree::OperIsUnary(data->OperGet()) != 0); + if (!dataIsUnary) + { + nonRMWsrc = data->gtGetOp1(); + if (nonRMWsrc->isIndir() && Lowering::IndirsAreEquivalent(nonRMWsrc, treeNode)) + { + nonRMWsrc = data->gtGetOp2(); + } + genConsumeRegs(nonRMWsrc); + } + } + else + { + genConsumeRegs(data); + } + if (reverseOps) + { + genConsumeAddress(addr); + } + if (data->isContained() && !data->OperIsLeaf()) + { + NYI("RMW?"); + } + else + { + emit->emitInsMov(ins_Store(targetType), emitTypeSize(treeNode), treeNode); + } + } + } + break; + + case GT_COPY: + // This is handled at the time we call genConsumeReg() on the GT_COPY + break; + + case GT_SWAP: + { + // Swap is only supported for lclVar operands that are enregistered + // We do not consume or produce any registers. Both operands remain enregistered. + // However, the gc-ness may change. + assert(genIsRegCandidateLocal(treeNode->gtOp.gtOp1) && genIsRegCandidateLocal(treeNode->gtOp.gtOp2)); + + GenTreeLclVarCommon* lcl1 = treeNode->gtOp.gtOp1->AsLclVarCommon(); + LclVarDsc* varDsc1 = &(compiler->lvaTable[lcl1->gtLclNum]); + var_types type1 = varDsc1->TypeGet(); + GenTreeLclVarCommon* lcl2 = treeNode->gtOp.gtOp2->AsLclVarCommon(); + LclVarDsc* varDsc2 = &(compiler->lvaTable[lcl2->gtLclNum]); + var_types type2 = varDsc2->TypeGet(); + + // We must have both int or both fp regs + assert(!varTypeIsFloating(type1) || varTypeIsFloating(type2)); + + // FP swap is not yet implemented (and should have NYI'd in LSRA) + assert(!varTypeIsFloating(type1)); + + regNumber oldOp1Reg = lcl1->gtRegNum; + regMaskTP oldOp1RegMask = genRegMask(oldOp1Reg); + regNumber oldOp2Reg = lcl2->gtRegNum; + regMaskTP oldOp2RegMask = genRegMask(oldOp2Reg); + + // We don't call genUpdateVarReg because we don't have a tree node with the new register. 
+ varDsc1->lvRegNum = oldOp2Reg; + varDsc2->lvRegNum = oldOp1Reg; + + // Do the xchg + emitAttr size = EA_PTRSIZE; + if (varTypeGCtype(type1) != varTypeGCtype(type2)) + { + // If the type specified to the emitter is a GC type, it will swap the GC-ness of the registers. + // Otherwise it will leave them alone, which is correct if they have the same GC-ness. + size = EA_GCREF; + } + + NYI("register swap"); + // inst_RV_RV(INS_xchg, oldOp1Reg, oldOp2Reg, TYP_I_IMPL, size); + + // Update the gcInfo. + // Manually remove these regs for the gc sets (mostly to avoid confusing duplicative dump output) + gcInfo.gcRegByrefSetCur &= ~(oldOp1RegMask|oldOp2RegMask); + gcInfo.gcRegGCrefSetCur &= ~(oldOp1RegMask|oldOp2RegMask); + + // gcMarkRegPtrVal will do the appropriate thing for non-gc types. + // It will also dump the updates. + gcInfo.gcMarkRegPtrVal(oldOp2Reg, type1); + gcInfo.gcMarkRegPtrVal(oldOp1Reg, type2); + } + break; + + case GT_LIST: + case GT_ARGPLACE: + // Nothing to do + break; + + case GT_PUTARG_STK: + { + noway_assert(targetType != TYP_STRUCT); + + // Get argument offset on stack. + // Here we cross check that argument offset hasn't changed from lowering to codegen since + // we are storing arg slot number in GT_PUTARG_STK node in lowering phase. + int argOffset = treeNode->AsPutArgStk()->gtSlotNum * TARGET_POINTER_SIZE; + +#ifdef DEBUG + fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(treeNode->AsPutArgStk()->gtCall, treeNode); + assert(curArgTabEntry); + assert(argOffset == (int)curArgTabEntry->slotNum * TARGET_POINTER_SIZE); +#endif + + GenTreePtr data = treeNode->gtOp.gtOp1; + unsigned varNum; + +#if FEATURE_FASTTAILCALL + bool putInIncomingArgArea = treeNode->AsPutArgStk()->putInIncomingArgArea; +#else + const bool putInIncomingArgArea = false; +#endif + // Whether to setup stk arg in incoming or out-going arg area? + // Fast tail calls implemented as epilog+jmp = stk arg is setup in incoming arg area. 
+ // All other calls - stk arg is setup in out-going arg area. + if (putInIncomingArgArea) + { + // The first varNum is guaranteed to be the first incoming arg of the method being compiled. + // See lvaInitTypeRef() for the order in which lvaTable entries are initialized. + varNum = 0; +#ifdef DEBUG +#if FEATURE_FASTTAILCALL + // This must be a fast tail call. + assert(treeNode->AsPutArgStk()->gtCall->AsCall()->IsFastTailCall()); + + // Since it is a fast tail call, the existence of first incoming arg is guaranteed + // because fast tail call requires that in-coming arg area of caller is >= out-going + // arg area required for tail call. + LclVarDsc* varDsc = compiler->lvaTable; + assert(varDsc != nullptr); + assert(varDsc->lvIsRegArg && ((varDsc->lvArgReg == REG_ARG_0) || (varDsc->lvArgReg == REG_FLTARG_0))); +#endif // FEATURE_FASTTAILCALL +#endif + } + else + { + varNum = compiler->lvaOutgoingArgSpaceVar; + } + + if (data->isContained()) + { + getEmitter()->emitIns_S_I(ins_Store(targetType), emitTypeSize(targetType), varNum, + argOffset, (int) data->AsIntConCommon()->IconValue()); + } + else + { + genConsumeReg(data); + getEmitter()->emitIns_S_R(ins_Store(targetType), emitTypeSize(targetType), data->gtRegNum, varNum, argOffset); + } + } + break; + + case GT_PUTARG_REG: + { + noway_assert(targetType != TYP_STRUCT); + + // commas show up here commonly, as part of a nullchk operation + GenTree *op1 = treeNode->gtOp.gtOp1; + // If child node is not already in the register we need, move it + genConsumeReg(op1); + if (targetReg != op1->gtRegNum) + { + inst_RV_RV(ins_Copy(targetType), targetReg, op1->gtRegNum, targetType); + } + } + genProduceReg(treeNode); + break; + + case GT_CALL: + genCallInstruction(treeNode); + break; + + case GT_JMP: + genJmpMethod(treeNode); + break; + + case GT_LOCKADD: + case GT_XCHG: + case GT_XADD: + genLockedInstructions(treeNode); + break; + + case GT_MEMORYBARRIER: + instGen_MemoryBarrier(); + break; + + case GT_CMPXCHG: + 
NYI("GT_CMPXCHG"); + break; + + case GT_RELOAD: + // do nothing - reload is just a marker. + // The parent node will call genConsumeReg on this which will trigger the unspill of this node's child + // into the register specified in this node. + break; + + case GT_NOP: + break; + + case GT_NO_OP: + if (treeNode->gtFlags & GTF_NO_OP_NO) + { + noway_assert(!"GTF_NO_OP_NO should not be set"); + } + else + { + instGen(INS_nop); + } + break; + + case GT_ARR_BOUNDS_CHECK: +#ifdef FEATURE_SIMD + case GT_SIMD_CHK: +#endif // FEATURE_SIMD + genRangeCheck(treeNode); + break; + + case GT_PHYSREG: + if (targetReg != treeNode->AsPhysReg()->gtSrcReg) + { + inst_RV_RV(ins_Copy(targetType), targetReg, treeNode->AsPhysReg()->gtSrcReg, targetType); + + genTransferRegGCState(targetReg, treeNode->AsPhysReg()->gtSrcReg); + } + genProduceReg(treeNode); + break; + + case GT_PHYSREGDST: + break; + + case GT_NULLCHECK: + { + assert(!treeNode->gtOp.gtOp1->isContained()); + regNumber reg = genConsumeReg(treeNode->gtOp.gtOp1); + emit->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, reg, 0); + } + break; + + case GT_CATCH_ARG: + + noway_assert(handlerGetsXcptnObj(compiler->compCurBB->bbCatchTyp)); + + /* Catch arguments get passed in a register. genCodeForBBlist() + would have marked it as holding a GC object, but not used. 
*/ + + noway_assert(gcInfo.gcRegGCrefSetCur & RBM_EXCEPTION_OBJECT); + genConsumeReg(treeNode); + break; + + case GT_PINVOKE_PROLOG: + noway_assert(((gcInfo.gcRegGCrefSetCur|gcInfo.gcRegByrefSetCur) & ~RBM_ARG_REGS) == 0); + + // the runtime side requires the codegen here to be consistent + emit->emitDisableRandomNops(); + break; + + case GT_LABEL: + genPendingCallLabel = genCreateTempLabel(); + treeNode->gtLabel.gtLabBB = genPendingCallLabel; + emit->emitIns_R_L(INS_adr, EA_PTRSIZE, genPendingCallLabel, targetReg); + break; + + case GT_COPYOBJ: + genCodeForCpObj(treeNode->AsCpObj()); + break; + + case GT_COPYBLK: + { + GenTreeCpBlk* cpBlkOp = treeNode->AsCpBlk(); + if (cpBlkOp->gtBlkOpGcUnsafe) + { + getEmitter()->emitDisableGC(); + } + + switch (cpBlkOp->gtBlkOpKind) + { + case GenTreeBlkOp::BlkOpKindHelper: + genCodeForCpBlk(cpBlkOp); + break; + case GenTreeBlkOp::BlkOpKindUnroll: + genCodeForCpBlkUnroll(cpBlkOp); + break; + default: + unreached(); + } + if (cpBlkOp->gtBlkOpGcUnsafe) + { + getEmitter()->emitEnableGC(); + } + } + break; + + case GT_INITBLK: + { + GenTreeInitBlk* initBlkOp = treeNode->AsInitBlk(); + switch (initBlkOp->gtBlkOpKind) + { + case GenTreeBlkOp::BlkOpKindHelper: + genCodeForInitBlk(initBlkOp); + break; + case GenTreeBlkOp::BlkOpKindUnroll: + genCodeForInitBlkUnroll(initBlkOp); + break; + default: + unreached(); + } + } + break; + + case GT_JMPTABLE: + genJumpTable(treeNode); + break; + + case GT_SWITCH_TABLE: + genTableBasedSwitch(treeNode); + break; + + case GT_ARR_INDEX: + genCodeForArrIndex(treeNode->AsArrIndex()); + break; + + case GT_ARR_OFFSET: + genCodeForArrOffset(treeNode->AsArrOffs()); + break; + + case GT_CLS_VAR_ADDR: + NYI("GT_CLS_VAR_ADDR"); + break; + + default: + { +#ifdef DEBUG + char message[256]; + sprintf(message, "Unimplemented node type %s\n", GenTree::NodeName(treeNode->OperGet())); +#endif + assert(!"Unknown node in codegen"); + } + break; + } +} + + +// Generate code for division (or mod) by power of two +// or 
// negative powers of two. (meaning -1 * a power of two, not 2^(-1))
// Op2 must be a contained integer constant.
//
// NOTE: the body below is x86/amd64-style reference code (it uses RAX/RDX and x86
// instructions) kept under '#if 0'; on ARM64 this currently falls through to NYI.
void
CodeGen::genCodeForPow2Div(GenTreeOp* tree)
{
#if 0
    GenTree *dividend = tree->gtOp.gtOp1;
    GenTree *divisor  = tree->gtOp.gtOp2;
    genTreeOps oper  = tree->OperGet();
    emitAttr size    = emitTypeSize(tree);
    emitter *emit    = getEmitter();
    regNumber targetReg  = tree->gtRegNum;
    var_types targetType = tree->TypeGet();

    bool isSigned = oper == GT_MOD || oper == GT_DIV;

    // precondition: extended dividend is in RDX:RAX
    // which means it is either all zeros or all ones

    noway_assert(divisor->isContained());
    GenTreeIntConCommon* divImm = divisor->AsIntConCommon();
    int64_t imm = divImm->IconValue();
    // NOTE(review): 'abs' on an int64_t truncates on LP64-hostile platforms; if this
    // reference code is ever revived, use std::llabs/std::abs<int64_t> — TODO confirm.
    ssize_t abs_imm = abs(imm);
    noway_assert(isPow2(abs_imm));


    if (isSigned)
    {
        if (imm == 1)
        {
            if (targetReg != REG_RAX)
                inst_RV_RV(INS_mov, targetReg, REG_RAX, targetType);

            return;
        }

        if (abs_imm == 2)
        {
            if (oper == GT_MOD)
            {
                emit->emitIns_R_I(INS_and, size, REG_RAX, 1); // result is 0 or 1
                // xor with rdx will flip all bits if negative
                emit->emitIns_R_R(INS_xor, size, REG_RAX, REG_RDX); // 111.11110 or 0
            }
            else
            {
                assert(oper == GT_DIV);
                // add 1 if it's negative
                emit->emitIns_R_R(INS_sub, size, REG_RAX, REG_RDX);
            }
        }
        else
        {
            // add imm-1 if negative
            emit->emitIns_R_I(INS_and, size, REG_RDX, abs_imm - 1);
            emit->emitIns_R_R(INS_add, size, REG_RAX, REG_RDX);
        }

        if (oper == GT_DIV)
        {
            unsigned shiftAmount = genLog2(unsigned(abs_imm));
            inst_RV_SH(INS_sar, size, REG_RAX, shiftAmount);

            if (imm < 0)
            {
                emit->emitIns_R(INS_neg, size, REG_RAX);
            }
        }
        else
        {
            assert(oper == GT_MOD);
            if (abs_imm > 2)
            {
                emit->emitIns_R_I(INS_and, size, REG_RAX, abs_imm - 1);
            }
            // RDX contains 'imm-1' if negative
            emit->emitIns_R_R(INS_sub, size, REG_RAX, REG_RDX);
        }

        if (targetReg != REG_RAX)
        {
            inst_RV_RV(INS_mov, targetReg, REG_RAX, targetType);
        }
    }
    else
    {
        // Unsigned division/mod by a (positive) power of two: shift or mask.
        assert (imm > 0);

        if (targetReg != dividend->gtRegNum)
        {
            inst_RV_RV(INS_mov, targetReg, dividend->gtRegNum, targetType);
        }

        if (oper == GT_UDIV)
        {
            inst_RV_SH(INS_shr, size, targetReg, genLog2(unsigned(imm)));
        }
        else
        {
            assert(oper == GT_UMOD);

            emit->emitIns_R_I(INS_and, size, targetReg, imm -1);
        }
    }
#else // !0
    NYI("genCodeForPow2Div");
#endif // !0
}


/***********************************************************************************************
 *  Generate code for localloc (GT_LCLHEAP).
 *
 *  tree->gtOp.gtOp1 is the requested size in bytes (TYP_INT or TYP_I_IMPL; may be a
 *  contained constant). The address of the allocated, STACK_ALIGN'ed block is produced
 *  in the node's target register; a requested size of zero produces null.
 */
void
CodeGen::genLclHeap(GenTreePtr tree)
{
    assert(tree->OperGet() == GT_LCLHEAP);

    GenTreePtr size = tree->gtOp.gtOp1;
    noway_assert((genActualType(size->gtType) == TYP_INT) || (genActualType(size->gtType) == TYP_I_IMPL));

    regNumber   targetReg   = tree->gtRegNum;
    regMaskTP   tmpRegsMask = tree->gtRsvdRegs;  // internal registers reserved by LSRA for this node
    regNumber   regCnt      = REG_NA;            // will hold the (aligned) byte count to allocate
    regNumber   pspSymReg   = REG_NA;            // temp holding the saved PSPSym value, if any
    var_types   type        = genActualType(size->gtType);
    emitAttr    easz        = emitTypeSize(type);
    BasicBlock* endLabel    = nullptr;

#ifdef DEBUG
    // Verify ESP
    if (compiler->opts.compStackCheckOnRet)
    {
        noway_assert(compiler->lvaReturnEspCheck != 0xCCCCCCCC && compiler->lvaTable[compiler->lvaReturnEspCheck].lvDoNotEnregister && compiler->lvaTable[compiler->lvaReturnEspCheck].lvOnFrame);
        getEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, REG_SPBASE, compiler->lvaReturnEspCheck, 0);

        BasicBlock  *   esp_check = genCreateTempLabel();
        inst_JMP(genJumpKindForOper(GT_EQ, true), esp_check);
        getEmitter()->emitIns(INS_BREAKPOINT);
        genDefineTempLabel(esp_check);
    }
#endif

    noway_assert(isFramePointerUsed());        // localloc requires Frame Pointer to be established since SP changes
    noway_assert(genStackLevel == 0); // Can't have anything on the stack

    // Whether method has PSPSym.
    bool hasPspSym;
#if FEATURE_EH_FUNCLETS
    hasPspSym = (compiler->lvaPSPSym != BAD_VAR_NUM);
#else
    hasPspSym = false;
#endif

    // compute the amount of memory to allocate to properly STACK_ALIGN.
    size_t amount = 0;
    if (size->IsCnsIntOrI())
    {
        // If size is a constant, then it must be contained.
        assert(size->isContained());

        // If amount is zero then return null in targetReg
        amount = size->gtIntCon.gtIconVal;
        if (amount == 0)
        {
            instGen_Set_Reg_To_Zero(EA_PTRSIZE, targetReg);
            goto BAILOUT;
        }

        // 'amount' is the total number of bytes to localloc to properly STACK_ALIGN
        amount = AlignUp(amount, STACK_ALIGN);
    }
    else
    {
        // If 0 bail out by returning null in targetReg
        genConsumeRegAndCopy(size, targetReg);
        endLabel = genCreateTempLabel();
        getEmitter()->emitIns_R_R(INS_TEST, easz, targetReg, targetReg);
        inst_JMP(EJ_je, endLabel);

        // Compute the size of the block to allocate and perform alignment.
        // If the method has no PSPSym and compInitMem=true, we can reuse targetReg as regcnt,
        // since we don't need any internal registers.
        if (!hasPspSym && compiler->info.compInitMem)
        {
            assert(genCountBits(tmpRegsMask) == 0);
            regCnt = targetReg;
        }
        else
        {
            // Carve the lowest reserved register out of tmpRegsMask to serve as regCnt.
            assert(genCountBits(tmpRegsMask) >= 1);
            regMaskTP regCntMask = genFindLowestBit(tmpRegsMask);
            tmpRegsMask &= ~regCntMask;
            regCnt = genRegNumFromMask(regCntMask);
            if (regCnt != targetReg)
                inst_RV_RV(INS_mov, regCnt, targetReg, size->TypeGet());
        }

        // Align to STACK_ALIGN
        // regCnt will be the total number of bytes to localloc
        inst_RV_IV(INS_add, regCnt,  (STACK_ALIGN - 1), emitActualTypeSize(type));
        inst_RV_IV(INS_AND, regCnt, ~(STACK_ALIGN - 1), emitActualTypeSize(type));
    }

    // Extra SP adjustment (beyond the allocation itself) needed to re-locate the
    // PSPSym and/or the outgoing argument area below the new allocation.
    unsigned stackAdjustment = 0;
#if FEATURE_EH_FUNCLETS
    // If we have PSPsym, then need to re-locate it after localloc.
    if (hasPspSym)
    {
        stackAdjustment += STACK_ALIGN;

        // Save a copy of PSPSym
        assert(genCountBits(tmpRegsMask) >= 1);
        regMaskTP pspSymRegMask = genFindLowestBit(tmpRegsMask);
        tmpRegsMask &= ~pspSymRegMask;
        pspSymReg = genRegNumFromMask(pspSymRegMask);
        getEmitter()->emitIns_R_S(ins_Store(TYP_I_IMPL), EA_PTRSIZE, pspSymReg, compiler->lvaPSPSym, 0);
    }
#endif


#if FEATURE_FIXED_OUT_ARGS
    // If we have an outgoing arg area then we must adjust the SP by popping off the
    // outgoing arg area. We will restore it right before we return from this method.
    //
    // Localloc is supposed to return stack space that is STACK_ALIGN'ed. The following
    // are the cases that needs to be handled:
    //   i) Method has PSPSym + out-going arg area.
    //      It is guaranteed that size of out-going arg area is STACK_ALIGNED (see fgMorphArgs).
    //      Therefore, we will pop-off RSP upto out-going arg area before locallocating.
    //      We need to add padding to ensure RSP is STACK_ALIGN'ed while re-locating PSPSym + arg area.
    //  ii) Method has no PSPSym but out-going arg area.
    //      Almost same case as above without the requirement to pad for the final RSP to be STACK_ALIGN'ed.
    // iii) Method has PSPSym but no out-going arg area.
    //      Nothing to pop-off from the stack but needs to relocate PSPSym with SP padded.
    //  iv) Method has neither PSPSym nor out-going arg area.
    //      Nothing needs to popped off from stack nor relocated.
    if  (compiler->lvaOutgoingArgSpaceSize > 0)
    {
        assert((compiler->lvaOutgoingArgSpaceSize % STACK_ALIGN) == 0); // This must be true for the stack to remain aligned
        inst_RV_IV(INS_add, REG_SPBASE, compiler->lvaOutgoingArgSpaceSize, EA_PTRSIZE);
        stackAdjustment += compiler->lvaOutgoingArgSpaceSize;
    }
#endif

    if (size->IsCnsIntOrI())
    {
        // We should reach here only for non-zero, constant size allocations.
        assert(amount > 0);

        // For small allocations we will generate up to four stp instructions
        size_t cntStackAlignedWidthItems = (amount >> STACK_ALIGN_SHIFT);
        if (cntStackAlignedWidthItems <= 4)
        {
            while (cntStackAlignedWidthItems != 0)
            {
                // We can use pre-indexed addressing.
                // stp ZR, ZR, [SP, #-16]!
                getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, REG_SPBASE, -16, INS_OPTS_PRE_INDEX);
                cntStackAlignedWidthItems -= 1;
            }

            goto ALLOC_DONE;
        }
        else if (!compiler->info.compInitMem && (amount < CORINFO_PAGE_SIZE))  // must be < not <=
        {
            // Since the size is a page or less, simply adjust ESP
            // ESP might already be in the guard page, must touch it BEFORE
            // the alloc, not after.
            getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0);
            inst_RV_IV(INS_sub, REG_SPBASE, amount, EA_PTRSIZE);

            goto ALLOC_DONE;
        }

        // else, "mov regCnt, amount"
        // If the method has no PSPSym and compInitMem=true, we can reuse targetReg as regcnt.
        // Since size is a constant, regCnt is not yet initialized.
        assert(regCnt == REG_NA);
        if (!hasPspSym && compiler->info.compInitMem)
        {
            assert(genCountBits(tmpRegsMask) == 0);
            regCnt = targetReg;
        }
        else
        {
            assert(genCountBits(tmpRegsMask) >= 1);
            regMaskTP regCntMask = genFindLowestBit(tmpRegsMask);
            tmpRegsMask &= ~regCntMask;
            regCnt = genRegNumFromMask(regCntMask);
        }
        // Use TYP_INT when 'amount' round-trips through int, so a smaller constant load suffices.
        genSetRegToIcon(regCnt, amount, ((int)amount == amount)? TYP_INT : TYP_LONG);
    }

    BasicBlock* loop = genCreateTempLabel();
    if (compiler->info.compInitMem)
    {
        // At this point 'regCnt' is set to the total number of bytes to locAlloc.
        // Since we have to zero out the allocated memory AND ensure that RSP is always valid
        // by tickling the pages, we will just push 0's on the stack.
        //
        // Note: regCnt is guaranteed to be even on Amd64 since STACK_ALIGN/TARGET_POINTER_SIZE = 2
        // and localloc size is a multiple of STACK_ALIGN.

        // Loop:
        genDefineTempLabel(loop);

        // We can use pre-indexed addressing.
        // stp ZR, ZR, [SP, #-16]!
        getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, REG_SPBASE, -16, INS_OPTS_PRE_INDEX);

        // If not done, loop
        // Note that regCnt is the number of bytes to stack allocate.
        // Therefore we need to subtract 16 from regcnt here.
        assert(genIsValidIntReg(regCnt));
        inst_RV_IV(INS_subs, regCnt, 16, emitActualTypeSize(type));
        inst_JMP(EJ_jne, loop);
    }
    else
    {
        //At this point 'regCnt' is set to the total number of bytes to locAlloc.
        //
        //We don't need to zero out the allocated memory. However, we do have
        //to tickle the pages to ensure that ESP is always valid and is
        //in sync with the "stack guard page".  Note that in the worst
        //case ESP is on the last byte of the guard page.  Thus you must
        //touch ESP+0 first not ESP+x01000.
        //
        //Another subtlety is that you don't want ESP to be exactly on the
        //boundary of the guard page because PUSH is predecrement, thus
        //call setup would not touch the guard page but just beyond it
        //
        //Note that we go through a few hoops so that ESP never points to
        //illegal pages at any time during the ticking process
        //
        //       neg   REGCNT
        //       add   REGCNT, ESP     // reg now holds ultimate ESP
        //       jb    loop            // result is smaller than original ESP (no wrap around)
        //       xor   REGCNT, REGCNT, // Overflow, pick lowest possible number
        //  loop:
        //       test  ESP, [ESP+0]    // tickle the page
        //       mov   REGTMP, ESP
        //       sub   REGTMP, PAGE_SIZE
        //       mov   ESP, REGTMP
        //       cmp   ESP, REGCNT
        //       jae   loop
        //
        //       mov   ESP, REG
        //  end:
        inst_RV(INS_NEG, regCnt, TYP_I_IMPL);
        inst_RV_RV(INS_adds, regCnt, REG_SPBASE, TYP_I_IMPL);
        inst_JMP(EJ_jb, loop);

        instGen_Set_Reg_To_Zero(EA_PTRSIZE, regCnt);

        genDefineTempLabel(loop);

        // Tickle the decremented value, and move back to ESP,
        // note that it has to be done BEFORE the update of ESP since
        // ESP might already be on the guard page.  It is OK to leave
        // the final value of ESP on the guard page
        getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0);

        // This is a harmless workaround to avoid the emitter trying to track the
        // decrement of the ESP - we do the subtraction in another reg instead
        // of adjusting ESP directly.
        assert(tmpRegsMask != RBM_NONE);
        assert(genCountBits(tmpRegsMask) == 1);
        regNumber regTmp = genRegNumFromMask(tmpRegsMask);

        inst_RV_RV(INS_mov, regTmp, REG_SPBASE, TYP_I_IMPL);
        inst_RV_IV(INS_sub, regTmp, CORINFO_PAGE_SIZE, EA_PTRSIZE);
        inst_RV_RV(INS_mov, REG_SPBASE, regTmp, TYP_I_IMPL);

        inst_RV_RV(INS_cmp, REG_SPBASE, regCnt, TYP_I_IMPL);
        inst_JMP(EJ_jae, loop);

        // Move the final value to ESP
        inst_RV_RV(INS_mov, REG_SPBASE, regCnt);
    }

ALLOC_DONE:
    // Re-adjust SP to allocate PSPSym and out-going arg area
    if  (stackAdjustment != 0)
    {
        assert((stackAdjustment % STACK_ALIGN) == 0); // This must be true for the stack to remain aligned
        assert(stackAdjustment > 0);
        getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, (int) stackAdjustment);

#if FEATURE_EH_FUNCLETS
        // Write PSPSym to its new location.
        if (hasPspSym)
        {
            assert(genIsValidIntReg(pspSymReg));
            getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, pspSymReg, compiler->lvaPSPSym, 0);
        }
#endif
        // Return the stackalloc'ed address in result register.
        // TargetReg = RSP + stackAdjustment.
        //
        getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, targetReg, REG_SPBASE, (int) stackAdjustment);
    }
    else // stackAdjustment == 0
    {
        // Move the final value of SP to targetReg
        inst_RV_RV(INS_mov, targetReg, REG_SPBASE);
    }

BAILOUT:
    if (endLabel != nullptr)
        genDefineTempLabel(endLabel);

    // Write the lvaShadowSPfirst stack frame slot
    noway_assert(compiler->lvaLocAllocSPvar != BAD_VAR_NUM);
    getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, targetReg, compiler->lvaLocAllocSPvar, 0);

#if STACK_PROBES
    if (compiler->opts.compNeedStackProbes)
    {
        genGenerateStackProbe();
    }
#endif

#ifdef DEBUG
    // Update new ESP
    if (compiler->opts.compStackCheckOnRet)
    {
        noway_assert(compiler->lvaReturnEspCheck != 0xCCCCCCCC && compiler->lvaTable[compiler->lvaReturnEspCheck].lvDoNotEnregister && compiler->lvaTable[compiler->lvaReturnEspCheck].lvOnFrame);
        getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, targetReg, compiler->lvaReturnEspCheck, 0);
    }
#endif

    genProduceReg(tree);
}

// Generate code for InitBlk by performing a loop unroll
// Preconditions:
//   a) Both the size and fill byte value are integer constants.
//   b) The size of the struct to initialize is smaller than INITBLK_UNROLL_LIMIT bytes.
// NOTE: unrolled InitBlk is not yet implemented on ARM64; the '#if 0' body below is the
// xarch reference implementation kept for when this is ported.
void CodeGen::genCodeForInitBlkUnroll(GenTreeInitBlk* initBlkNode)
{
#if 0
    // Make sure we got the arguments of the initblk/initobj operation in the right registers
    GenTreePtr blockSize = initBlkNode->Size();
    GenTreePtr dstAddr = initBlkNode->Dest();
    GenTreePtr initVal = initBlkNode->InitVal();

#ifdef DEBUG
    assert(!dstAddr->isContained());
    assert(!initVal->isContained());
    assert(blockSize->isContained());

    assert(blockSize->IsCnsIntOrI());
#endif // DEBUG

    size_t size = blockSize->gtIntCon.gtIconVal;

    assert(size <= INITBLK_UNROLL_LIMIT);
    assert(initVal->gtSkipReloadOrCopy()->IsCnsIntOrI());

    emitter *emit = getEmitter();

    genConsumeReg(initVal);
    genConsumeReg(dstAddr);

    // If the initVal was moved, or spilled and reloaded to a different register,
    // get the original initVal from below the GT_RELOAD, but only after capturing the valReg,
    // which needs to be the new register.
    regNumber valReg = initVal->gtRegNum;
    initVal = initVal->gtSkipReloadOrCopy();
#else // !0
    NYI("genCodeForInitBlkUnroll");
#endif // !0
}

// Generates code for InitBlk by calling the VM memset helper function.
// Preconditions:
// a) The size argument of the InitBlk is not an integer constant.
// b) The size argument of the InitBlk is >= INITBLK_STOS_LIMIT bytes.
void CodeGen::genCodeForInitBlk(GenTreeInitBlk* initBlkNode)
{
    // Make sure we got the arguments of the initblk operation in the right registers
    GenTreePtr blockSize = initBlkNode->Size();
    GenTreePtr dstAddr = initBlkNode->Dest();
    GenTreePtr initVal = initBlkNode->InitVal();

#ifdef DEBUG
    assert(!dstAddr->isContained());
    assert(!initVal->isContained());
    assert(!blockSize->isContained());

    // TODO-ARM64-CQ: When initblk loop unrolling is implemented
    //                put this assert back on.
#if 0
    if (blockSize->IsCnsIntOrI())
    {
        assert(blockSize->gtIntCon.gtIconVal >= INITBLK_UNROLL_LIMIT);
    }
#endif // 0
#endif // DEBUG

    // Marshal the helper arguments: dst -> REG_ARG_0, fill value -> REG_ARG_1,
    // size -> REG_ARG_2, matching the memset-style helper signature.
    genConsumeRegAndCopy(blockSize, REG_ARG_2);
    genConsumeRegAndCopy(initVal, REG_ARG_1);
    genConsumeRegAndCopy(dstAddr, REG_ARG_0);

    genEmitHelperCall(CORINFO_HELP_MEMSET, 0, EA_UNKNOWN);
}


// Generate code for a load from some address + offset
//   base: tree node which can be either a local address or arbitrary node
//   offset: distance from the base from which to load
// NOTE: not yet implemented on ARM64; '#if 0' body is the xarch reference implementation.
void CodeGen::genCodeForLoadOffset(instruction ins, emitAttr size, regNumber dst, GenTree* base, unsigned offset)
{
#if 0
    emitter *emit = getEmitter();

    if (base->OperIsLocalAddr())
    {
        if (base->gtOper == GT_LCL_FLD_ADDR)
            offset += base->gtLclFld.gtLclOffs;
        emit->emitIns_R_S(ins, size, dst, base->gtLclVarCommon.gtLclNum, offset);
    }
    else
    {
        emit->emitIns_R_AR(ins, size, dst, base->gtRegNum, offset);
    }
#else // !0
    NYI("genCodeForLoadOffset");
#endif // !0
}

// Generate code for a store to some address + offset
//   base: tree node which can be either a local address or arbitrary node
//   offset: distance from the base at which to store
// NOTE: not yet implemented on ARM64; '#if 0' body is the xarch reference implementation.
void CodeGen::genCodeForStoreOffset(instruction ins, emitAttr size, regNumber src, GenTree* base, unsigned offset)
{
#if 0
    emitter *emit = getEmitter();

    if (base->OperIsLocalAddr())
    {
        if (base->gtOper == GT_LCL_FLD_ADDR)
            offset += base->gtLclFld.gtLclOffs;
        emit->emitIns_S_R(ins, size, src, base->gtLclVarCommon.gtLclNum, offset);
    }
    else
    {
        emit->emitIns_AR_R(ins, size, src, base->gtRegNum, offset);
    }
#else // !0
    NYI("genCodeForStoreOffset");
#endif // !0
}


// Generates CpBlk code by performing a loop unroll
// Preconditions:
//  The size argument of the CpBlk node is a constant and <= 64 bytes.
//  This may seem small but covers >95% of the cases in several framework assemblies.
// NOTE: not yet implemented on ARM64; '#if 0' body is the xarch (SSE2-based) reference
// implementation kept for when this is ported.
void CodeGen::genCodeForCpBlkUnroll(GenTreeCpBlk* cpBlkNode)
{
#if 0
    // Make sure we got the arguments of the cpblk operation in the right registers
    GenTreePtr blockSize = cpBlkNode->Size();
    GenTreePtr dstAddr = cpBlkNode->Dest();
    GenTreePtr srcAddr = cpBlkNode->Source();

    assert(blockSize->IsCnsIntOrI());
    size_t size = blockSize->gtIntCon.gtIconVal;
    assert(size <= CPBLK_UNROLL_LIMIT);

    emitter *emit = getEmitter();

    if (!srcAddr->isContained())
        genConsumeReg(srcAddr);

    if (!dstAddr->isContained())
        genConsumeReg(dstAddr);

    unsigned offset = 0;

    // If the size of this struct is larger than 16 bytes
    // let's use SSE2 to be able to do 16 byte at a time
    // loads and stores.
    if (size >= XMM_REGSIZE_BYTES)
    {
        assert(cpBlkNode->gtRsvdRegs != RBM_NONE);
        assert(genCountBits(cpBlkNode->gtRsvdRegs) == 1);
        regNumber xmmReg = genRegNumFromMask(cpBlkNode->gtRsvdRegs);
        assert(genIsValidFloatReg(xmmReg));
        size_t slots = size / XMM_REGSIZE_BYTES;

        while (slots-- > 0)
        {
            // Load
            genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmReg, srcAddr, offset);
            // Store
            genCodeForStoreOffset(INS_movdqu, EA_8BYTE, xmmReg, dstAddr, offset);
            offset += XMM_REGSIZE_BYTES;
        }
    }

    // Fill the remainder (15 bytes or less) if there's one.
    if ((size & 0xf) != 0)
    {
        // Grab the integer temp register to emit the remaining loads and stores.
        regNumber tmpReg = genRegNumFromMask(cpBlkNode->gtRsvdRegs & RBM_ALLINT);

        if ((size & 8) != 0)
        {
            genCodeForLoadOffset(INS_mov, EA_8BYTE, tmpReg, srcAddr, offset);
            genCodeForStoreOffset(INS_mov, EA_8BYTE, tmpReg, dstAddr, offset);
            offset += 8;
        }
        if ((size & 4) != 0)
        {
            genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, srcAddr, offset);
            genCodeForStoreOffset(INS_mov, EA_4BYTE, tmpReg, dstAddr, offset);
            offset += 4;
        }
        if ((size & 2) != 0)
        {
            genCodeForLoadOffset(INS_mov, EA_2BYTE, tmpReg, srcAddr, offset);
            genCodeForStoreOffset(INS_mov, EA_2BYTE, tmpReg, dstAddr, offset);
            offset += 2;
        }
        if ((size & 1) != 0)
        {
            genCodeForLoadOffset(INS_mov, EA_1BYTE, tmpReg, srcAddr, offset);
            genCodeForStoreOffset(INS_mov, EA_1BYTE, tmpReg, dstAddr, offset);
        }
    }
#else // !0
    NYI("genCodeForCpBlkUnroll");
#endif // !0
}

// Generate code for CpObj nodes which copy structs that have interleaved
// GC pointers.
// For this case we'll generate a sequence of loads/stores in the case of struct
// slots that don't contain GC pointers.
// The generated code will look like:
// ldr tempReg, [R13, #8]
// str tempReg, [R14, #8]
//
// In the case of a GC-Pointer we'll call the ByRef write barrier helper
// who happens to use the same registers as the previous call to maintain
// the same register requirements and register killsets:
// bl CORINFO_HELP_ASSIGN_BYREF
//
// So finally an example would look like this:
// ldr tempReg, [R13, #8]
// str tempReg, [R14, #8]
// bl CORINFO_HELP_ASSIGN_BYREF
// ldr tempReg, [R13, #8]
// str tempReg, [R14, #8]
// bl CORINFO_HELP_ASSIGN_BYREF
// ldr tempReg, [R13, #8]
// str tempReg, [R14, #8]
void CodeGen::genCodeForCpObj(GenTreeCpObj* cpObjNode)
{
    // Make sure we got the arguments of the cpobj operation in the right registers
    GenTreePtr clsTok = cpObjNode->ClsTok();
    GenTreePtr dstAddr = cpObjNode->Dest();
    GenTreePtr srcAddr = cpObjNode->Source();

    bool dstOnStack = dstAddr->OperIsLocalAddr();

#ifdef DEBUG
    assert(!dstAddr->isContained());
    assert(!srcAddr->isContained());

    // This GenTree node has data about GC pointers, this means we're dealing
    // with CpObj.
    assert(cpObjNode->gtGcPtrCount > 0);
#endif // DEBUG

    // Consume these registers.
    // They may now contain gc pointers (depending on their type; gcMarkRegPtrVal will "do the right thing").
    genConsumeRegAndCopy(srcAddr, REG_WRITE_BARRIER_SRC_BYREF);
    gcInfo.gcMarkRegPtrVal(REG_WRITE_BARRIER_SRC_BYREF, srcAddr->TypeGet());

    genConsumeRegAndCopy(dstAddr, REG_WRITE_BARRIER_DST_BYREF);
    gcInfo.gcMarkRegPtrVal(REG_WRITE_BARRIER_DST_BYREF, dstAddr->TypeGet());

    // Temp register used to perform the sequence of loads and stores.
    regNumber tmpReg = genRegNumFromMask(cpObjNode->gtRsvdRegs);

#ifdef DEBUG
    assert(cpObjNode->gtRsvdRegs != RBM_NONE);
    assert(genCountBits(cpObjNode->gtRsvdRegs) == 1);
    assert(genIsValidIntReg(tmpReg));
#endif // DEBUG

    unsigned slots = cpObjNode->gtSlots;
    emitter *emit = getEmitter();

    // If we can prove it's on the stack we don't need to use the write barrier.
    if (dstOnStack)
    {
        // TODO-ARM64-CQ: Consider using LDP/STP to save codesize.
        // Post-indexed addressing advances the src/dst byref registers by one
        // pointer-sized slot on each copy.
        while (slots > 0)
        {
            emit->emitIns_R_R_I(INS_ldr, EA_8BYTE, tmpReg, REG_WRITE_BARRIER_SRC_BYREF, TARGET_POINTER_SIZE, INS_OPTS_POST_INDEX);
            emit->emitIns_R_R_I(INS_str, EA_8BYTE, tmpReg, REG_WRITE_BARRIER_DST_BYREF, TARGET_POINTER_SIZE, INS_OPTS_POST_INDEX);
            slots--;
        }
    }
    else
    {
        BYTE*        gcPtrs = cpObjNode->gtGcPtrs;
        unsigned gcPtrCount = cpObjNode->gtGcPtrCount;

        unsigned i = 0;
        while (i < slots)
        {
            switch (gcPtrs[i])
            {
            case TYPE_GC_NONE:
                // TODO-ARM64-CQ: Consider using LDP/STP to save codesize in case of contiguous NON-GC slots.
                emit->emitIns_R_R_I(INS_ldr, EA_8BYTE, tmpReg, REG_WRITE_BARRIER_SRC_BYREF, TARGET_POINTER_SIZE, INS_OPTS_POST_INDEX);
                emit->emitIns_R_R_I(INS_str, EA_8BYTE, tmpReg, REG_WRITE_BARRIER_DST_BYREF, TARGET_POINTER_SIZE, INS_OPTS_POST_INDEX);
                break;

            default:
                // We have a GC pointer, call the memory barrier.
                // NOTE(review): no explicit copy/advance is emitted here, so the
                // ASSIGN_BYREF helper presumably performs the copy and advances both
                // byref registers itself — confirm against the helper's contract.
                genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF, 0, EA_PTRSIZE);
                gcPtrCount--;
                break;
            }
            ++i;
        }
        assert(gcPtrCount == 0);
    }

    // Clear the gcInfo for REG_WRITE_BARRIER_SRC_BYREF and REG_WRITE_BARRIER_DST_BYREF.
    // While we normally update GC info prior to the last instruction that uses them,
    // these actually live into the helper call.
    gcInfo.gcMarkRegSetNpt(RBM_WRITE_BARRIER_SRC_BYREF | RBM_WRITE_BARRIER_DST_BYREF);
}

// Generate code for a CpBlk node by the means of the VM memcpy helper call
// Preconditions:
// a) The size argument of the CpBlk is not an integer constant
// b) The size argument is a constant but is larger than CPBLK_MOVS_LIMIT bytes.
void CodeGen::genCodeForCpBlk(GenTreeCpBlk* cpBlkNode)
{
    // Make sure we got the arguments of the cpblk operation in the right registers
    GenTreePtr blockSize = cpBlkNode->Size();
    GenTreePtr dstAddr = cpBlkNode->Dest();
    GenTreePtr srcAddr = cpBlkNode->Source();

    assert(!dstAddr->isContained());
    assert(!srcAddr->isContained());
    assert(!blockSize->isContained());

    // Enable this when we support cpblk loop unrolling.
#if 0
#ifdef DEBUG
    if (blockSize->IsCnsIntOrI())
    {
        assert(blockSize->gtIntCon.gtIconVal >= CPBLK_UNROLL_LIMIT);
    }
#endif // DEBUG
#endif // 0

    // Marshal the helper arguments: dst -> REG_ARG_0, src -> REG_ARG_1,
    // size -> REG_ARG_2, matching the memcpy-style helper signature.
    genConsumeRegAndCopy(blockSize, REG_ARG_2);
    genConsumeRegAndCopy(srcAddr, REG_ARG_1);
    genConsumeRegAndCopy(dstAddr, REG_ARG_0);

    genEmitHelperCall(CORINFO_HELP_MEMCPY, 0, EA_UNKNOWN);
}


// generate code to do a switch statement based on a table of ip-relative offsets
// NOTE: NYI at entry; the commented-out calls below sketch the intended sequence
// (load offset from the table, add the address of fgFirstBB, branch indirect).
void
CodeGen::genTableBasedSwitch(GenTree* treeNode)
{
    NYI("Emit table based switch");
    genConsumeOperands(treeNode->AsOp());
    regNumber idxReg = treeNode->gtOp.gtOp1->gtRegNum;
    regNumber baseReg = treeNode->gtOp.gtOp2->gtRegNum;

    regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);

    // load the ip-relative offset (which is relative to start of fgFirstBB)
    //getEmitter()->emitIns_R_ARX(INS_mov, EA_4BYTE, baseReg, baseReg, idxReg, 4, 0);

    // add it to the absolute address of fgFirstBB
    compiler->fgFirstBB->bbFlags |= BBF_JMP_TARGET;
    //getEmitter()->emitIns_R_L(INS_lea, EA_PTRSIZE, compiler->fgFirstBB, tmpReg);
    //getEmitter()->emitIns_R_R(INS_add, EA_PTRSIZE, baseReg, tmpReg);
    // jmp baseReg
    // getEmitter()->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), baseReg);
}

// emits the table and an instruction to get the address of the first element
// NOTE: NYI at entry; the body below is the intended implementation.
void
CodeGen::genJumpTable(GenTree* treeNode)
{
    NYI("Emit Jump table");
    noway_assert(compiler->compCurBB->bbJumpKind == BBJ_SWITCH);
    assert(treeNode->OperGet() == GT_JMPTABLE);

    unsigned     jumpCount = compiler->compCurBB->bbJumpSwt->bbsCount;
    BasicBlock** jumpTable = compiler->compCurBB->bbJumpSwt->bbsDstTab;
    unsigned     jmpTabOffs;
    unsigned     jmpTabBase;

    jmpTabBase = getEmitter()->emitBBTableDataGenBeg(jumpCount, true);

    jmpTabOffs = 0;

    JITDUMP("\n J_M%03u_DS%02u LABEL DWORD\n", Compiler::s_compMethodsCount, jmpTabBase);

    for (unsigned i = 0; i<jumpCount; i++)
    {
        BasicBlock* target = *jumpTable++;
        noway_assert(target->bbFlags & BBF_JMP_TARGET);

        JITDUMP(" DD L_M%03u_BB%02u\n", Compiler::s_compMethodsCount, target->bbNum);

        getEmitter()->emitDataGenData(i, target);
    }; // NOTE(review): stray ';' after the loop body — harmless empty statement

    getEmitter()->emitDataGenEnd();

    // Access to inline data is 'abstracted' by a special type of static member
    // (produced by eeFindJitDataOffs) which the emitter recognizes as being a reference
    // to constant data, not a real static field.
    getEmitter()->emitIns_R_C(INS_lea,
                              emitTypeSize(TYP_I_IMPL),
                              treeNode->gtRegNum,
                              compiler->eeFindJitDataOffs(jmpTabBase),
                              0);
    genProduceReg(treeNode);
}


// generate code for the locked operations:
// GT_LOCKADD, GT_XCHG, GT_XADD
// NOTE: not yet implemented on ARM64; the '#if 0' body is the xarch (lock-prefix based)
// reference implementation kept for when this is ported.
void
CodeGen::genLockedInstructions(GenTree* treeNode)
{
#if 0
    GenTree* data       = treeNode->gtOp.gtOp2;
    GenTree* addr       = treeNode->gtOp.gtOp1;
    regNumber targetReg = treeNode->gtRegNum;
    regNumber dataReg   = data->gtRegNum;
    regNumber addrReg   = addr->gtRegNum;
    instruction ins;

    // all of these nodes implicitly do an indirection on op1
    // so create a temporary node to feed into the pattern matching
    GenTreeIndir i = indirForm(data->TypeGet(), addr);
    genConsumeReg(addr);

    // The register allocator should have extended the lifetime of the address
    // so that it is not used as the target.
    noway_assert(addrReg != targetReg);

    // If data is a lclVar that's not a last use, we'd better have allocated a register
    // for the result (except in the case of GT_LOCKADD which does not produce a register result).
    assert(targetReg != REG_NA || treeNode->OperGet() == GT_LOCKADD || !genIsRegCandidateLocal(data) || (data->gtFlags & GTF_VAR_DEATH) != 0);

    genConsumeIfReg(data);
    if (targetReg != REG_NA && dataReg != REG_NA && dataReg != targetReg)
    {
        inst_RV_RV(ins_Copy(data->TypeGet()), targetReg, dataReg);
        data->gtRegNum = targetReg;

        // TODO-ARM64-Cleanup: Consider whether it is worth it, for debugging purposes, to restore the
        // original gtRegNum on data, after calling emitInsBinary below.
    }
    switch (treeNode->OperGet())
    {
    case GT_LOCKADD:
        instGen(INS_lock);
        ins = INS_add;
        break;
    case GT_XCHG:
        // lock is implied by xchg
        ins = INS_xchg;
        break;
    case GT_XADD:
        instGen(INS_lock);
        ins = INS_xadd;
        break;
    default:
        unreached();
    }
    getEmitter()->emitInsBinary(ins, emitTypeSize(data), &i, data);

    if (treeNode->gtRegNum != REG_NA)
    {
        genProduceReg(treeNode);
    }
#else // !0
    NYI("genLockedInstructions");
#endif // !0
}


// generate code for BoundsCheck nodes
void
CodeGen::genRangeCheck(GenTreePtr  oper)
{
#ifdef FEATURE_SIMD
    noway_assert(oper->OperGet() == GT_ARR_BOUNDS_CHECK || oper->OperGet() == GT_SIMD_CHK);
#else // !FEATURE_SIMD
    noway_assert(oper->OperGet() == GT_ARR_BOUNDS_CHECK);
#endif // !FEATURE_SIMD

    GenTreeBoundsChk* bndsChk = oper->AsBoundsChk();

    GenTreePtr arrLen    = bndsChk->gtArrLen;
    GenTreePtr arrIndex  = bndsChk->gtIndex;
    // NOTE(review): arrRef and lenOffset are declared but never used in this function.
    GenTreePtr arrRef    = NULL;
    int lenOffset        = 0;

    GenTree *src1, *src2;
    emitJumpKind jmpKind;

    genConsumeRegs(arrLen);
    genConsumeRegs(arrIndex);

    // An unsigned compare handles a negative index for free (it looks huge).
    // When the index is a contained immediate the compare operands are reversed,
    // so the throw condition flips from "index >= length" (jae) to
    // "length <= index" (jbe).
    if (arrIndex->isContainedIntOrIImmed())
    {
        src1 = arrLen;
        src2 = arrIndex;
        jmpKind = EJ_jbe;
    }
    else
    {
        src1 = arrIndex;
        src2 = arrLen;
        jmpKind = EJ_jae;
    }

    GenTreeIntConCommon* intConst = nullptr;
    if (src2->isContainedIntOrIImmed())
    {
        intConst = src2->AsIntConCommon();
    }

    // Array length and index are compared as 32-bit values (EA_4BYTE).
    if (intConst != nullptr)
    {
        getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, src1->gtRegNum, intConst->IconValue());
    }
    else
    {
        getEmitter()->emitIns_R_R(INS_cmp, EA_4BYTE, src1->gtRegNum, src2->gtRegNum);
    }

    genJumpToThrowHlpBlk(jmpKind, Compiler::ACK_RNGCHK_FAIL, bndsChk->gtIndRngFailBB);
}

//------------------------------------------------------------------------
// genOffsetOfMDArrayLowerBound: Returns the offset from the Array object to the
// lower bound for the given dimension.
+// +// Arguments: +// elemType - the element type of the array +// rank - the rank of the array +// dimension - the dimension for which the lower bound offset will be returned. +// +// Return Value: +// The offset. +// TODO-Cleanup: move to CodeGenCommon.cpp + +// static +unsigned +CodeGen::genOffsetOfMDArrayLowerBound(var_types elemType, unsigned rank, unsigned dimension) +{ + // Note that the lower bound and length fields of the Array object are always TYP_INT, even on 64-bit targets. + return compiler->eeGetArrayDataOffset(elemType) + genTypeSize(TYP_INT) * (dimension + rank); +} + +//------------------------------------------------------------------------ +// genOffsetOfMDArrayLength: Returns the offset from the Array object to the +// size for the given dimension. +// +// Arguments: +// elemType - the element type of the array +// rank - the rank of the array +// dimension - the dimension for which the lower bound offset will be returned. +// +// Return Value: +// The offset. +// TODO-Cleanup: move to CodeGenCommon.cpp + +// static +unsigned +CodeGen::genOffsetOfMDArrayDimensionSize(var_types elemType, unsigned rank, unsigned dimension) +{ + // Note that the lower bound and length fields of the Array object are always TYP_INT, even on 64-bit targets. + return compiler->eeGetArrayDataOffset(elemType) + genTypeSize(TYP_INT) * dimension; +} + +//------------------------------------------------------------------------ +// genCodeForArrIndex: Generates code to bounds check the index for one dimension of an array reference, +// producing the effective index by subtracting the lower bound. +// +// Arguments: +// arrIndex - the node for which we're generating code +// +// Return Value: +// None. 
+// + +void +CodeGen::genCodeForArrIndex(GenTreeArrIndex* arrIndex) +{ +#if 0 + GenTreePtr arrObj = arrIndex->ArrObj(); + GenTreePtr indexNode = arrIndex->IndexExpr(); + + regNumber arrReg = genConsumeReg(arrObj); + regNumber indexReg = genConsumeReg(indexNode); + regNumber tgtReg = arrIndex->gtRegNum; + + unsigned dim = arrIndex->gtCurrDim; + unsigned rank = arrIndex->gtArrRank; + var_types elemType = arrIndex->gtArrElemType; + + noway_assert(tgtReg != REG_NA); + + // Subtract the lower bound for this dimension. + // TODO-ARM64-CQ: make this contained if it's an immediate that fits. + if (tgtReg != indexReg) + { + inst_RV_RV(INS_mov, tgtReg, indexReg, indexNode->TypeGet()); + } + getEmitter()->emitIns_R_AR(INS_sub, + emitActualTypeSize(TYP_INT), + tgtReg, + arrReg, + genOffsetOfMDArrayLowerBound(elemType, rank, dim)); + getEmitter()->emitIns_R_AR(INS_cmp, + emitActualTypeSize(TYP_INT), + tgtReg, + arrReg, + genOffsetOfMDArrayDimensionSize(elemType, rank, dim)); + genJumpToThrowHlpBlk(EJ_jae, Compiler::ACK_RNGCHK_FAIL); + + genProduceReg(arrIndex); +#else // !0 + NYI("genCodeForArrIndex"); +#endif // !0 +} + +//------------------------------------------------------------------------ +// genCodeForArrOffset: Generates code to compute the flattened array offset for +// one dimension of an array reference: +// result = (prevDimOffset * dimSize) + effectiveIndex +// where dimSize is obtained from the arrObj operand +// +// Arguments: +// arrOffset - the node for which we're generating code +// +// Return Value: +// None. +// +// Notes: +// dimSize and effectiveIndex are always non-negative, the former by design, +// and the latter because it has been normalized to be zero-based. 
+ +void +CodeGen::genCodeForArrOffset(GenTreeArrOffs* arrOffset) +{ +#if 0 + GenTreePtr offsetNode = arrOffset->gtOffset; + GenTreePtr indexNode = arrOffset->gtIndex; + GenTreePtr arrObj = arrOffset->gtArrObj; + + regNumber tgtReg = arrOffset->gtRegNum; + + noway_assert(tgtReg != REG_NA); + + unsigned dim = arrOffset->gtCurrDim; + unsigned rank = arrOffset->gtArrRank; + var_types elemType = arrOffset->gtArrElemType; + + // We will use a temp register for the offset*scale+effectiveIndex computation. + regMaskTP tmpRegMask = arrOffset->gtRsvdRegs; + regNumber tmpReg = genRegNumFromMask(tmpRegMask); + + if (!offsetNode->IsZero()) + { + // Evaluate tgtReg = offsetReg*dim_size + indexReg. + // tmpReg is used to load dim_size and the result of the multiplication. + // Note that dim_size will never be negative. + regNumber offsetReg = genConsumeReg(offsetNode); + regNumber indexReg = genConsumeReg(indexNode); + regNumber arrReg = genConsumeReg(arrObj); + + getEmitter()->emitIns_R_AR(INS_mov, + emitActualTypeSize(TYP_INT), + tmpReg, + arrReg, + genOffsetOfMDArrayDimensionSize(elemType, rank, dim)); + inst_RV_RV(INS_imul, tmpReg, offsetReg); + + if (tmpReg == tgtReg) + { + inst_RV_RV(INS_add, tmpReg, indexReg); + } + else + { + if (indexReg != tgtReg) + { + inst_RV_RV(INS_mov, tgtReg, indexReg, TYP_I_IMPL); + } + inst_RV_RV(INS_add, tgtReg, tmpReg); + } + } + else + { + regNumber indexReg = genConsumeReg(indexNode); + if (indexReg != tgtReg) + { + inst_RV_RV(INS_mov, tgtReg, indexReg, TYP_INT); + } + } + genProduceReg(arrOffset); +#else // !0 + NYI("genCodeForArrOffset"); +#endif // !0 +} + +// make a temporary indir we can feed to pattern matching routines +// in cases where we don't want to instantiate all the indirs that happen +// +// TODO-Cleanup: move to CodeGenCommon.cpp +GenTreeIndir CodeGen::indirForm(var_types type, GenTree *base) +{ + GenTreeIndir i(GT_IND, type, base, nullptr); + i.gtRegNum = REG_NA; + // has to be nonnull (because contained nodes can't be the 
last in block) + // but don't want it to be a valid pointer + i.gtNext = (GenTree *)(-1); + return i; +} + +// make a temporary int we can feed to pattern matching routines +// in cases where we don't want to instantiate +// +// TODO-Cleanup: move to CodeGenCommon.cpp +GenTreeIntCon CodeGen::intForm(var_types type, ssize_t value) +{ + GenTreeIntCon i(type, value); + i.gtRegNum = REG_NA; + // has to be nonnull (because contained nodes can't be the last in block) + // but don't want it to be a valid pointer + i.gtNext = (GenTree *)(-1); + return i; +} + + +instruction CodeGen::genGetInsForOper(genTreeOps oper, var_types type) +{ + instruction ins = INS_brk; + + if (varTypeIsFloating(type)) + { + switch (oper) + { + case GT_ADD: + ins = INS_fadd; + break; + case GT_SUB: + ins = INS_fsub; + break; + case GT_MUL: + ins = INS_fmul; + break; + case GT_DIV: + ins = INS_fdiv; + break; + case GT_NEG: + ins = INS_fneg; + break; + + default: + NYI("Unhandled oper in genGetInsForOper() - float"); + unreached(); + break; + } + } + else + { + switch (oper) + { + case GT_ADD: + ins = INS_add; + break; + case GT_AND: + ins = INS_and; + break; + case GT_DIV: + ins = INS_sdiv; + break; + case GT_UDIV: + ins = INS_udiv; + break; + case GT_MUL: + ins = INS_mul; + break; + case GT_LSH: + ins = INS_lsl; + break; + case GT_NOT: + ins = INS_mvn; + break; + case GT_OR: + ins = INS_orr; + break; + case GT_RSH: + ins = INS_asr; + break; + case GT_RSZ: + ins = INS_lsr; + break; + case GT_SUB: + ins = INS_sub; + break; + case GT_XOR: + ins = INS_eor; + break; + + default: + NYI("Unhandled oper in genGetInsForOper() - integer"); + unreached(); + break; + } + } + return ins; +} + +/** Generates the code sequence for a GenTree node that + * represents a bit shift operation (<<, >>, >>>). + * + * Arguments: operand: the value to be shifted by shiftBy bits. + * shiftBy: the number of bits to shift the operand. + * parent: the actual bitshift node (that specifies the + * type of bitshift to perform. 
+ * + * Preconditions: a) All GenTrees are register allocated. + * b) Either shiftBy is a contained constant or + * it's an expression sitting in RCX. + * c) The actual bit shift node is not stack allocated + * nor contained (not yet supported). + */ +void CodeGen::genCodeForShift(GenTreePtr operand, + GenTreePtr shiftBy, + GenTreePtr parent) +{ + var_types targetType = parent->TypeGet(); + genTreeOps oper = parent->OperGet(); + instruction ins = genGetInsForOper(oper, targetType); + emitAttr size = emitTypeSize(parent); + + assert(parent->gtRegNum != REG_NA); + genConsumeReg(operand); + + if (!shiftBy->IsCnsIntOrI()) + { + genConsumeReg(shiftBy); + getEmitter()->emitIns_R_R_R(ins, size, parent->gtRegNum, operand->gtRegNum, shiftBy->gtRegNum); + } + else + { + getEmitter()->emitIns_R_R_I(ins, size, parent->gtRegNum, operand->gtRegNum, shiftBy->gtIntCon.gtIconVal); + } + + genProduceReg(parent); +} + +// TODO-Cleanup: move to CodeGenCommon.cpp +void CodeGen::genUnspillRegIfNeeded(GenTree *tree) +{ + regNumber dstReg = tree->gtRegNum; + + GenTree* unspillTree = tree; + if (tree->gtOper == GT_RELOAD) + { + unspillTree = tree->gtOp.gtOp1; + } + if (unspillTree->gtFlags & GTF_SPILLED) + { + if (genIsRegCandidateLocal(unspillTree)) + { + // Reset spilled flag, since we are going to load a local variable from its home location. + unspillTree->gtFlags &= ~GTF_SPILLED; + + GenTreeLclVarCommon* lcl = unspillTree->AsLclVarCommon(); + LclVarDsc* varDsc = &compiler->lvaTable[lcl->gtLclNum]; + + // Load local variable from its home location. + inst_RV_TT(ins_Load(unspillTree->gtType, compiler->isSIMDTypeLocalAligned(lcl->gtLclNum)), dstReg, unspillTree); + + unspillTree->SetInReg(); + + // TODO-Review: We would like to call: + // genUpdateRegLife(varDsc, /*isBorn*/ true, /*isDying*/ false DEBUGARG(tree)); + // instead of the following code, but this ends up hitting this assert: + // assert((regSet.rsMaskVars & regMask) == 0); + // due to issues with LSRA resolution moves. 
+ // So, just force it for now. This probably indicates a condition that creates a GC hole! + // + // Extra note: I think we really want to call something like gcInfo.gcUpdateForRegVarMove, + // because the variable is not really going live or dead, but that method is somewhat poorly + // factored because it, in turn, updates rsMaskVars which is part of RegSet not GCInfo. + // This code exists in other CodeGen*.cpp files. + + // Don't update the variable's location if we are just re-spilling it again. + + if ((unspillTree->gtFlags & GTF_SPILL) == 0) + { + genUpdateVarReg(varDsc, tree); +#ifdef DEBUG + if (VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex)) + { + JITDUMP("\t\t\t\t\t\t\tRemoving V%02u from gcVarPtrSetCur\n", lcl->gtLclNum); + } +#endif // DEBUG + VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex); + +#ifdef DEBUG + if (compiler->verbose) + { + printf("\t\t\t\t\t\t\tV%02u in reg ", lcl->gtLclNum); + varDsc->PrintVarReg(); + printf(" is becoming live "); + compiler->printTreeID(unspillTree); + printf("\n"); + } +#endif // DEBUG + + regSet.rsMaskVars |= genGetRegMask(varDsc); + } + } + else + { + TempDsc* t = regSet.rsUnspillInPlace(unspillTree); + getEmitter()->emitIns_R_S(ins_Load(unspillTree->gtType), + emitActualTypeSize(unspillTree->gtType), + dstReg, + t->tdTempNum(), + 0); + compiler->tmpRlsTemp(t); + + unspillTree->gtFlags &= ~GTF_SPILLED; + unspillTree->SetInReg(); + } + + gcInfo.gcMarkRegPtrVal(dstReg, unspillTree->TypeGet()); + } +} + +// Do Liveness update for a subnodes that is being consumed by codegen +// including the logic for reload in case is needed and also takes care +// of locating the value on the desired register. 
+void CodeGen::genConsumeRegAndCopy(GenTree *tree, regNumber needReg) +{ + regNumber treeReg = genConsumeReg(tree); + if (treeReg != needReg) + { + var_types targetType = tree->TypeGet(); + inst_RV_RV(ins_Copy(targetType), needReg, treeReg, targetType); + } +} + +void CodeGen::genRegCopy(GenTree* treeNode) +{ + assert(treeNode->OperGet() == GT_COPY); + + var_types targetType = treeNode->TypeGet(); + regNumber targetReg = treeNode->gtRegNum; + assert(targetReg != REG_NA); + + GenTree* op1 = treeNode->gtOp.gtOp1; + + // Check whether this node and the node from which we're copying the value have the same + // register type. + // This can happen if (currently iff) we have a SIMD vector type that fits in an integer + // register, in which case it is passed as an argument, or returned from a call, + // in an integer register and must be copied if it's in an xmm register. + + if (varTypeIsFloating(treeNode) != varTypeIsFloating(op1)) + { +#if 0 + instruction ins; + regNumber fpReg; + regNumber intReg; + if(varTypeIsFloating(treeNode)) + { + ins = INS_mov_i2xmm; + fpReg = targetReg; + intReg = op1->gtRegNum; + } + else + { + ins = INS_mov_xmm2i; + intReg = targetReg; + fpReg = op1->gtRegNum; + } + inst_RV_RV(ins, fpReg, intReg, targetType); +#else + NYI_ARM64("CodeGen - FP/Int RegCopy"); +#endif + } + else + { + inst_RV_RV(ins_Copy(targetType), targetReg, genConsumeReg(op1), targetType); + } + + if (op1->IsLocal()) + { + // The lclVar will never be a def. + // If it is a last use, the lclVar will be killed by genConsumeReg(), as usual, and genProduceReg will + // appropriately set the gcInfo for the copied value. + // If not, there are two cases we need to handle: + // - If this is a TEMPORARY copy (indicated by the GTF_VAR_DEATH flag) the variable + // will remain live in its original register. + // genProduceReg() will appropriately set the gcInfo for the copied value, + // and genConsumeReg will reset it. 
+ // - Otherwise, we need to update register info for the lclVar. + + GenTreeLclVarCommon* lcl = op1->AsLclVarCommon(); + assert((lcl->gtFlags & GTF_VAR_DEF) == 0); + + if ((lcl->gtFlags & GTF_VAR_DEATH) == 0 && (treeNode->gtFlags & GTF_VAR_DEATH) == 0) + { + LclVarDsc* varDsc = &compiler->lvaTable[lcl->gtLclNum]; + + // If we didn't just spill it (in genConsumeReg, above), then update the register info + if (varDsc->lvRegNum != REG_STK) + { + // The old location is dying + genUpdateRegLife(varDsc, /*isBorn*/ false, /*isDying*/ true DEBUGARG(op1)); + + gcInfo.gcMarkRegSetNpt(genRegMask(op1->gtRegNum)); + + genUpdateVarReg(varDsc, treeNode); + + // The new location is going live + genUpdateRegLife(varDsc, /*isBorn*/ true, /*isDying*/ false DEBUGARG(treeNode)); + } + } + } + genProduceReg(treeNode); +} + +// Do liveness update for a subnode that is being consumed by codegen. +// TODO-Cleanup: move to CodeGenCommon.cpp +regNumber CodeGen::genConsumeReg(GenTree *tree) +{ + if (tree->OperGet() == GT_COPY) + { + genRegCopy(tree); + } + // Handle the case where we have a lclVar that needs to be copied before use (i.e. because it + // interferes with one of the other sources (or the target, if it's a "delayed use" register)). + // TODO-Cleanup: This is a special copyReg case in LSRA - consider eliminating these and + // always using GT_COPY to make the lclVar location explicit. + // Note that we have to do this before calling genUpdateLife because otherwise if we spill it + // the lvRegNum will be set to REG_STK and we will lose track of what register currently holds + // the lclVar (normally when a lclVar is spilled it is then used from its former register + // location, which matches the gtRegNum on the node). + // (Note that it doesn't matter if we call this before or after genUnspillRegIfNeeded + // because if it's on the stack it will always get reloaded into tree->gtRegNum). 
+ if (genIsRegCandidateLocal(tree)) + { + GenTreeLclVarCommon *lcl = tree->AsLclVarCommon(); + LclVarDsc* varDsc = &compiler->lvaTable[lcl->GetLclNum()]; + if ((varDsc->lvRegNum != REG_STK) && (varDsc->lvRegNum != tree->gtRegNum)) + { + inst_RV_RV(ins_Copy(tree->TypeGet()), tree->gtRegNum, varDsc->lvRegNum); + } + } + + genUnspillRegIfNeeded(tree); + + // genUpdateLife() will also spill local var if marked as GTF_SPILL by calling CodeGen::genSpillVar + genUpdateLife(tree); + assert(tree->gtRegNum != REG_NA); + + // there are three cases where consuming a reg means clearing the bit in the live mask + // 1. it was not produced by a local + // 2. it was produced by a local that is going dead + // 3. it was produced by a local that does not live in that reg (like one allocated on the stack) + + if (genIsRegCandidateLocal(tree)) + { + GenTreeLclVarCommon *lcl = tree->AsLclVarCommon(); + LclVarDsc* varDsc = &compiler->lvaTable[lcl->GetLclNum()]; + assert(varDsc->lvLRACandidate); + + if ((tree->gtFlags & GTF_VAR_DEATH) != 0) + { + gcInfo.gcMarkRegSetNpt(genRegMask(varDsc->lvRegNum)); + } + else if (varDsc->lvRegNum == REG_STK) + { + // We have loaded this into a register only temporarily + gcInfo.gcMarkRegSetNpt(genRegMask(tree->gtRegNum)); + } + } + else + { + gcInfo.gcMarkRegSetNpt(genRegMask(tree->gtRegNum)); + } + + return tree->gtRegNum; +} + +// Do liveness update for an address tree: one of GT_LEA, GT_LCL_VAR, or GT_CNS_INT (for call indirect). 
+// TODO-Cleanup: move to CodeGenCommon.cpp +void CodeGen::genConsumeAddress(GenTree* addr) +{ + if (addr->OperGet() == GT_LEA) + { + genConsumeAddrMode(addr->AsAddrMode()); + } + else if (!addr->isContained()) + { + genConsumeReg(addr); + } +} + +// do liveness update for a subnode that is being consumed by codegen +// TODO-Cleanup: move to CodeGenCommon.cpp +void CodeGen::genConsumeAddrMode(GenTreeAddrMode *addr) +{ + if (addr->Base()) + genConsumeReg(addr->Base()); + if (addr->Index()) + genConsumeReg(addr->Index()); +} + +// TODO-Cleanup: move to CodeGenCommon.cpp +void CodeGen::genConsumeRegs(GenTree* tree) +{ + if (tree->isContained()) + { + if (tree->isIndir()) + { + genConsumeAddress(tree->AsIndir()->Addr()); + } + else if (tree->OperGet() == GT_AND) + { + // This is the special contained GT_AND that we created in Lowering::LowerCmp() + // Now we need to consume the operands of the GT_AND node. + genConsumeOperands(tree->AsOp()); + } + else + { + assert(tree->OperIsLeaf()); + } + } + else + { + genConsumeReg(tree); + } +} + +//------------------------------------------------------------------------ +// genConsumeOperands: Do liveness update for the operands of a unary or binary tree +// +// Arguments: +// tree - the GenTreeOp whose operands will have their liveness updated. +// +// Return Value: +// None. +// +// Notes: +// Note that this logic is localized here because we must do the liveness update in +// the correct execution order. This is important because we may have two operands +// that involve the same lclVar, and if one is marked "lastUse" we must handle it +// after the first. 
+// TODO-Cleanup: move to CodeGenCommon.cpp + +void CodeGen::genConsumeOperands(GenTreeOp* tree) +{ + GenTree* firstOp = tree->gtOp1; + GenTree* secondOp = tree->gtOp2; + if ((tree->gtFlags & GTF_REVERSE_OPS) != 0) + { + assert(secondOp != nullptr); + firstOp = secondOp; + secondOp = tree->gtOp1; + } + if (firstOp != nullptr) + { + genConsumeRegs(firstOp); + } + if (secondOp != nullptr) + { + genConsumeRegs(secondOp); + } +} + +// do liveness update for register produced by the current node in codegen +// TODO-Cleanup: move to CodeGenCommon.cpp +void CodeGen::genProduceReg(GenTree *tree) +{ + if (tree->gtFlags & GTF_SPILL) + { + if (genIsRegCandidateLocal(tree)) + { + // Store local variable to its home location. + tree->gtFlags &= ~GTF_REG_VAL; + inst_TT_RV(ins_Store(tree->gtType, compiler->isSIMDTypeLocalAligned(tree->gtLclVarCommon.gtLclNum)), tree, tree->gtRegNum); + } + else + { + tree->SetInReg(); + regSet.rsSpillTree(tree->gtRegNum, tree); + tree->gtFlags |= GTF_SPILLED; + tree->gtFlags &= ~GTF_SPILL; + gcInfo.gcMarkRegSetNpt(genRegMask(tree->gtRegNum)); + return; + } + } + + genUpdateLife(tree); + + // If we've produced a register, mark it as a pointer, as needed. + if (tree->gtHasReg()) + { + // We only mark the register in the following cases: + // 1. It is not a register candidate local. In this case, we're producing a + // register from a local, but the local is not a register candidate. Thus, + // we must be loading it as a temp register, and any "last use" flag on + // the register wouldn't be relevant. + // 2. The register candidate local is going dead. There's no point to mark + // the register as live, with a GC pointer, if the variable is dead. 
+ if (!genIsRegCandidateLocal(tree) || + ((tree->gtFlags & GTF_VAR_DEATH) == 0)) + { + gcInfo.gcMarkRegPtrVal(tree->gtRegNum, tree->TypeGet()); + } + } + tree->SetInReg(); +} + +// transfer gc/byref status of src reg to dst reg +// TODO-Cleanup: move to CodeGenCommon.cpp +void CodeGen::genTransferRegGCState(regNumber dst, regNumber src) +{ + regMaskTP srcMask = genRegMask(src); + regMaskTP dstMask = genRegMask(dst); + + if (gcInfo.gcRegGCrefSetCur & srcMask) + { + gcInfo.gcMarkRegSetGCref(dstMask); + } + else if (gcInfo.gcRegByrefSetCur & srcMask) + { + gcInfo.gcMarkRegSetByref(dstMask); + } + else + { + gcInfo.gcMarkRegSetNpt(dstMask); + } +} + + +// generates an ip-relative call or indirect call via reg ('call reg') +// pass in 'addr' for a relative call or 'base' for a indirect register call +// methHnd - optional, only used for pretty printing +// retSize - emitter type of return for GC purposes, should be EA_BYREF, EA_GCREF, or EA_PTRSIZE(not GC) +// TODO-Cleanup: move to CodeGenCommon.cpp +void CodeGen::genEmitCall(int callType, + CORINFO_METHOD_HANDLE methHnd, + INDEBUG_LDISASM_COMMA(CORINFO_SIG_INFO* sigInfo) + void* addr, + emitAttr retSize, + IL_OFFSETX ilOffset, + regNumber base, + bool isJump, + bool isNoGC) +{ + + getEmitter()->emitIns_Call(emitter::EmitCallType(callType), + methHnd, + INDEBUG_LDISASM_COMMA(sigInfo) + addr, + 0, + retSize, + gcInfo.gcVarPtrSetCur, + gcInfo.gcRegGCrefSetCur, + gcInfo.gcRegByrefSetCur, + ilOffset, + base, REG_NA, 0, 0, + isJump, + emitter::emitNoGChelper(compiler->eeGetHelperNum(methHnd))); +} + +// generates an indirect call via addressing mode (call []) given an indir node +// methHnd - optional, only used for pretty printing +// retSize - emitter type of return for GC purposes, should be EA_BYREF, EA_GCREF, or EA_PTRSIZE(not GC) +// TODO-Cleanup: move to CodeGenCommon.cpp +void CodeGen::genEmitCall(int callType, + CORINFO_METHOD_HANDLE methHnd, + INDEBUG_LDISASM_COMMA(CORINFO_SIG_INFO* sigInfo) + GenTreeIndir* indir, 
+ emitAttr retSize, + IL_OFFSETX ilOffset) +{ + genConsumeAddress(indir->Addr()); + + getEmitter()->emitIns_Call(emitter::EmitCallType(callType), + methHnd, + INDEBUG_LDISASM_COMMA(sigInfo) + nullptr, + 0, + retSize, + gcInfo.gcVarPtrSetCur, + gcInfo.gcRegGCrefSetCur, + gcInfo.gcRegByrefSetCur, + ilOffset, + indir->Base() ? indir->Base()->gtRegNum : REG_NA, + indir->Index() ? indir->Index()->gtRegNum : REG_NA, + indir->Scale(), + indir->Offset()); +} + +// Produce code for a GT_CALL node +void CodeGen::genCallInstruction(GenTreePtr node) +{ + GenTreeCall *call = node->AsCall(); + + assert(call->gtOper == GT_CALL); + + gtCallTypes callType = (gtCallTypes)call->gtCallType; + + IL_OFFSETX ilOffset = BAD_IL_OFFSET; + + // all virtuals should have been expanded into a control expression + assert (!call->IsVirtual() || call->gtControlExpr || call->gtCallAddr); + + // Consume all the arg regs + for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext()) + { + assert(list->IsList()); + + GenTreePtr argNode = list->Current(); + + fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, argNode->gtSkipReloadOrCopy()); + assert(curArgTabEntry); + + if (curArgTabEntry->regNum == REG_STK) + continue; + + regNumber argReg = curArgTabEntry->regNum; + genConsumeReg(argNode); + if (argNode->gtRegNum != argReg) + { + inst_RV_RV(ins_Move_Extend(argNode->TypeGet(), argNode->InReg()), argReg, argNode->gtRegNum); + } + + // In the case of a varargs call, + // the ABI dictates that if we have floating point args, + // we must pass the enregistered arguments in both the + // integer and floating point registers so, let's do that. + if (call->IsVarargs() && varTypeIsFloating(argNode)) + { + NYI_ARM64("CodeGen - IsVarargs"); + } + } + + // Insert a null check on "this" pointer if asked. 
+ if (call->NeedsNullCheck()) + { + const regNumber regThis = genGetThisArgReg(call); + getEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, regThis, 0); + } + + // Either gtControlExpr != null or gtCallAddr != null or it is a direct non-virtual call to a user or helper method. + CORINFO_METHOD_HANDLE methHnd; + GenTree* target = call->gtControlExpr; + if (callType == CT_INDIRECT) + { + assert(target == nullptr); + target = call->gtCall.gtCallAddr; + methHnd = nullptr; + } + else + { + methHnd = call->gtCallMethHnd; + } + + CORINFO_SIG_INFO* sigInfo = nullptr; +#ifdef DEBUG + // Pass the call signature information down into the emitter so the emitter can associate + // native call sites with the signatures they were generated from. + if (callType != CT_HELPER) + { + sigInfo = call->callSig; + } +#endif // DEBUG + + // If fast tail call, then we are done. In this case we setup the args (both reg args + // and stack args in incoming arg area) and call target in rax. Epilog sequence would + // generate "br x0". + if (call->IsFastTailCall()) + { + NYI_ARM64("CodeGen - IsFastTailCall"); + + // Don't support fast tail calling JIT helpers + assert(callType != CT_HELPER); + + // Fast tail calls materialize call target either in gtControlExpr or in gtCallAddr. + assert(target != nullptr); + + genConsumeReg(target); +#if 0 + if (target->gtRegNum != REG_RAX) + { + inst_RV_RV(INS_mov, REG_RAX, target->gtRegNum); + } +#endif + return; + } + + // For a pinvoke to unmanged code we emit a label to clear + // the GC pointer state before the callsite. + // We can't utilize the typical lazy killing of GC pointers + // at (or inside) the callsite. + if (call->IsUnmanaged()) + { + genDefineTempLabel(genCreateTempLabel()); + } + + // Determine return value size. 
+ emitAttr retSize = EA_PTRSIZE; + if (call->gtType == TYP_REF || + call->gtType == TYP_ARRAY) + { + retSize = EA_GCREF; + } + else if (call->gtType == TYP_BYREF) + { + retSize = EA_BYREF; + } + +#ifdef DEBUGGING_SUPPORT + // We need to propagate the IL offset information to the call instruction, so we can emit + // an IL to native mapping record for the call, to support managed return value debugging. + // We don't want tail call helper calls that were converted from normal calls to get a record, + // so we skip this hash table lookup logic in that case. + if (compiler->opts.compDbgInfo && compiler->genCallSite2ILOffsetMap != nullptr && !call->IsTailCall()) + { + (void)compiler->genCallSite2ILOffsetMap->Lookup(call, &ilOffset); + } +#endif // DEBUGGING_SUPPORT + + if (target != nullptr) + { + // For Arm64 a call target can not be a contained indirection + assert(!target->isContainedIndir()); + + // We have already generated code for gtControlExpr evaluating it into a register. + // We just need to emit "call reg" in this case. + // + assert(genIsValidIntReg(target->gtRegNum)); + + genEmitCall(emitter::EC_INDIR_R, + methHnd, + INDEBUG_LDISASM_COMMA(sigInfo) + nullptr, //addr + retSize, + ilOffset, + genConsumeReg(target)); + } + else + { + // Generate a direct call to a non-virtual user defined or helper method + assert(callType == CT_HELPER || callType == CT_USER_FUNC); + + void *addr = nullptr; + if (callType == CT_HELPER) + { + // Direct call to a helper method. + CorInfoHelpFunc helperNum = compiler->eeGetHelperNum(methHnd); + noway_assert(helperNum != CORINFO_HELP_UNDEF); + + void *pAddr = nullptr; + addr = compiler->compGetHelperFtn(helperNum, (void **)&pAddr); + + if (addr == nullptr) + { + addr = pAddr; + } + } + else + { + // Direct call to a non-virtual user function. 
+ CORINFO_ACCESS_FLAGS aflags = CORINFO_ACCESS_ANY; + if (call->IsSameThis()) + { + aflags = (CORINFO_ACCESS_FLAGS)(aflags | CORINFO_ACCESS_THIS); + } + + if ((call->NeedsNullCheck()) == 0) + { + aflags = (CORINFO_ACCESS_FLAGS)(aflags | CORINFO_ACCESS_NONNULL); + } + + CORINFO_CONST_LOOKUP addrInfo; + compiler->info.compCompHnd->getFunctionEntryPoint(methHnd, &addrInfo, aflags); + + addr = addrInfo.addr; + } +#if 0 + // Use this path if you want to load an absolute call target using + // a sequence of movs followed by an indirect call (blr instruction) + + // Load the call target address in x16 + instGen_Set_Reg_To_Imm(EA_8BYTE, REG_IP0, (ssize_t) addr); + + // indirect call to constant address in IP0 + genEmitCall(emitter::EC_INDIR_R, + methHnd, + INDEBUG_LDISASM_COMMA(sigInfo) + nullptr, //addr + retSize, + ilOffset, + REG_IP0); +#else + // Non-virtual direct call to known addresses + genEmitCall(emitter::EC_FUNC_TOKEN, + methHnd, + INDEBUG_LDISASM_COMMA(sigInfo) + addr, + retSize, + ilOffset); +#endif + } + + // if it was a pinvoke we may have needed to get the address of a label + if (genPendingCallLabel) + { + assert(call->IsUnmanaged()); + genDefineTempLabel(genPendingCallLabel); + genPendingCallLabel = nullptr; + } + + // Update GC info: + // All Callee arg registers are trashed and no longer contain any GC pointers. + // TODO-ARM64-Bug?: As a matter of fact shouldn't we be killing all of callee trashed regs here? + // For now we will assert that other than arg regs gc ref/byref set doesn't contain any other + // registers from RBM_CALLEE_TRASH + assert((gcInfo.gcRegGCrefSetCur & (RBM_CALLEE_TRASH & ~RBM_ARG_REGS)) == 0); + assert((gcInfo.gcRegByrefSetCur & (RBM_CALLEE_TRASH & ~RBM_ARG_REGS)) == 0); + gcInfo.gcRegGCrefSetCur &= ~RBM_ARG_REGS; + gcInfo.gcRegByrefSetCur &= ~RBM_ARG_REGS; + + var_types returnType = call->TypeGet(); + if (returnType != TYP_VOID) + { + regNumber returnReg = (varTypeIsFloating(returnType) ? 
REG_FLOATRET : REG_INTRET); + if (call->gtRegNum != returnReg) + { + inst_RV_RV(ins_Copy(returnType), call->gtRegNum, returnReg, returnType); + } + genProduceReg(call); + } + + // If there is nothing next, that means the result is thrown away, so this value is not live. + // However, for minopts or debuggable code, we keep it live to support managed return value debugging. + if ((call->gtNext == nullptr) && !compiler->opts.MinOpts() && !compiler->opts.compDbgCode) + { + gcInfo.gcMarkRegSetNpt(RBM_INTRET); + } +} + +// Produce code for a GT_JMP node. +// The arguments of the caller needs to be transferred to the callee before exiting caller. +// The actual jump to callee is generated as part of caller epilog sequence. +// Therefore the codegen of GT_JMP is to ensure that the callee arguments are correctly setup. +void CodeGen::genJmpMethod(GenTreePtr jmp) +{ + assert(jmp->OperGet() == GT_JMP); + assert(compiler->compJmpOpUsed); + + // If no arguments, nothing to do + if (compiler->info.compArgsCount == 0) + { + return; + } + +#if 0 + // Make sure register arguments are in their initial registers + // and stack arguments are put back as well. + unsigned varNum; + LclVarDsc* varDsc; + + // First move any en-registered stack arguments back to the stack. + // At the same time any reg arg not in correct reg is moved back to its stack location. + // + // We are not strictly required to spill reg args that are not in the desired reg for a jmp call + // But that would require us to deal with circularity while moving values around. Spilling + // to stack makes the implementation simple, which is not a bad trade off given Jmp calls + // are not frequent. 
+ for (varNum = 0; (varNum < compiler->info.compArgsCount); varNum++) + { + varDsc = compiler->lvaTable + varNum; + + if (varDsc->lvPromoted) + { + noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here + + unsigned fieldVarNum = varDsc->lvFieldLclStart; + varDsc = compiler->lvaTable + fieldVarNum; + } + noway_assert(varDsc->lvIsParam); + + if (varDsc->lvIsRegArg && (varDsc->lvRegNum != REG_STK)) + { + // Skip reg args which are already in its right register for jmp call. + // If not, we will spill such args to their stack locations. + // + // If we need to generate a tail call profiler hook, then spill all + // arg regs to free them up for the callback. + if (!compiler->compIsProfilerHookNeeded() && (varDsc->lvRegNum == varDsc->lvArgReg)) + continue; + } + else if (varDsc->lvRegNum == REG_STK) + { + // Skip args which are currently living in stack. + continue; + } + + // If we came here it means either a reg argument not in the right register or + // a stack argument currently living in a register. In either case the following + // assert should hold. + assert(varDsc->lvRegNum != REG_STK); + + var_types loadType = varDsc->lvaArgType(); + getEmitter()->emitIns_S_R(ins_Store(loadType), emitTypeSize(loadType), varDsc->lvRegNum, varNum, 0); + + // Update lvRegNum life and GC info to indicate lvRegNum is dead and varDsc stack slot is going live. + // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it. + // Therefore manually update life of varDsc->lvRegNum. + regMaskTP tempMask = genRegMask(varDsc->lvRegNum); + regSet.rsMaskVars &= ~tempMask; + gcInfo.gcMarkRegSetNpt(tempMask); + if (varDsc->lvTracked) + { + VarSetOps::AddElemD(compiler, gcInfo.gcVarPtrSetCur, varNum); + } + } + +#ifdef PROFILING_SUPPORTED + // At this point all arg regs are free. + // Emit tail call profiler callback. 
+ genProfilingLeaveCallback(CORINFO_HELP_PROF_FCN_TAILCALL); +#endif + + // Next move any un-enregistered register arguments back to their register. + regMaskTP fixedIntArgMask = RBM_NONE; // tracks the int arg regs occupying fixed args in case of a vararg method. + unsigned firstArgVarNum = BAD_VAR_NUM; // varNum of the first argument in case of a vararg method. + for (varNum = 0; (varNum < compiler->info.compArgsCount); varNum++) + { + varDsc = compiler->lvaTable + varNum; + if (varDsc->lvPromoted) + { + noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here + + unsigned fieldVarNum = varDsc->lvFieldLclStart; + varDsc = compiler->lvaTable + fieldVarNum; + } + noway_assert(varDsc->lvIsParam); + + // Skip if arg not passed in a register. + if (!varDsc->lvIsRegArg) + continue; + + // Register argument + noway_assert(isRegParamType(genActualType(varDsc->TypeGet()))); + + // Is register argument already in the right register? + // If not load it from its stack location. + var_types loadType = varDsc->lvaArgType(); + regNumber argReg = varDsc->lvArgReg; // incoming arg register + + if (varDsc->lvRegNum != argReg) + { + assert(genIsValidReg(argReg)); + + getEmitter()->emitIns_R_S(ins_Load(loadType), emitTypeSize(loadType), argReg, varNum, 0); + + // Update argReg life and GC Info to indicate varDsc stack slot is dead and argReg is going live. + // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it. + // Therefore manually update life of argReg. Note that GT_JMP marks the end of the basic block + // and after which reg life and gc info will be recomputed for the new block in genCodeForBBList(). + regSet.rsMaskVars |= genRegMask(argReg); + gcInfo.gcMarkRegPtrVal(argReg, loadType); + if (varDsc->lvTracked) + { + VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varNum); + } + } + + // In case of a jmp call to a vararg method also pass the float/double arg in the corresponding int arg register. 
+ if (compiler->info.compIsVarArgs) + { + regNumber intArgReg; + if (varTypeIsFloating(loadType)) + { + intArgReg = compiler->getCallArgIntRegister(argReg); + inst_RV_RV(INS_mov_xmm2i, argReg, intArgReg, loadType); + } + else + { + intArgReg = argReg; + } + + fixedIntArgMask |= genRegMask(intArgReg); + + if (intArgReg == REG_ARG_0) + { + assert(firstArgVarNum == BAD_VAR_NUM); + firstArgVarNum = varNum; + } + } + } + + // Jmp call to a vararg method - if the method has fewer than 4 fixed arguments, + // load the remaining arg registers (both int and float) from the corresponding + // shadow stack slots. This is for the reason that we don't know the number and type + // of non-fixed params passed by the caller, therefore we have to assume the worst case + // of caller passing float/double args both in int and float arg regs. + // + // The caller could have passed gc-ref/byref type var args. Since these are var args + // the callee no way of knowing their gc-ness. Therefore, mark the region that loads + // remaining arg registers from shadow stack slots as non-gc interruptible. 
+ if (fixedIntArgMask != RBM_NONE) + { + assert(compiler->info.compIsVarArgs); + assert(firstArgVarNum != BAD_VAR_NUM); + + regMaskTP remainingIntArgMask = RBM_ARG_REGS & ~fixedIntArgMask; + if (remainingIntArgMask != RBM_NONE) + { + getEmitter()->emitDisableGC(); + for (int argNum = 0, argOffset=0; argNum < MAX_REG_ARG; ++argNum) + { + regNumber argReg = intArgRegs[argNum]; + regMaskTP argRegMask = genRegMask(argReg); + + if ((remainingIntArgMask & argRegMask) != 0) + { + remainingIntArgMask &= ~argRegMask; + getEmitter()->emitIns_R_S(INS_mov, EA_8BYTE, argReg, firstArgVarNum, argOffset); + + // also load it in corresponding float arg reg + regNumber floatReg = compiler->getCallArgFloatRegister(argReg); + inst_RV_RV(INS_mov_i2xmm, floatReg, argReg); + } + + argOffset += REGSIZE_BYTES; + } + getEmitter()->emitEnableGC(); + } + } +#else // !0 + NYI("genJmpMethod"); +#endif // !0 +} + +// produce code for a GT_LEA subnode +void CodeGen::genLeaInstruction(GenTreeAddrMode *lea) +{ + genConsumeOperands(lea); + emitter *emit = getEmitter(); + emitAttr size = emitTypeSize(lea); + + // In ARM64 we can only load addresses of the form: + // + // [Base + index*scale] + // [Base + Offset] + // [Literal] (PC-Relative) + // + // So for the case of a LEA node of the form [Base + Index*Scale + Offset] we will generate: + // destReg = baseReg + indexReg * scale; + // destReg = destReg + offset; + // + // TODO-ARM64-CQ: The purpose of the GT_LEA node is to directly reflect a single target architecture + // addressing mode instruction. Currently we're 'cheating' by producing one or more + // instructions to generate the addressing mode so we need to modify lowering to + // produce LEAs that are a 1:1 relationship to the ARM64 architecture. 
+ if (lea->Base() && lea->Index()) + { + DWORD lsl; + + assert(isPow2(lea->gtScale)); + BitScanForward(&lsl, lea->gtScale); + + assert(lsl <= 4); + + // First, generate code to load rd = [base + index*scale] + if (lsl > 0) + { + emit->emitIns_R_R_R_I(INS_add, size, lea->gtRegNum, lea->Base()->gtRegNum, lea->Index()->gtRegNum, lsl, INS_OPTS_LSL); + } + else + { + emit->emitIns_R_R_R(INS_add, size, lea->gtRegNum, lea->Base()->gtRegNum, lea->Index()->gtRegNum); + } + // If the offset is not zero, then compute rd = [rd + offset] + if (lea->gtOffset != 0) + { + emit->emitIns_R_R_I(INS_add, size, lea->gtRegNum, lea->gtRegNum, (int) lea->gtOffset); + } + } + else if (lea->Base()) + { + if (lea->gtOffset != 0) + { + emit->emitIns_R_R_I(INS_add, size, lea->gtRegNum, lea->Base()->gtRegNum, (int) lea->gtOffset); + } + else + { + emit->emitIns_R_R(INS_mov, size, lea->gtRegNum, lea->Base()->gtRegNum); + } + } + else if (lea->Index()) + { + // If we encounter a GT_LEA node without a base it means it came out + // when attempting to optimize an arbitrary arithmetic expression during lower. + // This is currently disabled in ARM64 since we need to adjust lower to account + // for the simpler instructions ARM64 supports. + // TODO-ARM64-CQ: Fix this and let LEA optimize arithmetic trees too. 
        // Lowering is expected never to produce a base-less LEA for ARM64 (see the
        // comment above): reaching here indicates a lowering bug, not a codegen case.
        assert(!"We shouldn't see a baseless address computation during CodeGen for ARM64");
    }

    genProduceReg(lea);
}

// Generate code to materialize a condition into a register
// (the condition codes must already have been appropriately set)
//
// Arguments:
//    dstReg - the destination integer register to materialize the condition into
//    tree   - the compare-like node whose oper (and GTF_UNSIGNED flag) selects the condition
//
void CodeGen::genSetRegToCond(regNumber dstReg, GenTreePtr tree)
{
    // Get the "jmpKind" using the gtOper kind
    // Note that whether it is an unsigned cmp is governed by the GTF_UNSIGNED flags

    emitJumpKind jmpKind = genJumpKindForOper(tree->gtOper, (tree->gtFlags & GTF_UNSIGNED) != 0);

    // NOTE(review): inst_SET presumably emits a cset-style instruction writing 0/1
    // into dstReg based on the current flags -- confirm against the emitter.
    inst_SET(jmpKind, dstReg);
}

//------------------------------------------------------------------------
// genIntToIntCast: Generate code for an integer cast
// This method handles integer overflow checking casts
// as well as ordinary integer casts.
//
// Arguments:
//    treeNode - The GT_CAST node
//
// Return Value:
//    None.
//
// Assumptions:
//    The treeNode is not a contained node and must have an assigned register.
//    For a signed convert from byte, the source must be in a byte-addressable register.
//    Neither the source nor target type can be a floating point type.
//
// TODO-ARM64-CQ: Allow castOp to be a contained node without an assigned register.
+// +void CodeGen::genIntToIntCast(GenTreePtr treeNode) +{ + assert(treeNode->OperGet() == GT_CAST); + + GenTreePtr castOp = treeNode->gtCast.CastOp(); + emitter * emit = getEmitter(); + + var_types dstType = treeNode->CastToType(); + var_types srcType = genActualType(castOp->TypeGet()); + emitAttr movSize = emitActualTypeSize(dstType); + bool movRequired = false; + + bool isUnsignedDst = varTypeIsUnsigned(dstType); + bool isUnsignedSrc = varTypeIsUnsigned(srcType); + + bool requiresOverflowCheck = false; + + regNumber targetReg = treeNode->gtRegNum; + regNumber sourceReg = castOp->gtRegNum; + + assert(genIsValidIntReg(targetReg)); + assert(genIsValidIntReg(sourceReg)); + + instruction ins = INS_invalid; + + // If necessary, force the srcType to unsigned when the GT_UNSIGNED flag is set. + if (!isUnsignedSrc && (treeNode->gtFlags & GTF_UNSIGNED) != 0) + { + srcType = genUnsignedType(srcType); + isUnsignedSrc = true; + } + + if (treeNode->gtOverflow() && (genTypeSize(srcType) >= genTypeSize(dstType) || (srcType == TYP_INT && dstType == TYP_ULONG))) + { + requiresOverflowCheck = true; + } + + genConsumeReg(castOp); + + if (requiresOverflowCheck) + { + emitAttr cmpSize = EA_ATTR(genTypeSize(srcType)); + ssize_t typeMin = 0; + ssize_t typeMax = 0; + ssize_t typeMask = 0; + bool signCheckOnly = false; + + /* Do we need to compare the value, or just check masks */ + + switch (dstType) + { + case TYP_BYTE: + typeMask = ssize_t((int)0xFFFFFF80); + typeMin = SCHAR_MIN; + typeMax = SCHAR_MAX; + break; + + case TYP_UBYTE: + typeMask = ssize_t((int)0xFFFFFF00L); + break; + + case TYP_SHORT: + typeMask = ssize_t((int)0xFFFF8000); + typeMin = SHRT_MIN; + break; + + case TYP_CHAR: + typeMask = ssize_t((int)0xFFFF0000L); + break; + + case TYP_INT: + if (srcType == TYP_UINT) + { + signCheckOnly = true; + } + else + { + typeMask = 0xFFFFFFFF80000000LL; + typeMin = INT_MIN; + typeMax = INT_MAX; + } + break; + + case TYP_UINT: + if (srcType == TYP_INT) + { + signCheckOnly = true; + } 
+ else + { + typeMask = 0xFFFFFFFF00000000LL; + } + break; + + case TYP_LONG: + noway_assert(srcType == TYP_ULONG); + signCheckOnly = true; + break; + + case TYP_ULONG: + noway_assert((srcType == TYP_LONG) || (srcType == TYP_INT)); + signCheckOnly = true; + break; + + default: + NO_WAY("Unknown type"); + return; + } + + if (signCheckOnly) + { + // We only need to check for a negative value in sourceReg + emit->emitIns_R_I(INS_cmp, cmpSize, sourceReg, 0); + genJumpToThrowHlpBlk(EJ_jl, Compiler::ACK_OVERFLOW); + if (dstType == TYP_ULONG) + { + // cast to TYP_ULONG: + // We use a mov with size=EA_4BYTE + // which will zero out the upper bits + movSize = EA_4BYTE; + movRequired = true; + } + } + else + { + // When we are converting from/to unsigned, + // we only have to check for any bits set in 'typeMask' + if (isUnsignedSrc || isUnsignedDst) + { + noway_assert(typeMask != 0); + emit->emitIns_R_I(INS_tst, cmpSize, sourceReg, typeMask); + genJumpToThrowHlpBlk(EJ_jne, Compiler::ACK_OVERFLOW); + } + else + { + // For a narrowing signed cast + // + // We must check the value is in a signed range. + + // Compare with the MAX + + noway_assert((typeMin != 0) && (typeMax != 0)); + + emit->emitIns_R_I(INS_cmp, cmpSize, sourceReg, typeMax); + genJumpToThrowHlpBlk(EJ_jg, Compiler::ACK_OVERFLOW); + + // Compare with the MIN + + emit->emitIns_R_I(INS_cmp, cmpSize, sourceReg, typeMin); + genJumpToThrowHlpBlk(EJ_jl, Compiler::ACK_OVERFLOW); + } + } + ins = INS_mov; + } + else // Non-overflow checking cast. 
+ { + if (genTypeSize(srcType) == genTypeSize(dstType)) + { + ins = INS_mov; + } + else + { + var_types extendType; + + if (genTypeSize(srcType) < genTypeSize(dstType)) + { + extendType = srcType; + if (srcType == TYP_UINT) + { + movSize = EA_4BYTE; // force a mov EA_4BYTE to zero the upper bits + movRequired = true; + } + } + else // (genTypeSize(srcType) > genTypeSize(dstType)) + { + extendType = dstType; + if (dstType == TYP_INT) + { + movSize = EA_8BYTE; // a sxtw instruction requires EA_8BYTE + } + } + + ins = ins_Move_Extend(extendType, castOp->InReg()); + } + } + + if ((ins != INS_mov) || movRequired || (targetReg != sourceReg)) + { + emit->emitIns_R_R(ins, movSize, targetReg, sourceReg); + } + + genProduceReg(treeNode); +} + +//------------------------------------------------------------------------ +// genFloatToFloatCast: Generate code for a cast between float and double +// +// Arguments: +// treeNode - The GT_CAST node +// +// Return Value: +// None. +// +// Assumptions: +// Cast is a non-overflow conversion. +// The treeNode must have an assigned register. +// The cast is between float and double or vice versa. +// +void +CodeGen::genFloatToFloatCast(GenTreePtr treeNode) +{ + // float <--> double conversions are always non-overflow ones + assert(treeNode->OperGet() == GT_CAST); + assert(!treeNode->gtOverflow()); + + regNumber targetReg = treeNode->gtRegNum; + assert(genIsValidFloatReg(targetReg)); + + GenTreePtr op1 = treeNode->gtOp.gtOp1; + assert(!op1->isContained()); // Cannot be contained + assert(genIsValidFloatReg(op1->gtRegNum)); // Must be a valid float reg. + + var_types dstType = treeNode->CastToType(); + var_types srcType = op1->TypeGet(); + assert(varTypeIsFloating(srcType) && varTypeIsFloating(dstType)); + assert(srcType != dstType); // Must specify two different types + + insOpts cvtOption = (srcType == TYP_FLOAT) ? 
INS_OPTS_S_TO_D // convert Single to Double + : INS_OPTS_D_TO_S; // convert Double to Single + + genConsumeOperands(treeNode->AsOp()); + + // treeNode must be a reg + assert(!treeNode->isContained()); + + getEmitter()->emitIns_R_R(INS_fcvt, emitTypeSize(treeNode), treeNode->gtRegNum, op1->gtRegNum, cvtOption); + + genProduceReg(treeNode); +} + +//------------------------------------------------------------------------ +// genIntToFloatCast: Generate code to cast an int/long to float/double +// +// Arguments: +// treeNode - The GT_CAST node +// +// Return Value: +// None. +// +// Assumptions: +// Cast is a non-overflow conversion. +// The treeNode must have an assigned register. +// SrcType= int32/uint32/int64/uint64 and DstType=float/double. +// +void +CodeGen::genIntToFloatCast(GenTreePtr treeNode) +{ + // int type --> float/double conversions are always non-overflow ones + assert(treeNode->OperGet() == GT_CAST); + assert(!treeNode->gtOverflow()); + + regNumber targetReg = treeNode->gtRegNum; + assert(genIsValidFloatReg(targetReg)); + + GenTreePtr op1 = treeNode->gtOp.gtOp1; + assert(!op1->isContained()); // Cannot be contained + assert(genIsValidIntReg(op1->gtRegNum)); // Must be a valid int reg. + + var_types dstType = treeNode->CastToType(); + var_types srcType = op1->TypeGet(); + assert(!varTypeIsFloating(srcType) && varTypeIsFloating(dstType)); + + // force the srcType to unsigned if GT_UNSIGNED flag is set + if (treeNode->gtFlags & GTF_UNSIGNED) + { + srcType = genUnsignedType(srcType); + } + + // We should never see a srcType whose size is neither EA_4BYTE or EA_8BYTE + // For conversions from small types (byte/sbyte/int16/uint16) to float/double, + // we expect the front-end or lowering phase to have generated two levels of cast. 
+ // + emitAttr srcSize = EA_ATTR(genTypeSize(srcType)); + noway_assert((srcSize == EA_4BYTE) ||(srcSize == EA_8BYTE)); + + instruction ins = INS_scvtf; // default to sign converts + insOpts cvtOption = INS_OPTS_NONE; // invalid value + + if (varTypeIsUnsigned(dstType)) + { + ins = INS_ucvtf; // use unsigned converts + } + + if (dstType == TYP_DOUBLE) + { + if (srcSize == EA_4BYTE) + { + cvtOption = INS_OPTS_4BYTE_TO_D; + } + else + { + assert(srcSize == EA_8BYTE); + cvtOption = INS_OPTS_8BYTE_TO_D; + } + } + else + { + assert(dstType == TYP_FLOAT); + if (srcSize == EA_4BYTE) + { + cvtOption = INS_OPTS_4BYTE_TO_S; + } + else + { + assert(srcSize == EA_8BYTE); + cvtOption = INS_OPTS_8BYTE_TO_S; + } + } + + genConsumeOperands(treeNode->AsOp()); + + getEmitter()->emitIns_R_R(ins, emitTypeSize(dstType), treeNode->gtRegNum, op1->gtRegNum, cvtOption); + + genProduceReg(treeNode); +} + +//------------------------------------------------------------------------ +// genFloatToIntCast: Generate code to cast float/double to int/long +// +// Arguments: +// treeNode - The GT_CAST node +// +// Return Value: +// None. +// +// Assumptions: +// Cast is a non-overflow conversion. +// The treeNode must have an assigned register. +// SrcType=float/double and DstType= int32/uint32/int64/uint64 +// +void +CodeGen::genFloatToIntCast(GenTreePtr treeNode) +{ + // we don't expect to see overflow detecting float/double --> int type conversions here + // as they should have been converted into helper calls by front-end. + assert(treeNode->OperGet() == GT_CAST); + assert(!treeNode->gtOverflow()); + + regNumber targetReg = treeNode->gtRegNum; + assert(genIsValidIntReg(targetReg)); // Must be a valid int reg. + + GenTreePtr op1 = treeNode->gtOp.gtOp1; + assert(!op1->isContained()); // Cannot be contained + assert(genIsValidFloatReg(op1->gtRegNum)); // Must be a valid float reg. 
+ + var_types dstType = treeNode->CastToType(); + var_types srcType = op1->TypeGet(); + assert(varTypeIsFloating(srcType) && !varTypeIsFloating(dstType)); + + // We should never see a dstType whose size is neither EA_4BYTE or EA_8BYTE + // For conversions to small types (byte/sbyte/int16/uint16) from float/double, + // we expect the front-end or lowering phase to have generated two levels of cast. + // + emitAttr dstSize = EA_ATTR(genTypeSize(dstType)); + noway_assert((dstSize == EA_4BYTE) ||(dstSize == EA_8BYTE)); + + instruction ins = INS_fcvtzs; // default to sign converts + insOpts cvtOption = INS_OPTS_NONE; // invalid value + + if (varTypeIsUnsigned(dstType)) + { + ins = INS_fcvtzu; // use unsigned converts + } + + if (srcType == TYP_DOUBLE) + { + if (dstSize == EA_4BYTE) + { + cvtOption = INS_OPTS_D_TO_4BYTE; + } + else + { + assert(dstSize == EA_8BYTE); + cvtOption = INS_OPTS_D_TO_8BYTE; + } + } + else + { + assert(srcType == TYP_FLOAT); + if (dstSize == EA_4BYTE) + { + cvtOption = INS_OPTS_S_TO_4BYTE; + } + else + { + assert(dstSize == EA_8BYTE); + cvtOption = INS_OPTS_S_TO_8BYTE; + } + } + + genConsumeOperands(treeNode->AsOp()); + + getEmitter()->emitIns_R_R(ins, dstSize, treeNode->gtRegNum, op1->gtRegNum, cvtOption); + + genProduceReg(treeNode); +} + +//------------------------------------------------------------------------ +// genCkfinite: Generate code for ckfinite opcode. +// +// Arguments: +// treeNode - The GT_CKFINITE node +// +// Return Value: +// None. +// +// Assumptions: +// GT_CKFINITE node has reserved an internal register. +// +// TODO-ARM64-CQ - mark the operand as contained if known to be in +// memory (e.g. field or an array element). +// +void +CodeGen::genCkfinite(GenTreePtr treeNode) +{ + assert(treeNode->OperGet() == GT_CKFINITE); + +#if 0 + GenTreePtr op1 = treeNode->gtOp.gtOp1; + var_types targetType = treeNode->TypeGet(); + int expMask = (targetType == TYP_FLOAT) ? 0x7F800000 : 0x7FF00000; // Bit mask to extract exponent. 

    // NOTE(review): this '#if 0' body is an x86/x64-style template (INS_mov_xmm2i,
    // INS_shr, EJ_je) kept only as a starting point for the ARM64 implementation;
    // the live path is the NYI below.

    // Extract exponent into a register.
    assert(treeNode->gtRsvdRegs != RBM_NONE);
    assert(genCountBits(treeNode->gtRsvdRegs) == 1);
    regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);

    inst_RV_RV(INS_mov_xmm2i, genConsumeReg(op1), tmpReg, targetType);
    if (targetType == TYP_DOUBLE)
    {
        // right shift by 32 bits to get to exponent.
        inst_RV_SH(INS_shr, EA_8BYTE, tmpReg, 32);
    }

    // Mask of exponent with all 1's and check if the exponent is all 1's
    inst_RV_IV(INS_and, tmpReg, expMask, EA_4BYTE);
    inst_RV_IV(INS_cmp, tmpReg, expMask, EA_4BYTE);

    // If exponent is all 1's, throw ArithmeticException
    genJumpToThrowHlpBlk(EJ_je, Compiler::ACK_ARITH_EXCPN);

    // if it is a finite value copy it to targetReg
    if (treeNode->gtRegNum != op1->gtRegNum)
    {
        inst_RV_RV(ins_Copy(targetType), treeNode->gtRegNum, op1->gtRegNum, targetType);
    }
    genProduceReg(treeNode);
#else // !0
    NYI("genCkfinite");
#endif // !0
}

//---------------------------------------------------------------------
// genSPtoFPdelta - return the offset from SP to the frame pointer.
// Non-negative: the frame pointer sits at or above SP.
//
int CodeGenInterface::genSPtoFPdelta()
{
    int delta;

    // We place the saved frame pointer immediately above the outgoing argument space.
    delta = (int)compiler->lvaOutgoingArgSpaceSize;

    assert(delta >= 0);
    return delta;
}


//---------------------------------------------------------------------
// genTotalFrameSize - return the total size of the stack frame, including local size,
// callee-saved register size, etc.
//
// Return value:
//    Total frame size
//

int CodeGenInterface::genTotalFrameSize()
{
    // For varargs functions, we home all the incoming register arguments. They are not
    // included in the compCalleeRegsPushed count. This is like prespill on ARM32, but
    // since we don't use "push" instructions to save them, we don't have to do the
    // save of these varargs register arguments as the first thing in the prolog.

    assert(!IsUninitialized(compiler->compCalleeRegsPushed));

    // Varargs homing area + callee-saved register area + local frame.
    int totalFrameSize = (compiler->info.compIsVarArgs ?
MAX_REG_ARG * REGSIZE_BYTES : 0) + + compiler->compCalleeRegsPushed * REGSIZE_BYTES + + compiler->compLclFrameSize; + + assert(totalFrameSize >= 0); + return totalFrameSize; +} + + +//--------------------------------------------------------------------- +// genCallerSPtoFPdelta - return the offset from Caller-SP to the frame pointer. +// This number is going to be negative, since the Caller-SP is at a higher +// address than the frame pointer. +// +// There must be a frame pointer to call this function! + +int CodeGenInterface::genCallerSPtoFPdelta() +{ + assert(isFramePointerUsed()); + int callerSPtoFPdelta; + + callerSPtoFPdelta = genCallerSPtoInitialSPdelta() + genSPtoFPdelta(); + + assert(callerSPtoFPdelta <= 0); + return callerSPtoFPdelta; +} + + +//--------------------------------------------------------------------- +// genCallerSPtoInitialSPdelta - return the offset from Caller-SP to Initial SP. +// +// This number will be negative. + +int CodeGenInterface::genCallerSPtoInitialSPdelta() +{ + int callerSPtoSPdelta = 0; + + callerSPtoSPdelta -= genTotalFrameSize(); + + assert(callerSPtoSPdelta <= 0); + return callerSPtoSPdelta; +} + + +//--------------------------------------------------------------------- +// genMathIntrinsic - generate code for a given math intrinsic +// +// Arguments +// treeNode - the GT_MATH node +// +// Return value: +// None +// +void +CodeGen::genMathIntrinsic(GenTreePtr treeNode) +{ +#if 0 + // Right now only Sqrt/Abs are treated as math intrinsics. 
    // NOTE(review): this '#if 0' body is x86-specific template code (INS_sqrtsd,
    // genSSE2BitwiseOp); the live ARM64 path is the NYI below.
    switch(treeNode->gtMath.gtMathFN)
    {
    case CORINFO_INTRINSIC_Sqrt:
        noway_assert(treeNode->TypeGet() == TYP_DOUBLE);
        genConsumeOperands(treeNode->AsOp());
        getEmitter()->emitInsBinary(INS_sqrtsd, emitTypeSize(treeNode), treeNode, treeNode->gtOp.gtOp1);
        break;

    case CORINFO_INTRINSIC_Abs:
        genSSE2BitwiseOp(treeNode);
        break;

    default:
        assert(!"genMathIntrinsic: Unsupported math intrinsic");
        unreached();
    }

    genProduceReg(treeNode);
#else // !0
    NYI("genMathIntrinsic");
#endif // !0
}

/*****************************************************************************
 *
 *  Create and record GC Info for the function.
 */
// Thin wrapper: ARM64 reuses the x64-style GcInfoEncoder path below.
// (epilogSize is accepted for interface parity but not consumed here.)
void
CodeGen::genCreateAndStoreGCInfo(unsigned codeSize, unsigned prologSize, unsigned epilogSize DEBUG_ARG(void* codePtr))
{
    genCreateAndStoreGCInfoX64(codeSize, prologSize DEBUG_ARG(codePtr));
}

// Build and emit the GC info via GcInfoEncoder. Named "...X64" because the
// implementation was carried over from the x64 backend.
void
CodeGen::genCreateAndStoreGCInfoX64(unsigned codeSize, unsigned prologSize DEBUG_ARG(void* codePtr))
{
    IAllocator* allowZeroAlloc = new (compiler, CMK_GC) AllowZeroAllocator(compiler->getAllocatorGC());
    GcInfoEncoder* gcInfoEncoder = new (compiler, CMK_GC) GcInfoEncoder(compiler->info.compCompHnd, compiler->info.compMethodInfo, allowZeroAlloc);
    assert(gcInfoEncoder != nullptr);

    // Follow the code pattern of the x86 gc info encoder (genCreateAndStoreGCInfoJIT32).
    gcInfo.gcInfoBlockHdrSave(gcInfoEncoder, codeSize, prologSize);

    // First we figure out the encoder ID's for the stack slots and registers.
    gcInfo.gcMakeRegPtrTable(gcInfoEncoder, codeSize, prologSize, GCInfo::MAKE_REG_PTR_MODE_ASSIGN_SLOTS);

    // Now we've requested all the slots we'll need; "finalize" these (make more compact data structures for them).
    gcInfoEncoder->FinalizeSlotIds();

    // Now we can actually use those slot ID's to declare live ranges.
+ gcInfo.gcMakeRegPtrTable(gcInfoEncoder, codeSize, prologSize, GCInfo::MAKE_REG_PTR_MODE_DO_WORK); + +#if defined(DEBUGGING_SUPPORT) + if (compiler->opts.compDbgEnC) + { + // what we have to preserve is called the "frame header" (see comments in VM\eetwain.cpp) + // which is: + // -return address + // -saved off RBP + // -saved 'this' pointer and bool for synchronized methods + + // 4 slots for RBP + return address + RSI + RDI + int preservedAreaSize = 4 * REGSIZE_BYTES; + + if (compiler->info.compFlags & CORINFO_FLG_SYNCH) + { + if (!(compiler->info.compFlags & CORINFO_FLG_STATIC)) + preservedAreaSize += REGSIZE_BYTES; + + preservedAreaSize += 1; // bool for synchronized methods + } + + // Used to signal both that the method is compiled for EnC, and also the size of the block at the top of the frame + gcInfoEncoder->SetSizeOfEditAndContinuePreservedArea(preservedAreaSize); + } +#endif + + gcInfoEncoder->Build(); + + //GC Encoder automatically puts the GC info in the right spot using ICorJitInfo::allocGCInfo(size_t) + //let's save the values anyway for debugging purposes + compiler->compInfoBlkAddr = gcInfoEncoder->Emit(); + compiler->compInfoBlkSize = 0; //not exposed by the GCEncoder interface +} + +/***************************************************************************** + * Emit a call to a helper function. + * + */ + +void CodeGen::genEmitHelperCall(unsigned helper, + int argSize, + emitAttr retSize) +{ + void* addr = nullptr; + void* pAddr = nullptr; + + emitter::EmitCallType callType = emitter::EC_FUNC_TOKEN; + addr = compiler->compGetHelperFtn((CorInfoHelpFunc)helper, &pAddr); + regNumber callTarget = REG_NA; + + if (addr == nullptr) + { + NYI("genEmitHelperCall indirect"); +#if 0 + assert(pAddr != nullptr); + if (genAddrShouldUsePCRel((size_t)pAddr)) + { + // generate call whose target is specified by PC-relative 32-bit offset. 
+ callType = emitter::EC_FUNC_TOKEN_INDIR; + addr = pAddr; + } + else + { + // If this address cannot be encoded as PC-relative 32-bit offset, load it into REG_HELPER_CALL_TARGET + // and use register indirect addressing mode to make the call. + // mov reg, addr + // call [reg] + callTarget = callTargetReg; + CodeGen::genSetRegToIcon(callTarget, (ssize_t) pAddr, TYP_I_IMPL); + callType = emitter::EC_INDIR_ARD; + } +#endif // 0 + } + + getEmitter()->emitIns_Call(callType, + compiler->eeFindHelper(helper), + INDEBUG_LDISASM_COMMA(nullptr) + addr, + argSize, + retSize, + gcInfo.gcVarPtrSetCur, + gcInfo.gcRegGCrefSetCur, + gcInfo.gcRegByrefSetCur, + BAD_IL_OFFSET, /* IL offset */ + callTarget, /* ireg */ + REG_NA, 0, 0, /* xreg, xmul, disp */ + false, /* isJump */ + emitter::emitNoGChelper(helper)); + + regMaskTP killMask = compiler->compHelperCallKillSet((CorInfoHelpFunc)helper); + regTracker.rsTrashRegSet(killMask); + regTracker.rsTrashRegsForGCInterruptability(); +} + +/*****************************************************************************/ +#ifdef DEBUGGING_SUPPORT +/***************************************************************************** + * genSetScopeInfo + * + * Called for every scope info piece to record by the main genSetScopeInfo() + */ + +// TODO-Cleanup: move to CodeGenCommon.cpp +void CodeGen::genSetScopeInfo (unsigned which, + UNATIVE_OFFSET startOffs, + UNATIVE_OFFSET length, + unsigned varNum, + unsigned LVnum, + bool avail, + Compiler::siVarLoc& varLoc) +{ + /* We need to do some mapping while reporting back these variables */ + + unsigned ilVarNum = compiler->compMap2ILvarNum(varNum); + noway_assert((int)ilVarNum != ICorDebugInfo::UNKNOWN_ILNUM); + + VarName name = nullptr; + +#ifdef DEBUG + + for (unsigned scopeNum = 0; scopeNum < compiler->info.compVarScopesCount; scopeNum++) + { + if (LVnum == compiler->info.compVarScopes[scopeNum].vsdLVnum) + { + name = compiler->info.compVarScopes[scopeNum].vsdName; + } + } + + // Hang on to this 
compiler->info. + + TrnslLocalVarInfo &tlvi = genTrnslLocalVarInfo[which]; + + tlvi.tlviVarNum = ilVarNum; + tlvi.tlviLVnum = LVnum; + tlvi.tlviName = name; + tlvi.tlviStartPC = startOffs; + tlvi.tlviLength = length; + tlvi.tlviAvailable = avail; + tlvi.tlviVarLoc = varLoc; + +#endif // DEBUG + + compiler->eeSetLVinfo(which, startOffs, length, ilVarNum, LVnum, name, avail, varLoc); +} +#endif // DEBUGGING_SUPPORT + + +/***************************************************************************** + * Unit testing of the ARM64 emitter: generate a bunch of instructions into the prolog + * (it's as good a place as any), then use COMPLUS_JitLateDisasm=* to see if the late + * disassembler thinks the instructions as the same as we do. + */ + +// Uncomment "#define ALL_ARM64_EMITTER_UNIT_TESTS" to run all the unit tests here. +// After adding a unit test, and verifying it works, put it under this #ifdef, so we don't see it run every time. +//#define ALL_ARM64_EMITTER_UNIT_TESTS + +#if defined(DEBUG) +void CodeGen::genArm64EmitterUnitTests() +{ + if (!verbose) + { + return; + } + + if (!compiler->opts.altJit) + { + // No point doing this in a "real" JIT. + return; + } + + // Mark the "fake" instructions in the output. + printf("*************** In genArm64EmitterUnitTests()\n"); + + emitter* theEmitter = getEmitter(); + + // We use this: + // genDefineTempLabel(genCreateTempLabel()); + // to create artificial labels to help separate groups of tests. 
+ + // + // Loads/Stores basic general register + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + // ldr/str Xt, [reg] + theEmitter->emitIns_R_R(INS_ldr, EA_8BYTE, REG_R8, REG_R9); + theEmitter->emitIns_R_R(INS_ldrb, EA_1BYTE, REG_R8, REG_R9); + theEmitter->emitIns_R_R(INS_ldrh, EA_2BYTE, REG_R8, REG_R9); + theEmitter->emitIns_R_R(INS_str, EA_8BYTE, REG_R8, REG_R9); + theEmitter->emitIns_R_R(INS_strb, EA_1BYTE, REG_R8, REG_R9); + theEmitter->emitIns_R_R(INS_strh, EA_2BYTE, REG_R8, REG_R9); + + // ldr/str Wt, [reg] + theEmitter->emitIns_R_R(INS_ldr, EA_4BYTE, REG_R8, REG_R9); + theEmitter->emitIns_R_R(INS_ldrb, EA_1BYTE, REG_R8, REG_R9); + theEmitter->emitIns_R_R(INS_ldrh, EA_2BYTE, REG_R8, REG_R9); + theEmitter->emitIns_R_R(INS_str, EA_4BYTE, REG_R8, REG_R9); + theEmitter->emitIns_R_R(INS_strb, EA_1BYTE, REG_R8, REG_R9); + theEmitter->emitIns_R_R(INS_strh, EA_2BYTE, REG_R8, REG_R9); + + theEmitter->emitIns_R_R(INS_ldrsb, EA_4BYTE, REG_R8, REG_R9); // target Wt + theEmitter->emitIns_R_R(INS_ldrsh, EA_4BYTE, REG_R8, REG_R9); // target Wt + theEmitter->emitIns_R_R(INS_ldrsb, EA_8BYTE, REG_R8, REG_R9); // target Xt + theEmitter->emitIns_R_R(INS_ldrsh, EA_8BYTE, REG_R8, REG_R9); // target Xt + theEmitter->emitIns_R_R(INS_ldrsw, EA_8BYTE, REG_R8, REG_R9); // target Xt + + theEmitter->emitIns_R_R_I(INS_ldurb, EA_4BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_ldurh, EA_4BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_sturb, EA_4BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_sturh, EA_4BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_ldursb, EA_4BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_ldursb, EA_8BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_ldursh, EA_4BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_ldursh, EA_8BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_ldur, EA_8BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_ldur, 
EA_4BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_stur, EA_4BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_stur, EA_8BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_ldursw, EA_8BYTE, REG_R8, REG_R9, 1); + + // SP and ZR tests + theEmitter->emitIns_R_R_I(INS_ldur, EA_8BYTE, REG_R8, REG_SP, 1); + theEmitter->emitIns_R_R_I(INS_ldurb, EA_8BYTE, REG_ZR, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_ldurh, EA_8BYTE, REG_ZR, REG_SP, 1); + + // scaled + theEmitter->emitIns_R_R_I(INS_ldrb, EA_1BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_ldrh, EA_2BYTE, REG_R8, REG_R9, 2); + theEmitter->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_R8, REG_R9, 4); + theEmitter->emitIns_R_R_I(INS_ldr, EA_8BYTE, REG_R8, REG_R9, 8); + + // pre-/post-indexed (unscaled) + theEmitter->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_R8, REG_R9, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_R8, REG_R9, 1, INS_OPTS_PRE_INDEX); + theEmitter->emitIns_R_R_I(INS_ldr, EA_8BYTE, REG_R8, REG_R9, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I(INS_ldr, EA_8BYTE, REG_R8, REG_R9, 1, INS_OPTS_PRE_INDEX); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // Compares + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + // cmp reg, reg + theEmitter->emitIns_R_R(INS_cmp, EA_8BYTE, REG_R8, REG_R9); + theEmitter->emitIns_R_R(INS_cmn, EA_8BYTE, REG_R8, REG_R9); + + // cmp reg, imm + theEmitter->emitIns_R_I(INS_cmp, EA_8BYTE, REG_R8, 0); + theEmitter->emitIns_R_I(INS_cmp, EA_8BYTE, REG_R8, 4095); + theEmitter->emitIns_R_I(INS_cmp, EA_8BYTE, REG_R8, 1 << 12); + theEmitter->emitIns_R_I(INS_cmp, EA_8BYTE, REG_R8, 4095 << 12); + + theEmitter->emitIns_R_I(INS_cmn, EA_8BYTE, REG_R8, 0); + theEmitter->emitIns_R_I(INS_cmn, EA_8BYTE, REG_R8, 4095); + theEmitter->emitIns_R_I(INS_cmn, EA_8BYTE, REG_R8, 1 << 12); + theEmitter->emitIns_R_I(INS_cmn, EA_8BYTE, REG_R8, 4095 << 12); + + theEmitter->emitIns_R_I(INS_cmp, EA_8BYTE, 
REG_R8, -1); + theEmitter->emitIns_R_I(INS_cmp, EA_8BYTE, REG_R8, -0xfff); + theEmitter->emitIns_R_I(INS_cmp, EA_8BYTE, REG_R8, 0xfffffffffffff000LL); + theEmitter->emitIns_R_I(INS_cmp, EA_8BYTE, REG_R8, 0xffffffffff800000LL); + + theEmitter->emitIns_R_I(INS_cmn, EA_8BYTE, REG_R8, -1); + theEmitter->emitIns_R_I(INS_cmn, EA_8BYTE, REG_R8, -0xfff); + theEmitter->emitIns_R_I(INS_cmn, EA_8BYTE, REG_R8, 0xfffffffffffff000LL); + theEmitter->emitIns_R_I(INS_cmn, EA_8BYTE, REG_R8, 0xffffffffff800000LL); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + + // R_R + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + theEmitter->emitIns_R_R(INS_cls, EA_8BYTE, REG_R1, REG_R12); + theEmitter->emitIns_R_R(INS_clz, EA_8BYTE, REG_R2, REG_R13); + theEmitter->emitIns_R_R(INS_rbit, EA_8BYTE, REG_R3, REG_R14); + theEmitter->emitIns_R_R(INS_rev, EA_8BYTE, REG_R4, REG_R15); + theEmitter->emitIns_R_R(INS_rev16, EA_8BYTE, REG_R5, REG_R0); + theEmitter->emitIns_R_R(INS_rev32, EA_8BYTE, REG_R6, REG_R1); + + theEmitter->emitIns_R_R(INS_cls, EA_4BYTE, REG_R7, REG_R2); + theEmitter->emitIns_R_R(INS_clz, EA_4BYTE, REG_R8, REG_R3); + theEmitter->emitIns_R_R(INS_rbit, EA_4BYTE, REG_R9, REG_R4); + theEmitter->emitIns_R_R(INS_rev, EA_4BYTE, REG_R10, REG_R5); + theEmitter->emitIns_R_R(INS_rev16, EA_4BYTE, REG_R11, REG_R6); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_I + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + // mov reg, imm(i16,hw) + theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_R8, 0x0000000000001234); + theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_R8, 0x0000000043210000); + theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_R8, 0x0000567800000000); + theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_R8, 0x8765000000000000); + theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_R8, 0xFFFFFFFFFFFF1234); + theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_R8, 0xFFFFFFFF4321FFFF); + 
theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_R8, 0xFFFF5678FFFFFFFF); + theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_R8, 0x8765FFFFFFFFFFFF); + + theEmitter->emitIns_R_I(INS_mov, EA_4BYTE, REG_R8, 0x00001234); + theEmitter->emitIns_R_I(INS_mov, EA_4BYTE, REG_R8, 0x87650000); + theEmitter->emitIns_R_I(INS_mov, EA_4BYTE, REG_R8, 0xFFFF1234); + theEmitter->emitIns_R_I(INS_mov, EA_4BYTE, REG_R8, 0x4567FFFF); + + // mov reg, imm(N,r,s) + theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_R8, 0x00FFFFF000000000); + theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_R8, 0x6666666666666666); + theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_SP, 0x7FFF00007FFF0000); + theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_R8, 0x5555555555555555); + theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_R8, 0xE003E003E003E003); + theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_R8, 0x0707070707070707); + + theEmitter->emitIns_R_I(INS_mov, EA_4BYTE, REG_R8, 0x00FFFFF0); + theEmitter->emitIns_R_I(INS_mov, EA_4BYTE, REG_R8, 0x66666666); + theEmitter->emitIns_R_I(INS_mov, EA_4BYTE, REG_R8, 0x03FFC000); + theEmitter->emitIns_R_I(INS_mov, EA_4BYTE, REG_R8, 0x55555555); + theEmitter->emitIns_R_I(INS_mov, EA_4BYTE, REG_R8, 0xE003E003); + theEmitter->emitIns_R_I(INS_mov, EA_4BYTE, REG_R8, 0x07070707); + + theEmitter->emitIns_R_I(INS_tst, EA_8BYTE, REG_R8, 0xE003E003E003E003); + theEmitter->emitIns_R_I(INS_tst, EA_8BYTE, REG_R8, 0x00FFFFF000000000); + theEmitter->emitIns_R_I(INS_tst, EA_8BYTE, REG_R8, 0x6666666666666666); + theEmitter->emitIns_R_I(INS_tst, EA_8BYTE, REG_R8, 0x0707070707070707); + theEmitter->emitIns_R_I(INS_tst, EA_8BYTE, REG_R8, 0x7FFF00007FFF0000); + theEmitter->emitIns_R_I(INS_tst, EA_8BYTE, REG_R8, 0x5555555555555555); + + theEmitter->emitIns_R_I(INS_tst, EA_4BYTE, REG_R8, 0xE003E003); + theEmitter->emitIns_R_I(INS_tst, EA_4BYTE, REG_R8, 0x00FFFFF0); + theEmitter->emitIns_R_I(INS_tst, EA_4BYTE, REG_R8, 0x66666666); + theEmitter->emitIns_R_I(INS_tst, EA_4BYTE, REG_R8, 0x07070707); + 
theEmitter->emitIns_R_I(INS_tst, EA_4BYTE, REG_R8, 0xFFF00000); + theEmitter->emitIns_R_I(INS_tst, EA_4BYTE, REG_R8, 0x55555555); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + // tst reg, reg + theEmitter->emitIns_R_R(INS_tst, EA_8BYTE, REG_R7, REG_R10); + + // mov reg, reg + theEmitter->emitIns_R_R(INS_mov, EA_8BYTE, REG_R7, REG_R10); + theEmitter->emitIns_R_R(INS_mov, EA_8BYTE, REG_R8, REG_SP); + theEmitter->emitIns_R_R(INS_mov, EA_8BYTE, REG_SP, REG_R9); + + theEmitter->emitIns_R_R(INS_mvn, EA_8BYTE, REG_R5, REG_R11); + theEmitter->emitIns_R_R(INS_neg, EA_8BYTE, REG_R4, REG_R12); + theEmitter->emitIns_R_R(INS_negs, EA_8BYTE, REG_R3, REG_R13); + + theEmitter->emitIns_R_R(INS_mov, EA_4BYTE, REG_R7, REG_R10); + theEmitter->emitIns_R_R(INS_mvn, EA_4BYTE, REG_R5, REG_R11); + theEmitter->emitIns_R_R(INS_neg, EA_4BYTE, REG_R4, REG_R12); + theEmitter->emitIns_R_R(INS_negs, EA_4BYTE, REG_R3, REG_R13); + + theEmitter->emitIns_R_R(INS_sxtb, EA_8BYTE, REG_R7, REG_R10); + theEmitter->emitIns_R_R(INS_sxth, EA_8BYTE, REG_R5, REG_R11); + theEmitter->emitIns_R_R(INS_sxtw, EA_8BYTE, REG_R4, REG_R12); + theEmitter->emitIns_R_R(INS_uxtb, EA_8BYTE, REG_R3, REG_R13); // map to Wt + theEmitter->emitIns_R_R(INS_uxth, EA_8BYTE, REG_R2, REG_R14); // map to Wt + + theEmitter->emitIns_R_R(INS_sxtb, EA_4BYTE, REG_R7, REG_R10); + theEmitter->emitIns_R_R(INS_sxth, EA_4BYTE, REG_R5, REG_R11); + theEmitter->emitIns_R_R(INS_uxtb, EA_4BYTE, REG_R3, REG_R13); + theEmitter->emitIns_R_R(INS_uxth, EA_4BYTE, REG_R2, REG_R14); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_I_I + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + // mov reg, imm(i16,hw) + theEmitter->emitIns_R_I_I(INS_mov, EA_8BYTE, REG_R8, 0x1234, 0, INS_OPTS_LSL); + theEmitter->emitIns_R_I_I(INS_mov, EA_8BYTE, REG_R8, 0x4321, 16, INS_OPTS_LSL); + + 
theEmitter->emitIns_R_I_I(INS_movk, EA_8BYTE, REG_R8, 0x4321, 16, INS_OPTS_LSL); + theEmitter->emitIns_R_I_I(INS_movn, EA_8BYTE, REG_R8, 0x5678, 32, INS_OPTS_LSL); + theEmitter->emitIns_R_I_I(INS_movz, EA_8BYTE, REG_R8, 0x8765, 48, INS_OPTS_LSL); + + theEmitter->emitIns_R_I_I(INS_movk, EA_4BYTE, REG_R8, 0x4321, 16, INS_OPTS_LSL); + theEmitter->emitIns_R_I_I(INS_movn, EA_4BYTE, REG_R8, 0x5678, 16, INS_OPTS_LSL); + theEmitter->emitIns_R_I_I(INS_movz, EA_4BYTE, REG_R8, 0x8765, 16, INS_OPTS_LSL); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R_I + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + theEmitter->emitIns_R_R_I(INS_lsl, EA_8BYTE, REG_R0, REG_R0, 1); + theEmitter->emitIns_R_R_I(INS_lsl, EA_4BYTE, REG_R9, REG_R3, 18); + theEmitter->emitIns_R_R_I(INS_lsr, EA_8BYTE, REG_R7, REG_R0, 37); + theEmitter->emitIns_R_R_I(INS_lsr, EA_4BYTE, REG_R0, REG_R1, 2); + theEmitter->emitIns_R_R_I(INS_asr, EA_8BYTE, REG_R2, REG_R3, 53); + theEmitter->emitIns_R_R_I(INS_asr, EA_4BYTE, REG_R9, REG_R3, 18); + + theEmitter->emitIns_R_R_I(INS_and, EA_8BYTE, REG_R2, REG_R3, 0x5555555555555555); + theEmitter->emitIns_R_R_I(INS_ands, EA_8BYTE, REG_R1, REG_R5, 0x6666666666666666); + theEmitter->emitIns_R_R_I(INS_eor, EA_8BYTE, REG_R8, REG_R9, 0x0707070707070707); + theEmitter->emitIns_R_R_I(INS_orr, EA_8BYTE, REG_SP, REG_R3, 0xFFFC000000000000); + theEmitter->emitIns_R_R_I(INS_ands, EA_4BYTE, REG_R8, REG_R9, 0xE003E003); + + theEmitter->emitIns_R_R_I(INS_ror, EA_8BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_ror, EA_8BYTE, REG_R8, REG_R9, 31); + theEmitter->emitIns_R_R_I(INS_ror, EA_8BYTE, REG_R8, REG_R9, 32); + theEmitter->emitIns_R_R_I(INS_ror, EA_8BYTE, REG_R8, REG_R9, 63); + + theEmitter->emitIns_R_R_I(INS_ror, EA_4BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_ror, EA_4BYTE, REG_R8, REG_R9, 31); + + theEmitter->emitIns_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, 0); // == mov + theEmitter->emitIns_R_R_I(INS_add, 
EA_8BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, -1); + theEmitter->emitIns_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, 0xfff); + theEmitter->emitIns_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, -0xfff); + theEmitter->emitIns_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, 0x1000); + theEmitter->emitIns_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, 0xfff000); + theEmitter->emitIns_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, 0xfffffffffffff000LL); + theEmitter->emitIns_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, 0xffffffffff800000LL); + + theEmitter->emitIns_R_R_I(INS_add, EA_4BYTE, REG_R8, REG_R9, 0); // == mov + theEmitter->emitIns_R_R_I(INS_add, EA_4BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_add, EA_4BYTE, REG_R8, REG_R9, -1); + theEmitter->emitIns_R_R_I(INS_add, EA_4BYTE, REG_R8, REG_R9, 0xfff); + theEmitter->emitIns_R_R_I(INS_add, EA_4BYTE, REG_R8, REG_R9, -0xfff); + theEmitter->emitIns_R_R_I(INS_add, EA_4BYTE, REG_R8, REG_R9, 0x1000); + theEmitter->emitIns_R_R_I(INS_add, EA_4BYTE, REG_R8, REG_R9, 0xfff000); + theEmitter->emitIns_R_R_I(INS_add, EA_4BYTE, REG_R8, REG_R9, 0xfffffffffffff000LL); + theEmitter->emitIns_R_R_I(INS_add, EA_4BYTE, REG_R8, REG_R9, 0xffffffffff800000LL); + + theEmitter->emitIns_R_R_I(INS_sub, EA_8BYTE, REG_R8, REG_R9, 0); // == mov + theEmitter->emitIns_R_R_I(INS_sub, EA_8BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_sub, EA_8BYTE, REG_R8, REG_R9, -1); + theEmitter->emitIns_R_R_I(INS_sub, EA_8BYTE, REG_R8, REG_R9, 0xfff); + theEmitter->emitIns_R_R_I(INS_sub, EA_8BYTE, REG_R8, REG_R9, -0xfff); + theEmitter->emitIns_R_R_I(INS_sub, EA_8BYTE, REG_R8, REG_R9, 0x1000); + theEmitter->emitIns_R_R_I(INS_sub, EA_8BYTE, REG_R8, REG_R9, 0xfff000); + theEmitter->emitIns_R_R_I(INS_sub, EA_8BYTE, REG_R8, REG_R9, 0xfffffffffffff000LL); + theEmitter->emitIns_R_R_I(INS_sub, EA_8BYTE, REG_R8, REG_R9, 0xffffffffff800000LL); + + theEmitter->emitIns_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, 0); // == mov + 
theEmitter->emitIns_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, -1); + theEmitter->emitIns_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, 0xfff); + theEmitter->emitIns_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, -0xfff); + theEmitter->emitIns_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, 0x1000); + theEmitter->emitIns_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, 0xfff000); + theEmitter->emitIns_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, 0xfffffffffffff000LL); + theEmitter->emitIns_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, 0xffffffffff800000LL); + + theEmitter->emitIns_R_R_I(INS_adds, EA_8BYTE, REG_R8, REG_R9, 0); // == mov + theEmitter->emitIns_R_R_I(INS_adds, EA_8BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_adds, EA_8BYTE, REG_R8, REG_R9, -1); + theEmitter->emitIns_R_R_I(INS_adds, EA_8BYTE, REG_R8, REG_R9, 0xfff); + theEmitter->emitIns_R_R_I(INS_adds, EA_8BYTE, REG_R8, REG_R9, -0xfff); + theEmitter->emitIns_R_R_I(INS_adds, EA_8BYTE, REG_R8, REG_R9, 0x1000); + theEmitter->emitIns_R_R_I(INS_adds, EA_8BYTE, REG_R8, REG_R9, 0xfff000); + theEmitter->emitIns_R_R_I(INS_adds, EA_8BYTE, REG_R8, REG_R9, 0xfffffffffffff000LL); + theEmitter->emitIns_R_R_I(INS_adds, EA_8BYTE, REG_R8, REG_R9, 0xffffffffff800000LL); + + theEmitter->emitIns_R_R_I(INS_adds, EA_4BYTE, REG_R8, REG_R9, 0); // == mov + theEmitter->emitIns_R_R_I(INS_adds, EA_4BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_adds, EA_4BYTE, REG_R8, REG_R9, -1); + theEmitter->emitIns_R_R_I(INS_adds, EA_4BYTE, REG_R8, REG_R9, 0xfff); + theEmitter->emitIns_R_R_I(INS_adds, EA_4BYTE, REG_R8, REG_R9, -0xfff); + theEmitter->emitIns_R_R_I(INS_adds, EA_4BYTE, REG_R8, REG_R9, 0x1000); + theEmitter->emitIns_R_R_I(INS_adds, EA_4BYTE, REG_R8, REG_R9, 0xfff000); + theEmitter->emitIns_R_R_I(INS_adds, EA_4BYTE, REG_R8, REG_R9, 0xfffffffffffff000LL); + theEmitter->emitIns_R_R_I(INS_adds, EA_4BYTE, REG_R8, REG_R9, 0xffffffffff800000LL); + + 
theEmitter->emitIns_R_R_I(INS_subs, EA_8BYTE, REG_R8, REG_R9, 0); // == mov + theEmitter->emitIns_R_R_I(INS_subs, EA_8BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_subs, EA_8BYTE, REG_R8, REG_R9, -1); + theEmitter->emitIns_R_R_I(INS_subs, EA_8BYTE, REG_R8, REG_R9, 0xfff); + theEmitter->emitIns_R_R_I(INS_subs, EA_8BYTE, REG_R8, REG_R9, -0xfff); + theEmitter->emitIns_R_R_I(INS_subs, EA_8BYTE, REG_R8, REG_R9, 0x1000); + theEmitter->emitIns_R_R_I(INS_subs, EA_8BYTE, REG_R8, REG_R9, 0xfff000); + theEmitter->emitIns_R_R_I(INS_subs, EA_8BYTE, REG_R8, REG_R9, 0xfffffffffffff000LL); + theEmitter->emitIns_R_R_I(INS_subs, EA_8BYTE, REG_R8, REG_R9, 0xffffffffff800000LL); + + theEmitter->emitIns_R_R_I(INS_subs, EA_4BYTE, REG_R8, REG_R9, 0); // == mov + theEmitter->emitIns_R_R_I(INS_subs, EA_4BYTE, REG_R8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_subs, EA_4BYTE, REG_R8, REG_R9, -1); + theEmitter->emitIns_R_R_I(INS_subs, EA_4BYTE, REG_R8, REG_R9, 0xfff); + theEmitter->emitIns_R_R_I(INS_subs, EA_4BYTE, REG_R8, REG_R9, -0xfff); + theEmitter->emitIns_R_R_I(INS_subs, EA_4BYTE, REG_R8, REG_R9, 0x1000); + theEmitter->emitIns_R_R_I(INS_subs, EA_4BYTE, REG_R8, REG_R9, 0xfff000); + theEmitter->emitIns_R_R_I(INS_subs, EA_4BYTE, REG_R8, REG_R9, 0xfffffffffffff000LL); + theEmitter->emitIns_R_R_I(INS_subs, EA_4BYTE, REG_R8, REG_R9, 0xffffffffff800000LL); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R_I cmp/txt + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + // cmp + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 0); + theEmitter->emitIns_R_R_I(INS_cmp, EA_4BYTE, REG_R8, REG_R9, 0); + + // CMP (shifted register) + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 31, INS_OPTS_LSL); + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 32, INS_OPTS_LSR); + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 33, INS_OPTS_ASR); + + theEmitter->emitIns_R_R_I(INS_cmp, EA_4BYTE, REG_R8, REG_R9, 21, INS_OPTS_LSL); + 
theEmitter->emitIns_R_R_I(INS_cmp, EA_4BYTE, REG_R8, REG_R9, 22, INS_OPTS_LSR); + theEmitter->emitIns_R_R_I(INS_cmp, EA_4BYTE, REG_R8, REG_R9, 23, INS_OPTS_ASR); + + // TST (shifted register) + theEmitter->emitIns_R_R_I(INS_tst, EA_8BYTE, REG_R8, REG_R9, 31, INS_OPTS_LSL); + theEmitter->emitIns_R_R_I(INS_tst, EA_8BYTE, REG_R8, REG_R9, 32, INS_OPTS_LSR); + theEmitter->emitIns_R_R_I(INS_tst, EA_8BYTE, REG_R8, REG_R9, 33, INS_OPTS_ASR); + theEmitter->emitIns_R_R_I(INS_tst, EA_8BYTE, REG_R8, REG_R9, 34, INS_OPTS_ROR); + + theEmitter->emitIns_R_R_I(INS_tst, EA_4BYTE, REG_R8, REG_R9, 21, INS_OPTS_LSL); + theEmitter->emitIns_R_R_I(INS_tst, EA_4BYTE, REG_R8, REG_R9, 22, INS_OPTS_LSR); + theEmitter->emitIns_R_R_I(INS_tst, EA_4BYTE, REG_R8, REG_R9, 23, INS_OPTS_ASR); + theEmitter->emitIns_R_R_I(INS_tst, EA_4BYTE, REG_R8, REG_R9, 24, INS_OPTS_ROR); + + // CMP (extended register) + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 0, INS_OPTS_UXTB); + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 0, INS_OPTS_UXTH); + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 0, INS_OPTS_UXTW); // "cmp x8, x9, UXTW"; msdis disassembles this "cmp x8,x9", which looks like an msdis issue. 
+ theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 0, INS_OPTS_UXTX); + + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 0, INS_OPTS_SXTB); + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 0, INS_OPTS_SXTH); + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 0, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 0, INS_OPTS_SXTX); + + // CMP 64-bit (extended register) and left shift + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 1, INS_OPTS_UXTB); + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 2, INS_OPTS_UXTH); + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 3, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 4, INS_OPTS_UXTX); + + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 1, INS_OPTS_SXTB); + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 2, INS_OPTS_SXTH); + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 3, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_I(INS_cmp, EA_8BYTE, REG_R8, REG_R9, 4, INS_OPTS_SXTX); + + // CMP 32-bit (extended register) and left shift + theEmitter->emitIns_R_R_I(INS_cmp, EA_4BYTE, REG_R8, REG_R9, 0, INS_OPTS_UXTB); + theEmitter->emitIns_R_R_I(INS_cmp, EA_4BYTE, REG_R8, REG_R9, 2, INS_OPTS_UXTH); + theEmitter->emitIns_R_R_I(INS_cmp, EA_4BYTE, REG_R8, REG_R9, 4, INS_OPTS_UXTW); + + theEmitter->emitIns_R_R_I(INS_cmp, EA_4BYTE, REG_R8, REG_R9, 0, INS_OPTS_SXTB); + theEmitter->emitIns_R_R_I(INS_cmp, EA_4BYTE, REG_R8, REG_R9, 2, INS_OPTS_SXTH); + theEmitter->emitIns_R_R_I(INS_cmp, EA_4BYTE, REG_R8, REG_R9, 4, INS_OPTS_SXTW); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R_R + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + theEmitter->emitIns_R_R_R(INS_lsl, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_lsr, EA_8BYTE, REG_R8, REG_R9, REG_R10); + 
theEmitter->emitIns_R_R_R(INS_asr, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_ror, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_adc, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_adcs, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_sbc, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_sbcs, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_udiv, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_sdiv, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_mul, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_mneg, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_smull, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_smnegl, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_smulh, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_umull, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_umnegl, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_umulh, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_lslv, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_lsrv, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_asrv, EA_8BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_rorv, EA_8BYTE, REG_R8, REG_R9, REG_R10); + + theEmitter->emitIns_R_R_R(INS_lsl, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_lsr, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_asr, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_ror, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_adc, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_adcs, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_sbc, EA_4BYTE, REG_R8, REG_R9, REG_R10); + 
theEmitter->emitIns_R_R_R(INS_sbcs, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_udiv, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_sdiv, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_mul, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_mneg, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_smull, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_smnegl, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_smulh, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_umull, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_umnegl, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_umulh, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_lslv, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_lsrv, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_asrv, EA_4BYTE, REG_R8, REG_R9, REG_R10); + theEmitter->emitIns_R_R_R(INS_rorv, EA_4BYTE, REG_R8, REG_R9, REG_R10); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R_I_I + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + theEmitter->emitIns_R_R_I_I(INS_sbfm, EA_8BYTE, REG_R2, REG_R3, 4, 39); + theEmitter->emitIns_R_R_I_I(INS_bfm, EA_8BYTE, REG_R1, REG_R5, 20, 23); + theEmitter->emitIns_R_R_I_I(INS_ubfm, EA_8BYTE, REG_R8, REG_R9, 36, 7); + + theEmitter->emitIns_R_R_I_I(INS_sbfiz, EA_8BYTE, REG_R2, REG_R3, 7, 37); + theEmitter->emitIns_R_R_I_I(INS_bfi, EA_8BYTE, REG_R1, REG_R5, 23, 21); + theEmitter->emitIns_R_R_I_I(INS_ubfiz, EA_8BYTE, REG_R8, REG_R9, 39, 5); + + theEmitter->emitIns_R_R_I_I(INS_sbfx, EA_8BYTE, REG_R2, REG_R3, 10, 24); + theEmitter->emitIns_R_R_I_I(INS_bfxil, EA_8BYTE, REG_R1, REG_R5, 26, 16); + theEmitter->emitIns_R_R_I_I(INS_ubfx, EA_8BYTE, REG_R8, REG_R9, 42, 8); + + theEmitter->emitIns_R_R_I_I(INS_sbfm, EA_4BYTE, 
REG_R2, REG_R3, 4, 19); + theEmitter->emitIns_R_R_I_I(INS_bfm, EA_4BYTE, REG_R1, REG_R5, 10, 13); + theEmitter->emitIns_R_R_I_I(INS_ubfm, EA_4BYTE, REG_R8, REG_R9, 16, 7); + + theEmitter->emitIns_R_R_I_I(INS_sbfiz, EA_4BYTE, REG_R2, REG_R3, 5, 17); + theEmitter->emitIns_R_R_I_I(INS_bfi, EA_4BYTE, REG_R1, REG_R5, 13, 11); + theEmitter->emitIns_R_R_I_I(INS_ubfiz, EA_4BYTE, REG_R8, REG_R9, 19, 5); + + theEmitter->emitIns_R_R_I_I(INS_sbfx, EA_4BYTE, REG_R2, REG_R3, 3, 14); + theEmitter->emitIns_R_R_I_I(INS_bfxil, EA_4BYTE, REG_R1, REG_R5, 11, 9); + theEmitter->emitIns_R_R_I_I(INS_ubfx, EA_4BYTE, REG_R8, REG_R9, 22, 8); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R_R_I + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + // ADD (extended register) + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0, INS_OPTS_UXTB); + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0, INS_OPTS_UXTH); + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0, INS_OPTS_UXTX); + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0, INS_OPTS_SXTB); + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0, INS_OPTS_SXTH); + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0, INS_OPTS_SXTX); + + // ADD (extended register) and left shift + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_UXTB); + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_UXTH); + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_UXTX); + 
theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_SXTB); + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_SXTH); + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_SXTX); + + // ADD (shifted register) + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 31, INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 32, INS_OPTS_LSR); + theEmitter->emitIns_R_R_R_I(INS_add, EA_8BYTE, REG_R8, REG_R9, REG_R10, 33, INS_OPTS_ASR); + + // EXTR (extract field from register pair) + theEmitter->emitIns_R_R_R_I(INS_extr, EA_8BYTE, REG_R8, REG_R9, REG_R10, 1); + theEmitter->emitIns_R_R_R_I(INS_extr, EA_8BYTE, REG_R8, REG_R9, REG_R10, 31); + theEmitter->emitIns_R_R_R_I(INS_extr, EA_8BYTE, REG_R8, REG_R9, REG_R10, 32); + theEmitter->emitIns_R_R_R_I(INS_extr, EA_8BYTE, REG_R8, REG_R9, REG_R10, 63); + + theEmitter->emitIns_R_R_R_I(INS_extr, EA_4BYTE, REG_R8, REG_R9, REG_R10, 1); + theEmitter->emitIns_R_R_R_I(INS_extr, EA_4BYTE, REG_R8, REG_R9, REG_R10, 31); + + // SUB (extended register) + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0, INS_OPTS_UXTB); + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0, INS_OPTS_UXTH); + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0, INS_OPTS_UXTX); + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0, INS_OPTS_SXTB); + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0, INS_OPTS_SXTH); + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0, INS_OPTS_SXTW); + 
theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0, INS_OPTS_SXTX); + + // SUB (extended register) and left shift + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_UXTB); + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_UXTH); + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_UXTX); + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_SXTB); + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_SXTH); + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_SXTX); + + // SUB (shifted register) + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 27, INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 28, INS_OPTS_LSR); + theEmitter->emitIns_R_R_R_I(INS_sub, EA_4BYTE, REG_R8, REG_R9, REG_R10, 29, INS_OPTS_ASR); + + // bit operations + theEmitter->emitIns_R_R_R_I(INS_and, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_ands, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_eor, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_orr, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_bic, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_bics, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_eon, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_orn, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0); + + theEmitter->emitIns_R_R_R_I(INS_and, EA_8BYTE, REG_R8, REG_R9, REG_R10, 1, 
INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_I(INS_ands, EA_8BYTE, REG_R8, REG_R9, REG_R10, 2, INS_OPTS_LSR); + theEmitter->emitIns_R_R_R_I(INS_eor, EA_8BYTE, REG_R8, REG_R9, REG_R10, 3, INS_OPTS_ASR); + theEmitter->emitIns_R_R_R_I(INS_orr, EA_8BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_ROR); + theEmitter->emitIns_R_R_R_I(INS_bic, EA_8BYTE, REG_R8, REG_R9, REG_R10, 5, INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_I(INS_bics, EA_8BYTE, REG_R8, REG_R9, REG_R10, 6, INS_OPTS_LSR); + theEmitter->emitIns_R_R_R_I(INS_eon, EA_8BYTE, REG_R8, REG_R9, REG_R10, 7, INS_OPTS_ASR); + theEmitter->emitIns_R_R_R_I(INS_orn, EA_8BYTE, REG_R8, REG_R9, REG_R10, 8, INS_OPTS_ROR); + + theEmitter->emitIns_R_R_R_I(INS_and, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_ands, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_eor, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_orr, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_bic, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_bics, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_eon, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_orn, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0); + + theEmitter->emitIns_R_R_R_I(INS_and, EA_4BYTE, REG_R8, REG_R9, REG_R10, 1, INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_I(INS_ands, EA_4BYTE, REG_R8, REG_R9, REG_R10, 2, INS_OPTS_LSR); + theEmitter->emitIns_R_R_R_I(INS_eor, EA_4BYTE, REG_R8, REG_R9, REG_R10, 3, INS_OPTS_ASR); + theEmitter->emitIns_R_R_R_I(INS_orr, EA_4BYTE, REG_R8, REG_R9, REG_R10, 4, INS_OPTS_ROR); + theEmitter->emitIns_R_R_R_I(INS_bic, EA_4BYTE, REG_R8, REG_R9, REG_R10, 5, INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_I(INS_bics, EA_4BYTE, REG_R8, REG_R9, REG_R10, 6, INS_OPTS_LSR); + theEmitter->emitIns_R_R_R_I(INS_eon, EA_4BYTE, REG_R8, REG_R9, REG_R10, 7, INS_OPTS_ASR); + theEmitter->emitIns_R_R_R_I(INS_orn, EA_4BYTE, REG_R8, 
REG_R9, REG_R10, 8, INS_OPTS_ROR); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R_R_I -- load/store pair + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + theEmitter->emitIns_R_R_R_I(INS_ldnp, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_stnp, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_ldnp, EA_8BYTE, REG_R8, REG_R9, REG_R10, 8); + theEmitter->emitIns_R_R_R_I(INS_stnp, EA_8BYTE, REG_R8, REG_R9, REG_R10, 8); + + theEmitter->emitIns_R_R_R_I(INS_ldnp, EA_4BYTE, REG_R8, REG_R9, REG_SP, 0); + theEmitter->emitIns_R_R_R_I(INS_stnp, EA_4BYTE, REG_R8, REG_R9, REG_SP, 0); + theEmitter->emitIns_R_R_R_I(INS_ldnp, EA_4BYTE, REG_R8, REG_R9, REG_SP, 8); + theEmitter->emitIns_R_R_R_I(INS_stnp, EA_4BYTE, REG_R8, REG_R9, REG_SP, 8); + + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, REG_R8, REG_R9, REG_R10, 16); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_R8, REG_R9, REG_R10, 16); + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, REG_R8, REG_R9, REG_R10, 16, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_R8, REG_R9, REG_R10, 16, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, REG_R8, REG_R9, REG_R10, 16, INS_OPTS_PRE_INDEX); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_R8, REG_R9, REG_R10, 16, INS_OPTS_PRE_INDEX); + + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_4BYTE, REG_R8, REG_R9, REG_SP, 0); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_4BYTE, REG_R8, REG_R9, REG_SP, 0); + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_4BYTE, REG_R8, REG_R9, REG_SP, 16); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_4BYTE, REG_R8, REG_R9, REG_SP, 16); + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_4BYTE, REG_R8, REG_R9, REG_R10, 16, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_4BYTE, REG_R8, REG_R9, REG_R10, 16, 
INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_4BYTE, REG_R8, REG_R9, REG_R10, 16, INS_OPTS_PRE_INDEX); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_4BYTE, REG_R8, REG_R9, REG_R10, 16, INS_OPTS_PRE_INDEX); + + theEmitter->emitIns_R_R_R_I(INS_ldpsw, EA_4BYTE, REG_R8, REG_R9, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_ldpsw, EA_4BYTE, REG_R8, REG_R9, REG_R10, 16); + theEmitter->emitIns_R_R_R_I(INS_ldpsw, EA_4BYTE, REG_R8, REG_R9, REG_R10, 16, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ldpsw, EA_4BYTE, REG_R8, REG_R9, REG_R10, 16, INS_OPTS_PRE_INDEX); + + // SP and ZR tests + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, REG_ZR, REG_R1, REG_SP, 0); + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, REG_R0, REG_ZR, REG_SP, 16); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_R1, REG_SP, 0); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_R0, REG_ZR, REG_SP, 16); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SP, 16, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_R8, 16, INS_OPTS_PRE_INDEX); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R_R_Ext -- load/store shifted/extend + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + // LDR (register) + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_R8, REG_SP, REG_R9); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_LSL, 3); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW, 3); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW, 3); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, 
EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX, 3); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX, 3); + + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_R8, REG_SP, REG_R9); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_LSL, 2); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW, 2); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW, 2); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX, 2); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX, 2); + + theEmitter->emitIns_R_R_R_Ext(INS_ldrh, EA_2BYTE, REG_R8, REG_SP, REG_R9); + theEmitter->emitIns_R_R_R_Ext(INS_ldrh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_Ext(INS_ldrh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_LSL, 1); + theEmitter->emitIns_R_R_R_Ext(INS_ldrh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldrh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW, 1); + theEmitter->emitIns_R_R_R_Ext(INS_ldrh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldrh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW, 1); + 
theEmitter->emitIns_R_R_R_Ext(INS_ldrh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldrh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX, 1); + theEmitter->emitIns_R_R_R_Ext(INS_ldrh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldrh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX, 1); + + theEmitter->emitIns_R_R_R_Ext(INS_ldrb, EA_1BYTE, REG_R8, REG_SP, REG_R9); + theEmitter->emitIns_R_R_R_Ext(INS_ldrb, EA_1BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldrb, EA_1BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldrb, EA_1BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldrb, EA_1BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX); + + theEmitter->emitIns_R_R_R_Ext(INS_ldrsw, EA_4BYTE, REG_R8, REG_SP, REG_R9); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsw, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsw, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_LSL, 2); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsw, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsw, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW, 2); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsw, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsw, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW, 2); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsw, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsw, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX, 2); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsw, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsw, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX, 2); + + theEmitter->emitIns_R_R_R_Ext(INS_ldrsh, EA_4BYTE, REG_R8, REG_SP, REG_R9); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsh, EA_8BYTE, REG_R8, REG_SP, REG_R9); 
+ theEmitter->emitIns_R_R_R_Ext(INS_ldrsh, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsh, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_LSL, 1); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsh, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsh, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW, 1); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsh, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsh, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW, 1); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsh, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsh, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX, 1); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsh, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsh, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX, 1); + + theEmitter->emitIns_R_R_R_Ext(INS_ldrsb, EA_4BYTE, REG_R8, REG_SP, REG_R9); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsb, EA_8BYTE, REG_R8, REG_SP, REG_R9); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsb, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsb, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsb, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldrsb, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX); + + // STR (register) + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_8BYTE, REG_R8, REG_SP, REG_R9); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_LSL, 3); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW, 3); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_8BYTE, REG_R8, 
REG_SP, REG_R9, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW, 3); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX, 3); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_8BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX, 3); + + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_4BYTE, REG_R8, REG_SP, REG_R9); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_LSL, 2); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW, 2); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW, 2); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX, 2); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX); + theEmitter->emitIns_R_R_R_Ext(INS_str, EA_4BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX, 2); + + theEmitter->emitIns_R_R_R_Ext(INS_strh, EA_2BYTE, REG_R8, REG_SP, REG_R9); + theEmitter->emitIns_R_R_R_Ext(INS_strh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_Ext(INS_strh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_LSL, 1); + theEmitter->emitIns_R_R_R_Ext(INS_strh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_Ext(INS_strh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW, 1); + theEmitter->emitIns_R_R_R_Ext(INS_strh, EA_2BYTE, 
REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_Ext(INS_strh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW, 1); + theEmitter->emitIns_R_R_R_Ext(INS_strh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX); + theEmitter->emitIns_R_R_R_Ext(INS_strh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX, 1); + theEmitter->emitIns_R_R_R_Ext(INS_strh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX); + theEmitter->emitIns_R_R_R_Ext(INS_strh, EA_2BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX, 1); + + theEmitter->emitIns_R_R_R_Ext(INS_strb, EA_1BYTE, REG_R8, REG_SP, REG_R9); + theEmitter->emitIns_R_R_R_Ext(INS_strb, EA_1BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_Ext(INS_strb, EA_1BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_Ext(INS_strb, EA_1BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_SXTX); + theEmitter->emitIns_R_R_R_Ext(INS_strb, EA_1BYTE, REG_R8, REG_SP, REG_R9, INS_OPTS_UXTX); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R_R_R + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + theEmitter->emitIns_R_R_R_R(INS_madd, EA_4BYTE, REG_R0, REG_R12, REG_R27, REG_R10); + theEmitter->emitIns_R_R_R_R(INS_msub, EA_4BYTE, REG_R1, REG_R13, REG_R28, REG_R11); + theEmitter->emitIns_R_R_R_R(INS_smaddl, EA_4BYTE, REG_R2, REG_R14, REG_R0, REG_R12); + theEmitter->emitIns_R_R_R_R(INS_smsubl, EA_4BYTE, REG_R3, REG_R15, REG_R1, REG_R13); + theEmitter->emitIns_R_R_R_R(INS_umaddl, EA_4BYTE, REG_R4, REG_R19, REG_R2, REG_R14); + theEmitter->emitIns_R_R_R_R(INS_umsubl, EA_4BYTE, REG_R5, REG_R20, REG_R3, REG_R15); + + theEmitter->emitIns_R_R_R_R(INS_madd, EA_8BYTE, REG_R6, REG_R21, REG_R4, REG_R19); + theEmitter->emitIns_R_R_R_R(INS_msub, EA_8BYTE, REG_R7, REG_R22, REG_R5, REG_R20); + theEmitter->emitIns_R_R_R_R(INS_smaddl, EA_8BYTE, REG_R8, REG_R23, REG_R6, REG_R21); + theEmitter->emitIns_R_R_R_R(INS_smsubl, EA_8BYTE, REG_R9, REG_R24, REG_R7, REG_R22); + 
theEmitter->emitIns_R_R_R_R(INS_umaddl, EA_8BYTE, REG_R10, REG_R25, REG_R8, REG_R23); + theEmitter->emitIns_R_R_R_R(INS_umsubl, EA_8BYTE, REG_R11, REG_R26, REG_R9, REG_R24); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // R_COND + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + // cset reg, cond + theEmitter->emitIns_R_COND(INS_cset, EA_8BYTE, REG_R9, INS_COND_EQ); // eq + theEmitter->emitIns_R_COND(INS_cset, EA_4BYTE, REG_R8, INS_COND_NE); // ne + theEmitter->emitIns_R_COND(INS_cset, EA_4BYTE, REG_R7, INS_COND_HS); // hs + theEmitter->emitIns_R_COND(INS_cset, EA_8BYTE, REG_R6, INS_COND_LO); // lo + theEmitter->emitIns_R_COND(INS_cset, EA_8BYTE, REG_R5, INS_COND_MI); // mi + theEmitter->emitIns_R_COND(INS_cset, EA_4BYTE, REG_R4, INS_COND_PL); // pl + theEmitter->emitIns_R_COND(INS_cset, EA_4BYTE, REG_R3, INS_COND_VS); // vs + theEmitter->emitIns_R_COND(INS_cset, EA_8BYTE, REG_R2, INS_COND_VC); // vc + theEmitter->emitIns_R_COND(INS_cset, EA_8BYTE, REG_R1, INS_COND_HI); // hi + theEmitter->emitIns_R_COND(INS_cset, EA_4BYTE, REG_R0, INS_COND_LS); // ls + theEmitter->emitIns_R_COND(INS_cset, EA_4BYTE, REG_R9, INS_COND_GE); // ge + theEmitter->emitIns_R_COND(INS_cset, EA_8BYTE, REG_R8, INS_COND_LT); // lt + theEmitter->emitIns_R_COND(INS_cset, EA_8BYTE, REG_R7, INS_COND_GT); // gt + theEmitter->emitIns_R_COND(INS_cset, EA_4BYTE, REG_R6, INS_COND_LE); // le + + // csetm reg, cond + theEmitter->emitIns_R_COND(INS_csetm, EA_4BYTE, REG_R9, INS_COND_EQ); // eq + theEmitter->emitIns_R_COND(INS_csetm, EA_8BYTE, REG_R8, INS_COND_NE); // ne + theEmitter->emitIns_R_COND(INS_csetm, EA_8BYTE, REG_R7, INS_COND_HS); // hs + theEmitter->emitIns_R_COND(INS_csetm, EA_4BYTE, REG_R6, INS_COND_LO); // lo + theEmitter->emitIns_R_COND(INS_csetm, EA_4BYTE, REG_R5, INS_COND_MI); // mi + theEmitter->emitIns_R_COND(INS_csetm, EA_8BYTE, REG_R4, INS_COND_PL); // pl + theEmitter->emitIns_R_COND(INS_csetm, EA_8BYTE, REG_R3, INS_COND_VS); // vs + theEmitter->emitIns_R_COND(INS_csetm, EA_4BYTE, 
REG_R2, INS_COND_VC); // vc + theEmitter->emitIns_R_COND(INS_csetm, EA_4BYTE, REG_R1, INS_COND_HI); // hi + theEmitter->emitIns_R_COND(INS_csetm, EA_8BYTE, REG_R0, INS_COND_LS); // ls + theEmitter->emitIns_R_COND(INS_csetm, EA_8BYTE, REG_R9, INS_COND_GE); // ge + theEmitter->emitIns_R_COND(INS_csetm, EA_4BYTE, REG_R8, INS_COND_LT); // lt + theEmitter->emitIns_R_COND(INS_csetm, EA_4BYTE, REG_R7, INS_COND_GT); // gt + theEmitter->emitIns_R_COND(INS_csetm, EA_8BYTE, REG_R6, INS_COND_LE); // le + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // R_R_COND + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + // cinc reg, reg, cond + // cinv reg, reg, cond + // cneg reg, reg, cond + theEmitter->emitIns_R_R_COND(INS_cinc, EA_8BYTE, REG_R0, REG_R4, INS_COND_EQ); // eq + theEmitter->emitIns_R_R_COND(INS_cinv, EA_4BYTE, REG_R1, REG_R5, INS_COND_NE); // ne + theEmitter->emitIns_R_R_COND(INS_cneg, EA_4BYTE, REG_R2, REG_R6, INS_COND_HS); // hs + theEmitter->emitIns_R_R_COND(INS_cinc, EA_8BYTE, REG_R3, REG_R7, INS_COND_LO); // lo + theEmitter->emitIns_R_R_COND(INS_cinv, EA_4BYTE, REG_R4, REG_R8, INS_COND_MI); // mi + theEmitter->emitIns_R_R_COND(INS_cneg, EA_8BYTE, REG_R5, REG_R9, INS_COND_PL); // pl + theEmitter->emitIns_R_R_COND(INS_cinc, EA_8BYTE, REG_R6, REG_R0, INS_COND_VS); // vs + theEmitter->emitIns_R_R_COND(INS_cinv, EA_4BYTE, REG_R7, REG_R1, INS_COND_VC); // vc + theEmitter->emitIns_R_R_COND(INS_cneg, EA_8BYTE, REG_R8, REG_R2, INS_COND_HI); // hi + theEmitter->emitIns_R_R_COND(INS_cinc, EA_4BYTE, REG_R9, REG_R3, INS_COND_LS); // ls + theEmitter->emitIns_R_R_COND(INS_cinv, EA_4BYTE, REG_R0, REG_R4, INS_COND_GE); // ge + theEmitter->emitIns_R_R_COND(INS_cneg, EA_8BYTE, REG_R2, REG_R5, INS_COND_LT); // lt + theEmitter->emitIns_R_R_COND(INS_cinc, EA_4BYTE, REG_R2, REG_R6, INS_COND_GT); // gt + theEmitter->emitIns_R_R_COND(INS_cinv, EA_8BYTE, REG_R3, REG_R7, INS_COND_LE); // le + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // R_R_R_COND + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + 
+ // csel reg, reg, reg, cond + // csinc reg, reg, reg, cond + // csinv reg, reg, reg, cond + // csneg reg, reg, reg, cond + theEmitter->emitIns_R_R_R_COND(INS_csel, EA_8BYTE, REG_R0, REG_R4, REG_R8, INS_COND_EQ); // eq + theEmitter->emitIns_R_R_R_COND(INS_csinc, EA_4BYTE, REG_R1, REG_R5, REG_R9, INS_COND_NE); // ne + theEmitter->emitIns_R_R_R_COND(INS_csinv, EA_4BYTE, REG_R2, REG_R6, REG_R0, INS_COND_HS); // hs + theEmitter->emitIns_R_R_R_COND(INS_csneg, EA_8BYTE, REG_R3, REG_R7, REG_R1, INS_COND_LO); // lo + theEmitter->emitIns_R_R_R_COND(INS_csel, EA_4BYTE, REG_R4, REG_R8, REG_R2, INS_COND_MI); // mi + theEmitter->emitIns_R_R_R_COND(INS_csinc, EA_8BYTE, REG_R5, REG_R9, REG_R3, INS_COND_PL); // pl + theEmitter->emitIns_R_R_R_COND(INS_csinv, EA_8BYTE, REG_R6, REG_R0, REG_R4, INS_COND_VS); // vs + theEmitter->emitIns_R_R_R_COND(INS_csneg, EA_4BYTE, REG_R7, REG_R1, REG_R5, INS_COND_VC); // vc + theEmitter->emitIns_R_R_R_COND(INS_csel, EA_8BYTE, REG_R8, REG_R2, REG_R6, INS_COND_HI); // hi + theEmitter->emitIns_R_R_R_COND(INS_csinc, EA_4BYTE, REG_R9, REG_R3, REG_R7, INS_COND_LS); // ls + theEmitter->emitIns_R_R_R_COND(INS_csinv, EA_4BYTE, REG_R0, REG_R4, REG_R8, INS_COND_GE); // ge + theEmitter->emitIns_R_R_R_COND(INS_csneg, EA_8BYTE, REG_R2, REG_R5, REG_R9, INS_COND_LT); // lt + theEmitter->emitIns_R_R_R_COND(INS_csel, EA_4BYTE, REG_R2, REG_R6, REG_R0, INS_COND_GT); // gt + theEmitter->emitIns_R_R_R_COND(INS_csinc, EA_8BYTE, REG_R3, REG_R7, REG_R1, INS_COND_LE); // le + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // R_R_FLAGS_COND + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + // ccmp reg1, reg2, nzcv, cond + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R9, REG_R3, INS_FLAGS_V, INS_COND_EQ); // eq + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R8, REG_R2, INS_FLAGS_C, INS_COND_NE); // ne + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R7, REG_R1, INS_FLAGS_Z, INS_COND_HS); // hs + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmp, 
EA_8BYTE, REG_R6, REG_R0, INS_FLAGS_N, INS_COND_LO); // lo + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R5, REG_R3, INS_FLAGS_CV, INS_COND_MI); // mi + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R4, REG_R2, INS_FLAGS_ZV, INS_COND_PL); // pl + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R3, REG_R1, INS_FLAGS_ZC, INS_COND_VS); // vs + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R2, REG_R0, INS_FLAGS_NV, INS_COND_VC); // vc + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R1, REG_R3, INS_FLAGS_NC, INS_COND_HI); // hi + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R0, REG_R2, INS_FLAGS_NZ, INS_COND_LS); // ls + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R9, REG_R1, INS_FLAGS_NONE, INS_COND_GE); // ge + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R8, REG_R0, INS_FLAGS_NZV, INS_COND_LT); // lt + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R7, REG_R3, INS_FLAGS_NZC, INS_COND_GT); // gt + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R6, REG_R2, INS_FLAGS_NZCV, INS_COND_LE); // le + + // ccmp reg1, imm, nzcv, cond + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R9, 3, INS_FLAGS_V, INS_COND_EQ); // eq + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R8, 2, INS_FLAGS_C, INS_COND_NE); // ne + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R7, 1, INS_FLAGS_Z, INS_COND_HS); // hs + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R6, 0, INS_FLAGS_N, INS_COND_LO); // lo + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R5, 31, INS_FLAGS_CV, INS_COND_MI); // mi + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R4, 28, INS_FLAGS_ZV, INS_COND_PL); // pl + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R3, 25, INS_FLAGS_ZC, INS_COND_VS); // vs + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R2, 22, INS_FLAGS_NV, INS_COND_VC); // vc + 
theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R1, 19, INS_FLAGS_NC, INS_COND_HI); // hi + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R0, 16, INS_FLAGS_NZ, INS_COND_LS); // ls + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R9, 13, INS_FLAGS_NONE, INS_COND_GE); // ge + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R8, 10, INS_FLAGS_NZV, INS_COND_LT); // lt + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R7, 7, INS_FLAGS_NZC, INS_COND_GT); // gt + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R6, 4, INS_FLAGS_NZCV, INS_COND_LE); // le + + // ccmp reg1, imm, nzcv, cond -- encoded as ccmn + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R9, -3, INS_FLAGS_V, INS_COND_EQ); // eq + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R8, -2, INS_FLAGS_C, INS_COND_NE); // ne + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R7, -1, INS_FLAGS_Z, INS_COND_HS); // hs + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R6, -5, INS_FLAGS_N, INS_COND_LO); // lo + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R5, -31, INS_FLAGS_CV, INS_COND_MI); // mi + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R4, -28, INS_FLAGS_ZV, INS_COND_PL); // pl + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R3, -25, INS_FLAGS_ZC, INS_COND_VS); // vs + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R2, -22, INS_FLAGS_NV, INS_COND_VC); // vc + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R1, -19, INS_FLAGS_NC, INS_COND_HI); // hi + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R0, -16, INS_FLAGS_NZ, INS_COND_LS); // ls + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R9, -13, INS_FLAGS_NONE, INS_COND_GE); // ge + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R8, -10, INS_FLAGS_NZV, INS_COND_LT); // lt + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_8BYTE, REG_R7, -7, INS_FLAGS_NZC, 
INS_COND_GT); // gt + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmp, EA_4BYTE, REG_R6, -4, INS_FLAGS_NZCV, INS_COND_LE); // le + + // ccmn reg1, reg2, nzcv, cond + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmn, EA_8BYTE, REG_R9, REG_R3, INS_FLAGS_V, INS_COND_EQ); // eq + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmn, EA_4BYTE, REG_R8, REG_R2, INS_FLAGS_C, INS_COND_NE); // ne + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmn, EA_4BYTE, REG_R7, REG_R1, INS_FLAGS_Z, INS_COND_HS); // hs + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmn, EA_8BYTE, REG_R6, REG_R0, INS_FLAGS_N, INS_COND_LO); // lo + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmn, EA_8BYTE, REG_R5, REG_R3, INS_FLAGS_CV, INS_COND_MI); // mi + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmn, EA_4BYTE, REG_R4, REG_R2, INS_FLAGS_ZV, INS_COND_PL); // pl + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmn, EA_4BYTE, REG_R3, REG_R1, INS_FLAGS_ZC, INS_COND_VS); // vs + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmn, EA_8BYTE, REG_R2, REG_R0, INS_FLAGS_NV, INS_COND_VC); // vc + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmn, EA_8BYTE, REG_R1, REG_R3, INS_FLAGS_NC, INS_COND_HI); // hi + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmn, EA_4BYTE, REG_R0, REG_R2, INS_FLAGS_NZ, INS_COND_LS); // ls + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmn, EA_4BYTE, REG_R9, REG_R1, INS_FLAGS_NONE, INS_COND_GE); // ge + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmn, EA_8BYTE, REG_R8, REG_R0, INS_FLAGS_NZV, INS_COND_LT); // lt + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmn, EA_8BYTE, REG_R7, REG_R3, INS_FLAGS_NZC, INS_COND_GT); // gt + theEmitter->emitIns_R_R_FLAGS_COND(INS_ccmn, EA_4BYTE, REG_R6, REG_R2, INS_FLAGS_NZCV, INS_COND_LE); // le + + // ccmn reg1, imm, nzcv, cond + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmn, EA_8BYTE, REG_R9, 3, INS_FLAGS_V, INS_COND_EQ); // eq + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmn, EA_4BYTE, REG_R8, 2, INS_FLAGS_C, INS_COND_NE); // ne + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmn, EA_4BYTE, REG_R7, 1, INS_FLAGS_Z, INS_COND_HS); // hs + 
theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmn, EA_8BYTE, REG_R6, 0, INS_FLAGS_N, INS_COND_LO); // lo + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmn, EA_8BYTE, REG_R5, 31, INS_FLAGS_CV, INS_COND_MI); // mi + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmn, EA_4BYTE, REG_R4, 28, INS_FLAGS_ZV, INS_COND_PL); // pl + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmn, EA_4BYTE, REG_R3, 25, INS_FLAGS_ZC, INS_COND_VS); // vs + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmn, EA_8BYTE, REG_R2, 22, INS_FLAGS_NV, INS_COND_VC); // vc + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmn, EA_8BYTE, REG_R1, 19, INS_FLAGS_NC, INS_COND_HI); // hi + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmn, EA_4BYTE, REG_R0, 16, INS_FLAGS_NZ, INS_COND_LS); // ls + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmn, EA_4BYTE, REG_R9, 13, INS_FLAGS_NONE, INS_COND_GE); // ge + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmn, EA_8BYTE, REG_R8, 10, INS_FLAGS_NZV, INS_COND_LT); // lt + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmn, EA_8BYTE, REG_R7, 7, INS_FLAGS_NZC, INS_COND_GT); // gt + theEmitter->emitIns_R_I_FLAGS_COND(INS_ccmn, EA_4BYTE, REG_R6, 4, INS_FLAGS_NZCV, INS_COND_LE); // le + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // Branch to register + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + theEmitter->emitIns_R(INS_br, EA_PTRSIZE, REG_R8); + theEmitter->emitIns_R(INS_blr, EA_PTRSIZE, REG_R9); + theEmitter->emitIns_R(INS_ret, EA_PTRSIZE, REG_R8); + theEmitter->emitIns_R(INS_ret, EA_PTRSIZE, REG_LR); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // Misc + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + theEmitter->emitIns_I(INS_brk, EA_PTRSIZE, 0); + theEmitter->emitIns_I(INS_brk, EA_PTRSIZE, 65535); + + theEmitter->emitIns_BARR(INS_dsb, INS_BARRIER_OSHLD); + theEmitter->emitIns_BARR(INS_dmb, INS_BARRIER_OSHST); + theEmitter->emitIns_BARR(INS_isb, INS_BARRIER_OSH); + + theEmitter->emitIns_BARR(INS_dmb, INS_BARRIER_NSHLD); + 
theEmitter->emitIns_BARR(INS_isb, INS_BARRIER_NSHST); + theEmitter->emitIns_BARR(INS_dsb, INS_BARRIER_NSH); + + theEmitter->emitIns_BARR(INS_isb, INS_BARRIER_ISHLD); + theEmitter->emitIns_BARR(INS_dsb, INS_BARRIER_ISHST); + theEmitter->emitIns_BARR(INS_dmb, INS_BARRIER_ISH); + + theEmitter->emitIns_BARR(INS_dsb, INS_BARRIER_LD); + theEmitter->emitIns_BARR(INS_dmb, INS_BARRIER_ST); + theEmitter->emitIns_BARR(INS_isb, INS_BARRIER_SY); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + //////////////////////////////////////////////////////////////////////////////// + // + // SIMD and Floating point + // + //////////////////////////////////////////////////////////////////////////////// + + // + // Load/Stores vector register + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + // ldr/str Vt, [reg] + theEmitter->emitIns_R_R(INS_ldr, EA_8BYTE, REG_V1, REG_R9); + theEmitter->emitIns_R_R(INS_str, EA_8BYTE, REG_V2, REG_R8); + theEmitter->emitIns_R_R(INS_ldr, EA_4BYTE, REG_V3, REG_R7); + theEmitter->emitIns_R_R(INS_str, EA_4BYTE, REG_V4, REG_R6); + theEmitter->emitIns_R_R(INS_ldr, EA_2BYTE, REG_V5, REG_R5); + theEmitter->emitIns_R_R(INS_str, EA_2BYTE, REG_V6, REG_R4); + theEmitter->emitIns_R_R(INS_ldr, EA_1BYTE, REG_V7, REG_R3); + theEmitter->emitIns_R_R(INS_str, EA_1BYTE, REG_V8, REG_R2); + theEmitter->emitIns_R_R(INS_ldr, EA_16BYTE, REG_V9, REG_R1); + theEmitter->emitIns_R_R(INS_str, EA_16BYTE, REG_V10, REG_R0); + + // ldr/str Vt, [reg+cns] -- scaled + theEmitter->emitIns_R_R_I(INS_ldr, EA_1BYTE, REG_V8, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_ldr, EA_2BYTE, REG_V8, REG_R9, 2); + theEmitter->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_V8, REG_R9, 4); + theEmitter->emitIns_R_R_I(INS_ldr, EA_8BYTE, REG_V8, REG_R9, 8); + theEmitter->emitIns_R_R_I(INS_ldr, EA_16BYTE, REG_V8, REG_R9, 16); + + theEmitter->emitIns_R_R_I(INS_ldr, EA_1BYTE, REG_V7, REG_R10, 1); + theEmitter->emitIns_R_R_I(INS_ldr, EA_2BYTE, REG_V7, REG_R10, 2); + 
theEmitter->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_V7, REG_R10, 4); + theEmitter->emitIns_R_R_I(INS_ldr, EA_8BYTE, REG_V7, REG_R10, 8); + theEmitter->emitIns_R_R_I(INS_ldr, EA_16BYTE, REG_V7, REG_R10, 16); + + // ldr/str Vt, [reg],cns -- post-indexed (unscaled) + // ldr/str Vt, [reg+cns]! -- post-indexed (unscaled) + theEmitter->emitIns_R_R_I(INS_ldr, EA_1BYTE, REG_V8, REG_R9, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I(INS_ldr, EA_2BYTE, REG_V8, REG_R9, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_V8, REG_R9, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I(INS_ldr, EA_8BYTE, REG_V8, REG_R9, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I(INS_ldr, EA_16BYTE, REG_V8, REG_R9, 1, INS_OPTS_POST_INDEX); + + theEmitter->emitIns_R_R_I(INS_ldr, EA_1BYTE, REG_V8, REG_R9, 1, INS_OPTS_PRE_INDEX); + theEmitter->emitIns_R_R_I(INS_ldr, EA_2BYTE, REG_V8, REG_R9, 1, INS_OPTS_PRE_INDEX); + theEmitter->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_V8, REG_R9, 1, INS_OPTS_PRE_INDEX); + theEmitter->emitIns_R_R_I(INS_ldr, EA_8BYTE, REG_V8, REG_R9, 1, INS_OPTS_PRE_INDEX); + theEmitter->emitIns_R_R_I(INS_ldr, EA_16BYTE, REG_V8, REG_R9, 1, INS_OPTS_PRE_INDEX); + + theEmitter->emitIns_R_R_I(INS_str, EA_1BYTE, REG_V8, REG_R9, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I(INS_str, EA_2BYTE, REG_V8, REG_R9, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I(INS_str, EA_4BYTE, REG_V8, REG_R9, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I(INS_str, EA_8BYTE, REG_V8, REG_R9, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I(INS_str, EA_16BYTE, REG_V8, REG_R9, 1, INS_OPTS_POST_INDEX); + + theEmitter->emitIns_R_R_I(INS_str, EA_1BYTE, REG_V8, REG_R9, 1, INS_OPTS_PRE_INDEX); + theEmitter->emitIns_R_R_I(INS_str, EA_2BYTE, REG_V8, REG_R9, 1, INS_OPTS_PRE_INDEX); + theEmitter->emitIns_R_R_I(INS_str, EA_4BYTE, REG_V8, REG_R9, 1, INS_OPTS_PRE_INDEX); + theEmitter->emitIns_R_R_I(INS_str, EA_8BYTE, REG_V8, REG_R9, 1, INS_OPTS_PRE_INDEX); + 
theEmitter->emitIns_R_R_I(INS_str, EA_16BYTE, REG_V8, REG_R9, 1, INS_OPTS_PRE_INDEX); + + theEmitter->emitIns_R_R_I(INS_ldur, EA_1BYTE, REG_V8, REG_R9, 2); + theEmitter->emitIns_R_R_I(INS_ldur, EA_2BYTE, REG_V8, REG_R9, 3); + theEmitter->emitIns_R_R_I(INS_ldur, EA_4BYTE, REG_V8, REG_R9, 5); + theEmitter->emitIns_R_R_I(INS_ldur, EA_8BYTE, REG_V8, REG_R9, 9); + theEmitter->emitIns_R_R_I(INS_ldur, EA_16BYTE, REG_V8, REG_R9, 17); + + theEmitter->emitIns_R_R_I(INS_stur, EA_1BYTE, REG_V7, REG_R10, 2); + theEmitter->emitIns_R_R_I(INS_stur, EA_2BYTE, REG_V7, REG_R10, 3); + theEmitter->emitIns_R_R_I(INS_stur, EA_4BYTE, REG_V7, REG_R10, 5); + theEmitter->emitIns_R_R_I(INS_stur, EA_8BYTE, REG_V7, REG_R10, 9); + theEmitter->emitIns_R_R_I(INS_stur, EA_16BYTE, REG_V7, REG_R10, 17); + + // load/store pair + theEmitter->emitIns_R_R_R (INS_ldnp, EA_8BYTE, REG_V0, REG_V1, REG_R10); + theEmitter->emitIns_R_R_R_I(INS_stnp, EA_8BYTE, REG_V1, REG_V2, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_ldnp, EA_8BYTE, REG_V2, REG_V3, REG_R10, 8); + theEmitter->emitIns_R_R_R_I(INS_stnp, EA_8BYTE, REG_V3, REG_V4, REG_R10, 24); + + theEmitter->emitIns_R_R_R (INS_ldnp, EA_4BYTE, REG_V4, REG_V5, REG_SP); + theEmitter->emitIns_R_R_R_I(INS_stnp, EA_4BYTE, REG_V5, REG_V6, REG_SP, 0); + theEmitter->emitIns_R_R_R_I(INS_ldnp, EA_4BYTE, REG_V6, REG_V7, REG_SP, 4); + theEmitter->emitIns_R_R_R_I(INS_stnp, EA_4BYTE, REG_V7, REG_V8, REG_SP, 12); + + theEmitter->emitIns_R_R_R (INS_ldnp, EA_16BYTE, REG_V8, REG_V9, REG_R10); + theEmitter->emitIns_R_R_R_I(INS_stnp, EA_16BYTE, REG_V9, REG_V10, REG_R10, 0); + theEmitter->emitIns_R_R_R_I(INS_ldnp, EA_16BYTE, REG_V10, REG_V11, REG_R10, 16); + theEmitter->emitIns_R_R_R_I(INS_stnp, EA_16BYTE, REG_V11, REG_V12, REG_R10, 48); + + theEmitter->emitIns_R_R_R (INS_ldp, EA_8BYTE, REG_V0, REG_V1, REG_R10); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_V1, REG_V2, REG_SP, 0); + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, REG_V2, REG_V3, REG_SP, 8); + 
theEmitter->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_V3, REG_V4, REG_R10, 16); + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, REG_V4, REG_V5, REG_R10, 24, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_V5, REG_V6, REG_SP, 32, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, REG_V6, REG_V7, REG_SP, 40, INS_OPTS_PRE_INDEX); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_V7, REG_V8, REG_R10, 48, INS_OPTS_PRE_INDEX); + + theEmitter->emitIns_R_R_R (INS_ldp, EA_4BYTE, REG_V0, REG_V1, REG_R10); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_4BYTE, REG_V1, REG_V2, REG_SP, 0); + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_4BYTE, REG_V2, REG_V3, REG_SP, 4); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_4BYTE, REG_V3, REG_V4, REG_R10, 8); + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_4BYTE, REG_V4, REG_V5, REG_R10, 12, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_4BYTE, REG_V5, REG_V6, REG_SP, 16, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_4BYTE, REG_V6, REG_V7, REG_SP, 20, INS_OPTS_PRE_INDEX); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_4BYTE, REG_V7, REG_V8, REG_R10, 24, INS_OPTS_PRE_INDEX); + + theEmitter->emitIns_R_R_R (INS_ldp, EA_16BYTE, REG_V0, REG_V1, REG_R10); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_16BYTE, REG_V1, REG_V2, REG_SP, 0); + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_16BYTE, REG_V2, REG_V3, REG_SP, 16); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_16BYTE, REG_V3, REG_V4, REG_R10, 32); + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_16BYTE, REG_V4, REG_V5, REG_R10, 48, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_16BYTE, REG_V5, REG_V6, REG_SP, 64, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ldp, EA_16BYTE, REG_V6, REG_V7, REG_SP, 80, INS_OPTS_PRE_INDEX); + theEmitter->emitIns_R_R_R_I(INS_stp, EA_16BYTE, REG_V7, REG_V8, REG_R10, 96, INS_OPTS_PRE_INDEX); + + // LDR (register) + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_V1, REG_SP, REG_R9); + 
theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_V2, REG_R7, REG_R9, INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_V3, REG_R7, REG_R9, INS_OPTS_LSL, 3); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_V4, REG_R7, REG_R9, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_V5, REG_R7, REG_R9, INS_OPTS_SXTW, 3); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_V6, REG_SP, REG_R9, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_V7, REG_R7, REG_R9, INS_OPTS_UXTW, 3); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_V8, REG_R7, REG_R9, INS_OPTS_SXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_V9, REG_R7, REG_R9, INS_OPTS_SXTX, 3); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_V10, REG_R7, REG_R9, INS_OPTS_UXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_8BYTE, REG_V11, REG_SP, REG_R9, INS_OPTS_UXTX, 3); + + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_V1, REG_SP, REG_R9); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_V2, REG_R7, REG_R9, INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_V3, REG_R7, REG_R9, INS_OPTS_LSL, 2); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_V4, REG_R7, REG_R9, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_V5, REG_R7, REG_R9, INS_OPTS_SXTW, 2); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_V6, REG_SP, REG_R9, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_V7, REG_R7, REG_R9, INS_OPTS_UXTW, 2); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_V8, REG_R7, REG_R9, INS_OPTS_SXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_V9, REG_R7, REG_R9, INS_OPTS_SXTX, 2); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_V10, REG_R7, REG_R9, INS_OPTS_UXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_4BYTE, REG_V11, REG_SP, REG_R9, INS_OPTS_UXTX, 2); + + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_16BYTE, REG_V1, REG_SP, 
REG_R9); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_16BYTE, REG_V2, REG_R7, REG_R9, INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_16BYTE, REG_V3, REG_R7, REG_R9, INS_OPTS_LSL, 4); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_16BYTE, REG_V4, REG_R7, REG_R9, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_16BYTE, REG_V5, REG_R7, REG_R9, INS_OPTS_SXTW, 4); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_16BYTE, REG_V6, REG_SP, REG_R9, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_16BYTE, REG_V7, REG_R7, REG_R9, INS_OPTS_UXTW, 4); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_16BYTE, REG_V8, REG_R7, REG_R9, INS_OPTS_SXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_16BYTE, REG_V9, REG_R7, REG_R9, INS_OPTS_SXTX, 4); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_16BYTE, REG_V10, REG_R7, REG_R9, INS_OPTS_UXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_16BYTE, REG_V11, REG_SP, REG_R9, INS_OPTS_UXTX, 4); + + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_2BYTE, REG_V1, REG_SP, REG_R9); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_2BYTE, REG_V2, REG_R7, REG_R9, INS_OPTS_LSL); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_2BYTE, REG_V3, REG_R7, REG_R9, INS_OPTS_LSL, 1); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_2BYTE, REG_V4, REG_R7, REG_R9, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_2BYTE, REG_V5, REG_R7, REG_R9, INS_OPTS_SXTW, 1); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_2BYTE, REG_V6, REG_SP, REG_R9, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_2BYTE, REG_V7, REG_R7, REG_R9, INS_OPTS_UXTW, 1); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_2BYTE, REG_V8, REG_R7, REG_R9, INS_OPTS_SXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_2BYTE, REG_V9, REG_R7, REG_R9, INS_OPTS_SXTX, 1); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_2BYTE, REG_V10, REG_R7, REG_R9, INS_OPTS_UXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_2BYTE, REG_V11, REG_SP, REG_R9, INS_OPTS_UXTX, 1); + + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_1BYTE, 
REG_V1, REG_R7, REG_R9); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_1BYTE, REG_V2, REG_SP, REG_R9, INS_OPTS_SXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_1BYTE, REG_V3, REG_R7, REG_R9, INS_OPTS_UXTW); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_1BYTE, REG_V4, REG_SP, REG_R9, INS_OPTS_SXTX); + theEmitter->emitIns_R_R_R_Ext(INS_ldr, EA_1BYTE, REG_V5, REG_R7, REG_R9, INS_OPTS_UXTX); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R mov and aliases for mov + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + // mov vector to vector + theEmitter->emitIns_R_R(INS_mov, EA_8BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_mov, EA_16BYTE, REG_V2, REG_V3); + + theEmitter->emitIns_R_R(INS_mov, EA_4BYTE, REG_V12, REG_V13); + theEmitter->emitIns_R_R(INS_mov, EA_2BYTE, REG_V14, REG_V15); + theEmitter->emitIns_R_R(INS_mov, EA_1BYTE, REG_V16, REG_V17); + + // mov vector to general + theEmitter->emitIns_R_R(INS_mov, EA_8BYTE, REG_R0, REG_V4); + theEmitter->emitIns_R_R(INS_mov, EA_4BYTE, REG_R1, REG_V5); + theEmitter->emitIns_R_R(INS_mov, EA_2BYTE, REG_R2, REG_V6); + theEmitter->emitIns_R_R(INS_mov, EA_1BYTE, REG_R3, REG_V7); + + // mov general to vector + theEmitter->emitIns_R_R(INS_mov, EA_8BYTE, REG_V8, REG_R4); + theEmitter->emitIns_R_R(INS_mov, EA_4BYTE, REG_V9, REG_R5); + theEmitter->emitIns_R_R(INS_mov, EA_2BYTE, REG_V10, REG_R6); + theEmitter->emitIns_R_R(INS_mov, EA_1BYTE, REG_V11, REG_R7); + + // mov vector[index] to vector + theEmitter->emitIns_R_R_I(INS_mov, EA_8BYTE, REG_V0, REG_V1, 1); + theEmitter->emitIns_R_R_I(INS_mov, EA_4BYTE, REG_V2, REG_V3, 3); + theEmitter->emitIns_R_R_I(INS_mov, EA_2BYTE, REG_V4, REG_V5, 7); + theEmitter->emitIns_R_R_I(INS_mov, EA_1BYTE, REG_V6, REG_V7, 15); + + // mov to general from vector[index] + theEmitter->emitIns_R_R_I(INS_mov, EA_8BYTE, REG_R8, REG_V16, 1); + theEmitter->emitIns_R_R_I(INS_mov, EA_4BYTE, REG_R9, REG_V17, 2); + theEmitter->emitIns_R_R_I(INS_mov, EA_2BYTE, REG_R10, REG_V18, 3); + 
theEmitter->emitIns_R_R_I(INS_mov, EA_1BYTE, REG_R11, REG_V19, 4); + + // mov to vector[index] from general + theEmitter->emitIns_R_R_I(INS_mov, EA_8BYTE, REG_V20, REG_R12, 1); + theEmitter->emitIns_R_R_I(INS_mov, EA_4BYTE, REG_V21, REG_R13, 2); + theEmitter->emitIns_R_R_I(INS_mov, EA_2BYTE, REG_V22, REG_R14, 6); + theEmitter->emitIns_R_R_I(INS_mov, EA_1BYTE, REG_V23, REG_R15, 8); + + // mov vector[index] to vector[index2] + theEmitter->emitIns_R_R_I_I(INS_mov, EA_8BYTE, REG_V8, REG_V9, 1, 0); + theEmitter->emitIns_R_R_I_I(INS_mov, EA_4BYTE, REG_V10, REG_V11, 2, 1); + theEmitter->emitIns_R_R_I_I(INS_mov, EA_2BYTE, REG_V12, REG_V13, 5, 2); + theEmitter->emitIns_R_R_I_I(INS_mov, EA_1BYTE, REG_V14, REG_V15, 12, 3); + + ////////////////////////////////////////////////////////////////////////////////// + + // mov/dup scalar + theEmitter->emitIns_R_R_I(INS_dup, EA_8BYTE, REG_V24, REG_V25, 1); + theEmitter->emitIns_R_R_I(INS_dup, EA_4BYTE, REG_V26, REG_V27, 3); + theEmitter->emitIns_R_R_I(INS_dup, EA_2BYTE, REG_V28, REG_V29, 7); + theEmitter->emitIns_R_R_I(INS_dup, EA_1BYTE, REG_V30, REG_V31, 15); + + // mov/ins vector element + theEmitter->emitIns_R_R_I_I(INS_ins, EA_8BYTE, REG_V0, REG_V1, 0, 1); + theEmitter->emitIns_R_R_I_I(INS_ins, EA_4BYTE, REG_V2, REG_V3, 2, 2); + theEmitter->emitIns_R_R_I_I(INS_ins, EA_2BYTE, REG_V4, REG_V5, 4, 3); + theEmitter->emitIns_R_R_I_I(INS_ins, EA_1BYTE, REG_V6, REG_V7, 8, 4); + + // umov to general from vector element + theEmitter->emitIns_R_R_I(INS_umov, EA_8BYTE, REG_R0, REG_V8, 1); + theEmitter->emitIns_R_R_I(INS_umov, EA_4BYTE, REG_R1, REG_V9, 2); + theEmitter->emitIns_R_R_I(INS_umov, EA_2BYTE, REG_R2, REG_V10, 4); + theEmitter->emitIns_R_R_I(INS_umov, EA_1BYTE, REG_R3, REG_V11, 8); + + // ins to vector element from general + theEmitter->emitIns_R_R_I(INS_ins, EA_8BYTE, REG_V12, REG_R4, 1); + theEmitter->emitIns_R_R_I(INS_ins, EA_4BYTE, REG_V13, REG_R5, 3); + theEmitter->emitIns_R_R_I(INS_ins, EA_2BYTE, REG_V14, REG_R6, 7); + 
theEmitter->emitIns_R_R_I(INS_ins, EA_1BYTE, REG_V15, REG_R7, 15); + + // smov to general from vector element + theEmitter->emitIns_R_R_I(INS_smov, EA_4BYTE, REG_R5, REG_V17, 2); + theEmitter->emitIns_R_R_I(INS_smov, EA_2BYTE, REG_R6, REG_V18, 4); + theEmitter->emitIns_R_R_I(INS_smov, EA_1BYTE, REG_R7, REG_V19, 8); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_I movi and mvni + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + // movi imm8 (vector) + theEmitter->emitIns_R_I(INS_movi, EA_8BYTE, REG_V0, 0x00, INS_OPTS_8B); + theEmitter->emitIns_R_I(INS_movi, EA_8BYTE, REG_V1, 0xFF, INS_OPTS_8B); + theEmitter->emitIns_R_I(INS_movi, EA_16BYTE, REG_V2, 0x00, INS_OPTS_16B); + theEmitter->emitIns_R_I(INS_movi, EA_16BYTE, REG_V3, 0xFF, INS_OPTS_16B); + + theEmitter->emitIns_R_I(INS_movi, EA_8BYTE, REG_V4, 0x007F, INS_OPTS_4H); + theEmitter->emitIns_R_I(INS_movi, EA_8BYTE, REG_V5, 0x7F00, INS_OPTS_4H); // LSL 8 + theEmitter->emitIns_R_I(INS_movi, EA_16BYTE, REG_V6, 0x003F, INS_OPTS_8H); + theEmitter->emitIns_R_I(INS_movi, EA_16BYTE, REG_V7, 0x3F00, INS_OPTS_8H); // LSL 8 + + theEmitter->emitIns_R_I(INS_movi, EA_8BYTE, REG_V8, 0x1F, INS_OPTS_2S); + theEmitter->emitIns_R_I(INS_movi, EA_8BYTE, REG_V9, 0x1F00, INS_OPTS_2S); // LSL 8 + theEmitter->emitIns_R_I(INS_movi, EA_8BYTE, REG_V10, 0x1F0000, INS_OPTS_2S); // LSL 16 + theEmitter->emitIns_R_I(INS_movi, EA_8BYTE, REG_V11, 0x1F000000, INS_OPTS_2S); // LSL 24 + + theEmitter->emitIns_R_I(INS_movi, EA_8BYTE, REG_V12, 0x1FFF, INS_OPTS_2S); // MSL 8 + theEmitter->emitIns_R_I(INS_movi, EA_8BYTE, REG_V13, 0x1FFFFF, INS_OPTS_2S); // MSL 16 + + theEmitter->emitIns_R_I(INS_movi, EA_16BYTE, REG_V14, 0x37, INS_OPTS_4S); + theEmitter->emitIns_R_I(INS_movi, EA_16BYTE, REG_V15, 0x3700, INS_OPTS_4S); // LSL 8 + theEmitter->emitIns_R_I(INS_movi, EA_16BYTE, REG_V16, 0x370000, INS_OPTS_4S); // LSL 16 + theEmitter->emitIns_R_I(INS_movi, EA_16BYTE, REG_V17, 0x37000000, INS_OPTS_4S); // LSL 24 + + theEmitter->emitIns_R_I(INS_movi, EA_16BYTE, 
REG_V18, 0x37FF, INS_OPTS_4S); // MSL 8 + theEmitter->emitIns_R_I(INS_movi, EA_16BYTE, REG_V19, 0x37FFFF, INS_OPTS_4S); // MSL 16 + + theEmitter->emitIns_R_I(INS_movi, EA_8BYTE, REG_V20, 0xFF80, INS_OPTS_4H); // mvni + theEmitter->emitIns_R_I(INS_movi, EA_16BYTE, REG_V21, 0xFFC0, INS_OPTS_8H); // mvni + + theEmitter->emitIns_R_I(INS_movi, EA_8BYTE, REG_V22, 0xFFFFFFE0, INS_OPTS_2S); // mvni + theEmitter->emitIns_R_I(INS_movi, EA_16BYTE, REG_V23, 0xFFFFF0FF, INS_OPTS_4S); // mvni LSL 8 + theEmitter->emitIns_R_I(INS_movi, EA_8BYTE, REG_V24, 0xFFF8FFFF, INS_OPTS_2S); // mvni LSL 16 + theEmitter->emitIns_R_I(INS_movi, EA_16BYTE, REG_V25, 0xFCFFFFFF, INS_OPTS_4S); // mvni LSL 24 + + theEmitter->emitIns_R_I(INS_movi, EA_8BYTE, REG_V26, 0xFFFFFE00, INS_OPTS_2S); // mvni MSL 8 + theEmitter->emitIns_R_I(INS_movi, EA_16BYTE, REG_V27, 0xFFFC0000, INS_OPTS_4S); // mvni MSL 16 + + theEmitter->emitIns_R_I(INS_movi, EA_8BYTE, REG_V28, 0x00FF00FF00FF00FF, INS_OPTS_1D); + theEmitter->emitIns_R_I(INS_movi, EA_16BYTE, REG_V29, 0x00FFFF0000FFFF00, INS_OPTS_2D); + theEmitter->emitIns_R_I(INS_movi, EA_8BYTE, REG_V30, 0xFF000000FF000000); + theEmitter->emitIns_R_I(INS_movi, EA_16BYTE, REG_V31, 0x0, INS_OPTS_2D); + + theEmitter->emitIns_R_I(INS_mvni, EA_8BYTE, REG_V0, 0x0022, INS_OPTS_4H); + theEmitter->emitIns_R_I(INS_mvni, EA_8BYTE, REG_V1, 0x2200, INS_OPTS_4H); // LSL 8 + theEmitter->emitIns_R_I(INS_mvni, EA_16BYTE, REG_V2, 0x0033, INS_OPTS_8H); + theEmitter->emitIns_R_I(INS_mvni, EA_16BYTE, REG_V3, 0x3300, INS_OPTS_8H); // LSL 8 + + theEmitter->emitIns_R_I(INS_mvni, EA_8BYTE, REG_V4, 0x42, INS_OPTS_2S); + theEmitter->emitIns_R_I(INS_mvni, EA_8BYTE, REG_V5, 0x4200, INS_OPTS_2S); // LSL 8 + theEmitter->emitIns_R_I(INS_mvni, EA_8BYTE, REG_V6, 0x420000, INS_OPTS_2S); // LSL 16 + theEmitter->emitIns_R_I(INS_mvni, EA_8BYTE, REG_V7, 0x42000000, INS_OPTS_2S); // LSL 24 + + theEmitter->emitIns_R_I(INS_mvni, EA_8BYTE, REG_V8, 0x42FF, INS_OPTS_2S); // MSL 8 + theEmitter->emitIns_R_I(INS_mvni, 
EA_8BYTE, REG_V9, 0x42FFFF, INS_OPTS_2S); // MSL 16 + + theEmitter->emitIns_R_I(INS_mvni, EA_16BYTE, REG_V10, 0x5D, INS_OPTS_4S); + theEmitter->emitIns_R_I(INS_mvni, EA_16BYTE, REG_V11, 0x5D00, INS_OPTS_4S); // LSL 8 + theEmitter->emitIns_R_I(INS_mvni, EA_16BYTE, REG_V12, 0x5D0000, INS_OPTS_4S); // LSL 16 + theEmitter->emitIns_R_I(INS_mvni, EA_16BYTE, REG_V13, 0x5D000000, INS_OPTS_4S); // LSL 24 + + theEmitter->emitIns_R_I(INS_mvni, EA_16BYTE, REG_V14, 0x5DFF, INS_OPTS_4S); // MSL 8 + theEmitter->emitIns_R_I(INS_mvni, EA_16BYTE, REG_V15, 0x5DFFFF, INS_OPTS_4S); // MSL 16 + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_I orr/bic vector immediate + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + theEmitter->emitIns_R_I(INS_orr, EA_8BYTE, REG_V0, 0x0022, INS_OPTS_4H); + theEmitter->emitIns_R_I(INS_orr, EA_8BYTE, REG_V1, 0x2200, INS_OPTS_4H); // LSL 8 + theEmitter->emitIns_R_I(INS_orr, EA_16BYTE, REG_V2, 0x0033, INS_OPTS_8H); + theEmitter->emitIns_R_I(INS_orr, EA_16BYTE, REG_V3, 0x3300, INS_OPTS_8H); // LSL 8 + + theEmitter->emitIns_R_I(INS_orr, EA_8BYTE, REG_V4, 0x42, INS_OPTS_2S); + theEmitter->emitIns_R_I(INS_orr, EA_8BYTE, REG_V5, 0x4200, INS_OPTS_2S); // LSL 8 + theEmitter->emitIns_R_I(INS_orr, EA_8BYTE, REG_V6, 0x420000, INS_OPTS_2S); // LSL 16 + theEmitter->emitIns_R_I(INS_orr, EA_8BYTE, REG_V7, 0x42000000, INS_OPTS_2S); // LSL 24 + + theEmitter->emitIns_R_I(INS_orr, EA_16BYTE, REG_V10, 0x5D, INS_OPTS_4S); + theEmitter->emitIns_R_I(INS_orr, EA_16BYTE, REG_V11, 0x5D00, INS_OPTS_4S); // LSL 8 + theEmitter->emitIns_R_I(INS_orr, EA_16BYTE, REG_V12, 0x5D0000, INS_OPTS_4S); // LSL 16 + theEmitter->emitIns_R_I(INS_orr, EA_16BYTE, REG_V13, 0x5D000000, INS_OPTS_4S); // LSL 24 + + theEmitter->emitIns_R_I(INS_bic, EA_8BYTE, REG_V0, 0x0022, INS_OPTS_4H); + theEmitter->emitIns_R_I(INS_bic, EA_8BYTE, REG_V1, 0x2200, INS_OPTS_4H); // LSL 8 + theEmitter->emitIns_R_I(INS_bic, EA_16BYTE, REG_V2, 0x0033, INS_OPTS_8H); + theEmitter->emitIns_R_I(INS_bic, EA_16BYTE, REG_V3, 
0x3300, INS_OPTS_8H); // LSL 8 + + theEmitter->emitIns_R_I(INS_bic, EA_8BYTE, REG_V4, 0x42, INS_OPTS_2S); + theEmitter->emitIns_R_I(INS_bic, EA_8BYTE, REG_V5, 0x4200, INS_OPTS_2S); // LSL 8 + theEmitter->emitIns_R_I(INS_bic, EA_8BYTE, REG_V6, 0x420000, INS_OPTS_2S); // LSL 16 + theEmitter->emitIns_R_I(INS_bic, EA_8BYTE, REG_V7, 0x42000000, INS_OPTS_2S); // LSL 24 + + theEmitter->emitIns_R_I(INS_bic, EA_16BYTE, REG_V10, 0x5D, INS_OPTS_4S); + theEmitter->emitIns_R_I(INS_bic, EA_16BYTE, REG_V11, 0x5D00, INS_OPTS_4S); // LSL 8 + theEmitter->emitIns_R_I(INS_bic, EA_16BYTE, REG_V12, 0x5D0000, INS_OPTS_4S); // LSL 16 + theEmitter->emitIns_R_I(INS_bic, EA_16BYTE, REG_V13, 0x5D000000, INS_OPTS_4S); // LSL 24 + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_F cmp/fmov immediate + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + // fmov imm8 (scalar) + theEmitter->emitIns_R_F(INS_fmov, EA_8BYTE, REG_V14, 1.0); + theEmitter->emitIns_R_F(INS_fmov, EA_4BYTE, REG_V15, -1.0); + theEmitter->emitIns_R_F(INS_fmov, EA_4BYTE, REG_V0, 2.0); // encodes imm8 == 0 + theEmitter->emitIns_R_F(INS_fmov, EA_4BYTE, REG_V16, 10.0); + theEmitter->emitIns_R_F(INS_fmov, EA_8BYTE, REG_V17, -10.0); + theEmitter->emitIns_R_F(INS_fmov, EA_8BYTE, REG_V18, 31); // Largest encodable value + theEmitter->emitIns_R_F(INS_fmov, EA_4BYTE, REG_V19, -31); + theEmitter->emitIns_R_F(INS_fmov, EA_4BYTE, REG_V20, 1.25); + theEmitter->emitIns_R_F(INS_fmov, EA_8BYTE, REG_V21, -1.25); + theEmitter->emitIns_R_F(INS_fmov, EA_8BYTE, REG_V22, 0.125); // Smallest encodable value + theEmitter->emitIns_R_F(INS_fmov, EA_4BYTE, REG_V23, -0.125); + + // fmov imm8 (vector) + theEmitter->emitIns_R_F(INS_fmov, EA_8BYTE, REG_V0, 2.0, INS_OPTS_2S); + theEmitter->emitIns_R_F(INS_fmov, EA_8BYTE, REG_V24, 1.0, INS_OPTS_2S); + theEmitter->emitIns_R_F(INS_fmov, EA_16BYTE, REG_V25, 1.0, INS_OPTS_4S); + theEmitter->emitIns_R_F(INS_fmov, EA_16BYTE, REG_V26, 1.0, INS_OPTS_2D); + theEmitter->emitIns_R_F(INS_fmov, EA_8BYTE, REG_V27, -10.0, 
INS_OPTS_2S); + theEmitter->emitIns_R_F(INS_fmov, EA_16BYTE, REG_V28, -10.0, INS_OPTS_4S); + theEmitter->emitIns_R_F(INS_fmov, EA_16BYTE, REG_V29, -10.0, INS_OPTS_2D); + theEmitter->emitIns_R_F(INS_fmov, EA_8BYTE, REG_V30, 31.0, INS_OPTS_2S); + theEmitter->emitIns_R_F(INS_fmov, EA_16BYTE, REG_V31, 31.0, INS_OPTS_4S); + theEmitter->emitIns_R_F(INS_fmov, EA_16BYTE, REG_V0, 31.0, INS_OPTS_2D); + theEmitter->emitIns_R_F(INS_fmov, EA_8BYTE, REG_V1, -0.125, INS_OPTS_2S); + theEmitter->emitIns_R_F(INS_fmov, EA_16BYTE, REG_V2, -0.125, INS_OPTS_4S); + theEmitter->emitIns_R_F(INS_fmov, EA_16BYTE, REG_V3, -0.125, INS_OPTS_2D); + + // fcmp with 0.0 + theEmitter->emitIns_R_F(INS_fcmp, EA_8BYTE, REG_V12, 0.0); + theEmitter->emitIns_R_F(INS_fcmp, EA_4BYTE, REG_V13, 0.0); + theEmitter->emitIns_R_F(INS_fcmpe, EA_8BYTE, REG_V14, 0.0); + theEmitter->emitIns_R_F(INS_fcmpe, EA_4BYTE, REG_V15, 0.0); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R fmov/fcmp/fcvt + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + // fmov to vector to vector + theEmitter->emitIns_R_R(INS_fmov, EA_8BYTE, REG_V0, REG_V2); + theEmitter->emitIns_R_R(INS_fmov, EA_4BYTE, REG_V1, REG_V3); + + // fmov to vector to general + theEmitter->emitIns_R_R(INS_fmov, EA_8BYTE, REG_R0, REG_V4); + theEmitter->emitIns_R_R(INS_fmov, EA_4BYTE, REG_R1, REG_V5); + // using the optional conversion specifier + theEmitter->emitIns_R_R(INS_fmov, EA_8BYTE, REG_R2, REG_V6, INS_OPTS_D_TO_8BYTE); + theEmitter->emitIns_R_R(INS_fmov, EA_4BYTE, REG_R3, REG_V7, INS_OPTS_S_TO_4BYTE); + + // fmov to general to vector + theEmitter->emitIns_R_R(INS_fmov, EA_8BYTE, REG_V8, REG_R4); + theEmitter->emitIns_R_R(INS_fmov, EA_4BYTE, REG_V9, REG_R5); + // using the optional conversion specifier + theEmitter->emitIns_R_R(INS_fmov, EA_8BYTE, REG_V10, REG_R6, INS_OPTS_8BYTE_TO_D); + theEmitter->emitIns_R_R(INS_fmov, EA_4BYTE, REG_V11, REG_R7, INS_OPTS_4BYTE_TO_S); + + // fcmp/fcmpe + theEmitter->emitIns_R_R(INS_fcmp, EA_8BYTE, REG_V8, REG_V16); + 
theEmitter->emitIns_R_R(INS_fcmp, EA_4BYTE, REG_V9, REG_V17); + theEmitter->emitIns_R_R(INS_fcmpe, EA_8BYTE, REG_V10, REG_V18); + theEmitter->emitIns_R_R(INS_fcmpe, EA_4BYTE, REG_V11, REG_V19); + + // fcvt + theEmitter->emitIns_R_R(INS_fcvt, EA_8BYTE, REG_V24, REG_V25, INS_OPTS_S_TO_D); // Single to Double + theEmitter->emitIns_R_R(INS_fcvt, EA_4BYTE, REG_V26, REG_V27, INS_OPTS_D_TO_S); // Double to Single + + theEmitter->emitIns_R_R(INS_fcvt, EA_4BYTE, REG_V1, REG_V2, INS_OPTS_H_TO_S); + theEmitter->emitIns_R_R(INS_fcvt, EA_8BYTE, REG_V3, REG_V4, INS_OPTS_H_TO_D); + + theEmitter->emitIns_R_R(INS_fcvt, EA_2BYTE, REG_V5, REG_V6, INS_OPTS_S_TO_H); + theEmitter->emitIns_R_R(INS_fcvt, EA_2BYTE, REG_V7, REG_V8, INS_OPTS_D_TO_H); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R floating point conversions + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + // fcvtas scalar + theEmitter->emitIns_R_R(INS_fcvtas, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_fcvtas, EA_8BYTE, REG_V2, REG_V3); + + // fcvtas scalar to general + theEmitter->emitIns_R_R(INS_fcvtas, EA_4BYTE, REG_R0, REG_V4, INS_OPTS_S_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtas, EA_4BYTE, REG_R1, REG_V5, INS_OPTS_D_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtas, EA_8BYTE, REG_R2, REG_V6, INS_OPTS_S_TO_8BYTE); + theEmitter->emitIns_R_R(INS_fcvtas, EA_8BYTE, REG_R3, REG_V7, INS_OPTS_D_TO_8BYTE); + + // fcvtas vector + theEmitter->emitIns_R_R(INS_fcvtas, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_fcvtas, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_fcvtas, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_2D); + + // fcvtau scalar + theEmitter->emitIns_R_R(INS_fcvtau, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_fcvtau, EA_8BYTE, REG_V2, REG_V3); + + // fcvtau scalar to general + theEmitter->emitIns_R_R(INS_fcvtau, EA_4BYTE, REG_R0, REG_V4, INS_OPTS_S_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtau, EA_4BYTE, REG_R1, REG_V5, 
INS_OPTS_D_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtau, EA_8BYTE, REG_R2, REG_V6, INS_OPTS_S_TO_8BYTE); + theEmitter->emitIns_R_R(INS_fcvtau, EA_8BYTE, REG_R3, REG_V7, INS_OPTS_D_TO_8BYTE); + + // fcvtau vector + theEmitter->emitIns_R_R(INS_fcvtau, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_fcvtau, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_fcvtau, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_2D); + + //////////////////////////////////////////////////////////////////////////////// + + // fcvtms scalar + theEmitter->emitIns_R_R(INS_fcvtms, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_fcvtms, EA_8BYTE, REG_V2, REG_V3); + + // fcvtms scalar to general + theEmitter->emitIns_R_R(INS_fcvtms, EA_4BYTE, REG_R0, REG_V4, INS_OPTS_S_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtms, EA_4BYTE, REG_R1, REG_V5, INS_OPTS_D_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtms, EA_8BYTE, REG_R2, REG_V6, INS_OPTS_S_TO_8BYTE); + theEmitter->emitIns_R_R(INS_fcvtms, EA_8BYTE, REG_R3, REG_V7, INS_OPTS_D_TO_8BYTE); + + // fcvtms vector + theEmitter->emitIns_R_R(INS_fcvtms, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_fcvtms, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_fcvtms, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_2D); + + // fcvtmu scalar + theEmitter->emitIns_R_R(INS_fcvtmu, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_fcvtmu, EA_8BYTE, REG_V2, REG_V3); + + // fcvtmu scalar to general + theEmitter->emitIns_R_R(INS_fcvtmu, EA_4BYTE, REG_R0, REG_V4, INS_OPTS_S_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtmu, EA_4BYTE, REG_R1, REG_V5, INS_OPTS_D_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtmu, EA_8BYTE, REG_R2, REG_V6, INS_OPTS_S_TO_8BYTE); + theEmitter->emitIns_R_R(INS_fcvtmu, EA_8BYTE, REG_R3, REG_V7, INS_OPTS_D_TO_8BYTE); + + // fcvtmu vector + theEmitter->emitIns_R_R(INS_fcvtmu, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_fcvtmu, 
EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_fcvtmu, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_2D); + + //////////////////////////////////////////////////////////////////////////////// + + // fcvtns scalar + theEmitter->emitIns_R_R(INS_fcvtns, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_fcvtns, EA_8BYTE, REG_V2, REG_V3); + + // fcvtns scalar to general + theEmitter->emitIns_R_R(INS_fcvtns, EA_4BYTE, REG_R0, REG_V4, INS_OPTS_S_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtns, EA_4BYTE, REG_R1, REG_V5, INS_OPTS_D_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtns, EA_8BYTE, REG_R2, REG_V6, INS_OPTS_S_TO_8BYTE); + theEmitter->emitIns_R_R(INS_fcvtns, EA_8BYTE, REG_R3, REG_V7, INS_OPTS_D_TO_8BYTE); + + // fcvtns vector + theEmitter->emitIns_R_R(INS_fcvtns, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_fcvtns, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_fcvtns, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_2D); + + // fcvtnu scalar + theEmitter->emitIns_R_R(INS_fcvtnu, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_fcvtnu, EA_8BYTE, REG_V2, REG_V3); + + // fcvtnu scalar to general + theEmitter->emitIns_R_R(INS_fcvtnu, EA_4BYTE, REG_R0, REG_V4, INS_OPTS_S_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtnu, EA_4BYTE, REG_R1, REG_V5, INS_OPTS_D_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtnu, EA_8BYTE, REG_R2, REG_V6, INS_OPTS_S_TO_8BYTE); + theEmitter->emitIns_R_R(INS_fcvtnu, EA_8BYTE, REG_R3, REG_V7, INS_OPTS_D_TO_8BYTE); + + // fcvtnu vector + theEmitter->emitIns_R_R(INS_fcvtnu, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_fcvtnu, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_fcvtnu, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_2D); + + //////////////////////////////////////////////////////////////////////////////// + + // fcvtps scalar + theEmitter->emitIns_R_R(INS_fcvtps, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_fcvtps, 
EA_8BYTE, REG_V2, REG_V3); + + // fcvtps scalar to general + theEmitter->emitIns_R_R(INS_fcvtps, EA_4BYTE, REG_R0, REG_V4, INS_OPTS_S_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtps, EA_4BYTE, REG_R1, REG_V5, INS_OPTS_D_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtps, EA_8BYTE, REG_R2, REG_V6, INS_OPTS_S_TO_8BYTE); + theEmitter->emitIns_R_R(INS_fcvtps, EA_8BYTE, REG_R3, REG_V7, INS_OPTS_D_TO_8BYTE); + + // fcvtps vector + theEmitter->emitIns_R_R(INS_fcvtps, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_fcvtps, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_fcvtps, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_2D); + + // fcvtpu scalar + theEmitter->emitIns_R_R(INS_fcvtpu, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_fcvtpu, EA_8BYTE, REG_V2, REG_V3); + + // fcvtpu scalar to general + theEmitter->emitIns_R_R(INS_fcvtpu, EA_4BYTE, REG_R0, REG_V4, INS_OPTS_S_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtpu, EA_4BYTE, REG_R1, REG_V5, INS_OPTS_D_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtpu, EA_8BYTE, REG_R2, REG_V6, INS_OPTS_S_TO_8BYTE); + theEmitter->emitIns_R_R(INS_fcvtpu, EA_8BYTE, REG_R3, REG_V7, INS_OPTS_D_TO_8BYTE); + + // fcvtpu vector + theEmitter->emitIns_R_R(INS_fcvtpu, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_fcvtpu, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_fcvtpu, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_2D); + + //////////////////////////////////////////////////////////////////////////////// + + // fcvtzs scalar + theEmitter->emitIns_R_R(INS_fcvtzs, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_fcvtzs, EA_8BYTE, REG_V2, REG_V3); + + // fcvtzs scalar to general + theEmitter->emitIns_R_R(INS_fcvtzs, EA_4BYTE, REG_R0, REG_V4, INS_OPTS_S_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtzs, EA_4BYTE, REG_R1, REG_V5, INS_OPTS_D_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtzs, EA_8BYTE, REG_R2, REG_V6, INS_OPTS_S_TO_8BYTE); + 
theEmitter->emitIns_R_R(INS_fcvtzs, EA_8BYTE, REG_R3, REG_V7, INS_OPTS_D_TO_8BYTE); + + // fcvtzs vector + theEmitter->emitIns_R_R(INS_fcvtzs, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_fcvtzs, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_fcvtzs, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_2D); + + // fcvtzu scalar + theEmitter->emitIns_R_R(INS_fcvtzu, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_fcvtzu, EA_8BYTE, REG_V2, REG_V3); + + // fcvtzu scalar to general + theEmitter->emitIns_R_R(INS_fcvtzu, EA_4BYTE, REG_R0, REG_V4, INS_OPTS_S_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtzu, EA_4BYTE, REG_R1, REG_V5, INS_OPTS_D_TO_4BYTE); + theEmitter->emitIns_R_R(INS_fcvtzu, EA_8BYTE, REG_R2, REG_V6, INS_OPTS_S_TO_8BYTE); + theEmitter->emitIns_R_R(INS_fcvtzu, EA_8BYTE, REG_R3, REG_V7, INS_OPTS_D_TO_8BYTE); + + // fcvtzu vector + theEmitter->emitIns_R_R(INS_fcvtzu, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_fcvtzu, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_fcvtzu, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_2D); + + //////////////////////////////////////////////////////////////////////////////// + + // scvtf scalar + theEmitter->emitIns_R_R(INS_scvtf, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_scvtf, EA_8BYTE, REG_V2, REG_V3); + + // scvtf scalar from general + theEmitter->emitIns_R_R(INS_scvtf, EA_4BYTE, REG_V4, REG_R0, INS_OPTS_4BYTE_TO_S); + theEmitter->emitIns_R_R(INS_scvtf, EA_4BYTE, REG_V5, REG_R1, INS_OPTS_8BYTE_TO_S); + theEmitter->emitIns_R_R(INS_scvtf, EA_8BYTE, REG_V6, REG_R2, INS_OPTS_4BYTE_TO_D); + theEmitter->emitIns_R_R(INS_scvtf, EA_8BYTE, REG_V7, REG_R3, INS_OPTS_8BYTE_TO_D); + + // scvtf vector + theEmitter->emitIns_R_R(INS_scvtf, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_scvtf, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_scvtf, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_2D); + + 
// ucvtf scalar + theEmitter->emitIns_R_R(INS_ucvtf, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_ucvtf, EA_8BYTE, REG_V2, REG_V3); + + // ucvtf scalar from general + theEmitter->emitIns_R_R(INS_ucvtf, EA_4BYTE, REG_V4, REG_R0, INS_OPTS_4BYTE_TO_S); + theEmitter->emitIns_R_R(INS_ucvtf, EA_4BYTE, REG_V5, REG_R1, INS_OPTS_8BYTE_TO_S); + theEmitter->emitIns_R_R(INS_ucvtf, EA_8BYTE, REG_V6, REG_R2, INS_OPTS_4BYTE_TO_D); + theEmitter->emitIns_R_R(INS_ucvtf, EA_8BYTE, REG_V7, REG_R3, INS_OPTS_8BYTE_TO_D); + + // ucvtf vector + theEmitter->emitIns_R_R(INS_ucvtf, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ucvtf, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ucvtf, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_2D); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R floating point operations, one dest, one source + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + // fabs scalar + theEmitter->emitIns_R_R(INS_fabs, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_fabs, EA_8BYTE, REG_V2, REG_V3); + + // fabs vector + theEmitter->emitIns_R_R(INS_fabs, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_fabs, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_fabs, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_2D); + + // fneg scalar + theEmitter->emitIns_R_R(INS_fneg, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_fneg, EA_8BYTE, REG_V2, REG_V3); + + // fneg vector + theEmitter->emitIns_R_R(INS_fneg, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_fneg, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_fneg, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_2D); + + // fsqrt scalar + theEmitter->emitIns_R_R(INS_fsqrt, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_fsqrt, EA_8BYTE, REG_V2, REG_V3); + + // fsqrt vector + theEmitter->emitIns_R_R(INS_fsqrt, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_fsqrt, 
EA_16BYTE, REG_V6, REG_V7, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_fsqrt, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_2D); + + genDefineTempLabel(genCreateTempLabel()); + + // abs scalar + theEmitter->emitIns_R_R(INS_abs, EA_8BYTE, REG_V2, REG_V3); + + // abs vector + theEmitter->emitIns_R_R(INS_abs, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_abs, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_abs, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_abs, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_abs, EA_8BYTE, REG_V12, REG_V13, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_abs, EA_16BYTE, REG_V14, REG_V15, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_abs, EA_16BYTE, REG_V16, REG_V17, INS_OPTS_2D); + + // neg scalar + theEmitter->emitIns_R_R(INS_neg, EA_8BYTE, REG_V2, REG_V3); + + // neg vector + theEmitter->emitIns_R_R(INS_neg, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_neg, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_neg, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_neg, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_neg, EA_8BYTE, REG_V12, REG_V13, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_neg, EA_16BYTE, REG_V14, REG_V15, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_neg, EA_16BYTE, REG_V16, REG_V17, INS_OPTS_2D); + + // mvn vector + theEmitter->emitIns_R_R(INS_mvn, EA_8BYTE, REG_V4, REG_V5); + theEmitter->emitIns_R_R(INS_mvn, EA_8BYTE, REG_V6, REG_V7, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_mvn, EA_16BYTE, REG_V8, REG_V9); + theEmitter->emitIns_R_R(INS_mvn, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_16B); + + // cnt vector + theEmitter->emitIns_R_R(INS_cnt, EA_8BYTE, REG_V22, REG_V23, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_cnt, EA_16BYTE, REG_V24, REG_V25, INS_OPTS_16B); + + // not vector (the same encoding as mvn) + theEmitter->emitIns_R_R(INS_not, EA_8BYTE, 
REG_V12, REG_V13); + theEmitter->emitIns_R_R(INS_not, EA_8BYTE, REG_V14, REG_V15, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_not, EA_16BYTE, REG_V16, REG_V17); + theEmitter->emitIns_R_R(INS_not, EA_16BYTE, REG_V18, REG_V19, INS_OPTS_16B); + + // cls vector + theEmitter->emitIns_R_R(INS_cls, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_cls, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_cls, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_cls, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_cls, EA_8BYTE, REG_V12, REG_V13, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_cls, EA_16BYTE, REG_V14, REG_V15, INS_OPTS_4S); + + // clz vector + theEmitter->emitIns_R_R(INS_clz, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_clz, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_clz, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_clz, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_clz, EA_8BYTE, REG_V12, REG_V13, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_clz, EA_16BYTE, REG_V14, REG_V15, INS_OPTS_4S); + + // rbit vector + theEmitter->emitIns_R_R(INS_rbit, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_rbit, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_16B); + + // rev16 vector + theEmitter->emitIns_R_R(INS_rev16, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_rev16, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_16B); + + // rev32 vector + theEmitter->emitIns_R_R(INS_rev32, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_rev32, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_rev32, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_rev32, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_8H); + + // rev64 vector + theEmitter->emitIns_R_R(INS_rev64, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_8B); + 
theEmitter->emitIns_R_R(INS_rev64, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_rev64, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_rev64, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_rev64, EA_8BYTE, REG_V12, REG_V13, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_rev64, EA_16BYTE, REG_V14, REG_V15, INS_OPTS_4S); + +#endif + + // + // R_R floating point round to int, one dest, one source + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + // frinta scalar + theEmitter->emitIns_R_R(INS_frinta, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_frinta, EA_8BYTE, REG_V2, REG_V3); + + // frinta vector + theEmitter->emitIns_R_R(INS_frinta, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_frinta, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_frinta, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_2D); + + // frinti scalar + theEmitter->emitIns_R_R(INS_frinti, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_frinti, EA_8BYTE, REG_V2, REG_V3); + + // frinti vector + theEmitter->emitIns_R_R(INS_frinti, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_frinti, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_frinti, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_2D); + + // frintm scalar + theEmitter->emitIns_R_R(INS_frintm, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_frintm, EA_8BYTE, REG_V2, REG_V3); + + // frintm vector + theEmitter->emitIns_R_R(INS_frintm, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_frintm, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_frintm, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_2D); + + // frintn scalar + theEmitter->emitIns_R_R(INS_frintn, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_frintn, EA_8BYTE, REG_V2, REG_V3); + + // frintn vector + theEmitter->emitIns_R_R(INS_frintn, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + 
theEmitter->emitIns_R_R(INS_frintn, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_frintn, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_2D); + + // frintp scalar + theEmitter->emitIns_R_R(INS_frintp, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_frintp, EA_8BYTE, REG_V2, REG_V3); + + // frintp vector + theEmitter->emitIns_R_R(INS_frintp, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_frintp, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_frintp, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_2D); + + // frintx scalar + theEmitter->emitIns_R_R(INS_frintx, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_frintx, EA_8BYTE, REG_V2, REG_V3); + + // frintx vector + theEmitter->emitIns_R_R(INS_frintx, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_frintx, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_frintx, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_2D); + + // frintz scalar + theEmitter->emitIns_R_R(INS_frintz, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_frintz, EA_8BYTE, REG_V2, REG_V3); + + // frintz vector + theEmitter->emitIns_R_R(INS_frintz, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_frintz, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_frintz, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_2D); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R_R floating point operations, one dest, two source + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + theEmitter->emitIns_R_R_R(INS_fadd, EA_4BYTE, REG_V0, REG_V1, REG_V2); // scalar 4BYTE + theEmitter->emitIns_R_R_R(INS_fadd, EA_8BYTE, REG_V3, REG_V4, REG_V5); // scalar 8BYTE + theEmitter->emitIns_R_R_R(INS_fadd, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_fadd, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_fadd, EA_16BYTE, REG_V12, REG_V13, 
REG_V14, INS_OPTS_2D); + + theEmitter->emitIns_R_R_R(INS_fsub, EA_4BYTE, REG_V0, REG_V1, REG_V2); // scalar 4BYTE + theEmitter->emitIns_R_R_R(INS_fsub, EA_8BYTE, REG_V3, REG_V4, REG_V5); // scalar 8BYTE + theEmitter->emitIns_R_R_R(INS_fsub, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_fsub, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_fsub, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_2D); + + theEmitter->emitIns_R_R_R(INS_fdiv, EA_4BYTE, REG_V0, REG_V1, REG_V2); // scalar 4BYTE + theEmitter->emitIns_R_R_R(INS_fdiv, EA_8BYTE, REG_V3, REG_V4, REG_V5); // scalar 8BYTE + theEmitter->emitIns_R_R_R(INS_fdiv, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_fdiv, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_fdiv, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_2D); + + theEmitter->emitIns_R_R_R(INS_fmax, EA_4BYTE, REG_V0, REG_V1, REG_V2); // scalar 4BYTE + theEmitter->emitIns_R_R_R(INS_fmax, EA_8BYTE, REG_V3, REG_V4, REG_V5); // scalar 8BYTE + theEmitter->emitIns_R_R_R(INS_fmax, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_fmax, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_fmax, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_2D); + + theEmitter->emitIns_R_R_R(INS_fmin, EA_4BYTE, REG_V0, REG_V1, REG_V2); // scalar 4BYTE + theEmitter->emitIns_R_R_R(INS_fmin, EA_8BYTE, REG_V3, REG_V4, REG_V5); // scalar 8BYTE + theEmitter->emitIns_R_R_R(INS_fmin, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_fmin, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_fmin, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_2D); + + // fabd + theEmitter->emitIns_R_R_R(INS_fabd, EA_4BYTE, REG_V0, REG_V1, REG_V2); // scalar 4BYTE + theEmitter->emitIns_R_R_R(INS_fabd, EA_8BYTE, REG_V3, REG_V4, REG_V5); // scalar 8BYTE + 
theEmitter->emitIns_R_R_R(INS_fabd, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_fabd, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_fabd, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_2D); + + genDefineTempLabel(genCreateTempLabel()); + + theEmitter->emitIns_R_R_R(INS_fmul, EA_4BYTE, REG_V0, REG_V1, REG_V2); // scalar 4BYTE + theEmitter->emitIns_R_R_R(INS_fmul, EA_8BYTE, REG_V3, REG_V4, REG_V5); // scalar 8BYTE + theEmitter->emitIns_R_R_R(INS_fmul, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_fmul, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_fmul, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_2D); + + theEmitter->emitIns_R_R_R_I(INS_fmul, EA_4BYTE, REG_V15, REG_V16, REG_V17, 3); // scalar by elem 4BYTE + theEmitter->emitIns_R_R_R_I(INS_fmul, EA_8BYTE, REG_V18, REG_V19, REG_V20, 1); // scalar by elem 8BYTE + theEmitter->emitIns_R_R_R_I(INS_fmul, EA_8BYTE, REG_V21, REG_V22, REG_V23, 0, INS_OPTS_2S); + theEmitter->emitIns_R_R_R_I(INS_fmul, EA_16BYTE, REG_V24, REG_V25, REG_V26, 2, INS_OPTS_4S); + theEmitter->emitIns_R_R_R_I(INS_fmul, EA_16BYTE, REG_V27, REG_V28, REG_V29, 0, INS_OPTS_2D); + + theEmitter->emitIns_R_R_R(INS_fmulx, EA_4BYTE, REG_V0, REG_V1, REG_V2); // scalar 4BYTE + theEmitter->emitIns_R_R_R(INS_fmulx, EA_8BYTE, REG_V3, REG_V4, REG_V5); // scalar 8BYTE + theEmitter->emitIns_R_R_R(INS_fmulx, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_fmulx, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_fmulx, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_2D); + + theEmitter->emitIns_R_R_R_I(INS_fmulx, EA_4BYTE, REG_V15, REG_V16, REG_V17, 3); // scalar by elem 4BYTE + theEmitter->emitIns_R_R_R_I(INS_fmulx, EA_8BYTE, REG_V18, REG_V19, REG_V20, 1); // scalar by elem 8BYTE + theEmitter->emitIns_R_R_R_I(INS_fmulx, EA_8BYTE, REG_V21, REG_V22, REG_V23, 0, 
INS_OPTS_2S); + theEmitter->emitIns_R_R_R_I(INS_fmulx, EA_16BYTE, REG_V24, REG_V25, REG_V26, 2, INS_OPTS_4S); + theEmitter->emitIns_R_R_R_I(INS_fmulx, EA_16BYTE, REG_V27, REG_V28, REG_V29, 0, INS_OPTS_2D); + + theEmitter->emitIns_R_R_R(INS_fnmul, EA_4BYTE, REG_V0, REG_V1, REG_V2); // scalar 4BYTE + theEmitter->emitIns_R_R_R(INS_fnmul, EA_8BYTE, REG_V3, REG_V4, REG_V5); // scalar 8BYTE + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R_I vector operations, one dest, one source reg, one immed + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + // 'sshr' scalar + theEmitter->emitIns_R_R_I(INS_sshr, EA_8BYTE, REG_V0, REG_V1, 1); + theEmitter->emitIns_R_R_I(INS_sshr, EA_8BYTE, REG_V2, REG_V3, 14); + theEmitter->emitIns_R_R_I(INS_sshr, EA_8BYTE, REG_V4, REG_V5, 27); + theEmitter->emitIns_R_R_I(INS_sshr, EA_8BYTE, REG_V6, REG_V7, 40); + theEmitter->emitIns_R_R_I(INS_sshr, EA_8BYTE, REG_V8, REG_V9, 63); + + // 'sshr' vector + theEmitter->emitIns_R_R_I(INS_sshr, EA_8BYTE, REG_V0, REG_V1, 1, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_sshr, EA_16BYTE, REG_V2, REG_V3, 7, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_sshr, EA_8BYTE, REG_V4, REG_V5, 9, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_sshr, EA_16BYTE, REG_V6, REG_V7, 15, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_sshr, EA_8BYTE, REG_V8, REG_V9, 17, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_sshr, EA_16BYTE, REG_V10, REG_V11, 31, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_sshr, EA_16BYTE, REG_V12, REG_V13, 33, INS_OPTS_2D); + theEmitter->emitIns_R_R_I(INS_sshr, EA_16BYTE, REG_V14, REG_V15, 63, INS_OPTS_2D); + + // 'ssra' scalar + theEmitter->emitIns_R_R_I(INS_ssra, EA_8BYTE, REG_V0, REG_V1, 1); + theEmitter->emitIns_R_R_I(INS_ssra, EA_8BYTE, REG_V2, REG_V3, 14); + theEmitter->emitIns_R_R_I(INS_ssra, EA_8BYTE, REG_V4, REG_V5, 27); + theEmitter->emitIns_R_R_I(INS_ssra, EA_8BYTE, REG_V6, REG_V7, 40); + theEmitter->emitIns_R_R_I(INS_ssra, EA_8BYTE, REG_V8, 
REG_V9, 63); + + // 'ssra' vector + theEmitter->emitIns_R_R_I(INS_ssra, EA_8BYTE, REG_V0, REG_V1, 1, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ssra, EA_16BYTE, REG_V2, REG_V3, 7, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ssra, EA_8BYTE, REG_V4, REG_V5, 9, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ssra, EA_16BYTE, REG_V6, REG_V7, 15, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ssra, EA_8BYTE, REG_V8, REG_V9, 17, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ssra, EA_16BYTE, REG_V10, REG_V11, 31, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ssra, EA_16BYTE, REG_V12, REG_V13, 33, INS_OPTS_2D); + theEmitter->emitIns_R_R_I(INS_ssra, EA_16BYTE, REG_V14, REG_V15, 63, INS_OPTS_2D); + + // 'srshr' scalar + theEmitter->emitIns_R_R_I(INS_srshr, EA_8BYTE, REG_V0, REG_V1, 1); + theEmitter->emitIns_R_R_I(INS_srshr, EA_8BYTE, REG_V2, REG_V3, 14); + theEmitter->emitIns_R_R_I(INS_srshr, EA_8BYTE, REG_V4, REG_V5, 27); + theEmitter->emitIns_R_R_I(INS_srshr, EA_8BYTE, REG_V6, REG_V7, 40); + theEmitter->emitIns_R_R_I(INS_srshr, EA_8BYTE, REG_V8, REG_V9, 63); + + // 'srshr' vector + theEmitter->emitIns_R_R_I(INS_srshr, EA_8BYTE, REG_V0, REG_V1, 1, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_srshr, EA_16BYTE, REG_V2, REG_V3, 7, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_srshr, EA_8BYTE, REG_V4, REG_V5, 9, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_srshr, EA_16BYTE, REG_V6, REG_V7, 15, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_srshr, EA_8BYTE, REG_V8, REG_V9, 17, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_srshr, EA_16BYTE, REG_V10, REG_V11, 31, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_srshr, EA_16BYTE, REG_V12, REG_V13, 33, INS_OPTS_2D); + theEmitter->emitIns_R_R_I(INS_srshr, EA_16BYTE, REG_V14, REG_V15, 63, INS_OPTS_2D); + + // 'srsra' scalar + theEmitter->emitIns_R_R_I(INS_srsra, EA_8BYTE, REG_V0, REG_V1, 1); + theEmitter->emitIns_R_R_I(INS_srsra, EA_8BYTE, REG_V2, REG_V3, 14); + theEmitter->emitIns_R_R_I(INS_srsra, EA_8BYTE, REG_V4, REG_V5, 27); + 
theEmitter->emitIns_R_R_I(INS_srsra, EA_8BYTE, REG_V6, REG_V7, 40); + theEmitter->emitIns_R_R_I(INS_srsra, EA_8BYTE, REG_V8, REG_V9, 63); + + // 'srsra' vector + theEmitter->emitIns_R_R_I(INS_srsra, EA_8BYTE, REG_V0, REG_V1, 1, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_srsra, EA_16BYTE, REG_V2, REG_V3, 7, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_srsra, EA_8BYTE, REG_V4, REG_V5, 9, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_srsra, EA_16BYTE, REG_V6, REG_V7, 15, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_srsra, EA_8BYTE, REG_V8, REG_V9, 17, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_srsra, EA_16BYTE, REG_V10, REG_V11, 31, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_srsra, EA_16BYTE, REG_V12, REG_V13, 33, INS_OPTS_2D); + theEmitter->emitIns_R_R_I(INS_srsra, EA_16BYTE, REG_V14, REG_V15, 63, INS_OPTS_2D); + + // 'shl' scalar + theEmitter->emitIns_R_R_I(INS_shl, EA_8BYTE, REG_V0, REG_V1, 1); + theEmitter->emitIns_R_R_I(INS_shl, EA_8BYTE, REG_V2, REG_V3, 14); + theEmitter->emitIns_R_R_I(INS_shl, EA_8BYTE, REG_V4, REG_V5, 27); + theEmitter->emitIns_R_R_I(INS_shl, EA_8BYTE, REG_V6, REG_V7, 40); + theEmitter->emitIns_R_R_I(INS_shl, EA_8BYTE, REG_V8, REG_V9, 63); + + // 'shl' vector + theEmitter->emitIns_R_R_I(INS_shl, EA_8BYTE, REG_V0, REG_V1, 1, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_shl, EA_16BYTE, REG_V2, REG_V3, 7, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_shl, EA_8BYTE, REG_V4, REG_V5, 9, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_shl, EA_16BYTE, REG_V6, REG_V7, 15, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_shl, EA_8BYTE, REG_V8, REG_V9, 17, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_shl, EA_16BYTE, REG_V10, REG_V11, 31, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_shl, EA_16BYTE, REG_V12, REG_V13, 33, INS_OPTS_2D); + theEmitter->emitIns_R_R_I(INS_shl, EA_16BYTE, REG_V14, REG_V15, 63, INS_OPTS_2D); + + // 'ushr' scalar + theEmitter->emitIns_R_R_I(INS_ushr, EA_8BYTE, REG_V0, REG_V1, 1); + theEmitter->emitIns_R_R_I(INS_ushr, 
EA_8BYTE, REG_V2, REG_V3, 14); + theEmitter->emitIns_R_R_I(INS_ushr, EA_8BYTE, REG_V4, REG_V5, 27); + theEmitter->emitIns_R_R_I(INS_ushr, EA_8BYTE, REG_V6, REG_V7, 40); + theEmitter->emitIns_R_R_I(INS_ushr, EA_8BYTE, REG_V8, REG_V9, 63); + + // 'ushr' vector + theEmitter->emitIns_R_R_I(INS_ushr, EA_8BYTE, REG_V0, REG_V1, 1, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ushr, EA_16BYTE, REG_V2, REG_V3, 7, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ushr, EA_8BYTE, REG_V4, REG_V5, 9, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ushr, EA_16BYTE, REG_V6, REG_V7, 15, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ushr, EA_8BYTE, REG_V8, REG_V9, 17, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ushr, EA_16BYTE, REG_V10, REG_V11, 31, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ushr, EA_16BYTE, REG_V12, REG_V13, 33, INS_OPTS_2D); + theEmitter->emitIns_R_R_I(INS_ushr, EA_16BYTE, REG_V14, REG_V15, 63, INS_OPTS_2D); + + // 'usra' scalar + theEmitter->emitIns_R_R_I(INS_usra, EA_8BYTE, REG_V0, REG_V1, 1); + theEmitter->emitIns_R_R_I(INS_usra, EA_8BYTE, REG_V2, REG_V3, 14); + theEmitter->emitIns_R_R_I(INS_usra, EA_8BYTE, REG_V4, REG_V5, 27); + theEmitter->emitIns_R_R_I(INS_usra, EA_8BYTE, REG_V6, REG_V7, 40); + theEmitter->emitIns_R_R_I(INS_usra, EA_8BYTE, REG_V8, REG_V9, 63); + + // 'usra' vector + theEmitter->emitIns_R_R_I(INS_usra, EA_8BYTE, REG_V0, REG_V1, 1, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_usra, EA_16BYTE, REG_V2, REG_V3, 7, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_usra, EA_8BYTE, REG_V4, REG_V5, 9, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_usra, EA_16BYTE, REG_V6, REG_V7, 15, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_usra, EA_8BYTE, REG_V8, REG_V9, 17, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_usra, EA_16BYTE, REG_V10, REG_V11, 31, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_usra, EA_16BYTE, REG_V12, REG_V13, 33, INS_OPTS_2D); + theEmitter->emitIns_R_R_I(INS_usra, EA_16BYTE, REG_V14, REG_V15, 63, INS_OPTS_2D); + + // 'urshr' scalar + 
theEmitter->emitIns_R_R_I(INS_urshr, EA_8BYTE, REG_V0, REG_V1, 1); + theEmitter->emitIns_R_R_I(INS_urshr, EA_8BYTE, REG_V2, REG_V3, 14); + theEmitter->emitIns_R_R_I(INS_urshr, EA_8BYTE, REG_V4, REG_V5, 27); + theEmitter->emitIns_R_R_I(INS_urshr, EA_8BYTE, REG_V6, REG_V7, 40); + theEmitter->emitIns_R_R_I(INS_urshr, EA_8BYTE, REG_V8, REG_V9, 63); + + // 'urshr' vector + theEmitter->emitIns_R_R_I(INS_urshr, EA_8BYTE, REG_V0, REG_V1, 1, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_urshr, EA_16BYTE, REG_V2, REG_V3, 7, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_urshr, EA_8BYTE, REG_V4, REG_V5, 9, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_urshr, EA_16BYTE, REG_V6, REG_V7, 15, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_urshr, EA_8BYTE, REG_V8, REG_V9, 17, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_urshr, EA_16BYTE, REG_V10, REG_V11, 31, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_urshr, EA_16BYTE, REG_V12, REG_V13, 33, INS_OPTS_2D); + theEmitter->emitIns_R_R_I(INS_urshr, EA_16BYTE, REG_V14, REG_V15, 63, INS_OPTS_2D); + + // 'ursra' scalar + theEmitter->emitIns_R_R_I(INS_ursra, EA_8BYTE, REG_V0, REG_V1, 1); + theEmitter->emitIns_R_R_I(INS_ursra, EA_8BYTE, REG_V2, REG_V3, 14); + theEmitter->emitIns_R_R_I(INS_ursra, EA_8BYTE, REG_V4, REG_V5, 27); + theEmitter->emitIns_R_R_I(INS_ursra, EA_8BYTE, REG_V6, REG_V7, 40); + theEmitter->emitIns_R_R_I(INS_ursra, EA_8BYTE, REG_V8, REG_V9, 63); + + // 'srsra' vector + theEmitter->emitIns_R_R_I(INS_ursra, EA_8BYTE, REG_V0, REG_V1, 1, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ursra, EA_16BYTE, REG_V2, REG_V3, 7, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ursra, EA_8BYTE, REG_V4, REG_V5, 9, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ursra, EA_16BYTE, REG_V6, REG_V7, 15, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ursra, EA_8BYTE, REG_V8, REG_V9, 17, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ursra, EA_16BYTE, REG_V10, REG_V11, 31, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ursra, EA_16BYTE, REG_V12, 
REG_V13, 33, INS_OPTS_2D); + theEmitter->emitIns_R_R_I(INS_ursra, EA_16BYTE, REG_V14, REG_V15, 63, INS_OPTS_2D); + + // 'sri' scalar + theEmitter->emitIns_R_R_I(INS_sri, EA_8BYTE, REG_V0, REG_V1, 1); + theEmitter->emitIns_R_R_I(INS_sri, EA_8BYTE, REG_V2, REG_V3, 14); + theEmitter->emitIns_R_R_I(INS_sri, EA_8BYTE, REG_V4, REG_V5, 27); + theEmitter->emitIns_R_R_I(INS_sri, EA_8BYTE, REG_V6, REG_V7, 40); + theEmitter->emitIns_R_R_I(INS_sri, EA_8BYTE, REG_V8, REG_V9, 63); + + // 'sri' vector + theEmitter->emitIns_R_R_I(INS_sri, EA_8BYTE, REG_V0, REG_V1, 1, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_sri, EA_16BYTE, REG_V2, REG_V3, 7, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_sri, EA_8BYTE, REG_V4, REG_V5, 9, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_sri, EA_16BYTE, REG_V6, REG_V7, 15, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_sri, EA_8BYTE, REG_V8, REG_V9, 17, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_sri, EA_16BYTE, REG_V10, REG_V11, 31, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_sri, EA_16BYTE, REG_V12, REG_V13, 33, INS_OPTS_2D); + theEmitter->emitIns_R_R_I(INS_sri, EA_16BYTE, REG_V14, REG_V15, 63, INS_OPTS_2D); + + // 'sli' scalar + theEmitter->emitIns_R_R_I(INS_sli, EA_8BYTE, REG_V0, REG_V1, 1); + theEmitter->emitIns_R_R_I(INS_sli, EA_8BYTE, REG_V2, REG_V3, 14); + theEmitter->emitIns_R_R_I(INS_sli, EA_8BYTE, REG_V4, REG_V5, 27); + theEmitter->emitIns_R_R_I(INS_sli, EA_8BYTE, REG_V6, REG_V7, 40); + theEmitter->emitIns_R_R_I(INS_sli, EA_8BYTE, REG_V8, REG_V9, 63); + + // 'sli' vector + theEmitter->emitIns_R_R_I(INS_sli, EA_8BYTE, REG_V0, REG_V1, 1, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_sli, EA_16BYTE, REG_V2, REG_V3, 7, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_sli, EA_8BYTE, REG_V4, REG_V5, 9, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_sli, EA_16BYTE, REG_V6, REG_V7, 15, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_sli, EA_8BYTE, REG_V8, REG_V9, 17, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_sli, EA_16BYTE, REG_V10, REG_V11, 
31, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_sli, EA_16BYTE, REG_V12, REG_V13, 33, INS_OPTS_2D); + theEmitter->emitIns_R_R_I(INS_sli, EA_16BYTE, REG_V14, REG_V15, 63, INS_OPTS_2D); + + // 'sshll' vector + theEmitter->emitIns_R_R_I(INS_sshll, EA_8BYTE, REG_V0, REG_V1, 1, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_sshll2, EA_16BYTE, REG_V2, REG_V3, 7, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_sshll, EA_8BYTE, REG_V4, REG_V5, 9, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_sshll2, EA_16BYTE, REG_V6, REG_V7, 15, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_sshll, EA_8BYTE, REG_V8, REG_V9, 17, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_sshll2, EA_16BYTE, REG_V10, REG_V11, 31, INS_OPTS_4S); + + // 'ushll' vector + theEmitter->emitIns_R_R_I(INS_ushll, EA_8BYTE, REG_V0, REG_V1, 1, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ushll2, EA_16BYTE, REG_V2, REG_V3, 7, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ushll, EA_8BYTE, REG_V4, REG_V5, 9, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ushll2, EA_16BYTE, REG_V6, REG_V7, 15, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ushll, EA_8BYTE, REG_V8, REG_V9, 17, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ushll2, EA_16BYTE, REG_V10, REG_V11, 31, INS_OPTS_4S); + + // 'shrn' vector + theEmitter->emitIns_R_R_I(INS_shrn, EA_8BYTE, REG_V0, REG_V1, 1, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_shrn2, EA_16BYTE, REG_V2, REG_V3, 7, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_shrn, EA_8BYTE, REG_V4, REG_V5, 9, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_shrn2, EA_16BYTE, REG_V6, REG_V7, 15, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_shrn, EA_8BYTE, REG_V8, REG_V9, 17, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_shrn2, EA_16BYTE, REG_V10, REG_V11, 31, INS_OPTS_4S); + + // 'rshrn' vector + theEmitter->emitIns_R_R_I(INS_rshrn, EA_8BYTE, REG_V0, REG_V1, 1, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_rshrn2, EA_16BYTE, REG_V2, REG_V3, 7, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_rshrn, EA_8BYTE, 
REG_V4, REG_V5, 9, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_rshrn2, EA_16BYTE, REG_V6, REG_V7, 15, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_rshrn, EA_8BYTE, REG_V8, REG_V9, 17, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_rshrn2, EA_16BYTE, REG_V10, REG_V11, 31, INS_OPTS_4S); + + // 'sxtl' vector + theEmitter->emitIns_R_R(INS_sxtl, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_sxtl2, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_sxtl, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_sxtl2, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_sxtl, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_sxtl2, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + + // 'uxtl' vector + theEmitter->emitIns_R_R(INS_uxtl, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_uxtl2, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_uxtl, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_uxtl2, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_uxtl, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_uxtl2, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R_R vector operations, one dest, two source + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + // Specifying an Arrangement is optional + // + theEmitter->emitIns_R_R_R(INS_and, EA_8BYTE, REG_V6, REG_V7, REG_V8); + theEmitter->emitIns_R_R_R(INS_bic, EA_8BYTE, REG_V9, REG_V10, REG_V11); + theEmitter->emitIns_R_R_R(INS_eor, EA_8BYTE, REG_V12, REG_V13, REG_V14); + theEmitter->emitIns_R_R_R(INS_orr, EA_8BYTE, REG_V15, REG_V16, REG_V17); + theEmitter->emitIns_R_R_R(INS_orn, EA_8BYTE, REG_V18, REG_V19, REG_V20); + theEmitter->emitIns_R_R_R(INS_and, EA_16BYTE, REG_V21, REG_V22, REG_V23); + theEmitter->emitIns_R_R_R(INS_bic, EA_16BYTE, 
REG_V24, REG_V25, REG_V26); + theEmitter->emitIns_R_R_R(INS_eor, EA_16BYTE, REG_V27, REG_V28, REG_V29); + theEmitter->emitIns_R_R_R(INS_orr, EA_16BYTE, REG_V30, REG_V31, REG_V0); + theEmitter->emitIns_R_R_R(INS_orn, EA_16BYTE, REG_V1, REG_V2, REG_V3); + + theEmitter->emitIns_R_R_R(INS_bsl, EA_8BYTE, REG_V4, REG_V5, REG_V6); + theEmitter->emitIns_R_R_R(INS_bit, EA_8BYTE, REG_V7, REG_V8, REG_V9); + theEmitter->emitIns_R_R_R(INS_bif, EA_8BYTE, REG_V10, REG_V11, REG_V12); + theEmitter->emitIns_R_R_R(INS_bsl, EA_16BYTE, REG_V13, REG_V14, REG_V15); + theEmitter->emitIns_R_R_R(INS_bit, EA_16BYTE, REG_V16, REG_V17, REG_V18); + theEmitter->emitIns_R_R_R(INS_bif, EA_16BYTE, REG_V19, REG_V20, REG_V21); + + // Default Arrangement as per the ARM64 manual + // + theEmitter->emitIns_R_R_R(INS_and, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_bic, EA_8BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_eor, EA_8BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_orr, EA_8BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_orn, EA_8BYTE, REG_V18, REG_V19, REG_V20, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_and, EA_16BYTE, REG_V21, REG_V22, REG_V23, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_bic, EA_16BYTE, REG_V24, REG_V25, REG_V26, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_eor, EA_16BYTE, REG_V27, REG_V28, REG_V29, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_orr, EA_16BYTE, REG_V30, REG_V31, REG_V0, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_orn, EA_16BYTE, REG_V1, REG_V2, REG_V3, INS_OPTS_16B); + + theEmitter->emitIns_R_R_R(INS_bsl, EA_8BYTE, REG_V4, REG_V5, REG_V6, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_bit, EA_8BYTE, REG_V7, REG_V8, REG_V9, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_bif, EA_8BYTE, REG_V10, REG_V11, REG_V12, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_bsl, EA_16BYTE, REG_V13, REG_V14, REG_V15, INS_OPTS_16B); + 
theEmitter->emitIns_R_R_R(INS_bit, EA_16BYTE, REG_V16, REG_V17, REG_V18, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_bif, EA_16BYTE, REG_V19, REG_V20, REG_V21, INS_OPTS_16B); + + genDefineTempLabel(genCreateTempLabel()); + + theEmitter->emitIns_R_R_R(INS_add, EA_8BYTE, REG_V0, REG_V1, REG_V2); // scalar 8BYTE + theEmitter->emitIns_R_R_R(INS_add, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_add, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_add, EA_8BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_add, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_add, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_add, EA_16BYTE, REG_V18, REG_V19, REG_V20, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_add, EA_16BYTE, REG_V21, REG_V22, REG_V23, INS_OPTS_2D); + + theEmitter->emitIns_R_R_R(INS_sub, EA_8BYTE, REG_V1, REG_V2, REG_V3); // scalar 8BYTE + theEmitter->emitIns_R_R_R(INS_sub, EA_8BYTE, REG_V4, REG_V5, REG_V6, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_sub, EA_8BYTE, REG_V7, REG_V8, REG_V9, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_sub, EA_8BYTE, REG_V10, REG_V11, REG_V12, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_sub, EA_16BYTE, REG_V13, REG_V14, REG_V15, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_sub, EA_16BYTE, REG_V16, REG_V17, REG_V18, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_sub, EA_16BYTE, REG_V19, REG_V20, REG_V21, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_sub, EA_16BYTE, REG_V22, REG_V23, REG_V24, INS_OPTS_2D); + + genDefineTempLabel(genCreateTempLabel()); + + // saba vector + theEmitter->emitIns_R_R_R(INS_saba, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_saba, EA_16BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_saba, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_saba, 
EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_saba, EA_8BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_saba, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // sabd vector + theEmitter->emitIns_R_R_R(INS_sabd, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_sabd, EA_16BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_sabd, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_sabd, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_sabd, EA_8BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_sabd, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // uaba vector + theEmitter->emitIns_R_R_R(INS_uaba, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_uaba, EA_16BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_uaba, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_uaba, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_uaba, EA_8BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_uaba, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // uabd vector + theEmitter->emitIns_R_R_R(INS_uabd, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_uabd, EA_16BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_uabd, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_uabd, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_uabd, EA_8BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_uabd, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R_R vector multiply + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + 
genDefineTempLabel(genCreateTempLabel()); + + theEmitter->emitIns_R_R_R(INS_mul, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_mul, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_mul, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_mul, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_mul, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_mul, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + theEmitter->emitIns_R_R_R(INS_pmul, EA_8BYTE, REG_V18, REG_V19, REG_V20, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_pmul, EA_16BYTE, REG_V21, REG_V22, REG_V23, INS_OPTS_16B); + + // 'mul' vector by elem + theEmitter->emitIns_R_R_R_I(INS_mul, EA_8BYTE, REG_V0, REG_V1, REG_V16, 0, INS_OPTS_2S); + theEmitter->emitIns_R_R_R_I(INS_mul, EA_8BYTE, REG_V2, REG_V3, REG_V15, 1, INS_OPTS_2S); + theEmitter->emitIns_R_R_R_I(INS_mul, EA_8BYTE, REG_V4, REG_V5, REG_V17, 3, INS_OPTS_2S); + theEmitter->emitIns_R_R_R_I(INS_mul, EA_8BYTE, REG_V6, REG_V7, REG_V0, 0, INS_OPTS_4H); + theEmitter->emitIns_R_R_R_I(INS_mul, EA_8BYTE, REG_V8, REG_V9, REG_V1, 3, INS_OPTS_4H); + theEmitter->emitIns_R_R_R_I(INS_mul, EA_8BYTE, REG_V10, REG_V11, REG_V2, 7, INS_OPTS_4H); + theEmitter->emitIns_R_R_R_I(INS_mul, EA_16BYTE, REG_V12, REG_V13, REG_V14, 0, INS_OPTS_4S); + theEmitter->emitIns_R_R_R_I(INS_mul, EA_16BYTE, REG_V14, REG_V15, REG_V18, 1, INS_OPTS_4S); + theEmitter->emitIns_R_R_R_I(INS_mul, EA_16BYTE, REG_V16, REG_V17, REG_V13, 3, INS_OPTS_4S); + theEmitter->emitIns_R_R_R_I(INS_mul, EA_16BYTE, REG_V18, REG_V19, REG_V3, 0, INS_OPTS_8H); + theEmitter->emitIns_R_R_R_I(INS_mul, EA_16BYTE, REG_V20, REG_V21, REG_V4, 3, INS_OPTS_8H); + theEmitter->emitIns_R_R_R_I(INS_mul, EA_16BYTE, REG_V22, REG_V23, REG_V5, 7, INS_OPTS_8H); + + // 'mla' vector by elem + theEmitter->emitIns_R_R_R_I(INS_mla, EA_8BYTE, REG_V0, REG_V1, REG_V16, 0, 
INS_OPTS_2S); + theEmitter->emitIns_R_R_R_I(INS_mla, EA_8BYTE, REG_V2, REG_V3, REG_V15, 1, INS_OPTS_2S); + theEmitter->emitIns_R_R_R_I(INS_mla, EA_8BYTE, REG_V4, REG_V5, REG_V17, 3, INS_OPTS_2S); + theEmitter->emitIns_R_R_R_I(INS_mla, EA_8BYTE, REG_V6, REG_V7, REG_V0, 0, INS_OPTS_4H); + theEmitter->emitIns_R_R_R_I(INS_mla, EA_8BYTE, REG_V8, REG_V9, REG_V1, 3, INS_OPTS_4H); + theEmitter->emitIns_R_R_R_I(INS_mla, EA_8BYTE, REG_V10, REG_V11, REG_V2, 7, INS_OPTS_4H); + theEmitter->emitIns_R_R_R_I(INS_mla, EA_16BYTE, REG_V12, REG_V13, REG_V14, 0, INS_OPTS_4S); + theEmitter->emitIns_R_R_R_I(INS_mla, EA_16BYTE, REG_V14, REG_V15, REG_V18, 1, INS_OPTS_4S); + theEmitter->emitIns_R_R_R_I(INS_mla, EA_16BYTE, REG_V16, REG_V17, REG_V13, 3, INS_OPTS_4S); + theEmitter->emitIns_R_R_R_I(INS_mla, EA_16BYTE, REG_V18, REG_V19, REG_V3, 0, INS_OPTS_8H); + theEmitter->emitIns_R_R_R_I(INS_mla, EA_16BYTE, REG_V20, REG_V21, REG_V4, 3, INS_OPTS_8H); + theEmitter->emitIns_R_R_R_I(INS_mla, EA_16BYTE, REG_V22, REG_V23, REG_V5, 7, INS_OPTS_8H); + + // 'mls' vector by elem + theEmitter->emitIns_R_R_R_I(INS_mls, EA_8BYTE, REG_V0, REG_V1, REG_V16, 0, INS_OPTS_2S); + theEmitter->emitIns_R_R_R_I(INS_mls, EA_8BYTE, REG_V2, REG_V3, REG_V15, 1, INS_OPTS_2S); + theEmitter->emitIns_R_R_R_I(INS_mls, EA_8BYTE, REG_V4, REG_V5, REG_V17, 3, INS_OPTS_2S); + theEmitter->emitIns_R_R_R_I(INS_mls, EA_8BYTE, REG_V6, REG_V7, REG_V0, 0, INS_OPTS_4H); + theEmitter->emitIns_R_R_R_I(INS_mls, EA_8BYTE, REG_V8, REG_V9, REG_V1, 3, INS_OPTS_4H); + theEmitter->emitIns_R_R_R_I(INS_mls, EA_8BYTE, REG_V10, REG_V11, REG_V2, 7, INS_OPTS_4H); + theEmitter->emitIns_R_R_R_I(INS_mls, EA_16BYTE, REG_V12, REG_V13, REG_V14, 0, INS_OPTS_4S); + theEmitter->emitIns_R_R_R_I(INS_mls, EA_16BYTE, REG_V14, REG_V15, REG_V18, 1, INS_OPTS_4S); + theEmitter->emitIns_R_R_R_I(INS_mls, EA_16BYTE, REG_V16, REG_V17, REG_V13, 3, INS_OPTS_4S); + theEmitter->emitIns_R_R_R_I(INS_mls, EA_16BYTE, REG_V18, REG_V19, REG_V3, 0, INS_OPTS_8H); + 
theEmitter->emitIns_R_R_R_I(INS_mls, EA_16BYTE, REG_V20, REG_V21, REG_V4, 3, INS_OPTS_8H); + theEmitter->emitIns_R_R_R_I(INS_mls, EA_16BYTE, REG_V22, REG_V23, REG_V5, 7, INS_OPTS_8H); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R_R floating point operations, one source/dest, and two source + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + genDefineTempLabel(genCreateTempLabel()); + + theEmitter->emitIns_R_R_R(INS_fmla, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_fmla, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_fmla, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_2D); + + theEmitter->emitIns_R_R_R_I(INS_fmla, EA_4BYTE, REG_V15, REG_V16, REG_V17, 3); // scalar by elem 4BYTE + theEmitter->emitIns_R_R_R_I(INS_fmla, EA_8BYTE, REG_V18, REG_V19, REG_V20, 1); // scalar by elem 8BYTE + theEmitter->emitIns_R_R_R_I(INS_fmla, EA_8BYTE, REG_V21, REG_V22, REG_V23, 0, INS_OPTS_2S); + theEmitter->emitIns_R_R_R_I(INS_fmla, EA_16BYTE, REG_V24, REG_V25, REG_V26, 2, INS_OPTS_4S); + theEmitter->emitIns_R_R_R_I(INS_fmla, EA_16BYTE, REG_V27, REG_V28, REG_V29, 0, INS_OPTS_2D); + + theEmitter->emitIns_R_R_R(INS_fmls, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_fmls, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_fmls, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_2D); + + theEmitter->emitIns_R_R_R_I(INS_fmls, EA_4BYTE, REG_V15, REG_V16, REG_V17, 3); // scalar by elem 4BYTE + theEmitter->emitIns_R_R_R_I(INS_fmls, EA_8BYTE, REG_V18, REG_V19, REG_V20, 1); // scalar by elem 8BYTE + theEmitter->emitIns_R_R_R_I(INS_fmls, EA_8BYTE, REG_V21, REG_V22, REG_V23, 0, INS_OPTS_2S); + theEmitter->emitIns_R_R_R_I(INS_fmls, EA_16BYTE, REG_V24, REG_V25, REG_V26, 2, INS_OPTS_4S); + theEmitter->emitIns_R_R_R_I(INS_fmls, EA_16BYTE, REG_V27, REG_V28, REG_V29, 0, INS_OPTS_2D); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + // + // R_R_R_R floating point 
operations, one dest, and three source + // + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + theEmitter->emitIns_R_R_R_R(INS_fmadd, EA_4BYTE, REG_V0, REG_V8, REG_V16, REG_V24); + theEmitter->emitIns_R_R_R_R(INS_fmsub, EA_4BYTE, REG_V1, REG_V9, REG_V17, REG_V25); + theEmitter->emitIns_R_R_R_R(INS_fnmadd, EA_4BYTE, REG_V2, REG_V10, REG_V18, REG_V26); + theEmitter->emitIns_R_R_R_R(INS_fnmsub, EA_4BYTE, REG_V3, REG_V11, REG_V19, REG_V27); + + theEmitter->emitIns_R_R_R_R(INS_fmadd, EA_8BYTE, REG_V4, REG_V12, REG_V20, REG_V28); + theEmitter->emitIns_R_R_R_R(INS_fmsub, EA_8BYTE, REG_V5, REG_V13, REG_V21, REG_V29); + theEmitter->emitIns_R_R_R_R(INS_fnmadd, EA_8BYTE, REG_V6, REG_V14, REG_V22, REG_V30); + theEmitter->emitIns_R_R_R_R(INS_fnmsub, EA_8BYTE, REG_V7, REG_V15, REG_V23, REG_V31); + +#endif + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + + BasicBlock* label = genCreateTempLabel(); + genDefineTempLabel(label); + instGen(INS_nop); + instGen(INS_nop); + instGen(INS_nop); + instGen(INS_nop); + theEmitter->emitIns_R_L(INS_adr, EA_4BYTE_DSP_RELOC, label, REG_R0); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + + printf("*************** End of genArm64EmitterUnitTests()\n"); +} +#endif // defined(DEBUG) + +#endif // _TARGET_ARM64_ + +#endif // !LEGACY_BACKEND |