From abc02dffb79318ac48521480270aa5d87231e79f Mon Sep 17 00:00:00 2001 From: Spencer Comin Date: Tue, 25 Jun 2024 09:30:10 -0400 Subject: [PATCH 01/11] Z peephole: Remove unnecessary L(L)GFR A 32 bit load instruction followed by a zero/sign extend instruction can be replaced with an equivalent load and zero/sign extend instruction. Signed-off-by: Spencer Comin --- compiler/z/codegen/OMRPeephole.cpp | 167 +++++++++++++++++++++++++++-- compiler/z/codegen/OMRPeephole.hpp | 42 ++++++-- 2 files changed, 191 insertions(+), 18 deletions(-) diff --git a/compiler/z/codegen/OMRPeephole.cpp b/compiler/z/codegen/OMRPeephole.cpp index 6db7ba5c9d1..3f02f106e37 100644 --- a/compiler/z/codegen/OMRPeephole.cpp +++ b/compiler/z/codegen/OMRPeephole.cpp @@ -38,7 +38,7 @@ isBarrierToPeepHoleLookback(TR::Instruction* cursor) { if (cursor == NULL) return true; - + if (cursor->isLabel()) return true; @@ -198,6 +198,11 @@ OMR::Z::Peephole::performOnInstruction(TR::Instruction* cursor) performed |= performedCurrentPeephole; break; } + case TR::InstOpCode::LGFR: + { + performed |= self()->tryToRemoveRedundant32To64BitExtend(true); + break; + } case TR::InstOpCode::LHI: { performed |= self()->tryToReduceLHIToXR(); @@ -213,6 +218,11 @@ OMR::Z::Peephole::performOnInstruction(TR::Instruction* cursor) performed |= self()->tryToReduceLToLZRF(TR::InstOpCode::LLZRGF); break; } + case TR::InstOpCode::LLGFR: + { + performed |= self()->tryToRemoveRedundant32To64BitExtend(false); + break; + } case TR::InstOpCode::LR: { bool performedCurrentPeephole = false; @@ -254,7 +264,7 @@ OMR::Z::Peephole::performOnInstruction(TR::Instruction* cursor) if (!performedCurrentPeephole) performedCurrentPeephole |= self()->tryToRemoveDuplicateLoadRegister(); - + performed |= performedCurrentPeephole; break; } @@ -358,7 +368,7 @@ OMR::Z::Peephole::tryLoadStoreReduction(TR::InstOpCode::Mnemonic storeOpCode, ui return false; } - if (performTransformation(self()->comp(), "O^O S390 PEEPHOLE: Transforming load-store sequence 
at %p to MVC.", storeInst)) + if (performTransformation(self()->comp(), "O^O S390 PEEPHOLE: Transforming load-store sequence at %p to MVC.\n", storeInst)) { TR::DebugCounter::incStaticDebugCounter(self()->comp(), "z/peephole/load-store"); @@ -942,7 +952,7 @@ OMR::Z::Peephole::tryToReduceAGI() { if (performTransformation(self()->comp(), "O^O S390 PEEPHOLE: AGI LA reduction on [%p] from source load [%p].\n", current, cursor)) { - auto laInst = generateRXInstruction(self()->cg(), TR::InstOpCode::LA, cursor->getNode(), lgrTargetReg, + auto laInst = generateRXInstruction(self()->cg(), TR::InstOpCode::LA, cursor->getNode(), lgrTargetReg, generateS390MemoryReference(lgrSourceReg, 0, self()->cg()), cursor->getPrev()); self()->cg()->replaceInst(cursor, laInst); @@ -1328,7 +1338,7 @@ OMR::Z::Peephole::tryToReduceLLCToLLGC() memRef->resetMemRefUsedBefore(); auto llgcInst = generateRXInstruction(self()->cg(), TR::InstOpCode::LLGC, cursor->getNode(), llcTgtReg, memRef, cursor->getPrev()); self()->cg()->replaceInst(cursor, llgcInst); - + return true; } } @@ -1419,7 +1429,7 @@ OMR::Z::Peephole::tryToReduceLTRToCHI() TR::InstOpCode lgrOpCode = cursor->getOpCode(); if (lgrTargetReg == lgrSourceReg && - (lgrOpCode.getOpCodeValue() == TR::InstOpCode::LTR || + (lgrOpCode.getOpCodeValue() == TR::InstOpCode::LTR || lgrOpCode.getOpCodeValue() == TR::InstOpCode::LTGR)) { if (seekRegInFutureMemRef(cursor, 4, lgrTargetReg)) @@ -1528,7 +1538,7 @@ OMR::Z::Peephole::tryToRemoveDuplicateLoadRegister() windowSize = 0; setCC = setCC || current->getOpCode().setsCC(); useCC = useCC || current->getOpCode().readsCC(); - + rrInst->remove(); continue; @@ -1740,7 +1750,7 @@ OMR::Z::Peephole::tryToRemoveRedundantLA() if (performTransformation(self()->comp(), "O^O S390 PEEPHOLE: Removing redundant LA [%p].\n", cursor)) { cursor->remove(); - + return true; } } @@ -1828,7 +1838,7 @@ OMR::Z::Peephole::tryToRemoveRedundantLTR() TR::Register *lgrSourceReg = cursor->getRegisterOperand(2); TR::Register 
*lgrTargetReg = cursor->getRegisterOperand(1); - + if (lgrTargetReg == lgrSourceReg) { TR::Instruction *prevInst = cursor->getPrev(); @@ -1861,3 +1871,142 @@ OMR::Z::Peephole::tryToRemoveRedundantLTR() return false; } + +bool +OMR::Z::Peephole::tryToRemoveRedundant32To64BitExtend(bool isSigned) + { + static const bool disableRemoveExtend = feGetEnv("TR_DisableRemoveRedundant32to64Extend") != NULL; + if (disableRemoveExtend) + { + return false; + } + + int32_t windowSize = 0; + const int32_t maxWindowSize = 10; + + const char *lgfrMnemonicName = isSigned ? "LGFR" : "LLGFR"; + TR::Compilation *comp = self()->comp(); + TR::Instruction *lgfr = cursor; + TR::Register *lgfrReg = lgfr->getRegisterOperand(1); + + if (lgfrReg != lgfr->getRegisterOperand(2)) + return false; + + TR::Instruction *current = lgfr->getPrev(); + + while ((current != NULL) && + !isBarrierToPeepHoleLookback(current) && + windowSize < maxWindowSize) + { + TR::InstOpCode::Mnemonic curOpMnemonic = current->getOpCode().getMnemonic(); + + if (current->getNumRegisterOperands() > 0 && lgfrReg == current->getRegisterOperand(1)) + { + TR::MemoryReference *mr = NULL; + TR::Instruction *replacement = NULL; + switch (curOpMnemonic) + { + case TR::InstOpCode::L: + if (performTransformation(comp, "O^O S390 PEEPHOLE: Merging L [%p] and %s [%p] into %s.\n", + current, lgfrMnemonicName, lgfr, isSigned ? "LGF" : "LLGF")) + { + mr = current->getMemoryReference(); + mr->resetMemRefUsedBefore(); + replacement = generateRXInstruction(self()->cg(), isSigned ? 
TR::InstOpCode::LGF : TR::InstOpCode::LLGF, current->getNode(), lgfrReg, mr, current->getPrev()); + } + break; + case TR::InstOpCode::LH: + if (isSigned && performTransformation(comp, "O^O S390 PEEPHOLE: Merging LH [%p] and LGFR [%p] into LGH.\n", current, lgfr)) + { + mr = current->getMemoryReference(); + mr->resetMemRefUsedBefore(); + replacement = generateRXInstruction(self()->cg(), TR::InstOpCode::LGH, current->getNode(), lgfrReg, mr, current->getPrev()); + } + break; + case TR::InstOpCode::LLH: + if (performTransformation(comp, "O^O S390 PEEPHOLE: Merging LLH [%p] and %s [%p] into LLGH.\n", current, lgfrMnemonicName, lgfr)) + { + mr = current->getMemoryReference(); + mr->resetMemRefUsedBefore(); + replacement = generateRXInstruction(self()->cg(), TR::InstOpCode::LLGH, current->getNode(), lgfrReg, mr, current->getPrev()); + } + break; + case TR::InstOpCode::LB: + if (isSigned && performTransformation(comp, "O^O S390 PEEPHOLE: Merging LB [%p] and LGFR [%p] into LGB.\n", current, lgfr)) + { + mr = current->getMemoryReference(); + mr->resetMemRefUsedBefore(); + replacement = generateRXInstruction(self()->cg(), TR::InstOpCode::LGB, current->getNode(), lgfrReg, mr, current->getPrev()); + } + break; + case TR::InstOpCode::LLC: + if (performTransformation(comp, "O^O S390 PEEPHOLE: Merging LLC [%p] and %s [%p] into LLGC.\n", current, lgfrMnemonicName, lgfr)) + { + mr = current->getMemoryReference(); + mr->resetMemRefUsedBefore(); + replacement = generateRXInstruction(self()->cg(), TR::InstOpCode::LLGC, current->getNode(), lgfrReg, mr, current->getPrev()); + } + break; + + case TR::InstOpCode::XR: + // The following sequence of instructions + // XR GPR1, GPR1 ; Zero out bottom 32 bits of GPR1 + // LGFR/LLGFR GPR1, GPR1 ; Extend those zeros to all 64 bits of GPR1 + // Can be converted to + // XGR GPR1, GPR1 ; Zero out all 64 bits of GPR1 + if (lgfrReg == current->getRegisterOperand(2) && + performTransformation(comp, "O^O S390 PEEPHOLE: Merging XR [%p] and %s [%p] into 
XGR.\n", current, lgfrMnemonicName, lgfr)) + replacement = generateRRInstruction(self()->cg(), TR::InstOpCode::XGR, current->getNode(), lgfrReg, lgfrReg, current->getPrev()); + break; + case TR::InstOpCode::IILF: + if (performTransformation(comp, "O^O S390 PEEPHOLE: Merging IILF [%p] and %s [%p] into %s.\n", current, lgfrMnemonicName, lgfr, isSigned ? "LGFI" : "LLILF")) + replacement = generateRILInstruction(self()->cg(), isSigned ? TR::InstOpCode::LGFI : TR::InstOpCode::LLILF, current->getNode(), lgfrReg, toS390RILInstruction(current)->getSourceImmediate(), current->getPrev()); + break; + case TR::InstOpCode::LHI: + if (isSigned && performTransformation(comp, "O^O S390 PEEPHOLE: Merging LHI [%p] and LGFR [%p] into LGH.\n", current, lgfr)) + { + replacement = generateRIInstruction(self()->cg(), TR::InstOpCode::LGHI, current->getNode(), lgfrReg, toS390RIInstruction(current)->getSourceImmediate(), current->getPrev()); + } + else if (performTransformation(comp, "O^O S390 PEEPHOLE: Merging LHI [%p] and LLGFR [%p] into LLILF.\n", current, lgfr)) + { + // The following sequence of instructions: + // LHI GPR1, IMM ; sign extend IMM from 16 to 32 bits + // LLGFR GPR1, GPR1 ; zero extend from 32 to 64 bits + // Can be converted to + // LLILF GPR1, IMM' ; where IMM' is IMM sign extended from 16 to 32 bits + int16_t imm = toS390RIInstruction(current)->getSourceImmediate(); + replacement = generateRILInstruction(self()->cg(), TR::InstOpCode::LLILF, current->getNode(), lgfrReg, static_cast(imm), current->getPrev()); + } + break; + + case TR::InstOpCode::LR: + case TR::InstOpCode::LGR: + replacement = generateRRInstruction(self()->cg(), isSigned ? 
TR::InstOpCode::LGFR : TR::InstOpCode::LLGFR, current->getNode(), lgfrReg, current->getRegisterOperand(2), current->getPrev()); + break; + } + + if (replacement != NULL) + { + TR::DebugCounter::incStaticDebugCounter(comp, + TR::DebugCounter::debugCounterName(comp, "z/peephole/redundant32To64BitExtend/%s/%s/%s/(%s)", + current->getOpCode().getMnemonicName(), + lgfr->getOpCode().getMnemonicName(), + replacement->getOpCode().getMnemonicName(), + comp->signature())); + self()->cg()->replaceInst(current, replacement); + lgfr->remove(); + return true; + } + } + + // Ensure the extend acts on the correct register values + if (current->isDefRegister(lgfrReg)) + break; + + current = current->getPrev(); + + windowSize++; + } + + return false; + } diff --git a/compiler/z/codegen/OMRPeephole.hpp b/compiler/z/codegen/OMRPeephole.hpp index 0b41ad0f9e6..4e40d0be44e 100644 --- a/compiler/z/codegen/OMRPeephole.hpp +++ b/compiler/z/codegen/OMRPeephole.hpp @@ -70,7 +70,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole * true if the reduction was successful; false otherwise. */ bool tryLoadStoreReduction(TR::InstOpCode::Mnemonic storeOpCode, uint16_t size); - + /** \brief * Tries to fold a load register instruction (\c LR or \c LGR) into a subsequent three-operand instruction if * possible. For example: @@ -92,7 +92,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole * true if the reduction was successful; false otherwise. */ bool tryToFoldLoadRegisterIntoSubsequentInstruction(); - + /** \brief * Tries to forward a branch target if the branch instruction transfers control to another unconditional * branch instruction (i.e. a trampoline). For example: @@ -170,7 +170,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole * true if the reduction was successful; false otherwise. 
*/ bool tryToReduceAGI(); - + /** \brief * Tries to reduce a compare logical (\c CLR) insturction followed by a branch to a compare and branch * instruction (\c CLRJ) For example: @@ -190,7 +190,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole * true if the reduction was successful; false otherwise. */ bool tryToReduceCLRToCLRJ(); - + /** \brief * Tries to reduce a simple branch conditional load of an immediate to a load immediate on condition branch- * less sequence. For example: @@ -218,7 +218,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole * true if the reduction was successful; false otherwise. */ bool tryToReduceCRJLHIToLOCHI(TR::InstOpCode::Mnemonic compareMnemonic); - + /** \brief * Tries to reduce a load instruction (\c L) to an insert character under mask (\c ICM) instruction. This can * be done if following the load we have a load and test or a compare against certain immediates. For example: @@ -261,7 +261,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole * true if the reduction was successful; false otherwise. */ bool tryToReduceLToLZRF(TR::InstOpCode::Mnemonic loadAndZeroRightMostByteMnemonic); - + /** \brief * Tries to reduce a load register instruction (\c LGR or \c LTGR) followed by a sign extension to \c LGFR. * For example: @@ -300,7 +300,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole * true if the reduction was successful; false otherwise. */ bool tryToReduceLHIToXR(); - + /** \brief * Tries to reduce a load logical character instruction (\c LLC) followed by a zero extension to \c LLGC. * For example: @@ -320,7 +320,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole * true if the reduction was successful; false otherwise. */ bool tryToReduceLLCToLLGC(); - + /** \brief * Tries to reduce a load register instruction (\c LR or \c LGR) and a future compare (\c CHI) against the * target register to \c LTR or \c LTGR. 
For example: @@ -347,7 +347,7 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole * true if the reduction was successful; false otherwise. */ bool tryToReduceLRCHIToLTR(); - + /** \brief * Tries to reduce a load and test register instruction (\c LTR or \c LTGR) to a compare halfword immediate if * the target register of the load is used in a future memory reference. This is an attempt to reduce the AGI @@ -480,6 +480,30 @@ class OMR_EXTENSIBLE Peephole : public OMR::Peephole */ bool tryToRemoveRedundantLTR(); + /** \brief + * Tries to remove redundant 32 to 64 bit extensions with \c LGFR or \c LLGFR on register + * values originating from 32 bit loads if the 32 bit load instruction can be replaced with + * an equivalent extending 32 bit load. For example: + * + * + * L R1,N(R2,R3) + * LGFR R1,R1 + * + * + * can be reduced to: + * + * + * LGF R1,N(R2,R3) + * + * + * \param isSigned + * true if operating on an LGFR instruction; false if LLGFR + * + * \return + * true if the reduction was successful; false otherwise + */ + bool tryToRemoveRedundant32To64BitExtend(bool isSigned); + private: /// The instruction cursor currently being processed by the peephole optimization From a4f64329d224fce702a259bb089010a156b0c2f1 Mon Sep 17 00:00:00 2001 From: midronij Date: Tue, 12 Sep 2023 12:30:20 -0400 Subject: [PATCH 02/11] Offheap Adjustments for Unsafe.setMemory() When Unsafe.setMemory() is called on an array and offheap changes are enabled, adjust arguments so that dataAddr is passed in as base address of object. 
Signed-off-by: midronij --- compiler/p/codegen/OMRTreeEvaluator.cpp | 132 +++++++++++++++++++----- 1 file changed, 107 insertions(+), 25 deletions(-) diff --git a/compiler/p/codegen/OMRTreeEvaluator.cpp b/compiler/p/codegen/OMRTreeEvaluator.cpp index 232167eebf0..792faa0845e 100644 --- a/compiler/p/codegen/OMRTreeEvaluator.cpp +++ b/compiler/p/codegen/OMRTreeEvaluator.cpp @@ -5902,23 +5902,28 @@ OMR::Power::TreeEvaluator::generateHelperBranchAndLinkInstruction( TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR::CodeGenerator *cg) { TR::Compilation *comp = cg->comp(); - TR::Node *dstAddrNode, *lengthNode, *valueNode; - dstAddrNode = node->getChild(0); - lengthNode = node->getChild(1); - valueNode = node->getChild(2); - TR::Register *dstAddrReg, *lengthReg, *valueReg; - bool stopUsingCopyReg1, stopUsingCopyReg2 = false, stopUsingCopyReg3 = false; + TR::Node *dstBaseAddrNode = node->getChild(0); + TR::Node *dstOffsetNode = node->getChild(1); + TR::Node *lengthNode = node->getChild(2); + TR::Node *valueNode = node->getChild(3); + + TR::Register *dstBaseAddrReg, *dstOffsetReg, *lengthReg, *valueReg; + bool stopUsingCopyReg1, stopUsingCopyReg2, stopUsingCopyReg3 = false, stopUsingCopyReg4 = false; + + bool stopUsingCopyRegBase = dstBaseAddrNode ? TR::TreeEvaluator::stopUsingCopyReg(dstBaseAddrNode, dstBaseAddrReg, cg) : false; + bool stopUsingCopyRegOffset = dstOffsetNode ? TR::TreeEvaluator::stopUsingCopyReg(dstOffsetNode, dstOffsetReg, cg) : false; + bool stopUsingCopyRegAddr = dstAddrNode ? 
TR::TreeEvaluator::stopUsingCopyReg(dstAddrNode, dstAddrReg, cg) : false ; - stopUsingCopyReg1 = TR::TreeEvaluator::stopUsingCopyReg(dstAddrNode, dstAddrReg, cg); + bool stopUsingCopyRegLen, stopUsingCopyRegVal; lengthReg = cg->evaluate(lengthNode); if (!cg->canClobberNodesRegister(lengthNode)) { - TR::Register *lenCopyReg = cg->allocateRegister(); + TR::Register *lenCopyReg = cg->allocateRegister(); generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, lengthNode, lenCopyReg, lengthReg); lengthReg = lenCopyReg; - stopUsingCopyReg2 = true; + stopUsingCopyReg3 = true; } valueReg = cg->evaluate(valueNode); @@ -5927,7 +5932,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: TR::Register *valCopyReg = cg->allocateRegister(); generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, valueNode, valCopyReg, valueReg); valueReg = valCopyReg; - stopUsingCopyReg3 = true; + stopUsingCopyReg4 = true; } TR::LabelSymbol * residualLabel = generateLabelSymbol(cg); @@ -5939,15 +5944,88 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: TR::LabelSymbol * label1aligned = generateLabelSymbol(cg); TR::RegisterDependencyConditions *conditions; - int32_t numDeps = 5; + int32_t numDeps = 7; conditions = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(numDeps, numDeps, cg->trMemory()); TR::Register *cndReg = cg->allocateRegister(TR_CCR); TR::addDependency(conditions, cndReg, TR::RealRegister::cr0, TR_CCR, cg); - TR::addDependency(conditions, dstAddrReg, TR::RealRegister::NoReg, TR_GPR, cg); + TR::addDependency(conditions, dstBaseAddrReg, TR::RealRegister::NoReg, TR_GPR, cg); + TR::addDependency(conditions, dstOffsetReg, TR::RealRegister::NoReg, TR_GPR, cg); TR::addDependency(conditions, lengthReg, TR::RealRegister::NoReg, TR_GPR, cg); TR::addDependency(conditions, valueReg, TR::RealRegister::NoReg, TR_GPR, cg); - TR::Register * tempReg = cg->allocateRegister(); - TR::addDependency(conditions, tempReg, 
TR::RealRegister::NoReg, TR_GPR, cg); + TR::Register * temp1Reg = cg->allocateRegister(); + TR::Register * temp2Reg = cg->allocateRegister(); + TR::addDependency(conditions, temp1Reg, TR::RealRegister::NoReg, TR_GPR, cg); + TR::addDependency(conditions, temp2Reg, TR::RealRegister::NoReg, TR_GPR, cg); + + +#if defined (J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION) + // When using balanced GC policy with offheap allocation enabled, there are three possible cases: + // 1.) The object at dstBaseAddr is known to be a non-array object at compile time. In this scenario, no arrayCHK is + // generated, and no adjustments are made to dstBaseAddr or dstOffset. The behavior in this case should be identical + // to that under gencon GC policy. + // 2.) The object at dstBaseAddr is known to be an array at compile time. In this scenario, no arrayCHK is generated, but + // the dstBaseAddr and dstOffset with be adjusted as needed for offheap. + // 3.) The type of the object at dstBaseAddr is unknown at compile time. In this scenario, a runtime arrayCHK will generated, + // with two possible outcomes: if the object is an array, the dstBaseAddr and dstOffset will be adjusted, and if not, + // no adjustments will be made. 
+ + //check dstBaseAddrNode type at compile time + int length; + const char *objTypeSig = dstBaseAddrNode->getSymbolReference()->getTypeSignature(length); + + //generate arrayCHK in case (3) only + bool arrayCheckNeeded = TR::Compiler->om.isOffHeapAllocationEnabled() && comp->target().is64Bit() && + (objTypeSig == NULL || strstr(objTypeSig, "Ljava/lang/Object")); + + //adjust dstBaseAddr and dstOffset in cases (2) and (3) + bool adjustmentNeeded = arrayCheckNeeded || + TR::Compiler->om.isOffHeapAllocationEnabled() && comp->target().is64Bit() && objTypeSig[0] == '['; + + //generate array check if needed + TR::LabelSymbol *notArray = generateLabelSymbol(cg); + + if (arrayCheckNeeded) + { + TR::Register *dstClassInfoReg = temp1Reg; + TR::Register *arrayFlagReg = temp2Reg; + + //load dst class info into temp1Reg + if (TR::Compiler->om.compressObjectReferences()) + generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, dstClassInfoReg, + TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, static_cast(TR::Compiler->om.offsetOfObjectVftField()), 4)); + else + generateTrg1MemInstruction(cg,TR::InstOpCode::Op_load, node, dstClassInfoReg, + TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, static_cast(TR::Compiler->om.offsetOfObjectVftField()), TR::Compiler->om.sizeofReferenceAddress())); + TR::TreeEvaluator::generateVFTMaskInstruction(cg, node, dstClassInfoReg); + + TR::MemoryReference *dstClassMR = TR::MemoryReference::createWithDisplacement(cg, dstClassInfoReg, offsetof(J9Class, classDepthAndFlags), TR::Compiler->om.sizeofReferenceAddress()); + generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, dstClassInfoReg, dstClassMR); + + //generate arrayCHK + loadConstant(cg, node, comp->fej9()->getFlagValueForArrayCheck(), arrayFlagReg); + generateTrg1Src2Instruction(cg, TR::InstOpCode::AND, node, arrayFlagReg, dstClassInfoReg, arrayFlagReg); + generateTrg1Src1ImmInstruction(cg,TR::InstOpCode::cmpi8, node, cndReg, arrayFlagReg, 0); + + //if 
object is not an array (i.e.: temp1Reg & temp2Reg == 0), skip adjusting dstBaseAddr and dstOffset + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, notArray, cndReg); + } + + //adjust dstBaseAddr if needed + if (adjustmentNeeded) + { + //load dataAddr + TR::MemoryReference *dataAddrSlotMR = TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, comp->fej9()->getOffsetOfContiguousDataAddrField(), TR::Compiler->om.sizeofReferenceAddress()); + generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, dstBaseAddrReg, dataAddrSlotMR); + } + + //arrayCHK will skip to here if object is not an array + generateLabelInstruction(cg, TR::InstOpCode::label, node, notArray); + +#endif /* J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION */ + + //calculate dstAddr = dstBaseAddr + dstOffset + TR::Register *dstAddrReg = dstBaseAddrReg; + generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstBaseAddrReg, dstOffsetReg); // assemble the double word value from byte value generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwimi, node, valueReg, valueReg, 8, 0xff00); @@ -5957,8 +6035,8 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: generateTrg1Src1ImmInstruction(cg, lengthNode->getType().isInt32() ? TR::InstOpCode::cmpli4 : TR::InstOpCode::cmpli8, node, cndReg, lengthReg, 32); generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, residualLabel, cndReg); - generateTrg1Src1ImmInstruction(cg, lengthNode->getType().isInt32() ? TR::InstOpCode::srawi : TR::InstOpCode::sradi, node, tempReg, lengthReg, 5); - generateSrc1Instruction(cg, TR::InstOpCode::mtctr, node, tempReg); + generateTrg1Src1ImmInstruction(cg, lengthNode->getType().isInt32() ? 
TR::InstOpCode::srawi : TR::InstOpCode::sradi, node, temp1Reg, lengthReg, 5); + generateSrc1Instruction(cg, TR::InstOpCode::mtctr, node, temp1Reg); generateLabelInstruction(cg, TR::InstOpCode::label, node, loopStartLabel); generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueReg); @@ -5968,48 +6046,52 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: generateConditionalBranchInstruction(cg, TR::InstOpCode::bdnz, node, loopStartLabel, cndReg); generateLabelInstruction(cg, TR::InstOpCode::label, node, residualLabel); //check 16 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tempReg, lengthReg, 16); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 16); generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label8aligned, cndReg); generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueReg); generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16); generateLabelInstruction(cg, TR::InstOpCode::label, node, label8aligned); //check 8 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tempReg, lengthReg, 8); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 8); generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label4aligned, cndReg); generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, 
dstAddrReg, dstAddrReg, 8); generateLabelInstruction(cg, TR::InstOpCode::label, node, label4aligned); //check 4 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tempReg, lengthReg, 4); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 4); generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label2aligned, cndReg); generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 4), valueReg); generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 4); generateLabelInstruction(cg, TR::InstOpCode::label, node, label2aligned); //check 2 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tempReg, lengthReg, 2); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 2); generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label1aligned, cndReg); generateMemSrc1Instruction(cg, TR::InstOpCode::sth, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 2), valueReg); generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 2); generateLabelInstruction(cg, TR::InstOpCode::label, node, label1aligned); //check 1 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tempReg, lengthReg, 1); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 1); generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, doneLabel, cndReg); generateMemSrc1Instruction(cg, TR::InstOpCode::stb, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 1), valueReg); generateDepLabelInstruction(cg, TR::InstOpCode::label, node, doneLabel, conditions); if (stopUsingCopyReg1) - cg->stopUsingRegister(dstAddrReg); + cg->stopUsingRegister(dstBaseAddrReg); if (stopUsingCopyReg2) - cg->stopUsingRegister(lengthReg); + cg->stopUsingRegister(dstOffsetReg); 
if (stopUsingCopyReg3) + cg->stopUsingRegister(lengthReg); + if (stopUsingCopyReg4) cg->stopUsingRegister(valueReg); cg->stopUsingRegister(cndReg); - cg->stopUsingRegister(tempReg); + cg->stopUsingRegister(temp1Reg); + cg->stopUsingRegister(temp2Reg); - cg->decReferenceCount(dstAddrNode); + cg->decReferenceCount(dstBaseAddrNode); + cg->decReferenceCount(dstOffsetNode); cg->decReferenceCount(lengthNode); cg->decReferenceCount(valueNode); From 310722641a65dabb3650e80b25d4f28ed6e9b67b Mon Sep 17 00:00:00 2001 From: midronij Date: Tue, 24 Oct 2023 13:52:19 -0400 Subject: [PATCH 03/11] Skip unnecessary runtime array check for Unsafe.setMemory() Only generate runtime array check in setmemoryEvaluator() if it is needed (i.e.: object type is unknown at compile time). Signed-off-by: midronij --- compiler/p/codegen/OMRTreeEvaluator.cpp | 138 +++++++++++++----------- 1 file changed, 78 insertions(+), 60 deletions(-) diff --git a/compiler/p/codegen/OMRTreeEvaluator.cpp b/compiler/p/codegen/OMRTreeEvaluator.cpp index 792faa0845e..3a931a5e79f 100644 --- a/compiler/p/codegen/OMRTreeEvaluator.cpp +++ b/compiler/p/codegen/OMRTreeEvaluator.cpp @@ -5902,14 +5902,35 @@ OMR::Power::TreeEvaluator::generateHelperBranchAndLinkInstruction( TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR::CodeGenerator *cg) { TR::Compilation *comp = cg->comp(); + TR::Node *dstBaseAddrNode, *dstOffsetNode, *dstAddrNode, *lengthNode, *valueNode; - TR::Node *dstBaseAddrNode = node->getChild(0); - TR::Node *dstOffsetNode = node->getChild(1); - TR::Node *lengthNode = node->getChild(2); - TR::Node *valueNode = node->getChild(3); + bool arrayCheckNeeded; + + // IL tree structure depends on whether or not it's been determined that a runtime arrayCHK is needed: + // if node has four children (i.e.: object base address and offset are separate), need array check + // if node three children (i.e.: object base address and offset have already been added together), don't need array check + if 
(node->getNumChildren() == 4) + { + arrayCheckNeeded = true; + + dstBaseAddrNode = node->getChild(0); + dstOffsetNode = node->getChild(1); + dstAddrNode = NULL; + lengthNode = node->getChild(2); + valueNode = node->getChild(3); + } + else //i.e.: node->getNumChildren() == 3 + { + arrayCheckNeeded = false; + + dstBaseAddrNode = NULL; + dstOffsetNode = NULL; + dstAddrNode = node->getChild(0); + lengthNode = node->getChild(1); + valueNode = node->getChild(2); + } - TR::Register *dstBaseAddrReg, *dstOffsetReg, *lengthReg, *valueReg; - bool stopUsingCopyReg1, stopUsingCopyReg2, stopUsingCopyReg3 = false, stopUsingCopyReg4 = false; + TR::Register *dstBaseAddrReg, *dstOffsetReg, *dstAddrReg, *lengthReg, *valueReg; bool stopUsingCopyRegBase = dstBaseAddrNode ? TR::TreeEvaluator::stopUsingCopyReg(dstBaseAddrNode, dstBaseAddrReg, cg) : false; bool stopUsingCopyRegOffset = dstOffsetNode ? TR::TreeEvaluator::stopUsingCopyReg(dstOffsetNode, dstOffsetReg, cg) : false; @@ -5923,16 +5944,16 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: TR::Register *lenCopyReg = cg->allocateRegister(); generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, lengthNode, lenCopyReg, lengthReg); lengthReg = lenCopyReg; - stopUsingCopyReg3 = true; + stopUsingCopyRegLen = true; } valueReg = cg->evaluate(valueNode); if (!cg->canClobberNodesRegister(valueNode)) { - TR::Register *valCopyReg = cg->allocateRegister(); + TR::Register *valCopyReg = cg->allocateRegister(); generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, valueNode, valCopyReg, valueReg); valueReg = valCopyReg; - stopUsingCopyReg4 = true; + stopUsingCopyRegVal = true; } TR::LabelSymbol * residualLabel = generateLabelSymbol(cg); @@ -5944,48 +5965,46 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: TR::LabelSymbol * label1aligned = generateLabelSymbol(cg); TR::RegisterDependencyConditions *conditions; - int32_t numDeps = 7; + int32_t numDeps = arrayCheckNeeded ? 
7 : 6; conditions = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(numDeps, numDeps, cg->trMemory()); TR::Register *cndReg = cg->allocateRegister(TR_CCR); TR::addDependency(conditions, cndReg, TR::RealRegister::cr0, TR_CCR, cg); - TR::addDependency(conditions, dstBaseAddrReg, TR::RealRegister::NoReg, TR_GPR, cg); - TR::addDependency(conditions, dstOffsetReg, TR::RealRegister::NoReg, TR_GPR, cg); + + if (arrayCheckNeeded) + { + //dstBaseAddrReg holds the address of the object being written to, so need to exclude GPR0 + TR::addDependency(conditions, dstBaseAddrReg, TR::RealRegister::NoReg, TR_GPR, cg); + conditions->getPostConditions()->getRegisterDependency(conditions->getAddCursorForPost() - 1)->setExcludeGPR0(); + + if (!useOffsetAsImmVal) + TR::addDependency(conditions, dstOffsetReg, TR::RealRegister::NoReg, TR_GPR, cg); + } + else + { + //dstAddrReg holds the address of the object being written to, so need to exclude GPR0 + TR::addDependency(conditions, dstAddrReg, TR::RealRegister::NoReg, TR_GPR, cg); + conditions->getPostConditions()->getRegisterDependency(1)->setExcludeGPR0(); + } + TR::addDependency(conditions, lengthReg, TR::RealRegister::NoReg, TR_GPR, cg); TR::addDependency(conditions, valueReg, TR::RealRegister::NoReg, TR_GPR, cg); + + //temp1Reg will later be used to hold the J9Class flags for the object at dst, so need to exclude GPR0 TR::Register * temp1Reg = cg->allocateRegister(); - TR::Register * temp2Reg = cg->allocateRegister(); TR::addDependency(conditions, temp1Reg, TR::RealRegister::NoReg, TR_GPR, cg); + conditions->getPostConditions()->getRegisterDependency(conditions->getAddCursorForPost() - 1)->setExcludeGPR0(); + + TR::Register * temp2Reg = cg->allocateRegister(); TR::addDependency(conditions, temp2Reg, TR::RealRegister::NoReg, TR_GPR, cg); #if defined (J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION) - // When using balanced GC policy with offheap allocation enabled, there are three possible cases: - // 1.) 
The object at dstBaseAddr is known to be a non-array object at compile time. In this scenario, no arrayCHK is - // generated, and no adjustments are made to dstBaseAddr or dstOffset. The behavior in this case should be identical - // to that under gencon GC policy. - // 2.) The object at dstBaseAddr is known to be an array at compile time. In this scenario, no arrayCHK is generated, but - // the dstBaseAddr and dstOffset with be adjusted as needed for offheap. - // 3.) The type of the object at dstBaseAddr is unknown at compile time. In this scenario, a runtime arrayCHK will generated, - // with two possible outcomes: if the object is an array, the dstBaseAddr and dstOffset will be adjusted, and if not, - // no adjustments will be made. - - //check dstBaseAddrNode type at compile time - int length; - const char *objTypeSig = dstBaseAddrNode->getSymbolReference()->getTypeSignature(length); - - //generate arrayCHK in case (3) only - bool arrayCheckNeeded = TR::Compiler->om.isOffHeapAllocationEnabled() && comp->target().is64Bit() && - (objTypeSig == NULL || strstr(objTypeSig, "Ljava/lang/Object")); - - //adjust dstBaseAddr and dstOffset in cases (2) and (3) - bool adjustmentNeeded = arrayCheckNeeded || - TR::Compiler->om.isOffHeapAllocationEnabled() && comp->target().is64Bit() && objTypeSig[0] == '['; - - //generate array check if needed - TR::LabelSymbol *notArray = generateLabelSymbol(cg); - if (arrayCheckNeeded) + if (arrayCheckNeeded) // CASE (3) { + //generate array check if needed + TR::LabelSymbol *notArray = generateLabelSymbol(cg); + TR::Register *dstClassInfoReg = temp1Reg; TR::Register *arrayFlagReg = temp2Reg; @@ -5996,37 +6015,33 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: else generateTrg1MemInstruction(cg,TR::InstOpCode::Op_load, node, dstClassInfoReg, TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, static_cast(TR::Compiler->om.offsetOfObjectVftField()), TR::Compiler->om.sizeofReferenceAddress())); 
+ TR::TreeEvaluator::generateVFTMaskInstruction(cg, node, dstClassInfoReg); TR::MemoryReference *dstClassMR = TR::MemoryReference::createWithDisplacement(cg, dstClassInfoReg, offsetof(J9Class, classDepthAndFlags), TR::Compiler->om.sizeofReferenceAddress()); generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, dstClassInfoReg, dstClassMR); - //generate arrayCHK - loadConstant(cg, node, comp->fej9()->getFlagValueForArrayCheck(), arrayFlagReg); - generateTrg1Src2Instruction(cg, TR::InstOpCode::AND, node, arrayFlagReg, dstClassInfoReg, arrayFlagReg); - generateTrg1Src1ImmInstruction(cg,TR::InstOpCode::cmpi8, node, cndReg, arrayFlagReg, 0); + //generate array check + int32_t arrayFlagValue = comp->fej9()->getFlagValueForArrayCheck(); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andis_r, node, arrayFlagReg, dstClassInfoReg, arrayFlagValue >> 16); //if object is not an array (i.e.: temp1Reg & temp2Reg == 0), skip adjusting dstBaseAddr and dstOffset generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, notArray, cndReg); - } - //adjust dstBaseAddr if needed - if (adjustmentNeeded) - { - //load dataAddr + //load dataAddr if object is array: TR::MemoryReference *dataAddrSlotMR = TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, comp->fej9()->getOffsetOfContiguousDataAddrField(), TR::Compiler->om.sizeofReferenceAddress()); generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, dstBaseAddrReg, dataAddrSlotMR); - } + + //arrayCHK will skip to here if object is not an array + generateLabelInstruction(cg, TR::InstOpCode::label, node, notArray); - //arrayCHK will skip to here if object is not an array - generateLabelInstruction(cg, TR::InstOpCode::label, node, notArray); + //calculate dstAddr = dstBaseAddr + dstOffset + dstAddrReg = dstBaseAddrReg; + generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstBaseAddrReg, dstOffsetReg); + } #endif /* J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION */ - //calculate dstAddr = 
dstBaseAddr + dstOffset - TR::Register *dstAddrReg = dstBaseAddrReg; - generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstBaseAddrReg, dstOffsetReg); - // assemble the double word value from byte value generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwimi, node, valueReg, valueReg, 8, 0xff00); generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwimi, node, valueReg, valueReg, 16, 0xffff0000); @@ -6077,21 +6092,24 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: generateDepLabelInstruction(cg, TR::InstOpCode::label, node, doneLabel, conditions); - if (stopUsingCopyReg1) + if (stopUsingCopyRegBase) cg->stopUsingRegister(dstBaseAddrReg); - if (stopUsingCopyReg2) + if (stopUsingCopyRegOffset) cg->stopUsingRegister(dstOffsetReg); - if (stopUsingCopyReg3) + if (stopUsingCopyRegAddr) + cg->stopUsingRegister(dstAddrReg); + if (stopUsingCopyRegLen) cg->stopUsingRegister(lengthReg); - if (stopUsingCopyReg4) + if (stopUsingCopyRegVal) cg->stopUsingRegister(valueReg); cg->stopUsingRegister(cndReg); cg->stopUsingRegister(temp1Reg); cg->stopUsingRegister(temp2Reg); - cg->decReferenceCount(dstBaseAddrNode); - cg->decReferenceCount(dstOffsetNode); + if (dstBaseAddrNode) cg->decReferenceCount(dstBaseAddrNode); + if (dstOffsetNode) cg->decReferenceCount(dstOffsetNode); + if (dstAddrNode) cg->decReferenceCount(dstAddrNode); cg->decReferenceCount(lengthNode); cg->decReferenceCount(valueNode); From 038794441609ff0102fc76a5617caef582ce1145 Mon Sep 17 00:00:00 2001 From: midronij Date: Wed, 29 May 2024 00:36:23 -0400 Subject: [PATCH 04/11] Add NULLCHK on object address passed in to Unsafe.setMemory() In situations where an array check is needed, there are scenarios in which we do not want to modify the dest base address: 1.) If the object is a NULL reference (since we can't load dataAddr from a NULL pointer) 2.) 
If the object is a non-array object Thus, before the array check is performed, a null test is needed to account for situation (1). Signed-off-by: midronij --- compiler/p/codegen/OMRTreeEvaluator.cpp | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/compiler/p/codegen/OMRTreeEvaluator.cpp b/compiler/p/codegen/OMRTreeEvaluator.cpp index 3a931a5e79f..806c192b214 100644 --- a/compiler/p/codegen/OMRTreeEvaluator.cpp +++ b/compiler/p/codegen/OMRTreeEvaluator.cpp @@ -6002,9 +6002,23 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: if (arrayCheckNeeded) // CASE (3) { - //generate array check if needed - TR::LabelSymbol *notArray = generateLabelSymbol(cg); + // There are two scenarios in which we DON'T want to modify the dest base address: + // 1.) If the object is NULL (since we can't load dataAddr from a NULL pointer) + // 2.) If the object is a non-array object + // So two checks are required (NULL, Array) to determine whether dataAddr should be loaded or not + TR::LabelSymbol *noDataAddr = generateLabelSymbol(cg); + + // We only want to generate a runtime NULL check if the status of the object (i.e.: whether it is NULL or non-NULL) + // is NOT known. Note that if the object is known to be NULL, arrayCheckNeeded will be false, so there is no need to check + // that condition here. 
+ if (!dstBaseAddrNode->isNonNull()) + { + //generate NULL test + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpi, node, cndReg, dstBaseAddrReg, 0); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, noDataAddr, cndReg); + } + //Array Check TR::Register *dstClassInfoReg = temp1Reg; TR::Register *arrayFlagReg = temp2Reg; @@ -6026,14 +6040,14 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andis_r, node, arrayFlagReg, dstClassInfoReg, arrayFlagValue >> 16); //if object is not an array (i.e.: temp1Reg & temp2Reg == 0), skip adjusting dstBaseAddr and dstOffset - generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, notArray, cndReg); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, noDataAddr, cndReg); //load dataAddr if object is array: TR::MemoryReference *dataAddrSlotMR = TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, comp->fej9()->getOffsetOfContiguousDataAddrField(), TR::Compiler->om.sizeofReferenceAddress()); generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, dstBaseAddrReg, dataAddrSlotMR); //arrayCHK will skip to here if object is not an array - generateLabelInstruction(cg, TR::InstOpCode::label, node, notArray); + generateLabelInstruction(cg, TR::InstOpCode::label, node, noDataAddr); //calculate dstAddr = dstBaseAddr + dstOffset dstAddrReg = dstBaseAddrReg; @@ -6047,7 +6061,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwimi, node, valueReg, valueReg, 16, 0xffff0000); generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 32, 0xffffffff00000000); - generateTrg1Src1ImmInstruction(cg, lengthNode->getType().isInt32() ? 
TR::InstOpCode::cmpli4 : TR::InstOpCode::cmpli8, node, cndReg, lengthReg, 32); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndReg, lengthReg, 32); generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, residualLabel, cndReg); generateTrg1Src1ImmInstruction(cg, lengthNode->getType().isInt32() ? TR::InstOpCode::srawi : TR::InstOpCode::sradi, node, temp1Reg, lengthReg, 5); From cf8b0b509ebac44b18df5ce78375e67cb8706e9c Mon Sep 17 00:00:00 2001 From: midronij Date: Mon, 30 Oct 2023 17:06:34 -0400 Subject: [PATCH 05/11] Avoid allocating extra register when Unsafe.setMemory() offset is constant When destOffset is a constant 16-bit value, it can be represented as the immediate value argument to addi when calculating the final destination address (i.e.: dest = base address + offset). This allows us to allocate one less register when generating the assembly code sequence for Unsafe.setMemory(). Signed-off-by: midronij --- compiler/p/codegen/OMRTreeEvaluator.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/compiler/p/codegen/OMRTreeEvaluator.cpp b/compiler/p/codegen/OMRTreeEvaluator.cpp index 806c192b214..b2abcabb3ae 100644 --- a/compiler/p/codegen/OMRTreeEvaluator.cpp +++ b/compiler/p/codegen/OMRTreeEvaluator.cpp @@ -5932,8 +5932,12 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: TR::Register *dstBaseAddrReg, *dstOffsetReg, *dstAddrReg, *lengthReg, *valueReg; + // if the offset is a constant value less than 16 bits, then we dont need a separate register for it + bool useOffsetAsImmVal = dstOffsetNode && dstOffsetNode->getOpCode().isLoadConst() && + (dstOffsetNode->getConstValue() >= LOWER_IMMED) && (dstOffsetNode->getConstValue() <= UPPER_IMMED); + bool stopUsingCopyRegBase = dstBaseAddrNode ? TR::TreeEvaluator::stopUsingCopyReg(dstBaseAddrNode, dstBaseAddrReg, cg) : false; - bool stopUsingCopyRegOffset = dstOffsetNode ? 
TR::TreeEvaluator::stopUsingCopyReg(dstOffsetNode, dstOffsetReg, cg) : false; + bool stopUsingCopyRegOffset = (dstOffsetNode && !useOffsetAsImmVal) ? TR::TreeEvaluator::stopUsingCopyReg(dstOffsetNode, dstOffsetReg, cg) : false; bool stopUsingCopyRegAddr = dstAddrNode ? TR::TreeEvaluator::stopUsingCopyReg(dstAddrNode, dstAddrReg, cg) : false ; bool stopUsingCopyRegLen, stopUsingCopyRegVal; @@ -5965,7 +5969,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: TR::LabelSymbol * label1aligned = generateLabelSymbol(cg); TR::RegisterDependencyConditions *conditions; - int32_t numDeps = arrayCheckNeeded ? 7 : 6; + int32_t numDeps = (!arrayCheckNeeded || useOffsetAsImmVal) ? 6 : 7; conditions = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(numDeps, numDeps, cg->trMemory()); TR::Register *cndReg = cg->allocateRegister(TR_CCR); TR::addDependency(conditions, cndReg, TR::RealRegister::cr0, TR_CCR, cg); @@ -6051,7 +6055,14 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: //calculate dstAddr = dstBaseAddr + dstOffset dstAddrReg = dstBaseAddrReg; - generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstBaseAddrReg, dstOffsetReg); + + if (useOffsetAsImmVal) + { + int offsetImmVal = dstOffsetNode->getConstValue(); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstBaseAddrReg, offsetImmVal); + } + else + generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstBaseAddrReg, dstOffsetReg); } #endif /* J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION */ From e714da12266a98b8b612242f35c6b11eb5b69b46 Mon Sep 17 00:00:00 2001 From: midronij Date: Mon, 30 Oct 2023 19:16:24 -0400 Subject: [PATCH 06/11] Use Vector Instructions to Optimize Unsafe.setMemory() on PPC On P8 and higher, we can make use of vector stores (stxvd2x and, for P10 specifically, stxvl) to reduce the number of memory accesses and avoid checks needed to set residual bytes in the assembly 
code that is generated for Unsafe.setMemory(). Signed-off-by: midronij --- compiler/p/codegen/OMRTreeEvaluator.cpp | 238 ++++++++++++++++++------ 1 file changed, 183 insertions(+), 55 deletions(-) diff --git a/compiler/p/codegen/OMRTreeEvaluator.cpp b/compiler/p/codegen/OMRTreeEvaluator.cpp index b2abcabb3ae..df8091d3bc7 100644 --- a/compiler/p/codegen/OMRTreeEvaluator.cpp +++ b/compiler/p/codegen/OMRTreeEvaluator.cpp @@ -5937,25 +5937,82 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: (dstOffsetNode->getConstValue() >= LOWER_IMMED) && (dstOffsetNode->getConstValue() <= UPPER_IMMED); bool stopUsingCopyRegBase = dstBaseAddrNode ? TR::TreeEvaluator::stopUsingCopyReg(dstBaseAddrNode, dstBaseAddrReg, cg) : false; - bool stopUsingCopyRegOffset = (dstOffsetNode && !useOffsetAsImmVal) ? TR::TreeEvaluator::stopUsingCopyReg(dstOffsetNode, dstOffsetReg, cg) : false; bool stopUsingCopyRegAddr = dstAddrNode ? TR::TreeEvaluator::stopUsingCopyReg(dstAddrNode, dstAddrReg, cg) : false ; - bool stopUsingCopyRegLen, stopUsingCopyRegVal; + bool stopUsingCopyRegOffset, stopUsingCopyRegLen, stopUsingCopyRegVal; + //dstOffsetNode (type: long) + if (dstOffsetNode && !useOffsetAsImmVal) //only want to allocate a register for dstoffset if we're using it for the array check AND it isn't a constant + { + if (!cg->canClobberNodesRegister(lengthNode)) //only need to copy dstOffset into another register if the current one isn't clobberable + { + if (cg->comp()->target().is32Bit()) //on 32-bit systems, need to grab the lower 32 bits of offset from the register pair + { + dstOffsetReg = cg->evaluate(dstOffsetNode); + TR::Register *offsetCopyReg = cg->allocateRegister(); + generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, dstOffsetNode, offsetCopyReg, dstOffsetReg->getLowOrder()); + + dstOffsetReg = offsetCopyReg; + stopUsingCopyRegOffset = true; + } + else + { + stopUsingCopyRegOffset = TR::TreeEvaluator::stopUsingCopyReg(dstOffsetNode, dstOffsetReg, cg); + } 
+ } + else + { + dstOffsetReg = cg->evaluate(dstOffsetNode); + + if (cg->comp()->target().is32Bit()) //on 32-bit systems, need to grab the lower 32 bits of offset from the register pair + dstOffsetReg = dstOffsetReg->getLowOrder(); + + stopUsingCopyRegOffset = false; + } + } + else + { + stopUsingCopyRegOffset = false; + } + + //lengthNode (type: long) lengthReg = cg->evaluate(lengthNode); if (!cg->canClobberNodesRegister(lengthNode)) { - TR::Register *lenCopyReg = cg->allocateRegister(); - generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, lengthNode, lenCopyReg, lengthReg); + TR::Register *lenCopyReg = cg->allocateRegister(); + + if (cg->comp()->target().is32Bit()) //on 32-bit systems, need to grab the lower 32 bits of length from the register pair + generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, lengthNode, lenCopyReg, lengthReg->getLowOrder()); + else //on 64-bit system, can just do a normal copy + generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, lengthNode, lenCopyReg, lengthReg); + lengthReg = lenCopyReg; stopUsingCopyRegLen = true; } + else + { + if (cg->comp()->target().is32Bit()) //on 32-bit system, need to grab lower 32 bits of length from the register pair + lengthReg = lengthReg->getLowOrder(); + + stopUsingCopyRegLen = false; + } + //valueNode (type: byte) valueReg = cg->evaluate(valueNode); - if (!cg->canClobberNodesRegister(valueNode)) + if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8)) { - TR::Register *valCopyReg = cg->allocateRegister(); + //on P8 or higher, we can use vector instructions to cut down on loop iterations and residual tests -> need to copy valueReg into a VSX register + TR::Register *valVectorReg = cg->allocateRegister(TR_VRF); + generateTrg1Src1Instruction(cg, TR::InstOpCode::mtvsrd, valueNode, valVectorReg, valueReg); + + valueReg = valVectorReg; + stopUsingCopyRegVal = true; + } + else if (!cg->canClobberNodesRegister(valueNode)) + { + TR::Register *valCopyReg = cg->allocateRegister(); 
generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, valueNode, valCopyReg, valueReg); + valueReg = valCopyReg; stopUsingCopyRegVal = true; } @@ -5963,13 +6020,25 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: TR::LabelSymbol * residualLabel = generateLabelSymbol(cg); TR::LabelSymbol * loopStartLabel = generateLabelSymbol(cg); TR::LabelSymbol * doneLabel = generateLabelSymbol(cg); - TR::LabelSymbol * label8aligned = generateLabelSymbol(cg); - TR::LabelSymbol * label4aligned = generateLabelSymbol(cg); - TR::LabelSymbol * label2aligned = generateLabelSymbol(cg); - TR::LabelSymbol * label1aligned = generateLabelSymbol(cg); + + //these labels are not needed for the vector approach to storing to residual bytes (i.e.: P10+) + TR::LabelSymbol *label8aligned, *label4aligned, *label2aligned, *label1aligned; + + if (!cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P10)) + { + label8aligned = generateLabelSymbol(cg); + label4aligned = generateLabelSymbol(cg); + label2aligned = generateLabelSymbol(cg); + label1aligned = generateLabelSymbol(cg); + } TR::RegisterDependencyConditions *conditions; - int32_t numDeps = (!arrayCheckNeeded || useOffsetAsImmVal) ? 6 : 7; + int32_t numDeps = 6; + + //need extra register for offset only if it isn't already included in the destination address AND it isn't a constant + if (arrayCheckNeeded && !useOffsetAsImmVal) + numDeps++; + conditions = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(numDeps, numDeps, cg->trMemory()); TR::Register *cndReg = cg->allocateRegister(TR_CCR); TR::addDependency(conditions, cndReg, TR::RealRegister::cr0, TR_CCR, cg); @@ -6005,7 +6074,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: #if defined (J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION) if (arrayCheckNeeded) // CASE (3) - { + { // There are two scenarios in which we DON'T want to modify the dest base address: // 1.) 
If the object is NULL (since we can't load dataAddr from a NULL pointer) // 2.) If the object is a non-array object @@ -6020,7 +6089,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: //generate NULL test generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpi, node, cndReg, dstBaseAddrReg, 0); generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, noDataAddr, cndReg); - } + } //Array Check TR::Register *dstClassInfoReg = temp1Reg; @@ -6033,7 +6102,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: else generateTrg1MemInstruction(cg,TR::InstOpCode::Op_load, node, dstClassInfoReg, TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, static_cast(TR::Compiler->om.offsetOfObjectVftField()), TR::Compiler->om.sizeofReferenceAddress())); - + TR::TreeEvaluator::generateVFTMaskInstruction(cg, node, dstClassInfoReg); TR::MemoryReference *dstClassMR = TR::MemoryReference::createWithDisplacement(cg, dstClassInfoReg, offsetof(J9Class, classDepthAndFlags), TR::Compiler->om.sizeofReferenceAddress()); @@ -6049,7 +6118,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: //load dataAddr if object is array: TR::MemoryReference *dataAddrSlotMR = TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, comp->fej9()->getOffsetOfContiguousDataAddrField(), TR::Compiler->om.sizeofReferenceAddress()); generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, dstBaseAddrReg, dataAddrSlotMR); - + //arrayCHK will skip to here if object is not an array generateLabelInstruction(cg, TR::InstOpCode::label, node, noDataAddr); @@ -6057,20 +6126,27 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: dstAddrReg = dstBaseAddrReg; if (useOffsetAsImmVal) - { + { int offsetImmVal = dstOffsetNode->getConstValue(); generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstBaseAddrReg, offsetImmVal); - } + } else 
generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstBaseAddrReg, dstOffsetReg); - } + } #endif /* J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION */ // assemble the double word value from byte value - generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwimi, node, valueReg, valueReg, 8, 0xff00); - generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwimi, node, valueReg, valueReg, 16, 0xffff0000); - generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 32, 0xffffffff00000000); + if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8)) + { + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::vspltb, valueNode, valueReg, valueReg, 7); + } + else + { + generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 8, CONSTANT64(0x000000000000FF00)); + generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 16, CONSTANT64(0x00000000FFFF0000)); + generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 32, CONSTANT64(0xFFFFFFFF00000000)); + } generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndReg, lengthReg, 32); generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, residualLabel, cndReg); @@ -6078,42 +6154,94 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: generateTrg1Src1ImmInstruction(cg, lengthNode->getType().isInt32() ? 
TR::InstOpCode::srawi : TR::InstOpCode::sradi, node, temp1Reg, lengthReg, 5); generateSrc1Instruction(cg, TR::InstOpCode::mtctr, node, temp1Reg); generateLabelInstruction(cg, TR::InstOpCode::label, node, loopStartLabel); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 16, 8), valueReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 24, 8), valueReg); - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 32); + + //store designated value to memory in chunks of 32 bytes + if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8)) + { + //on P8 and higher, we can use vector instructions to cut down on loop iterations/number of stores + generateMemSrc1Instruction(cg, TR::InstOpCode::stxvd2x, node, TR::MemoryReference::createWithIndexReg(cg, NULL, dstAddrReg, 16), valueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16); + generateMemSrc1Instruction(cg, TR::InstOpCode::stxvd2x, node, TR::MemoryReference::createWithIndexReg(cg, NULL, dstAddrReg, 16), valueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16); + } + else + { + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 16, 8), 
valueReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 24, 8), valueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 32); + } + + //decrement counter and return to start of loop generateConditionalBranchInstruction(cg, TR::InstOpCode::bdnz, node, loopStartLabel, cndReg); - generateLabelInstruction(cg, TR::InstOpCode::label, node, residualLabel); //check 16 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 16); - generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label8aligned, cndReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueReg); - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16); - - generateLabelInstruction(cg, TR::InstOpCode::label, node, label8aligned); //check 8 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 8); - generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label4aligned, cndReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 8); - - generateLabelInstruction(cg, TR::InstOpCode::label, node, label4aligned); //check 4 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 4); - generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label2aligned, cndReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 4), valueReg); - 
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 4); - - generateLabelInstruction(cg, TR::InstOpCode::label, node, label2aligned); //check 2 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 2); - generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label1aligned, cndReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::sth, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 2), valueReg); - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 2); - - generateLabelInstruction(cg, TR::InstOpCode::label, node, label1aligned); //check 1 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 1); - generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, doneLabel, cndReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::stb, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 1), valueReg); + //loop exit + generateLabelInstruction(cg, TR::InstOpCode::label, node, residualLabel); + + //Set residual bytes (max number of residual bytes = 31 = 0x1F) + if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P10)) //on P10, we can use stxvl to store all residual bytes efficiently + { + //First 16 byte segment + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 16); //get first hex char (can only be 0 or 1) + generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, node, temp2Reg, temp1Reg); //keep a copy of first hex char + + //store to memory + //NOTE: due to a quirk of the stxvl instruction on P10, the number of residual bytes must be shifted over before it can be used + generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldicr, node, temp1Reg, temp1Reg, 56, CONSTANT64(0xFF00000000000000)); + generateSrc3Instruction(cg, TR::InstOpCode::stxvl, node, valueReg, dstAddrReg, temp1Reg); + + //advance to next 16 byte chunk IF 
number of residual bytes >= 16 + generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstAddrReg, temp2Reg); + + //Second 16 byte segment + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 15); //get second hex char + generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldicr, node, temp1Reg, temp1Reg, 56, CONSTANT64(0xFF00000000000000)); //shift num residual bytes + generateSrc3Instruction(cg, TR::InstOpCode::stxvl, node, valueReg, dstAddrReg, temp1Reg); //store to memory + } + else + { + TR::Register *valueResidueReg; + + if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8)) + { + //since P8 and P9 used the vector approach, we first need to copy valueReg back into a GPR + generateTrg1Src1Instruction(cg, TR::InstOpCode::mfvsrd, node, temp2Reg, valueReg); + valueResidueReg = temp2Reg; + } + else + valueResidueReg = valueReg; + + //check if residual < 16 + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 16); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label8aligned, cndReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueResidueReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueResidueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, label8aligned); //check if residual < 8 + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 8); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label4aligned, cndReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueResidueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, 
dstAddrReg, dstAddrReg, 8); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, label4aligned); //check if residual < 4 + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 4); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label2aligned, cndReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 4), valueResidueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 4); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, label2aligned); //check if residual < 2 + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 2); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label1aligned, cndReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::sth, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 2), valueResidueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 2); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, label1aligned); //residual <= 1 + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 1); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, doneLabel, cndReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::stb, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 1), valueResidueReg); + } generateDepLabelInstruction(cg, TR::InstOpCode::label, node, doneLabel, conditions); From 8b1032d65baca7a5640101979d9c07bf77126f9f Mon Sep 17 00:00:00 2001 From: Gita Koblents Date: Sun, 21 Jul 2024 18:10:01 -0400 Subject: [PATCH 07/11] Move snippets to warm code - during code cache disclaim, it's beneficial to move snippets into warm code cache - controlled by -Xjit:moveSnippetsToWarmCode option --- compiler/codegen/OMRCodeGenPhase.cpp | 15 ++- compiler/codegen/OMRCodeGenerator.cpp | 134 
++++++++++++++---------- compiler/codegen/OMRCodeGenerator.hpp | 5 + compiler/control/OMROptions.cpp | 2 + compiler/control/OMROptions.hpp | 2 +- compiler/x/codegen/OMRCodeGenerator.cpp | 18 +++- 6 files changed, 120 insertions(+), 56 deletions(-) diff --git a/compiler/codegen/OMRCodeGenPhase.cpp b/compiler/codegen/OMRCodeGenPhase.cpp index 8bf2997ba2d..fefac29de17 100644 --- a/compiler/codegen/OMRCodeGenPhase.cpp +++ b/compiler/codegen/OMRCodeGenPhase.cpp @@ -251,7 +251,20 @@ OMR::CodeGenPhase::performEmitSnippetsPhase(TR::CodeGenerator * cg, TR::CodeGenP TR::LexicalMemProfiler mp("Emit Snippets", comp->phaseMemProfiler()); LexicalTimer pt("Emit Snippets", comp->phaseTimer()); - cg->emitSnippets(); + if (cg->getLastWarmInstruction() && + comp->getOption(TR_MoveSnippetsToWarmCode)) + { + // Snippets will follow warm blocks + uint8_t * oldCursor = cg->getBinaryBufferCursor(); + cg->setBinaryBufferCursor(cg->getWarmCodeEnd()); + cg->emitSnippets(); + cg->setWarmCodeEnd(cg->getBinaryBufferCursor()); + cg->setBinaryBufferCursor(oldCursor); + } + else + { + cg->emitSnippets(); + } if (comp->getOption(TR_EnableOSR)) { diff --git a/compiler/codegen/OMRCodeGenerator.cpp b/compiler/codegen/OMRCodeGenerator.cpp index e4fc9bf385a..b4d263f0af3 100644 --- a/compiler/codegen/OMRCodeGenerator.cpp +++ b/compiler/codegen/OMRCodeGenerator.cpp @@ -363,6 +363,78 @@ OMR::CodeGenerator::generateCodeFromIL() return false; } +void +OMR::CodeGenerator::insertGotoIntoLastBlock(TR::Block *lastBlock) + { + // If the last tree in the last block is not a TR_goto, insert a goto tree + // at the end of the block. + // If there is a following block the goto will branch to it so that when the + // code is split any fall-through will go to the right place. 
+ // If there is no following block the goto will branch to the first block; in + // this case the goto should never be reached, it is there only to + // make sure that the instruction following the last real treetop will be in + // method's code, so if it is a helper call (e.g. for a throw) the return address + // is in this method's code. + // + TR::Compilation *comp = self()->comp(); + TR::TreeTop * tt; + TR::Node * node; + + if (lastBlock->getNumberOfRealTreeTops() == 0) + tt = lastBlock->getEntry(); + else + tt = lastBlock->getLastRealTreeTop(); + + node = tt->getNode(); + + if (!(node->getOpCode().isGoto() || + node->getOpCode().isJumpWithMultipleTargets() || + node->getOpCode().isReturn())) + { + + if (comp->getOption(TR_TraceCG)) + { + traceMsg(comp, "%s Inserting goto at the end of block_%d\n", SPLIT_WARM_COLD_STRING, lastBlock->getNumber()); + } + + // Find the block to be branched to + // + TR::TreeTop * targetTreeTop = lastBlock->getExit()->getNextTreeTop(); + + if (targetTreeTop) + // Branch to following block. Make sure it is not marked as an + // extension block so that it will get a label generated. + // + targetTreeTop->getNode()->getBlock()->setIsExtensionOfPreviousBlock(false); + else + // Branch to the first block. This will not be marked as an extension + // block. + // + targetTreeTop = comp->getStartBlock()->getEntry(); + + // Generate the goto and insert it into the end of the last warm block. 
+ // + TR::TreeTop *gotoTreeTop = TR::TreeTop::create(comp, TR::Node::create(node, TR::Goto, 0, targetTreeTop)); + + // Move reg deps from BBEnd to goto + // + TR::Node *bbEnd = lastBlock->getExit()->getNode(); + + if (bbEnd->getNumChildren() > 0) + { + TR::Node *glRegDeps = bbEnd->getChild(0); + + gotoTreeTop->getNode()->setNumChildren(1); + gotoTreeTop->getNode()->setChild(0, glRegDeps); + + bbEnd->setChild(0,NULL); + bbEnd->setNumChildren(0); + } + + tt->insertAfter(gotoTreeTop); + } + } + void OMR::CodeGenerator::findLastWarmBlock() { TR::Compilation *comp = self()->comp(); @@ -457,62 +529,18 @@ void OMR::CodeGenerator::findLastWarmBlock() (numColdBlocks - numNonOutlinedColdBlocks)*100/numColdBlocks); } - // If the last tree in the last warm block is not a TR_goto, insert a goto tree - // at the end of the block. - // If there is a following block the goto will branch to it so that when the - // code is split any fall-through will go to the right place. - // If there is no following block the goto will branch to the first block; in - // this case the goto should never be reached, it is there only to - // make sure that the instruction following the last real treetop will be in - // warm code, so if it is a helper call (e.g. for a throw) the return address - // is in this method's code. - // - if (lastWarmBlock->getNumberOfRealTreeTops() == 0) - tt = lastWarmBlock->getEntry(); - else - tt = lastWarmBlock->getLastRealTreeTop(); - node = tt->getNode(); + insertGotoIntoLastBlock(lastWarmBlock); + TR::Block *lastBlock = comp->findLastTree()->getNode()->getBlock(); - if (!(node->getOpCode().isGoto() || - node->getOpCode().isJumpWithMultipleTargets() || - node->getOpCode().isReturn())) + // If disclaim is enabled, it may happen that nothing follows mainline code + // (no snippets or OOL). 
Then, we need to insert a goto at the end for the + // reasons described in insertGotoIntoLastBlock() + // + if (TR::Options::getCmdLineOptions()->getOption(TR_EnableCodeCacheDisclaiming) && + lastBlock != lastWarmBlock) { - // Find the block to be branched to - // - TR::TreeTop * targetTreeTop = lastWarmBlock->getExit()->getNextTreeTop(); - - if (targetTreeTop) - // Branch to following block. Make sure it is not marked as an - // extension block so that it will get a label generated. - // - targetTreeTop->getNode()->getBlock()->setIsExtensionOfPreviousBlock(false); - else - // Branch to the first block. This will not be marked as an extension - // block. - // - targetTreeTop = comp->getStartBlock()->getEntry(); - - // Generate the goto and insert it into the end of the last warm block. - // - TR::TreeTop *gotoTreeTop = TR::TreeTop::create(comp, TR::Node::create(node, TR::Goto, 0, targetTreeTop)); - - // Move reg deps from BBEnd to goto - // - TR::Node *bbEnd = lastWarmBlock->getExit()->getNode(); - - if (bbEnd->getNumChildren() > 0) - { - TR::Node *glRegDeps = bbEnd->getChild(0); - - gotoTreeTop->getNode()->setNumChildren(1); - gotoTreeTop->getNode()->setChild(0, glRegDeps); - - bbEnd->setChild(0,NULL); - bbEnd->setNumChildren(0); - } - - tt->insertAfter(gotoTreeTop); + insertGotoIntoLastBlock(lastBlock); } } diff --git a/compiler/codegen/OMRCodeGenerator.hpp b/compiler/codegen/OMRCodeGenerator.hpp index bbf9a27ff34..4e429f91c57 100644 --- a/compiler/codegen/OMRCodeGenerator.hpp +++ b/compiler/codegen/OMRCodeGenerator.hpp @@ -344,6 +344,11 @@ class OMR_EXTENSIBLE CodeGenerator void lowerTreesPropagateBlockToNode(TR::Node *node); + /** + * @brief Inserts goto into the last block if necessary + */ + void insertGotoIntoLastBlock(TR::Block *lastBlock); + void findLastWarmBlock(); void setUpForInstructionSelection(); diff --git a/compiler/control/OMROptions.cpp b/compiler/control/OMROptions.cpp index d5d4b719ee6..0d421e2803d 100644 --- a/compiler/control/OMROptions.cpp 
+++ b/compiler/control/OMROptions.cpp @@ -1004,7 +1004,9 @@ TR::OptionTable OMR::Options::_jitOptions[] = { {"minSleepTimeMsForCompThrottling=", "M\tLower bound for sleep time during compilation throttling (ms)", TR::Options::setStaticNumeric, (intptr_t)&OMR::Options::_minSleepTimeMsForCompThrottling, 0, "F%d", NOT_IN_SUBSET }, {"moveOOLInstructionsToWarmCode", "M\tmove out-of-line instructions to after last warm instruction", SET_OPTION_BIT(TR_MoveOOLInstructionsToWarmCode), "F"}, + {"moveSnippetsToWarmCode", "M\tmove snippets to after last warm instruction", SET_OPTION_BIT(TR_MoveSnippetsToWarmCode), "F"}, {"noAotSecondRunDetection", "M\tdo not do second run detection for AOT", SET_OPTION_BIT(TR_NoAotSecondRunDetection), "F", NOT_IN_SUBSET }, + #ifdef DEBUG {"noExceptions", "C\tfail compilation for methods with exceptions", TR::Options::setDebug, (intptr_t)"noExceptions"}, diff --git a/compiler/control/OMROptions.hpp b/compiler/control/OMROptions.hpp index 8a99bf4398f..3872b1e6c9e 100644 --- a/compiler/control/OMROptions.hpp +++ b/compiler/control/OMROptions.hpp @@ -382,7 +382,7 @@ enum TR_CompilationOptions TR_DisableInliningUnrecognizedIntrinsics = 0x10000000 + 9, TR_EnableVectorAPIExpansion = 0x20000000 + 9, TR_MoveOOLInstructionsToWarmCode = 0x40000000 + 9, - // Available = 0x80000000 + 9, + TR_MoveSnippetsToWarmCode = 0x80000000 + 9, // Option word 10 // diff --git a/compiler/x/codegen/OMRCodeGenerator.cpp b/compiler/x/codegen/OMRCodeGenerator.cpp index d7a5499e029..f579df20b04 100644 --- a/compiler/x/codegen/OMRCodeGenerator.cpp +++ b/compiler/x/codegen/OMRCodeGenerator.cpp @@ -2059,6 +2059,8 @@ void OMR::X86::CodeGenerator::doBinaryEncoding() // bool skipOneReturn = false; int32_t estimatedPrologueStartOffset = estimate; + bool snippetsAfterWarm = self()->comp()->getOption(TR_MoveSnippetsToWarmCode); + while (estimateCursor) { // Update the info bits on the register mask. 
@@ -2152,6 +2154,9 @@ void OMR::X86::CodeGenerator::doBinaryEncoding() // if (estimateCursor->isLastWarmInstruction()) { + if (snippetsAfterWarm) + estimate = setEstimatedLocationsForSnippetLabels(estimate); + warmEstimate = (estimate+7) & ~7; estimate = warmEstimate + MIN_DISTANCE_BETWEEN_WARM_AND_COLD_CODE; } @@ -2165,7 +2170,8 @@ void OMR::X86::CodeGenerator::doBinaryEncoding() if (self()->comp()->getOption(TR_TraceCG)) traceMsg(self()->comp(), "\n\n"); - estimate = self()->setEstimatedLocationsForSnippetLabels(estimate); + if (!snippetsAfterWarm || !warmEstimate) + estimate = self()->setEstimatedLocationsForSnippetLabels(estimate); // When using copyBinaryToBuffer() to copy the encoding of an instruction we // indiscriminatelly copy a whole integer, even if the size of the encoding @@ -2240,6 +2246,8 @@ void OMR::X86::CodeGenerator::doBinaryEncoding() // Generate binary for the rest of the instructions // + int32_t accumulatedErrorBeforeSnippets = 0; + while (cursorInstruction) { uint8_t * const instructionStart = self()->getBinaryBufferCursor(); @@ -2279,6 +2287,8 @@ void OMR::X86::CodeGenerator::doBinaryEncoding() self()->getWarmCodeEnd(), cursorInstruction, coldCode); } + accumulatedErrorBeforeSnippets = getAccumulatedInstructionLengthError(); + // Adjust the accumulated length error so that distances within the cold // code are calculated properly using the estimated code locations. 
// @@ -2328,6 +2338,12 @@ void OMR::X86::CodeGenerator::doBinaryEncoding() traceMsg(self()->comp(), "\n"); } + if (self()->comp()->getOption(TR_SplitWarmAndColdBlocks)) + { + if (snippetsAfterWarm) // snippets will follow the warm code + setAccumulatedInstructionLengthError(accumulatedErrorBeforeSnippets); + } + } // different from evaluate in that it returns a clobberable register From b9cc9abb3d35b249c281a2998e1d4cf1183a3cbc Mon Sep 17 00:00:00 2001 From: Gita Koblents Date: Fri, 23 Aug 2024 12:14:48 -0400 Subject: [PATCH 08/11] Rename findLastWarmBlock() --- compiler/codegen/OMRCodeGenerator.cpp | 4 ++-- compiler/codegen/OMRCodeGenerator.hpp | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/compiler/codegen/OMRCodeGenerator.cpp b/compiler/codegen/OMRCodeGenerator.cpp index b4d263f0af3..c63db4b4dc6 100644 --- a/compiler/codegen/OMRCodeGenerator.cpp +++ b/compiler/codegen/OMRCodeGenerator.cpp @@ -435,7 +435,7 @@ OMR::CodeGenerator::insertGotoIntoLastBlock(TR::Block *lastBlock) } } -void OMR::CodeGenerator::findLastWarmBlock() +void OMR::CodeGenerator::prepareLastWarmBlockForCodeSplitting() { TR::Compilation *comp = self()->comp(); TR::TreeTop * tt; @@ -598,7 +598,7 @@ void OMR::CodeGenerator::postLowerTrees() if (comp()->getOption(TR_SplitWarmAndColdBlocks) && !comp()->compileRelocatableCode()) { - self()->findLastWarmBlock(); + self()->prepareLastWarmBlockForCodeSplitting(); } } diff --git a/compiler/codegen/OMRCodeGenerator.hpp b/compiler/codegen/OMRCodeGenerator.hpp index 4e429f91c57..755538f3de5 100644 --- a/compiler/codegen/OMRCodeGenerator.hpp +++ b/compiler/codegen/OMRCodeGenerator.hpp @@ -349,7 +349,11 @@ class OMR_EXTENSIBLE CodeGenerator */ void insertGotoIntoLastBlock(TR::Block *lastBlock); - void findLastWarmBlock(); + /** + * @brief Finds last warm block and inserts necessary gotos + * for splitting code into warm and cold + */ + void prepareLastWarmBlockForCodeSplitting(); void setUpForInstructionSelection(); void 
doInstructionSelection(); From 20448ec8432618e14dad7c31b18016f1d5a2fe51 Mon Sep 17 00:00:00 2001 From: KONNO Kazuhiro Date: Mon, 26 Aug 2024 11:19:19 +0900 Subject: [PATCH 09/11] Add vector trn instructions This commit adds vector trn1/trn2 instructions for AArch64 and their binary encoding unit tests. Signed-off-by: KONNO Kazuhiro --- compiler/aarch64/codegen/ARM64Debug.cpp | 4 +++ compiler/aarch64/codegen/OMRInstOpCode.enum | 4 +++ compiler/aarch64/codegen/OpBinary.cpp | 4 +++ .../aarch64/BinaryEncoder.cpp | 30 +++++++++++++++++++ 4 files changed, 42 insertions(+) diff --git a/compiler/aarch64/codegen/ARM64Debug.cpp b/compiler/aarch64/codegen/ARM64Debug.cpp index 528ef6130ff..2eb7d8d4756 100644 --- a/compiler/aarch64/codegen/ARM64Debug.cpp +++ b/compiler/aarch64/codegen/ARM64Debug.cpp @@ -921,6 +921,10 @@ static const char *opCodeToNameMap[] = "vuzp2_8h", "vuzp2_4s", "vuzp2_2d", + "vtrn1_8b", + "vtrn1_16b", + "vtrn2_8b", + "vtrn2_16b", "vext16b", "vneg16b", "vneg8h", diff --git a/compiler/aarch64/codegen/OMRInstOpCode.enum b/compiler/aarch64/codegen/OMRInstOpCode.enum index 310a27f746f..d180f4fb44f 100644 --- a/compiler/aarch64/codegen/OMRInstOpCode.enum +++ b/compiler/aarch64/codegen/OMRInstOpCode.enum @@ -906,6 +906,10 @@ vuzp2_8h, /* 0x4E405800 UZP2 */ vuzp2_4s, /* 0x4E805800 UZP2 */ vuzp2_2d, /* 0x4EC05800 UZP2 */ + vtrn1_8b, /* 0x0E002800 TRN1 */ + vtrn1_16b, /* 0x4E002800 TRN1 */ + vtrn2_8b, /* 0x0E006800 TRN2 */ + vtrn2_16b, /* 0x4E006800 TRN2 */ /* Vector extract */ vext16b, /* 0x6E000000 EXT */ /* Vector Data-processing (1 source) */ diff --git a/compiler/aarch64/codegen/OpBinary.cpp b/compiler/aarch64/codegen/OpBinary.cpp index 7580f981155..f21849b604b 100644 --- a/compiler/aarch64/codegen/OpBinary.cpp +++ b/compiler/aarch64/codegen/OpBinary.cpp @@ -907,6 +907,10 @@ const OMR::ARM64::InstOpCode::OpCodeBinaryEntry OMR::ARM64::InstOpCode::binaryEn 0x4E405800, /* UZP2 vuzp2_8h */ 0x4E805800, /* UZP2 vuzp2_4s */ 0x4EC05800, /* UZP2 vuzp2_2d */ + 0x0E002800, 
/* TRN1 vtrn1_8b */ + 0x4E002800, /* TRN1 vtrn1_16b */ + 0x0E006800, /* TRN2 vtrn2_8b */ + 0x4E006800, /* TRN2 vtrn2_16b */ /* Vector extract */ 0x6E000000, /* EXT vext16b */ /* Vector Data-processing (1 source) */ diff --git a/fvtest/compilerunittest/aarch64/BinaryEncoder.cpp b/fvtest/compilerunittest/aarch64/BinaryEncoder.cpp index 50d1f88b89e..9eb62998f8a 100644 --- a/fvtest/compilerunittest/aarch64/BinaryEncoder.cpp +++ b/fvtest/compilerunittest/aarch64/BinaryEncoder.cpp @@ -1636,6 +1636,36 @@ INSTANTIATE_TEST_CASE_P(VectorUnzip2, ARM64Trg1Src2EncodingTest, ::testing::Valu std::make_tuple(TR::InstOpCode::vuzp2_2d, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v31, "4edf5800") )); +INSTANTIATE_TEST_CASE_P(VectorTrn1, ARM64Trg1Src2EncodingTest, ::testing::Values( + std::make_tuple(TR::InstOpCode::vtrn1_8b, TR::RealRegister::v15, TR::RealRegister::v0, TR::RealRegister::v0, "0e00280f"), + std::make_tuple(TR::InstOpCode::vtrn1_8b, TR::RealRegister::v31, TR::RealRegister::v0, TR::RealRegister::v0, "0e00281f"), + std::make_tuple(TR::InstOpCode::vtrn1_8b, TR::RealRegister::v0, TR::RealRegister::v15, TR::RealRegister::v0, "0e0029e0"), + std::make_tuple(TR::InstOpCode::vtrn1_8b, TR::RealRegister::v0, TR::RealRegister::v31, TR::RealRegister::v0, "0e002be0"), + std::make_tuple(TR::InstOpCode::vtrn1_8b, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v15, "0e0f2800"), + std::make_tuple(TR::InstOpCode::vtrn1_8b, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v31, "0e1f2800"), + std::make_tuple(TR::InstOpCode::vtrn1_16b, TR::RealRegister::v15, TR::RealRegister::v0, TR::RealRegister::v0, "4e00280f"), + std::make_tuple(TR::InstOpCode::vtrn1_16b, TR::RealRegister::v31, TR::RealRegister::v0, TR::RealRegister::v0, "4e00281f"), + std::make_tuple(TR::InstOpCode::vtrn1_16b, TR::RealRegister::v0, TR::RealRegister::v15, TR::RealRegister::v0, "4e0029e0"), + std::make_tuple(TR::InstOpCode::vtrn1_16b, TR::RealRegister::v0, 
TR::RealRegister::v31, TR::RealRegister::v0, "4e002be0"), + std::make_tuple(TR::InstOpCode::vtrn1_16b, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v15, "4e0f2800"), + std::make_tuple(TR::InstOpCode::vtrn1_16b, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v31, "4e1f2800") +)); + +INSTANTIATE_TEST_CASE_P(VectorTrn2, ARM64Trg1Src2EncodingTest, ::testing::Values( + std::make_tuple(TR::InstOpCode::vtrn2_8b, TR::RealRegister::v15, TR::RealRegister::v0, TR::RealRegister::v0, "0e00680f"), + std::make_tuple(TR::InstOpCode::vtrn2_8b, TR::RealRegister::v31, TR::RealRegister::v0, TR::RealRegister::v0, "0e00681f"), + std::make_tuple(TR::InstOpCode::vtrn2_8b, TR::RealRegister::v0, TR::RealRegister::v15, TR::RealRegister::v0, "0e0069e0"), + std::make_tuple(TR::InstOpCode::vtrn2_8b, TR::RealRegister::v0, TR::RealRegister::v31, TR::RealRegister::v0, "0e006be0"), + std::make_tuple(TR::InstOpCode::vtrn2_8b, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v15, "0e0f6800"), + std::make_tuple(TR::InstOpCode::vtrn2_8b, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v31, "0e1f6800"), + std::make_tuple(TR::InstOpCode::vtrn2_16b, TR::RealRegister::v15, TR::RealRegister::v0, TR::RealRegister::v0, "4e00680f"), + std::make_tuple(TR::InstOpCode::vtrn2_16b, TR::RealRegister::v31, TR::RealRegister::v0, TR::RealRegister::v0, "4e00681f"), + std::make_tuple(TR::InstOpCode::vtrn2_16b, TR::RealRegister::v0, TR::RealRegister::v15, TR::RealRegister::v0, "4e0069e0"), + std::make_tuple(TR::InstOpCode::vtrn2_16b, TR::RealRegister::v0, TR::RealRegister::v31, TR::RealRegister::v0, "4e006be0"), + std::make_tuple(TR::InstOpCode::vtrn2_16b, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v15, "4e0f6800"), + std::make_tuple(TR::InstOpCode::vtrn2_16b, TR::RealRegister::v0, TR::RealRegister::v0, TR::RealRegister::v31, "4e1f6800") +)); + INSTANTIATE_TEST_CASE_P(VectorUMLAL, ARM64Trg1Src2EncodingTest, ::testing::Values( 
std::make_tuple(TR::InstOpCode::vumlal_8h, TR::RealRegister::v15, TR::RealRegister::v0, TR::RealRegister::v0, "2e20800f"), std::make_tuple(TR::InstOpCode::vumlal_8h, TR::RealRegister::v31, TR::RealRegister::v0, TR::RealRegister::v0, "2e20801f"), From 9b6c763e27d8619a375f76210015905747bdd0b4 Mon Sep 17 00:00:00 2001 From: Gita Koblents Date: Fri, 23 Aug 2024 17:53:36 -0400 Subject: [PATCH 10/11] Add -Xjit:EnableCodeCacheDisclaimingSupport option - add option that enables all experimental options that can help code cache disclaiming --- compiler/control/OMROptions.cpp | 10 ++++++++++ compiler/control/OMROptions.hpp | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/compiler/control/OMROptions.cpp b/compiler/control/OMROptions.cpp index 0d421e2803d..f44e0b48de1 100644 --- a/compiler/control/OMROptions.cpp +++ b/compiler/control/OMROptions.cpp @@ -689,6 +689,7 @@ TR::OptionTable OMR::Options::_jitOptions[] = { {"enableClassChainValidationCaching", "M\tenable class chain validation caching", SET_OPTION_BIT(TR_EnableClassChainValidationCaching), "F", NOT_IN_SUBSET}, {"enableCodeCacheConsolidation", "M\tenable code cache consolidation", SET_OPTION_BIT(TR_EnableCodeCacheConsolidation), "F", NOT_IN_SUBSET}, {"enableCodeCacheDisclaiming", "M\tenable memory disclaiming for code cache (linux specific).", SET_OPTION_BIT(TR_EnableCodeCacheDisclaiming),"F", NOT_IN_SUBSET}, + {"enableCodeCacheDisclaimingSupport", "M\tenable all experimental options that help code cache disclaiming.", SET_OPTION_BIT(TR_EnableCodeCacheDisclaimingSupport),"F", NOT_IN_SUBSET}, {"enableColdCheapTacticalGRA", "O\tenable cold cheap tactical GRA", SET_OPTION_BIT(TR_EnableColdCheapTacticalGRA), "F"}, {"enableCompilationBeforeCheckpoint", "C\tenable compilation before checkpoint", RESET_OPTION_BIT(TR_DisableCompilationBeforeCheckpoint), "F", NOT_IN_SUBSET}, {"enableCompilationSpreading", "C\tenable adding spreading invocations to methods before compiling", 
SET_OPTION_BIT(TR_EnableCompilationSpreading), "F", NOT_IN_SUBSET}, @@ -2456,6 +2457,15 @@ OMR::Options::jitLatePostProcess(TR::OptionSet *optionSet, void * jitConfig) self()->setOption(TR_ReservingLocks, false); } + if (self()->getOption(TR_EnableCodeCacheDisclaimingSupport)) + { + self()->setOption(TR_SplitWarmAndColdBlocks); + self()->setOption(TR_DisclaimMemoryOnSwap); + self()->setOption(TR_InstallAOTToColdCode); + self()->setOption(TR_MoveOOLInstructionsToWarmCode); + self()->setOption(TR_MoveSnippetsToWarmCode); + } + return true; } diff --git a/compiler/control/OMROptions.hpp b/compiler/control/OMROptions.hpp index 3872b1e6c9e..314a6ec0a5c 100644 --- a/compiler/control/OMROptions.hpp +++ b/compiler/control/OMROptions.hpp @@ -392,7 +392,7 @@ enum TR_CompilationOptions TR_FirstLevelProfiling = 0x00000100 + 10, TR_EnableCodeCacheDisclaiming = 0x00000200 + 10, // Available = 0x00000400 + 10, - // Available = 0x00000800 + 10, + TR_EnableCodeCacheDisclaimingSupport = 0x00000800 + 10, // Available = 0x00001000 + 10, TR_DisableNewMethodOverride = 0x00002000 + 10, // Available = 0x00004000 + 10, From 31b912e5210bea15c4f8ef3630a34d79ba0aca11 Mon Sep 17 00:00:00 2001 From: Devin Papineau Date: Tue, 20 Aug 2024 18:39:11 -0400 Subject: [PATCH 11/11] Remove unused cpIndex parameter of vTableSlot(), virtualCallSelector() ...in TR_ResolvedMethod. 
--- compiler/compile/OMRSymbolReferenceTable.cpp | 2 +- compiler/compile/ResolvedMethod.cpp | 4 ++-- compiler/compile/ResolvedMethod.hpp | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/compiler/compile/OMRSymbolReferenceTable.cpp b/compiler/compile/OMRSymbolReferenceTable.cpp index 7792db843b1..050aeefaf3e 100644 --- a/compiler/compile/OMRSymbolReferenceTable.cpp +++ b/compiler/compile/OMRSymbolReferenceTable.cpp @@ -1528,7 +1528,7 @@ OMR::SymbolReferenceTable::findOrCreateMethodSymbol( if (!resolvedMethod) symRef->setUnresolved(); else if (callKind == TR::MethodSymbol::Virtual && cpIndex != -1) - symRef->setOffset(resolvedMethod->virtualCallSelector(cpIndex)); + symRef->setOffset(resolvedMethod->virtualCallSelector()); aliasBuilder.methodSymRefs().set(symRef->getReferenceNumber()); diff --git a/compiler/compile/ResolvedMethod.cpp b/compiler/compile/ResolvedMethod.cpp index 13827f20094..3c7280e6aa4 100644 --- a/compiler/compile/ResolvedMethod.cpp +++ b/compiler/compile/ResolvedMethod.cpp @@ -391,11 +391,11 @@ char * TR_ResolvedMethod::fieldNameChars(int32_t, int32_t &) { TR_ char * TR_ResolvedMethod::fieldSignatureChars(int32_t, int32_t &) { TR_UNIMPLEMENTED(); return 0; } char * TR_ResolvedMethod::staticSignatureChars(int32_t, int32_t &) { TR_UNIMPLEMENTED(); return 0; } void * & TR_ResolvedMethod::addressOfClassOfMethod() { TR_UNIMPLEMENTED(); throw std::exception(); } -uint32_t TR_ResolvedMethod::vTableSlot(uint32_t) { TR_UNIMPLEMENTED(); return 0; } +uint32_t TR_ResolvedMethod::vTableSlot() { TR_UNIMPLEMENTED(); return 0; } bool TR_ResolvedMethod::virtualMethodIsOverridden() { TR_UNIMPLEMENTED(); return false; } void TR_ResolvedMethod::setVirtualMethodIsOverridden() { TR_UNIMPLEMENTED(); } void * TR_ResolvedMethod::addressContainingIsOverriddenBit() { TR_UNIMPLEMENTED(); return 0; } -int32_t TR_ResolvedMethod::virtualCallSelector(uint32_t) { TR_UNIMPLEMENTED(); return 0; } +int32_t TR_ResolvedMethod::virtualCallSelector() { 
TR_UNIMPLEMENTED(); return 0; } uint32_t TR_ResolvedMethod::numberOfExceptionHandlers() { TR_UNIMPLEMENTED(); return 0; } uint8_t * TR_ResolvedMethod::allocateException(uint32_t,TR::Compilation*){ TR_UNIMPLEMENTED(); return 0; } diff --git a/compiler/compile/ResolvedMethod.hpp b/compiler/compile/ResolvedMethod.hpp index c2d4cf6f9e6..4c96024aebb 100644 --- a/compiler/compile/ResolvedMethod.hpp +++ b/compiler/compile/ResolvedMethod.hpp @@ -218,7 +218,7 @@ class TR_ResolvedMethod virtual uint32_t classCPIndexOfMethod(uint32_t); virtual void * & addressOfClassOfMethod(); - virtual uint32_t vTableSlot(uint32_t); + virtual uint32_t vTableSlot(); virtual TR_OpaqueClassBlock *getResolvedInterfaceMethod(int32_t cpIndex, uintptr_t * pITableIndex); @@ -236,7 +236,7 @@ class TR_ResolvedMethod virtual bool virtualMethodIsOverridden(); virtual void setVirtualMethodIsOverridden(); virtual void *addressContainingIsOverriddenBit(); - virtual int32_t virtualCallSelector(uint32_t cpIndex); + virtual int32_t virtualCallSelector(); virtual int32_t exceptionData(int32_t exceptionNumber, int32_t * startIndex, int32_t * endIndex, int32_t * catchType); virtual uint32_t numberOfExceptionHandlers();