From f15f9f6ff2d4a694942e4b10b3732806d7ee1341 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 2 Oct 2015 11:55:15 -0700 Subject: [PATCH 001/904] Initial FLO commit. Summary: Directory created. (cherry picked from commit 6049153f609f0df923f5f333503da1609c173de5) --- bolt/CMakeLists.txt | 18 +++++++ bolt/LLVMBuild.txt | 22 +++++++++ bolt/llvm-flo.cpp | 112 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 152 insertions(+) create mode 100644 bolt/CMakeLists.txt create mode 100644 bolt/LLVMBuild.txt create mode 100644 bolt/llvm-flo.cpp diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt new file mode 100644 index 000000000000..a28744c0b239 --- /dev/null +++ b/bolt/CMakeLists.txt @@ -0,0 +1,18 @@ +set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + CodeGen + Core + DebugInfoDWARF + MC + MCDisassembler + MCParser + Object + Orcjit + Support + ) + +add_llvm_tool(llvm-flo + llvm-flo.cpp + BinaryCFG.cpp + BinaryOptimizer.cpp + ) diff --git a/bolt/LLVMBuild.txt b/bolt/LLVMBuild.txt new file mode 100644 index 000000000000..770196d110bd --- /dev/null +++ b/bolt/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./tools/llvm-flo/LLVMBuild.txt ---------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Tool +name = llvm-flo +parent = Tools +required_libraries = DebugInfoDWARF MC MCDisassembler MCParser Object all-targets diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp new file mode 100644 index 000000000000..290e5cf4bb59 --- /dev/null +++ b/bolt/llvm-flo.cpp @@ -0,0 +1,112 @@ +//===-- llvm-flo.cpp - Feedback-directed layout optimizer -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ExecutionEngine/Orc/LambdaResolver.h" +#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" +#include "llvm/ExecutionEngine/RTDyldMemoryManager.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/Signals.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/ToolOutputFile.h" +#include 
"llvm/Target/TargetMachine.h" + + +#include +#include +#include + +using namespace llvm; +using namespace object; + +// Tool options. +static cl::opt +InputFilename(cl::Positional, cl::desc(""), cl::Required); + +static cl::opt +InputDataFilename("data", cl::desc(""), cl::Optional); + +static cl::opt +OutputFilename("o", cl::desc(""), cl::Required); + +static cl::list +FunctionNames("funcs", cl::desc("list of functions to optimzize"), + cl::Optional); + + +static StringRef ToolName; + +static void report_error(StringRef Message, std::error_code EC) { + assert(EC); + errs() << ToolName << ": '" << Message << "': " << EC.message() << ".\n"; + exit(1); +} + +int main(int argc, char **argv) { + // Print a stack trace if we signal out. + sys::PrintStackTraceOnErrorSignal(); + PrettyStackTraceProgram X(argc, argv); + + llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. + + // Initialize targets and assembly printers/parsers. + llvm::InitializeAllTargetInfos(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllAsmParsers(); + llvm::InitializeAllDisassemblers(); + + llvm::InitializeAllTargets(); + llvm::InitializeAllAsmPrinters(); + + // Register the target printer for --version. + cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion); + + cl::ParseCommandLineOptions(argc, argv, + "llvm feedback-directed layout optimizer\n"); + + ToolName = argv[0]; + + if (!sys::fs::exists(InputFilename)) + report_error(InputFilename, errc::no_such_file_or_directory); + + // Attempt to open the binary. 
+ ErrorOr> BinaryOrErr = createBinary(InputFilename); + if (std::error_code EC = BinaryOrErr.getError()) + report_error(InputFilename, EC); + Binary &Binary = *BinaryOrErr.get().getBinary(); + + if (ELFObjectFileBase *e = dyn_cast(&Binary)) { + outs() << "mind blown : " << e << "!\n"; + } else { + report_error(InputFilename, object_error::invalid_file_type); + } + + return EXIT_SUCCESS; +} From 757a23b827f6ef10913e27d58eef1dccbd8b9906 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 2 Oct 2015 12:06:31 -0700 Subject: [PATCH 002/904] Removed remote .arcconfig + comment change. (cherry picked from commit e9bb781fe70f0d3d82084adbd25eea4f9d3842a1) --- bolt/llvm-flo.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 290e5cf4bb59..52628ebdf1d9 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -61,6 +61,7 @@ FunctionNames("funcs", cl::desc("list of functions to optimzize"), cl::Optional); +// Tool name used for reporting. static StringRef ToolName; static void report_error(StringRef Message, std::error_code EC) { From 08cb2d03528cedbc0898c32e9d951723250c9021 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 2 Oct 2015 12:38:07 -0700 Subject: [PATCH 003/904] Fixed cmake. (cherry picked from commit 1b732abb61220891447c067d3b15aff8a7f00e7d) --- bolt/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index a28744c0b239..ef2afbab01ae 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -13,6 +13,4 @@ set(LLVM_LINK_COMPONENTS add_llvm_tool(llvm-flo llvm-flo.cpp - BinaryCFG.cpp - BinaryOptimizer.cpp ) From 50bd53f5166ba605e18c06b899e2a3177df4ca31 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 9 Oct 2015 17:21:14 -0700 Subject: [PATCH 004/904] Commit FLO with control flow graph. Summary: llvm-flo disassembles, builds control flow graph, and re-writes simple functions. 
(cherry picked from commit 8f28b875df17fed59686714eaba585a7ee579cea) --- bolt/BinaryBasicBlock.cpp | 65 +++++ bolt/BinaryBasicBlock.h | 212 ++++++++++++++ bolt/BinaryContext.h | 114 ++++++++ bolt/BinaryFunction.cpp | 381 ++++++++++++++++++++++++ bolt/BinaryFunction.h | 399 +++++++++++++++++++++++++ bolt/CMakeLists.txt | 3 +- bolt/LLVMBuild.txt | 2 +- bolt/llvm-flo.cpp | 598 +++++++++++++++++++++++++++++++++++++- 8 files changed, 1768 insertions(+), 6 deletions(-) create mode 100644 bolt/BinaryBasicBlock.cpp create mode 100644 bolt/BinaryBasicBlock.h create mode 100644 bolt/BinaryContext.h create mode 100644 bolt/BinaryFunction.cpp create mode 100644 bolt/BinaryFunction.h diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp new file mode 100644 index 000000000000..975e8388978f --- /dev/null +++ b/bolt/BinaryBasicBlock.cpp @@ -0,0 +1,65 @@ +//===--- BinaryBasicBlock.cpp - Interface for assembly-level basic block --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstPrinter.h" +#include +#include + +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" + +#undef DEBUG_TYPE +#define DEBUG_TYPE "flo" + +namespace llvm { + +namespace flo { + +bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) { + return LHS.Offset < RHS.Offset; +} + +void BinaryBasicBlock::addSuccessor(BinaryBasicBlock *Succ, + uint64_t Count, + uint64_t MispredictedCount) { + Successors.push_back(Succ); + Succ->Predecessors.push_back(this); + + // TODO: update weights. 
+} + +void BinaryBasicBlock::removeSuccessor(BinaryBasicBlock *Succ) { + Succ->removePredecessor(this); + auto I = std::find(succ_begin(), succ_end(), Succ); + assert(I != succ_end() && "no such successor!"); + + Successors.erase(I); + + // TODO: update weights. +} + +void BinaryBasicBlock::addPredecessor(BinaryBasicBlock *Pred) { + Predecessors.push_back(Pred); +} + +void BinaryBasicBlock::removePredecessor(BinaryBasicBlock *Pred) { + auto I = std::find(pred_begin(), pred_end(), Pred); + assert(I != pred_end() && "Pred is not a predecessor of this block!"); + Predecessors.erase(I); +} + +} // namespace flo + +} // namespace llvm diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h new file mode 100644 index 000000000000..a7aeda445fdd --- /dev/null +++ b/bolt/BinaryBasicBlock.h @@ -0,0 +1,212 @@ +//===--- BinaryBasicBlock.h - Interface for assembly-level basic block ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TODO: memory management for instructions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_FLO_BINARY_BASIC_BLOCK_H +#define LLVM_TOOLS_LLVM_FLO_BINARY_BASIC_BLOCK_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/ilist.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include + +namespace llvm { + +namespace flo { + +class BinaryFunction; + +/// The intention is to keep the structure similar to MachineBasicBlock as +/// we might switch to it at some point. 
+class BinaryBasicBlock { + + /// Label associated with the block. + MCSymbol *Label{nullptr}; + + /// Original offset in the function. + uint64_t Offset{std::numeric_limits::max()}; + + /// Alignment requirements for the block. + uint64_t Alignment{1}; + + /// Vector of all instructions in the block. + std::vector Instructions; + + /// CFG information. + std::vector Predecessors; + std::vector Successors; + + struct BinaryBranchInfo { + uint64_t Count; + uint64_t MispredictedCount; /// number of branches mispredicted + }; + + /// Each successor has a corresponding BranchInfo entry in the list. + std::vector BranchInfo; + typedef std::vector::iterator branch_info_iterator; + typedef std::vector::const_iterator + const_branch_info_iterator; + + BinaryBasicBlock() {} + + explicit BinaryBasicBlock( + MCSymbol *Label, + uint64_t Offset = std::numeric_limits::max()) + : Label(Label), Offset(Offset) {} + + explicit BinaryBasicBlock(uint64_t Offset) + : Offset(Offset) {} + + // Exclusively managed by BinaryFunction. + friend class BinaryFunction; + friend bool operator<(const BinaryBasicBlock &LHS, + const BinaryBasicBlock &RHS); + +public: + + // Instructions iterators. 
+ typedef std::vector::iterator iterator; + typedef std::vector::const_iterator const_iterator; + typedef std::reverse_iterator const_reverse_iterator; + typedef std::reverse_iterator reverse_iterator; + + MCInst &front() { return Instructions.front(); } + MCInst &back() { return Instructions.back(); } + const MCInst &front() const { return Instructions.front(); } + const MCInst &back() const { return Instructions.back(); } + + iterator begin() { return Instructions.begin(); } + const_iterator begin() const { return Instructions.begin(); } + iterator end () { return Instructions.end(); } + const_iterator end () const { return Instructions.end(); } + reverse_iterator rbegin() { return Instructions.rbegin(); } + const_reverse_iterator rbegin() const { return Instructions.rbegin(); } + reverse_iterator rend () { return Instructions.rend(); } + const_reverse_iterator rend () const { return Instructions.rend(); } + + // CFG iterators. + typedef std::vector::iterator pred_iterator; + typedef std::vector::const_iterator const_pred_iterator; + typedef std::vector::iterator succ_iterator; + typedef std::vector::const_iterator const_succ_iterator; + typedef std::vector::reverse_iterator + pred_reverse_iterator; + typedef std::vector::const_reverse_iterator + const_pred_reverse_iterator; + typedef std::vector::reverse_iterator + succ_reverse_iterator; + typedef std::vector::const_reverse_iterator + const_succ_reverse_iterator; + pred_iterator pred_begin() { return Predecessors.begin(); } + const_pred_iterator pred_begin() const { return Predecessors.begin(); } + pred_iterator pred_end() { return Predecessors.end(); } + const_pred_iterator pred_end() const { return Predecessors.end(); } + pred_reverse_iterator pred_rbegin() + { return Predecessors.rbegin();} + const_pred_reverse_iterator pred_rbegin() const + { return Predecessors.rbegin();} + pred_reverse_iterator pred_rend() + { return Predecessors.rend(); } + const_pred_reverse_iterator pred_rend() const + { return 
Predecessors.rend(); } + unsigned pred_size() const { + return (unsigned)Predecessors.size(); + } + bool pred_empty() const { return Predecessors.empty(); } + + succ_iterator succ_begin() { return Successors.begin(); } + const_succ_iterator succ_begin() const { return Successors.begin(); } + succ_iterator succ_end() { return Successors.end(); } + const_succ_iterator succ_end() const { return Successors.end(); } + succ_reverse_iterator succ_rbegin() + { return Successors.rbegin(); } + const_succ_reverse_iterator succ_rbegin() const + { return Successors.rbegin(); } + succ_reverse_iterator succ_rend() + { return Successors.rend(); } + const_succ_reverse_iterator succ_rend() const + { return Successors.rend(); } + unsigned succ_size() const { + return (unsigned)Successors.size(); + } + bool succ_empty() const { return Successors.empty(); } + + inline iterator_range predecessors() { + return iterator_range(pred_begin(), pred_end()); + } + inline iterator_range predecessors() const { + return iterator_range(pred_begin(), pred_end()); + } + inline iterator_range successors() { + return iterator_range(succ_begin(), succ_end()); + } + inline iterator_range successors() const { + return iterator_range(succ_begin(), succ_end()); + } + + /// Return symbol marking the start of this basic block. + MCSymbol *getLabel() const { + return Label; + } + + /// Return local name for the block. + StringRef getName() const { + return Label->getName(); + } + + /// Add instruction at the end of this basic block. + void addInstruction(MCInst &Inst) { + Instructions.emplace_back(Inst); + } + + /// Return required alignment for the block. + uint64_t getAlignment() const { + return Alignment; + } + + /// Adds block to successor list, and also updates predecessor list for + /// successor block. + /// Set branch info for this path. + void addSuccessor(BinaryBasicBlock *Succ, + uint64_t Count = 0, + uint64_t MispredictedCount = 0); + + /// Remove /p Succ basic block from the list of successors. 
Update the + /// list of predecessors of /p Succ and update branch info. + void removeSuccessor(BinaryBasicBlock *Succ); + +private: + + /// Adds predecessor to the BB. Most likely you don't need to call this. + void addPredecessor(BinaryBasicBlock *Pred); + + /// Remove predecessor of the basic block. Don't use directly, instead + /// use removeSuccessor() funciton. + void removePredecessor(BinaryBasicBlock *Pred); +}; + +bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS); + + +} // namespace flo + +} // namespace llvm + +#endif diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h new file mode 100644 index 000000000000..fac66922caf0 --- /dev/null +++ b/bolt/BinaryContext.h @@ -0,0 +1,114 @@ +//===--- BinaryContext.h - Interface for machine-level context -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_FLO_BINARY_CONTEXT_H +#define LLVM_TOOLS_LLVM_FLO_BINARY_CONTEXT_H + +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/TargetRegistry.h" + +#include +#include +#include +#include + +namespace llvm { + +namespace flo { + +/// Everything that's needed to process binaries lives here. 
+class BinaryContext { + + BinaryContext() = delete; + +public: + + // [name] -> [address] + typedef std::map SymbolMapType; + SymbolMapType GlobalSymbols; + + // [address] -> [name1], [name2], ... + std::multimap GlobalAddresses; + + std::unique_ptr Ctx; + + std::unique_ptr TheTriple; + + const Target *TheTarget; + + MCCodeEmitter *MCE; + + std::unique_ptr MOFI; + + std::unique_ptr AsmInfo; + + std::unique_ptr MII; + + std::unique_ptr STI; + + std::unique_ptr InstPrinter; + + std::unique_ptr MIA; + + std::unique_ptr MRI; + + std::unique_ptr DisAsm; + + std::function ErrorCheck; + + MCAsmBackend *MAB; + + BinaryContext(std::unique_ptr Ctx, + std::unique_ptr TheTriple, + const Target *TheTarget, + MCCodeEmitter *MCE, + std::unique_ptr MOFI, + std::unique_ptr AsmInfo, + std::unique_ptr MII, + std::unique_ptr STI, + std::unique_ptr InstPrinter, + std::unique_ptr MIA, + std::unique_ptr MRI, + std::unique_ptr DisAsm, + MCAsmBackend *MAB) : + Ctx(std::move(Ctx)), + TheTriple(std::move(TheTriple)), + TheTarget(TheTarget), + MCE(MCE), + MOFI(std::move(MOFI)), + AsmInfo(std::move(AsmInfo)), + MII(std::move(MII)), + STI(std::move(STI)), + InstPrinter(std::move(InstPrinter)), + MIA(std::move(MIA)), + MRI(std::move(MRI)), + DisAsm(std::move(DisAsm)), + MAB(MAB) {} + + ~BinaryContext() {} +}; + +} // namespace flo + +} // namespace llvm + +#endif diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp new file mode 100644 index 000000000000..6569704da003 --- /dev/null +++ b/bolt/BinaryFunction.cpp @@ -0,0 +1,381 @@ +//===--- BinaryFunction.cpp - Interface for machine-level function --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + + +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" + +#undef DEBUG_TYPE +#define DEBUG_TYPE "flo" + +namespace llvm { + +namespace flo { + +void BinaryFunction::print(raw_ostream &OS, bool PrintInstructions) const { + StringRef SectionName; + Section.getName(SectionName); + OS << "Binary Function \"" << getName() << "\" {" + << "\n State : " << CurrentState + << "\n Address : 0x" << Twine::utohexstr(Address) + << "\n Size : 0x" << Twine::utohexstr(Size) + << "\n MaxSize : 0x" << Twine::utohexstr(MaxSize) + << "\n Offset : 0x" << Twine::utohexstr(FileOffset) + << "\n Section : " << SectionName + << "\n Orc Section : " << getCodeSectionName() + << "\n IsSimple : " << IsSimple + << "\n BB count : " << BasicBlocks.size() + << "\n Image : 0x" << Twine::utohexstr(ImageAddress) + << "\n}\n"; + + if (!PrintInstructions || !BC.InstPrinter) + return; + + // Offset of the instruction in function. + uint64_t Offset{0}; + + if (BasicBlocks.empty() && !Instructions.empty()) { + // Print before CFG was built. + for (const auto &II : Instructions) { + auto Offset = II.first; + + // Print label if exists at this offset. 
+ auto LI = Labels.find(Offset); + if (LI != Labels.end()) + OS << LI->second->getName() << ":\n"; + + auto &Instruction = II.second; + OS << format(" %08" PRIx64 ": ", Offset); + BC.InstPrinter->printInst(&Instruction, OS, "", *BC.STI); + OS << "\n"; + } + } + + for (const auto &BB : BasicBlocks) { + OS << BB.getName() << " (" + << BB.Instructions.size() << " instructions)\n"; + + if (!BB.Predecessors.empty()) { + OS << " Predecessors: "; + auto Sep = ""; + for (auto Pred : BB.Predecessors) { + OS << Sep << Pred->getName(); + Sep = ", "; + } + OS << '\n'; + } + + Offset = RoundUpToAlignment(Offset, BB.getAlignment()); + + for (auto &Instr : BB) { + OS << format(" %08" PRIx64 ": ", Offset); + BC.InstPrinter->printInst(&Instr, OS, "", *BC.STI); + OS << "\n"; + + // In case we need MCInst printer: + // Instr.dump_pretty(OS, InstructionPrinter.get()); + + // Calculate the size of the instruction. + // Note: this is imprecise since happening prior to relaxation. + SmallString<256> Code; + SmallVector Fixups; + raw_svector_ostream VecOS(Code); + BC.MCE->encodeInstruction(Instr, VecOS, Fixups, *BC.STI); + Offset += Code.size(); + } + + if (!BB.Successors.empty()) { + OS << " Successors: "; + auto Sep = ""; + for (auto Succ : BB.Successors) { + OS << Sep << Succ->getName(); + Sep = ", "; + } + OS << '\n'; + } + + OS << '\n'; + } + + OS << "End of Function \"" << getName() << "\"\n"; +} + +bool BinaryFunction::disassemble(ArrayRef FunctionData) { + assert(FunctionData.size() == getSize() && + "function size does not match raw data size"); + + auto &Ctx = BC.Ctx; + auto &MIA = BC.MIA; + + // Insert a label at the beginning of the function. This will be our first + // basic block. 
+ Labels[0] = Ctx->createTempSymbol("BB0", false); + + bool IsSimple = true; + for (uint64_t Offset = 0; IsSimple && (Offset < getSize()); ) { + MCInst Instruction; + uint64_t Size; + if (!BC.DisAsm->getInstruction(Instruction, + Size, + FunctionData.slice(Offset), + getAddress() + Offset, + nulls(), + nulls())) { + // Ignore this function. Skip to the next one. + IsSimple = false; + break; + } + + if (MIA->isIndirectBranch(Instruction)) { + IsSimple = false; + break; + } + + if (MIA->isBranch(Instruction) || MIA->isCall(Instruction)) { + uint64_t InstructionTarget = 0; + uint64_t AbsoluteInstrAddr = getAddress() + Offset; + if (MIA->evaluateBranch(Instruction, + AbsoluteInstrAddr, + Size, + InstructionTarget)) { + // Check if the target is within the same function. Otherwise it's + // a call, possibly a tail call. + // + // If the target *is* the function address it could be either a branch + // or a recursive call. + bool IsCall = MIA->isCall(Instruction); + MCSymbol *TargetSymbol{nullptr}; + uint64_t TargetOffset{0}; + + if (IsCall && containsAddress(InstructionTarget)) { + if (InstructionTarget == getAddress()) { + // Recursive call. + TargetSymbol = Ctx->getOrCreateSymbol(getName()); + } else { + // Possibly an old-style PIC code + DEBUG(dbgs() << "FLO: internal call detected at 0x" + << Twine::utohexstr(AbsoluteInstrAddr) + << " in function " << getName() << "\n"); + IsSimple = false; + break; + } + } + + if (!TargetSymbol) { + // Create either local label or external symbol. + if (containsAddress(InstructionTarget)) { + // Check if there's already a registered label. + TargetOffset = InstructionTarget - getAddress(); + auto LI = Labels.find(TargetOffset); + if (LI == Labels.end()) { + TargetSymbol = Ctx->createTempSymbol(); + Labels[TargetOffset] = TargetSymbol; + } else { + TargetSymbol = LI->second; + } + } else { + // This is a call regardless of the opcode (e.g. tail call). + IsCall = true; + // Check if we already have a symbol at this address. 
+ std::string Name; + auto NI = BC.GlobalAddresses.find(InstructionTarget); + if (NI != BC.GlobalAddresses.end()) { + // Any registered name will do. + Name = NI->second; + } else { + // Create a new symbol at the destination. + Name = (Twine("FUNCat0x") + + Twine::utohexstr(InstructionTarget)).str(); + BC.GlobalAddresses.emplace(std::make_pair(InstructionTarget, + Name)); + } + TargetSymbol = Ctx->getOrCreateSymbol(Name); + BC.GlobalSymbols[Name] = InstructionTarget; + } + } + + Instruction.clear(); + Instruction.addOperand( + MCOperand::createExpr( + MCSymbolRefExpr::create(TargetSymbol, + MCSymbolRefExpr::VK_None, + *Ctx))); + if (!IsCall) { + // Add local branch info. + LocalBranches.push_back({Offset, TargetOffset}); + } + + } else { + // Indirect call + IsSimple = false; + break; + } + } else { + if (MIA->hasRIPOperand(Instruction)) { + DEBUG(dbgs() << "FLO: rip-relative instruction found " + "(not supported yet)\n"); + IsSimple = false; + break; + } + } + + addInstruction(Offset, std::move(Instruction)); + + Offset += Size; + } + + setSimple(IsSimple); + + // TODO: clear memory if not simple function? + + // Update state. + updateState(State::Disassembled); + + // Print the function in the new state. + DEBUG(print(dbgs(), /* PrintInstructions = */ true)); + + return true; +} + +bool BinaryFunction::buildCFG() { + + auto &MIA = BC.MIA; + + if (!isSimple()) + return false; + + if (!(CurrentState == State::Disassembled)) + return false; + + assert(BasicBlocks.empty() && "basic block list should be empty"); + assert((Labels.find(0) != Labels.end()) && + "first instruction should always have a label"); + + // Create basic blocks in the original layout order: + // + // * Every instruction with associated label marks + // the beginning of a basic block. + // * Conditional instruction marks the end of a basic block, + // except when the following instruction is an + // unconditional branch, and the unconditional branch is not + // a destination of another branch. 
In the latter case, the + // basic block will consist of a single unconditional branch + // (missed optimization opportunity?). + // + // Created basic blocks are sorted in layout order since they are + // created in the same order as instructions, and instructions are + // sorted by offsets. + BinaryBasicBlock *InsertBB{nullptr}; + BinaryBasicBlock *PrevBB{nullptr}; + for (auto &InstrInfo : Instructions) { + auto LI = Labels.find(InstrInfo.first); + if (LI != Labels.end()) { + // Always create new BB at branch destination. + PrevBB = InsertBB; + InsertBB = addBasicBlock(LI->first, LI->second); + } + if (!InsertBB) { + // It must be a fallthrough. Create a new block unless we see an + // unconditional branch. + assert(PrevBB && "no previous basic block for a fall through"); + if (MIA->isUnconditionalBranch(InstrInfo.second)) { + // Temporarily restore inserter basic block. + InsertBB = PrevBB; + } else { + InsertBB = addBasicBlock(InstrInfo.first, + BC.Ctx->createTempSymbol("FT", true)); + } + } + + InsertBB->addInstruction(InstrInfo.second); + + // How well do we detect tail calls here? + if (MIA->isTerminator(InstrInfo.second)) { + PrevBB = InsertBB; + InsertBB = nullptr; + } + } + + // Intermediate dump. + DEBUG(print(dbgs(), /* PrintInstructions = */ true)); + + // TODO: handle properly calls to no-return functions, + // e.g. exit(3), etc. Otherwise we'll see a false fall-through + // blocks. + + for (auto &Branch : LocalBranches) { + + DEBUG(dbgs() << "registering branch [0x" << Twine::utohexstr(Branch.first) + << "] -> [0x" << Twine::utohexstr(Branch.second) << "]\n"); + BinaryBasicBlock *FromBB = getBasicBlockContainingOffset(Branch.first); + assert(FromBB && "cannot find BB containing FROM branch"); + BinaryBasicBlock *ToBB = getBasicBlockAtOffset(Branch.second); + assert(ToBB && "cannot find BB containing TO branch"); + + // TODO: add weights here. + // + FromBB->addSuccessor(ToBB); + } + + // Add fall-through branches. 
+ PrevBB = nullptr; + bool IsPrevFT = false; // Is previous block a fall-through. + for (auto &BB : BasicBlocks) { + if (IsPrevFT) { + PrevBB->addSuccessor(&BB); + } + + MCInst &LastInst = BB.back(); + if (BB.succ_size() == 0) { + IsPrevFT = MIA->isTerminator(LastInst) ? false : true; + } else if (BB.succ_size() == 1) { + IsPrevFT = MIA->isConditionalBranch(LastInst) ? true : false; + } else { + // Either ends with 2 branches, or with an indirect jump. + IsPrevFT = false; + } + + PrevBB = &BB; + } + + if (!IsPrevFT) { + // Possibly a call that does not return. + DEBUG(dbgs() << "last block was marked as a fall-through\n"); + } + + // Clean-up memory taken by instructions and labels. + clearInstructions(); + clearLabels(); + clearLocalBranches(); + + // Update the state. + CurrentState = State::CFG; + + // Print the function in the new state. + DEBUG(print(dbgs(), /* PrintInstructions = */ true)); + + return true; +} + +} // namespace flo + +} // namespace llvm diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h new file mode 100644 index 000000000000..d9e123a4dff0 --- /dev/null +++ b/bolt/BinaryFunction.h @@ -0,0 +1,399 @@ +//===--- BinaryFunction.h - Interface for machine-level function ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Interface to function in binary (machine) form. This is assembly-level +// code representation with the control flow. +// +// TODO: memory management for instructions. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_FLO_BINARY_FUNCTION_H +#define LLVM_TOOLS_LLVM_FLO_BINARY_FUNCTION_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/ilist.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include + +#include "BinaryBasicBlock.h" +#include "BinaryContext.h" + +using namespace llvm::object; + +namespace llvm { + +namespace flo { + +/// BinaryFunction is a representation of machine-level function. +// +/// We use the term "Binary" as "Machine" was already taken. +class BinaryFunction { +public: + enum class State : char { + Empty = 0, /// Function body is empty + Disassembled, /// Function have been disassembled + CFG, /// Control flow graph have been built + Assembled, /// Function has been assembled in memory + }; + + static constexpr uint64_t COUNT_NO_PROFILE = + std::numeric_limits::max(); + +private: + + /// Current state of the function. + State CurrentState{State::Empty}; + + /// Name of the function as we know it. + std::string Name; + + /// Symbol associated with this function. + SymbolRef Symbol; + + /// Containing section + SectionRef Section; + + /// Address of the function in memory. Also could be an offset from + /// base address for position independent binaries. + uint64_t Address; + + /// Original size of the function. + uint64_t Size; + + /// Offset in the file. + uint64_t FileOffset{0}; + + /// Maximum size this function is allowed to have. + uint64_t MaxSize{std::numeric_limits::max()}; + + /// Alignment requirements for the function. 
+ uint64_t Alignment{1}; + + /// False if the function is too complex to reconstruct its control + /// flow graph and re-assemble. + bool IsSimple{true}; + + BinaryContext &BC; + + /// The address for the code for this function in codegen memory. + uint64_t ImageAddress{0}; + + /// The size of the code in memory. + uint64_t ImageSize{0}; + + /// Name for the section this function code should reside in. + std::string CodeSectionName; + + /// The profile data for the number of times the function was executed. + uint64_t ExecutionCount{COUNT_NO_PROFILE}; + + /// Release storage used by instructions. + BinaryFunction &clearInstructions() { + std::map TempMap; + Instructions.swap(TempMap); + return *this; + } + + /// Release storage used by instructions. + BinaryFunction &clearLabels() { + std::map TempMap; + Labels.swap(TempMap); + return *this; + } + + /// Release memory taken by local branch info. + BinaryFunction &clearLocalBranches() { + std::vector> TempVector; + LocalBranches.swap(TempVector); + return *this; + } + + BinaryFunction &updateState(BinaryFunction::State State) { + CurrentState = State; + return *this; + } + +public: + std::vector> LocalBranches; + + std::map Labels; + + /// Temporary holder of instructions before CFG is constructed. + std::map Instructions; + + // Blocks are kept sorted in the layout order. If we need to change the + // layout, the terminating instructions need to be modified. + typedef std::vector BasicBlockListType; + BasicBlockListType BasicBlocks; + + typedef BasicBlockListType::iterator iterator; + typedef BasicBlockListType::const_iterator const_iterator; + typedef std::reverse_iterator const_reverse_iterator; + typedef std::reverse_iterator reverse_iterator; + + // CFG iterators. 
+ iterator begin() { return BasicBlocks.begin(); } + const_iterator begin() const { return BasicBlocks.begin(); } + iterator end () { return BasicBlocks.end(); } + const_iterator end () const { return BasicBlocks.end(); } + + reverse_iterator rbegin() { return BasicBlocks.rbegin(); } + const_reverse_iterator rbegin() const { return BasicBlocks.rbegin(); } + reverse_iterator rend () { return BasicBlocks.rend(); } + const_reverse_iterator rend () const { return BasicBlocks.rend(); } + + unsigned size() const { return (unsigned)BasicBlocks.size();} + bool empty() const { return BasicBlocks.empty(); } + const BinaryBasicBlock &front() const { return BasicBlocks.front(); } + BinaryBasicBlock &front() { return BasicBlocks.front(); } + const BinaryBasicBlock & back() const { return BasicBlocks.back(); } + BinaryBasicBlock & back() { return BasicBlocks.back(); } + + + BinaryFunction(StringRef Name, SymbolRef Symbol, SectionRef Section, + uint64_t Address, uint64_t Size, BinaryContext &BC) : + Name(Name), Symbol(Symbol), Section(Section), Address(Address), + Size(Size), BC(BC), CodeSectionName((".text." + Name).str()) {} + + /// Perform optimal code layout based on edge frequencies making necessary + /// adjustments to instructions at the end of basic blocks. + void optimizeLayout(); + + /// View CFG in graphviz program + void viewGraph(); + + /// Basic block iterator + + /// Return the name of the function as extracted from the binary file. + StringRef getName() const { + return Name; + } + + /// Return symbol associated with the function start. + SymbolRef getSymbol() const { + return Symbol; + } + + /// Return containing file section. + SectionRef getSection() const { + return Section; + } + + /// Return original address of the function (or offset from base for PIC). + uint64_t getAddress() const { + return Address; + } + + /// Return offset of the function body in the binary file. 
+ uint64_t getFileOffset() const { + return FileOffset; + } + + /// Return (original) size of the function. + uint64_t getSize() const { + return Size; + } + + /// Return the maximum size the body of the function could have. + uint64_t getMaxSize() const { + return MaxSize; + } + + /// Return internal section name for this function. + StringRef getCodeSectionName() const { + assert(!CodeSectionName.empty() && "no section name for function"); + return StringRef(CodeSectionName); + } + + /// Return true if the function could be correctly processed. + bool isSimple() const { + return IsSimple; + } + + /// Return true if the given address \p PC is inside the function body. + bool containsAddress(uint64_t PC) const { + return Address <= PC && PC < Address + Size; + } + + /// Create a basic block at a given \p Offset in the + /// function and append it to the end of list of blocks. + /// Returns NULL if basic block already exists at the \p Offset. + BinaryBasicBlock *addBasicBlock(uint64_t Offset, MCSymbol *Label = nullptr) { + assert(!getBasicBlockAtOffset(Offset) && "basic block already exists"); + if (!Label) + Label = BC.Ctx->createTempSymbol("BB", true); + BasicBlocks.emplace_back(BinaryBasicBlock(Label, Offset)); + + return &BasicBlocks.back(); + } + + BinaryBasicBlock *getOrCreateBasicBlockAt(uint64_t Offset, + MCSymbol *Label = nullptr) { + BinaryBasicBlock *BB = getBasicBlockAtOffset(Offset); + if (!BB) + BB = addBasicBlock(Offset, Label); + + return BB; + } + + /// Return basic block that started at offset \p Offset. + BinaryBasicBlock *getBasicBlockAtOffset(uint64_t Offset) { + BinaryBasicBlock *BB = getBasicBlockContainingOffset(Offset); + if (BB && BB->Offset == Offset) + return BB; + + return nullptr; + } + + /// Return basic block that originally contained offset \p Offset + /// from the function start. 
+ BinaryBasicBlock *getBasicBlockContainingOffset(uint64_t Offset) { + if (Offset > Size) + return nullptr; + + if (BasicBlocks.empty()) + return nullptr; + + auto I = std::lower_bound(BasicBlocks.begin(), + BasicBlocks.end(), + BinaryBasicBlock(Offset)); + + if (I == BasicBlocks.end()) + return &BasicBlocks.back(); + + return &(*I); + } + + /// Dump function information to debug output. If \p PrintInstructions + /// is true - include instruction disassembly. + void dump(bool PrintInstructions = false) const { + print(dbgs(), PrintInstructions); + } + + /// Print function information to the \p OS stream. + void print(raw_ostream &OS, bool PrintInstructions = false) const; + + void addInstruction(uint64_t Offset, MCInst &&Instruction) { + Instructions.emplace(Offset, std::forward(Instruction)); + } + + BinaryFunction &setFileOffset(uint64_t Offset) { + FileOffset = Offset; + return *this; + } + + BinaryFunction &setMaxSize(uint64_t Size) { + MaxSize = Size; + return *this; + } + + BinaryFunction &setSimple(bool Simple) { + IsSimple = Simple; + return *this; + } + + BinaryFunction &setAlignment(uint64_t Align) { + Alignment = Align; + return *this; + } + + uint64_t getAlignment() const { + return Alignment; + } + + BinaryFunction &setImageAddress(uint64_t Address) { + ImageAddress = Address; + return *this; + } + + /// Return the address of this function' image in memory. + uint64_t getImageAddress() const { + return ImageAddress; + } + + BinaryFunction &setImageSize(uint64_t Size) { + ImageSize = Size; + return *this; + } + + /// Return the size of this function' image in memory. + uint64_t getImageSize() const { + return ImageSize; + } + + /// Set the profile data for the number of times the function was called. + BinaryFunction &setExecutionCount(uint64_t Count) { + ExecutionCount = Count; + return *this; + } + + /// Return the profile information about the number of times + /// the function was executed. 
+ /// + /// Return COUNT_NO_PROFILE if there's no profile info. + uint64_t getExecutionCount() const { + return ExecutionCount; + } + + /// Disassemble function from raw data \p FunctionData. + /// If successful, this function will populate the list of instructions + /// for this function together with offsets from the function start + /// in the input. It will also populate Labels with destinations for + /// local branches, and LocalBranches with [from, to] info. + /// + /// \p FunctionData is the set bytes representing the function body. + /// + /// The Function should be properly initialized before this function + /// is called. I.e. function address and size should be set. + /// + /// Returns true on successful disassembly, and updates the current + /// state to State:Disassembled. + /// + /// Returns false if disassembly failed. + bool disassemble(ArrayRef FunctionData); + + /// Builds a list of basic blocks with successor and predecessor info. + /// + /// The function should in Disassembled state prior to call. + /// + /// Returns true on success and update the current function state to + /// State::CFG. Returns false if CFG cannot be built. 
+ bool buildCFG(); + + virtual ~BinaryFunction() {} +}; + +inline raw_ostream &operator<<(raw_ostream &OS, + const BinaryFunction::State State) { + switch (State) { + default: OS << ""; break; + case BinaryFunction::State::Empty: OS << "empty"; break; + case BinaryFunction::State::Disassembled: OS << "disassembled"; break; + case BinaryFunction::State::CFG: OS << "CFG constructed"; break; + case BinaryFunction::State::Assembled: OS << "assembled"; break; + } + + return OS; +} + +} // namespace flo + +} // namespace llvm + +#endif diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index ef2afbab01ae..098176eb5f77 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -2,7 +2,6 @@ set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} CodeGen Core - DebugInfoDWARF MC MCDisassembler MCParser @@ -13,4 +12,6 @@ set(LLVM_LINK_COMPONENTS add_llvm_tool(llvm-flo llvm-flo.cpp + BinaryBasicBlock.cpp + BinaryFunction.cpp ) diff --git a/bolt/LLVMBuild.txt b/bolt/LLVMBuild.txt index 770196d110bd..eb8a2efe4cd9 100644 --- a/bolt/LLVMBuild.txt +++ b/bolt/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Tool name = llvm-flo parent = Tools -required_libraries = DebugInfoDWARF MC MCDisassembler MCParser Object all-targets +required_libraries = MC MCDisassembler MCParser Object all-targets diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 52628ebdf1d9..54d6593bb564 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -7,12 +7,17 @@ // //===----------------------------------------------------------------------===// // +// This is a binary optimizer that will take 'perf' output and change +// basic block layout for better performance (a.k.a. branch straightening), +// plus some other optimizations that are better performed on a binary. 
+// //===----------------------------------------------------------------------===// #include "llvm/ADT/STLExtras.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" #include "llvm/ExecutionEngine/RTDyldMemoryManager.h" +#include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler.h" @@ -20,7 +25,10 @@ #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" @@ -38,13 +46,20 @@ #include "llvm/Support/ToolOutputFile.h" #include "llvm/Target/TargetMachine.h" +#include "BinaryBasicBlock.h" +#include "BinaryContext.h" +#include "BinaryFunction.h" #include #include #include +#undef DEBUG_TYPE +#define DEBUG_TYPE "flo" + using namespace llvm; using namespace object; +using namespace flo; // Tool options. static cl::opt @@ -57,11 +72,16 @@ static cl::opt OutputFilename("o", cl::desc(""), cl::Required); static cl::list -FunctionNames("funcs", cl::desc("list of functions to optimzize"), - cl::Optional); +FunctionNames("funcs", + cl::CommaSeparated, + cl::desc("list of functions to optimize"), + cl::value_desc("func1,func2,func3,...")); +static cl::opt +EliminateUnreachable("eliminate-unreachable", + cl::desc("eliminate unreachable code"), + cl::Optional); -// Tool name used for reporting. 
static StringRef ToolName; static void report_error(StringRef Message, std::error_code EC) { @@ -70,6 +90,576 @@ static void report_error(StringRef Message, std::error_code EC) { exit(1); } +static void error(std::error_code EC) { + if (!EC) + return; + + errs() << ToolName << ": error reading file: " << EC.message() << ".\n"; + exit(1); +} + +template +static std::vector singletonSet(T t) { + std::vector Vec; + Vec.push_back(std::move(t)); + return Vec; +} + +/// Class responsible for allocating and managing code and data sections. +class ExecutableFileMemoryManager : public SectionMemoryManager { +public: + + // Keep [section name] -> [allocated address, size] map for later remapping. + std::map> SectionAddressInfo; + + ExecutableFileMemoryManager() {} + + uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, + unsigned SectionID, + StringRef SectionName) override { + auto ret = + SectionMemoryManager::allocateCodeSection(Size, Alignment, SectionID, + SectionName); + DEBUG(dbgs() << "FLO: allocating code section : " << SectionName + << " with size " << Size << ", alignment " << Alignment + << " at 0x" << ret << "\n"); + + SectionAddressInfo[SectionName] = {reinterpret_cast(ret), Size}; + + return ret; + } + + uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, + unsigned SectionID, StringRef SectionName, + bool IsReadOnly) override { + DEBUG(dbgs() << "FLO: allocating data section : " << SectionName + << " with size " << Size << ", alignment " + << Alignment << "\n"); + errs() << "FLO-WARNING: allocating data section.\n"; + return SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID, + SectionName, IsReadOnly); + } + + // Tell EE that we guarantee we don't need stubs. 
+ bool allowStubAllocation() const override { return false; } + + bool finalizeMemory(std::string *ErrMsg = nullptr) override { + DEBUG(dbgs() << "FLO: finalizeMemory()\n"); + return SectionMemoryManager::finalizeMemory(ErrMsg); + } +}; + +/// Create BinaryContext for a given architecture \p ArchName and +/// triple \p TripleName. +static std::unique_ptr CreateBinaryContext( + std::string ArchName, + std::string TripleName) { + + std::string Error; + + std::unique_ptr TheTriple = llvm::make_unique(TripleName); + const Target *TheTarget = TargetRegistry::lookupTarget(ArchName, + *TheTriple, + Error); + if (!TheTarget) { + errs() << ToolName << ": " << Error; + return nullptr; + } + + std::unique_ptr MRI( + TheTarget->createMCRegInfo(TripleName)); + if (!MRI) { + errs() << "error: no register info for target " << TripleName << "\n"; + return nullptr; + } + + // Set up disassembler. + std::unique_ptr AsmInfo( + TheTarget->createMCAsmInfo(*MRI, TripleName)); + if (!AsmInfo) { + errs() << "error: no assembly info for target " << TripleName << "\n"; + return nullptr; + } + + std::unique_ptr STI( + TheTarget->createMCSubtargetInfo(TripleName, "", "")); + if (!STI) { + errs() << "error: no subtarget info for target " << TripleName << "\n"; + return nullptr; + } + + std::unique_ptr MII(TheTarget->createMCInstrInfo()); + if (!MII) { + errs() << "error: no instruction info for target " << TripleName << "\n"; + return nullptr; + } + + std::unique_ptr MOFI = + llvm::make_unique(); + std::unique_ptr Ctx = + llvm::make_unique(AsmInfo.get(), MRI.get(), MOFI.get()); + MOFI->InitMCObjectFileInfo(*TheTriple, Reloc::Default, + CodeModel::Default, *Ctx); + + std::unique_ptr DisAsm( + TheTarget->createMCDisassembler(*STI, *Ctx)); + + if (!DisAsm) { + errs() << "error: no disassembler for target " << TripleName << "\n"; + return nullptr; + } + + std::unique_ptr MIA( + TheTarget->createMCInstrAnalysis(MII.get())); + if (!MIA) { + errs() << "error: failed to create instruction analysis for 
target" + << TripleName << "\n"; + return nullptr; + } + + int AsmPrinterVariant = AsmInfo->getAssemblerDialect(); + std::unique_ptr InstructionPrinter( + TheTarget->createMCInstPrinter(Triple(TripleName), AsmPrinterVariant, + *AsmInfo, *MII, *MRI)); + if (!InstructionPrinter) { + errs() << "error: no instruction printer for target " << TripleName + << '\n'; + return nullptr; + } + InstructionPrinter->setPrintImmHex(true); + + auto MCE = TheTarget->createMCCodeEmitter(*MII, *MRI, *Ctx); + + auto MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, ""); + + // Make sure we don't miss any output on core dumps. + outs().SetUnbuffered(); + errs().SetUnbuffered(); + dbgs().SetUnbuffered(); + + auto BC = + llvm::make_unique(std::move(Ctx), + std::move(TheTriple), + TheTarget, + MCE, + std::move(MOFI), + std::move(AsmInfo), + std::move(MII), + std::move(STI), + std::move(InstructionPrinter), + std::move(MIA), + std::move(MRI), + std::move(DisAsm), + MAB); + + return BC; +} + +static void OptimizeFile(ELFObjectFileBase *File) { + + // FIXME: there should be some way to extract arch and triple information + // from the file. + std::unique_ptr BC = + std::move(CreateBinaryContext("x86-64", "x86_64-unknown-linux")); + if (!BC) { + errs() << "failed to create a binary context\n"; + return; + } + + // Store all non-zero file symbols in this map for quick address lookup. + std::map FileSymRefs; + + // Entry point to the binary. + // + // Note: this is ELF header entry point, but we could have more entry points + // from constructors etc. + BinaryFunction *EntryPointFunction{nullptr}; + + // Populate array of binary functions and file symbols + // from file symbol table. + // + // For local symbols we want to keep track of associated FILE symbol for + // disambiguation by name. + std::map BinaryFunctions; + StringRef FileSymbolName; + for (const SymbolRef &Symbol : File->symbols()) { + // Keep undefined symbols for pretty printing? 
+ if (Symbol.getFlags() & SymbolRef::SF_Undefined) + continue; + + ErrorOr Name = Symbol.getName(); + error(Name.getError()); + + if (Symbol.getType() == SymbolRef::ST_File) { + // Could be used for local symbol disambiguation. + FileSymbolName = *Name; + continue; + } + + ErrorOr AddressOrErr = Symbol.getAddress(); + error(AddressOrErr.getError()); + uint64_t Address = *AddressOrErr; + if (Address == 0) { + if (Symbol.getType() == SymbolRef::ST_Function) + errs() << "FLO-WARNING: function with 0 address seen\n"; + continue; + } + + FileSymRefs[Address] = Symbol; + + // Only consider ST_Function symbols for functions. Although this + // assumption could be broken by assembly functions for which the type + // could be wrong. + if (Symbol.getType() != SymbolRef::ST_Function) { + // FIXME: add it to the address map. + continue; + } + + // TODO: populate address map with PLT entries for better readability. + + // Ignore function with 0 size for now (possibly coming from assembly). + auto SymbolSize = ELFSymbolRef(Symbol).getSize(); + if (SymbolSize == 0) + continue; + + // There's nothing horribly wrong with anonymous symbols, but let's + // ignore them for now. + if (Name->empty()) + continue; + + ErrorOr SectionOrErr = Symbol.getSection(); + error(SectionOrErr.getError()); + section_iterator Section = *SectionOrErr; + if (Section == File->section_end()) { + // Could be an absolute symbol. Could record for pretty printing. + continue; + } + + // Disambiguate local function name. Since we don't know if we'll see + // a global with the same name, always modify the local function name. 
+ std::string UniqueFunctionName; + if (!(Symbol.getFlags() & SymbolRef::SF_Global)) { + unsigned LocalCount = 1; + auto LocalName = *Name + "/" + FileSymbolName + "/"; + while (BC->GlobalSymbols.find((LocalName + Twine(LocalCount)).str()) != + BC->GlobalSymbols.end()) { + ++LocalCount; + } + UniqueFunctionName = (LocalName + Twine(LocalCount)).str(); + } else { + auto I = BC->GlobalSymbols.find(*Name); + assert(I == BC->GlobalSymbols.end() && "global name not unique"); + UniqueFunctionName = *Name; + } + + // Create the function and add to the map. + BinaryFunctions.emplace( + Address, + BinaryFunction(UniqueFunctionName, Symbol, *Section, Address, + SymbolSize, *BC) + ); + + // Add the name to global symbols map. + BC->GlobalSymbols[UniqueFunctionName] = Address; + + // Add to the reverse map. + BC->GlobalAddresses.emplace(std::make_pair(Address, UniqueFunctionName)); + } + + // Disassemble every function and build it's control flow graph. + for (auto &BFI : BinaryFunctions) { + BinaryFunction &Function = BFI.second; + + SectionRef Section = Function.getSection(); + assert(Section.containsSymbol(Function.getSymbol()) && + "symbol not in section"); + + // When could it happen? + if (!Section.isText() || Section.isVirtual() || !Section.getSize()) { + DEBUG(dbgs() << "FLO: corresponding section non-executable or empty " + << "for function " << Function.getName()); + continue; + } + + // Set the proper maximum size value after the whole symbol table + // has been processed. + auto SymRefI = FileSymRefs.upper_bound(Function.getAddress()); + if (SymRefI != FileSymRefs.end()) { + auto MaxSize = SymRefI->first - Function.getAddress(); + assert(MaxSize >= Function.getSize() && + "symbol seen in the middle of the function"); + Function.setMaxSize(MaxSize); + } + + StringRef SectionContents; + error(Section.getContents(SectionContents)); + + assert(SectionContents.size() == Section.getSize() && + "section size mismatch"); + + // Function offset from the section start. 
+ auto FunctionOffset = Function.getAddress() - Section.getAddress(); + + // Offset of the function in the file. + Function.setFileOffset( + SectionContents.data() - File->getData().data() + FunctionOffset); + + ArrayRef FunctionData( + reinterpret_cast + (SectionContents.data()) + FunctionOffset, + Function.getSize()); + + if (!Function.disassemble(FunctionData)) + continue; + + if (!Function.buildCFG()) + continue; + + } // Iterate over all functions + + + // Run optimization passes. + // + // FIXME: use real optimization passes. + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + // Detect and eliminate unreachable basic blocks. We could have those + // filled with nops and they are used for alignment. + // + // FIXME: this wouldn't work with C++ exceptions until we implement + // support for those as there will be "invisible" edges + // in the graph. + if (EliminateUnreachable) { + bool IsFirst = true; + for (auto &BB : Function) { + if (!IsFirst && BB.pred_empty()) { + outs() << "FLO: basic block " << BB.getName() << " in function " + << Function.getName() << " is dead\n"; + // TODO: currently lacking interface to eliminate basic block. + } + IsFirst = false; + } + DEBUG(dbgs() << "*** After unreachable block elimination ***\n"); + DEBUG(Function.print(dbgs(), /* PrintInstructions = */ true)); + } + } + + std::error_code EC; + std::unique_ptr Out = + llvm::make_unique(OutputFilename + ".o", + EC,sys::fs::F_None); + + if (EC) { + // FIXME: handle error + return; + } + + std::unique_ptr RealOut = + llvm::make_unique(OutputFilename, EC, sys::fs::F_None, + 0777); + if (EC) { + // FIXME: handle error + return; + } + + // Copy input file. + RealOut->os() << File->getData(); + + std::unique_ptr BOS = + make_unique(Out->os()); + raw_pwrite_stream *OS = BOS.get(); + + // Implicitly MCObjectStreamer takes ownership of MCAsmBackend (MAB) + // and MCCodeEmitter (MCE). ~MCObjectStreamer() will delete these + // two instances. 
+ std::unique_ptr Streamer( + BC->TheTarget->createMCObjectStreamer(*BC->TheTriple, + *BC->Ctx, + *BC->MAB, + *OS, + BC->MCE, + *BC->STI, + /* RelaxAll */ false, + /* DWARFMustBeAtTheEnd */ false)); + + Streamer->InitSections(false); + + // Output functions one by one. + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + + if (!Function.isSimple()) + continue; + + // Only overwrite functions from the list if non-empty. + if (!FunctionNames.empty()) { + bool IsValid = false; + for (auto &Name : FunctionNames) { + if (Function.getName() == Name) { + IsValid = true; + break; + } + } + if (!IsValid) + continue; + } + + DEBUG(dbgs() << "FLO: generating code for function \"" + << Function.getName() << "\"\n"); + + // No need for human readability? + // FIXME: what difference does it make in reality? + //Ctx.setUseNamesOnTempLabels(false); + + // Emit function start + + // Each fuction is emmitted into its own section. + MCSectionELF *FunctionSection = + BC->Ctx->getELFSection(Function.getCodeSectionName(), + ELF::SHT_PROGBITS, + ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); + + MCSection *Section = FunctionSection; + Streamer->SwitchSection(Section); + + Streamer->EmitCodeAlignment(Function.getAlignment()); + + MCSymbol *FunctionSymbol = BC->Ctx->getOrCreateSymbol(Function.getName()); + Streamer->EmitSymbolAttribute(FunctionSymbol, MCSA_ELF_TypeFunction); + Streamer->EmitLabel(FunctionSymbol); + + // Emit code. + for (const auto &BB : Function) { + Streamer->EmitLabel(BB.getLabel()); + for (const auto &Instr : BB) { + Streamer->EmitInstruction(Instr, *BC->STI); + } + } + + // TODO: is there any use in emiting end of function? + // Perhaps once we have a support for C++ exceptions. + //auto FunctionEndLabel = Ctx.createTempSymbol("func_end"); + //Streamer->EmitLabel(FunctionEndLabel); + //Streamer->emitELFSize(FunctionSymbol, MCExpr()); + } + + Streamer->Finish(); + + // Get output object as ObjectFile. 
+ std::unique_ptr ObjectMemBuffer = + MemoryBuffer::getMemBuffer(BOS->str(), "in-memory object file", false); + ErrorOr> ObjOrErr = + object::ObjectFile::createObjectFile(ObjectMemBuffer->getMemBufferRef()); + + if (std::error_code EC = ObjOrErr.getError()) { + report_error(InputFilename, EC); + return; + } + + std::unique_ptr + EFMM(new ExecutableFileMemoryManager()); + + // FIXME: use notifyObjectLoaded() to remap sections. + + DEBUG(dbgs() << "Creating OLT\n"); + // Run ObjectLinkingLayer() with custom memory manager and symbol resolver. + orc::ObjectLinkingLayer<> OLT; + + auto Resolver = orc::createLambdaResolver( + [&](const std::string &Name) { + DEBUG(dbgs() << "FLO: looking for " << Name << "\n"); + auto I = BC->GlobalSymbols.find(Name); + if (I == BC->GlobalSymbols.end()) + return RuntimeDyld::SymbolInfo(nullptr); + return RuntimeDyld::SymbolInfo(I->second, + JITSymbolFlags::None); + }, + [](const std::string &S) { + DEBUG(dbgs() << "FLO: resolving " << S << "\n"); + return nullptr; + } + ); + // FIXME: + auto ObjectsHandle = OLT.addObjectSet( + singletonSet(std::move(ObjOrErr.get())), + EFMM.get(), + //std::move(EFMM), + std::move(Resolver)); + //OLT.takeOwnershipOfBuffers(ObjectsHandle, ); + + // Map every function/section current address in memory to that in + // the output binary. 
+ for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + if (!Function.isSimple()) + continue; + + auto SAI = EFMM->SectionAddressInfo.find(Function.getCodeSectionName()); + if (SAI != EFMM->SectionAddressInfo.end()) { + DEBUG(dbgs() << "FLO: mapping 0x" << Twine::utohexstr(SAI->second.first) + << " to 0x" << Twine::utohexstr(Function.getAddress()) + << '\n'); + OLT.mapSectionAddress(ObjectsHandle, + reinterpret_cast(SAI->second.first), + Function.getAddress()); + Function.setImageAddress(SAI->second.first); + Function.setImageSize(SAI->second.second); + } else { + errs() << "FLO: cannot remap function " << Function.getName() << "\n"; + } + } + + OLT.emitAndFinalize(ObjectsHandle); + + // FIXME: is there a less painful way to obtain assembler/writer? + auto &Writer = + static_cast(Streamer.get())->getAssembler().getWriter(); + Writer.setStream(RealOut->os()); + + // Overwrite function in the output file. + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + + if (Function.getImageAddress() == 0 || Function.getImageSize() == 0) + continue; + + if (Function.getImageSize() > Function.getMaxSize()) { + errs() << "FLO-WARNING: new function size (0x" + << Twine::utohexstr(Function.getImageSize()) + << ") is larger than maximum allowed size (0x" + << Twine::utohexstr(Function.getMaxSize()) + << ") for function " << Function.getName() << '\n'; + continue; + } + + // Overwrite function in the output file. + outs() << "FLO: rewriting function \"" << Function.getName() << "\"\n"; + RealOut->os().pwrite( + reinterpret_cast(Function.getImageAddress()), + Function.getImageSize(), + Function.getFileOffset()); + + // Write nops at the end of the function. 
+ auto Pos = RealOut->os().tell(); + RealOut->os().seek(Function.getFileOffset() + Function.getImageSize()); + BC->MAB->writeNopData(Function.getMaxSize() - Function.getImageSize(), + &Writer); + RealOut->os().seek(Pos); + } + + if (EntryPointFunction) { + DEBUG(dbgs() << "FLO: entry point function is " + << EntryPointFunction->getName() << '\n'); + } else { + DEBUG(dbgs() << "FLO: no entry point function was set\n"); + } + + // TODO: we should find a way to mark the binary as optimized by us. + + Out->keep(); + RealOut->keep(); +} + int main(int argc, char **argv) { // Print a stack trace if we signal out. sys::PrintStackTraceOnErrorSignal(); @@ -104,7 +694,7 @@ int main(int argc, char **argv) { Binary &Binary = *BinaryOrErr.get().getBinary(); if (ELFObjectFileBase *e = dyn_cast(&Binary)) { - outs() << "mind blown : " << e << "!\n"; + OptimizeFile(e); } else { report_error(InputFilename, object_error::invalid_file_type); } From 6e4084c1ff7f2858fd83c15a2e24db9feaebac75 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 5 Oct 2015 18:31:25 -0700 Subject: [PATCH 005/904] Add initial implementation of DataReader Summary: This patch introduces DataReader, a module responsible for parsing llvm flo data files into in-memory data structures. 
(cherry picked from commit 6d82bb879b070535e4502f541124298c1b4590e0) --- bolt/CMakeLists.txt | 1 + bolt/DataReader.cpp | 164 ++++++++++++++++++++++++++++++++++++++++++++ bolt/DataReader.h | 110 +++++++++++++++++++++++++++++ bolt/llvm-flo.cpp | 19 +++++ 4 files changed, 294 insertions(+) create mode 100644 bolt/DataReader.cpp create mode 100644 bolt/DataReader.h diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 098176eb5f77..b8720c3d9a64 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -14,4 +14,5 @@ add_llvm_tool(llvm-flo llvm-flo.cpp BinaryBasicBlock.cpp BinaryFunction.cpp + DataReader.cpp ) diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp new file mode 100644 index 000000000000..f3cc166a6c2a --- /dev/null +++ b/bolt/DataReader.cpp @@ -0,0 +1,164 @@ +//===-- DataReader.cpp - Perf data reader -----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This family of functions reads profile data written by the perf2flo +// utility and stores it in memory for llvm-flo consumption. 
+// +//===----------------------------------------------------------------------===// + + +#include "DataReader.h" + +namespace llvm { +namespace flo { + +ErrorOr> +DataReader::readPerfData(StringRef Path, raw_ostream &Diag) { + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(Path); + if (std::error_code EC = MB.getError()) { + Diag << "Cannot open " << Path << ": " << EC.message() << "\n"; + } + auto DR = make_unique(std::move(MB.get()), Diag); + DR->parse(); + return std::move(DR); +} + +void DataReader::reportError(StringRef ErrorMsg) { + Diag << "Error reading flo data input file: line " << Line << ", column " + << Col << ": " << ErrorMsg << '\n'; +} + +bool DataReader::expectAndConsumeFS() { + if (ParsingBuf[0] != FieldSeparator) { + reportError("expected field separator"); + return false; + } + ParsingBuf = ParsingBuf.drop_front(1); + Col += 1; + return true; +} + +ErrorOr DataReader::parseString(char EndChar) { + auto StringEnd = ParsingBuf.find(EndChar); + if (StringEnd == StringRef::npos || StringEnd == 0) { + reportError("malformed field"); + return make_error_code(llvm::errc::io_error); + } + StringRef Str = ParsingBuf.substr(0, StringEnd); + ParsingBuf = ParsingBuf.drop_front(StringEnd + 1); + Col += StringEnd + 1; + return Str; +} + +ErrorOr DataReader::parseNumberField(char EndChar) { + auto NumStrRes = parseString(EndChar); + if (std::error_code EC = NumStrRes.getError()) + return EC; + StringRef NumStr = NumStrRes.get(); + int64_t Num; + if (NumStr.getAsInteger(10, Num)) { + reportError("expected decimal number"); + Diag << "Found: " << NumStr << "\n"; + return make_error_code(llvm::errc::io_error); + } + return Num; +} + +ErrorOr DataReader::parseLocation() { + // Read whether the location of the branch should be DSO or a symbol + if (ParsingBuf[0] != '0' && ParsingBuf[0] != '1') { + reportError("expected 0 or 1"); + return make_error_code(llvm::errc::io_error); + } + + bool IsSymbol = ParsingBuf[0] == '1'; + ParsingBuf = ParsingBuf.drop_front(1); + 
Col += 1; + + if (!expectAndConsumeFS()) + return make_error_code(llvm::errc::io_error); + + // Read the string containing the symbol or the DSO name + auto NameRes = parseString(FieldSeparator); + if (std::error_code EC = NameRes.getError()) + return EC; + StringRef Name = NameRes.get(); + + // Read the offset + auto OffsetStrRes = parseString(FieldSeparator); + if (std::error_code EC = OffsetStrRes.getError()) + return EC; + StringRef OffsetStr = OffsetStrRes.get(); + uint64_t Offset; + if (OffsetStr.getAsInteger(16, Offset)) { + reportError("expected hexadecimal number"); + Diag << "Found: " << OffsetStr << "\n"; + return make_error_code(llvm::errc::io_error); + } + + return Location(IsSymbol, Name, Offset); +} + +ErrorOr DataReader::parseBranchInfo() { + auto Res = parseLocation(); + if (std::error_code EC = Res.getError()) + return EC; + Location From = Res.get(); + + Res = parseLocation(); + if (std::error_code EC = Res.getError()) + return EC; + Location To = Res.get(); + + auto MRes = parseNumberField(FieldSeparator); + if (std::error_code EC = MRes.getError()) + return EC; + int64_t NumMispreds = MRes.get(); + + auto BRes = parseNumberField('\n'); + if (std::error_code EC = BRes.getError()) + return EC; + int64_t NumBranches = BRes.get(); + + return BranchInfo(std::move(From), std::move(To), NumMispreds, NumBranches); +} + +bool DataReader::hasData() { + if (ParsingBuf.size() == 0) + return false; + + if (ParsingBuf[0] == '0' || ParsingBuf[0] == '1') + return true; + return false; +} + +std::error_code DataReader::parse() { + Col = 0; + Line = 1; + while (hasData()) { + auto Res = parseBranchInfo(); + if (std::error_code EC = Res.getError()) + return EC; + Col = 0; + Line += 1; + BranchInfo BI = Res.get(); + ParsedData.emplace_back(std::move(BI)); + } + return std::error_code(); +} + +void DataReader::dump() { + for (auto &BI : ParsedData) { + Diag << BI.From.Name << " " << BI.From.Offset << " " << BI.To.Name << " " + << BI.To.Offset << " " << BI.Mispreds 
<< " " << BI.Branches << "\n"; + } +} +} +} diff --git a/bolt/DataReader.h b/bolt/DataReader.h new file mode 100644 index 000000000000..02b98c31cf99 --- /dev/null +++ b/bolt/DataReader.h @@ -0,0 +1,110 @@ +//===-- Reader/DataReader.h - Perf data reader ------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This family of functions reads profile data written by the perf2flo +// utility and stores it in memory for llvm-flo consumption. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_FLO_DATA_READER_H +#define LLVM_TOOLS_LLVM_FLO_DATA_READER_H + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { +namespace flo { + +struct Location { + bool IsSymbol; + StringRef Name; + uint64_t Offset; + + Location(bool IsSymbol, StringRef Name, uint64_t Offset) + : IsSymbol(IsSymbol), Name(Name), Offset(Offset) {} +}; + +struct BranchInfo { + Location From; + Location To; + int64_t Mispreds; + int64_t Branches; + + BranchInfo(Location From, Location To, int64_t Mispreds, int64_t Branches) + : From(std::move(From)), To(std::move(To)), Mispreds(Mispreds), + Branches(Branches) {} +}; + +//===----------------------------------------------------------------------===// +// +/// DataReader Class +/// +class DataReader { +public: + DataReader(std::unique_ptr MemBuf, raw_ostream &Diag) + : FileBuf(std::move(MemBuf)), Diag(Diag), ParsingBuf(FileBuf->getBuffer()), + Line(0), Col(0) {} + + static ErrorOr> readPerfData(StringRef Path, + raw_ostream &Diag); + + /// Parses the input flo data 
file into internal data structures. We expect + /// the file format to follow the syntax below. + /// + /// + /// + /// + /// + /// In field we record 0 if our closest address is a DSO load + /// address or 1 if our closest address is an ELF symbol. + /// + /// Example: + /// + /// 1 main 3fb 0 /lib/ld-2.21.so 12 4 221 + /// + /// The example records branches from symbol main, offset 3fb, to DSO ld-2.21, + /// offset 12, with 4 mispredictions and 221 branches + std::error_code parse(); + + /// Dumps the entire data structures parsed. Used for debugging. + void dump(); + +private: + + void reportError(StringRef ErrorMsg); + bool expectAndConsumeFS(); + ErrorOr parseString(char EndChar); + ErrorOr parseNumberField(char EndChar); + ErrorOr parseLocation(); + ErrorOr parseBranchInfo(); + bool hasData(); + + // Owns reader data structures + BumpPtrAllocator Alloc; + // An in-memory copy of the input data file - owns strings used in reader + std::unique_ptr FileBuf; + raw_ostream &Diag; + StringRef ParsingBuf; + unsigned Line; + unsigned Col; + std::vector ParsedData; + static const char FieldSeparator = ' '; +}; + + + +} +} + +#endif diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 54d6593bb564..3196b85c9db0 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -13,6 +13,7 @@ // //===----------------------------------------------------------------------===// +#include "DataReader.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" @@ -82,6 +83,10 @@ EliminateUnreachable("eliminate-unreachable", cl::desc("eliminate unreachable code"), cl::Optional); +static cl::opt +DumpData("dump-data", cl::desc("dump parsed flo data (debugging)"), + cl::Hidden); + static StringRef ToolName; static void report_error(StringRef Message, std::error_code EC) { @@ -687,6 +692,20 @@ int main(int argc, char **argv) { if (!sys::fs::exists(InputFilename)) report_error(InputFilename, 
errc::no_such_file_or_directory); + if (!sys::fs::exists(InputDataFilename)) + report_error(InputDataFilename, errc::no_such_file_or_directory); + + // Attempt to read input flo data + ErrorOr> ReaderOrErr = + flo::DataReader::readPerfData(InputDataFilename, errs()); + if (std::error_code EC = ReaderOrErr.getError()) + report_error(InputDataFilename, EC); + flo::DataReader &DR = *ReaderOrErr.get().get(); + if (DumpData) { + DR.dump(); + return EXIT_SUCCESS; + } + // Attempt to open the binary. ErrorOr> BinaryOrErr = createBinary(InputFilename); if (std::error_code EC = BinaryOrErr.getError()) From 66127bbc702cc2067acb87e1c34fc6878d10f9f6 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 12 Oct 2015 12:12:16 -0700 Subject: [PATCH 006/904] Fix CFG building issue. Summary: Fixed getBasicBlockContainingOffset() to return correct basic block. (cherry picked from commit 87260e4b1d1dc3862779d42e9a3f98b2e82c99f8) --- bolt/BinaryFunction.cpp | 16 ++++++++++++++++ bolt/BinaryFunction.h | 17 +---------------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 6569704da003..480c154084fb 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -31,6 +31,22 @@ namespace llvm { namespace flo { +BinaryBasicBlock * +BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) { + if (Offset > Size) + return nullptr; + + if (BasicBlocks.empty()) + return nullptr; + + auto I = std::upper_bound(BasicBlocks.begin(), + BasicBlocks.end(), + BinaryBasicBlock(Offset)); + assert(I != BasicBlocks.begin() && "first basic block not at offset 0"); + + return &(*--I); +} + void BinaryFunction::print(raw_ostream &OS, bool PrintInstructions) const { StringRef SectionName; Section.getName(SectionName); diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index d9e123a4dff0..c88d5a665a11 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -263,22 +263,7 @@ class BinaryFunction { /// 
Return basic block that originally contained offset \p Offset /// from the function start. - BinaryBasicBlock *getBasicBlockContainingOffset(uint64_t Offset) { - if (Offset > Size) - return nullptr; - - if (BasicBlocks.empty()) - return nullptr; - - auto I = std::lower_bound(BasicBlocks.begin(), - BasicBlocks.end(), - BinaryBasicBlock(Offset)); - - if (I == BasicBlocks.end()) - return &BasicBlocks.back(); - - return &(*I); - } + BinaryBasicBlock *getBasicBlockContainingOffset(uint64_t Offset); /// Dump function information to debug output. If \p PrintInstructions /// is true - include instruction disassembly. From 053865b8626b05ddf8663b9d10f3cb8cac675033 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 9 Oct 2015 21:47:18 -0700 Subject: [PATCH 007/904] FLO: added support for rip-relative operands. Summary: Detect and replace rip-relative operands with relocations. (cherry picked from commit 2253aca46a53d8e86c12ede3330cd173bfbe6cbe) --- bolt/BinaryFunction.cpp | 41 ++++++++++++++++++++++++++++++++++++----- bolt/llvm-flo.cpp | 19 ++++++++++--------- 2 files changed, 46 insertions(+), 14 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 480c154084fb..71445f4d37ad 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -13,6 +13,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstPrinter.h" #include "llvm/Object/ObjectFile.h" @@ -166,9 +167,9 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { break; } + uint64_t AbsoluteInstrAddr = getAddress() + Offset; if (MIA->isBranch(Instruction) || MIA->isCall(Instruction)) { uint64_t InstructionTarget = 0; - uint64_t AbsoluteInstrAddr = getAddress() + Offset; if (MIA->evaluateBranch(Instruction, AbsoluteInstrAddr, Size, @@ -242,15 +243,45 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { } else { // Indirect call + DEBUG(dbgs() 
<< "FLO: indirect call detected (not yet supported)\n"); IsSimple = false; break; } } else { if (MIA->hasRIPOperand(Instruction)) { - DEBUG(dbgs() << "FLO: rip-relative instruction found " - "(not supported yet)\n"); - IsSimple = false; - break; + uint64_t TargetAddress{0}; + MCSymbol *TargetSymbol{nullptr}; + if (!MIA->evaluateRIPOperand(Instruction, AbsoluteInstrAddr, + Size, TargetAddress)) { + DEBUG( + dbgs() << "FLO: rip-relative operand could not be evaluated:\n"; + BC.InstPrinter->printInst(&Instruction, dbgs(), "", *BC.STI); + dbgs() << '\n'; + Instruction.dump_pretty(dbgs(), BC.InstPrinter.get()); + dbgs() << '\n'; + ); + IsSimple = false; + break; + } + std::string Name; + auto NI = BC.GlobalAddresses.find(TargetAddress); + if (NI != BC.GlobalAddresses.end()) { + Name = NI->second; + } else { + // Register new "data" symbol at the destination. + Name = (Twine("DATAat0x") + Twine::utohexstr(TargetAddress)).str(); + BC.GlobalAddresses.emplace(std::make_pair(TargetAddress, + Name)); + } + TargetSymbol = Ctx->getOrCreateSymbol(Name); + BC.GlobalSymbols[Name] = TargetAddress; + + MIA->replaceRIPOperandDisp( + Instruction, + MCOperand::createExpr( + MCSymbolRefExpr::create(TargetSymbol, + MCSymbolRefExpr::VK_None, + *Ctx))); } } diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 3196b85c9db0..ebcb1be73372 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -312,13 +312,19 @@ static void OptimizeFile(ELFObjectFileBase *File) { FileSymRefs[Address] = Symbol; + // There's nothing horribly wrong with anonymous symbols, but let's + // ignore them for now. + if (Name->empty()) + continue; + + BC->GlobalAddresses.emplace(std::make_pair(Address, *Name)); + // Only consider ST_Function symbols for functions. Although this // assumption could be broken by assembly functions for which the type - // could be wrong. - if (Symbol.getType() != SymbolRef::ST_Function) { - // FIXME: add it to the address map. 
+ // could be wrong, we skip such entries till the support for + // assembly is implemented. + if (Symbol.getType() != SymbolRef::ST_Function) continue; - } // TODO: populate address map with PLT entries for better readability. @@ -327,11 +333,6 @@ static void OptimizeFile(ELFObjectFileBase *File) { if (SymbolSize == 0) continue; - // There's nothing horribly wrong with anonymous symbols, but let's - // ignore them for now. - if (Name->empty()) - continue; - ErrorOr SectionOrErr = Symbol.getSection(); error(SectionOrErr.getError()); section_iterator Section = *SectionOrErr; From 831c4cc379006f3b45909d68124cf509ad4e49a2 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 12 Oct 2015 14:46:18 -0700 Subject: [PATCH 008/904] Don't bail out if there's no input data file specified. Summary: Don't attempt to read data file if it was not specified by the user. (cherry picked from commit 2424afc3def3da4bdfb260549fe196ded4e0f324) --- bolt/llvm-flo.cpp | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index ebcb1be73372..e8c9bfdbf837 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -693,18 +693,20 @@ int main(int argc, char **argv) { if (!sys::fs::exists(InputFilename)) report_error(InputFilename, errc::no_such_file_or_directory); - if (!sys::fs::exists(InputDataFilename)) - report_error(InputDataFilename, errc::no_such_file_or_directory); - - // Attempt to read input flo data - ErrorOr> ReaderOrErr = - flo::DataReader::readPerfData(InputDataFilename, errs()); - if (std::error_code EC = ReaderOrErr.getError()) - report_error(InputDataFilename, EC); - flo::DataReader &DR = *ReaderOrErr.get().get(); - if (DumpData) { - DR.dump(); - return EXIT_SUCCESS; + if (!InputDataFilename.empty()) { + if (!sys::fs::exists(InputDataFilename)) + report_error(InputDataFilename, errc::no_such_file_or_directory); + + // Attempt to read input flo data + ErrorOr> ReaderOrErr = + 
flo::DataReader::readPerfData(InputDataFilename, errs()); + if (std::error_code EC = ReaderOrErr.getError()) + report_error(InputDataFilename, EC); + flo::DataReader &DR = *ReaderOrErr.get().get(); + if (DumpData) { + DR.dump(); + return EXIT_SUCCESS; + } } // Attempt to open the binary. From 5e0094965cba7981a4c208f75f2754ebfdcae768 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 12 Oct 2015 12:30:47 -0700 Subject: [PATCH 009/904] Add branch count information to binary CFG Summary: Changes DataReader to organize branch perf data per function name and sets up logistics to bring this data to BinaryFunction::buildCFG(). To do this, we expand BinaryContext with a const reference to DataReader. This patch also adds the "-dump-functions" flag to force llvm-flo to dump the current state of BinaryFunctions once they are disassembled and their CFG built, allowing us to test whether the builder is sane with LLVM LIT tests. (cherry picked from commit 3a267a216c69583f5ec481b15277b36e109451dc) --- bolt/BinaryBasicBlock.cpp | 15 ++++++---- bolt/BinaryContext.h | 10 +++++-- bolt/BinaryFunction.cpp | 35 +++++++++++++++++++---- bolt/DataReader.cpp | 58 +++++++++++++++++++++++++++++++++++---- bolt/DataReader.h | 25 ++++++++++++++--- bolt/llvm-flo.cpp | 27 ++++++++++++------ 6 files changed, 139 insertions(+), 31 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 975e8388978f..70a84280215f 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -35,19 +35,24 @@ void BinaryBasicBlock::addSuccessor(BinaryBasicBlock *Succ, uint64_t Count, uint64_t MispredictedCount) { Successors.push_back(Succ); + BranchInfo.push_back({Count, MispredictedCount}); Succ->Predecessors.push_back(this); - - // TODO: update weights. 
} void BinaryBasicBlock::removeSuccessor(BinaryBasicBlock *Succ) { Succ->removePredecessor(this); - auto I = std::find(succ_begin(), succ_end(), Succ); + auto I = succ_begin(); + auto BI = BranchInfo.begin(); + for (; I != succ_end(); ++I) { + assert(BI != BranchInfo.end() && "missing BranchInfo entry"); + if (*I == Succ) + break; + ++BI; + } assert(I != succ_end() && "no such successor!"); Successors.erase(I); - - // TODO: update weights. + BranchInfo.erase(BI); } void BinaryBasicBlock::addPredecessor(BinaryBasicBlock *Pred) { diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index fac66922caf0..0e6f3ca7724f 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -35,6 +35,8 @@ namespace llvm { namespace flo { +class DataReader; + /// Everything that's needed to process binaries lives here. class BinaryContext { @@ -77,6 +79,8 @@ class BinaryContext { MCAsmBackend *MAB; + const DataReader &DR; + BinaryContext(std::unique_ptr Ctx, std::unique_ptr TheTriple, const Target *TheTarget, @@ -89,7 +93,8 @@ class BinaryContext { std::unique_ptr MIA, std::unique_ptr MRI, std::unique_ptr DisAsm, - MCAsmBackend *MAB) : + MCAsmBackend *MAB, + const DataReader &DR) : Ctx(std::move(Ctx)), TheTriple(std::move(TheTriple)), TheTarget(TheTarget), @@ -102,7 +107,8 @@ class BinaryContext { MIA(std::move(MIA)), MRI(std::move(MRI)), DisAsm(std::move(DisAsm)), - MAB(MAB) {} + MAB(MAB), + DR(DR) {} ~BinaryContext() {} }; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 71445f4d37ad..75a3311bafd5 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -24,6 +24,7 @@ #include "BinaryBasicBlock.h" #include "BinaryFunction.h" +#include "DataReader.h" #undef DEBUG_TYPE #define DEBUG_TYPE "flo" @@ -61,8 +62,10 @@ void BinaryFunction::print(raw_ostream &OS, bool PrintInstructions) const { << "\n Orc Section : " << getCodeSectionName() << "\n IsSimple : " << IsSimple << "\n BB count : " << BasicBlocks.size() - << "\n Image : 0x" << 
Twine::utohexstr(ImageAddress) - << "\n}\n"; + << "\n Image : 0x" << Twine::utohexstr(ImageAddress); + if (ExecutionCount != COUNT_NO_PROFILE) + OS << "\n Exec Count : " << ExecutionCount; + OS << "\n}\n"; if (!PrintInstructions || !BC.InstPrinter) return; @@ -122,10 +125,14 @@ void BinaryFunction::print(raw_ostream &OS, bool PrintInstructions) const { if (!BB.Successors.empty()) { OS << " Successors: "; + auto BI = BB.BranchInfo.begin(); auto Sep = ""; for (auto Succ : BB.Successors) { - OS << Sep << Succ->getName(); + assert(BI != BB.BranchInfo.end() && "missing BranchInfo entry"); + OS << Sep << Succ->getName() << " (mispreds: " << BI->MispredictedCount + << ", count: " << BI->Count << ")"; Sep = ", "; + ++BI; } OS << '\n'; } @@ -307,6 +314,13 @@ bool BinaryFunction::buildCFG() { auto &MIA = BC.MIA; + auto BranchDataOrErr = BC.DR.getFuncBranchData(getName()); + if (std::error_code EC = BranchDataOrErr.getError()) { + DEBUG(dbgs() << "no branch data found for \"" << getName() << "\"\n"); + } else { + ExecutionCount = BC.DR.countBranchesTo(getName()); + } + if (!isSimple()) return false; @@ -378,9 +392,18 @@ bool BinaryFunction::buildCFG() { BinaryBasicBlock *ToBB = getBasicBlockAtOffset(Branch.second); assert(ToBB && "cannot find BB containing TO branch"); - // TODO: add weights here. - // - FromBB->addSuccessor(ToBB); + if (std::error_code EC = BranchDataOrErr.getError()) { + FromBB->addSuccessor(ToBB); + } else { + const FuncBranchData &BranchData = BranchDataOrErr.get(); + auto BranchInfoOrErr = BranchData.getBranch(Branch.first, Branch.second); + if (std::error_code EC = BranchInfoOrErr.getError()) { + FromBB->addSuccessor(ToBB); + } else { + const BranchInfo &BInfo = BranchInfoOrErr.get(); + FromBB->addSuccessor(ToBB, BInfo.Branches, BInfo.Mispreds); + } + } } // Add fall-through branches. 
diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index f3cc166a6c2a..1a762735d019 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -18,6 +18,25 @@ namespace llvm { namespace flo { +ErrorOr FuncBranchData::getBranch(uint64_t From, + uint64_t To) const { + for (const auto &I : Data) { + if (I.From.Offset == From && I.To.Offset == To) + return I; + } + return make_error_code(llvm::errc::invalid_argument); +} + +uint64_t +FuncBranchData::countBranchesTo(StringRef FuncName) const { + uint64_t TotalCount = 0; + for (const auto &I : Data) { + if (I.To.Offset == 0 && I.To.Name == FuncName) + TotalCount += I.Branches; + } + return TotalCount; +} + ErrorOr> DataReader::readPerfData(StringRef Path, raw_ostream &Diag) { ErrorOr> MB = @@ -148,16 +167,45 @@ std::error_code DataReader::parse() { return EC; Col = 0; Line += 1; + BranchInfo BI = Res.get(); - ParsedData.emplace_back(std::move(BI)); + StringRef Name = BI.From.Name; + auto I = FuncsMap.find(Name); + if (I == FuncsMap.end()) { + FuncBranchData::ContainerTy Cont; + Cont.emplace_back(std::move(BI)); + FuncsMap.insert( + std::make_pair(Name, FuncBranchData(Name, std::move(Cont)))); + continue; + } + I->getValue().Data.emplace_back(std::move(BI)); } return std::error_code(); } -void DataReader::dump() { - for (auto &BI : ParsedData) { - Diag << BI.From.Name << " " << BI.From.Offset << " " << BI.To.Name << " " - << BI.To.Offset << " " << BI.Mispreds << " " << BI.Branches << "\n"; +ErrorOr +DataReader::getFuncBranchData(StringRef FuncName) const { + const auto I = FuncsMap.find(FuncName); + if (I == FuncsMap.end()) { + return make_error_code(llvm::errc::invalid_argument); + } + return I->getValue(); +} + +uint64_t DataReader::countBranchesTo(StringRef FuncName) const { + uint64_t TotalCount = 0; + for (const auto &KV : FuncsMap) { + TotalCount += KV.getValue().countBranchesTo(FuncName); + } + return TotalCount; +} + +void DataReader::dump() const { + for (const auto &Func : FuncsMap) { + for (const auto 
&BI : Func.getValue().Data) { + Diag << BI.From.Name << " " << BI.From.Offset << " " << BI.To.Name << " " + << BI.To.Offset << " " << BI.Mispreds << " " << BI.Branches << "\n"; + } } } } diff --git a/bolt/DataReader.h b/bolt/DataReader.h index 02b98c31cf99..a47bcf3ab324 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -46,12 +46,28 @@ struct BranchInfo { Branches(Branches) {} }; +class FuncBranchData { +public: + typedef std::vector ContainerTy; + + StringRef Name; + ContainerTy Data; + + FuncBranchData(StringRef Name, ContainerTy Data) + : Name(Name), Data(std::move(Data)) {} + + ErrorOr getBranch(uint64_t From, uint64_t To) const; + uint64_t countBranchesTo(StringRef FuncName) const; +}; + //===----------------------------------------------------------------------===// // /// DataReader Class /// class DataReader { public: + explicit DataReader(raw_ostream &Diag) : Diag(Diag) {} + DataReader(std::unique_ptr MemBuf, raw_ostream &Diag) : FileBuf(std::move(MemBuf)), Diag(Diag), ParsingBuf(FileBuf->getBuffer()), Line(0), Col(0) {} @@ -77,8 +93,11 @@ class DataReader { /// offset 12, with 4 mispredictions and 221 branches std::error_code parse(); + ErrorOr getFuncBranchData(StringRef FuncName) const; + uint64_t countBranchesTo(StringRef FuncName) const; + /// Dumps the entire data structures parsed. Used for debugging. 
- void dump(); + void dump() const; private: @@ -90,15 +109,13 @@ class DataReader { ErrorOr parseBranchInfo(); bool hasData(); - // Owns reader data structures - BumpPtrAllocator Alloc; // An in-memory copy of the input data file - owns strings used in reader std::unique_ptr FileBuf; raw_ostream &Diag; StringRef ParsingBuf; unsigned Line; unsigned Col; - std::vector ParsedData; + StringMap FuncsMap; static const char FieldSeparator = ' '; }; diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index e8c9bfdbf837..90ea3f03d77c 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -87,6 +87,10 @@ static cl::opt DumpData("dump-data", cl::desc("dump parsed flo data (debugging)"), cl::Hidden); +static cl::opt +DumpFunctions("dump-functions", cl::desc("dump parsed functions (debugging)"), + cl::Hidden); + static StringRef ToolName; static void report_error(StringRef Message, std::error_code EC) { @@ -158,7 +162,7 @@ class ExecutableFileMemoryManager : public SectionMemoryManager { /// triple \p TripleName. static std::unique_ptr CreateBinaryContext( std::string ArchName, - std::string TripleName) { + std::string TripleName, const DataReader &DR) { std::string Error; @@ -255,17 +259,18 @@ static std::unique_ptr CreateBinaryContext( std::move(MIA), std::move(MRI), std::move(DisAsm), - MAB); + MAB, + DR); return BC; } -static void OptimizeFile(ELFObjectFileBase *File) { +static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { // FIXME: there should be some way to extract arch and triple information // from the file. 
std::unique_ptr BC = - std::move(CreateBinaryContext("x86-64", "x86_64-unknown-linux")); + std::move(CreateBinaryContext("x86-64", "x86_64-unknown-linux", DR)); if (!BC) { errs() << "failed to create a binary context\n"; return; @@ -421,8 +426,12 @@ static void OptimizeFile(ELFObjectFileBase *File) { if (!Function.buildCFG()) continue; + if (DumpFunctions) + Function.print(errs(), true); } // Iterate over all functions + if (DumpFunctions) + return; // Run optimization passes. // @@ -693,18 +702,18 @@ int main(int argc, char **argv) { if (!sys::fs::exists(InputFilename)) report_error(InputFilename, errc::no_such_file_or_directory); + std::unique_ptr DR(new DataReader(errs())); if (!InputDataFilename.empty()) { if (!sys::fs::exists(InputDataFilename)) report_error(InputDataFilename, errc::no_such_file_or_directory); // Attempt to read input flo data - ErrorOr> ReaderOrErr = - flo::DataReader::readPerfData(InputDataFilename, errs()); + auto ReaderOrErr = flo::DataReader::readPerfData(InputDataFilename, errs()); if (std::error_code EC = ReaderOrErr.getError()) report_error(InputDataFilename, EC); - flo::DataReader &DR = *ReaderOrErr.get().get(); + DR.reset(ReaderOrErr.get().release()); if (DumpData) { - DR.dump(); + DR->dump(); return EXIT_SUCCESS; } } @@ -716,7 +725,7 @@ int main(int argc, char **argv) { Binary &Binary = *BinaryOrErr.get().getBinary(); if (ELFObjectFileBase *e = dyn_cast(&Binary)) { - OptimizeFile(e); + OptimizeFile(e, *DR.get()); } else { report_error(InputFilename, object_error::invalid_file_type); } From 1c5d8f4438b59747e9c8f415a88dc5b1a61cb4a8 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 14 Oct 2015 16:46:59 -0700 Subject: [PATCH 010/904] Converted local offsets from uint64_t to uint32_t. Refactoring. 
(cherry picked from commit 84e892df17746901872ac7e48b14f10fbfa0ba70) --- bolt/BinaryFunction.h | 25 ++++++++++++++++--------- bolt/llvm-flo.cpp | 37 ++++++++++++++----------------------- 2 files changed, 30 insertions(+), 32 deletions(-) diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index c88d5a665a11..86237bd98c21 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -105,22 +105,22 @@ class BinaryFunction { /// Release storage used by instructions. BinaryFunction &clearInstructions() { - std::map TempMap; + InstrMapType TempMap; Instructions.swap(TempMap); return *this; } /// Release storage used by instructions. BinaryFunction &clearLabels() { - std::map TempMap; + LabelsMapType TempMap; Labels.swap(TempMap); return *this; } /// Release memory taken by local branch info. BinaryFunction &clearLocalBranches() { - std::vector> TempVector; - LocalBranches.swap(TempVector); + LocalBranchesListType TempList; + LocalBranches.swap(TempList); return *this; } @@ -129,19 +129,26 @@ class BinaryFunction { return *this; } -public: - std::vector> LocalBranches; + /// Storage for all local branches in the function (non-fall-throughs). + using LocalBranchesListType = std::vector>; + LocalBranchesListType LocalBranches; - std::map Labels; + /// Map offset in the function to a local label. + using LabelsMapType = std::map; + LabelsMapType Labels; /// Temporary holder of instructions before CFG is constructed. - std::map Instructions; + /// Map offset in the function to MCInst. + using InstrMapType = std::map; + InstrMapType Instructions; // Blocks are kept sorted in the layout order. If we need to change the // layout, the terminating instructions need to be modified. 
- typedef std::vector BasicBlockListType; + using BasicBlockListType = std::vector; BasicBlockListType BasicBlocks; +public: + typedef BasicBlockListType::iterator iterator; typedef BasicBlockListType::const_iterator const_iterator; typedef std::reverse_iterator const_reverse_iterator; diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 90ea3f03d77c..06cbcb27395d 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -99,12 +99,10 @@ static void report_error(StringRef Message, std::error_code EC) { exit(1); } -static void error(std::error_code EC) { +static void check_error(std::error_code EC, StringRef Message) { if (!EC) return; - - errs() << ToolName << ": error reading file: " << EC.message() << ".\n"; - exit(1); + report_error(Message, EC); } template @@ -298,7 +296,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { continue; ErrorOr Name = Symbol.getName(); - error(Name.getError()); + check_error(Name.getError(), "cannot get symbol name"); if (Symbol.getType() == SymbolRef::ST_File) { // Could be used for local symbol disambiguation. @@ -307,7 +305,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { } ErrorOr AddressOrErr = Symbol.getAddress(); - error(AddressOrErr.getError()); + check_error(AddressOrErr.getError(), "cannot get symbol address"); uint64_t Address = *AddressOrErr; if (Address == 0) { if (Symbol.getType() == SymbolRef::ST_Function) @@ -339,7 +337,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { continue; ErrorOr SectionOrErr = Symbol.getSection(); - error(SectionOrErr.getError()); + check_error(SectionOrErr.getError(), "cannot get symbol section"); section_iterator Section = *SectionOrErr; if (Section == File->section_end()) { // Could be an absolute symbol. Could record for pretty printing. 
@@ -403,7 +401,8 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { } StringRef SectionContents; - error(Section.getContents(SectionContents)); + check_error(Section.getContents(SectionContents), + "cannot get section contents"); assert(SectionContents.size() == Section.getSize() && "section size mismatch"); @@ -460,22 +459,18 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { } std::error_code EC; + + // This is an object file, which we keep for debugging purposes. + // Once we decide it's useless, we should create it in memory. std::unique_ptr Out = llvm::make_unique(OutputFilename + ".o", - EC,sys::fs::F_None); - - if (EC) { - // FIXME: handle error - return; - } + EC, sys::fs::F_None); + check_error(EC, "cannot create output object file"); std::unique_ptr RealOut = llvm::make_unique(OutputFilename, EC, sys::fs::F_None, 0777); - if (EC) { - // FIXME: handle error - return; - } + check_error(EC, "cannot create output executable file"); // Copy input file. RealOut->os() << File->getData(); @@ -565,11 +560,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { MemoryBuffer::getMemBuffer(BOS->str(), "in-memory object file", false); ErrorOr> ObjOrErr = object::ObjectFile::createObjectFile(ObjectMemBuffer->getMemBufferRef()); - - if (std::error_code EC = ObjOrErr.getError()) { - report_error(InputFilename, EC); - return; - } + check_error(ObjOrErr.getError(), "error creating in-memory object"); std::unique_ptr EFMM(new ExecutableFileMemoryManager()); From 71bc6d5c144de08154f0e909704fc7386185a710 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 13 Oct 2015 10:25:45 -0700 Subject: [PATCH 011/904] Make llvm-flo infer branch count data for fall-through edges Summary: The LBR only has information about taken branches and does not record information when a branch is not taken. In our CFG, we call these edges "fall-through" edges. 
This patch teaches llvm-flo how to infer fall-through edge frequencies. (cherry picked from commit 82b857ad911c268cfe800baa6376d0f8a004ff2e) --- bolt/BinaryBasicBlock.h | 16 ++++++++ bolt/BinaryFunction.cpp | 87 +++++++++++++++++++++++++++++++++++++++-- bolt/BinaryFunction.h | 7 ++++ 3 files changed, 107 insertions(+), 3 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index a7aeda445fdd..9903710007be 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -46,6 +46,9 @@ class BinaryBasicBlock { /// Alignment requirements for the block. uint64_t Alignment{1}; + /// Number of times this basic block was executed. + uint64_t ExecutionCount{COUNT_NO_PROFILE}; + /// Vector of all instructions in the block. std::vector Instructions; @@ -80,6 +83,10 @@ class BinaryBasicBlock { const BinaryBasicBlock &RHS); public: + static constexpr uint64_t COUNT_FALLTHROUGH_EDGE = + std::numeric_limits::max(); + static constexpr uint64_t COUNT_NO_PROFILE = + std::numeric_limits::max(); // Instructions iterators. typedef std::vector::iterator iterator; @@ -192,6 +199,15 @@ class BinaryBasicBlock { /// list of predecessors of /p Succ and update branch info. void removeSuccessor(BinaryBasicBlock *Succ); + /// Return the information about the number of times this basic block was + /// executed. + /// + /// Return COUNT_NO_PROFILE if there's no profile info. + uint64_t getExecutionCount() const { + return ExecutionCount; + } + + private: /// Adds predecessor to the BB. Most likely you don't need to call this. 
diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 75a3311bafd5..39cff4627196 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -94,6 +94,10 @@ void BinaryFunction::print(raw_ostream &OS, bool PrintInstructions) const { OS << BB.getName() << " (" << BB.Instructions.size() << " instructions)\n"; + uint64_t BBExecCount = BB.getExecutionCount(); + if (BBExecCount != BinaryBasicBlock::COUNT_NO_PROFILE) { + OS << " Exec Count : " << BBExecCount << "\n"; + } if (!BB.Predecessors.empty()) { OS << " Predecessors: "; auto Sep = ""; @@ -129,8 +133,15 @@ void BinaryFunction::print(raw_ostream &OS, bool PrintInstructions) const { auto Sep = ""; for (auto Succ : BB.Successors) { assert(BI != BB.BranchInfo.end() && "missing BranchInfo entry"); - OS << Sep << Succ->getName() << " (mispreds: " << BI->MispredictedCount - << ", count: " << BI->Count << ")"; + OS << Sep << Succ->getName(); + if (ExecutionCount != COUNT_NO_PROFILE && + BI->MispredictedCount != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) { + OS << " (mispreds: " << BI->MispredictedCount + << ", count: " << BI->Count << ")"; + } else if (ExecutionCount != COUNT_NO_PROFILE && + BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) { + OS << " (inferred count: " << BI->Count << ")"; + } Sep = ", "; ++BI; } @@ -411,7 +422,8 @@ bool BinaryFunction::buildCFG() { bool IsPrevFT = false; // Is previous block a fall-through. for (auto &BB : BasicBlocks) { if (IsPrevFT) { - PrevBB->addSuccessor(&BB); + PrevBB->addSuccessor(&BB, BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE, + BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE); } MCInst &LastInst = BB.back(); @@ -432,6 +444,11 @@ bool BinaryFunction::buildCFG() { DEBUG(dbgs() << "last block was marked as a fall-through\n"); } + // Infer frequency for non-taken branches + if (ExecutionCount != COUNT_NO_PROFILE && !BranchDataOrErr.getError()) { + inferFallThroughCounts(); + } + // Clean-up memory taken by instructions and labels. 
clearInstructions(); clearLabels(); @@ -446,6 +463,70 @@ bool BinaryFunction::buildCFG() { return true; } +void BinaryFunction::inferFallThroughCounts() { + assert(!BasicBlocks.empty() && "basic block list should not be empty"); + + // Compute preliminary execution time for each basic block + for (auto &CurBB : BasicBlocks) { + if (&CurBB == &*BasicBlocks.begin()) { + CurBB.ExecutionCount = ExecutionCount; + continue; + } + CurBB.ExecutionCount = 0; + } + + for (auto &CurBB : BasicBlocks) { + auto SuccCount = CurBB.BranchInfo.begin(); + for (auto Succ : CurBB.successors()) { + if (SuccCount->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) + Succ->ExecutionCount += SuccCount->Count; + ++SuccCount; + } + } + + // Work on a basic block at a time, propagating frequency information forwards + // It is important to walk in the layour order + for (auto &CurBB : BasicBlocks) { + uint64_t BBExecCount = CurBB.getExecutionCount(); + + // Propagate this information to successors, filling in fall-through edges + // with frequency information + if (CurBB.succ_size() == 0) + continue; + + // Calculate frequency of outgoing branches from this node according to + // LBR data + uint64_t ReportedBranches = 0; + for (auto &SuccCount : CurBB.BranchInfo) { + if (SuccCount.Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) + ReportedBranches += SuccCount.Count; + } + + // Infer the frequency of the fall-through edge, representing not taking the + // branch + uint64_t Inferred = 0; + if (BBExecCount > ReportedBranches) + Inferred = BBExecCount - ReportedBranches; + if (BBExecCount < ReportedBranches) + errs() << "FLO-WARNING: Fall-through inference is slightly inconsistent. " + "BB exec frequency is less than the outgoing edges frequency\n"; + + // Put this information into the fall-through edge + if (CurBB.succ_size() == 0) + continue; + // If there is a FT, the last successor will be it. 
+ auto &SuccCount = CurBB.BranchInfo.back(); + auto &Succ = CurBB.Successors.back(); + if (SuccCount.Count == BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) { + SuccCount.Count = Inferred; + Succ->ExecutionCount += Inferred; + } + + } // end for (CurBB : BasicBlocks) + + return; +} + } // namespace flo } // namespace llvm diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 86237bd98c21..5baefb8e27f0 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -368,6 +368,13 @@ class BinaryFunction { /// State::CFG. Returns false if CFG cannot be built. bool buildCFG(); + /// Walks the list of basic blocks filling in missing information about + /// edge frequency for fall-throughs. + /// + /// Assumes the CFG has been built and edge frequency for taken branches + /// has been filled with LBR data. + void inferFallThroughCounts(); + virtual ~BinaryFunction() {} }; From 1e2e6d1811a3aeaa3c48ba9f521b097bb3e4404b Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 13 Oct 2015 12:18:54 -0700 Subject: [PATCH 012/904] Teach llvm-flo how to reorder basic blocks with a heuristic Summary: This patch introduces a first approach to reorder basic blocks based on profiling data that gives us the execution frequency for each edge. Our strategy is to layout basic blocks in a order that maximizes the weight (hotness) of branches that will be deleted. We can delete branches when src comes right before dst in the new layout order. This can be reduced to the TSP problem. This patch uses a greedy heuristic to solve the problem: we start with a graph with no edges and progressively add edges by choosing the hottest edges first, building a layout order that attempts to put BBs with hot edges together. 
(cherry picked from commit 2831488d835a3279d932b3f20315f645a996a2a5) --- bolt/BinaryFunction.cpp | 156 ++++++++++++++++++++++++++++++++++++++++ bolt/BinaryFunction.h | 12 +++- bolt/llvm-flo.cpp | 20 ++++-- 3 files changed, 182 insertions(+), 6 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 39cff4627196..447b895dd30b 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -20,6 +20,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include +#include #include #include "BinaryBasicBlock.h" @@ -387,6 +388,11 @@ bool BinaryFunction::buildCFG() { } } + // Set the basic block layout to the original order + for (auto &BB : BasicBlocks) { + BasicBlocksLayout.emplace_back(&BB); + } + // Intermediate dump. DEBUG(print(dbgs(), /* PrintInstructions = */ true)); @@ -527,6 +533,156 @@ void BinaryFunction::inferFallThroughCounts() { return; } +void BinaryFunction::optimizeLayout(bool DumpLayout) { + // Bail if no profiling information + if (getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE) { + return; + } + + if (DumpLayout) { + dbgs() << "running block layout heuristics on " << getName() << "\n"; + } + + // Greedy heuristic implementation for the "TSP problem", applied to BB + // layout. Try to maximize weight during a path traversing all BBs. In this + // way, we will convert the hottest branches into fall-throughs. 
+ + // Encode an edge between two basic blocks, source and destination + typedef std::pair EdgeTy; + std::map Weight; + + // Define a comparison function to establish SWO between edges + auto Comp = [&Weight](EdgeTy A, EdgeTy B) { return Weight[A] > Weight[B]; }; + std::priority_queue, decltype(Comp)> Queue(Comp); + + typedef std::vector ClusterTy; + typedef std::map BBToClusterMapTy; + std::vector Clusters; + BBToClusterMapTy BBToClusterMap; + + // Populating priority queue with all edges + for (auto &BB : BasicBlocks) { + BBToClusterMap[&BB] = -1; // Mark as unmapped + auto BI = BB.BranchInfo.begin(); + for (auto &I : BB.successors()) { + if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) + Weight[std::make_pair(&BB, I)] = BI->Count; + Queue.push(std::make_pair(&BB, I)); + ++BI; + } + } + + // Start a cluster with the entry point + BinaryBasicBlock *Entry = &*BasicBlocks.begin(); + Clusters.emplace_back(); + auto &EntryCluster = Clusters.back(); + EntryCluster.push_back(Entry); + BBToClusterMap[Entry] = 0; + + // Grow clusters in a greedy fashion + while (!Queue.empty()) { + auto elmt = Queue.top(); + Queue.pop(); + + BinaryBasicBlock *BBSrc = elmt.first; + BinaryBasicBlock *BBDst = elmt.second; + int I = 0, J = 0; + + // Case 1: BBSrc and BBDst are the same. Ignore this edge + if (BBSrc == BBDst) + continue; + + // Case 2: Both BBSrc and BBDst are already allocated + if ((I = BBToClusterMap[BBSrc]) != -1 && + (J = BBToClusterMap[BBDst]) != -1) { + auto &ClusterA = Clusters[I]; + auto &ClusterB = Clusters[J]; + if (ClusterA.back() == BBSrc && ClusterB.front() == BBDst) { + // Case 2a: BBSrc is at the end of a cluster and BBDst is at the start, + // allowing us to merge two clusters + for (auto BB : ClusterB) + BBToClusterMap[BB] = I; + ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end()); + ClusterB.clear(); + } else { + // Case 2b: Both BBSrc and BBDst are allocated in positions we cannot + // merge them, so we ignore this edge. 
+ } + continue; + } + + // Case 3: BBSrc is already allocated in a cluster + if ((I = BBToClusterMap[BBSrc]) != -1) { + auto &Cluster = Clusters[I]; + if (Cluster.back() == BBSrc) { + // Case 3a: BBSrc is allocated at the end of this cluster. We put + // BBSrc and BBDst together. + Cluster.push_back(BBDst); + BBToClusterMap[BBDst] = I; + } else { + // Case 3b: We cannot put BBSrc and BBDst in consecutive positions, + // so we ignore this edge. + } + continue; + } + + // Case 4: BBSrc is not in a cluster, but BBDst is + if ((I = BBToClusterMap[BBDst]) != -1) { + auto &Cluster = Clusters[I]; + if (Cluster.front() == BBDst) { + // Case 4a: BBDst is allocated at the start of this cluster. We put + // BBSrc and BBDst together. + Cluster.insert(Cluster.begin(), BBSrc); + BBToClusterMap[BBSrc] = I; + } else { + // Case 4b: We cannot put BBSrc and BBDst in consecutive positions, + // so we ignore this edge. + } + continue; + } + + // Case 5: Both BBSrc and BBDst are unallocated, so we create a new cluster + // with them + I = Clusters.size(); + Clusters.emplace_back(); + auto &Cluster = Clusters.back(); + Cluster.push_back(BBSrc); + Cluster.push_back(BBDst); + BBToClusterMap[BBSrc] = I; + BBToClusterMap[BBDst] = I; + } + + // Define final function layout based on clusters + BasicBlocksLayout.clear(); + for (auto &Cluster : Clusters) { + BasicBlocksLayout.insert(BasicBlocksLayout.end(), Cluster.begin(), + Cluster.end()); + } + + // Finalize layout with BBs that weren't assigned to any cluster, preserving + // their relative order + for (auto &BB : BasicBlocks) { + if (BBToClusterMap[&BB] == -1) + BasicBlocksLayout.push_back(&BB); + } + + if (DumpLayout) { + dbgs() << "original BB order is: "; + auto Sep = ""; + for (auto &BB : BasicBlocks) { + dbgs() << Sep << BB.getName(); + Sep = ","; + } + dbgs() << "\nnew order is: "; + Sep = ""; + for (auto BB : BasicBlocksLayout) { + dbgs() << Sep << BB->getName(); + Sep = ","; + } + dbgs() << "\n"; + } +} + } // namespace flo } // 
namespace llvm diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 5baefb8e27f0..4507ff8a1c71 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -143,9 +143,12 @@ class BinaryFunction { InstrMapType Instructions; // Blocks are kept sorted in the layout order. If we need to change the - // layout, the terminating instructions need to be modified. + // layout (if BasicBlocksLayout stores a different order than BasicBlocks), + // the terminating instructions need to be modified. using BasicBlockListType = std::vector; + using BasicBlockOrderType = std::vector; BasicBlockListType BasicBlocks; + BasicBlockOrderType BasicBlocksLayout; public: @@ -153,6 +156,7 @@ class BinaryFunction { typedef BasicBlockListType::const_iterator const_iterator; typedef std::reverse_iterator const_reverse_iterator; typedef std::reverse_iterator reverse_iterator; + typedef BasicBlockOrderType::iterator order_iterator; // CFG iterators. iterator begin() { return BasicBlocks.begin(); } @@ -172,6 +176,10 @@ class BinaryFunction { const BinaryBasicBlock & back() const { return BasicBlocks.back(); } BinaryBasicBlock & back() { return BasicBlocks.back(); } + inline iterator_range layout() { + return iterator_range(BasicBlocksLayout.begin(), + BasicBlocksLayout.end()); + } BinaryFunction(StringRef Name, SymbolRef Symbol, SectionRef Section, uint64_t Address, uint64_t Size, BinaryContext &BC) : @@ -180,7 +188,7 @@ class BinaryFunction { /// Perform optimal code layout based on edge frequencies making necessary /// adjustments to instructions at the end of basic blocks. 
- void optimizeLayout(); + void optimizeLayout(bool DumpLayout); /// View CFG in graphviz program void viewGraph(); diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 06cbcb27395d..2035b1003c76 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#include "DataReader.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" @@ -50,6 +49,7 @@ #include "BinaryBasicBlock.h" #include "BinaryContext.h" #include "BinaryFunction.h" +#include "DataReader.h" #include #include @@ -83,6 +83,11 @@ EliminateUnreachable("eliminate-unreachable", cl::desc("eliminate unreachable code"), cl::Optional); +static cl::opt +ReorderBlocks("reorder-blocks", + cl::desc("redo basic block layout based on profiling data"), + cl::Optional); + static cl::opt DumpData("dump-data", cl::desc("dump parsed flo data (debugging)"), cl::Hidden); @@ -91,6 +96,10 @@ static cl::opt DumpFunctions("dump-functions", cl::desc("dump parsed functions (debugging)"), cl::Hidden); +static cl::opt +DumpLayout("dump-layout", cl::desc("dump parsed flo data (debugging)"), + cl::Hidden); + static StringRef ToolName; static void report_error(StringRef Message, std::error_code EC) { @@ -456,6 +465,9 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { DEBUG(dbgs() << "*** After unreachable block elimination ***\n"); DEBUG(Function.print(dbgs(), /* PrintInstructions = */ true)); } + if (ReorderBlocks) { + BFI.second.optimizeLayout(DumpLayout); + } } std::error_code EC; @@ -539,9 +551,9 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { Streamer->EmitLabel(FunctionSymbol); // Emit code. 
- for (const auto &BB : Function) { - Streamer->EmitLabel(BB.getLabel()); - for (const auto &Instr : BB) { + for (auto BB : Function.layout()) { + Streamer->EmitLabel(BB->getLabel()); + for (const auto &Instr : *BB) { Streamer->EmitInstruction(Instr, *BC->STI); } } From 32cff663cc74a614593dbcfbfa0263675d2748ba Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 14 Oct 2015 16:58:55 -0700 Subject: [PATCH 013/904] Teach llvm-flo how to reorder blocks in an optimal way Summary: This patch implements a dynamic programming approach to solve reorder basic blocks with profiling information in an optimal way. Since this is analogous to TSP, it is NP-hard and the algorithm is exponential in time and memory consumption. Therefore, we only use the optimal algorithm to decide the layout of small functions (with less than 11 basic blocks). (cherry picked from commit 882906726267bddeed543dcbaa20449fd5a0a25d) --- bolt/BinaryFunction.cpp | 122 ++++++++++++++++++++++++++++++++++++++++ bolt/BinaryFunction.h | 12 ++++ 2 files changed, 134 insertions(+) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 447b895dd30b..a61c65b8c425 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -539,6 +539,10 @@ void BinaryFunction::optimizeLayout(bool DumpLayout) { return; } + // Work on optimal solution if problem is small enough + if (BasicBlocks.size() <= FUNC_SIZE_THRESHOLD) + return solveOptimalLayout(DumpLayout); + if (DumpLayout) { dbgs() << "running block layout heuristics on " << getName() << "\n"; } @@ -683,6 +687,124 @@ void BinaryFunction::optimizeLayout(bool DumpLayout) { } } +void BinaryFunction::solveOptimalLayout(bool DumpLayout) { + std::vector> Weight; + std::map BBToIndex; + std::vector IndexToBB; + + if (DumpLayout) { + dbgs() << "finding optimal block layout for " << getName() << "\n"; + } + + unsigned N = BasicBlocks.size(); + // Populating weight map and index map + for (auto &BB : BasicBlocks) { + BBToIndex[&BB] = IndexToBB.size(); + 
IndexToBB.push_back(&BB); + } + Weight.resize(N); + for (auto &BB : BasicBlocks) { + auto BI = BB.BranchInfo.begin(); + Weight[BBToIndex[&BB]].resize(N); + for (auto &I : BB.successors()) { + if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) + Weight[BBToIndex[&BB]][BBToIndex[I]] = BI->Count; + ++BI; + } + } + + std::vector> DP; + DP.resize(1 << N); + for (auto &Elmt : DP) { + Elmt.resize(N, -1); + } + // Start with the entry basic block being allocated with cost zero + DP[1][0] = 0; + // Walk through TSP solutions using a bitmask to represent state (current set + // of BBs in the layout) + unsigned BestSet = 1; + unsigned BestLast = 0; + int64_t BestWeight = 0; + for (unsigned Set = 1; Set < (1U << N); ++Set) { + // Traverse each possibility of Last BB visited in this layout + for (unsigned Last = 0; Last < N; ++Last) { + // Case 1: There is no possible layout with this BB as Last + if (DP[Set][Last] == -1) + continue; + + // Case 2: There is a layout with this Set and this Last, and we try + // to expand this set with New + for (unsigned New = 1; New < N; ++New) { + // Case 2a: BB "New" is already in this Set + if ((Set & (1 << New)) != 0) + continue; + + // Case 2b: BB "New" is not in this set and we add it to this Set and + // record total weight of this layout with "New" as the last BB. 
+ unsigned NewSet = (Set | (1 << New)); + if (DP[NewSet][New] == -1) + DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New]; + DP[NewSet][New] = std::max(DP[NewSet][New], + DP[Set][Last] + (int64_t)Weight[Last][New]); + + if (DP[NewSet][New] > BestWeight) { + BestWeight = DP[NewSet][New]; + BestSet = NewSet; + BestLast = New; + } + } + } + } + + // Define final function layout based on layout that maximizes weight + BasicBlocksLayout.clear(); + unsigned Last = BestLast; + unsigned Set = BestSet; + std::vector Visited; + Visited.resize(N); + Visited[Last] = true; + BasicBlocksLayout.push_back(IndexToBB[Last]); + Set = Set & ~(1U << Last); + while (Set != 0) { + int64_t Best = -1; + for (unsigned I = 0; I < N; ++I) { + if (DP[Set][I] == -1) + continue; + if (DP[Set][I] > Best) { + Last = I; + Best = DP[Set][I]; + } + } + Visited[Last] = true; + BasicBlocksLayout.push_back(IndexToBB[Last]); + Set = Set & ~(1U << Last); + } + std::reverse(BasicBlocksLayout.begin(), BasicBlocksLayout.end()); + + // Finalize layout with BBs that weren't assigned to the layout + for (auto &BB : BasicBlocks) { + if (Visited[BBToIndex[&BB]] == false) + BasicBlocksLayout.push_back(&BB); + } + + if (DumpLayout) { + dbgs() << "original BB order is: "; + auto Sep = ""; + for (auto &BB : BasicBlocks) { + dbgs() << Sep << BB.getName(); + Sep = ","; + } + dbgs() << "\nnew order is: "; + Sep = ""; + for (auto BB : BasicBlocksLayout) { + dbgs() << Sep << BB->getName(); + Sep = ","; + } + dbgs() << "\n"; + DEBUG(print(dbgs(), /* PrintInstructions = */ true)); + } +} + } // namespace flo } // namespace llvm diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 4507ff8a1c71..d56c1ff17880 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -54,6 +54,9 @@ class BinaryFunction { static constexpr uint64_t COUNT_NO_PROFILE = std::numeric_limits::max(); + // Function size, in number of BBs, above which we fallback to a heuristic + // solution to the layout problem instead of 
seeking the optimal one. + static constexpr uint64_t FUNC_SIZE_THRESHOLD = 10; private: @@ -190,6 +193,15 @@ class BinaryFunction { /// adjustments to instructions at the end of basic blocks. void optimizeLayout(bool DumpLayout); + /// Dynamic programming implementation for the "TSP problem", applied to BB + /// layout. Find the optimal way to maximize weight during a path traversing + /// all BBs. In this way, we will convert the hottest branches into + /// fall-throughs. + /// + /// Uses exponential amount of memory on the number of basic blocks and should + /// only be used for small functions. + void solveOptimalLayout(bool DumpLayout); + /// View CFG in graphviz program void viewGraph(); From c741099ba9a9c87842b7f97b3c4b5349ddc0da66 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 16 Oct 2015 17:00:36 -0700 Subject: [PATCH 014/904] Fix DataReader to work with new local sym perf2flo format Summary: In a recent commit, we changed local symbols to be specially tagged with the number 2 (local sym) instead of 1 (sym). This patch modifies the reader to don't choke when seeing a 2 in the symbol id field. (cherry picked from commit 79eec18dbada471d5bdeae0966259890286b6762) --- bolt/DataReader.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index 1a762735d019..a0db59b401d2 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -92,12 +92,14 @@ ErrorOr DataReader::parseNumberField(char EndChar) { ErrorOr DataReader::parseLocation() { // Read whether the location of the branch should be DSO or a symbol - if (ParsingBuf[0] != '0' && ParsingBuf[0] != '1') { - reportError("expected 0 or 1"); + // 0 means it is a DSO. 1 means it is a global symbol. 2 means it is a local + // symbol. 
+ if (ParsingBuf[0] != '0' && ParsingBuf[0] != '1' && ParsingBuf[0] != '2') { + reportError("expected 0, 1 or 2"); return make_error_code(llvm::errc::io_error); } - bool IsSymbol = ParsingBuf[0] == '1'; + bool IsSymbol = ParsingBuf[0] == '1' || ParsingBuf[0] == '2'; ParsingBuf = ParsingBuf.drop_front(1); Col += 1; @@ -153,7 +155,7 @@ bool DataReader::hasData() { if (ParsingBuf.size() == 0) return false; - if (ParsingBuf[0] == '0' || ParsingBuf[0] == '1') + if (ParsingBuf[0] == '0' || ParsingBuf[0] == '1' || ParsingBuf[0] == '2') return true; return false; } From 44f48c4294d0dc577837c4cb1c55e4eccf5db875 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 16 Oct 2015 17:15:00 -0700 Subject: [PATCH 015/904] Fix comments. NFC. Summary: Updated comments in BinaryFunction class. (cherry picked from commit 4958367e8e1e7a348c82e1526d881217a93cf111) --- bolt/BinaryFunction.cpp | 6 +++--- bolt/BinaryFunction.h | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index a61c65b8c425..5ac5f9692a01 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -547,9 +547,9 @@ void BinaryFunction::optimizeLayout(bool DumpLayout) { dbgs() << "running block layout heuristics on " << getName() << "\n"; } - // Greedy heuristic implementation for the "TSP problem", applied to BB - // layout. Try to maximize weight during a path traversing all BBs. In this - // way, we will convert the hottest branches into fall-throughs. + // Greedy heuristic implementation for the TSP, applied to BB layout. Try to + // maximize weight during a path traversing all BBs. In this way, we will + // convert the hottest branches into fall-throughs. 
// Encode an edge between two basic blocks, source and destination typedef std::pair EdgeTy; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index d56c1ff17880..f6cb9a882b10 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -193,10 +193,9 @@ class BinaryFunction { /// adjustments to instructions at the end of basic blocks. void optimizeLayout(bool DumpLayout); - /// Dynamic programming implementation for the "TSP problem", applied to BB - /// layout. Find the optimal way to maximize weight during a path traversing - /// all BBs. In this way, we will convert the hottest branches into - /// fall-throughs. + /// Dynamic programming implementation for the TSP, applied to BB layout. Find + /// the optimal way to maximize weight during a path traversing all BBs. In + /// this way, we will convert the hottest branches into fall-throughs. /// /// Uses exponential amount of memory on the number of basic blocks and should /// only be used for small functions. From 6c6ab4fa69d8d80377dc8db49293807242e36dbd Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 14 Oct 2015 15:35:14 -0700 Subject: [PATCH 016/904] Make FLO work on hhvm binary. Summary: Fixes several issues that prevented us from running hhvm binary. 
(cherry picked from commit e28912b476d4fe7a15b668305ac258c4f2ec7f69) --- bolt/BinaryBasicBlock.cpp | 7 +- bolt/BinaryBasicBlock.h | 2 - bolt/BinaryContext.cpp | 47 +++++++++++ bolt/BinaryContext.h | 15 ++-- bolt/BinaryFunction.cpp | 50 ++++-------- bolt/BinaryFunction.h | 11 +-- bolt/CMakeLists.txt | 1 + bolt/llvm-flo.cpp | 168 ++++++++++++++++++++++++-------------- 8 files changed, 188 insertions(+), 113 deletions(-) create mode 100644 bolt/BinaryContext.cpp diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 70a84280215f..225b04f02eaa 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -9,6 +9,8 @@ // //===----------------------------------------------------------------------===// +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -17,14 +19,10 @@ #include #include -#include "BinaryBasicBlock.h" -#include "BinaryFunction.h" - #undef DEBUG_TYPE #define DEBUG_TYPE "flo" namespace llvm { - namespace flo { bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) { @@ -66,5 +64,4 @@ void BinaryBasicBlock::removePredecessor(BinaryBasicBlock *Pred) { } } // namespace flo - } // namespace llvm diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 9903710007be..11d94ee1fe9b 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -28,7 +28,6 @@ #include namespace llvm { - namespace flo { class BinaryFunction; @@ -222,7 +221,6 @@ bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS); } // namespace flo - } // namespace llvm #endif diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp new file mode 100644 index 000000000000..eb0f6dd6ef72 --- /dev/null +++ b/bolt/BinaryContext.cpp @@ -0,0 +1,47 @@ +//===--- BinaryContext.cpp - Interface for machine-level context ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the 
University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "BinaryContext.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSymbol.h" + +namespace llvm { +namespace flo { + +MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, + Twine Prefix) { + MCSymbol *Symbol{nullptr}; + std::string Name; + auto NI = GlobalAddresses.find(Address); + if (NI != GlobalAddresses.end()) { + // Even though there could be multiple names registered at the address, + // we only use the first one. + Name = NI->second; + } else { + Name = (Prefix + "0x" + Twine::utohexstr(Address)).str(); + assert(GlobalSymbols.find(Name) == GlobalSymbols.end() && + "created name is not unique"); + GlobalAddresses.emplace(std::make_pair(Address, Name)); + } + + Symbol = Ctx->lookupSymbol(Name); + if (Symbol) + return Symbol; + + Symbol = Ctx->getOrCreateSymbol(Name); + GlobalSymbols[Name] = Address; + + return Symbol; +} + +} // namespace flo +} // namespace llvm diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 0e6f3ca7724f..ab0e9888941a 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -7,6 +7,8 @@ // //===----------------------------------------------------------------------===// // +// Context for processing binary executables in files and/or memory. +// //===----------------------------------------------------------------------===// #ifndef LLVM_TOOLS_LLVM_FLO_BINARY_CONTEXT_H @@ -24,27 +26,25 @@ #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/Support/TargetRegistry.h" - #include #include #include #include namespace llvm { - namespace flo { class DataReader; -/// Everything that's needed to process binaries lives here. 
class BinaryContext { BinaryContext() = delete; public: - // [name] -> [address] + // [name] -> [address] map used for global symbol resolution. typedef std::map SymbolMapType; SymbolMapType GlobalSymbols; @@ -111,10 +111,15 @@ class BinaryContext { DR(DR) {} ~BinaryContext() {} + + /// Return a global symbol registered at a given \p Address. If no symbol + /// exists, create one with unique name using \p Prefix. + /// If there are multiple symbols registered at the \p Address, then + /// return the first one. + MCSymbol *getOrCreateGlobalSymbol(uint64_t Address, Twine Prefix); }; } // namespace flo - } // namespace llvm #endif diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 5ac5f9692a01..cdda810ae15c 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -10,6 +10,9 @@ //===----------------------------------------------------------------------===// +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" +#include "DataReader.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -23,15 +26,10 @@ #include #include -#include "BinaryBasicBlock.h" -#include "BinaryFunction.h" -#include "DataReader.h" - #undef DEBUG_TYPE #define DEBUG_TYPE "flo" namespace llvm { - namespace flo { BinaryBasicBlock * @@ -181,7 +179,16 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { break; } + if (MIA->isUnsupported(Instruction)) { + DEBUG(dbgs() << "FLO: unsupported instruction seen. Skipping function " + << getName() << ".\n"); + IsSimple = false; + break; + } + if (MIA->isIndirectBranch(Instruction)) { + DEBUG(dbgs() << "FLO: indirect branch seen. Skipping function " + << getName() << ".\n"); IsSimple = false; break; } @@ -231,21 +238,8 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { } else { // This is a call regardless of the opcode (e.g. tail call). IsCall = true; - // Check if we already have a symbol at this address. 
- std::string Name; - auto NI = BC.GlobalAddresses.find(InstructionTarget); - if (NI != BC.GlobalAddresses.end()) { - // Any registered name will do. - Name = NI->second; - } else { - // Create a new symbol at the destination. - Name = (Twine("FUNCat0x") + - Twine::utohexstr(InstructionTarget)).str(); - BC.GlobalAddresses.emplace(std::make_pair(InstructionTarget, - Name)); - } - TargetSymbol = Ctx->getOrCreateSymbol(Name); - BC.GlobalSymbols[Name] = InstructionTarget; + TargetSymbol = BC.getOrCreateGlobalSymbol(InstructionTarget, + "FUNCat"); } } @@ -282,19 +276,8 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { IsSimple = false; break; } - std::string Name; - auto NI = BC.GlobalAddresses.find(TargetAddress); - if (NI != BC.GlobalAddresses.end()) { - Name = NI->second; - } else { - // Register new "data" symbol at the destination. - Name = (Twine("DATAat0x") + Twine::utohexstr(TargetAddress)).str(); - BC.GlobalAddresses.emplace(std::make_pair(TargetAddress, - Name)); - } - TargetSymbol = Ctx->getOrCreateSymbol(Name); - BC.GlobalSymbols[Name] = TargetAddress; - + // FIXME: check that the address is in data, not in code. 
+ TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, "DATAat"); MIA->replaceRIPOperandDisp( Instruction, MCOperand::createExpr( @@ -806,5 +789,4 @@ void BinaryFunction::solveOptimalLayout(bool DumpLayout) { } } // namespace flo - } // namespace llvm diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index f6cb9a882b10..32af380e1dc0 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -17,6 +17,8 @@ #ifndef LLVM_TOOLS_LLVM_FLO_BINARY_FUNCTION_H #define LLVM_TOOLS_LLVM_FLO_BINARY_FUNCTION_H +#include "BinaryBasicBlock.h" +#include "BinaryContext.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/ilist.h" #include "llvm/MC/MCCodeEmitter.h" @@ -31,13 +33,9 @@ #include "llvm/Support/raw_ostream.h" #include -#include "BinaryBasicBlock.h" -#include "BinaryContext.h" - using namespace llvm::object; namespace llvm { - namespace flo { /// BinaryFunction is a representation of machine-level function. @@ -184,10 +182,10 @@ class BinaryFunction { BasicBlocksLayout.end()); } - BinaryFunction(StringRef Name, SymbolRef Symbol, SectionRef Section, + BinaryFunction(std::string Name, SymbolRef Symbol, SectionRef Section, uint64_t Address, uint64_t Size, BinaryContext &BC) : Name(Name), Symbol(Symbol), Section(Section), Address(Address), - Size(Size), BC(BC), CodeSectionName((".text." + Name).str()) {} + Size(Size), BC(BC), CodeSectionName(".text." + Name) {} /// Perform optimal code layout based on edge frequencies making necessary /// adjustments to instructions at the end of basic blocks. 
@@ -411,7 +409,6 @@ inline raw_ostream &operator<<(raw_ostream &OS, } } // namespace flo - } // namespace llvm #endif diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index b8720c3d9a64..a66505c5d097 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -13,6 +13,7 @@ set(LLVM_LINK_COMPONENTS add_llvm_tool(llvm-flo llvm-flo.cpp BinaryBasicBlock.cpp + BinaryContext.cpp BinaryFunction.cpp DataReader.cpp ) diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 2035b1003c76..657ebb50b14e 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -13,6 +13,10 @@ // //===----------------------------------------------------------------------===// +#include "BinaryBasicBlock.h" +#include "BinaryContext.h" +#include "BinaryFunction.h" +#include "DataReader.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" @@ -45,12 +49,6 @@ #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Target/TargetMachine.h" - -#include "BinaryBasicBlock.h" -#include "BinaryContext.h" -#include "BinaryFunction.h" -#include "DataReader.h" - #include #include #include @@ -62,7 +60,8 @@ using namespace llvm; using namespace object; using namespace flo; -// Tool options. 
+namespace opts { + static cl::opt InputFilename(cl::Positional, cl::desc(""), cl::Required); @@ -78,6 +77,17 @@ FunctionNames("funcs", cl::desc("list of functions to optimize"), cl::value_desc("func1,func2,func3,...")); +static cl::list +SkipFunctionNames("skip_funcs", + cl::CommaSeparated, + cl::desc("list of functions to skip"), + cl::value_desc("func1,func2,func3,...")); + +static cl::opt +MaxFunctions("max_funcs", + cl::desc("maximum # of functions to overwrite"), + cl::Optional); + static cl::opt EliminateUnreachable("eliminate-unreachable", cl::desc("eliminate unreachable code"), @@ -99,6 +109,7 @@ DumpFunctions("dump-functions", cl::desc("dump parsed functions (debugging)"), static cl::opt DumpLayout("dump-layout", cl::desc("dump parsed flo data (debugging)"), cl::Hidden); +} // namespace opts static StringRef ToolName; @@ -283,7 +294,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { return; } - // Store all non-zero file symbols in this map for quick address lookup. + // Store all non-zero symbols in this map for a quick address lookup. std::map FileSymRefs; // Entry point to the binary. @@ -298,7 +309,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { // For local symbols we want to keep track of associated FILE symbol for // disambiguation by name. std::map BinaryFunctions; - StringRef FileSymbolName; + std::string FileSymbolName; for (const SymbolRef &Symbol : File->symbols()) { // Keep undefined symbols for pretty printing? if (Symbol.getFlags() & SymbolRef::SF_Undefined) @@ -329,7 +340,36 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { if (Name->empty()) continue; - BC->GlobalAddresses.emplace(std::make_pair(Address, *Name)); + // Disambiguate all local symbols before adding to symbol table. + // Since we don't know if we'll see a global with the same name, + // always modify the local name. 
+ std::string UniqueName; + if (Symbol.getFlags() & SymbolRef::SF_Global) { + assert(BC->GlobalSymbols.find(*Name) == BC->GlobalSymbols.end() && + "global name not unique"); + UniqueName = *Name; + } else { + unsigned LocalCount = 1; + std::string LocalName = (*Name).str() + "/" + FileSymbolName + "/"; + while (BC->GlobalSymbols.find(LocalName + std::to_string(LocalCount)) != + BC->GlobalSymbols.end()) { + ++LocalCount; + } + UniqueName = LocalName + std::to_string(LocalCount); + } + + /// It's possible we are seeing a globalized local. Even though + /// we've made the name unique, LLVM might still treat it as local + /// if it has a "private global" prefix, e.g. ".L". Thus we have to + /// change the prefix to enforce global scope of the symbol. + if (StringRef(UniqueName).startswith(BC->AsmInfo->getPrivateGlobalPrefix())) + UniqueName = "PG." + UniqueName; + + // Add the name to global symbols map. + BC->GlobalSymbols[UniqueName] = Address; + + // Add to the reverse map. There could multiple names at the same address. + BC->GlobalAddresses.emplace(std::make_pair(Address, UniqueName)); // Only consider ST_Function symbols for functions. Although this // assumption could be broken by assembly functions for which the type @@ -353,35 +393,12 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { continue; } - // Disambiguate local function name. Since we don't know if we'll see - // a global with the same name, always modify the local function name. 
- std::string UniqueFunctionName; - if (!(Symbol.getFlags() & SymbolRef::SF_Global)) { - unsigned LocalCount = 1; - auto LocalName = *Name + "/" + FileSymbolName + "/"; - while (BC->GlobalSymbols.find((LocalName + Twine(LocalCount)).str()) != - BC->GlobalSymbols.end()) { - ++LocalCount; - } - UniqueFunctionName = (LocalName + Twine(LocalCount)).str(); - } else { - auto I = BC->GlobalSymbols.find(*Name); - assert(I == BC->GlobalSymbols.end() && "global name not unique"); - UniqueFunctionName = *Name; - } - // Create the function and add to the map. BinaryFunctions.emplace( Address, - BinaryFunction(UniqueFunctionName, Symbol, *Section, Address, + BinaryFunction(UniqueName, Symbol, *Section, Address, SymbolSize, *BC) ); - - // Add the name to global symbols map. - BC->GlobalSymbols[UniqueFunctionName] = Address; - - // Add to the reverse map. - BC->GlobalAddresses.emplace(std::make_pair(Address, UniqueFunctionName)); } // Disassemble every function and build it's control flow graph. @@ -404,8 +421,12 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { auto SymRefI = FileSymRefs.upper_bound(Function.getAddress()); if (SymRefI != FileSymRefs.end()) { auto MaxSize = SymRefI->first - Function.getAddress(); - assert(MaxSize >= Function.getSize() && - "symbol seen in the middle of the function"); + if (MaxSize < Function.getSize()) { + DEBUG(dbgs() << "FLO: symbol seen in the middle of the function " + << Function.getName() << ". Skipping.\n"); + Function.setSimple(false); + continue; + } Function.setMaxSize(MaxSize); } @@ -434,11 +455,11 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { if (!Function.buildCFG()) continue; - if (DumpFunctions) + if (opts::DumpFunctions) Function.print(errs(), true); } // Iterate over all functions - if (DumpFunctions) + if (opts::DumpFunctions) return; // Run optimization passes. 
@@ -452,7 +473,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { // FIXME: this wouldn't work with C++ exceptions until we implement // support for those as there will be "invisible" edges // in the graph. - if (EliminateUnreachable) { + if (opts::EliminateUnreachable) { bool IsFirst = true; for (auto &BB : Function) { if (!IsFirst && BB.pred_empty()) { @@ -465,8 +486,8 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { DEBUG(dbgs() << "*** After unreachable block elimination ***\n"); DEBUG(Function.print(dbgs(), /* PrintInstructions = */ true)); } - if (ReorderBlocks) { - BFI.second.optimizeLayout(DumpLayout); + if (opts::ReorderBlocks) { + BFI.second.optimizeLayout(opts::DumpLayout); } } @@ -475,12 +496,14 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { // This is an object file, which we keep for debugging purposes. // Once we decide it's useless, we should create it in memory. std::unique_ptr Out = - llvm::make_unique(OutputFilename + ".o", + llvm::make_unique(opts::OutputFilename + ".o", EC, sys::fs::F_None); check_error(EC, "cannot create output object file"); std::unique_ptr RealOut = - llvm::make_unique(OutputFilename, EC, sys::fs::F_None, + llvm::make_unique(opts::OutputFilename, + EC, + sys::fs::F_None, 0777); check_error(EC, "cannot create output executable file"); @@ -513,18 +536,31 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { if (!Function.isSimple()) continue; - // Only overwrite functions from the list if non-empty. - if (!FunctionNames.empty()) { - bool IsValid = false; - for (auto &Name : FunctionNames) { + // Check against lists of functions from options if we should + // optimize the function. 
+ bool IsValid = true; + if (!opts::FunctionNames.empty()) { + IsValid = false; + for (auto &Name : opts::FunctionNames) { if (Function.getName() == Name) { IsValid = true; break; } } - if (!IsValid) - continue; } + if (!IsValid) + continue; + + if (!opts::SkipFunctionNames.empty()) { + for (auto &Name : opts::SkipFunctionNames) { + if (Function.getName() == Name) { + IsValid = false; + break; + } + } + } + if (!IsValid) + continue; DEBUG(dbgs() << "FLO: generating code for function \"" << Function.getName() << "\"\n"); @@ -635,6 +671,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { Writer.setStream(RealOut->os()); // Overwrite function in the output file. + uint64_t CountOverwrittenFunctions = 0; for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; @@ -663,6 +700,13 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { BC->MAB->writeNopData(Function.getMaxSize() - Function.getImageSize(), &Writer); RealOut->os().seek(Pos); + + ++CountOverwrittenFunctions; + + if (opts::MaxFunctions && CountOverwrittenFunctions == opts::MaxFunctions) { + outs() << "FLO: maximum number of functions reached\n"; + break; + } } if (EntryPointFunction) { @@ -672,6 +716,9 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { DEBUG(dbgs() << "FLO: no entry point function was set\n"); } + outs() << "FLO: " << CountOverwrittenFunctions + << " out of " << BinaryFunctions.size() + << " functions were overwritten.\n"; // TODO: we should find a way to mark the binary as optimized by us. 
Out->keep(); @@ -702,35 +749,36 @@ int main(int argc, char **argv) { ToolName = argv[0]; - if (!sys::fs::exists(InputFilename)) - report_error(InputFilename, errc::no_such_file_or_directory); + if (!sys::fs::exists(opts::InputFilename)) + report_error(opts::InputFilename, errc::no_such_file_or_directory); std::unique_ptr DR(new DataReader(errs())); - if (!InputDataFilename.empty()) { - if (!sys::fs::exists(InputDataFilename)) - report_error(InputDataFilename, errc::no_such_file_or_directory); + if (!opts::InputDataFilename.empty()) { + if (!sys::fs::exists(opts::InputDataFilename)) + report_error(opts::InputDataFilename, errc::no_such_file_or_directory); // Attempt to read input flo data - auto ReaderOrErr = flo::DataReader::readPerfData(InputDataFilename, errs()); + auto ReaderOrErr = + flo::DataReader::readPerfData(opts::InputDataFilename, errs()); if (std::error_code EC = ReaderOrErr.getError()) - report_error(InputDataFilename, EC); + report_error(opts::InputDataFilename, EC); DR.reset(ReaderOrErr.get().release()); - if (DumpData) { + if (opts::DumpData) { DR->dump(); return EXIT_SUCCESS; } } // Attempt to open the binary. 
- ErrorOr> BinaryOrErr = createBinary(InputFilename); + ErrorOr> BinaryOrErr = createBinary(opts::InputFilename); if (std::error_code EC = BinaryOrErr.getError()) - report_error(InputFilename, EC); + report_error(opts::InputFilename, EC); Binary &Binary = *BinaryOrErr.get().getBinary(); if (ELFObjectFileBase *e = dyn_cast(&Binary)) { OptimizeFile(e, *DR.get()); } else { - report_error(InputFilename, object_error::invalid_file_type); + report_error(opts::InputFilename, object_error::invalid_file_type); } return EXIT_SUCCESS; From 77d90f018bc8bac674ce531996b7d5fa24ab0554 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 19 Oct 2015 13:23:03 -0700 Subject: [PATCH 017/904] Fix missing sanity check in BinaryFunction::optimizeLayout() Summary: SPEC CPU2006 perlbench exposed a bug in BinaryFunction::optimizeLayout() where it would try to optimize the layout even though the function had zero basic blocks. This patch simply checks if the function has zero basic blocks and bails out. (cherry picked from commit 25f7a445e7f17d72465d76e49ac6bb5b42ab4b19) --- bolt/BinaryFunction.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index cdda810ae15c..6d7a246932d6 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -517,8 +517,8 @@ void BinaryFunction::inferFallThroughCounts() { } void BinaryFunction::optimizeLayout(bool DumpLayout) { - // Bail if no profiling information - if (getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE) { + // Bail if no profiling information or if empty + if (getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE || empty()) { return; } From 7200d2b9a295306ca0e70a94ad75f0a16582548b Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 19 Oct 2015 10:43:54 -0700 Subject: [PATCH 018/904] Fix bug in block reorder heuristic Summary: Tests with SPEC CPU2006 400.perlbench exposed a bug in the block reordering heuristic that happened when two blocks are both 
successor and predecessor of each other. This patch fixes this. (cherry picked from commit 5732d2f2e5bfc4392fcd556618abdec5363fb190) --- bolt/BinaryFunction.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 6d7a246932d6..2dfa098b3ef6 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -582,17 +582,20 @@ void BinaryFunction::optimizeLayout(bool DumpLayout) { // Case 2: Both BBSrc and BBDst are already allocated if ((I = BBToClusterMap[BBSrc]) != -1 && (J = BBToClusterMap[BBDst]) != -1) { + // Case 2a: If they are already allocated at the same cluster, ignore + if (I == J) + continue; auto &ClusterA = Clusters[I]; auto &ClusterB = Clusters[J]; if (ClusterA.back() == BBSrc && ClusterB.front() == BBDst) { - // Case 2a: BBSrc is at the end of a cluster and BBDst is at the start, + // Case 2b: BBSrc is at the end of a cluster and BBDst is at the start, // allowing us to merge two clusters for (auto BB : ClusterB) BBToClusterMap[BB] = I; ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end()); ClusterB.clear(); } else { - // Case 2b: Both BBSrc and BBDst are allocated in positions we cannot + // Case 2c: Both BBSrc and BBDst are allocated in positions we cannot // merge them, so we ignore this edge. 
} continue; @@ -649,8 +652,9 @@ void BinaryFunction::optimizeLayout(bool DumpLayout) { // Finalize layout with BBs that weren't assigned to any cluster, preserving // their relative order for (auto &BB : BasicBlocks) { - if (BBToClusterMap[&BB] == -1) + if (BBToClusterMap[&BB] == -1) { BasicBlocksLayout.push_back(&BB); + } } if (DumpLayout) { From 24b54fcaf4d405826c95d47847af20f4728a2952 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 16 Oct 2015 09:49:04 -0700 Subject: [PATCH 019/904] Fixes branches after reordering basic blocks in a binary function Summary: Adds logic in BinaryFunction to be able to fix branches (invert its condition, delete or add a branch), making the new function work with the new layout proposed by the layout pass. All the architecture-specific content was designed to live in the LLVM Target library, in the MCInstrAnalysis pass. For now, we only introduce such logic to the X86 backend. (cherry picked from commit 50b3fa10db33cec65a7e926b8d7482768b8fb20c) --- bolt/BinaryBasicBlock.h | 12 ++++ bolt/BinaryFunction.cpp | 140 ++++++++++++++++++++++++++++++++++++---- bolt/BinaryFunction.h | 7 ++ 3 files changed, 146 insertions(+), 13 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 11d94ee1fe9b..339f5b0e4432 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -206,6 +206,18 @@ class BinaryBasicBlock { return ExecutionCount; } + bool eraseInstruction(MCInst *Inst) { + auto I = Instructions.end(); + auto B = Instructions.begin(); + while (I > B) { + --I; + if (&*I == Inst) { + Instructions.erase(I); + return true; + } + } + return false; + } private: diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 2dfa098b3ef6..cfab81cb1f3a 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -89,27 +89,27 @@ void BinaryFunction::print(raw_ostream &OS, bool PrintInstructions) const { } } - for (const auto &BB : BasicBlocks) { - OS << BB.getName() << " (" - << 
BB.Instructions.size() << " instructions)\n"; + for (auto BB : BasicBlocksLayout) { + OS << BB->getName() << " (" + << BB->Instructions.size() << " instructions)\n"; - uint64_t BBExecCount = BB.getExecutionCount(); + uint64_t BBExecCount = BB->getExecutionCount(); if (BBExecCount != BinaryBasicBlock::COUNT_NO_PROFILE) { OS << " Exec Count : " << BBExecCount << "\n"; } - if (!BB.Predecessors.empty()) { + if (!BB->Predecessors.empty()) { OS << " Predecessors: "; auto Sep = ""; - for (auto Pred : BB.Predecessors) { + for (auto Pred : BB->Predecessors) { OS << Sep << Pred->getName(); Sep = ", "; } OS << '\n'; } - Offset = RoundUpToAlignment(Offset, BB.getAlignment()); + Offset = RoundUpToAlignment(Offset, BB->getAlignment()); - for (auto &Instr : BB) { + for (auto &Instr : *BB) { OS << format(" %08" PRIx64 ": ", Offset); BC.InstPrinter->printInst(&Instr, OS, "", *BC.STI); OS << "\n"; @@ -126,12 +126,12 @@ void BinaryFunction::print(raw_ostream &OS, bool PrintInstructions) const { Offset += Code.size(); } - if (!BB.Successors.empty()) { + if (!BB->Successors.empty()) { OS << " Successors: "; - auto BI = BB.BranchInfo.begin(); + auto BI = BB->BranchInfo.begin(); auto Sep = ""; - for (auto Succ : BB.Successors) { - assert(BI != BB.BranchInfo.end() && "missing BranchInfo entry"); + for (auto Succ : BB->Successors) { + assert(BI != BB->BranchInfo.end() && "missing BranchInfo entry"); OS << Sep << Succ->getName(); if (ExecutionCount != COUNT_NO_PROFILE && BI->MispredictedCount != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) { @@ -657,6 +657,8 @@ void BinaryFunction::optimizeLayout(bool DumpLayout) { } } + fixBranches(); + if (DumpLayout) { dbgs() << "original BB order is: "; auto Sep = ""; @@ -671,6 +673,7 @@ void BinaryFunction::optimizeLayout(bool DumpLayout) { Sep = ","; } dbgs() << "\n"; + print(dbgs(), /* PrintInstructions = */ true); } } @@ -774,6 +777,8 @@ void BinaryFunction::solveOptimalLayout(bool DumpLayout) { BasicBlocksLayout.push_back(&BB); } + fixBranches(); + if 
(DumpLayout) { dbgs() << "original BB order is: "; auto Sep = ""; @@ -788,7 +793,116 @@ void BinaryFunction::solveOptimalLayout(bool DumpLayout) { Sep = ","; } dbgs() << "\n"; - DEBUG(print(dbgs(), /* PrintInstructions = */ true)); + print(dbgs(), /* PrintInstructions = */ true); + } +} + +const BinaryBasicBlock * +BinaryFunction::getOriginalLayoutSuccessor(const BinaryBasicBlock *BB) const { + auto I = std::upper_bound(BasicBlocks.begin(), BasicBlocks.end(), *BB); + assert(I != BasicBlocks.begin() && "first basic block not at offset 0"); + + if (I == BasicBlocks.end()) + return nullptr; + return &*I; +} + +void BinaryFunction::fixBranches() { + auto &MIA = BC.MIA; + + for (unsigned I = 0, E = BasicBlocksLayout.size(); I != E; ++I) { + BinaryBasicBlock *BB = BasicBlocksLayout[I]; + if (BB->begin() == BB->end()) + continue; + + const MCSymbol *TBB = nullptr; + const MCSymbol *FBB = nullptr; + MCInst *CondBranch = nullptr; + MCInst *UncondBranch = nullptr; + if (!MIA->analyzeBranch(BB->Instructions, TBB, FBB, CondBranch, + UncondBranch)) { + continue; + } + + // Check if the original fall-through for this block has been moved + const MCSymbol *FT = nullptr; + if (I + 1 != BasicBlocksLayout.size()) + FT = BasicBlocksLayout[I + 1]->getLabel(); + const BinaryBasicBlock *OldFTBB = getOriginalLayoutSuccessor(BB); + const MCSymbol *OldFT = nullptr; + if (OldFTBB != nullptr) + OldFT = OldFTBB->getLabel(); + + // Case 1: There are no branches in this basic block and it just falls + // through + if (CondBranch == nullptr && UncondBranch == nullptr) { + // Case 1a: Last instruction is a return, so it does *not* fall through to + // the next block. + if (MIA->isReturn(BB->back())) + continue; + // Case 1b: Layout has changed and the fallthrough is not the same. Need + // to add a new unconditional branch to jump to the old fallthrough. 
+ if (FT != OldFT && OldFT != nullptr) { + MCInst NewInst; + if (!MIA->createUncondBranch(NewInst, OldFT, BC.Ctx.get())) + llvm_unreachable("Target does not support creating new branches"); + BB->Instructions.emplace_back(std::move(NewInst)); + } + // Case 1c: Layout hasn't changed, nothing to do. + continue; + } + + // Case 2: There is a single jump, unconditional, in this basic block + if (CondBranch == nullptr) { + // Case 2a: It jumps to the new fall-through, so we can delete it + if (TBB == FT) { + BB->eraseInstruction(UncondBranch); + } + // Case 2b: If 2a doesn't happen, there is nothing we can do + continue; + } + + // Case 3: There is a single jump, conditional, in this basic block + if (UncondBranch == nullptr) { + // Case 3a: If the taken branch goes to the next block in the new layout, + // invert this conditional branch logic so we can make this a fallthrough. + if (TBB == FT) { + assert(OldFT != nullptr && "malformed CFG"); + if (!MIA->reverseBranchCondition(*CondBranch, OldFT, BC.Ctx.get())) + llvm_unreachable("Target does not support reversing branches"); + continue; + } + // Case 3b: Need to add a new unconditional branch because layout + // has changed + if (FT != OldFT && OldFT != nullptr) { + MCInst NewInst; + if (!MIA->createUncondBranch(NewInst, OldFT, BC.Ctx.get())) + llvm_unreachable("Target does not support creating new branches"); + BB->Instructions.emplace_back(std::move(NewInst)); + continue; + } + // Case 3c: Old fall-through is the same as the new one, no need to change + continue; + } + + // Case 4: There are two jumps in this basic block, one conditional followed + // by another unconditional. + // Case 4a: If the unconditional jump target is the new fall through, + // delete it. + if (FBB == FT) { + BB->eraseInstruction(UncondBranch); + continue; + } + // Case 4b: If the taken branch goes to the next block in the new layout, + // invert this conditional branch logic so we can make this a fallthrough. 
+ // Now we don't need the unconditional jump anymore, so we also delete it. + if (TBB == FT) { + if (!MIA->reverseBranchCondition(*CondBranch, FBB, BC.Ctx.get())) + llvm_unreachable("Target does not support reversing branches"); + BB->eraseInstruction(UncondBranch); + continue; + } + // Case 4c: Nothing interesting happening. } } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 32af380e1dc0..69158928bc85 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -130,6 +130,9 @@ class BinaryFunction { return *this; } + const BinaryBasicBlock * + getOriginalLayoutSuccessor(const BinaryBasicBlock *BB) const; + /// Storage for all local branches in the function (non-fall-throughs). using LocalBranchesListType = std::vector>; LocalBranchesListType LocalBranches; @@ -392,6 +395,10 @@ class BinaryFunction { /// has been filled with LBR data. void inferFallThroughCounts(); + /// Traverse the CFG checking branches, inverting their condition, removing or + /// adding jumps based on a new layout order. + void fixBranches(); + virtual ~BinaryFunction() {} }; From 8b7d9cf4ffd3dabbf54b05179b0784bbda5e7694 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 20 Oct 2015 10:51:17 -0700 Subject: [PATCH 020/904] Eliminate nop instruction in input and derive alignment. Summary: Nop instructions are primarily used for alignment purposes on the input. We remove all nops when we build CFG and derive alignment of basic blocks based on existing alignment and a presence of nops before it. This will not always work as some basic blocks will be naturally aligned without necessity for nops. However, it's better than random alignment. We would also add heuristics for BB alignment based on execution profile. 
(cherry picked from commit d7ccc7324995ff9d27042107f65c38edd0c7a2b3) --- bolt/BinaryBasicBlock.h | 6 ++++++ bolt/BinaryFunction.cpp | 23 +++++++++++++++++++---- bolt/BinaryFunction.h | 18 ++++++++++-------- bolt/llvm-flo.cpp | 10 ++++++---- 4 files changed, 41 insertions(+), 16 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 339f5b0e4432..c69a138ac3aa 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -93,6 +93,7 @@ class BinaryBasicBlock { typedef std::reverse_iterator const_reverse_iterator; typedef std::reverse_iterator reverse_iterator; + bool empty() const { return Instructions.empty(); } MCInst &front() { return Instructions.front(); } MCInst &back() { return Instructions.back(); } const MCInst &front() const { return Instructions.front(); } @@ -182,6 +183,11 @@ class BinaryBasicBlock { Instructions.emplace_back(Inst); } + /// Set minimum alignment for the basic block. + void setAlignment(uint64_t Align) { + Alignment = Align; + } + /// Return required alignment for the block. uint64_t getAlignment() const { return Alignment; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index cfab81cb1f3a..3bf8352e48c6 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -91,7 +91,8 @@ void BinaryFunction::print(raw_ostream &OS, bool PrintInstructions) const { for (auto BB : BasicBlocksLayout) { OS << BB->getName() << " (" - << BB->Instructions.size() << " instructions)\n"; + << BB->Instructions.size() << " instructions, align : " + << BB->getAlignment() << ")\n"; uint64_t BBExecCount = BB->getExecutionCount(); if (BBExecCount != BinaryBasicBlock::COUNT_NO_PROFILE) { @@ -342,12 +343,14 @@ bool BinaryFunction::buildCFG() { // sorted by offsets. 
BinaryBasicBlock *InsertBB{nullptr}; BinaryBasicBlock *PrevBB{nullptr}; + bool IsLastInstrNop = false; for (auto &InstrInfo : Instructions) { auto LI = Labels.find(InstrInfo.first); if (LI != Labels.end()) { // Always create new BB at branch destination. PrevBB = InsertBB; - InsertBB = addBasicBlock(LI->first, LI->second); + InsertBB = addBasicBlock(LI->first, LI->second, + /* DeriveAlignment = */ IsLastInstrNop); } if (!InsertBB) { // It must be a fallthrough. Create a new block unless we see an @@ -358,10 +361,20 @@ bool BinaryFunction::buildCFG() { InsertBB = PrevBB; } else { InsertBB = addBasicBlock(InstrInfo.first, - BC.Ctx->createTempSymbol("FT", true)); + BC.Ctx->createTempSymbol("FT", true), + /* DeriveAlignment = */ IsLastInstrNop); } } + // Ignore nops. We use nops to derive alignment of the next basic block. + // It will not always work, as some blocks are naturally aligned, but + // it's just part of heuristic for block alignment. + if (MIA->isNoop(InstrInfo.second)) { + IsLastInstrNop = true; + continue; + } + + IsLastInstrNop = false; InsertBB->addInstruction(InstrInfo.second); // How well do we detect tail calls here? @@ -416,7 +429,9 @@ bool BinaryFunction::buildCFG() { } MCInst &LastInst = BB.back(); - if (BB.succ_size() == 0) { + if (BB.empty()) { + IsPrevFT = true; + } else if (BB.succ_size() == 0) { IsPrevFT = MIA->isTerminator(LastInst) ? false : true; } else if (BB.succ_size() == 1) { IsPrevFT = MIA->isConditionalBranch(LastInst) ? true : false; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 69158928bc85..a1de19509109 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -260,21 +260,23 @@ class BinaryFunction { /// Create a basic block at a given \p Offset in the /// function and append it to the end of list of blocks. + /// If \p DeriveAlignment is true, set the alignment of the block based + /// on the alignment of the existing offset. + /// /// Returns NULL if basic block already exists at the \p Offset. 
- BinaryBasicBlock *addBasicBlock(uint64_t Offset, MCSymbol *Label = nullptr) { + BinaryBasicBlock *addBasicBlock(uint64_t Offset, MCSymbol *Label, + bool DeriveAlignment = false) { assert(!getBasicBlockAtOffset(Offset) && "basic block already exists"); if (!Label) Label = BC.Ctx->createTempSymbol("BB", true); BasicBlocks.emplace_back(BinaryBasicBlock(Label, Offset)); - return &BasicBlocks.back(); - } + auto BB = &BasicBlocks.back(); - BinaryBasicBlock *getOrCreateBasicBlockAt(uint64_t Offset, - MCSymbol *Label = nullptr) { - BinaryBasicBlock *BB = getBasicBlockAtOffset(Offset); - if (!BB) - BB = addBasicBlock(Offset, Label); + if (DeriveAlignment) { + uint64_t DerivedAlignment = Offset & (1 + ~Offset); + BB->setAlignment(std::min(DerivedAlignment, uint64_t(16))); + } return BB; } diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 657ebb50b14e..d7f607ba0137 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -95,8 +95,8 @@ EliminateUnreachable("eliminate-unreachable", static cl::opt ReorderBlocks("reorder-blocks", - cl::desc("redo basic block layout based on profiling data"), - cl::Optional); + cl::desc("redo basic block layout based on profiling data"), + cl::Optional); static cl::opt DumpData("dump-data", cl::desc("dump parsed flo data (debugging)"), @@ -104,11 +104,11 @@ DumpData("dump-data", cl::desc("dump parsed flo data (debugging)"), static cl::opt DumpFunctions("dump-functions", cl::desc("dump parsed functions (debugging)"), - cl::Hidden); + cl::Hidden); static cl::opt DumpLayout("dump-layout", cl::desc("dump parsed flo data (debugging)"), - cl::Hidden); + cl::Hidden); } // namespace opts static StringRef ToolName; @@ -588,6 +588,8 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { // Emit code. 
for (auto BB : Function.layout()) { + if (BB->getAlignment() > 1) + Streamer->EmitCodeAlignment(BB->getAlignment()); Streamer->EmitLabel(BB->getLabel()); for (const auto &Instr : *BB) { Streamer->EmitInstruction(Instr, *BC->STI); From b7b0793f1a2d9d2c5a9d26c6e1500390549463b2 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 20 Oct 2015 10:17:38 -0700 Subject: [PATCH 021/904] Teach llvm-flo how to handle two back-to-back JMPs Summary: If we have two consecutive JMP instructions and no branches to the second one, the second one is dead code, but llvm-flo does not handle these cases properly and put two JMPs in the same BB. This patch fixes this, putting the extraneous JMP in a separate block, making it easy for us to detect it is dead code and remove it later in a separate step. (cherry picked from commit 20830935ec9804750e619ab4376497dca19136a3) --- bolt/BinaryFunction.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 3bf8352e48c6..079b0ed2e2fd 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -344,6 +344,7 @@ bool BinaryFunction::buildCFG() { BinaryBasicBlock *InsertBB{nullptr}; BinaryBasicBlock *PrevBB{nullptr}; bool IsLastInstrNop = false; + MCInst *PrevInstr{nullptr}; for (auto &InstrInfo : Instructions) { auto LI = Labels.find(InstrInfo.first); if (LI != Labels.end()) { @@ -353,10 +354,12 @@ bool BinaryFunction::buildCFG() { /* DeriveAlignment = */ IsLastInstrNop); } if (!InsertBB) { - // It must be a fallthrough. Create a new block unless we see an - // unconditional branch. + // It must be a fallthrough or unreachable code. Create a new block unless + // we see an unconditional branch following a conditional one. 
assert(PrevBB && "no previous basic block for a fall through"); - if (MIA->isUnconditionalBranch(InstrInfo.second)) { + assert(PrevInstr && "no previous instruction for a fall through"); + if (MIA->isUnconditionalBranch(InstrInfo.second) && + !MIA->isUnconditionalBranch(*PrevInstr)) { // Temporarily restore inserter basic block. InsertBB = PrevBB; } else { @@ -376,6 +379,7 @@ bool BinaryFunction::buildCFG() { IsLastInstrNop = false; InsertBB->addInstruction(InstrInfo.second); + PrevInstr = &InstrInfo.second; // How well do we detect tail calls here? if (MIA->isTerminator(InstrInfo.second)) { From 7c95f607a12752a58fe5168fde338d423171ab80 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 20 Oct 2015 12:30:22 -0700 Subject: [PATCH 022/904] Do not schedule BBs before the entry point Summary: SPEC CPU2006 perlbench triggered a bug in our heuristic block reordering algorithm where a hot edge that targets the entry point (as in a recursive tail call) would make us try to allocate the call site before the function entry point. Since we don't update function addresses yet, moving the entry point will corrupt the program. This patch fixes this. (cherry picked from commit b14c3308f54ea266d10c1078d3c1905c5a3aa716) --- bolt/BinaryFunction.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 079b0ed2e2fd..0c1d61918530 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -595,7 +595,7 @@ void BinaryFunction::optimizeLayout(bool DumpLayout) { int I = 0, J = 0; // Case 1: BBSrc and BBDst are the same. 
Ignore this edge - if (BBSrc == BBDst) + if (BBSrc == BBDst || BBDst == Entry) continue; // Case 2: Both BBSrc and BBDst are already allocated From 54685653b89ed1987967c4ef050a5c45706e103d Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 20 Oct 2015 12:47:37 -0700 Subject: [PATCH 023/904] Implement unreachable BB elimination in llvm-flo Summary: It is important to remove dead blocks to free up space in functions and allow us to reorder blocks or align branch targets with more freedom. This patch implements a simple algorithm to delete all basic blocks that are not reachable from the entry point. Note that C++ exceptions may create "unreachable" blocks, so this option must be used with care. (cherry picked from commit 07b51c93541912ffabf8551fc240b9aadb3210d9) --- bolt/BinaryFunction.cpp | 75 +++++++++++++++++++++++++++-------------- bolt/BinaryFunction.h | 14 ++++++++ bolt/llvm-flo.cpp | 40 +++++++++++++++++----- 3 files changed, 95 insertions(+), 34 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 0c1d61918530..8cdb17ea4c3d 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -48,6 +48,21 @@ BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) { return &(*--I); } +unsigned BinaryFunction::eraseDeadBBs( + std::map &ToPreserve) { + BasicBlockOrderType NewLayout; + unsigned Count = 0; + for (auto I = BasicBlocksLayout.begin(), E = BasicBlocksLayout.end(); I != E; + ++I) { + if (ToPreserve[*I]) + NewLayout.push_back(*I); + else + ++Count; + } + BasicBlocksLayout = std::move(NewLayout); + return Count; +} + void BinaryFunction::print(raw_ostream &OS, bool PrintInstructions) const { StringRef SectionName; Section.getName(SectionName); @@ -537,12 +552,13 @@ void BinaryFunction::inferFallThroughCounts() { void BinaryFunction::optimizeLayout(bool DumpLayout) { // Bail if no profiling information or if empty - if (getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE || empty()) { + if (getExecutionCount() 
== BinaryFunction::COUNT_NO_PROFILE || + BasicBlocksLayout.empty()) { return; } // Work on optimal solution if problem is small enough - if (BasicBlocks.size() <= FUNC_SIZE_THRESHOLD) + if (BasicBlocksLayout.size() <= FUNC_SIZE_THRESHOLD) return solveOptimalLayout(DumpLayout); if (DumpLayout) { @@ -567,19 +583,19 @@ void BinaryFunction::optimizeLayout(bool DumpLayout) { BBToClusterMapTy BBToClusterMap; // Populating priority queue with all edges - for (auto &BB : BasicBlocks) { - BBToClusterMap[&BB] = -1; // Mark as unmapped - auto BI = BB.BranchInfo.begin(); - for (auto &I : BB.successors()) { + for (auto BB : BasicBlocksLayout) { + BBToClusterMap[BB] = -1; // Mark as unmapped + auto BI = BB->BranchInfo.begin(); + for (auto &I : BB->successors()) { if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) - Weight[std::make_pair(&BB, I)] = BI->Count; - Queue.push(std::make_pair(&BB, I)); + Weight[std::make_pair(BB, I)] = BI->Count; + Queue.push(std::make_pair(BB, I)); ++BI; } } // Start a cluster with the entry point - BinaryBasicBlock *Entry = &*BasicBlocks.begin(); + BinaryBasicBlock *Entry = *BasicBlocksLayout.begin(); Clusters.emplace_back(); auto &EntryCluster = Clusters.back(); EntryCluster.push_back(Entry); @@ -661,6 +677,14 @@ void BinaryFunction::optimizeLayout(bool DumpLayout) { BBToClusterMap[BBDst] = I; } + // Create an extra cluster for unvisited basic blocks + std::vector Unvisited; + for (auto BB : BasicBlocksLayout) { + if (BBToClusterMap[BB] == -1) { + Unvisited.push_back(BB); + } + } + // Define final function layout based on clusters BasicBlocksLayout.clear(); for (auto &Cluster : Clusters) { @@ -670,11 +694,8 @@ void BinaryFunction::optimizeLayout(bool DumpLayout) { // Finalize layout with BBs that weren't assigned to any cluster, preserving // their relative order - for (auto &BB : BasicBlocks) { - if (BBToClusterMap[&BB] == -1) { - BasicBlocksLayout.push_back(&BB); - } - } + BasicBlocksLayout.insert(BasicBlocksLayout.end(), 
Unvisited.begin(), + Unvisited.end()); fixBranches(); @@ -705,19 +726,19 @@ void BinaryFunction::solveOptimalLayout(bool DumpLayout) { dbgs() << "finding optimal block layout for " << getName() << "\n"; } - unsigned N = BasicBlocks.size(); + unsigned N = BasicBlocksLayout.size(); // Populating weight map and index map - for (auto &BB : BasicBlocks) { - BBToIndex[&BB] = IndexToBB.size(); - IndexToBB.push_back(&BB); + for (auto BB : BasicBlocksLayout) { + BBToIndex[BB] = IndexToBB.size(); + IndexToBB.push_back(BB); } Weight.resize(N); - for (auto &BB : BasicBlocks) { - auto BI = BB.BranchInfo.begin(); - Weight[BBToIndex[&BB]].resize(N); - for (auto &I : BB.successors()) { + for (auto BB : BasicBlocksLayout) { + auto BI = BB->BranchInfo.begin(); + Weight[BBToIndex[BB]].resize(N); + for (auto I : BB->successors()) { if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) - Weight[BBToIndex[&BB]][BBToIndex[I]] = BI->Count; + Weight[BBToIndex[BB]][BBToIndex[I]] = BI->Count; ++BI; } } @@ -765,6 +786,8 @@ void BinaryFunction::solveOptimalLayout(bool DumpLayout) { } } + std::vector PastLayout = BasicBlocksLayout; + // Define final function layout based on layout that maximizes weight BasicBlocksLayout.clear(); unsigned Last = BestLast; @@ -791,9 +814,9 @@ void BinaryFunction::solveOptimalLayout(bool DumpLayout) { std::reverse(BasicBlocksLayout.begin(), BasicBlocksLayout.end()); // Finalize layout with BBs that weren't assigned to the layout - for (auto &BB : BasicBlocks) { - if (Visited[BBToIndex[&BB]] == false) - BasicBlocksLayout.push_back(&BB); + for (auto BB : PastLayout) { + if (Visited[BBToIndex[BB]] == false) + BasicBlocksLayout.push_back(BB); } fixBranches(); diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index a1de19509109..829de7b8c1a6 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -32,6 +32,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include +#include using namespace llvm::object; @@ -161,6 +162,7 
@@ class BinaryFunction { typedef std::reverse_iterator const_reverse_iterator; typedef std::reverse_iterator reverse_iterator; typedef BasicBlockOrderType::iterator order_iterator; + typedef BasicBlockOrderType::const_iterator const_order_iterator; // CFG iterators. iterator begin() { return BasicBlocks.begin(); } @@ -180,6 +182,14 @@ class BinaryFunction { const BinaryBasicBlock & back() const { return BasicBlocks.back(); } BinaryBasicBlock & back() { return BasicBlocks.back(); } + unsigned layout_size() const { + return (unsigned)BasicBlocksLayout.size(); + } + const_order_iterator layout_begin() const { + return BasicBlocksLayout.begin(); + } + order_iterator layout_begin() { return BasicBlocksLayout.begin(); } + inline iterator_range layout() { return iterator_range(BasicBlocksLayout.begin(), BasicBlocksLayout.end()); @@ -281,6 +291,10 @@ class BinaryFunction { return BB; } + /// Rebuilds BBs layout, ignoring dead BBs. Returns the number of removed + /// BBs. + unsigned eraseDeadBBs(std::map &ToPreserve); + /// Return basic block that started at offset \p Offset. BinaryBasicBlock *getBasicBlockAtOffset(uint64_t Offset) { BinaryBasicBlock *BB = getBasicBlockContainingOffset(Offset); diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index d7f607ba0137..20502e3e2865 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -51,6 +51,7 @@ #include "llvm/Target/TargetMachine.h" #include #include +#include #include #undef DEBUG_TYPE @@ -465,6 +466,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { // Run optimization passes. // // FIXME: use real optimization passes. + bool NagUser = true; for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; // Detect and eliminate unreachable basic blocks. 
We could have those @@ -473,19 +475,41 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { // FIXME: this wouldn't work with C++ exceptions until we implement // support for those as there will be "invisible" edges // in the graph. - if (opts::EliminateUnreachable) { - bool IsFirst = true; - for (auto &BB : Function) { - if (!IsFirst && BB.pred_empty()) { - outs() << "FLO: basic block " << BB.getName() << " in function " - << Function.getName() << " is dead\n"; - // TODO: currently lacking interface to eliminate basic block. + if (opts::EliminateUnreachable && Function.layout_size() > 0) { + if (NagUser) { + outs() + << "FLO-WARNING: Using -eliminate-unreachable is experimental and " + "unsafe for exceptions\n"; + NagUser = false; + } + + std::stack Stack; + std::map Reachable; + BinaryBasicBlock *Entry = *Function.layout_begin(); + Stack.push(Entry); + Reachable[Entry] = true; + // Determine reachable BBs from the entry point + while (!Stack.empty()) { + auto BB = Stack.top(); + Stack.pop(); + for (auto Succ : BB->successors()) { + if (Reachable[Succ]) + continue; + Reachable[Succ] = true; + Stack.push(Succ); } - IsFirst = false; } + + if (unsigned Count = Function.eraseDeadBBs(Reachable)) { + outs() << "FLO: Removed " << Count + << " dead basic block(s) in function " << Function.getName() + << "\n"; + } + DEBUG(dbgs() << "*** After unreachable block elimination ***\n"); DEBUG(Function.print(dbgs(), /* PrintInstructions = */ true)); } + if (opts::ReorderBlocks) { BFI.second.optimizeLayout(opts::DumpLayout); } From f6109a86b6d1fcaa39392a5a2e7a8507cc04999d Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 20 Oct 2015 16:48:54 -0700 Subject: [PATCH 024/904] Fix entry BB execution count in llvm-flo Summary: When we have tailcalls, the execution count for the entry point is wrongly computed. Fix this. 
(cherry picked from commit 4be1298815f97988bf131a380d0f76c1b74c6777) --- bolt/BinaryFunction.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 8cdb17ea4c3d..9c7467b5cfd8 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -501,6 +501,10 @@ void BinaryFunction::inferFallThroughCounts() { for (auto &CurBB : BasicBlocks) { auto SuccCount = CurBB.BranchInfo.begin(); for (auto Succ : CurBB.successors()) { + // Do not update execution count of the entry block (when we have tail + // calls). We already accounted for those when computing the func count. + if (Succ == &*BasicBlocks.begin()) + continue; if (SuccCount->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) Succ->ExecutionCount += SuccCount->Count; ++SuccCount; From 190e70ead71ce3d0cb1b2fd91ccc0581799c1f1f Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 21 Oct 2015 16:25:16 -0700 Subject: [PATCH 025/904] Fix bug in BinaryFunction::fixBranches() in llvm-flo Summary: When the ignore-nops patch landed, it exposed a bug in fixBranches() where it ignored empty BBs. However, we cannot ignore empty BBs when it is reordered and its fall-through changes. We must update it with a jump to the original fall-through. This patch fixes this. 
(cherry picked from commit b8b92c9febc097e1a750ea40d269a5364b088ddf) --- bolt/BinaryFunction.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 9c7467b5cfd8..8eb462f1fa31 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -858,8 +858,6 @@ void BinaryFunction::fixBranches() { for (unsigned I = 0, E = BasicBlocksLayout.size(); I != E; ++I) { BinaryBasicBlock *BB = BasicBlocksLayout[I]; - if (BB->begin() == BB->end()) - continue; const MCSymbol *TBB = nullptr; const MCSymbol *FBB = nullptr; @@ -884,7 +882,7 @@ void BinaryFunction::fixBranches() { if (CondBranch == nullptr && UncondBranch == nullptr) { // Case 1a: Last instruction is a return, so it does *not* fall through to // the next block. - if (MIA->isReturn(BB->back())) + if (!BB->empty() && MIA->isReturn(BB->back())) continue; // Case 1b: Layout has changed and the fallthrough is not the same. Need // to add a new unconditional branch to jump to the old fallthrough. From b0888044f27f30873c632a59ab315b97bd4a99e1 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 20 Oct 2015 10:51:17 -0700 Subject: [PATCH 026/904] Issue warning when relaxed tail call is seen on input. Summary: Issue warning when we see a 2-byte tail call. Currently we will increase the size of these instructions. (cherry picked from commit 6821c6ef6e227bf73ae84ec2b23298e97eeacb80) --- bolt/BinaryBasicBlock.h | 5 +++++ bolt/BinaryFunction.cpp | 11 ++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index c69a138ac3aa..568c5f3ea034 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -193,6 +193,11 @@ class BinaryBasicBlock { return Alignment; } + /// Return offset of the basic block from the function start. + uint64_t getOffset() const { + return Offset; + } + /// Adds block to successor list, and also updates predecessor list for /// successor block. 
/// Set branch info for this path. diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 8eb462f1fa31..7ef79460fef8 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -252,6 +252,12 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { TargetSymbol = LI->second; } } else { + if (!IsCall && Size == 2) { + errs() << "FLO-WARNING: relaxed tail call detected at 0x" + << Twine::utohexstr(AbsoluteInstrAddr) + << ". Code size will be increased.\n"; + } + // This is a call regardless of the opcode (e.g. tail call). IsCall = true; TargetSymbol = BC.getOrCreateGlobalSymbol(InstructionTarget, @@ -536,7 +542,10 @@ void BinaryFunction::inferFallThroughCounts() { Inferred = BBExecCount - ReportedBranches; if (BBExecCount < ReportedBranches) errs() << "FLO-WARNING: Fall-through inference is slightly inconsistent. " - "BB exec frequency is less than the outgoing edges frequency\n"; + "exec frequency is less than the outgoing edges frequency (" + << BBExecCount << " < " << ReportedBranches + << ") for BB at offset 0x" + << Twine::utohexstr(getAddress() + CurBB.getOffset()) << '\n'; // Put this information into the fall-through edge if (CurBB.succ_size() == 0) From a1a1f9b3aeaa21da0d4e5a070d29640ec73f06b9 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 23 Oct 2015 15:52:59 -0700 Subject: [PATCH 027/904] More control over function printing. Summary: Can use '-print-*' option to print function at specific stage. Use '-print-all' to print at every stage. 
(cherry picked from commit b3983a419a12a01074a1947113eccd8f733115d2) --- bolt/BinaryFunction.cpp | 75 +++++++------------------ bolt/BinaryFunction.h | 11 ++-- bolt/llvm-flo.cpp | 118 ++++++++++++++++++++++++++-------------- 3 files changed, 104 insertions(+), 100 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 7ef79460fef8..3fb1bb149844 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -63,10 +63,11 @@ unsigned BinaryFunction::eraseDeadBBs( return Count; } -void BinaryFunction::print(raw_ostream &OS, bool PrintInstructions) const { +void BinaryFunction::print(raw_ostream &OS, std::string Annotation, + bool PrintInstructions) const { StringRef SectionName; Section.getName(SectionName); - OS << "Binary Function \"" << getName() << "\" {" + OS << "Binary Function \"" << getName() << "\" " << Annotation << " {" << "\n State : " << CurrentState << "\n Address : 0x" << Twine::utohexstr(Address) << "\n Size : 0x" << Twine::utohexstr(Size) @@ -75,10 +76,20 @@ void BinaryFunction::print(raw_ostream &OS, bool PrintInstructions) const { << "\n Section : " << SectionName << "\n Orc Section : " << getCodeSectionName() << "\n IsSimple : " << IsSimple - << "\n BB count : " << BasicBlocks.size() - << "\n Image : 0x" << Twine::utohexstr(ImageAddress); + << "\n BB Count : " << BasicBlocksLayout.size(); + if (BasicBlocksLayout.size()) { + OS << "\n BB Layout : "; + auto Sep = ""; + for (auto BB : BasicBlocksLayout) { + OS << Sep << BB->getName(); + Sep = ", "; + } + } + if (ImageAddress) + OS << "\n Image : 0x" << Twine::utohexstr(ImageAddress); if (ExecutionCount != COUNT_NO_PROFILE) OS << "\n Exec Count : " << ExecutionCount; + OS << "\n}\n"; if (!PrintInstructions || !BC.InstPrinter) @@ -321,9 +332,6 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // Update state. updateState(State::Disassembled); - // Print the function in the new state. 
- DEBUG(print(dbgs(), /* PrintInstructions = */ true)); - return true; } @@ -415,7 +423,7 @@ bool BinaryFunction::buildCFG() { } // Intermediate dump. - DEBUG(print(dbgs(), /* PrintInstructions = */ true)); + DEBUG(print(dbgs(), "after creating basic blocks")); // TODO: handle properly calls to no-return functions, // e.g. exit(3), etc. Otherwise we'll see a false fall-through @@ -486,9 +494,6 @@ bool BinaryFunction::buildCFG() { // Update the state. CurrentState = State::CFG; - // Print the function in the new state. - DEBUG(print(dbgs(), /* PrintInstructions = */ true)); - return true; } @@ -563,7 +568,7 @@ void BinaryFunction::inferFallThroughCounts() { return; } -void BinaryFunction::optimizeLayout(bool DumpLayout) { +void BinaryFunction::optimizeLayout() { // Bail if no profiling information or if empty if (getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE || BasicBlocksLayout.empty()) { @@ -572,11 +577,9 @@ void BinaryFunction::optimizeLayout(bool DumpLayout) { // Work on optimal solution if problem is small enough if (BasicBlocksLayout.size() <= FUNC_SIZE_THRESHOLD) - return solveOptimalLayout(DumpLayout); + return solveOptimalLayout(); - if (DumpLayout) { - dbgs() << "running block layout heuristics on " << getName() << "\n"; - } + DEBUG(dbgs() << "running block layout heuristics on " << getName() << "\n"); // Greedy heuristic implementation for the TSP, applied to BB layout. Try to // maximize weight during a path traversing all BBs. 
In this way, we will @@ -711,33 +714,14 @@ void BinaryFunction::optimizeLayout(bool DumpLayout) { Unvisited.end()); fixBranches(); - - if (DumpLayout) { - dbgs() << "original BB order is: "; - auto Sep = ""; - for (auto &BB : BasicBlocks) { - dbgs() << Sep << BB.getName(); - Sep = ","; - } - dbgs() << "\nnew order is: "; - Sep = ""; - for (auto BB : BasicBlocksLayout) { - dbgs() << Sep << BB->getName(); - Sep = ","; - } - dbgs() << "\n"; - print(dbgs(), /* PrintInstructions = */ true); - } } -void BinaryFunction::solveOptimalLayout(bool DumpLayout) { +void BinaryFunction::solveOptimalLayout() { std::vector> Weight; std::map BBToIndex; std::vector IndexToBB; - if (DumpLayout) { - dbgs() << "finding optimal block layout for " << getName() << "\n"; - } + DEBUG(dbgs() << "finding optimal block layout for " << getName() << "\n"); unsigned N = BasicBlocksLayout.size(); // Populating weight map and index map @@ -833,23 +817,6 @@ void BinaryFunction::solveOptimalLayout(bool DumpLayout) { } fixBranches(); - - if (DumpLayout) { - dbgs() << "original BB order is: "; - auto Sep = ""; - for (auto &BB : BasicBlocks) { - dbgs() << Sep << BB.getName(); - Sep = ","; - } - dbgs() << "\nnew order is: "; - Sep = ""; - for (auto BB : BasicBlocksLayout) { - dbgs() << Sep << BB->getName(); - Sep = ","; - } - dbgs() << "\n"; - print(dbgs(), /* PrintInstructions = */ true); - } } const BinaryBasicBlock * diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 829de7b8c1a6..3aa970d9aa36 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -202,7 +202,7 @@ class BinaryFunction { /// Perform optimal code layout based on edge frequencies making necessary /// adjustments to instructions at the end of basic blocks. - void optimizeLayout(bool DumpLayout); + void optimizeLayout(); /// Dynamic programming implementation for the TSP, applied to BB layout. Find /// the optimal way to maximize weight during a path traversing all BBs. 
In @@ -210,7 +210,7 @@ class BinaryFunction { /// /// Uses exponential amount of memory on the number of basic blocks and should /// only be used for small functions. - void solveOptimalLayout(bool DumpLayout); + void solveOptimalLayout(); /// View CFG in graphviz program void viewGraph(); @@ -310,12 +310,13 @@ class BinaryFunction { /// Dump function information to debug output. If \p PrintInstructions /// is true - include instruction disassembly. - void dump(bool PrintInstructions = false) const { - print(dbgs(), PrintInstructions); + void dump(std::string Annotation = "", bool PrintInstructions = true) const { + print(dbgs(), Annotation, PrintInstructions); } /// Print function information to the \p OS stream. - void print(raw_ostream &OS, bool PrintInstructions = false) const; + void print(raw_ostream &OS, std::string Annotation = "", + bool PrintInstructions = true) const; void addInstruction(uint64_t Offset, MCInst &&Instruction) { Instructions.emplace(Offset, std::forward(Instruction)); diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 20502e3e2865..7e920040eac5 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -100,16 +100,60 @@ ReorderBlocks("reorder-blocks", cl::Optional); static cl::opt -DumpData("dump-data", cl::desc("dump parsed flo data (debugging)"), +DumpData("dump-data", cl::desc("dump parsed flo data and exit (debugging)"), cl::Hidden); static cl::opt -DumpFunctions("dump-functions", cl::desc("dump parsed functions (debugging)"), - cl::Hidden); +PrintAll("print-all", cl::desc("print functions after each stage"), + cl::Hidden); + +static cl::opt +PrintCFG("print-cfg", cl::desc("print functions after CFG construction"), + cl::Hidden); + +static cl::opt +PrintUCE("print-uce", + cl::desc("print functions after unreachable code elimination"), + cl::Hidden); static cl::opt -DumpLayout("dump-layout", cl::desc("dump parsed flo data (debugging)"), - cl::Hidden); +PrintDisasm("print-disasm", cl::desc("print function after disassembly"), + 
cl::Hidden); + +static cl::opt +PrintReordered("print-reordered", + cl::desc("print functions after layout optimization"), + cl::Hidden); + + +// Check against lists of functions from options if we should +// optimize the function with a given name. +bool shouldProcess(StringRef FunctionName) { + bool IsValid = true; + if (!FunctionNames.empty()) { + IsValid = false; + for (auto &Name : FunctionNames) { + if (FunctionName == Name) { + IsValid = true; + break; + } + } + } + if (!IsValid) + return false; + + if (!SkipFunctionNames.empty()) { + for (auto &Name : SkipFunctionNames) { + if (FunctionName == Name) { + IsValid = false; + break; + } + } + } + + return IsValid; +} + } // namespace opts static StringRef ToolName; @@ -406,6 +450,12 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { for (auto &BFI : BinaryFunctions) { BinaryFunction &Function = BFI.second; + if (!opts::shouldProcess(Function.getName())) { + DEBUG(dbgs() << "FLO: skipping processing function " << Function.getName() + << " per user request.\n"); + continue; + } + SectionRef Section = Function.getSection(); assert(Section.containsSymbol(Function.getSymbol()) && "symbol not in section"); @@ -453,15 +503,16 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { if (!Function.disassemble(FunctionData)) continue; + if (opts::PrintAll || opts::PrintDisasm) + Function.print(errs(), "after disassembly"); + if (!Function.buildCFG()) continue; - if (opts::DumpFunctions) - Function.print(errs(), true); - } // Iterate over all functions + if (opts::PrintAll || opts::PrintCFG) + Function.print(errs(), "after building cfg"); - if (opts::DumpFunctions) - return; + } // Iterate over all functions // Run optimization passes. 
// @@ -469,6 +520,10 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { bool NagUser = true; for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; + + if (!opts::shouldProcess(Function.getName())) + continue; + // Detect and eliminate unreachable basic blocks. We could have those // filled with nops and they are used for alignment. // @@ -500,18 +555,22 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { } } - if (unsigned Count = Function.eraseDeadBBs(Reachable)) { - outs() << "FLO: Removed " << Count - << " dead basic block(s) in function " << Function.getName() - << "\n"; + auto Count = Function.eraseDeadBBs(Reachable); + if (Count) { + DEBUG(dbgs() << "FLO: Removed " << Count + << " dead basic block(s) in function " + << Function.getName() << '\n'); } - DEBUG(dbgs() << "*** After unreachable block elimination ***\n"); - DEBUG(Function.print(dbgs(), /* PrintInstructions = */ true)); + if (opts::PrintAll || opts::PrintUCE) + Function.print(errs(), "after unreachable code elimination"); } if (opts::ReorderBlocks) { - BFI.second.optimizeLayout(opts::DumpLayout); + Function.optimizeLayout(); + + if (opts::PrintAll || opts::PrintReordered) + Function.print(errs(), "after reordering blocks"); } } @@ -560,30 +619,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { if (!Function.isSimple()) continue; - // Check against lists of functions from options if we should - // optimize the function. 
- bool IsValid = true; - if (!opts::FunctionNames.empty()) { - IsValid = false; - for (auto &Name : opts::FunctionNames) { - if (Function.getName() == Name) { - IsValid = true; - break; - } - } - } - if (!IsValid) - continue; - - if (!opts::SkipFunctionNames.empty()) { - for (auto &Name : opts::SkipFunctionNames) { - if (Function.getName() == Name) { - IsValid = false; - break; - } - } - } - if (!IsValid) + if (!opts::shouldProcess(Function.getName())) continue; DEBUG(dbgs() << "FLO: generating code for function \"" From c0658f77025f9db2caf3f6bf931c094d2196f13b Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 27 Oct 2015 03:04:58 -0700 Subject: [PATCH 028/904] Fixes priority queue ordering in llvm-flo block reordering Summary: Fixes a bug which caused the block reordering heuristic to put in the same cluster hot basic blocks and cold basic blocks, increasing I-cache misses. (cherry picked from commit 4c37246748c1f269dabed6b686c0090c1364bedd) --- bolt/BinaryFunction.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 3fb1bb149844..868cc70a89c7 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -590,7 +590,7 @@ void BinaryFunction::optimizeLayout() { std::map Weight; // Define a comparison function to establish SWO between edges - auto Comp = [&Weight](EdgeTy A, EdgeTy B) { return Weight[A] > Weight[B]; }; + auto Comp = [&Weight](EdgeTy A, EdgeTy B) { return Weight[A] < Weight[B]; }; std::priority_queue, decltype(Comp)> Queue(Comp); typedef std::vector ClusterTy; From eb653cc1cd786617d14e838b10b6553a0375edca Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 23 Oct 2015 09:38:26 -0700 Subject: [PATCH 029/904] Implement two cluster layout heuristics Summary: Pettis' paper on block layout (PLDI'90) suggests we should order clusters (or chains, using the paper terminology) using a specific criterion. 
This patch implements two distinct ideas for cluster layout that can be activated using different command-line flags. The first one reflects Pettis' ideas on minimizing branch mispredictions and the second one is targeted at reducing I-cache misses, described in the Ispike paper (CGO'04). (cherry picked from commit dc997d6b8cebb289ef64867061d1be07ff369dba) --- bolt/BinaryBasicBlock.h | 1 + bolt/BinaryFunction.cpp | 233 ++++++++++++++++++++++++++-------------- bolt/BinaryFunction.h | 15 ++- bolt/llvm-flo.cpp | 29 +++-- 4 files changed, 190 insertions(+), 88 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 568c5f3ea034..5ee41720b086 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -94,6 +94,7 @@ class BinaryBasicBlock { typedef std::reverse_iterator reverse_iterator; bool empty() const { return Instructions.empty(); } + unsigned size() const { return (unsigned)Instructions.size(); } MCInst &front() { return Instructions.front(); } MCInst &back() { return Instructions.back(); } const MCInst &front() const { return Instructions.front(); } diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 868cc70a89c7..6905f51ecb44 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -568,7 +568,7 @@ void BinaryFunction::inferFallThroughCounts() { return; } -void BinaryFunction::optimizeLayout() { +void BinaryFunction::optimizeLayout(HeuristicPriority Priority) { // Bail if no profiling information or if empty if (getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE || BasicBlocksLayout.empty()) { @@ -598,9 +598,18 @@ void BinaryFunction::optimizeLayout() { std::vector Clusters; BBToClusterMapTy BBToClusterMap; - // Populating priority queue with all edges + // Encode relative weights between two clusters + std::vector> ClusterEdges; + ClusterEdges.resize(BasicBlocksLayout.size()); + for (auto BB : BasicBlocksLayout) { - BBToClusterMap[BB] = -1; // Mark as unmapped + // Create a cluster for this 
BB + uint32_t I = Clusters.size(); + Clusters.emplace_back(); + auto &Cluster = Clusters.back(); + Cluster.push_back(BB); + BBToClusterMap[BB] = I; + // Populate priority queue with edges auto BI = BB->BranchInfo.begin(); for (auto &I : BB->successors()) { if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) @@ -610,13 +619,6 @@ void BinaryFunction::optimizeLayout() { } } - // Start a cluster with the entry point - BinaryBasicBlock *Entry = *BasicBlocksLayout.begin(); - Clusters.emplace_back(); - auto &EntryCluster = Clusters.back(); - EntryCluster.push_back(Entry); - BBToClusterMap[Entry] = 0; - // Grow clusters in a greedy fashion while (!Queue.empty()) { auto elmt = Queue.top(); @@ -624,95 +626,166 @@ void BinaryFunction::optimizeLayout() { BinaryBasicBlock *BBSrc = elmt.first; BinaryBasicBlock *BBDst = elmt.second; - int I = 0, J = 0; // Case 1: BBSrc and BBDst are the same. Ignore this edge - if (BBSrc == BBDst || BBDst == Entry) + if (BBSrc == BBDst || BBDst == *BasicBlocksLayout.begin()) continue; - // Case 2: Both BBSrc and BBDst are already allocated - if ((I = BBToClusterMap[BBSrc]) != -1 && - (J = BBToClusterMap[BBDst]) != -1) { - // Case 2a: If they are already allocated at the same cluster, ignore - if (I == J) - continue; - auto &ClusterA = Clusters[I]; - auto &ClusterB = Clusters[J]; - if (ClusterA.back() == BBSrc && ClusterB.front() == BBDst) { - // Case 2b: BBSrc is at the end of a cluster and BBDst is at the start, - // allowing us to merge two clusters - for (auto BB : ClusterB) - BBToClusterMap[BB] = I; - ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end()); - ClusterB.clear(); - } else { - // Case 2c: Both BBSrc and BBDst are allocated in positions we cannot - // merge them, so we ignore this edge. 
- } - continue; - } + int I = BBToClusterMap[BBSrc]; + int J = BBToClusterMap[BBDst]; - // Case 3: BBSrc is already allocated in a cluster - if ((I = BBToClusterMap[BBSrc]) != -1) { - auto &Cluster = Clusters[I]; - if (Cluster.back() == BBSrc) { - // Case 3a: BBSrc is allocated at the end of this cluster. We put - // BBSrc and BBDst together. - Cluster.push_back(BBDst); - BBToClusterMap[BBDst] = I; - } else { - // Case 3b: We cannot put BBSrc and BBDst in consecutive positions, - // so we ignore this edge. - } + // Case 2: If they are already allocated at the same cluster, just increase + // the weight of this cluster + if (I == J) { + ClusterEdges[I][I] += Weight[elmt]; continue; } - // Case 4: BBSrc is not in a cluster, but BBDst is - if ((I = BBToClusterMap[BBDst]) != -1) { - auto &Cluster = Clusters[I]; - if (Cluster.front() == BBDst) { - // Case 4a: BBDst is allocated at the start of this cluster. We put - // BBSrc and BBDst together. - Cluster.insert(Cluster.begin(), BBSrc); - BBToClusterMap[BBSrc] = I; - } else { - // Case 4b: We cannot put BBSrc and BBDst in consecutive positions, - // so we ignore this edge. - } - continue; + auto &ClusterA = Clusters[I]; + auto &ClusterB = Clusters[J]; + if (ClusterA.back() == BBSrc && ClusterB.front() == BBDst) { + // Case 3: BBSrc is at the end of a cluster and BBDst is at the start, + // allowing us to merge two clusters + for (auto BB : ClusterB) + BBToClusterMap[BB] = I; + ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end()); + ClusterB.clear(); + // Iterate through all inter-cluster edges and transfer edges targeting + // cluster B to cluster A. + // It is bad to have to iterate though all edges when we could have a list + // of predecessors for cluster B. However, it's not clear if it is worth + // the added code complexity to create a data structure for clusters that + // maintains a list of predecessors. Maybe change this if it becomes a + // deal breaker. 
+ for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K) + ClusterEdges[K][I] += ClusterEdges[K][J]; + } else { + // Case 4: Both BBSrc and BBDst are allocated in positions we cannot + // merge them. Annotate the weight of this edge in the weight between + // clusters to help us decide ordering between these clusters. + ClusterEdges[I][J] += Weight[elmt]; } + } - // Case 5: Both BBSrc and BBDst are unallocated, so we create a new cluster - // with them - I = Clusters.size(); - Clusters.emplace_back(); - auto &Cluster = Clusters.back(); - Cluster.push_back(BBSrc); - Cluster.push_back(BBDst); - BBToClusterMap[BBSrc] = I; - BBToClusterMap[BBDst] = I; + std::vector Order; // Cluster layout order + + // Here we have 3 conflicting goals as to how to layout clusters. If we want + // to minimize jump offsets, we should put clusters with heavy inter-cluster + // dependence as close as possible. If we want to maximize the probability + // that all inter-cluster edges are predicted as not-taken, we should enforce + // a topological order to make targets appear after sources, creating forward + // branches. If we want to separate hot from cold blocks to maximize the + // probability that unfrequently executed code doesn't pollute the cache, we + // should put clusters in descending order of hotness. 
+ std::vector AvgFreq; + AvgFreq.resize(Clusters.size(), 0.0); + for (uint32_t I = 1, E = Clusters.size(); I < E; ++I) { + double Freq = 0.0; + for (auto BB : Clusters[I]) { + if (!BB->empty()) + Freq += BB->getExecutionCount() / BB->size(); + } + AvgFreq[I] = Freq; } - // Create an extra cluster for unvisited basic blocks - std::vector Unvisited; - for (auto BB : BasicBlocksLayout) { - if (BBToClusterMap[BB] == -1) { - Unvisited.push_back(BB); - } + switch(Priority) { + case HP_NONE: { + for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) + if (!Clusters[I].empty()) + Order.push_back(I); + break; + } + case HP_BRANCH_PREDICTOR: { + // Do a topological sort for clusters, prioritizing frequently-executed BBs + // during the traversal. + std::stack Stack; + std::vector Status; + std::vector Parent; + Status.resize(Clusters.size(), 0); + Parent.resize(Clusters.size(), 0); + constexpr uint32_t STACKED = 1; + constexpr uint32_t VISITED = 2; + Status[0] = STACKED; + Stack.push(0); + while (!Stack.empty()) { + uint32_t I = Stack.top(); + if (!(Status[I] & VISITED)) { + Status[I] |= VISITED; + // Order successors by weight + auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) { + return ClusterEdges[I][A] > ClusterEdges[I][B]; + }; + std::priority_queue, + decltype(ClusterComp)> SuccQueue(ClusterComp); + for (auto &Target: ClusterEdges[I]) { + if (Target.second > 0 && !(Status[Target.first] & STACKED) && + !Clusters[Target.first].empty()) { + Parent[Target.first] = I; + Status[Target.first] = STACKED; + SuccQueue.push(Target.first); + } + } + while (!SuccQueue.empty()) { + Stack.push(SuccQueue.top()); + SuccQueue.pop(); + } + continue; + } + // Already visited this node + Stack.pop(); + Order.push_back(I); + } + std::reverse(Order.begin(), Order.end()); + // Put unreachable clusters at the end + for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) + if (!(Status[I] & VISITED) && !Clusters[I].empty()) + Order.push_back(I); + + // Sort nodes with equal 
precedence + auto Beg = Order.begin(); + // Don't reorder the first cluster, which contains the function entry point + ++Beg; + std::stable_sort(Beg, Order.end(), + [&AvgFreq, &Parent](uint32_t A, uint32_t B) { + uint32_t P = Parent[A]; + while (Parent[P] != 0) { + if (Parent[P] == B) + return false; + P = Parent[P]; + } + P = Parent[B]; + while (Parent[P] != 0) { + if (Parent[P] == A) + return true; + P = Parent[P]; + } + return AvgFreq[A] > AvgFreq[B]; + }); + break; + } + case HP_CACHE_UTILIZATION: { + // Order clusters based on average instruction execution frequency + for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) + if (!Clusters[I].empty()) + Order.push_back(I); + auto Beg = Order.begin(); + // Don't reorder the first cluster, which contains the function entry point + ++Beg; + std::stable_sort(Beg, Order.end(), [&AvgFreq](uint32_t A, uint32_t B) { + return AvgFreq[A] > AvgFreq[B]; + }); + + break; + } } - // Define final function layout based on clusters BasicBlocksLayout.clear(); - for (auto &Cluster : Clusters) { + for (auto I : Order) { + auto &Cluster = Clusters[I]; BasicBlocksLayout.insert(BasicBlocksLayout.end(), Cluster.begin(), Cluster.end()); } - // Finalize layout with BBs that weren't assigned to any cluster, preserving - // their relative order - BasicBlocksLayout.insert(BasicBlocksLayout.end(), Unvisited.begin(), - Unvisited.end()); - fixBranches(); } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 3aa970d9aa36..2de397d21425 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -51,6 +51,19 @@ class BinaryFunction { Assembled, /// Function has been assembled in memory }; + // Choose which strategy should the block layout heuristic prioritize when + // facing conflicting goals. + enum HeuristicPriority : char { + HP_NONE = 0, + // HP_BRANCH_PREDICTOR is an implementation of what is suggested in Pettis' + // paper (PLDI '90) about block reordering, trying to minimize branch + // mispredictions. 
+ HP_BRANCH_PREDICTOR, + // HP_CACHE_UTILIZATION pigbacks on the idea from Ispike paper (CGO '04) + // that suggests putting frequently executed chains first in the layout. + HP_CACHE_UTILIZATION, + }; + static constexpr uint64_t COUNT_NO_PROFILE = std::numeric_limits::max(); // Function size, in number of BBs, above which we fallback to a heuristic @@ -202,7 +215,7 @@ class BinaryFunction { /// Perform optimal code layout based on edge frequencies making necessary /// adjustments to instructions at the end of basic blocks. - void optimizeLayout(); + void optimizeLayout(HeuristicPriority Priority); /// Dynamic programming implementation for the TSP, applied to BB layout. Find /// the optimal way to maximize weight during a path traversing all BBs. In diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 7e920040eac5..7446c63c8f03 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -94,10 +94,11 @@ EliminateUnreachable("eliminate-unreachable", cl::desc("eliminate unreachable code"), cl::Optional); -static cl::opt -ReorderBlocks("reorder-blocks", - cl::desc("redo basic block layout based on profiling data"), - cl::Optional); +static cl::opt ReorderBlocks( + "reorder-blocks", + cl::desc("redo basic block layout based on profiling data with a specific " + "priority (none, branch-predictor or cache)"), + cl::value_desc("priority"), cl::init("disable")); static cl::opt DumpData("dump-data", cl::desc("dump parsed flo data and exit (debugging)"), @@ -518,6 +519,15 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { // // FIXME: use real optimization passes. 
bool NagUser = true; + if (opts::ReorderBlocks != "" && + opts::ReorderBlocks != "disable" && + opts::ReorderBlocks != "none" && + opts::ReorderBlocks != "branch-predictor" && + opts::ReorderBlocks != "cache") { + errs() << ToolName << ": Unrecognized block reordering priority \"" + << opts::ReorderBlocks << "\".\n"; + exit(1); + } for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; @@ -566,9 +576,14 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { Function.print(errs(), "after unreachable code elimination"); } - if (opts::ReorderBlocks) { - Function.optimizeLayout(); - + if (opts::ReorderBlocks != "disable") { + if (opts::ReorderBlocks == "branch-predictor") { + BFI.second.optimizeLayout(BinaryFunction::HP_BRANCH_PREDICTOR); + } else if (opts::ReorderBlocks == "cache") { + BFI.second.optimizeLayout(BinaryFunction::HP_CACHE_UTILIZATION); + } else { + BFI.second.optimizeLayout(BinaryFunction::HP_NONE); + } if (opts::PrintAll || opts::PrintReordered) Function.print(errs(), "after reordering blocks"); } From 3e350bf3c9e670facd4cecf119e4b464ec7a0a5e Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 26 Oct 2015 15:00:56 -0700 Subject: [PATCH 030/904] Extract non-taken branch frequencies from LBR Summary: Previously, we inferred all non-taken branch frequencies with the information we had for taken branches. This patch teaches perf2flo and llvm-flo how to read and incorporate non-taken branch frequencies directly from the traces available in LBR data and by disassembling the binary. It still leaves the inference engine untouched in case we need it to fill out other fall-throughs. 
(cherry picked from commit 2f1ae8bf894d2ad7ef3bb71fb4eb1b11020827e9) --- bolt/BinaryFunction.cpp | 50 ++++++++++++++++++++++++++++++++++++----- bolt/BinaryFunction.h | 7 ++++++ 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 6905f51ecb44..aba808688d0b 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -233,6 +233,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // If the target *is* the function address it could be either a branch // or a recursive call. bool IsCall = MIA->isCall(Instruction); + bool IsCondBranch = MIA->isConditionalBranch(Instruction); MCSymbol *TargetSymbol{nullptr}; uint64_t TargetOffset{0}; @@ -286,6 +287,10 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // Add local branch info. LocalBranches.push_back({Offset, TargetOffset}); } + if (IsCondBranch) { + // Add fallthrough branch info. + FTBranches.push_back({Offset, Offset + Size}); + } } else { // Indirect call @@ -430,7 +435,6 @@ bool BinaryFunction::buildCFG() { // blocks. 
for (auto &Branch : LocalBranches) { - DEBUG(dbgs() << "registering branch [0x" << Twine::utohexstr(Branch.first) << "] -> [0x" << Twine::utohexstr(Branch.second) << "]\n"); BinaryBasicBlock *FromBB = getBasicBlockContainingOffset(Branch.first); @@ -438,12 +442,12 @@ bool BinaryFunction::buildCFG() { BinaryBasicBlock *ToBB = getBasicBlockAtOffset(Branch.second); assert(ToBB && "cannot find BB containing TO branch"); - if (std::error_code EC = BranchDataOrErr.getError()) { + if (BranchDataOrErr.getError()) { FromBB->addSuccessor(ToBB); } else { const FuncBranchData &BranchData = BranchDataOrErr.get(); auto BranchInfoOrErr = BranchData.getBranch(Branch.first, Branch.second); - if (std::error_code EC = BranchInfoOrErr.getError()) { + if (BranchInfoOrErr.getError()) { FromBB->addSuccessor(ToBB); } else { const BranchInfo &BInfo = BranchInfoOrErr.get(); @@ -452,7 +456,32 @@ bool BinaryFunction::buildCFG() { } } - // Add fall-through branches. + for (auto &Branch : FTBranches) { + DEBUG(dbgs() << "registering fallthrough [0x" + << Twine::utohexstr(Branch.first) << "] -> [0x" + << Twine::utohexstr(Branch.second) << "]\n"); + BinaryBasicBlock *FromBB = getBasicBlockContainingOffset(Branch.first); + assert(FromBB && "cannot find BB containing FROM branch"); + BinaryBasicBlock *ToBB = getBasicBlockAtOffset(Branch.second); + // We have a fall-through that does not point to another BB, ignore it as + // it may happen in cases where we have a BB finished by two branches. 
+ if (ToBB == nullptr) + continue; + + // Does not add a successor if we can't find profile data, leave it to the + // inference pass to guess its frequency + if (!BranchDataOrErr.getError()) { + const FuncBranchData &BranchData = BranchDataOrErr.get(); + auto BranchInfoOrErr = BranchData.getBranch(Branch.first, Branch.second); + if (!BranchInfoOrErr.getError()) { + const BranchInfo &BInfo = BranchInfoOrErr.get(); + FromBB->addSuccessor(ToBB, BInfo.Branches, BInfo.Mispreds); + } + } + } + + // Add fall-through branches (except for non-taken conditional branches with + // profile data, which were already accounted for in LocalBranches). PrevBB = nullptr; bool IsPrevFT = false; // Is previous block a fall-through. for (auto &BB : BasicBlocks) { @@ -469,7 +498,8 @@ bool BinaryFunction::buildCFG() { } else if (BB.succ_size() == 1) { IsPrevFT = MIA->isConditionalBranch(LastInst) ? true : false; } else { - // Either ends with 2 branches, or with an indirect jump. + // Ends with 2 branches, with an indirect jump or it is a conditional + // branch whose frequency has been inferred from LBR IsPrevFT = false; } @@ -490,6 +520,7 @@ bool BinaryFunction::buildCFG() { clearInstructions(); clearLabels(); clearLocalBranches(); + clearFTBranches(); // Update the state. 
CurrentState = State::CFG; @@ -666,6 +697,15 @@ void BinaryFunction::optimizeLayout(HeuristicPriority Priority) { ClusterEdges[I][J] += Weight[elmt]; } } + DEBUG(for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { + dbgs() << "Cluster number " << I << ": "; + auto Sep = ""; + for (auto BB : Clusters[I]) { + dbgs() << Sep << BB->getName(); + Sep = ", "; + } + dbgs() << "\n"; + }); std::vector Order; // Cluster layout order diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 2de397d21425..5cbc1d953422 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -139,6 +139,12 @@ class BinaryFunction { return *this; } + BinaryFunction &clearFTBranches() { + LocalBranchesListType TempList; + FTBranches.swap(TempList); + return *this; + } + BinaryFunction &updateState(BinaryFunction::State State) { CurrentState = State; return *this; @@ -150,6 +156,7 @@ class BinaryFunction { /// Storage for all local branches in the function (non-fall-throughs). using LocalBranchesListType = std::vector>; LocalBranchesListType LocalBranches; + LocalBranchesListType FTBranches; /// Map offset in the function to a local label. using LabelsMapType = std::map; From ef91e5391348443ab6449bf8dbfa895a4677de37 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 2 Nov 2015 11:50:53 -0700 Subject: [PATCH 031/904] Added function to parse and dump .gcc_except_table Summary: Use '-print-exceptions' option to dump contents of .gcc_except_table. 
(cherry picked from commit 9f3bb74d2b5c75f3f7dd9492e94627bd30acba6e) --- bolt/CMakeLists.txt | 1 + bolt/Exceptions.cpp | 264 ++++++++++++++++++++++++++++++++++++++++++++ bolt/Exceptions.h | 25 +++++ bolt/llvm-flo.cpp | 17 +++ 4 files changed, 307 insertions(+) create mode 100644 bolt/Exceptions.cpp create mode 100644 bolt/Exceptions.h diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index a66505c5d097..cfb1a91da32b 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -16,4 +16,5 @@ add_llvm_tool(llvm-flo BinaryContext.cpp BinaryFunction.cpp DataReader.cpp + Exceptions.cpp ) diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp new file mode 100644 index 000000000000..3477af2561d5 --- /dev/null +++ b/bolt/Exceptions.cpp @@ -0,0 +1,264 @@ +//===-- Exceptions.cpp - Helpers for processing C++ exceptions ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// Some of the code is taken from examples/ExceptionDemo +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Dwarf.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" + +#undef DEBUG_TYPE +#define DEBUG_TYPE "flo-exceptions" + +STATISTIC(NumLSDAs, "Number of all LSDAs"); +STATISTIC(NumTrivialLSDAs, + "Number of LSDAs with single call site without landing pad or action"); + +using namespace llvm::dwarf; + +namespace llvm { +namespace flo { + +namespace opts { + +static cl::opt +PrintExceptions("print-exceptions", + cl::desc("print exception handling data"), + cl::Hidden); + +} // namespace opts + +namespace { + +/// Read an unsigned LEB128 value from data, advancing it past the value. +uintptr_t readULEB128(const uint8_t *&Data) { + uintptr_t Result = 0; + uintptr_t Shift = 0; + unsigned char Byte; + + do { + Byte = *Data++; + Result |= (Byte & 0x7f) << Shift; + Shift += 7; + } while (Byte & 0x80); + + return Result; +} + +/// Read a signed LEB128 value from data, advancing it past the value. +uintptr_t readSLEB128(const uint8_t *&Data) { + uintptr_t Result = 0; + uintptr_t Shift = 0; + unsigned char Byte; + + do { + Byte = *Data++; + Result |= (Byte & 0x7f) << Shift; + Shift += 7; + } while (Byte & 0x80); + + if ((Byte & 0x40) && (Shift < (sizeof(Result) << 3))) { + Result |= (~0 << Shift); + } + + return Result; +} + +/// Read and return a T from data, advancing it past the read item. 
+template +T readValue(const uint8_t *&Data) { + T Val; + memcpy(&Val, Data, sizeof(T)); + Data += sizeof(T); + return Val; +} + +/// Read an encoded DWARF value from data, advancing it past any data read. This +/// function was adapted from the ExceptionDemo.cpp example in llvm. +uintptr_t readEncodedPointer(const uint8_t *&Data, uint8_t Encoding) { + uintptr_t Result = 0; + auto const Start = Data; + + if (Encoding == DW_EH_PE_omit) + return Result; + + // first get value + switch (Encoding & 0x0F) { + case DW_EH_PE_absptr: + Result = readValue(Data); + break; + case DW_EH_PE_uleb128: + Result = readULEB128(Data); + break; + case DW_EH_PE_sleb128: + Result = readSLEB128(Data); + break; + case DW_EH_PE_udata2: + Result = readValue(Data); + break; + case DW_EH_PE_udata4: + Result = readValue(Data); + break; + case DW_EH_PE_udata8: + Result = readValue(Data); + break; + case DW_EH_PE_sdata2: + Result = readValue(Data); + break; + case DW_EH_PE_sdata4: + Result = readValue(Data); + break; + case DW_EH_PE_sdata8: + Result = readValue(Data); + break; + default: + assert(0 && "not implemented"); + } + + // then add relative offset + switch (Encoding & 0x70) { + case DW_EH_PE_absptr: + // do nothing + break; + case DW_EH_PE_pcrel: + Result += reinterpret_cast(Start); + break; + case DW_EH_PE_textrel: + case DW_EH_PE_datarel: + case DW_EH_PE_funcrel: + case DW_EH_PE_aligned: + default: + assert(0 && "not implemented"); + } + + // then apply indirection + if (Encoding & 0x80 /*DW_EH_PE_indirect*/) { + Result = *((uintptr_t*)Result); + } + + return Result; +} + +} // namespace + +void readLSDA(ArrayRef LSDAData) { + const uint8_t *Ptr = LSDAData.data(); + + while (Ptr < LSDAData.data() + LSDAData.size()) { + uint8_t LPStartEncoding = *Ptr++; + // Some of LSDAs are aligned while other are not. We use the hack below + // to work around 0-filled alignment. However it could also mean + // DW_EH_PE_absptr format. 
+ // + // FIXME: the proper way to parse these tables is to get the pointer + // from .eh_frame and parse one entry at a time. + while (!LPStartEncoding) + LPStartEncoding = *Ptr++; + if (opts::PrintExceptions) { + errs() << "[LSDA at 0x" + << Twine::utohexstr(reinterpret_cast(Ptr-1)) << "]:\n"; + } + + ++NumLSDAs; + bool IsTrivial = true; + + uintptr_t LPStart = 0; + if (LPStartEncoding != DW_EH_PE_omit) { + LPStart = readEncodedPointer(Ptr, LPStartEncoding); + } + + uint8_t TTypeEncoding = *Ptr++; + uintptr_t TTypeEnd = 0; + if (TTypeEncoding != DW_EH_PE_omit) { + TTypeEnd = readULEB128(Ptr); + } + const uint8_t *NextLSDA = Ptr + TTypeEnd; + + if (opts::PrintExceptions) { + errs() << "LPStart Encoding = " << (unsigned)LPStartEncoding << '\n'; + errs() << "LPStart = 0x" << Twine::utohexstr(LPStart) << '\n'; + errs() << "TType Encoding = " << (unsigned)TTypeEncoding << '\n'; + errs() << "TType End = " << TTypeEnd << '\n'; + } + + uint8_t CallSiteEncoding = *Ptr++; + uint32_t CallSiteTableLength = readULEB128(Ptr); + const uint8_t *CallSiteTableStart = Ptr; + const uint8_t *CallSiteTableEnd = CallSiteTableStart + CallSiteTableLength; + const uint8_t *CallSitePtr = CallSiteTableStart; + + if (opts::PrintExceptions) { + errs() << "CallSite Encoding = " << (unsigned)CallSiteEncoding << '\n'; + errs() << "CallSite table length = " << CallSiteTableLength << '\n'; + errs() << '\n'; + } + + unsigned NumCallSites = 0; + while (CallSitePtr < CallSiteTableEnd) { + ++NumCallSites; + uintptr_t Start = readEncodedPointer(CallSitePtr, CallSiteEncoding); + uintptr_t Length = readEncodedPointer(CallSitePtr, CallSiteEncoding); + uintptr_t LandingPad = readEncodedPointer(CallSitePtr, CallSiteEncoding); + + uintptr_t ActionEntry = readULEB128(CallSitePtr); + uint64_t RangeBase = 0; + if (opts::PrintExceptions) { + errs() << "Call Site: [0x" << Twine::utohexstr(RangeBase + Start) + << ", 0x" << Twine::utohexstr(RangeBase + Start + Length) + << "); landing pad: 0x" << 
Twine::utohexstr(LPStart + LandingPad) + << "; action entry: 0x" << Twine::utohexstr(ActionEntry) << "\n"; + } + + if (LandingPad != 0 || ActionEntry != 0) + IsTrivial = false; + } + Ptr = CallSiteTableEnd; + + if (NumCallSites > 1) + IsTrivial = false; + + if (opts::PrintExceptions) + errs() << '\n'; + + if (IsTrivial) + ++NumTrivialLSDAs; + + if (CallSiteTableLength == 0 || TTypeEnd == 0) + continue; + + const uint8_t *ActionPtr = Ptr; + uintptr_t ActionOffset = 0; + do { + uintptr_t ActionType = readULEB128(ActionPtr); + ActionOffset = readULEB128(ActionPtr); + if (opts::PrintExceptions) { + errs() << "ActionType: " << ActionType + << "; ActionOffset: " << ActionOffset << "\n"; + } + } while (ActionOffset != 0); + + if (opts::PrintExceptions) + errs() << '\n'; + + Ptr = NextLSDA; + } +} + +} // namespace flo +} // namespace llvm diff --git a/bolt/Exceptions.h b/bolt/Exceptions.h new file mode 100644 index 000000000000..31e37c6248c2 --- /dev/null +++ b/bolt/Exceptions.h @@ -0,0 +1,25 @@ +//===-- Exceptions.h - Helpers for processing C++ exceptions --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_FLO_EXCEPTIONS_H +#define LLVM_TOOLS_LLVM_FLO_EXCEPTIONS_H + +#include "llvm/ADT/ArrayRef.h" + +namespace llvm { +namespace flo { + +void readLSDA(ArrayRef LSDAData); + +} // namespace flo +} // namespace llvm + +#endif diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 7446c63c8f03..c9bb4b7d90e2 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -17,6 +17,7 @@ #include "BinaryContext.h" #include "BinaryFunction.h" #include "DataReader.h" +#include "Exceptions.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" @@ -447,6 +448,22 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { ); } + // Process special sections. + for (const auto &Section : File->sections()) { + StringRef SectionName; + check_error(Section.getName(SectionName), "cannot get section name"); + StringRef SectionContents; + check_error(Section.getContents(SectionContents), + "cannot get section contents"); + ArrayRef SectionData( + reinterpret_cast(SectionContents.data()), + Section.getSize()); + + if (SectionName == ".gcc_except_table") { + readLSDA(SectionData); + } + } + // Disassemble every function and build it's control flow graph. for (auto &BFI : BinaryFunctions) { BinaryFunction &Function = BFI.second; From 8e2944b280e29392718fc24c002434af2d82ea80 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 3 Nov 2015 14:26:33 -0800 Subject: [PATCH 032/904] Verbose printing of actions from .gcc_except_table Summary: Print actions for exception ranges from .gcc_except_table. Types are printed as names if the name is available from symbol table. 
(cherry picked from commit 24c87f20eee4a281e5b41e9e5f8be08ed4aa9605) --- bolt/Exceptions.cpp | 139 +++++++++++++++++++++++++++++++++++++------- bolt/Exceptions.h | 3 +- bolt/llvm-flo.cpp | 2 +- 3 files changed, 120 insertions(+), 24 deletions(-) diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 3477af2561d5..6722d96aafd9 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "Exceptions.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" @@ -128,7 +129,7 @@ uintptr_t readEncodedPointer(const uint8_t *&Data, uint8_t Encoding) { Result = readValue(Data); break; default: - assert(0 && "not implemented"); + llvm_unreachable("not implemented"); } // then add relative offset @@ -144,7 +145,7 @@ uintptr_t readEncodedPointer(const uint8_t *&Data, uint8_t Encoding) { case DW_EH_PE_funcrel: case DW_EH_PE_aligned: default: - assert(0 && "not implemented"); + llvm_unreachable("not implemented"); } // then apply indirection @@ -157,7 +158,48 @@ uintptr_t readEncodedPointer(const uint8_t *&Data, uint8_t Encoding) { } // namespace -void readLSDA(ArrayRef LSDAData) { +// readLSDA is reading and dumping the whole .gcc_exception_table section +// at once. +// +// .gcc_except_table section contains a set of Language-Specific Data Areas +// which are basically exception handling tables. One LSDA per function. +// One important observation - you can't actually tell which function LSDA +// refers to, and most addresses are relative to the function start. So you +// have to start with parsing .eh_frame entries that refers to LSDA to obtain +// a function context. +// +// The best visual representation of the tables comprising LSDA and relationship +// between them is illustrated at: +// http://mentorembedded.github.io/cxx-abi/exceptions.pdf +// Keep in mind that GCC implementation deviates slightly from that document. 
+// +// To summarize, there are 4 tables in LSDA: call site table, actions table, +// types table, and types index table (indirection). The main table contains +// call site entries. Each call site includes a range that can throw an exception, +// a handler (landing pad), and a reference to an entry in the action table. +// A handler and/or action could be 0. An action entry is in fact a head +// of a list of actions associated with a call site and an action table contains +// all such lists (it could be optimize to share list tails). Each action could be +// either to catch an exception of a given type, to perform a cleanup, or to +// propagate an exception after filtering it out (e.g. to make sure function +// exception specification is not violated). Catch action contains a reference +// to an entry in the type table, and filter action refers to an entry in the +// type index table to encode a set of types to filter. +// +// Call site table follows LSDA header. Action table immediately follows the +// call site table. +// +// Both types table and type index table start at the same location, but they +// grow in opposite directions (types go up, indices go down). The beginning of +// these tables is encoded in LSDA header. Sizes for both of the tables are not +// included anywhere. +// +// For the purpose of rewriting exception handling tables, we can reuse action +// table, types table, and type index table in a binary format when type +// references are hard-coded absolute addresses. We still have to parse all the +// table to determine their size. We have to parse call site table and associate +// discovered information with actual call instructions and landing pad blocks. 
+void readLSDA(ArrayRef LSDAData, BinaryContext &BC) { const uint8_t *Ptr = LSDAData.data(); while (Ptr < LSDAData.data() + LSDAData.size()) { @@ -188,7 +230,6 @@ void readLSDA(ArrayRef LSDAData) { if (TTypeEncoding != DW_EH_PE_omit) { TTypeEnd = readULEB128(Ptr); } - const uint8_t *NextLSDA = Ptr + TTypeEnd; if (opts::PrintExceptions) { errs() << "LPStart Encoding = " << (unsigned)LPStartEncoding << '\n'; @@ -197,11 +238,22 @@ void readLSDA(ArrayRef LSDAData) { errs() << "TType End = " << TTypeEnd << '\n'; } + // Table to store list of indices in type table. Entries are uleb128s values. + auto TypeIndexTableStart = Ptr + TTypeEnd; + + // Offset past the last decoded index. + intptr_t MaxTypeIndexTableOffset = 0; + + // The actual type info table starts at the same location, but grows in + // different direction. Encoding is different too (TTypeEncoding). + auto TypeTableStart = reinterpret_cast(Ptr + TTypeEnd); + uint8_t CallSiteEncoding = *Ptr++; uint32_t CallSiteTableLength = readULEB128(Ptr); const uint8_t *CallSiteTableStart = Ptr; const uint8_t *CallSiteTableEnd = CallSiteTableStart + CallSiteTableLength; const uint8_t *CallSitePtr = CallSiteTableStart; + const uint8_t *ActionTableStart = CallSiteTableEnd; if (opts::PrintExceptions) { errs() << "CallSite Encoding = " << (unsigned)CallSiteEncoding << '\n'; @@ -219,10 +271,67 @@ void readLSDA(ArrayRef LSDAData) { uintptr_t ActionEntry = readULEB128(CallSitePtr); uint64_t RangeBase = 0; if (opts::PrintExceptions) { + auto printType = [&] (int Index, raw_ostream &OS) { + assert(Index > 0 && "only positive indices are valid"); + assert(TTypeEncoding == DW_EH_PE_udata4 && + "only udata4 supported for TTypeEncoding"); + auto TypeAddress = *(TypeTableStart - Index); + if (TypeAddress == 0) { + OS << ""; + return; + } + auto NI = BC.GlobalAddresses.find(TypeAddress); + if (NI != BC.GlobalAddresses.end()) { + OS << NI->second; + } else { + OS << "0x" << Twine::utohexstr(TypeAddress); + } + }; errs() << "Call Site: [0x" 
<< Twine::utohexstr(RangeBase + Start) << ", 0x" << Twine::utohexstr(RangeBase + Start + Length) << "); landing pad: 0x" << Twine::utohexstr(LPStart + LandingPad) << "; action entry: 0x" << Twine::utohexstr(ActionEntry) << "\n"; + if (ActionEntry != 0) { + errs() << " actions: "; + const uint8_t *ActionPtr = ActionTableStart + ActionEntry - 1; + long long ActionType; + long long ActionNext; + auto Sep = ""; + do { + ActionType = readSLEB128(ActionPtr); + auto Self = ActionPtr; + ActionNext = readSLEB128(ActionPtr); + errs() << Sep << "(" << ActionType << ", " << ActionNext << ") "; + if (ActionType == 0) { + errs() << "cleanup"; + } else if (ActionType > 0) { + // It's an index into a type table. + errs() << "catch type "; + printType(ActionType, errs()); + } else { // ActionType < 0 + errs() << "filter exception types "; + auto TSep = ""; + // ActionType is a negative byte offset into uleb128-encoded table + // of indices with base 1. + // E.g. -1 means offset 0, -2 is offset 1, etc. The indices are + // encoded using uleb128 so we cannot directly dereference them. 
+ auto TypeIndexTablePtr = TypeIndexTableStart - ActionType - 1; + while (auto Index = readULEB128(TypeIndexTablePtr)) { + errs() << TSep; + printType(Index, errs()); + TSep = ", "; + } + MaxTypeIndexTableOffset = + std::max(MaxTypeIndexTableOffset, + TypeIndexTablePtr - TypeIndexTableStart); + } + + Sep = "; "; + + ActionPtr = Self + ActionNext; + } while (ActionNext); + errs() << '\n'; + } } if (LandingPad != 0 || ActionEntry != 0) @@ -233,30 +342,16 @@ void readLSDA(ArrayRef LSDAData) { if (NumCallSites > 1) IsTrivial = false; - if (opts::PrintExceptions) - errs() << '\n'; - if (IsTrivial) ++NumTrivialLSDAs; - if (CallSiteTableLength == 0 || TTypeEnd == 0) - continue; - - const uint8_t *ActionPtr = Ptr; - uintptr_t ActionOffset = 0; - do { - uintptr_t ActionType = readULEB128(ActionPtr); - ActionOffset = readULEB128(ActionPtr); - if (opts::PrintExceptions) { - errs() << "ActionType: " << ActionType - << "; ActionOffset: " << ActionOffset << "\n"; - } - } while (ActionOffset != 0); - if (opts::PrintExceptions) errs() << '\n'; - Ptr = NextLSDA; + if (CallSiteTableLength == 0 || TTypeEnd == 0) + continue; + + Ptr = TypeIndexTableStart + MaxTypeIndexTableOffset; } } diff --git a/bolt/Exceptions.h b/bolt/Exceptions.h index 31e37c6248c2..0aca298cb466 100644 --- a/bolt/Exceptions.h +++ b/bolt/Exceptions.h @@ -12,12 +12,13 @@ #ifndef LLVM_TOOLS_LLVM_FLO_EXCEPTIONS_H #define LLVM_TOOLS_LLVM_FLO_EXCEPTIONS_H +#include "BinaryContext.h" #include "llvm/ADT/ArrayRef.h" namespace llvm { namespace flo { -void readLSDA(ArrayRef LSDAData); +void readLSDA(ArrayRef LSDAData, BinaryContext &BC); } // namespace flo } // namespace llvm diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index c9bb4b7d90e2..238e0b6d08bd 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -460,7 +460,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { Section.getSize()); if (SectionName == ".gcc_except_table") { - readLSDA(SectionData); + readLSDA(SectionData, *BC); } } 
From 030a1ca3bcaaab5fee8e29c770228ef2c949bb28 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Thu, 5 Nov 2015 13:37:30 -0800 Subject: [PATCH 033/904] Teach llvm-flo how to read .eh_frame information from binaries Summary: In order to reorder binaries with C++ exceptions, we first need to read DWARF CFI (call frame info) from binaries in a table in the .eh_frame ELF section. This table contains unwinding information we need to be aware of when reordering basic blocks, so as to avoid corrupting it. This patch also cleans up some code from Exceptions.cpp due to a refactoring where we moved some functions to the LLVM's libSupport. (cherry picked from commit dc4244856b28f9fbc0a1741a5eef0914bd03b739) --- bolt/CMakeLists.txt | 1 + bolt/Exceptions.cpp | 113 -------------------------------------------- bolt/llvm-flo.cpp | 12 +++++ 3 files changed, 13 insertions(+), 113 deletions(-) diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index cfb1a91da32b..fc75bcbe9844 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -2,6 +2,7 @@ set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} CodeGen Core + DebugInfoDWARF MC MCDisassembler MCParser diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 6722d96aafd9..18988669853e 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -45,119 +45,6 @@ PrintExceptions("print-exceptions", } // namespace opts -namespace { - -/// Read an unsigned LEB128 value from data, advancing it past the value. -uintptr_t readULEB128(const uint8_t *&Data) { - uintptr_t Result = 0; - uintptr_t Shift = 0; - unsigned char Byte; - - do { - Byte = *Data++; - Result |= (Byte & 0x7f) << Shift; - Shift += 7; - } while (Byte & 0x80); - - return Result; -} - -/// Read a signed LEB128 value from data, advancing it past the value. 
-uintptr_t readSLEB128(const uint8_t *&Data) { - uintptr_t Result = 0; - uintptr_t Shift = 0; - unsigned char Byte; - - do { - Byte = *Data++; - Result |= (Byte & 0x7f) << Shift; - Shift += 7; - } while (Byte & 0x80); - - if ((Byte & 0x40) && (Shift < (sizeof(Result) << 3))) { - Result |= (~0 << Shift); - } - - return Result; -} - -/// Read and return a T from data, advancing it past the read item. -template -T readValue(const uint8_t *&Data) { - T Val; - memcpy(&Val, Data, sizeof(T)); - Data += sizeof(T); - return Val; -} - -/// Read an encoded DWARF value from data, advancing it past any data read. This -/// function was adapted from the ExceptionDemo.cpp example in llvm. -uintptr_t readEncodedPointer(const uint8_t *&Data, uint8_t Encoding) { - uintptr_t Result = 0; - auto const Start = Data; - - if (Encoding == DW_EH_PE_omit) - return Result; - - // first get value - switch (Encoding & 0x0F) { - case DW_EH_PE_absptr: - Result = readValue(Data); - break; - case DW_EH_PE_uleb128: - Result = readULEB128(Data); - break; - case DW_EH_PE_sleb128: - Result = readSLEB128(Data); - break; - case DW_EH_PE_udata2: - Result = readValue(Data); - break; - case DW_EH_PE_udata4: - Result = readValue(Data); - break; - case DW_EH_PE_udata8: - Result = readValue(Data); - break; - case DW_EH_PE_sdata2: - Result = readValue(Data); - break; - case DW_EH_PE_sdata4: - Result = readValue(Data); - break; - case DW_EH_PE_sdata8: - Result = readValue(Data); - break; - default: - llvm_unreachable("not implemented"); - } - - // then add relative offset - switch (Encoding & 0x70) { - case DW_EH_PE_absptr: - // do nothing - break; - case DW_EH_PE_pcrel: - Result += reinterpret_cast(Start); - break; - case DW_EH_PE_textrel: - case DW_EH_PE_datarel: - case DW_EH_PE_funcrel: - case DW_EH_PE_aligned: - default: - llvm_unreachable("not implemented"); - } - - // then apply indirection - if (Encoding & 0x80 /*DW_EH_PE_indirect*/) { - Result = *((uintptr_t*)Result); - } - - return Result; -} - -} // 
namespace - // readLSDA is reading and dumping the whole .gcc_exception_table section // at once. // diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 238e0b6d08bd..02ba000f4943 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -19,6 +19,7 @@ #include "DataReader.h" #include "Exceptions.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" #include "llvm/ExecutionEngine/RTDyldMemoryManager.h" @@ -105,6 +106,10 @@ static cl::opt DumpData("dump-data", cl::desc("dump parsed flo data and exit (debugging)"), cl::Hidden); +static cl::opt +DumpEHFrame("dump-eh-frame", cl::desc("dump parsed .eh_frame (debugging)"), + cl::Hidden); + static cl::opt PrintAll("print-all", cl::desc("print functions after each stage"), cl::Hidden); @@ -464,6 +469,13 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { } } + // Process debug sections. + std::unique_ptr DwCtx(new DWARFContextInMemory(*File)); + if (opts::DumpEHFrame) { + const auto *Frames = DwCtx->getEHFrame(); + Frames->dump(outs()); + } + // Disassemble every function and build it's control flow graph. for (auto &BFI : BinaryFunctions) { BinaryFunction &Function = BFI.second; From 1f8e082ab450eac245731efe1895c98664df0cd4 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 9 Nov 2015 12:27:13 -0800 Subject: [PATCH 034/904] Parse whole contents of .gcc_except_table even if we are not printing. Summary: We need to parse the whole contents of .gcc_except_table even if we are not printing exceptions. Otherwise we are missing type index table and miscalculate the size of the current table. 
(cherry picked from commit 1e60b712aaeaa020b29c8691cfd6d33c81815f82) --- bolt/Exceptions.cpp | 71 +++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 18988669853e..21c3bc8dc318 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -158,6 +158,12 @@ void readLSDA(ArrayRef LSDAData, BinaryContext &BC) { uintptr_t ActionEntry = readULEB128(CallSitePtr); uint64_t RangeBase = 0; if (opts::PrintExceptions) { + errs() << "Call Site: [0x" << Twine::utohexstr(RangeBase + Start) + << ", 0x" << Twine::utohexstr(RangeBase + Start + Length) + << "); landing pad: 0x" << Twine::utohexstr(LPStart + LandingPad) + << "; action entry: 0x" << Twine::utohexstr(ActionEntry) << "\n"; + } + if (ActionEntry != 0) { auto printType = [&] (int Index, raw_ostream &OS) { assert(Index > 0 && "only positive indices are valid"); assert(TTypeEncoding == DW_EH_PE_udata4 && @@ -174,51 +180,54 @@ void readLSDA(ArrayRef LSDAData, BinaryContext &BC) { OS << "0x" << Twine::utohexstr(TypeAddress); } }; - errs() << "Call Site: [0x" << Twine::utohexstr(RangeBase + Start) - << ", 0x" << Twine::utohexstr(RangeBase + Start + Length) - << "); landing pad: 0x" << Twine::utohexstr(LPStart + LandingPad) - << "; action entry: 0x" << Twine::utohexstr(ActionEntry) << "\n"; - if (ActionEntry != 0) { + if (opts::PrintExceptions) errs() << " actions: "; - const uint8_t *ActionPtr = ActionTableStart + ActionEntry - 1; - long long ActionType; - long long ActionNext; - auto Sep = ""; - do { - ActionType = readSLEB128(ActionPtr); - auto Self = ActionPtr; - ActionNext = readSLEB128(ActionPtr); + const uint8_t *ActionPtr = ActionTableStart + ActionEntry - 1; + long long ActionType; + long long ActionNext; + auto Sep = ""; + do { + ActionType = readSLEB128(ActionPtr); + auto Self = ActionPtr; + ActionNext = readSLEB128(ActionPtr); + if (opts::PrintExceptions) errs() << Sep << "(" << ActionType << ", " << ActionNext 
<< ") "; - if (ActionType == 0) { + if (ActionType == 0) { + if (opts::PrintExceptions) errs() << "cleanup"; - } else if (ActionType > 0) { - // It's an index into a type table. + } else if (ActionType > 0) { + // It's an index into a type table. + if (opts::PrintExceptions) { errs() << "catch type "; printType(ActionType, errs()); - } else { // ActionType < 0 + } + } else { // ActionType < 0 + if (opts::PrintExceptions) errs() << "filter exception types "; - auto TSep = ""; - // ActionType is a negative byte offset into uleb128-encoded table - // of indices with base 1. - // E.g. -1 means offset 0, -2 is offset 1, etc. The indices are - // encoded using uleb128 so we cannot directly dereference them. - auto TypeIndexTablePtr = TypeIndexTableStart - ActionType - 1; - while (auto Index = readULEB128(TypeIndexTablePtr)) { + auto TSep = ""; + // ActionType is a negative byte offset into uleb128-encoded table + // of indices with base 1. + // E.g. -1 means offset 0, -2 is offset 1, etc. The indices are + // encoded using uleb128 so we cannot directly dereference them. 
+ auto TypeIndexTablePtr = TypeIndexTableStart - ActionType - 1; + while (auto Index = readULEB128(TypeIndexTablePtr)) { + if (opts::PrintExceptions) { errs() << TSep; printType(Index, errs()); TSep = ", "; } - MaxTypeIndexTableOffset = - std::max(MaxTypeIndexTableOffset, - TypeIndexTablePtr - TypeIndexTableStart); } + MaxTypeIndexTableOffset = + std::max(MaxTypeIndexTableOffset, + TypeIndexTablePtr - TypeIndexTableStart); + } - Sep = "; "; + Sep = "; "; - ActionPtr = Self + ActionNext; - } while (ActionNext); + ActionPtr = Self + ActionNext; + } while (ActionNext); + if (opts::PrintExceptions) errs() << '\n'; - } } if (LandingPad != 0 || ActionEntry != 0) From ebe144e7d4af910507491eff3b9da48f12faf2cc Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 4 Nov 2015 16:48:47 -0800 Subject: [PATCH 035/904] Annotate BinaryFunctions with MCCFIInstructions encoding CFI Summary: In order to represent CFI information in our BinaryFunction class, this patch adds a map of Offsets to CFI instructions. In this way, we make it easy to check exactly where DWARF CFI information is annotated in the disassembled function. 
(cherry picked from commit c4aab1766d9680f287b0efa173c1ff1219147274) --- bolt/BinaryFunction.cpp | 30 ++++++++- bolt/BinaryFunction.h | 9 +++ bolt/Exceptions.cpp | 131 ++++++++++++++++++++++++++++++++++++++++ bolt/Exceptions.h | 28 +++++++++ bolt/llvm-flo.cpp | 13 +++- 5 files changed, 208 insertions(+), 3 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index aba808688d0b..df870257384b 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -76,7 +76,8 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, << "\n Section : " << SectionName << "\n Orc Section : " << getCodeSectionName() << "\n IsSimple : " << IsSimple - << "\n BB Count : " << BasicBlocksLayout.size(); + << "\n BB Count : " << BasicBlocksLayout.size() + << "\n CFI Instrs : " << FrameInstructions.size(); if (BasicBlocksLayout.size()) { OS << "\n BB Layout : "; auto Sep = ""; @@ -177,6 +178,33 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << '\n'; } + if (FrameInstructions.empty()) { + OS << "End of Function \"" << getName() << "\"\n"; + return; + } + + OS << "DWARF CFI Instructions:\n"; + for (auto &CFIInstr : FrameInstructions) { + OS << format(" %08x: ", CFIInstr.first); + switch(CFIInstr.second.getOperation()) { + case MCCFIInstruction::OpSameValue: OS << "OpSameValue"; break; + case MCCFIInstruction::OpRememberState: OS << "OpRememberState"; break; + case MCCFIInstruction::OpRestoreState: OS << "OpRestoreState"; break; + case MCCFIInstruction::OpOffset: OS << "OpOffset"; break; + case MCCFIInstruction::OpDefCfaRegister: OS << "OpDefCfaRegister"; break; + case MCCFIInstruction::OpDefCfaOffset: OS << "OpDefCfaOffset"; break; + case MCCFIInstruction::OpDefCfa: OS << "OpDefCfa"; break; + case MCCFIInstruction::OpRelOffset: OS << "OpRelOffset"; break; + case MCCFIInstruction::OpAdjustCfaOffset: OS << "OfAdjustCfaOffset"; break; + case MCCFIInstruction::OpEscape: OS << "OpEscape"; break; + case 
MCCFIInstruction::OpRestore: OS << "OpRestore"; break; + case MCCFIInstruction::OpUndefined: OS << "OpUndefined"; break; + case MCCFIInstruction::OpRegister: OS << "OpRegister"; break; + case MCCFIInstruction::OpWindowSave: OS << "OpWindowSave"; break; + } + OS << "\n"; + } + OS << "End of Function \"" << getName() << "\"\n"; } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 5cbc1d953422..8e5a9cda4fa3 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -24,6 +24,7 @@ #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -167,6 +168,10 @@ class BinaryFunction { using InstrMapType = std::map; InstrMapType Instructions; + /// List of DWARF CFI instructions + using CFIInstrMapType = std::multimap; + CFIInstrMapType FrameInstructions; + // Blocks are kept sorted in the layout order. If we need to change the // layout (if BasicBlocksLayout stores a different order than BasicBlocks), // the terminating instructions need to be modified. 
@@ -342,6 +347,10 @@ class BinaryFunction { Instructions.emplace(Offset, std::forward(Instruction)); } + void addCFIInstruction(uint64_t Offset, MCCFIInstruction &&Inst) { + FrameInstructions.emplace(Offset, std::forward(Inst)); + } + BinaryFunction &setFileOffset(uint64_t Offset) { FileOffset = Offset; return *this; diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 21c3bc8dc318..36371e7f11cc 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "Exceptions.h" +#include "BinaryFunction.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" @@ -251,5 +252,135 @@ void readLSDA(ArrayRef LSDAData, BinaryContext &BC) { } } +const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0; +const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f; + +void CFIReader::fillCFIInfoFor(BinaryFunction &Function) const { + uint64_t Address = Function.getAddress(); + auto I = FDEs.find(Address); + if (I == FDEs.end()) + return; + + const FDE &CurFDE = *I->second; + if (Function.getSize() != CurFDE.getAddressRange()) { + errs() << "FLO-WARNING: CFI information size mismatch for function \"" + << Function.getName() << "\"" + << format(": Function size is %dB, CFI covers " + "%dB\n", + Function.getSize(), CurFDE.getAddressRange()); + } + + uint64_t Offset = 0; + uint64_t CodeAlignment = CurFDE.getLinkedCIE()->getCodeAlignmentFactor(); + uint64_t DataAlignment = CurFDE.getLinkedCIE()->getDataAlignmentFactor(); + for (const FrameEntry::Instruction &Instr : CurFDE) { + uint8_t Opcode = Instr.Opcode; + if (Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK) + Opcode &= DWARF_CFI_PRIMARY_OPCODE_MASK; + switch (Instr.Opcode) { + case DW_CFA_nop: + break; + case DW_CFA_advance_loc4: + case DW_CFA_advance_loc2: + case DW_CFA_advance_loc1: + case DW_CFA_advance_loc: + // Advance our current address + Offset += CodeAlignment * int64_t(Instr.Ops[0]); + break; 
+ case DW_CFA_offset_extended_sf: + Function.addCFIInstruction( + Offset, + MCCFIInstruction::createOffset( + nullptr, Instr.Ops[0], DataAlignment * int64_t(Instr.Ops[1]))); + break; + case DW_CFA_offset_extended: + case DW_CFA_offset: + Function.addCFIInstruction( + Offset, MCCFIInstruction::createOffset(nullptr, Instr.Ops[0], + DataAlignment * Instr.Ops[1])); + break; + case DW_CFA_restore_extended: + case DW_CFA_restore: + Function.addCFIInstruction( + Offset, MCCFIInstruction::createRestore(nullptr, Instr.Ops[0])); + break; + case DW_CFA_set_loc: + assert(Instr.Ops[0] < Address && "set_loc out of function bounds"); + assert(Instr.Ops[0] > Address + Function.getSize() && + "set_loc out of function bounds"); + Offset = Instr.Ops[0] - Address; + break; + + case DW_CFA_undefined: + Function.addCFIInstruction( + Offset, MCCFIInstruction::createUndefined(nullptr, Instr.Ops[0])); + break; + case DW_CFA_same_value: + Function.addCFIInstruction( + Offset, MCCFIInstruction::createSameValue(nullptr, Instr.Ops[0])); + break; + case DW_CFA_register: + Function.addCFIInstruction( + Offset, MCCFIInstruction::createRegister(nullptr, Instr.Ops[0], + Instr.Ops[1])); + break; + case DW_CFA_remember_state: + Function.addCFIInstruction( + Offset, MCCFIInstruction::createRememberState(nullptr)); + break; + case DW_CFA_restore_state: + Function.addCFIInstruction(Offset, + MCCFIInstruction::createRestoreState(nullptr)); + break; + case DW_CFA_def_cfa: + Function.addCFIInstruction( + Offset, + MCCFIInstruction::createDefCfa(nullptr, Instr.Ops[0], Instr.Ops[1])); + break; + case DW_CFA_def_cfa_sf: + Function.addCFIInstruction( + Offset, + MCCFIInstruction::createDefCfa( + nullptr, Instr.Ops[0], DataAlignment * int64_t(Instr.Ops[1]))); + break; + case DW_CFA_def_cfa_register: + Function.addCFIInstruction(Offset, MCCFIInstruction::createDefCfaRegister( + nullptr, Instr.Ops[0])); + break; + case DW_CFA_def_cfa_offset: + Function.addCFIInstruction( + Offset, 
MCCFIInstruction::createDefCfaOffset(nullptr, Instr.Ops[0])); + break; + case DW_CFA_def_cfa_offset_sf: + Function.addCFIInstruction( + Offset, MCCFIInstruction::createDefCfaOffset( + nullptr, DataAlignment * int64_t(Instr.Ops[0]))); + break; + case DW_CFA_val_offset_sf: + case DW_CFA_val_offset: + llvm_unreachable("DWARF val_offset() unimplemented"); + break; + case DW_CFA_expression: + case DW_CFA_def_cfa_expression: + case DW_CFA_val_expression: + llvm_unreachable("DWARF CFA expressions unimplemented"); + break; + dbgs() << "DW_CFA_val_expression"; + break; + case DW_CFA_MIPS_advance_loc8: + llvm_unreachable("DW_CFA_MIPS_advance_loc unimplemented"); + break; + case DW_CFA_GNU_args_size: + case DW_CFA_GNU_window_save: + case DW_CFA_lo_user: + case DW_CFA_hi_user: + llvm_unreachable("DW_CFA_GNU_* and DW_CFA_*_use unimplemented"); + break; + default: + llvm_unreachable("Unrecognized CFI instruction"); + } + } +} + } // namespace flo } // namespace llvm diff --git a/bolt/Exceptions.h b/bolt/Exceptions.h index 0aca298cb466..191743e77757 100644 --- a/bolt/Exceptions.h +++ b/bolt/Exceptions.h @@ -14,12 +14,40 @@ #include "BinaryContext.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/DebugInfo/DWARF/DWARFFrame.h" +#include "llvm/Support/Casting.h" +#include namespace llvm { namespace flo { +class BinaryFunction; + void readLSDA(ArrayRef LSDAData, BinaryContext &BC); +/// \brief Wraps up information to read all CFI instructions and feed them to a +/// BinaryFunction. 
+class CFIReader { +public: + explicit CFIReader(const DWARFFrame &EHFrame) : EHFrame(EHFrame) { + // Prepare FDEs for fast lookup + for (const auto &Entry : EHFrame.Entries) { + const dwarf::FrameEntry *FE = Entry.get(); + if (const auto *CurFDE = dyn_cast(FE)) { + FDEs[CurFDE->getInitialLocation()] = CurFDE; + } + } + } + + using FDEsMap = std::map; + + void fillCFIInfoFor(BinaryFunction &Function) const; + +private: + const DWARFFrame &EHFrame; + FDEsMap FDEs; +}; + } // namespace flo } // namespace llvm diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 02ba000f4943..910af6b24776 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -471,9 +471,14 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { // Process debug sections. std::unique_ptr DwCtx(new DWARFContextInMemory(*File)); + const DWARFFrame &EHFrame = *DwCtx->getEHFrame(); if (opts::DumpEHFrame) { - const auto *Frames = DwCtx->getEHFrame(); - Frames->dump(outs()); + EHFrame.dump(outs()); + } + CFIReader DwCFIReader(EHFrame); + if (!EHFrame.ParseError.empty()) { + errs() << "FLO-WARNING: EHFrame reader failed with message \"" + << EHFrame.ParseError << "\"\n"; } // Disassemble every function and build it's control flow graph. 
@@ -533,6 +538,10 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { if (!Function.disassemble(FunctionData)) continue; + // Fill in CFI information for this function + if (EHFrame.ParseError.empty()) + DwCFIReader.fillCFIInfoFor(Function); + if (opts::PrintAll || opts::PrintDisasm) Function.print(errs(), "after disassembly"); From 54a9afbc9c6d2d2d5cc4faa91ec36413c89a0540 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 11 Nov 2015 23:56:24 -0800 Subject: [PATCH 036/904] Fix bug in local symbol name disambiguation algorithm Summary: This bug would cause llvm-flo to fail to disambiguate two local symbols with the same file name, causing two different addresses to compete in the symbol table for the resolution of a given name, causing unpredicted behavior in the linker. (cherry picked from commit 7a5c9767a824d116170c03a058e686b23c956bb6) --- bolt/llvm-flo.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 910af6b24776..641463bacec5 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -400,9 +400,20 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { assert(BC->GlobalSymbols.find(*Name) == BC->GlobalSymbols.end() && "global name not unique"); UniqueName = *Name; + /// It's possible we are seeing a globalized local. LLVM might treat it as + /// local if it has a "private global" prefix, e.g. ".L". Thus we have to + /// change the prefix to enforce global scope of the symbol. + if (StringRef(UniqueName) + .startswith(BC->AsmInfo->getPrivateGlobalPrefix())) + UniqueName = "PG." + UniqueName; } else { unsigned LocalCount = 1; std::string LocalName = (*Name).str() + "/" + FileSymbolName + "/"; + + if ((*Name).startswith(BC->AsmInfo->getPrivateGlobalPrefix())) { + LocalName = "PG." 
+ LocalName; + } + while (BC->GlobalSymbols.find(LocalName + std::to_string(LocalCount)) != BC->GlobalSymbols.end()) { ++LocalCount; @@ -410,13 +421,6 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { UniqueName = LocalName + std::to_string(LocalCount); } - /// It's possible we are seeing a globalized local. Even though - /// we've made the name unique, LLVM might still treat it as local - /// if it has a "private global" prefix, e.g. ".L". Thus we have to - /// change the prefix to enforce global scope of the symbol. - if (StringRef(UniqueName).startswith(BC->AsmInfo->getPrivateGlobalPrefix())) - UniqueName = "PG." + UniqueName; - // Add the name to global symbols map. BC->GlobalSymbols[UniqueName] = Address; From 685f2fbe77fb47f740013087274616dcc5ac1a97 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Thu, 12 Nov 2015 10:02:12 -0800 Subject: [PATCH 037/904] Remove add PG prefix from symbols that are already local Summary: After discussion with Maksim, we decided to drop the lines that add the PG prefix if the symbol is already local, since they wouldn't be impacted by the way LLVM handles these symbols. (cherry picked from commit 82f7b9d900b962eb33ad113ed23e12e157d62cb4) --- bolt/llvm-flo.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 641463bacec5..85ac6325ac53 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -410,10 +410,6 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { unsigned LocalCount = 1; std::string LocalName = (*Name).str() + "/" + FileSymbolName + "/"; - if ((*Name).startswith(BC->AsmInfo->getPrivateGlobalPrefix())) { - LocalName = "PG." 
+ LocalName; - } - while (BC->GlobalSymbols.find(LocalName + std::to_string(LocalCount)) != BC->GlobalSymbols.end()) { ++LocalCount; From 0384a34aeb3ecebd178fbccff2740d9f569ee1f9 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Thu, 12 Nov 2015 10:41:46 -0800 Subject: [PATCH 038/904] Revert 45fc13b as it breaks HHVM rewriting Summary: Reverting this commit until we better investigate why it is necessary to change local symbol names with a prefix. (cherry picked from commit 83b8e059d7cad6cdc02387e13c5a6959fae4b825) --- bolt/llvm-flo.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 85ac6325ac53..641463bacec5 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -410,6 +410,10 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { unsigned LocalCount = 1; std::string LocalName = (*Name).str() + "/" + FileSymbolName + "/"; + if ((*Name).startswith(BC->AsmInfo->getPrivateGlobalPrefix())) { + LocalName = "PG." + LocalName; + } + while (BC->GlobalSymbols.find(LocalName + std::to_string(LocalCount)) != BC->GlobalSymbols.end()) { ++LocalCount; From 58b15c5229bc6ca745c5cebe45e869496219d64c Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 12 Nov 2015 18:56:58 -0800 Subject: [PATCH 039/904] Add exception handling information to CFG. Summary: Read .gcc_except_table and add information to CFG. Calls have extra operands indicating there's a possible handler for exceptions and an action. Landing pad information is recorded in BinaryFunction. Also convert JMP instructions that are calls into tail calls pseudo instructions so that they don't miss call instruction analysis. 
(cherry picked from commit 3e8dcdecbf434d54e2e503f8523f7c41dddeb591) --- bolt/BinaryFunction.cpp | 42 +++++++--- bolt/BinaryFunction.h | 24 ++++++ bolt/Exceptions.cpp | 179 ++++++++++++++++++++++++++++++++++++++++ bolt/llvm-flo.cpp | 10 +++ 4 files changed, 243 insertions(+), 12 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index df870257384b..0cd4816910a0 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -99,6 +99,22 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, // Offset of the instruction in function. uint64_t Offset{0}; + auto printInstruction = [&](const MCInst &Instruction) { + OS << format(" %08" PRIx64 ": ", Offset); + BC.InstPrinter->printInst(&Instruction, OS, "", *BC.STI); + if (BC.MIA->isCall(Instruction)) { + if (BC.MIA->isTailCall(Instruction)) + OS << " # TAILCALL "; + if (Instruction.getNumOperands() > 1) { + OS << " # handler: " << Instruction.getOperand(1); + OS << "; action: " << Instruction.getOperand(2); + } + } + OS << "\n"; + // In case we need MCInst printer: + // Instr.dump_pretty(OS, InstructionPrinter.get()); + }; + if (BasicBlocks.empty() && !Instructions.empty()) { // Print before CFG was built. 
for (const auto &II : Instructions) { @@ -109,10 +125,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, if (LI != Labels.end()) OS << LI->second->getName() << ":\n"; - auto &Instruction = II.second; - OS << format(" %08" PRIx64 ": ", Offset); - BC.InstPrinter->printInst(&Instruction, OS, "", *BC.STI); - OS << "\n"; + printInstruction(II.second); } } @@ -121,6 +134,10 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, << BB->Instructions.size() << " instructions, align : " << BB->getAlignment() << ")\n"; + if (LandingPads.find(BB->getLabel()) != LandingPads.end()) { + OS << " Landing Pad\n"; + } + uint64_t BBExecCount = BB->getExecutionCount(); if (BBExecCount != BinaryBasicBlock::COUNT_NO_PROFILE) { OS << " Exec Count : " << BBExecCount << "\n"; @@ -138,12 +155,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, Offset = RoundUpToAlignment(Offset, BB->getAlignment()); for (auto &Instr : *BB) { - OS << format(" %08" PRIx64 ": ", Offset); - BC.InstPrinter->printInst(&Instr, OS, "", *BC.STI); - OS << "\n"; - - // In case we need MCInst printer: - // Instr.dump_pretty(OS, InstructionPrinter.get()); + printInstruction(Instr); // Calculate the size of the instruction. // Note: this is imprecise since happening prior to relaxation. @@ -298,8 +310,14 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { << ". Code size will be increased.\n"; } - // This is a call regardless of the opcode (e.g. tail call). - IsCall = true; + // This is a call regardless of the opcode. + // Assign proper opcode for tail calls, so that they could be + // treated as calls. 
+ if (!IsCall) { + MIA->convertJmpToTailCall(Instruction); + IsCall = true; + } + TargetSymbol = BC.getOrCreateGlobalSymbol(InstructionTarget, "FUNCat"); } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 8e5a9cda4fa3..ebe266d53b18 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -119,6 +119,16 @@ class BinaryFunction { /// The profile data for the number of times the function was executed. uint64_t ExecutionCount{COUNT_NO_PROFILE}; + /// Binary blob reprsenting action, type, and type index tables for this + /// function' LSDA (exception handling). + ArrayRef LSDATables; + + /// Original LSDA address for the function. + uint64_t LSDAAddress{0}; + + /// Landing pads for the function. + std::set LandingPads; + /// Release storage used by instructions. BinaryFunction &clearInstructions() { InstrMapType TempMap; @@ -401,6 +411,12 @@ class BinaryFunction { return *this; } + /// Set LSDA address for the function. + BinaryFunction &setLSDAAddress(uint64_t Address) { + LSDAAddress = Address; + return *this; + } + /// Return the profile information about the number of times /// the function was executed. /// @@ -409,6 +425,11 @@ class BinaryFunction { return ExecutionCount; } + /// Return original LSDA address for the function or NULL. + uint64_t getLSDAAddress() const { + return LSDAAddress; + } + /// Disassemble function from raw data \p FunctionData. /// If successful, this function will populate the list of instructions /// for this function together with offsets from the function start @@ -445,6 +466,9 @@ class BinaryFunction { /// adding jumps based on a new layout order. void fixBranches(); + /// Process LSDA information for the function. 
+ void parseLSDA(ArrayRef LSDAData, uint64_t LSDAAddress); + virtual ~BinaryFunction() {} }; diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 36371e7f11cc..75ff8ffa06c6 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -252,6 +252,183 @@ void readLSDA(ArrayRef LSDAData, BinaryContext &BC) { } } +void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, + uint64_t LSDASectionAddress) { + assert(CurrentState == State::Disassembled && "unexpecrted function state"); + + if (!getLSDAAddress()) + return; + + assert(getLSDAAddress() < LSDASectionAddress + LSDASectionData.size() && + "wrong LSDA address"); + + const uint8_t *Ptr = + LSDASectionData.data() + getLSDAAddress() - LSDASectionAddress; + + uint8_t LPStartEncoding = *Ptr++; + uintptr_t LPStart = 0; + if (LPStartEncoding != DW_EH_PE_omit) { + LPStart = readEncodedPointer(Ptr, LPStartEncoding); + } + + assert(LPStart == 0 && "support for split functions not implemented"); + + uint8_t TTypeEncoding = *Ptr++; + uintptr_t TTypeEnd = 0; + if (TTypeEncoding != DW_EH_PE_omit) { + TTypeEnd = readULEB128(Ptr); + } + + if (opts::PrintExceptions) { + errs() << "LPStart Encoding = " << (unsigned)LPStartEncoding << '\n'; + errs() << "LPStart = 0x" << Twine::utohexstr(LPStart) << '\n'; + errs() << "TType Encoding = " << (unsigned)TTypeEncoding << '\n'; + errs() << "TType End = " << TTypeEnd << '\n'; + } + + // Table to store list of indices in type table. Entries are uleb128s values. + auto TypeIndexTableStart = Ptr + TTypeEnd; + + // Offset past the last decoded index. + intptr_t MaxTypeIndexTableOffset = 0; + + // The actual type info table starts at the same location, but grows in + // different direction. Encoding is different too (TTypeEncoding). 
+ auto TypeTableStart = reinterpret_cast(Ptr + TTypeEnd); + + uint8_t CallSiteEncoding = *Ptr++; + uint32_t CallSiteTableLength = readULEB128(Ptr); + const uint8_t *CallSiteTableStart = Ptr; + const uint8_t *CallSiteTableEnd = CallSiteTableStart + CallSiteTableLength; + const uint8_t *CallSitePtr = CallSiteTableStart; + const uint8_t *ActionTableStart = CallSiteTableEnd; + + if (opts::PrintExceptions) { + errs() << "CallSite Encoding = " << (unsigned)CallSiteEncoding << '\n'; + errs() << "CallSite table length = " << CallSiteTableLength << '\n'; + errs() << '\n'; + } + + unsigned NumCallSites = 0; + while (CallSitePtr < CallSiteTableEnd) { + ++NumCallSites; + uintptr_t Start = readEncodedPointer(CallSitePtr, CallSiteEncoding); + uintptr_t Length = readEncodedPointer(CallSitePtr, CallSiteEncoding); + uintptr_t LandingPad = readEncodedPointer(CallSitePtr, CallSiteEncoding); + + uintptr_t ActionEntry = readULEB128(CallSitePtr); + uint64_t RangeBase = getAddress(); + if (opts::PrintExceptions) { + errs() << "Call Site: [0x" << Twine::utohexstr(RangeBase + Start) + << ", 0x" << Twine::utohexstr(RangeBase + Start + Length) + << "); landing pad: 0x" << Twine::utohexstr(LPStart + LandingPad) + << "; action entry: 0x" << Twine::utohexstr(ActionEntry) << "\n"; + } + + // Create a handler entry if necessary. + MCSymbol *LPSymbol{nullptr}; + if (LandingPad) { + auto Label = Labels.find(LandingPad); + if (Label != Labels.end()) { + LPSymbol = Label->second; + } else { + LPSymbol = BC.Ctx->createTempSymbol("LP"); + Labels[LandingPad] = LPSymbol; + } + LandingPads.insert(LPSymbol); + } + + // Mark all call instructions in the range. 
+ auto II = Instructions.find(Start); + assert(II != Instructions.end() && + "exception range not pointing to instruction"); + do { + auto &Instruction = II->second; + if (BC.MIA->isCall(Instruction)) { + if (LPSymbol) { + Instruction.addOperand(MCOperand::createExpr( + MCSymbolRefExpr::create(LPSymbol, + MCSymbolRefExpr::VK_None, + *BC.Ctx))); + } else { + Instruction.addOperand(MCOperand::createImm(0)); + } + Instruction.addOperand(MCOperand::createImm(ActionEntry)); + } + ++II; + } while (II->first < Start + Length); + + if (ActionEntry != 0) { + auto printType = [&] (int Index, raw_ostream &OS) { + assert(Index > 0 && "only positive indices are valid"); + assert(TTypeEncoding == DW_EH_PE_udata4 && + "only udata4 supported for TTypeEncoding"); + auto TypeAddress = *(TypeTableStart - Index); + if (TypeAddress == 0) { + OS << ""; + return; + } + auto NI = BC.GlobalAddresses.find(TypeAddress); + if (NI != BC.GlobalAddresses.end()) { + OS << NI->second; + } else { + OS << "0x" << Twine::utohexstr(TypeAddress); + } + }; + if (opts::PrintExceptions) + errs() << " actions: "; + const uint8_t *ActionPtr = ActionTableStart + ActionEntry - 1; + long long ActionType; + long long ActionNext; + auto Sep = ""; + do { + ActionType = readSLEB128(ActionPtr); + auto Self = ActionPtr; + ActionNext = readSLEB128(ActionPtr); + if (opts::PrintExceptions) + errs() << Sep << "(" << ActionType << ", " << ActionNext << ") "; + if (ActionType == 0) { + if (opts::PrintExceptions) + errs() << "cleanup"; + } else if (ActionType > 0) { + // It's an index into a type table. + if (opts::PrintExceptions) { + errs() << "catch type "; + printType(ActionType, errs()); + } + } else { // ActionType < 0 + if (opts::PrintExceptions) + errs() << "filter exception types "; + auto TSep = ""; + // ActionType is a negative byte offset into uleb128-encoded table + // of indices with base 1. + // E.g. -1 means offset 0, -2 is offset 1, etc. 
The indices are + // encoded using uleb128 so we cannot directly dereference them. + auto TypeIndexTablePtr = TypeIndexTableStart - ActionType - 1; + while (auto Index = readULEB128(TypeIndexTablePtr)) { + if (opts::PrintExceptions) { + errs() << TSep; + printType(Index, errs()); + TSep = ", "; + } + } + MaxTypeIndexTableOffset = + std::max(MaxTypeIndexTableOffset, + TypeIndexTablePtr - TypeIndexTableStart); + } + + Sep = "; "; + + ActionPtr = Self + ActionNext; + } while (ActionNext); + if (opts::PrintExceptions) + errs() << '\n'; + } + } + if (opts::PrintExceptions) + errs() << '\n'; +} + const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0; const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f; @@ -270,6 +447,8 @@ void CFIReader::fillCFIInfoFor(BinaryFunction &Function) const { Function.getSize(), CurFDE.getAddressRange()); } + Function.setLSDAAddress(CurFDE.getLSDAAddress()); + uint64_t Offset = 0; uint64_t CodeAlignment = CurFDE.getLinkedCIE()->getCodeAlignmentFactor(); uint64_t DataAlignment = CurFDE.getLinkedCIE()->getDataAlignmentFactor(); diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 641463bacec5..288b74ce3162 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -457,6 +457,9 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { ); } + ArrayRef LSDAData; + uint64_t LSDAAddress{0}; + // Process special sections. for (const auto &Section : File->sections()) { StringRef SectionName; @@ -470,6 +473,8 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { if (SectionName == ".gcc_except_table") { readLSDA(SectionData, *BC); + LSDAData = SectionData; + LSDAAddress = Section.getAddress(); } } @@ -546,6 +551,11 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { if (EHFrame.ParseError.empty()) DwCFIReader.fillCFIInfoFor(Function); + // Parse LSDA. 
+ if (Function.getLSDAAddress() != 0) { + Function.parseLSDA(LSDAData, LSDAAddress); + } + if (opts::PrintAll || opts::PrintDisasm) Function.print(errs(), "after disassembly"); From 8c4b93c503775b37da5de66fef6b0783a2f6b86b Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 17 Nov 2015 11:02:04 -0800 Subject: [PATCH 040/904] Fix LSDA reading issues. Summary: There were two issues: we were trying to process non-simple functions, i.e. function that we don't fully understand, and then we failed to stop iterating if EH closing label was after the last instruction in a function. (cherry picked from commit 2d3cf8ae8ffe795ffff1068196b69f1b6f051650) --- bolt/Exceptions.cpp | 6 +++--- bolt/llvm-flo.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 75ff8ffa06c6..1128af962d02 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -340,8 +340,8 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, // Mark all call instructions in the range. 
auto II = Instructions.find(Start); - assert(II != Instructions.end() && - "exception range not pointing to instruction"); + auto IE = Instructions.end(); + assert(II != IE && "exception range not pointing to an instruction"); do { auto &Instruction = II->second; if (BC.MIA->isCall(Instruction)) { @@ -356,7 +356,7 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, Instruction.addOperand(MCOperand::createImm(ActionEntry)); } ++II; - } while (II->first < Start + Length); + } while (II != IE && II->first < Start + Length); if (ActionEntry != 0) { auto printType = [&] (int Index, raw_ostream &OS) { diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 288b74ce3162..8f42059d6d85 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -544,7 +544,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { (SectionContents.data()) + FunctionOffset, Function.getSize()); - if (!Function.disassemble(FunctionData)) + if (!Function.disassemble(FunctionData) || !Function.isSimple()) continue; // Fill in CFI information for this function From 6fb15902f72d7f8bc184f14e554892d779c4ec9b Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 13 Nov 2015 14:18:45 -0800 Subject: [PATCH 041/904] Regenerate exception handling information after optimizations. Summary: Regenerate exception handling information after optimizations. Use '-print-eh-ranges' to see CFG with updated ranges. 
(cherry picked from commit d0a6e0b12782e1e7f79497e3f8c794e02d371cda) --- bolt/BinaryFunction.cpp | 74 ++++++++++++++++++---------- bolt/BinaryFunction.h | 12 +++++ bolt/Exceptions.cpp | 104 +++++++++++++++++++++++++++++++++++++++- bolt/llvm-flo.cpp | 37 +++++++++++--- 4 files changed, 195 insertions(+), 32 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 0cd4816910a0..3c53d56f2d65 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -100,14 +100,26 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, uint64_t Offset{0}; auto printInstruction = [&](const MCInst &Instruction) { + if (BC.MIA->isEHLabel(Instruction)) { + OS << " EH_LABEL: " + << cast(Instruction.getOperand(0).getExpr())-> + getSymbol() + << '\n'; + return; + } OS << format(" %08" PRIx64 ": ", Offset); BC.InstPrinter->printInst(&Instruction, OS, "", *BC.STI); if (BC.MIA->isCall(Instruction)) { if (BC.MIA->isTailCall(Instruction)) OS << " # TAILCALL "; if (Instruction.getNumOperands() > 1) { - OS << " # handler: " << Instruction.getOperand(1); - OS << "; action: " << Instruction.getOperand(2); + OS << " # handler: "; + if (Instruction.getOperand(1).isExpr()) + OS << cast(Instruction.getOperand(1).getExpr())-> + getSymbol(); + else + OS << '0'; + OS << "; action: " << Instruction.getOperand(2).getImm(); } } OS << "\n"; @@ -190,34 +202,46 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << '\n'; } - if (FrameInstructions.empty()) { - OS << "End of Function \"" << getName() << "\"\n"; - return; + // Dump new exception ranges for the function. 
+ if (!CallSites.empty()) { + OS << "EH table:\n"; + for (auto &CSI : CallSites) { + OS << " [" << *CSI.Start << ", " << *CSI.End << ") landing pad : "; + if (CSI.LP) + OS << *CSI.LP; + else + OS << "0"; + OS << ", action : " << CSI.Action << '\n'; + } + OS << '\n'; } - OS << "DWARF CFI Instructions:\n"; - for (auto &CFIInstr : FrameInstructions) { - OS << format(" %08x: ", CFIInstr.first); - switch(CFIInstr.second.getOperation()) { - case MCCFIInstruction::OpSameValue: OS << "OpSameValue"; break; - case MCCFIInstruction::OpRememberState: OS << "OpRememberState"; break; - case MCCFIInstruction::OpRestoreState: OS << "OpRestoreState"; break; - case MCCFIInstruction::OpOffset: OS << "OpOffset"; break; - case MCCFIInstruction::OpDefCfaRegister: OS << "OpDefCfaRegister"; break; - case MCCFIInstruction::OpDefCfaOffset: OS << "OpDefCfaOffset"; break; - case MCCFIInstruction::OpDefCfa: OS << "OpDefCfa"; break; - case MCCFIInstruction::OpRelOffset: OS << "OpRelOffset"; break; - case MCCFIInstruction::OpAdjustCfaOffset: OS << "OfAdjustCfaOffset"; break; - case MCCFIInstruction::OpEscape: OS << "OpEscape"; break; - case MCCFIInstruction::OpRestore: OS << "OpRestore"; break; - case MCCFIInstruction::OpUndefined: OS << "OpUndefined"; break; - case MCCFIInstruction::OpRegister: OS << "OpRegister"; break; - case MCCFIInstruction::OpWindowSave: OS << "OpWindowSave"; break; + if (!FrameInstructions.empty()) { + OS << "DWARF CFI Instructions:\n"; + for (auto &CFIInstr : FrameInstructions) { + OS << format(" %08x: ", CFIInstr.first); + switch(CFIInstr.second.getOperation()) { + case MCCFIInstruction::OpSameValue: OS << "OpSameValue"; break; + case MCCFIInstruction::OpRememberState: OS << "OpRememberState"; break; + case MCCFIInstruction::OpRestoreState: OS << "OpRestoreState"; break; + case MCCFIInstruction::OpOffset: OS << "OpOffset"; break; + case MCCFIInstruction::OpDefCfaRegister: OS << "OpDefCfaRegister"; break; + case MCCFIInstruction::OpDefCfaOffset: OS << "OpDefCfaOffset"; 
break; + case MCCFIInstruction::OpDefCfa: OS << "OpDefCfa"; break; + case MCCFIInstruction::OpRelOffset: OS << "OpRelOffset"; break; + case MCCFIInstruction::OpAdjustCfaOffset:OS << "OfAdjustCfaOffset"; break; + case MCCFIInstruction::OpEscape: OS << "OpEscape"; break; + case MCCFIInstruction::OpRestore: OS << "OpRestore"; break; + case MCCFIInstruction::OpUndefined: OS << "OpUndefined"; break; + case MCCFIInstruction::OpRegister: OS << "OpRegister"; break; + case MCCFIInstruction::OpWindowSave: OS << "OpWindowSave"; break; + } + OS << '\n'; } - OS << "\n"; + OS << '\n'; } - OS << "End of Function \"" << getName() << "\"\n"; + OS << "End of Function \"" << getName() << "\"\n\n"; } bool BinaryFunction::disassemble(ArrayRef FunctionData) { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index ebe266d53b18..82fb078f34c1 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -182,6 +182,15 @@ class BinaryFunction { using CFIInstrMapType = std::multimap; CFIInstrMapType FrameInstructions; + /// Exception handling ranges. + struct CallSite { + const MCSymbol *Start; + const MCSymbol *End; + const MCSymbol *LP; + uint64_t Action; + }; + std::vector CallSites; + // Blocks are kept sorted in the layout order. If we need to change the // layout (if BasicBlocksLayout stores a different order than BasicBlocks), // the terminating instructions need to be modified. @@ -469,6 +478,9 @@ class BinaryFunction { /// Process LSDA information for the function. void parseLSDA(ArrayRef LSDAData, uint64_t LSDAAddress); + /// Update exception handling ranges for the function. 
+ void updateEHRanges(); + virtual ~BinaryFunction() {} }; diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 1128af962d02..1800e3fe8d09 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -332,7 +332,7 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, if (Label != Labels.end()) { LPSymbol = Label->second; } else { - LPSymbol = BC.Ctx->createTempSymbol("LP"); + LPSymbol = BC.Ctx->createTempSymbol("LP", true); Labels[LandingPad] = LPSymbol; } LandingPads.insert(LPSymbol); @@ -429,6 +429,108 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, errs() << '\n'; } +void BinaryFunction::updateEHRanges() { + assert(CurrentState == State::CFG && "unexpected state"); + + // Build call sites table. + struct EHInfo { + const MCSymbol *LP; // landing pad + uint64_t Action; + }; + + // Markers for begining and the end of exceptions range. + const MCSymbol *StartRange{nullptr}; + const MCSymbol *EndRange{nullptr}; + + // If previous call can throw, this is its exception handler. + EHInfo PreviousEH = {nullptr, 0}; + + for(auto &BB : BasicBlocksLayout) { + for (auto II = BB->begin(); II != BB->end(); ++II) { + auto Instr = *II; + + if (!BC.MIA->isCall(Instr)) + continue; + + // Instruction can throw an exception that should be handled. + bool Throws = Instr.getNumOperands() > 1; + + // Ignore the call if it's a continuation of a no-throw gap. + if (!Throws && !StartRange) + continue; + + // Extract exception handling information from the instruction. + const MCSymbol *LP = + Throws ? (Instr.getOperand(1).isExpr() + ? &(cast( + Instr.getOperand(1).getExpr())->getSymbol()) + : nullptr) + : nullptr; + uint64_t Action = Throws ? Instr.getOperand(2).getImm() : 0; + + // No action if the exception handler has not changed. + if (Throws && + StartRange && + PreviousEH.LP == LP && + PreviousEH.Action == Action) + continue; + + // Same symbol is used for the beginning and the end of the range. 
+ const MCSymbol *EHSymbol = BC.Ctx->createTempSymbol("EH", true); + MCInst EHLabel; + BC.MIA->createEHLabel(EHLabel, EHSymbol, BC.Ctx.get()); + II = BB->Instructions.insert(II, EHLabel); + ++II; + + // At this point we could be in the one of the following states: + // + // I. Exception handler has changed and we need to close the prev range + // and start the new one. + // + // II. Start the new exception range after the gap. + // + // III. Close exception range and start the new gap. + + if (StartRange) { + // I, III: + EndRange = EHSymbol; + } else { + // II: + StartRange = EHSymbol; + EndRange = nullptr; + } + + // Close the previous range. + if (EndRange) { + assert(StartRange && "beginning of the range expected"); + CallSites.emplace_back(CallSite{StartRange, EndRange, + PreviousEH.LP, PreviousEH.Action}); + EndRange = nullptr; + } + + if (Throws) { + // I, II: + StartRange = EHSymbol; + PreviousEH = EHInfo{LP, Action}; + } else { + StartRange = nullptr; + } + } + } + + // Check if we need to close the range. 
+ if (StartRange) { + assert(!EndRange && "unexpected end of range"); + EndRange = BC.Ctx->createTempSymbol("EH", true); + MCInst EHLabel; + BC.MIA->createEHLabel(EHLabel, EndRange, BC.Ctx.get()); + BasicBlocksLayout.back()->Instructions.emplace_back(EHLabel); + + CallSites.emplace_back(CallSite{StartRange, EndRange, + PreviousEH.LP, PreviousEH.Action}); + } +} + const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0; const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f; diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 8f42059d6d85..f35d0d4a0535 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -127,6 +127,11 @@ static cl::opt PrintDisasm("print-disasm", cl::desc("print function after disassembly"), cl::Hidden); +static cl::opt +PrintEHRanges("print-eh-ranges", + cl::desc("print function with updated exception ranges"), + cl::Hidden); + static cl::opt PrintReordered("print-reordered", cl::desc("print functions after layout optimization"), @@ -544,7 +549,13 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { (SectionContents.data()) + FunctionOffset, Function.getSize()); - if (!Function.disassemble(FunctionData) || !Function.isSimple()) + if (!Function.disassemble(FunctionData)) + continue; + + if (opts::PrintAll || opts::PrintDisasm) + Function.print(errs(), "after disassembly"); + + if (!Function.isSimple()) continue; // Fill in CFI information for this function @@ -552,12 +563,8 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { DwCFIReader.fillCFIInfoFor(Function); // Parse LSDA. 
- if (Function.getLSDAAddress() != 0) { + if (Function.getLSDAAddress() != 0) Function.parseLSDA(LSDAData, LSDAAddress); - } - - if (opts::PrintAll || opts::PrintDisasm) - Function.print(errs(), "after disassembly"); if (!Function.buildCFG()) continue; @@ -586,6 +593,9 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { if (!opts::shouldProcess(Function.getName())) continue; + if (!Function.isSimple()) + continue; + // Detect and eliminate unreachable basic blocks. We could have those // filled with nops and they are used for alignment. // @@ -639,6 +649,12 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { if (opts::PrintAll || opts::PrintReordered) Function.print(errs(), "after reordering blocks"); } + + // Post-processing passes. + Function.updateEHRanges(); + if (opts::PrintAll || opts::PrintEHRanges) { + Function.print(errs(), "after updating EH ranges"); + } } std::error_code EC; @@ -719,6 +735,15 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { Streamer->EmitCodeAlignment(BB->getAlignment()); Streamer->EmitLabel(BB->getLabel()); for (const auto &Instr : *BB) { + // Handle pseudo instructions. + if (BC->MIA->isEHLabel(Instr)) { + assert(Instr.getNumOperands() == 1 && Instr.getOperand(0).isExpr() && + "bad EH_LABEL instruction"); + auto Label = &(cast( + Instr.getOperand(0).getExpr())->getSymbol()); + Streamer->EmitLabel(const_cast(Label)); + continue; + } Streamer->EmitInstruction(Instr, *BC->STI); } } From c82747318da6b07895ccdec72dd4650f76dbf3df Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Sun, 8 Nov 2015 12:23:54 -0800 Subject: [PATCH 042/904] Attempts to fix CFI state after reordering Summary: This patch introduces logic to check how the CFI instructions define a table to help during stack unwinding at exception run time and attempts to fix any problem in this table that may have been introduced by reordering the basic blocks. 
If it fails to fix this problem, the function is marked as not simple and not eligible for rewriting. (cherry picked from commit d294337217681453fa41ab9da20c44ac4efb5f41) --- bolt/BinaryBasicBlock.h | 11 ++ bolt/BinaryFunction.cpp | 303 ++++++++++++++++++++++++++++++++++------ bolt/BinaryFunction.h | 81 ++++++++++- bolt/Exceptions.cpp | 56 ++++---- bolt/llvm-flo.cpp | 9 +- 5 files changed, 380 insertions(+), 80 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 5ee41720b086..f9c3b9f16bf6 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -45,6 +45,9 @@ class BinaryBasicBlock { /// Alignment requirements for the block. uint64_t Alignment{1}; + /// Number of pseudo instructions in this block. + uint32_t NumPseudos{0}; + /// Number of times this basic block was executed. uint64_t ExecutionCount{COUNT_NO_PROFILE}; @@ -184,6 +187,14 @@ class BinaryBasicBlock { Instructions.emplace_back(Inst); } + /// Add instruction before Pos in this basic block. + const_iterator insertPseudoInstr(const_iterator Pos, MCInst &Instr) { + ++NumPseudos; + return Instructions.emplace(Pos, Instr); + } + + uint32_t getNumPseudos() const { return NumPseudos; } + /// Set minimum alignment for the basic block. 
void setAlignment(uint64_t Align) { Alignment = Align; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 3c53d56f2d65..5b5636633c63 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -76,8 +76,10 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, << "\n Section : " << SectionName << "\n Orc Section : " << getCodeSectionName() << "\n IsSimple : " << IsSimple - << "\n BB Count : " << BasicBlocksLayout.size() - << "\n CFI Instrs : " << FrameInstructions.size(); + << "\n BB Count : " << BasicBlocksLayout.size(); + if (FrameInstructions.size()) { + OS << "\n CFI Instrs : " << FrameInstructions.size(); + } if (BasicBlocksLayout.size()) { OS << "\n BB Layout : "; auto Sep = ""; @@ -99,6 +101,25 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, // Offset of the instruction in function. uint64_t Offset{0}; + auto printCFI = [&OS] (uint32_t Operation) { + switch(Operation) { + case MCCFIInstruction::OpSameValue: OS << "OpSameValue"; break; + case MCCFIInstruction::OpRememberState: OS << "OpRememberState"; break; + case MCCFIInstruction::OpRestoreState: OS << "OpRestoreState"; break; + case MCCFIInstruction::OpOffset: OS << "OpOffset"; break; + case MCCFIInstruction::OpDefCfaRegister: OS << "OpDefCfaRegister"; break; + case MCCFIInstruction::OpDefCfaOffset: OS << "OpDefCfaOffset"; break; + case MCCFIInstruction::OpDefCfa: OS << "OpDefCfa"; break; + case MCCFIInstruction::OpRelOffset: OS << "OpRelOffset"; break; + case MCCFIInstruction::OpAdjustCfaOffset: OS << "OfAdjustCfaOffset"; break; + case MCCFIInstruction::OpEscape: OS << "OpEscape"; break; + case MCCFIInstruction::OpRestore: OS << "OpRestore"; break; + case MCCFIInstruction::OpUndefined: OS << "OpUndefined"; break; + case MCCFIInstruction::OpRegister: OS << "OpRegister"; break; + case MCCFIInstruction::OpWindowSave: OS << "OpWindowSave"; break; + } + }; + auto printInstruction = [&](const MCInst &Instruction) { if 
(BC.MIA->isEHLabel(Instruction)) { OS << " EH_LABEL: " @@ -108,6 +129,14 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, return; } OS << format(" %08" PRIx64 ": ", Offset); + if (BC.MIA->isCFI(Instruction)) { + uint32_t Offset = Instruction.getOperand(0).getImm(); + OS << "\t!CFI\t$" << Offset << "\t; "; + assert(Offset < FrameInstructions.size() && "Invalid CFI offset"); + printCFI(FrameInstructions[Offset].getOperation()); + OS << "\n"; + return; + } BC.InstPrinter->printInst(&Instruction, OS, "", *BC.STI); if (BC.MIA->isCall(Instruction)) { if (BC.MIA->isTailCall(Instruction)) @@ -216,30 +245,26 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << '\n'; } - if (!FrameInstructions.empty()) { - OS << "DWARF CFI Instructions:\n"; - for (auto &CFIInstr : FrameInstructions) { - OS << format(" %08x: ", CFIInstr.first); - switch(CFIInstr.second.getOperation()) { - case MCCFIInstruction::OpSameValue: OS << "OpSameValue"; break; - case MCCFIInstruction::OpRememberState: OS << "OpRememberState"; break; - case MCCFIInstruction::OpRestoreState: OS << "OpRestoreState"; break; - case MCCFIInstruction::OpOffset: OS << "OpOffset"; break; - case MCCFIInstruction::OpDefCfaRegister: OS << "OpDefCfaRegister"; break; - case MCCFIInstruction::OpDefCfaOffset: OS << "OpDefCfaOffset"; break; - case MCCFIInstruction::OpDefCfa: OS << "OpDefCfa"; break; - case MCCFIInstruction::OpRelOffset: OS << "OpRelOffset"; break; - case MCCFIInstruction::OpAdjustCfaOffset:OS << "OfAdjustCfaOffset"; break; - case MCCFIInstruction::OpEscape: OS << "OpEscape"; break; - case MCCFIInstruction::OpRestore: OS << "OpRestore"; break; - case MCCFIInstruction::OpUndefined: OS << "OpUndefined"; break; - case MCCFIInstruction::OpRegister: OS << "OpRegister"; break; - case MCCFIInstruction::OpWindowSave: OS << "OpWindowSave"; break; - } - OS << '\n'; + OS << "DWARF CFI Instructions:\n"; + if (OffsetToCFI.size()) { + // Pre-buildCFG information + for (auto &Elmt : 
OffsetToCFI) { + OS << format(" %08x:\t", Elmt.first); + assert(Elmt.second < FrameInstructions.size() && "Incorrect CFI offset"); + printCFI(FrameInstructions[Elmt.second].getOperation()); + OS << "\n"; + } + } else { + // Post-buildCFG information + for (uint32_t I = 0, E = FrameInstructions.size(); I != E; ++I) { + const MCCFIInstruction &CFI = FrameInstructions[I]; + OS << format(" %d:\t", I); + printCFI(CFI.getOperation()); + OS << "\n"; } - OS << '\n'; } + if (FrameInstructions.empty()) + OS << " \n"; OS << "End of Function \"" << getName() << "\"\n\n"; } @@ -449,7 +474,18 @@ bool BinaryFunction::buildCFG() { BinaryBasicBlock *PrevBB{nullptr}; bool IsLastInstrNop = false; MCInst *PrevInstr{nullptr}; - for (auto &InstrInfo : Instructions) { + + auto addCFIPlaceholders = + [this](uint64_t CFIOffset, BinaryBasicBlock *InsertBB) { + for (auto FI = OffsetToCFI.lower_bound(CFIOffset), + FE = OffsetToCFI.upper_bound(CFIOffset); + FI != FE; ++FI) { + addCFIPseudo(InsertBB, InsertBB->end(), FI->second); + } + }; + + for (auto I = Instructions.begin(), E = Instructions.end(); I != E; ++I) { + auto &InstrInfo = *I; auto LI = Labels.find(InstrInfo.first); if (LI != Labels.end()) { // Always create new BB at branch destination. @@ -472,7 +508,10 @@ bool BinaryFunction::buildCFG() { /* DeriveAlignment = */ IsLastInstrNop); } } - + if (InstrInfo.first == 0) { + // Add associated CFI pseudos in the first offset (0) + addCFIPlaceholders(0, InsertBB); + } // Ignore nops. We use nops to derive alignment of the next basic block. // It will not always work, as some blocks are naturally aligned, but // it's just part of heuristic for block alignment. @@ -484,6 +523,17 @@ bool BinaryFunction::buildCFG() { IsLastInstrNop = false; InsertBB->addInstruction(InstrInfo.second); PrevInstr = &InstrInfo.second; + // Add associated CFI instrs. 
We always add the CFI instruction that is + // located immediately after this instruction, since the next CFI + // instruction reflects the change in state caused by this instruction. + auto NextInstr = I; + ++NextInstr; + uint64_t CFIOffset; + if (NextInstr != E) + CFIOffset = NextInstr->first; + else + CFIOffset = getSize(); + addCFIPlaceholders(CFIOffset, InsertBB); // How well do we detect tail calls here? if (MIA->isTerminator(InstrInfo.second)) { @@ -559,14 +609,19 @@ bool BinaryFunction::buildCFG() { PrevBB->addSuccessor(&BB, BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE, BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE); } - - MCInst &LastInst = BB.back(); if (BB.empty()) { IsPrevFT = true; - } else if (BB.succ_size() == 0) { - IsPrevFT = MIA->isTerminator(LastInst) ? false : true; + PrevBB = &BB; + continue; + } + + auto LastInstIter = --BB.end(); + while (MIA->isCFI(*LastInstIter) && LastInstIter != BB.begin()) + --LastInstIter; + if (BB.succ_size() == 0) { + IsPrevFT = MIA->isTerminator(*LastInstIter) ? false : true; } else if (BB.succ_size() == 1) { - IsPrevFT = MIA->isConditionalBranch(LastInst) ? true : false; + IsPrevFT = MIA->isConditionalBranch(*LastInstIter) ? true : false; } else { // Ends with 2 branches, with an indirect jump or it is a conditional // branch whose frequency has been inferred from LBR @@ -586,8 +641,12 @@ bool BinaryFunction::buildCFG() { inferFallThroughCounts(); } + // Update CFI information for each BB + annotateCFIState(); + // Clean-up memory taken by instructions and labels. clearInstructions(); + clearCFIOffsets(); clearLabels(); clearLocalBranches(); clearFTBranches(); @@ -646,12 +705,16 @@ void BinaryFunction::inferFallThroughCounts() { uint64_t Inferred = 0; if (BBExecCount > ReportedBranches) Inferred = BBExecCount - ReportedBranches; - if (BBExecCount < ReportedBranches) - errs() << "FLO-WARNING: Fall-through inference is slightly inconsistent. 
" - "exec frequency is less than the outgoing edges frequency (" - << BBExecCount << " < " << ReportedBranches - << ") for BB at offset 0x" - << Twine::utohexstr(getAddress() + CurBB.getOffset()) << '\n'; + + DEBUG({ + if (BBExecCount < ReportedBranches) + dbgs() + << "FLO-WARNING: Fall-through inference is slightly inconsistent. " + "exec frequency is less than the outgoing edges frequency (" + << BBExecCount << " < " << ReportedBranches + << ") for BB at offset 0x" + << Twine::utohexstr(getAddress() + CurBB.getOffset()) << '\n'; + }); // Put this information into the fall-through edge if (CurBB.succ_size() == 0) @@ -669,6 +732,154 @@ void BinaryFunction::inferFallThroughCounts() { return; } +void BinaryFunction::annotateCFIState() { + assert(!BasicBlocks.empty() && "basic block list should not be empty"); + + uint32_t State = 0; + uint32_t HighestState = 0; + std::stack StateStack; + + for (auto CI = BasicBlocks.begin(), CE = BasicBlocks.end(); CI != CE; ++CI) { + BinaryBasicBlock &CurBB = *CI; + // Annotate this BB entry + BBCFIState.emplace_back(State); + + // Advance state + for (const auto &Instr : CurBB) { + MCCFIInstruction *CFI = getCFIFor(Instr); + if (CFI == nullptr) + continue; + ++HighestState; + if (CFI->getOperation() == MCCFIInstruction::OpRememberState) { + StateStack.push(State); + continue; + } + if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) { + assert(!StateStack.empty() && "Corrupt CFI stack"); + State = StateStack.top(); + StateStack.pop(); + continue; + } + State = HighestState; + } + } + + // Store the state after the last BB + BBCFIState.emplace_back(State); + + assert(StateStack.empty() && "Corrupt CFI stack"); +} + +bool BinaryFunction::fixCFIState() { + auto Sep = ""; + DEBUG(dbgs() << "Trying to fix CFI states for each BB after reordering.\n"); + DEBUG(dbgs() << "This is the list of CFI states for each BB of " << getName() + << ": "); + + auto replayCFIInstrs = + [this](uint32_t FromState, uint32_t ToState, 
BinaryBasicBlock *InBB, + BinaryBasicBlock::const_iterator InsertIt) -> bool { + if (FromState == ToState) + return true; + assert(FromState < ToState); + + for (uint32_t CurState = FromState; CurState < ToState; ++CurState) { + MCCFIInstruction *Instr = &FrameInstructions[CurState]; + if (Instr->getOperation() == MCCFIInstruction::OpRememberState || + Instr->getOperation() == MCCFIInstruction::OpRestoreState) { + // TODO: If in replaying the CFI instructions to reach this state we + // have state stack instructions, we could still work out the logic + // to extract only the necessary instructions to reach this state + // without using the state stack. Not sure if it is worth the effort + // because this happens rarely. + errs() << "FLO-WARNING: CFI rewriter expected state " << ToState + << " but found " << FromState << " instead (@ " << getName() + << "). Giving up this function.\n"; + return false; + } + InsertIt = + addCFIPseudo(InBB, InsertIt, Instr - &*FrameInstructions.begin()); + ++InsertIt; + } + + return true; + }; + + uint32_t State = 0; + BinaryBasicBlock *EntryBB = *BasicBlocksLayout.begin(); + for (BinaryBasicBlock *BB : BasicBlocksLayout) { + uint32_t BBIndex = BB - &*BasicBlocks.begin(); + + // Check if state is what this BB expect it to be at its entry point + if (BBCFIState[BBIndex] != State) { + // Need to recover the correct state + if (BBCFIState[BBIndex] < State) { + // In this case, State is currently higher than what this BB expect it + // to be. To solve this, we need to insert a CFI instruction to remember + // the old state at function entry, then another CFI instruction to + // restore it at the entry of this BB and replay CFI instructions to + // reach the desired state. + uint32_t OldState = BBCFIState[BBIndex]; + // Remember state at function entry point (our reference state). 
+ BinaryBasicBlock::const_iterator InsertIt = EntryBB->begin(); + while (InsertIt != EntryBB->end() && BC.MIA->isCFI(*InsertIt)) + ++InsertIt; + addCFIPseudo(EntryBB, InsertIt, FrameInstructions.size()); + FrameInstructions.emplace_back( + MCCFIInstruction::createRememberState(nullptr)); + // Restore state + InsertIt = addCFIPseudo(BB, BB->begin(), FrameInstructions.size()); + ++InsertIt; + FrameInstructions.emplace_back( + MCCFIInstruction::createRestoreState(nullptr)); + if (!replayCFIInstrs(0, OldState, BB, InsertIt)) + return false; + // Check if we messed up the stack in this process + int StackOffset = 0; + for (BinaryBasicBlock *CurBB : BasicBlocksLayout) { + if (CurBB == BB) + break; + for (auto &Instr : *CurBB) { + if (MCCFIInstruction *CFI = getCFIFor(Instr)) { + if (CFI->getOperation() == MCCFIInstruction::OpRememberState) + ++StackOffset; + if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) + --StackOffset; + } + } + } + auto Pos = BB->begin(); + while (MCCFIInstruction *CFI = getCFIFor(*Pos++)) { + if (CFI->getOperation() == MCCFIInstruction::OpRememberState) + ++StackOffset; + if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) + --StackOffset; + } + + if (StackOffset != 0) { + errs() << " FLO-WARNING: not possible to remember/recover state" + << "without corrupting CFI state stack in function " + << getName() << "\n"; + return false; + } + } else { + // If BBCFIState[BBIndex] > State, it means we are behind in the + // state. Just emit all instructions to reach this state at the + // beginning of this BB. If this sequence of instructions involve + // remember state or restore state, bail out. 
+ if (!replayCFIInstrs(State, BBCFIState[BBIndex], BB, BB->begin())) + return false; + } + } + + State = BBCFIState[BBIndex + 1]; + DEBUG(dbgs() << Sep << State); + DEBUG(Sep = ", "); + } + DEBUG(dbgs() << "\n"); + return true; +} + void BinaryFunction::optimizeLayout(HeuristicPriority Priority) { // Bail if no profiling information or if empty if (getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE || @@ -792,8 +1003,8 @@ void BinaryFunction::optimizeLayout(HeuristicPriority Priority) { for (uint32_t I = 1, E = Clusters.size(); I < E; ++I) { double Freq = 0.0; for (auto BB : Clusters[I]) { - if (!BB->empty()) - Freq += BB->getExecutionCount() / BB->size(); + if (!BB->empty() && BB->size() != BB->getNumPseudos()) + Freq += BB->getExecutionCount() / (BB->size() - BB->getNumPseudos()); } AvgFreq[I] = Freq; } @@ -1039,10 +1250,16 @@ void BinaryFunction::fixBranches() { // Case 1: There are no branches in this basic block and it just falls // through if (CondBranch == nullptr && UncondBranch == nullptr) { - // Case 1a: Last instruction is a return, so it does *not* fall through to - // the next block. - if (!BB->empty() && MIA->isReturn(BB->back())) - continue; + // Case 1a: Last instruction, excluding pseudos, is a return, so it does + // *not* fall through to the next block. + if (!BB->empty()) { + auto LastInstIter = --BB->end(); + while (BC.MII->get(LastInstIter->getOpcode()).isPseudo() && + LastInstIter != BB->begin()) + --LastInstIter; + if (MIA->isReturn(*LastInstIter)) + continue; + } // Case 1b: Layout has changed and the fallthrough is not the same. Need // to add a new unconditional branch to jump to the old fallthrough. if (FT != OldFT && OldFT != nullptr) { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 82fb078f34c1..141de33c1840 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -136,6 +136,13 @@ class BinaryFunction { return *this; } + /// Release storage used by CFI offsets map. 
+ BinaryFunction &clearCFIOffsets() { + std::multimap TempMap; + OffsetToCFI.swap(TempMap); + return *this; + } + /// Release storage used by instructions. BinaryFunction &clearLabels() { LabelsMapType TempMap; @@ -178,8 +185,10 @@ class BinaryFunction { using InstrMapType = std::map; InstrMapType Instructions; - /// List of DWARF CFI instructions - using CFIInstrMapType = std::multimap; + /// List of DWARF CFI instructions. Original CFI from the binary must be + /// sorted w.r.t. offset that it appears. We rely on this to replay CFIs + /// if needed (to fix state after reordering BBs). + using CFIInstrMapType = std::vector; CFIInstrMapType FrameInstructions; /// Exception handling ranges. @@ -191,6 +200,11 @@ class BinaryFunction { }; std::vector CallSites; + /// Map to discover which CFIs are attached to a given instruction offset. + /// Maps an instruction offset into a FrameInstructions offset. + /// This is only relevant to the buildCFG phase and is discarded afterwards. + std::multimap OffsetToCFI; + // Blocks are kept sorted in the layout order. If we need to change the // layout (if BasicBlocksLayout stores a different order than BasicBlocks), // the terminating instructions need to be modified. @@ -199,6 +213,12 @@ class BinaryFunction { BasicBlockListType BasicBlocks; BasicBlockOrderType BasicBlocksLayout; + // At each basic block entry we attach a CFI state to detect if reordering + // corrupts the CFI state for a block. The CFI state is simply the index in + // FrameInstructions for the CFI responsible for creating this state. + // This vector is indexed by BB index. 
+ std::vector BBCFIState; + public: typedef BasicBlockListType::iterator iterator; @@ -366,8 +386,50 @@ class BinaryFunction { Instructions.emplace(Offset, std::forward(Instruction)); } - void addCFIInstruction(uint64_t Offset, MCCFIInstruction &&Inst) { - FrameInstructions.emplace(Offset, std::forward(Inst)); + void addCFI(uint64_t Offset, MCCFIInstruction &&Inst) { + assert(!Instructions.empty()); + + // Fix CFI instructions skipping NOPs. We need to fix this because changing + // CFI state after a NOP, besides being wrong and innacurate, makes it + // harder for us to recover this information, since we can create empty BBs + // with NOPs and then reorder it away. + // We fix this by moving the CFI instruction just before any NOPs. + auto I = Instructions.lower_bound(Offset); + assert(I->first == Offset && "CFI pointing to unknown instruction"); + if (I == Instructions.begin()) { + OffsetToCFI.emplace(Offset, FrameInstructions.size()); + FrameInstructions.emplace_back(std::forward(Inst)); + return; + } + + --I; + while (I != Instructions.begin() && BC.MIA->isNoop(I->second)) { + Offset = I->first; + --I; + } + OffsetToCFI.emplace(Offset, FrameInstructions.size()); + FrameInstructions.emplace_back(std::forward(Inst)); + return; + } + + /// Insert a CFI pseudo instruction in a basic block. This pseudo instruction + /// is a placeholder that refers to a real MCCFIInstruction object kept by + /// this function that will be emitted at that position. + BinaryBasicBlock::const_iterator + addCFIPseudo(BinaryBasicBlock *BB, BinaryBasicBlock::const_iterator Pos, + uint32_t Offset) { + MCInst CFIPseudo; + BC.MIA->createCFI(CFIPseudo, Offset); + return BB->insertPseudoInstr(Pos, CFIPseudo); + } + + /// Retrieve the MCCFIInstruction object associated with a CFI pseudo. 
+ MCCFIInstruction* getCFIFor(const MCInst &Instr) { + if (!BC.MIA->isCFI(Instr)) + return nullptr; + uint32_t Offset = Instr.getOperand(0).getImm(); + assert(Offset < FrameInstructions.size() && "Invalid CFI offset"); + return &FrameInstructions[Offset]; } BinaryFunction &setFileOffset(uint64_t Offset) { @@ -471,6 +533,17 @@ class BinaryFunction { /// has been filled with LBR data. void inferFallThroughCounts(); + /// Annotate each basic block entry with its current CFI state. This is used + /// to detect when reordering changes the CFI state seen by a basic block and + /// fix this. + /// The CFI state is simply the index in FrameInstructions for the + /// MCCFIInstruction object responsible for this state. + void annotateCFIState(); + + /// After reordering, this function checks the state of CFI and fixes it if it + /// is corrupted. If it is unable to fix it, it returns false. + bool fixCFIState(); + /// Traverse the CFG checking branches, inverting their condition, removing or /// adding jumps based on a new layout order. 
void fixBranches(); diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 1800e3fe8d09..1fb54e99870e 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -569,21 +569,20 @@ void CFIReader::fillCFIInfoFor(BinaryFunction &Function) const { Offset += CodeAlignment * int64_t(Instr.Ops[0]); break; case DW_CFA_offset_extended_sf: - Function.addCFIInstruction( - Offset, - MCCFIInstruction::createOffset( - nullptr, Instr.Ops[0], DataAlignment * int64_t(Instr.Ops[1]))); + Function.addCFI(Offset, MCCFIInstruction::createOffset( + nullptr, Instr.Ops[0], + DataAlignment * int64_t(Instr.Ops[1]))); break; case DW_CFA_offset_extended: case DW_CFA_offset: - Function.addCFIInstruction( + Function.addCFI( Offset, MCCFIInstruction::createOffset(nullptr, Instr.Ops[0], DataAlignment * Instr.Ops[1])); break; case DW_CFA_restore_extended: case DW_CFA_restore: - Function.addCFIInstruction( - Offset, MCCFIInstruction::createRestore(nullptr, Instr.Ops[0])); + Function.addCFI(Offset, + MCCFIInstruction::createRestore(nullptr, Instr.Ops[0])); break; case DW_CFA_set_loc: assert(Instr.Ops[0] < Address && "set_loc out of function bounds"); @@ -593,49 +592,44 @@ void CFIReader::fillCFIInfoFor(BinaryFunction &Function) const { break; case DW_CFA_undefined: - Function.addCFIInstruction( - Offset, MCCFIInstruction::createUndefined(nullptr, Instr.Ops[0])); + Function.addCFI(Offset, + MCCFIInstruction::createUndefined(nullptr, Instr.Ops[0])); break; case DW_CFA_same_value: - Function.addCFIInstruction( - Offset, MCCFIInstruction::createSameValue(nullptr, Instr.Ops[0])); + Function.addCFI(Offset, + MCCFIInstruction::createSameValue(nullptr, Instr.Ops[0])); break; case DW_CFA_register: - Function.addCFIInstruction( - Offset, MCCFIInstruction::createRegister(nullptr, Instr.Ops[0], - Instr.Ops[1])); + Function.addCFI(Offset, MCCFIInstruction::createRegister( + nullptr, Instr.Ops[0], Instr.Ops[1])); break; case DW_CFA_remember_state: - Function.addCFIInstruction( - Offset, 
MCCFIInstruction::createRememberState(nullptr)); + Function.addCFI(Offset, MCCFIInstruction::createRememberState(nullptr)); break; case DW_CFA_restore_state: - Function.addCFIInstruction(Offset, - MCCFIInstruction::createRestoreState(nullptr)); + Function.addCFI(Offset, MCCFIInstruction::createRestoreState(nullptr)); break; case DW_CFA_def_cfa: - Function.addCFIInstruction( - Offset, - MCCFIInstruction::createDefCfa(nullptr, Instr.Ops[0], Instr.Ops[1])); + Function.addCFI(Offset, MCCFIInstruction::createDefCfa( + nullptr, Instr.Ops[0], Instr.Ops[1])); break; case DW_CFA_def_cfa_sf: - Function.addCFIInstruction( - Offset, - MCCFIInstruction::createDefCfa( - nullptr, Instr.Ops[0], DataAlignment * int64_t(Instr.Ops[1]))); + Function.addCFI(Offset, MCCFIInstruction::createDefCfa( + nullptr, Instr.Ops[0], + DataAlignment * int64_t(Instr.Ops[1]))); break; case DW_CFA_def_cfa_register: - Function.addCFIInstruction(Offset, MCCFIInstruction::createDefCfaRegister( - nullptr, Instr.Ops[0])); + Function.addCFI(Offset, MCCFIInstruction::createDefCfaRegister( + nullptr, Instr.Ops[0])); break; case DW_CFA_def_cfa_offset: - Function.addCFIInstruction( + Function.addCFI( Offset, MCCFIInstruction::createDefCfaOffset(nullptr, Instr.Ops[0])); break; case DW_CFA_def_cfa_offset_sf: - Function.addCFIInstruction( - Offset, MCCFIInstruction::createDefCfaOffset( - nullptr, DataAlignment * int64_t(Instr.Ops[0]))); + Function.addCFI(Offset, + MCCFIInstruction::createDefCfaOffset( + nullptr, DataAlignment * int64_t(Instr.Ops[0]))); break; case DW_CFA_val_offset_sf: case DW_CFA_val_offset: diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index f35d0d4a0535..d579c6036361 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -559,7 +559,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { continue; // Fill in CFI information for this function - if (EHFrame.ParseError.empty()) + if (EHFrame.ParseError.empty() && Function.isSimple()) 
DwCFIReader.fillCFIInfoFor(Function); // Parse LSDA. @@ -655,6 +655,10 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { if (opts::PrintAll || opts::PrintEHRanges) { Function.print(errs(), "after updating EH ranges"); } + + // After optimizations, fix the CFI state + if (!Function.fixCFIState()) + Function.setSimple(false); } std::error_code EC; @@ -744,7 +748,8 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { Streamer->EmitLabel(const_cast(Label)); continue; } - Streamer->EmitInstruction(Instr, *BC->STI); + if (!BC->MII->get(Instr.getOpcode()).isPseudo()) + Streamer->EmitInstruction(Instr, *BC->STI); } } From 3024a5f465a71180a130b6f2ff7899905b3820cd Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 9 Nov 2015 11:08:02 -0800 Subject: [PATCH 043/904] Write updated CFI to temporary object file Summary: This patch is an intermediary step towards updating the CFI in the optimized binary. It adds the logic necessary to output our CFI annotations to a new .eh_frame in the temporary object file we create to hold rewritten functions. The next step will be to fully integrate this new .eh_frame into the optimized binary. (cherry picked from commit 4ab892eb7d4d636979940412e1024f353d851638) --- bolt/BinaryFunction.h | 5 +++ bolt/llvm-flo.cpp | 87 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 89 insertions(+), 3 deletions(-) diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 141de33c1840..001bf9954c7d 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -327,6 +327,11 @@ class BinaryFunction { return IsSimple; } + /// Return true if the function has CFI instructions + bool hasCFI() const { + return !FrameInstructions.empty(); + } + /// Return true if the given address \p PC is inside the function body. 
bool containsAddress(uint64_t PC) const { return Address <= PC && PC < Address + Size; diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index d579c6036361..2b8b59ba027c 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -219,9 +219,12 @@ class ExecutableFileMemoryManager : public SectionMemoryManager { DEBUG(dbgs() << "FLO: allocating data section : " << SectionName << " with size " << Size << ", alignment " << Alignment << "\n"); - errs() << "FLO-WARNING: allocating data section.\n"; - return SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID, - SectionName, IsReadOnly); + auto ret = SectionMemoryManager::allocateDataSection( + Size, Alignment, SectionID, SectionName, IsReadOnly); + + SectionAddressInfo[SectionName] = {reinterpret_cast(ret), Size}; + + return ret; } // Tell EE that we guarantee we don't need stubs. @@ -466,6 +469,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { uint64_t LSDAAddress{0}; // Process special sections. + uint64_t EHFrameAddress = 0ULL; for (const auto &Section : File->sections()) { StringRef SectionName; check_error(Section.getName(SectionName), "cannot get section name"); @@ -481,6 +485,9 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { LSDAData = SectionData; LSDAAddress = Section.getAddress(); } + if (SectionName == ".eh_frame") { + EHFrameAddress = Section.getAddress(); + } } // Process debug sections. 
@@ -699,6 +706,53 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { Streamer->InitSections(false); + // Define a helper to decode and emit CFI instructions at a given point in a + // BB + auto emitCFIInstr = [&Streamer](MCCFIInstruction &CFIInstr) { + switch (CFIInstr.getOperation()) { + default: + llvm_unreachable("Unexpected instruction"); + case MCCFIInstruction::OpDefCfaOffset: + Streamer->EmitCFIDefCfaOffset(CFIInstr.getOffset()); + break; + case MCCFIInstruction::OpAdjustCfaOffset: + Streamer->EmitCFIAdjustCfaOffset(CFIInstr.getOffset()); + break; + case MCCFIInstruction::OpDefCfa: + Streamer->EmitCFIDefCfa(CFIInstr.getRegister(), CFIInstr.getOffset()); + break; + case MCCFIInstruction::OpDefCfaRegister: + Streamer->EmitCFIDefCfaRegister(CFIInstr.getRegister()); + break; + case MCCFIInstruction::OpOffset: + Streamer->EmitCFIOffset(CFIInstr.getRegister(), CFIInstr.getOffset()); + break; + case MCCFIInstruction::OpRegister: + Streamer->EmitCFIRegister(CFIInstr.getRegister(), + CFIInstr.getRegister2()); + break; + case MCCFIInstruction::OpRelOffset: + Streamer->EmitCFIRelOffset(CFIInstr.getRegister(), CFIInstr.getOffset()); + break; + case MCCFIInstruction::OpUndefined: + Streamer->EmitCFIUndefined(CFIInstr.getRegister()); + break; + case MCCFIInstruction::OpRememberState: + Streamer->EmitCFIRememberState(); + break; + case MCCFIInstruction::OpRestoreState: + Streamer->EmitCFIRestoreState(); + break; + case MCCFIInstruction::OpRestore: + Streamer->EmitCFIRestore(CFIInstr.getRegister()); + break; + case MCCFIInstruction::OpSameValue: + Streamer->EmitCFISameValue(CFIInstr.getRegister()); + break; + } + }; + + bool HasEHFrame = false; // Output functions one by one. 
for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; @@ -733,6 +787,12 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { Streamer->EmitSymbolAttribute(FunctionSymbol, MCSA_ELF_TypeFunction); Streamer->EmitLabel(FunctionSymbol); + // Emit CFI start + if (Function.hasCFI()) { + HasEHFrame = true; + Streamer->EmitCFIStartProc(/*IsSimple=*/false); + } + // Emit code. for (auto BB : Function.layout()) { if (BB->getAlignment() > 1) @@ -750,9 +810,15 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { } if (!BC->MII->get(Instr.getOpcode()).isPseudo()) Streamer->EmitInstruction(Instr, *BC->STI); + else + emitCFIInstr(*Function.getCFIFor(Instr)); } } + // Emit CFI end + if (Function.hasCFI()) + Streamer->EmitCFIEndProc(); + // TODO: is there any use in emiting end of function? // Perhaps once we have a support for C++ exceptions. //auto FunctionEndLabel = Ctx.createTempSymbol("func_end"); @@ -821,6 +887,21 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { errs() << "FLO: cannot remap function " << Function.getName() << "\n"; } } + // Map .eh_frame + if (HasEHFrame) { + assert(EHFrameAddress); + auto SAI = EFMM->SectionAddressInfo.find(".eh_frame"); + if (SAI != EFMM->SectionAddressInfo.end()) { + DEBUG(dbgs() << "FLO: mapping 0x" << Twine::utohexstr(SAI->second.first) + << " to 0x" << Twine::utohexstr(EHFrameAddress) + << '\n'); + OLT.mapSectionAddress(ObjectsHandle, + reinterpret_cast(SAI->second.first), + EHFrameAddress); + } else { + errs() << "FLO: cannot remap .eh_frame\n"; + } + } OLT.emitAndFinalize(ObjectsHandle); From f1143d2afdd52ce8f11127a01280b1ab739883ae Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 10 Nov 2015 15:20:50 -0800 Subject: [PATCH 044/904] Write .eh_frame and .eh_frame_hdr after reordering BBs Summary: This patch adds logic to detect when the binary has extra space reserved for us via the __flo_storage symbol. 
If this symbol is present, it means we have extra space in the binary to write extraneous information. When we write a new .eh_frame, we cannot discard the old .eh_frame because it may still contain relevant information for functions we do not reorder. Thus, we write the new .eh_frame into __flo_storage and patch the current .eh_frame_hdr to point to the new .eh_frame only for the functions we touched, generating a binary that works with a bi-.eh_frame model. (cherry picked from commit cbbc2b0dc1a08187e59814bdcd79951e5baf1b5e) --- bolt/BinaryFunction.h | 49 ++++++- bolt/Exceptions.cpp | 328 +++++++++++++++++++++++++++++------------- bolt/Exceptions.h | 15 +- bolt/llvm-flo.cpp | 130 +++++++++++++++-- 4 files changed, 404 insertions(+), 118 deletions(-) diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 001bf9954c7d..2ac217879513 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -31,6 +31,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Dwarf.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -105,6 +106,9 @@ class BinaryFunction { /// flow graph and re-assemble. bool IsSimple{true}; + MCSymbol *PersonalityFunction{nullptr}; + uint8_t PersonalityEncoding{dwarf::DW_EH_PE_sdata4 | dwarf::DW_EH_PE_pcrel}; + BinaryContext &BC; /// The address for the code for this function in codegen memory. @@ -189,6 +193,8 @@ class BinaryFunction { /// sorted w.r.t. offset that it appears. We rely on this to replay CFIs /// if needed (to fix state after reordering BBs). using CFIInstrMapType = std::vector; + using cfi_iterator = CFIInstrMapType::iterator; + using const_cfi_iterator = CFIInstrMapType::const_iterator; CFIInstrMapType FrameInstructions; /// Exception handling ranges. @@ -205,6 +211,10 @@ class BinaryFunction { /// This is only relevant to the buildCFG phase and is discarded afterwards. 
std::multimap OffsetToCFI; + /// List of CFI instructions associated with the CIE (common to more than one + /// function and that apply before the entry basic block). + CFIInstrMapType CIEFrameInstructions; + // Blocks are kept sorted in the layout order. If we need to change the // layout (if BasicBlocksLayout stores a different order than BasicBlocks), // the terminating instructions need to be modified. @@ -259,6 +269,20 @@ class BinaryFunction { BasicBlocksLayout.end()); } + cfi_iterator cie_begin() { return CIEFrameInstructions.begin(); } + const_cfi_iterator cie_begin() const { return CIEFrameInstructions.begin(); } + cfi_iterator cie_end() { return CIEFrameInstructions.end(); } + const_cfi_iterator cie_end() const { return CIEFrameInstructions.end(); } + bool cie_empty() const { return CIEFrameInstructions.empty(); } + + inline iterator_range cie() { + return iterator_range(cie_begin(), cie_end()); + } + inline iterator_range cie() const { + return iterator_range(cie_begin(), cie_end()); + } + + BinaryFunction(std::string Name, SymbolRef Symbol, SectionRef Section, uint64_t Address, uint64_t Size, BinaryContext &BC) : Name(Name), Symbol(Symbol), Section(Section), Address(Address), @@ -327,9 +351,17 @@ class BinaryFunction { return IsSimple; } + MCSymbol *getPersonalityFunction() const { + return PersonalityFunction; + } + + uint8_t getPersonalityEncoding() const { + return PersonalityEncoding; + } + /// Return true if the function has CFI instructions bool hasCFI() const { - return !FrameInstructions.empty(); + return !FrameInstructions.empty() || !CIEFrameInstructions.empty(); } /// Return true if the given address \p PC is inside the function body. @@ -391,7 +423,7 @@ class BinaryFunction { Instructions.emplace(Offset, std::forward(Instruction)); } - void addCFI(uint64_t Offset, MCCFIInstruction &&Inst) { + void addCFIInstruction(uint64_t Offset, MCCFIInstruction &&Inst) { assert(!Instructions.empty()); // Fix CFI instructions skipping NOPs. 
We need to fix this because changing @@ -402,8 +434,7 @@ class BinaryFunction { auto I = Instructions.lower_bound(Offset); assert(I->first == Offset && "CFI pointing to unknown instruction"); if (I == Instructions.begin()) { - OffsetToCFI.emplace(Offset, FrameInstructions.size()); - FrameInstructions.emplace_back(std::forward(Inst)); + CIEFrameInstructions.emplace_back(std::forward(Inst)); return; } @@ -452,6 +483,16 @@ class BinaryFunction { return *this; } + BinaryFunction &setPersonalityFunction(uint64_t Addr) { + PersonalityFunction = BC.getOrCreateGlobalSymbol(Addr, "FUNCat"); + return *this; + } + + BinaryFunction &setPersonalityEncoding(uint8_t Encoding) { + PersonalityEncoding = Encoding; + return *this; + } + BinaryFunction &setAlignment(uint64_t Align) { Alignment = Align; return *this; diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 1fb54e99870e..022d8c5ca108 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -534,7 +534,7 @@ void BinaryFunction::updateEHRanges() { const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0; const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f; -void CFIReader::fillCFIInfoFor(BinaryFunction &Function) const { +void CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { uint64_t Address = Function.getAddress(); auto I = FDEs.find(Address); if (I == FDEs.end()) @@ -554,105 +554,239 @@ void CFIReader::fillCFIInfoFor(BinaryFunction &Function) const { uint64_t Offset = 0; uint64_t CodeAlignment = CurFDE.getLinkedCIE()->getCodeAlignmentFactor(); uint64_t DataAlignment = CurFDE.getLinkedCIE()->getDataAlignmentFactor(); + if (CurFDE.getLinkedCIE()->getPersonalityAddress() != 0) { + Function.setPersonalityFunction( + CurFDE.getLinkedCIE()->getPersonalityAddress()); + Function.setPersonalityEncoding( + CurFDE.getLinkedCIE()->getPersonalityEncoding()); + } + + auto decodeFrameInstruction = + [&Function, &Offset, Address, CodeAlignment, DataAlignment]( + const FrameEntry::Instruction &Instr) { + uint8_t 
Opcode = Instr.Opcode; + if (Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK) + Opcode &= DWARF_CFI_PRIMARY_OPCODE_MASK; + switch (Instr.Opcode) { + case DW_CFA_nop: + break; + case DW_CFA_advance_loc4: + case DW_CFA_advance_loc2: + case DW_CFA_advance_loc1: + case DW_CFA_advance_loc: + // Advance our current address + Offset += CodeAlignment * int64_t(Instr.Ops[0]); + break; + case DW_CFA_offset_extended_sf: + Function.addCFIInstruction( + Offset, MCCFIInstruction::createOffset( + nullptr, Instr.Ops[0], + DataAlignment * int64_t(Instr.Ops[1]))); + break; + case DW_CFA_offset_extended: + case DW_CFA_offset: + Function.addCFIInstruction( + Offset, MCCFIInstruction::createOffset( + nullptr, Instr.Ops[0], DataAlignment * Instr.Ops[1])); + break; + case DW_CFA_restore_extended: + case DW_CFA_restore: + Function.addCFIInstruction( + Offset, MCCFIInstruction::createRestore(nullptr, Instr.Ops[0])); + break; + case DW_CFA_set_loc: + assert(Instr.Ops[0] < Address && "set_loc out of function bounds"); + assert(Instr.Ops[0] > Address + Function.getSize() && + "set_loc out of function bounds"); + Offset = Instr.Ops[0] - Address; + break; + + case DW_CFA_undefined: + Function.addCFIInstruction( + Offset, MCCFIInstruction::createUndefined(nullptr, Instr.Ops[0])); + break; + case DW_CFA_same_value: + Function.addCFIInstruction( + Offset, MCCFIInstruction::createSameValue(nullptr, Instr.Ops[0])); + break; + case DW_CFA_register: + Function.addCFIInstruction( + Offset, MCCFIInstruction::createRegister(nullptr, Instr.Ops[0], + Instr.Ops[1])); + break; + case DW_CFA_remember_state: + Function.addCFIInstruction( + Offset, MCCFIInstruction::createRememberState(nullptr)); + break; + case DW_CFA_restore_state: + Function.addCFIInstruction( + Offset, MCCFIInstruction::createRestoreState(nullptr)); + break; + case DW_CFA_def_cfa: + Function.addCFIInstruction( + Offset, MCCFIInstruction::createDefCfa(nullptr, Instr.Ops[1], + -Instr.Ops[0])); + break; + case DW_CFA_def_cfa_sf: + 
Function.addCFIInstruction( + Offset, MCCFIInstruction::createDefCfa( + nullptr, Instr.Ops[1], + -(DataAlignment * int64_t(Instr.Ops[0])))); + break; + case DW_CFA_def_cfa_register: + Function.addCFIInstruction( + Offset, + MCCFIInstruction::createDefCfaRegister(nullptr, Instr.Ops[0])); + break; + case DW_CFA_def_cfa_offset: + Function.addCFIInstruction( + Offset, + MCCFIInstruction::createDefCfaOffset(nullptr, -Instr.Ops[0])); + break; + case DW_CFA_def_cfa_offset_sf: + Function.addCFIInstruction( + Offset, MCCFIInstruction::createDefCfaOffset( + nullptr, -(DataAlignment * int64_t(Instr.Ops[0])))); + break; + case DW_CFA_val_offset_sf: + case DW_CFA_val_offset: + llvm_unreachable("DWARF val_offset() unimplemented"); + break; + case DW_CFA_expression: + case DW_CFA_def_cfa_expression: + case DW_CFA_val_expression: + llvm_unreachable("DWARF CFA expressions unimplemented"); + break; + dbgs() << "DW_CFA_val_expression"; + break; + case DW_CFA_MIPS_advance_loc8: + llvm_unreachable("DW_CFA_MIPS_advance_loc unimplemented"); + break; + case DW_CFA_GNU_args_size: + case DW_CFA_GNU_window_save: + case DW_CFA_lo_user: + case DW_CFA_hi_user: + llvm_unreachable("DW_CFA_GNU_* and DW_CFA_*_use unimplemented"); + break; + default: + llvm_unreachable("Unrecognized CFI instruction"); + } + }; + + for (const FrameEntry::Instruction &Instr : *(CurFDE.getLinkedCIE())) { + decodeFrameInstruction(Instr); + } + for (const FrameEntry::Instruction &Instr : CurFDE) { - uint8_t Opcode = Instr.Opcode; - if (Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK) - Opcode &= DWARF_CFI_PRIMARY_OPCODE_MASK; - switch (Instr.Opcode) { - case DW_CFA_nop: - break; - case DW_CFA_advance_loc4: - case DW_CFA_advance_loc2: - case DW_CFA_advance_loc1: - case DW_CFA_advance_loc: - // Advance our current address - Offset += CodeAlignment * int64_t(Instr.Ops[0]); - break; - case DW_CFA_offset_extended_sf: - Function.addCFI(Offset, MCCFIInstruction::createOffset( - nullptr, Instr.Ops[0], - DataAlignment * 
int64_t(Instr.Ops[1]))); - break; - case DW_CFA_offset_extended: - case DW_CFA_offset: - Function.addCFI( - Offset, MCCFIInstruction::createOffset(nullptr, Instr.Ops[0], - DataAlignment * Instr.Ops[1])); - break; - case DW_CFA_restore_extended: - case DW_CFA_restore: - Function.addCFI(Offset, - MCCFIInstruction::createRestore(nullptr, Instr.Ops[0])); - break; - case DW_CFA_set_loc: - assert(Instr.Ops[0] < Address && "set_loc out of function bounds"); - assert(Instr.Ops[0] > Address + Function.getSize() && - "set_loc out of function bounds"); - Offset = Instr.Ops[0] - Address; - break; + decodeFrameInstruction(Instr); + } +} - case DW_CFA_undefined: - Function.addCFI(Offset, - MCCFIInstruction::createUndefined(nullptr, Instr.Ops[0])); - break; - case DW_CFA_same_value: - Function.addCFI(Offset, - MCCFIInstruction::createSameValue(nullptr, Instr.Ops[0])); - break; - case DW_CFA_register: - Function.addCFI(Offset, MCCFIInstruction::createRegister( - nullptr, Instr.Ops[0], Instr.Ops[1])); - break; - case DW_CFA_remember_state: - Function.addCFI(Offset, MCCFIInstruction::createRememberState(nullptr)); - break; - case DW_CFA_restore_state: - Function.addCFI(Offset, MCCFIInstruction::createRestoreState(nullptr)); - break; - case DW_CFA_def_cfa: - Function.addCFI(Offset, MCCFIInstruction::createDefCfa( - nullptr, Instr.Ops[0], Instr.Ops[1])); - break; - case DW_CFA_def_cfa_sf: - Function.addCFI(Offset, MCCFIInstruction::createDefCfa( - nullptr, Instr.Ops[0], - DataAlignment * int64_t(Instr.Ops[1]))); - break; - case DW_CFA_def_cfa_register: - Function.addCFI(Offset, MCCFIInstruction::createDefCfaRegister( - nullptr, Instr.Ops[0])); - break; - case DW_CFA_def_cfa_offset: - Function.addCFI( - Offset, MCCFIInstruction::createDefCfaOffset(nullptr, Instr.Ops[0])); - break; - case DW_CFA_def_cfa_offset_sf: - Function.addCFI(Offset, - MCCFIInstruction::createDefCfaOffset( - nullptr, DataAlignment * int64_t(Instr.Ops[0]))); - break; - case DW_CFA_val_offset_sf: - case 
DW_CFA_val_offset: - llvm_unreachable("DWARF val_offset() unimplemented"); - break; - case DW_CFA_expression: - case DW_CFA_def_cfa_expression: - case DW_CFA_val_expression: - llvm_unreachable("DWARF CFA expressions unimplemented"); - break; - dbgs() << "DW_CFA_val_expression"; - break; - case DW_CFA_MIPS_advance_loc8: - llvm_unreachable("DW_CFA_MIPS_advance_loc unimplemented"); - break; - case DW_CFA_GNU_args_size: - case DW_CFA_GNU_window_save: - case DW_CFA_lo_user: - case DW_CFA_hi_user: - llvm_unreachable("DW_CFA_GNU_* and DW_CFA_*_use unimplemented"); +void CFIReaderWriter::rewriteHeaderFor(StringRef EHFrame, + uint64_t EHFrameAddress, + ArrayRef FailedAddresses) { + DataExtractor Data(EHFrame, + /*IsLittleEndian=*/true, + /*PtrSize=*/4); + uint32_t Offset = 0; + std::map PCToFDE; + + // Scans the EHFrame, parsing start addresses for each function + while (Data.isValidOffset(Offset)) { + uint32_t StartOffset = Offset; + + uint64_t Length = Data.getU32(&Offset); + + if (Length == 0) break; - default: - llvm_unreachable("Unrecognized CFI instruction"); + + uint32_t EndStructureOffset = Offset + static_cast(Length); + uint64_t Id = Data.getUnsigned(&Offset, 4); + if (Id == 0) { + Offset = EndStructureOffset; + continue; + } + + const uint8_t *DataStart = + reinterpret_cast(Data.getData().substr(Offset).data()); + const uint8_t *DataEnd = DataStart; + uint64_t FuncAddress = + readEncodedPointer(DataEnd, DW_EH_PE_sdata4 | DW_EH_PE_pcrel, + EHFrameAddress + Offset - (uintptr_t)DataEnd); + Offset += DataEnd - DataStart; + + auto I = std::lower_bound(FailedAddresses.begin(), FailedAddresses.end(), + FuncAddress); + if (I != FailedAddresses.end() && *I == FuncAddress) { + Offset = EndStructureOffset; + continue; + } + + PCToFDE[FuncAddress] = EHFrameAddress + StartOffset; + Offset = EndStructureOffset; + } + + //Updates the EHFrameHdr + DataExtractor HDRData( + StringRef(FrameHdrContents.data(), FrameHdrContents.size()), + /*IsLittleEndian=*/true, + /*PtrSize=*/4); + 
Offset = 0; + uint8_t Version = HDRData.getU8(&Offset); + assert(Version == 1 && + "Don't know how to handle this version of .eh_frame_hdr"); + + uint8_t EhFrameAddrEncoding = HDRData.getU8(&Offset); + uint8_t FDECntEncoding = HDRData.getU8(&Offset); + uint8_t TableEncoding = HDRData.getU8(&Offset); + const uint8_t *DataStart = reinterpret_cast( + HDRData.getData().substr(Offset).data()); + const uint8_t *DataEnd = DataStart; + // Advance Offset past .eh_frame addr + readEncodedPointer(DataEnd, EhFrameAddrEncoding); + Offset += DataEnd - DataStart; + + DataStart = reinterpret_cast( + HDRData.getData().substr(Offset).data()); + DataEnd = DataStart; + uint64_t FDECount = readEncodedPointer( + DataEnd, FDECntEncoding, FrameHdrAddress + Offset - (uintptr_t)DataEnd, + FrameHdrAddress); + Offset += DataEnd - DataStart; + + assert(FDECount > 0 && "Empty binary search table in .eh_frame_hdr!"); + assert(TableEncoding == (DW_EH_PE_datarel | DW_EH_PE_sdata4) && + "Don't know how to handle other .eh_frame.hdr encoding!"); + + // Offset now points to the binary search table. Update it. 
+ for (uint64_t I = 0; I != FDECount; ++I) { + assert(HDRData.isValidOffset(Offset) && + ".eh_frame_hdr table finished earlier than we expected"); + DataStart = reinterpret_cast( + HDRData.getData().substr(Offset).data()); + DataEnd = DataStart; + uint64_t InitialPC = readEncodedPointer( + DataEnd, TableEncoding, FrameHdrAddress + Offset - (uintptr_t)DataEnd, + FrameHdrAddress); + Offset += DataEnd - DataStart; + + uint64_t FDEPtrOffset = Offset; + DataStart = reinterpret_cast( + HDRData.getData().substr(Offset).data()); + DataEnd = DataStart; + // Advance Offset past FDEPtr + readEncodedPointer(DataEnd, TableEncoding); + Offset += DataEnd - DataStart; + + if (uint64_t NewPtr = PCToFDE[InitialPC]) { + // Patch FDEPtr + int64_t RealOffset = NewPtr - FrameHdrAddress; + + assert(isInt<32>(RealOffset)); + DEBUG(dbgs() << format("CFIReaderWriter: Patching .eh_frame_hdr contents " + "@offset %08x with new FDE ptr %08x\n", + FDEPtrOffset, NewPtr)); + support::ulittle32_t::ref(FrameHdrContents.data() + FDEPtrOffset) = RealOffset; } } } diff --git a/bolt/Exceptions.h b/bolt/Exceptions.h index 191743e77757..e497f70f9a02 100644 --- a/bolt/Exceptions.h +++ b/bolt/Exceptions.h @@ -26,10 +26,13 @@ class BinaryFunction; void readLSDA(ArrayRef LSDAData, BinaryContext &BC); /// \brief Wraps up information to read all CFI instructions and feed them to a -/// BinaryFunction. -class CFIReader { +/// BinaryFunction, as well as rewriting CFI sections. 
+class CFIReaderWriter { public: - explicit CFIReader(const DWARFFrame &EHFrame) : EHFrame(EHFrame) { + explicit CFIReaderWriter(const DWARFFrame &EHFrame, uint64_t FrameHdrAddress, + MutableArrayRef FrameHdrContents) + : EHFrame(EHFrame), FrameHdrAddress(FrameHdrAddress), + FrameHdrContents(FrameHdrContents) { // Prepare FDEs for fast lookup for (const auto &Entry : EHFrame.Entries) { const dwarf::FrameEntry *FE = Entry.get(); @@ -43,8 +46,14 @@ class CFIReader { void fillCFIInfoFor(BinaryFunction &Function) const; + // Include a new EHFrame, updating the .eh_frame_hdr + void rewriteHeaderFor(StringRef EHFrame, uint64_t EHFrameAddress, + ArrayRef FailedAddresses); + private: const DWARFFrame &EHFrame; + uint64_t FrameHdrAddress; + MutableArrayRef FrameHdrContents; FDEsMap FDEs; }; diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 2b8b59ba027c..725a3431aba8 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -343,6 +343,26 @@ static std::unique_ptr CreateBinaryContext( return BC; } +// Helper function to map a random memory address to a file offset. Returns 0 if +// this address cannot be mapped back to the file. 
+static uint64_t discoverFileOffset(ELFObjectFileBase *File, uint64_t MemAddr) { + for (const auto &Section : File->sections()) { + uint64_t SecAddress = Section.getAddress(); + uint64_t Size = Section.getSize(); + if (MemAddr < SecAddress || + SecAddress + Size <= MemAddr) + continue; + + StringRef SectionContents; + check_error(Section.getContents(SectionContents), + "cannot get section contents"); + uint64_t SecFileOffset = SectionContents.data() - File->getData().data(); + uint64_t MemAddrSecOffset = MemAddr - SecAddress; + return SecFileOffset + MemAddrSecOffset; + } + return 0ULL; +} + static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { // FIXME: there should be some way to extract arch and triple information @@ -363,6 +383,13 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { // from constructors etc. BinaryFunction *EntryPointFunction{nullptr}; + struct BlobTy { + uint64_t Addr; + uint64_t FileOffset; + uint64_t Size; + }; + BlobTy ExtraStorage = {0ULL, 0ULL, 0ULL}; + // Populate array of binary functions and file symbols // from file symbol table. // @@ -378,6 +405,17 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { ErrorOr Name = Symbol.getName(); check_error(Name.getError(), "cannot get symbol name"); + if (*Name == "__flo_storage") { + ExtraStorage.Addr = Symbol.getValue(); + ExtraStorage.FileOffset = discoverFileOffset(File, ExtraStorage.Addr); + assert(ExtraStorage.FileOffset != 0 && "Corrupt __flo_storage symbol"); + continue; + } + if (*Name == "__flo_storage_size") { + ExtraStorage.Size = Symbol.getValue(); + continue; + } + if (Symbol.getType() == SymbolRef::ST_File) { // Could be used for local symbol disambiguation. FileSymbolName = *Name; @@ -469,7 +507,8 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { uint64_t LSDAAddress{0}; // Process special sections. 
- uint64_t EHFrameAddress = 0ULL; + uint64_t FrameHdrAddress = 0ULL; + StringRef FrameHdrContents; for (const auto &Section : File->sections()) { StringRef SectionName; check_error(Section.getName(SectionName), "cannot get section name"); @@ -485,18 +524,21 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { LSDAData = SectionData; LSDAAddress = Section.getAddress(); } - if (SectionName == ".eh_frame") { - EHFrameAddress = Section.getAddress(); + if (SectionName == ".eh_frame_hdr") { + FrameHdrAddress = Section.getAddress(); + FrameHdrContents = SectionContents; } } + std::vector FrameHdrCopy(FrameHdrContents.begin(), + FrameHdrContents.end()); // Process debug sections. std::unique_ptr DwCtx(new DWARFContextInMemory(*File)); const DWARFFrame &EHFrame = *DwCtx->getEHFrame(); if (opts::DumpEHFrame) { EHFrame.dump(outs()); } - CFIReader DwCFIReader(EHFrame); + CFIReaderWriter CFIRdWrt(EHFrame, FrameHdrAddress, FrameHdrCopy); if (!EHFrame.ParseError.empty()) { errs() << "FLO-WARNING: EHFrame reader failed with message \"" << EHFrame.ParseError << "\"\n"; @@ -567,7 +609,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { // Fill in CFI information for this function if (EHFrame.ParseError.empty() && Function.isSimple()) - DwCFIReader.fillCFIInfoFor(Function); + CFIRdWrt.fillCFIInfoFor(Function); // Parse LSDA. if (Function.getLSDAAddress() != 0) @@ -753,6 +795,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { }; bool HasEHFrame = false; + bool NoSpaceWarning = false; // Output functions one by one. 
for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; @@ -789,8 +832,33 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { // Emit CFI start if (Function.hasCFI()) { - HasEHFrame = true; - Streamer->EmitCFIStartProc(/*IsSimple=*/false); + if (ExtraStorage.Size != 0ULL) { + HasEHFrame = true; + Streamer->EmitCFIStartProc(/*IsSimple=*/false); + if (Function.getPersonalityFunction() != nullptr) { + Streamer->EmitCFIPersonality(Function.getPersonalityFunction(), + Function.getPersonalityEncoding()); + } + // Emit CFI instructions relative to the CIE + for (auto &CFIInstr : Function.cie()) { + // Ignore these CIE CFI insns because LLVM will already emit this. + switch (CFIInstr.getOperation()) { + default: + break; + case MCCFIInstruction::OpDefCfa: + if (CFIInstr.getRegister() == 7 && CFIInstr.getOffset() == 8) + continue; + break; + case MCCFIInstruction::OpOffset: + if (CFIInstr.getRegister() == 16 && CFIInstr.getOffset() == -8) + continue; + break; + } + emitCFIInstr(CFIInstr); + } + } else { + NoSpaceWarning = true; + } } // Emit code. @@ -808,15 +876,18 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { Streamer->EmitLabel(const_cast(Label)); continue; } - if (!BC->MII->get(Instr.getOpcode()).isPseudo()) + if (!BC->MIA->isCFI(Instr)) { Streamer->EmitInstruction(Instr, *BC->STI); - else - emitCFIInstr(*Function.getCFIFor(Instr)); + continue; + } + if (ExtraStorage.Size == 0) + continue; + emitCFIInstr(*Function.getCFIFor(Instr)); } } // Emit CFI end - if (Function.hasCFI()) + if (Function.hasCFI() && ExtraStorage.Size != 0) Streamer->EmitCFIEndProc(); // TODO: is there any use in emiting end of function? @@ -825,6 +896,10 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { //Streamer->EmitLabel(FunctionEndLabel); //Streamer->emitELFSize(FunctionSymbol, MCExpr()); } + if (NoSpaceWarning) { + errs() << "FLO-WARNING: missing __flo_storage in this binary. 
No " + << "extra space left to allocate the new .eh_frame\n"; + } Streamer->Finish(); @@ -866,6 +941,10 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { std::move(Resolver)); //OLT.takeOwnershipOfBuffers(ObjectsHandle, ); + // Fow now on, keep track of functions we fail to write in the binary. We need + // to avoid rewriting CFI info for these functions. + std::vector FailedAddresses; + // Map every function/section current address in memory to that in // the output binary. for (auto &BFI : BinaryFunctions) { @@ -885,19 +964,30 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { Function.setImageSize(SAI->second.second); } else { errs() << "FLO: cannot remap function " << Function.getName() << "\n"; + FailedAddresses.emplace_back(Function.getAddress()); } } // Map .eh_frame + StringRef NewEhFrameContents; if (HasEHFrame) { - assert(EHFrameAddress); auto SAI = EFMM->SectionAddressInfo.find(".eh_frame"); if (SAI != EFMM->SectionAddressInfo.end()) { DEBUG(dbgs() << "FLO: mapping 0x" << Twine::utohexstr(SAI->second.first) - << " to 0x" << Twine::utohexstr(EHFrameAddress) + << " to 0x" << Twine::utohexstr(ExtraStorage.Addr) << '\n'); OLT.mapSectionAddress(ObjectsHandle, reinterpret_cast(SAI->second.first), - EHFrameAddress); + ExtraStorage.Addr); + NewEhFrameContents = + StringRef(reinterpret_cast(SAI->second.first), + SAI->second.second); + if (ExtraStorage.Size < SAI->second.second) { + errs() << format("FLO fatal error: new .eh_frame requires 0x%x bytes, " + "but __flo_storage in this binary only has 0x%x extra " + "bytes available.", + SAI->second.second, ExtraStorage.Size); + exit(1); + } } else { errs() << "FLO: cannot remap .eh_frame\n"; } @@ -924,6 +1014,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { << ") is larger than maximum allowed size (0x" << Twine::utohexstr(Function.getMaxSize()) << ") for function " << Function.getName() << '\n'; + 
FailedAddresses.emplace_back(Function.getAddress()); continue; } @@ -948,6 +1039,17 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { break; } } + if (NewEhFrameContents.size()) { + std::sort(FailedAddresses.begin(), FailedAddresses.end()); + CFIRdWrt.rewriteHeaderFor(NewEhFrameContents, ExtraStorage.Addr, + FailedAddresses); + outs() << "FLO: rewriting .eh_frame_hdr\n"; + RealOut->os().pwrite(FrameHdrCopy.data(), FrameHdrCopy.size(), + FrameHdrContents.data() - File->getData().data()); + outs() << "FLO: writing a new .eh_frame\n"; + RealOut->os().pwrite(NewEhFrameContents.data(), NewEhFrameContents.size(), + ExtraStorage.FileOffset); + } if (EntryPointFunction) { DEBUG(dbgs() << "FLO: entry point function is " From 1c49282a9779ef34b554b51c2bf95c8ddf10a7a8 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 10 Nov 2015 17:21:42 -0800 Subject: [PATCH 045/904] Ignore LSDA information for now Summary: Teach llvm-flo to drop on function with LSDA information until we know how to update them after block reordering. (cherry picked from commit 24099af0041a3968ab413aa0515961cf50d0c271) --- bolt/llvm-flo.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index 725a3431aba8..fc4e89181fbe 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -608,8 +608,11 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { continue; // Fill in CFI information for this function - if (EHFrame.ParseError.empty() && Function.isSimple()) + if (EHFrame.ParseError.empty() && Function.isSimple()) { CFIRdWrt.fillCFIInfoFor(Function); + if (Function.getLSDAAddress() != 0) + Function.setSimple(false); + } // Parse LSDA. 
if (Function.getLSDAAddress() != 0) From 5fae610bd8be98ced4914a66ac40676abd899d1d Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 2 Nov 2015 09:46:50 -0800 Subject: [PATCH 046/904] Do not bail on functions with indirect calls Summary: Previously, we were marking functions with indirect calls as too complex to be disassembled, but this was unnecessarily conservative. This patch removes this restriction. (cherry picked from commit 1856b3836cd119351c285815ce60be15f959e915) --- bolt/BinaryFunction.cpp | 90 +++++++++++++++++++++++------------------ bolt/llvm-flo.cpp | 9 +++-- 2 files changed, 56 insertions(+), 43 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 5b5636633c63..5713e536ba1b 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -141,15 +141,16 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, if (BC.MIA->isCall(Instruction)) { if (BC.MIA->isTailCall(Instruction)) OS << " # TAILCALL "; - if (Instruction.getNumOperands() > 1) { - OS << " # handler: "; - if (Instruction.getOperand(1).isExpr()) - OS << cast(Instruction.getOperand(1).getExpr())-> - getSymbol(); - else - OS << '0'; - OS << "; action: " << Instruction.getOperand(2).getImm(); - } + // FIXME: Print EH handlers correctly in presence of indirect calls +// if (Instruction.getNumOperands() > 1) { +// OS << " # handler: "; +// if (Instruction.getOperand(1).isExpr()) +// OS << cast(Instruction.getOperand(1).getExpr())-> +// getSymbol(); +// else +// OS << '0'; +// OS << "; action: " << Instruction.getOperand(2).getImm(); +// } } OS << "\n"; // In case we need MCInst printer: @@ -280,6 +281,27 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // basic block. 
Labels[0] = Ctx->createTempSymbol("BB0", false); + auto handleRIPOperand = + [this](MCInst &Instruction, uint64_t Address, uint64_t Size) -> bool { + uint64_t TargetAddress{0}; + MCSymbol *TargetSymbol{nullptr}; + if (!BC.MIA->evaluateRIPOperand(Instruction, Address, Size, + TargetAddress)) { + DEBUG(dbgs() << "FLO: rip-relative operand could not be evaluated:\n"; + BC.InstPrinter->printInst(&Instruction, dbgs(), "", *BC.STI); + dbgs() << '\n'; + Instruction.dump_pretty(dbgs(), BC.InstPrinter.get()); + dbgs() << '\n';); + return false; + } + // FIXME: check that the address is in data, not in code. + TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, "DATAat"); + BC.MIA->replaceRIPOperandDisp( + Instruction, MCOperand::createExpr(MCSymbolRefExpr::create( + TargetSymbol, MCSymbolRefExpr::VK_None, *BC.Ctx))); + return true; + }; + bool IsSimple = true; for (uint64_t Offset = 0; IsSimple && (Offset < getSize()); ) { MCInst Instruction; @@ -302,13 +324,6 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { break; } - if (MIA->isIndirectBranch(Instruction)) { - DEBUG(dbgs() << "FLO: indirect branch seen. Skipping function " - << getName() << ".\n"); - IsSimple = false; - break; - } - uint64_t AbsoluteInstrAddr = getAddress() + Offset; if (MIA->isBranch(Instruction) || MIA->isCall(Instruction)) { uint64_t InstructionTarget = 0; @@ -369,6 +384,12 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { TargetSymbol = BC.getOrCreateGlobalSymbol(InstructionTarget, "FUNCat"); + if (InstructionTarget == 0) { + errs() << "FLO-WARNING: Function \":" << getName() + << "\" has call to address zero. Ignoring it.\n"; + IsSimple = false; + break; + } } } @@ -388,35 +409,26 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { } } else { - // Indirect call - DEBUG(dbgs() << "FLO: indirect call detected (not yet supported)\n"); - IsSimple = false; - break; + // Should be an indirect call or an indirect branch. Bail out on the + // latter case. 
+ if (MIA->isIndirectBranch(Instruction)) { + IsSimple = false; + break; + } + // Indirect call. We only need to fix it if the operand is RIP-relative + if (MIA->hasRIPOperand(Instruction)) { + if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { + IsSimple = false; + break; + } + } } } else { if (MIA->hasRIPOperand(Instruction)) { - uint64_t TargetAddress{0}; - MCSymbol *TargetSymbol{nullptr}; - if (!MIA->evaluateRIPOperand(Instruction, AbsoluteInstrAddr, - Size, TargetAddress)) { - DEBUG( - dbgs() << "FLO: rip-relative operand could not be evaluated:\n"; - BC.InstPrinter->printInst(&Instruction, dbgs(), "", *BC.STI); - dbgs() << '\n'; - Instruction.dump_pretty(dbgs(), BC.InstPrinter.get()); - dbgs() << '\n'; - ); + if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { IsSimple = false; break; } - // FIXME: check that the address is in data, not in code. - TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, "DATAat"); - MIA->replaceRIPOperandDisp( - Instruction, - MCOperand::createExpr( - MCSymbolRefExpr::create(TargetSymbol, - MCSymbolRefExpr::VK_None, - *Ctx))); } } diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index fc4e89181fbe..ac29f527815f 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -703,10 +703,11 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { } // Post-processing passes. 
- Function.updateEHRanges(); - if (opts::PrintAll || opts::PrintEHRanges) { - Function.print(errs(), "after updating EH ranges"); - } + // FIXME: Check EH handlers correctly in presence of indirect calls + // Function.updateEHRanges(); + // if (opts::PrintAll || opts::PrintEHRanges) { + // Function.print(errs(), "after updating EH ranges"); + // } // After optimizations, fix the CFI state if (!Function.fixCFIState()) From 64ba4aab6a367097e8f2490a43ef318b71726a74 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 13 Nov 2015 15:27:59 -0800 Subject: [PATCH 047/904] Make llvm-flo print dynamic coverage of rewritten functions Summary: This is an attempt at determining the hotness of functions we are rewriting and help detect if we are discarding hot functions. This patch introduces logic to estimate the number of instructions executed in each function by using the profile data for branches. It sums the products of BB frequency and size. Since we can only do this for functions we have successfully disassembled, created the CFG and annotated with profiling data, all complex functions that were not disassembled are left out from this analysis. 
(cherry picked from commit 6a1b8a9379a209277c23290ae821f2c72ef87569) --- bolt/BinaryFunction.cpp | 12 ++++++++++++ bolt/BinaryFunction.h | 4 ++++ bolt/llvm-flo.cpp | 12 ++++++++++++ 3 files changed, 28 insertions(+) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 5713e536ba1b..dec5c30bb838 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -744,6 +744,18 @@ void BinaryFunction::inferFallThroughCounts() { return; } +uint64_t BinaryFunction::getFunctionScore() { + uint64_t TotalScore = 0ULL; + for (auto BB : layout()) { + uint64_t BBExecCount = BB->getExecutionCount(); + if (BBExecCount == BinaryBasicBlock::COUNT_NO_PROFILE) + continue; + BBExecCount *= BB->Instructions.size(); + TotalScore += BBExecCount; + } + return TotalScore; +} + void BinaryFunction::annotateCFIState() { assert(!BasicBlocks.empty() && "basic block list should not be empty"); diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 2ac217879513..a50750945560 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -579,6 +579,10 @@ class BinaryFunction { /// has been filled with LBR data. void inferFallThroughCounts(); + /// Computes a function hotness score: the sum of the products of BB frequency + /// and size. + uint64_t getFunctionScore(); + /// Annotate each basic block entry with its current CFI state. This is used /// to detect when reordering changes the CFI state seen by a basic block and /// fix this. diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index ac29f527815f..c3aeee90d75a 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -545,6 +545,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { } // Disassemble every function and build it's control flow graph. 
+ uint64_t TotalScore = 0; for (auto &BFI : BinaryFunctions) { BinaryFunction &Function = BFI.second; @@ -624,6 +625,8 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { if (opts::PrintAll || opts::PrintCFG) Function.print(errs(), "after building cfg"); + TotalScore += Function.getFunctionScore(); + } // Iterate over all functions // Run optimization passes. @@ -1006,6 +1009,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { // Overwrite function in the output file. uint64_t CountOverwrittenFunctions = 0; + uint64_t OverwrittenScore = 0; for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; @@ -1022,6 +1026,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { continue; } + OverwrittenScore += Function.getFunctionScore(); // Overwrite function in the output file. outs() << "FLO: rewriting function \"" << Function.getName() << "\"\n"; RealOut->os().pwrite( @@ -1065,6 +1070,13 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { outs() << "FLO: " << CountOverwrittenFunctions << " out of " << BinaryFunctions.size() << " functions were overwritten.\n"; + + if (TotalScore != 0) { + double Coverage = OverwrittenScore / (double)TotalScore * 100.0; + outs() << format("FLO: Rewritten functions cover %.2lf", Coverage) + << "% of the execution count of simple functions of this binary.\n"; + } + // TODO: we should find a way to mark the binary as optimized by us. Out->keep(); From 7119bd5136b8a8a85c811fe24e1eeae14a019c21 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Thu, 19 Nov 2015 17:59:41 -0800 Subject: [PATCH 048/904] Teach llvm-flo how to split functions into hot and cold regions Summary: After basic block reordering, it may be possible that the reordered function is now larger than the original because of the following reasons: - jump offsets may change, forcing some jump instructions to use 4-byte immediate operand instead of the 1-byte, shorter version. 
- fall-throughs change, forcing us to emit an extra jump instruction to jump to the original fall-through at the end of a basic block. Since we currently do not change function addresses, we need to rewrite the function back in the binary in the original location. If it doesn't fit, we were dropping the function. This patch adds a flag -split-functions that tells llvm-flo to split hot functions into hot and cold separate regions. The hot region is written back in the original function location, while the cold region is written in a separate, far-away region reserved to flo via a linker script. This patch also adds the logic to create and extra FDE to supply unwinding information to the cold part of the function. Owing to this, we now need to rewrite .eh_frame_hdr to another location and patch the EH_FRAME ELF segment to point to this new .eh_frame_hdr. (cherry picked from commit 88ea010cacd331c50b27d4a33309c120c3f1d406) --- bolt/BinaryBasicBlock.h | 8 + bolt/BinaryFunction.cpp | 83 ++++++-- bolt/BinaryFunction.h | 50 ++++- bolt/Exceptions.cpp | 103 +++++++-- bolt/Exceptions.h | 5 +- bolt/llvm-flo.cpp | 454 ++++++++++++++++++++++++++-------------- 6 files changed, 514 insertions(+), 189 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index f9c3b9f16bf6..88d87a937f58 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -51,6 +51,10 @@ class BinaryBasicBlock { /// Number of times this basic block was executed. uint64_t ExecutionCount{COUNT_NO_PROFILE}; + /// In cases where the parent function has been split, IsCold == true means + /// this BB will be allocated outside its parent function. + bool IsCold{false}; + /// Vector of all instructions in the block. 
std::vector Instructions; @@ -229,6 +233,10 @@ class BinaryBasicBlock { return ExecutionCount; } + bool isCold() const { + return IsCold; + } + bool eraseInstruction(MCInst *Inst) { auto I = Instructions.end(); auto B = Instructions.begin(); diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index dec5c30bb838..ea119a220562 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -76,6 +76,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, << "\n Section : " << SectionName << "\n Orc Section : " << getCodeSectionName() << "\n IsSimple : " << IsSimple + << "\n IsSplit : " << IsSplit << "\n BB Count : " << BasicBlocksLayout.size(); if (FrameInstructions.size()) { OS << "\n CFI Instrs : " << FrameInstructions.size(); @@ -171,7 +172,12 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, } } - for (auto BB : BasicBlocksLayout) { + for (uint32_t I = 0, E = BasicBlocksLayout.size(); I != E; ++I) { + auto BB = BasicBlocksLayout[I]; + if (I != 0 && + BB->IsCold != BasicBlocksLayout[I - 1]->IsCold) + OS << "------- HOT-COLD SPLIT POINT -------\n\n"; + OS << BB->getName() << " (" << BB->Instructions.size() << " instructions, align : " << BB->getAlignment() << ")\n"; @@ -745,15 +751,19 @@ void BinaryFunction::inferFallThroughCounts() { } uint64_t BinaryFunction::getFunctionScore() { + if (FunctionScore != -1) + return FunctionScore; + uint64_t TotalScore = 0ULL; for (auto BB : layout()) { uint64_t BBExecCount = BB->getExecutionCount(); if (BBExecCount == BinaryBasicBlock::COUNT_NO_PROFILE) continue; - BBExecCount *= BB->Instructions.size(); + BBExecCount *= (BB->Instructions.size() - BB->getNumPseudos()); TotalScore += BBExecCount; } - return TotalScore; + FunctionScore = TotalScore; + return FunctionScore; } void BinaryFunction::annotateCFIState() { @@ -831,9 +841,16 @@ bool BinaryFunction::fixCFIState() { uint32_t State = 0; BinaryBasicBlock *EntryBB = *BasicBlocksLayout.begin(); - for (BinaryBasicBlock *BB : 
BasicBlocksLayout) { + for (uint32_t I = 0, E = BasicBlocksLayout.size(); I != E; ++I) { + BinaryBasicBlock *BB = BasicBlocksLayout[I]; uint32_t BBIndex = BB - &*BasicBlocks.begin(); + // Hot-cold border: check if this is the first BB to be allocated in a cold + // region (a different function). If yes, we need to reset the CFI state. + if (I != 0 && + BB->IsCold != BasicBlocksLayout[I - 1]->IsCold) + State = 0; + // Check if state is what this BB expect it to be at its entry point if (BBCFIState[BBIndex] != State) { // Need to recover the correct state @@ -904,7 +921,7 @@ bool BinaryFunction::fixCFIState() { return true; } -void BinaryFunction::optimizeLayout(HeuristicPriority Priority) { +void BinaryFunction::optimizeLayout(HeuristicPriority Priority, bool Split) { // Bail if no profiling information or if empty if (getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE || BasicBlocksLayout.empty()) { @@ -913,7 +930,7 @@ void BinaryFunction::optimizeLayout(HeuristicPriority Priority) { // Work on optimal solution if problem is small enough if (BasicBlocksLayout.size() <= FUNC_SIZE_THRESHOLD) - return solveOptimalLayout(); + return solveOptimalLayout(Split); DEBUG(dbgs() << "running block layout heuristics on " << getName() << "\n"); @@ -1131,10 +1148,12 @@ void BinaryFunction::optimizeLayout(HeuristicPriority Priority) { Cluster.end()); } + if (Split) + splitFunction(); fixBranches(); } -void BinaryFunction::solveOptimalLayout() { +void BinaryFunction::solveOptimalLayout(bool Split) { std::vector> Weight; std::map BBToIndex; std::vector IndexToBB; @@ -1234,6 +1253,8 @@ void BinaryFunction::solveOptimalLayout() { BasicBlocksLayout.push_back(BB); } + if (Split) + splitFunction(); fixBranches(); } @@ -1264,8 +1285,12 @@ void BinaryFunction::fixBranches() { // Check if the original fall-through for this block has been moved const MCSymbol *FT = nullptr; - if (I + 1 != BasicBlocksLayout.size()) + bool HotColdBorder = false; + if (I + 1 != BasicBlocksLayout.size()) { 
FT = BasicBlocksLayout[I + 1]->getLabel(); + if (BB->IsCold != BasicBlocksLayout[I + 1]->IsCold) + HotColdBorder = true; + } const BinaryBasicBlock *OldFTBB = getOriginalLayoutSuccessor(BB); const MCSymbol *OldFT = nullptr; if (OldFTBB != nullptr) @@ -1284,9 +1309,10 @@ void BinaryFunction::fixBranches() { if (MIA->isReturn(*LastInstIter)) continue; } - // Case 1b: Layout has changed and the fallthrough is not the same. Need - // to add a new unconditional branch to jump to the old fallthrough. - if (FT != OldFT && OldFT != nullptr) { + // Case 1b: Layout has changed and the fallthrough is not the same (or the + // fallthrough got moved to a cold region). Need to add a new + // unconditional branch to jump to the old fallthrough. + if ((FT != OldFT || HotColdBorder) && OldFT != nullptr) { MCInst NewInst; if (!MIA->createUncondBranch(NewInst, OldFT, BC.Ctx.get())) llvm_unreachable("Target does not support creating new branches"); @@ -1299,7 +1325,7 @@ void BinaryFunction::fixBranches() { // Case 2: There is a single jump, unconditional, in this basic block if (CondBranch == nullptr) { // Case 2a: It jumps to the new fall-through, so we can delete it - if (TBB == FT) { + if (TBB == FT && !HotColdBorder) { BB->eraseInstruction(UncondBranch); } // Case 2b: If 2a doesn't happen, there is nothing we can do @@ -1310,7 +1336,7 @@ void BinaryFunction::fixBranches() { if (UncondBranch == nullptr) { // Case 3a: If the taken branch goes to the next block in the new layout, // invert this conditional branch logic so we can make this a fallthrough. 
- if (TBB == FT) { + if (TBB == FT && !HotColdBorder) { assert(OldFT != nullptr && "malformed CFG"); if (!MIA->reverseBranchCondition(*CondBranch, OldFT, BC.Ctx.get())) llvm_unreachable("Target does not support reversing branches"); @@ -1318,7 +1344,7 @@ void BinaryFunction::fixBranches() { } // Case 3b: Need to add a new unconditional branch because layout // has changed - if (FT != OldFT && OldFT != nullptr) { + if ((FT != OldFT || HotColdBorder) && OldFT != nullptr) { MCInst NewInst; if (!MIA->createUncondBranch(NewInst, OldFT, BC.Ctx.get())) llvm_unreachable("Target does not support creating new branches"); @@ -1333,14 +1359,14 @@ void BinaryFunction::fixBranches() { // by another unconditional. // Case 4a: If the unconditional jump target is the new fall through, // delete it. - if (FBB == FT) { + if (FBB == FT && !HotColdBorder) { BB->eraseInstruction(UncondBranch); continue; } // Case 4b: If the taken branch goes to the next block in the new layout, // invert this conditional branch logic so we can make this a fallthrough. // Now we don't need the unconditional jump anymore, so we also delete it. 
- if (TBB == FT) { + if (TBB == FT && !HotColdBorder) { if (!MIA->reverseBranchCondition(*CondBranch, FBB, BC.Ctx.get())) llvm_unreachable("Target does not support reversing branches"); BB->eraseInstruction(UncondBranch); @@ -1350,5 +1376,30 @@ void BinaryFunction::fixBranches() { } } +void BinaryFunction::splitFunction() { + bool AllCold = true; + for (BinaryBasicBlock *BB : BasicBlocksLayout) { + auto ExecCount = BB->getExecutionCount(); + if (ExecCount == BinaryBasicBlock::COUNT_NO_PROFILE) + return; + if (ExecCount != 0) + AllCold = false; + } + + if (AllCold) + return; + + assert(BasicBlocksLayout.size() > 0); + // Separate hot from cold + for (auto I = BasicBlocksLayout.rbegin(), E = BasicBlocksLayout.rend(); + I != E; ++I) { + BinaryBasicBlock *BB = *I; + if (BB->getExecutionCount() != 0) + break; + BB->IsCold = true; + IsSplit = true; + } +} + } // namespace flo } // namespace llvm diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index a50750945560..7f996bf51c49 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -95,6 +95,7 @@ class BinaryFunction { /// Offset in the file. uint64_t FileOffset{0}; + uint64_t ColdFileOffset{0}; /// Maximum size this function is allowed to have. uint64_t MaxSize{std::numeric_limits::max()}; @@ -113,9 +114,11 @@ class BinaryFunction { /// The address for the code for this function in codegen memory. uint64_t ImageAddress{0}; + uint64_t ColdImageAddress{0}; /// The size of the code in memory. uint64_t ImageSize{0}; + uint64_t ColdImageSize{0}; /// Name for the section this function code should reside in. std::string CodeSectionName; @@ -123,6 +126,10 @@ class BinaryFunction { /// The profile data for the number of times the function was executed. uint64_t ExecutionCount{COUNT_NO_PROFILE}; + /// Score of the function (estimated number of instructions executed, + /// according to profile data). -1 if the score has not been calculated yet. 
+ int64_t FunctionScore{-1}; + /// Binary blob representing action, type, and type index tables for this + /// function's LSDA (exception handling). ArrayRef LSDATables; @@ -133,6 +140,10 @@ class BinaryFunction { /// Landing pads for the function. std::set LandingPads; + /// True if this function needs to be emitted in two separate parts, one for + /// the hot basic blocks and another for the cold basic blocks. + bool IsSplit{false}; + /// Release storage used by instructions. BinaryFunction &clearInstructions() { InstrMapType TempMap; @@ -290,7 +301,7 @@ class BinaryFunction { /// Perform optimal code layout based on edge frequencies making necessary /// adjustments to instructions at the end of basic blocks. - void optimizeLayout(HeuristicPriority Priority); + void optimizeLayout(HeuristicPriority Priority, bool Split); /// Dynamic programming implementation for the TSP, applied to BB layout. Find /// the optimal way to maximize weight during a path traversing all BBs. In /// /// Uses exponential amount of memory on the number of basic blocks and should /// only be used for small functions. - void solveOptimalLayout(); + void solveOptimalLayout(bool Split); /// View CFG in graphviz program void viewGraph(); @@ -330,6 +341,10 @@ class BinaryFunction { return FileOffset; } + uint64_t getColdFileOffset() const { + return ColdFileOffset; + } + /// Return (original) size of the function. 
uint64_t getSize() const { return Size; @@ -351,6 +366,10 @@ class BinaryFunction { return IsSimple; } + bool isSplit() const { + return IsSplit; + } + MCSymbol *getPersonalityFunction() const { return PersonalityFunction; } @@ -473,6 +492,11 @@ class BinaryFunction { return *this; } + BinaryFunction &setColdFileOffset(uint64_t Offset) { + ColdFileOffset = Offset; + return *this; + } + BinaryFunction &setMaxSize(uint64_t Size) { MaxSize = Size; return *this; @@ -507,21 +531,39 @@ class BinaryFunction { return *this; } + BinaryFunction &setColdImageAddress(uint64_t Address) { + ColdImageAddress = Address; + return *this; + } + /// Return the address of this function' image in memory. uint64_t getImageAddress() const { return ImageAddress; } + uint64_t getColdImageAddress() const { + return ColdImageAddress; + } + BinaryFunction &setImageSize(uint64_t Size) { ImageSize = Size; return *this; } + BinaryFunction &setColdImageSize(uint64_t Size) { + ColdImageSize = Size; + return *this; + } + /// Return the size of this function' image in memory. uint64_t getImageSize() const { return ImageSize; } + uint64_t getColdImageSize() const { + return ColdImageSize; + } + /// Set the profile data for the number of times the function was called. BinaryFunction &setExecutionCount(uint64_t Count) { ExecutionCount = Count; @@ -598,6 +640,10 @@ class BinaryFunction { /// adding jumps based on a new layout order. void fixBranches(); + /// Split function in two: a part with warm or hot BBs and a part with never + /// executed BBs. The cold part is moved to a new BinaryFunction. + void splitFunction(); + /// Process LSDA information for the function. 
void parseLSDA(ArrayRef LSDAData, uint64_t LSDAAddress); diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 022d8c5ca108..41033dd48f4a 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -683,7 +683,8 @@ void CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { } void CFIReaderWriter::rewriteHeaderFor(StringRef EHFrame, - uint64_t EHFrameAddress, + uint64_t NewEHFrameAddress, + uint64_t NewFrameHdrAddress, ArrayRef FailedAddresses) { DataExtractor Data(EHFrame, /*IsLittleEndian=*/true, @@ -691,6 +692,11 @@ void CFIReaderWriter::rewriteHeaderFor(StringRef EHFrame, uint32_t Offset = 0; std::map PCToFDE; + DEBUG(dbgs() << format( + "CFIReaderWriter: Starting to patch .eh_frame_hdr.\n" + "New .eh_frame address = %08x\nNew .eh_frame_hdr address = %08x\n", + NewEHFrameAddress, NewFrameHdrAddress)); + // Scans the EHFrame, parsing start addresses for each function while (Data.isValidOffset(Offset)) { uint32_t StartOffset = Offset; @@ -712,7 +718,7 @@ void CFIReaderWriter::rewriteHeaderFor(StringRef EHFrame, const uint8_t *DataEnd = DataStart; uint64_t FuncAddress = readEncodedPointer(DataEnd, DW_EH_PE_sdata4 | DW_EH_PE_pcrel, - EHFrameAddress + Offset - (uintptr_t)DataEnd); + NewEHFrameAddress + Offset - (uintptr_t)DataEnd); Offset += DataEnd - DataStart; auto I = std::lower_bound(FailedAddresses.begin(), FailedAddresses.end(), @@ -722,7 +728,7 @@ void CFIReaderWriter::rewriteHeaderFor(StringRef EHFrame, continue; } - PCToFDE[FuncAddress] = EHFrameAddress + StartOffset; + PCToFDE[FuncAddress] = NewEHFrameAddress + StartOffset; Offset = EndStructureOffset; } @@ -742,32 +748,53 @@ void CFIReaderWriter::rewriteHeaderFor(StringRef EHFrame, const uint8_t *DataStart = reinterpret_cast( HDRData.getData().substr(Offset).data()); const uint8_t *DataEnd = DataStart; - // Advance Offset past .eh_frame addr - readEncodedPointer(DataEnd, EhFrameAddrEncoding); + + uint64_t EHFrameAddrOffset = Offset; + uint64_t EHFrameAddress = readEncodedPointer( + 
DataEnd, EhFrameAddrEncoding, + FrameHdrAddress + Offset - (uintptr_t)DataEnd, FrameHdrAddress); Offset += DataEnd - DataStart; DataStart = reinterpret_cast( HDRData.getData().substr(Offset).data()); DataEnd = DataStart; + uint64_t FDECountOffset = Offset; uint64_t FDECount = readEncodedPointer( DataEnd, FDECntEncoding, FrameHdrAddress + Offset - (uintptr_t)DataEnd, FrameHdrAddress); Offset += DataEnd - DataStart; assert(FDECount > 0 && "Empty binary search table in .eh_frame_hdr!"); + assert(EhFrameAddrEncoding == (DW_EH_PE_pcrel | DW_EH_PE_sdata4) && + "Don't know how to handle other .eh_frame address encoding!"); + assert(FDECntEncoding == DW_EH_PE_udata4 && + "Don't know how to thandle other .eh_frame_hdr encoding!"); assert(TableEncoding == (DW_EH_PE_datarel | DW_EH_PE_sdata4) && - "Don't know how to handle other .eh_frame.hdr encoding!"); + "Don't know how to handle other .eh_frame_hdr encoding!"); + + // Update .eh_frame address + // Write address using signed 4-byte pc-relative encoding + DEBUG(dbgs() << format("CFIReaderWriter: Patching .eh_frame_hdr contents " + "(.eh_frame pointer) with %08x\n", + EHFrameAddress)); + int64_t RealOffset = EHFrameAddress - EHFrameAddrOffset - NewFrameHdrAddress; + assert(isInt<32>(RealOffset)); + support::ulittle32_t::ref(FrameHdrContents.data() + EHFrameAddrOffset) = + RealOffset; // Offset now points to the binary search table. Update it. 
+ uint64_t LastPC = 0; for (uint64_t I = 0; I != FDECount; ++I) { assert(HDRData.isValidOffset(Offset) && ".eh_frame_hdr table finished earlier than we expected"); DataStart = reinterpret_cast( HDRData.getData().substr(Offset).data()); DataEnd = DataStart; + uint64_t InitialPCOffset = Offset; uint64_t InitialPC = readEncodedPointer( DataEnd, TableEncoding, FrameHdrAddress + Offset - (uintptr_t)DataEnd, FrameHdrAddress); + LastPC = InitialPC; Offset += DataEnd - DataStart; uint64_t FDEPtrOffset = Offset; @@ -775,19 +802,61 @@ void CFIReaderWriter::rewriteHeaderFor(StringRef EHFrame, HDRData.getData().substr(Offset).data()); DataEnd = DataStart; // Advance Offset past FDEPtr - readEncodedPointer(DataEnd, TableEncoding); + uint64_t FDEPtr = readEncodedPointer( + DataEnd, TableEncoding, FrameHdrAddress + Offset - (uintptr_t)DataEnd, + FrameHdrAddress); Offset += DataEnd - DataStart; - if (uint64_t NewPtr = PCToFDE[InitialPC]) { - // Patch FDEPtr - int64_t RealOffset = NewPtr - FrameHdrAddress; - - assert(isInt<32>(RealOffset)); - DEBUG(dbgs() << format("CFIReaderWriter: Patching .eh_frame_hdr contents " - "@offset %08x with new FDE ptr %08x\n", - FDEPtrOffset, NewPtr)); - support::ulittle32_t::ref(FrameHdrContents.data() + FDEPtrOffset) = RealOffset; - } + // Update InitialPC according to new eh_frame_hdr address + // Write using signed 4-byte "data relative" (relative to .eh_frame_addr) + // encoding + int64_t RealOffset = InitialPC - NewFrameHdrAddress; + assert(isInt<32>(RealOffset)); + support::ulittle32_t::ref(FrameHdrContents.data() + InitialPCOffset) = + RealOffset; + + if (uint64_t NewPtr = PCToFDE[InitialPC]) + RealOffset = NewPtr - NewFrameHdrAddress; + else + RealOffset = FDEPtr - NewFrameHdrAddress; + + assert(isInt<32>(RealOffset)); + DEBUG(dbgs() << format("CFIReaderWriter: Patching .eh_frame_hdr contents " + "@offset %08x with new FDE ptr %08x\n", + FDEPtrOffset, RealOffset + NewFrameHdrAddress)); + support::ulittle32_t::ref(FrameHdrContents.data() + 
FDEPtrOffset) = RealOffset; + } + // Add new entries (for cold function parts) + uint64_t ExtraEntries = 0; + for (auto I = PCToFDE.upper_bound(LastPC), E = PCToFDE.end(); I != E; ++I) { + ++ExtraEntries; + } + if (ExtraEntries == 0) + return; + FrameHdrContents.resize(FrameHdrContents.size() + (ExtraEntries * 8)); + // Update FDE count + DEBUG(dbgs() << "CFIReaderWriter: Updating .eh_frame_hdr FDE count from " + << FDECount << " to " << (FDECount + ExtraEntries) << "\n"); + support::ulittle32_t::ref(FrameHdrContents.data() + FDECountOffset) = + FDECount + ExtraEntries; + + for (auto I = PCToFDE.upper_bound(LastPC), E = PCToFDE.end(); I != E; ++I) { + // Write PC + DEBUG(dbgs() << format("CFIReaderWriter: Writing extra FDE entry for PC " + "0x%x, FDE pointer 0x%x\n", + I->first, I->second)); + uint64_t InitialPC = I->first; + int64_t RealOffset = InitialPC - NewFrameHdrAddress; + assert(isInt<32>(RealOffset)); + support::ulittle32_t::ref(FrameHdrContents.data() + Offset) = RealOffset; + Offset += 4; + + // Write FDE pointer + uint64_t FDEPtr = I->second; + RealOffset = FDEPtr - NewFrameHdrAddress; + assert(isInt<32>(RealOffset)); + support::ulittle32_t::ref(FrameHdrContents.data() + Offset) = RealOffset; + Offset += 4; } } diff --git a/bolt/Exceptions.h b/bolt/Exceptions.h index e497f70f9a02..9fd3212eb99f 100644 --- a/bolt/Exceptions.h +++ b/bolt/Exceptions.h @@ -30,7 +30,7 @@ void readLSDA(ArrayRef LSDAData, BinaryContext &BC); class CFIReaderWriter { public: explicit CFIReaderWriter(const DWARFFrame &EHFrame, uint64_t FrameHdrAddress, - MutableArrayRef FrameHdrContents) + std::vector &FrameHdrContents) : EHFrame(EHFrame), FrameHdrAddress(FrameHdrAddress), FrameHdrContents(FrameHdrContents) { // Prepare FDEs for fast lookup @@ -48,12 +48,13 @@ class CFIReaderWriter { // Include a new EHFrame, updating the .eh_frame_hdr void rewriteHeaderFor(StringRef EHFrame, uint64_t EHFrameAddress, + uint64_t NewFrameHdrAddress, ArrayRef FailedAddresses); private: const 
DWARFFrame &EHFrame; uint64_t FrameHdrAddress; - MutableArrayRef FrameHdrContents; + std::vector &FrameHdrContents; FDEsMap FDEs; }; diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index c3aeee90d75a..e82be39c9dc6 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -96,6 +96,11 @@ EliminateUnreachable("eliminate-unreachable", cl::desc("eliminate unreachable code"), cl::Optional); +static cl::opt +SplitFunctions("split-functions", + cl::desc("split functions into hot and cold distinct regions"), + cl::Optional); + static cl::opt ReorderBlocks( "reorder-blocks", cl::desc("redo basic block layout based on profiling data with a specific " @@ -363,7 +368,180 @@ static uint64_t discoverFileOffset(ELFObjectFileBase *File, uint64_t MemAddr) { return 0ULL; } -static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { +// Helper function to emit the contents of a function via a MCStreamer object. +static void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, + BinaryContext &BC, bool EmitColdPart, + bool HasExtraStorage) { + // Define a helper to decode and emit CFI instructions at a given point in a + // BB + auto emitCFIInstr = [&Streamer](MCCFIInstruction &CFIInstr) { + switch (CFIInstr.getOperation()) { + default: + llvm_unreachable("Unexpected instruction"); + case MCCFIInstruction::OpDefCfaOffset: + Streamer.EmitCFIDefCfaOffset(CFIInstr.getOffset()); + break; + case MCCFIInstruction::OpAdjustCfaOffset: + Streamer.EmitCFIAdjustCfaOffset(CFIInstr.getOffset()); + break; + case MCCFIInstruction::OpDefCfa: + Streamer.EmitCFIDefCfa(CFIInstr.getRegister(), CFIInstr.getOffset()); + break; + case MCCFIInstruction::OpDefCfaRegister: + Streamer.EmitCFIDefCfaRegister(CFIInstr.getRegister()); + break; + case MCCFIInstruction::OpOffset: + Streamer.EmitCFIOffset(CFIInstr.getRegister(), CFIInstr.getOffset()); + break; + case MCCFIInstruction::OpRegister: + Streamer.EmitCFIRegister(CFIInstr.getRegister(), + CFIInstr.getRegister2()); + break; + case 
MCCFIInstruction::OpRelOffset: + Streamer.EmitCFIRelOffset(CFIInstr.getRegister(), CFIInstr.getOffset()); + break; + case MCCFIInstruction::OpUndefined: + Streamer.EmitCFIUndefined(CFIInstr.getRegister()); + break; + case MCCFIInstruction::OpRememberState: + Streamer.EmitCFIRememberState(); + break; + case MCCFIInstruction::OpRestoreState: + Streamer.EmitCFIRestoreState(); + break; + case MCCFIInstruction::OpRestore: + Streamer.EmitCFIRestore(CFIInstr.getRegister()); + break; + case MCCFIInstruction::OpSameValue: + Streamer.EmitCFISameValue(CFIInstr.getRegister()); + break; + } + }; + + // No need for human readability? + // FIXME: what difference does it make in reality? + // Ctx.setUseNamesOnTempLabels(false); + + // Emit function start + + // Each function is emitted into its own section. + MCSectionELF *FunctionSection = + EmitColdPart + ? BC.Ctx->getELFSection( + Function.getCodeSectionName().str().append(".cold"), + ELF::SHT_PROGBITS, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC) + : BC.Ctx->getELFSection(Function.getCodeSectionName(), + ELF::SHT_PROGBITS, + ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); + + MCSection *Section = FunctionSection; + Streamer.SwitchSection(Section); + + Streamer.EmitCodeAlignment(Function.getAlignment()); + + if (!EmitColdPart) { + MCSymbol *FunctionSymbol = BC.Ctx->getOrCreateSymbol(Function.getName()); + Streamer.EmitSymbolAttribute(FunctionSymbol, MCSA_ELF_TypeFunction); + Streamer.EmitLabel(FunctionSymbol); + } else { + MCSymbol *FunctionSymbol = + BC.Ctx->getOrCreateSymbol(Twine(Function.getName()).concat(".cold")); + Streamer.EmitSymbolAttribute(FunctionSymbol, MCSA_ELF_TypeFunction); + Streamer.EmitLabel(FunctionSymbol); + } + + // Emit CFI start + if (Function.hasCFI() && HasExtraStorage) { + Streamer.EmitCFIStartProc(/*IsSimple=*/false); + if (Function.getPersonalityFunction() != nullptr) { + Streamer.EmitCFIPersonality(Function.getPersonalityFunction(), + Function.getPersonalityEncoding()); + } + // Emit CFI instructions relative to the 
CIE + for (auto &CFIInstr : Function.cie()) { + // Ignore these CIE CFI insns because LLVM will already emit this. + switch (CFIInstr.getOperation()) { + default: + break; + case MCCFIInstruction::OpDefCfa: + if (CFIInstr.getRegister() == 7 && CFIInstr.getOffset() == 8) + continue; + break; + case MCCFIInstruction::OpOffset: + if (CFIInstr.getRegister() == 16 && CFIInstr.getOffset() == -8) + continue; + break; + } + emitCFIInstr(CFIInstr); + } + } + + // Emit code. + for (auto BB : Function.layout()) { + if (EmitColdPart != BB->isCold()) + continue; + if (BB->getAlignment() > 1) + Streamer.EmitCodeAlignment(BB->getAlignment()); + Streamer.EmitLabel(BB->getLabel()); + for (const auto &Instr : *BB) { + // Handle pseudo instructions. + if (BC.MIA->isEHLabel(Instr)) { + assert(Instr.getNumOperands() == 1 && Instr.getOperand(0).isExpr() && + "bad EH_LABEL instruction"); + auto Label = &(cast(Instr.getOperand(0).getExpr()) + ->getSymbol()); + Streamer.EmitLabel(const_cast(Label)); + continue; + } + if (!BC.MIA->isCFI(Instr)) { + Streamer.EmitInstruction(Instr, *BC.STI); + continue; + } + if (HasExtraStorage) + emitCFIInstr(*Function.getCFIFor(Instr)); + } + } + + // Emit CFI end + if (Function.hasCFI() && HasExtraStorage) + Streamer.EmitCFIEndProc(); + + // TODO: is there any use in emiting end of function? + // Perhaps once we have a support for C++ exceptions. 
+ // auto FunctionEndLabel = Ctx.createTempSymbol("func_end"); + // Streamer.EmitLabel(FunctionEndLabel); + // Streamer.emitELFSize(FunctionSymbol, MCExpr()); +} + +// Helper to locate EH_FRAME_HDR segment, specialized for 64-bit LE ELF +static bool patchEhFrameHdrSegment(const ELFFile *Obj, + raw_pwrite_stream *OS, uint64_t Offset, + uint64_t Addr, uint64_t Size) { + for (const auto &Phdr : Obj->program_headers()) { + if (Phdr.p_type != ELF::PT_GNU_EH_FRAME) + continue; + uint64_t OffsetLoc = (uintptr_t)&Phdr.p_offset - (uintptr_t)Obj->base(); + uint64_t VAddrLoc = (uintptr_t)&Phdr.p_vaddr - (uintptr_t)Obj->base(); + uint64_t PAddrLoc = (uintptr_t)&Phdr.p_paddr - (uintptr_t)Obj->base(); + uint64_t FileSzLoc = (uintptr_t)&Phdr.p_filesz - (uintptr_t)Obj->base(); + uint64_t MemSzLoc = (uintptr_t)&Phdr.p_memsz - (uintptr_t)Obj->base(); + char Buffer[8]; + // Update Offset + support::ulittle64_t::ref(Buffer + 0) = Offset; + OS->pwrite(Buffer, 8, OffsetLoc); + support::ulittle64_t::ref(Buffer + 0) = Addr; + OS->pwrite(Buffer, 8, VAddrLoc); + OS->pwrite(Buffer, 8, PAddrLoc); + support::ulittle64_t::ref(Buffer + 0) = Size; + OS->pwrite(Buffer, 8, FileSzLoc); + OS->pwrite(Buffer, 8, MemSzLoc); + return true; + } + return false; +} + +template static +void OptimizeFile(ELFObjectFile *File, const DataReader &DR) { // FIXME: there should be some way to extract arch and triple information // from the file. @@ -387,8 +565,10 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { uint64_t Addr; uint64_t FileOffset; uint64_t Size; + uint64_t AddrEnd; + uint64_t BumpPtr; }; - BlobTy ExtraStorage = {0ULL, 0ULL, 0ULL}; + BlobTy ExtraStorage = {0ULL, 0ULL, 0ULL, 0ULL, 0ULL}; // Populate array of binary functions and file symbols // from file symbol table. 
@@ -407,12 +587,15 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { if (*Name == "__flo_storage") { ExtraStorage.Addr = Symbol.getValue(); + ExtraStorage.BumpPtr = ExtraStorage.Addr; ExtraStorage.FileOffset = discoverFileOffset(File, ExtraStorage.Addr); assert(ExtraStorage.FileOffset != 0 && "Corrupt __flo_storage symbol"); + + FileSymRefs[ExtraStorage.Addr] = Symbol; continue; } - if (*Name == "__flo_storage_size") { - ExtraStorage.Size = Symbol.getValue(); + if (*Name == "__flo_storage_end") { + ExtraStorage.AddrEnd = Symbol.getValue(); continue; } @@ -502,12 +685,14 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { SymbolSize, *BC) ); } + ExtraStorage.Size = ExtraStorage.AddrEnd - ExtraStorage.Addr; ArrayRef LSDAData; uint64_t LSDAAddress{0}; // Process special sections. uint64_t FrameHdrAddress = 0ULL; + uint64_t FrameHdrAlign = 1; StringRef FrameHdrContents; for (const auto &Section : File->sections()) { StringRef SectionName; @@ -527,6 +712,7 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { if (SectionName == ".eh_frame_hdr") { FrameHdrAddress = Section.getAddress(); FrameHdrContents = SectionContents; + FrameHdrAlign = Section.getAlignment(); } } @@ -694,12 +880,16 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { } if (opts::ReorderBlocks != "disable") { + bool ShouldSplit = opts::SplitFunctions && + (Function.getFunctionScore() * 1000) > TotalScore; if (opts::ReorderBlocks == "branch-predictor") { - BFI.second.optimizeLayout(BinaryFunction::HP_BRANCH_PREDICTOR); + BFI.second.optimizeLayout(BinaryFunction::HP_BRANCH_PREDICTOR, + ShouldSplit); } else if (opts::ReorderBlocks == "cache") { - BFI.second.optimizeLayout(BinaryFunction::HP_CACHE_UTILIZATION); + BFI.second.optimizeLayout(BinaryFunction::HP_CACHE_UTILIZATION, + ShouldSplit); } else { - BFI.second.optimizeLayout(BinaryFunction::HP_NONE); + BFI.second.optimizeLayout(BinaryFunction::HP_NONE, 
ShouldSplit); } if (opts::PrintAll || opts::PrintReordered) Function.print(errs(), "after reordering blocks"); @@ -755,52 +945,6 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { Streamer->InitSections(false); - // Define a helper to decode and emit CFI instructions at a given point in a - // BB - auto emitCFIInstr = [&Streamer](MCCFIInstruction &CFIInstr) { - switch (CFIInstr.getOperation()) { - default: - llvm_unreachable("Unexpected instruction"); - case MCCFIInstruction::OpDefCfaOffset: - Streamer->EmitCFIDefCfaOffset(CFIInstr.getOffset()); - break; - case MCCFIInstruction::OpAdjustCfaOffset: - Streamer->EmitCFIAdjustCfaOffset(CFIInstr.getOffset()); - break; - case MCCFIInstruction::OpDefCfa: - Streamer->EmitCFIDefCfa(CFIInstr.getRegister(), CFIInstr.getOffset()); - break; - case MCCFIInstruction::OpDefCfaRegister: - Streamer->EmitCFIDefCfaRegister(CFIInstr.getRegister()); - break; - case MCCFIInstruction::OpOffset: - Streamer->EmitCFIOffset(CFIInstr.getRegister(), CFIInstr.getOffset()); - break; - case MCCFIInstruction::OpRegister: - Streamer->EmitCFIRegister(CFIInstr.getRegister(), - CFIInstr.getRegister2()); - break; - case MCCFIInstruction::OpRelOffset: - Streamer->EmitCFIRelOffset(CFIInstr.getRegister(), CFIInstr.getOffset()); - break; - case MCCFIInstruction::OpUndefined: - Streamer->EmitCFIUndefined(CFIInstr.getRegister()); - break; - case MCCFIInstruction::OpRememberState: - Streamer->EmitCFIRememberState(); - break; - case MCCFIInstruction::OpRestoreState: - Streamer->EmitCFIRestoreState(); - break; - case MCCFIInstruction::OpRestore: - Streamer->EmitCFIRestore(CFIInstr.getRegister()); - break; - case MCCFIInstruction::OpSameValue: - Streamer->EmitCFISameValue(CFIInstr.getRegister()); - break; - } - }; - bool HasEHFrame = false; bool NoSpaceWarning = false; // Output functions one by one. 
@@ -813,95 +957,24 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { if (!opts::shouldProcess(Function.getName())) continue; - DEBUG(dbgs() << "FLO: generating code for function \"" - << Function.getName() << "\"\n"); - - // No need for human readability? - // FIXME: what difference does it make in reality? - //Ctx.setUseNamesOnTempLabels(false); - - // Emit function start + DEBUG(dbgs() << "FLO: generating code for function \"" << Function.getName() + << "\"\n"); - // Each fuction is emmitted into its own section. - MCSectionELF *FunctionSection = - BC->Ctx->getELFSection(Function.getCodeSectionName(), - ELF::SHT_PROGBITS, - ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); - - MCSection *Section = FunctionSection; - Streamer->SwitchSection(Section); - - Streamer->EmitCodeAlignment(Function.getAlignment()); - - MCSymbol *FunctionSymbol = BC->Ctx->getOrCreateSymbol(Function.getName()); - Streamer->EmitSymbolAttribute(FunctionSymbol, MCSA_ELF_TypeFunction); - Streamer->EmitLabel(FunctionSymbol); - - // Emit CFI start if (Function.hasCFI()) { - if (ExtraStorage.Size != 0ULL) { + if (ExtraStorage.Size != 0) HasEHFrame = true; - Streamer->EmitCFIStartProc(/*IsSimple=*/false); - if (Function.getPersonalityFunction() != nullptr) { - Streamer->EmitCFIPersonality(Function.getPersonalityFunction(), - Function.getPersonalityEncoding()); - } - // Emit CFI instructions relative to the CIE - for (auto &CFIInstr : Function.cie()) { - // Ignore these CIE CFI insns because LLVM will already emit this. - switch (CFIInstr.getOperation()) { - default: - break; - case MCCFIInstruction::OpDefCfa: - if (CFIInstr.getRegister() == 7 && CFIInstr.getOffset() == 8) - continue; - break; - case MCCFIInstruction::OpOffset: - if (CFIInstr.getRegister() == 16 && CFIInstr.getOffset() == -8) - continue; - break; - } - emitCFIInstr(CFIInstr); - } - } else { + else NoSpaceWarning = true; - } } - // Emit code. 
- for (auto BB : Function.layout()) { - if (BB->getAlignment() > 1) - Streamer->EmitCodeAlignment(BB->getAlignment()); - Streamer->EmitLabel(BB->getLabel()); - for (const auto &Instr : *BB) { - // Handle pseudo instructions. - if (BC->MIA->isEHLabel(Instr)) { - assert(Instr.getNumOperands() == 1 && Instr.getOperand(0).isExpr() && - "bad EH_LABEL instruction"); - auto Label = &(cast( - Instr.getOperand(0).getExpr())->getSymbol()); - Streamer->EmitLabel(const_cast(Label)); - continue; - } - if (!BC->MIA->isCFI(Instr)) { - Streamer->EmitInstruction(Instr, *BC->STI); - continue; - } - if (ExtraStorage.Size == 0) - continue; - emitCFIInstr(*Function.getCFIFor(Instr)); - } - } + emitFunction(*Streamer, Function, *BC.get(), + /*EmitColdPart=*/false, + /*HasExtraStorage=*/ExtraStorage.Size != 0); - // Emit CFI end - if (Function.hasCFI() && ExtraStorage.Size != 0) - Streamer->EmitCFIEndProc(); - - // TODO: is there any use in emiting end of function? - // Perhaps once we have a support for C++ exceptions. - //auto FunctionEndLabel = Ctx.createTempSymbol("func_end"); - //Streamer->EmitLabel(FunctionEndLabel); - //Streamer->emitELFSize(FunctionSymbol, MCExpr()); + if (Function.isSplit()) + emitFunction(*Streamer, Function, *BC.get(), + /*EmitColdPart=*/true, + /*HasExtraStorage=*/ExtraStorage.Size != 0); } if (NoSpaceWarning) { errs() << "FLO-WARNING: missing __flo_storage in this binary. 
No " @@ -973,32 +1046,66 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { errs() << "FLO: cannot remap function " << Function.getName() << "\n"; FailedAddresses.emplace_back(Function.getAddress()); } + + if (!Function.isSplit()) + continue; + + SAI = EFMM->SectionAddressInfo.find( + Function.getCodeSectionName().str().append(".cold")); + if (SAI != EFMM->SectionAddressInfo.end()) { + // Align at a 16-byte boundary + ExtraStorage.BumpPtr = (ExtraStorage.BumpPtr + 15) & ~(15ULL); + + DEBUG(dbgs() << "FLO: mapping 0x" << Twine::utohexstr(SAI->second.first) + << " to 0x" << Twine::utohexstr(ExtraStorage.BumpPtr) + << " with size " << Twine::utohexstr(SAI->second.second) + << '\n'); + OLT.mapSectionAddress(ObjectsHandle, + reinterpret_cast(SAI->second.first), + ExtraStorage.BumpPtr); + Function.setColdImageAddress(SAI->second.first); + Function.setColdImageSize(SAI->second.second); + Function.setColdFileOffset(ExtraStorage.BumpPtr - ExtraStorage.Addr + + ExtraStorage.FileOffset); + ExtraStorage.BumpPtr += SAI->second.second; + } else { + errs() << "FLO: cannot remap function " << Function.getName() << "\n"; + FailedAddresses.emplace_back(Function.getAddress()); + } } // Map .eh_frame StringRef NewEhFrameContents; + uint64_t NewEhFrameAddress = 0; + uint64_t NewEhFrameOffset = 0; if (HasEHFrame) { auto SAI = EFMM->SectionAddressInfo.find(".eh_frame"); if (SAI != EFMM->SectionAddressInfo.end()) { + // Align at an 8-byte boundary + ExtraStorage.BumpPtr = (ExtraStorage.BumpPtr + 7) & ~(7ULL); DEBUG(dbgs() << "FLO: mapping 0x" << Twine::utohexstr(SAI->second.first) - << " to 0x" << Twine::utohexstr(ExtraStorage.Addr) + << " to 0x" << Twine::utohexstr(ExtraStorage.BumpPtr) << '\n'); + NewEhFrameAddress = ExtraStorage.BumpPtr; + NewEhFrameOffset = + ExtraStorage.BumpPtr - ExtraStorage.Addr + ExtraStorage.FileOffset; OLT.mapSectionAddress(ObjectsHandle, - reinterpret_cast(SAI->second.first), - ExtraStorage.Addr); + 
reinterpret_cast(SAI->second.first), + ExtraStorage.BumpPtr); + ExtraStorage.BumpPtr += SAI->second.second; NewEhFrameContents = StringRef(reinterpret_cast(SAI->second.first), SAI->second.second); - if (ExtraStorage.Size < SAI->second.second) { - errs() << format("FLO fatal error: new .eh_frame requires 0x%x bytes, " - "but __flo_storage in this binary only has 0x%x extra " - "bytes available.", - SAI->second.second, ExtraStorage.Size); - exit(1); - } } else { errs() << "FLO: cannot remap .eh_frame\n"; } } + if (ExtraStorage.BumpPtr - ExtraStorage.Addr > ExtraStorage.Size) { + errs() << format( + "FLO fatal error: __flo_storage in this binary has not enough free " + "space (required %d bytes, available %d bytes).\n", + ExtraStorage.BumpPtr - ExtraStorage.Addr, ExtraStorage.Size); + exit(1); + } OLT.emitAndFinalize(ObjectsHandle); @@ -1007,6 +1114,12 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { static_cast(Streamer.get())->getAssembler().getWriter(); Writer.setStream(RealOut->os()); + // Print _flo_storage area stats for debug + DEBUG(dbgs() << format("INFO: __flo_storage address = 0x%x file offset = " + "0x%x total size = 0x%x\n", + ExtraStorage.Addr, ExtraStorage.FileOffset, + ExtraStorage.Size)); + // Overwrite function in the output file. 
uint64_t CountOverwrittenFunctions = 0; uint64_t OverwrittenScore = 0; @@ -1015,6 +1128,9 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { if (Function.getImageAddress() == 0 || Function.getImageSize() == 0) continue; + if (Function.isSplit() && (Function.getColdImageAddress() == 0 || + Function.getColdImageSize() == 0)) + continue; if (Function.getImageSize() > Function.getMaxSize()) { errs() << "FLO-WARNING: new function size (0x" @@ -1041,23 +1157,57 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) { &Writer); RealOut->os().seek(Pos); - ++CountOverwrittenFunctions; + if (!Function.isSplit()) { + ++CountOverwrittenFunctions; + if (opts::MaxFunctions && + CountOverwrittenFunctions == opts::MaxFunctions) { + outs() << "FLO: maximum number of functions reached\n"; + break; + } + continue; + } + // Write cold part + outs() << "FLO: rewriting function \"" << Function.getName() + << "\" (cold part)\n"; + RealOut->os().pwrite( + reinterpret_cast(Function.getColdImageAddress()), + Function.getColdImageSize(), Function.getColdFileOffset()); + + ++CountOverwrittenFunctions; if (opts::MaxFunctions && CountOverwrittenFunctions == opts::MaxFunctions) { outs() << "FLO: maximum number of functions reached\n"; break; } } if (NewEhFrameContents.size()) { + outs() << "FLO: writing a new .eh_frame_hdr\n"; + if (FrameHdrAlign > 1) + ExtraStorage.BumpPtr = + (ExtraStorage.BumpPtr + FrameHdrAlign - 1) & ~(FrameHdrAlign - 1); + if (ExtraStorage.BumpPtr - ExtraStorage.Addr - ExtraStorage.Size < + FrameHdrContents.size()) { + errs() << "FLO fatal error: __flo_storage in this binary has not enough " + "free space\n"; + exit(1); + } std::sort(FailedAddresses.begin(), FailedAddresses.end()); - CFIRdWrt.rewriteHeaderFor(NewEhFrameContents, ExtraStorage.Addr, - FailedAddresses); - outs() << "FLO: rewriting .eh_frame_hdr\n"; + CFIRdWrt.rewriteHeaderFor(NewEhFrameContents, NewEhFrameAddress, + ExtraStorage.BumpPtr, FailedAddresses); + 
 uint64_t HdrFileOffset = + ExtraStorage.BumpPtr - ExtraStorage.Addr + ExtraStorage.FileOffset; RealOut->os().pwrite(FrameHdrCopy.data(), FrameHdrCopy.size(), - FrameHdrContents.data() - File->getData().data()); + HdrFileOffset); + outs() << "FLO: patching EH_FRAME program segment to reflect new " + ".eh_frame_hdr\n"; + auto Obj = File->getELFFile(); + if (!patchEhFrameHdrSegment(Obj, &RealOut->os(), HdrFileOffset, + ExtraStorage.BumpPtr, FrameHdrCopy.size())) { + outs() << "FAILED to patch program segment!\n"; + } outs() << "FLO: writing a new .eh_frame\n"; RealOut->os().pwrite(NewEhFrameContents.data(), NewEhFrameContents.size(), - ExtraStorage.FileOffset); + NewEhFrameOffset); } if (EntryPointFunction) { @@ -1133,7 +1283,7 @@ int main(int argc, char **argv) { report_error(opts::InputFilename, EC); Binary &Binary = *BinaryOrErr.get().getBinary(); - if (ELFObjectFileBase *e = dyn_cast(&Binary)) { + if (auto *e = dyn_cast(&Binary)) { OptimizeFile(e, *DR.get()); } else { report_error(opts::InputFilename, object_error::invalid_file_type); From 5dfe3048374c0dcf6693d97ecdb6acbff0ded197 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 23 Nov 2015 17:54:18 -0800 Subject: [PATCH 049/904] Refactoring llvm-flo.cpp into a new class RewriteInstance, NFC. Summary: Previously, llvm-flo.cpp contained a long function doing lots of different tasks. This patch refactors this logic into a separate class with different member functions, exposing the relationship between each step of the rewriting process and making it easier to coordinate/change it. 
(cherry picked from commit ac3e75d35ed758d7b910ce946c1bc9a43949dbb2) --- bolt/BinaryContext.h | 14 +- bolt/CMakeLists.txt | 1 + bolt/RewriteInstance.cpp | 1223 ++++++++++++++++++++++++++++++++++++++ bolt/RewriteInstance.h | 142 +++++ bolt/llvm-flo.cpp | 1192 +------------------------------------ 5 files changed, 1379 insertions(+), 1193 deletions(-) create mode 100644 bolt/RewriteInstance.cpp create mode 100644 bolt/RewriteInstance.h diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index ab0e9888941a..0bfddceef431 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -57,7 +57,9 @@ class BinaryContext { const Target *TheTarget; - MCCodeEmitter *MCE; + std::string TripleName; + + std::unique_ptr MCE; std::unique_ptr MOFI; @@ -77,14 +79,13 @@ class BinaryContext { std::function ErrorCheck; - MCAsmBackend *MAB; - const DataReader &DR; BinaryContext(std::unique_ptr Ctx, std::unique_ptr TheTriple, const Target *TheTarget, - MCCodeEmitter *MCE, + std::string TripleName, + std::unique_ptr MCE, std::unique_ptr MOFI, std::unique_ptr AsmInfo, std::unique_ptr MII, @@ -93,12 +94,12 @@ class BinaryContext { std::unique_ptr MIA, std::unique_ptr MRI, std::unique_ptr DisAsm, - MCAsmBackend *MAB, const DataReader &DR) : Ctx(std::move(Ctx)), TheTriple(std::move(TheTriple)), TheTarget(TheTarget), - MCE(MCE), + TripleName(TripleName), + MCE(std::move(MCE)), MOFI(std::move(MOFI)), AsmInfo(std::move(AsmInfo)), MII(std::move(MII)), @@ -107,7 +108,6 @@ class BinaryContext { MIA(std::move(MIA)), MRI(std::move(MRI)), DisAsm(std::move(DisAsm)), - MAB(MAB), DR(DR) {} ~BinaryContext() {} diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index fc75bcbe9844..4c6fcd50b9c2 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -18,4 +18,5 @@ add_llvm_tool(llvm-flo BinaryFunction.cpp DataReader.cpp Exceptions.cpp + RewriteInstance.cpp ) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp new file mode 100644 index 000000000000..fb17584f6034 --- /dev/null 
+++ b/bolt/RewriteInstance.cpp @@ -0,0 +1,1223 @@ +//===--- RewriteInstance.cpp - Interface for machine-level function -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + + +#include "RewriteInstance.h" +#include "BinaryBasicBlock.h" +#include "BinaryContext.h" +#include "BinaryFunction.h" +#include "DataReader.h" +#include "Exceptions.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/ExecutionEngine/Orc/LambdaResolver.h" +#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" +#include "llvm/ExecutionEngine/RTDyldMemoryManager.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Target/TargetMachine.h" +#include +#include +#include + +#undef DEBUG_TYPE +#define DEBUG_TYPE "flo" + +using namespace llvm; +using namespace object; +using namespace flo; + +namespace opts { + +static cl::opt 
+OutputFilename("o", cl::desc(""), cl::Required); + +static cl::list +FunctionNames("funcs", + cl::CommaSeparated, + cl::desc("list of functions to optimize"), + cl::value_desc("func1,func2,func3,...")); + +static cl::list +SkipFunctionNames("skip_funcs", + cl::CommaSeparated, + cl::desc("list of functions to skip"), + cl::value_desc("func1,func2,func3,...")); + +static cl::opt +MaxFunctions("max_funcs", + cl::desc("maximum # of functions to overwrite"), + cl::Optional); + +static cl::opt +EliminateUnreachable("eliminate-unreachable", + cl::desc("eliminate unreachable code"), + cl::Optional); + +static cl::opt +SplitFunctions("split-functions", + cl::desc("split functions into hot and cold distinct regions"), + cl::Optional); + +static cl::opt ReorderBlocks( + "reorder-blocks", + cl::desc("redo basic block layout based on profiling data with a specific " + "priority (none, branch-predictor or cache)"), + cl::value_desc("priority"), cl::init("disable")); + +static cl::opt +DumpEHFrame("dump-eh-frame", cl::desc("dump parsed .eh_frame (debugging)"), + cl::Hidden); + +static cl::opt +PrintAll("print-all", cl::desc("print functions after each stage"), + cl::Hidden); + +static cl::opt +PrintCFG("print-cfg", cl::desc("print functions after CFG construction"), + cl::Hidden); + +static cl::opt +PrintUCE("print-uce", + cl::desc("print functions after unreachable code elimination"), + cl::Hidden); + +static cl::opt +PrintDisasm("print-disasm", cl::desc("print function after disassembly"), + cl::Hidden); + +static cl::opt +PrintEHRanges("print-eh-ranges", + cl::desc("print function with updated exception ranges"), + cl::Hidden); + +static cl::opt +PrintReordered("print-reordered", + cl::desc("print functions after layout optimization"), + cl::Hidden); + + +// Check against lists of functions from options if we should +// optimize the function with a given name. 
+bool shouldProcess(StringRef FunctionName) { + bool IsValid = true; + if (!FunctionNames.empty()) { + IsValid = false; + for (auto &Name : FunctionNames) { + if (FunctionName == Name) { + IsValid = true; + break; + } + } + } + if (!IsValid) + return false; + + if (!SkipFunctionNames.empty()) { + for (auto &Name : SkipFunctionNames) { + if (FunctionName == Name) { + IsValid = false; + break; + } + } + } + + return IsValid; +} + +} // namespace opts + + +static void report_error(StringRef Message, std::error_code EC) { + assert(EC); + errs() << "FLO: '" << Message << "': " << EC.message() << ".\n"; + exit(1); +} + +static void check_error(std::error_code EC, StringRef Message) { + if (!EC) + return; + report_error(Message, EC); +} + +/// Class responsible for allocating and managing code and data sections. +class ExecutableFileMemoryManager : public SectionMemoryManager { +public: + + // Keep [section name] -> [allocated address, size] map for later remapping. + std::map> SectionAddressInfo; + + ExecutableFileMemoryManager() {} + + uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, + unsigned SectionID, + StringRef SectionName) override { + auto ret = + SectionMemoryManager::allocateCodeSection(Size, Alignment, SectionID, + SectionName); + DEBUG(dbgs() << "FLO: allocating code section : " << SectionName + << " with size " << Size << ", alignment " << Alignment + << " at 0x" << ret << "\n"); + + SectionAddressInfo[SectionName] = {reinterpret_cast(ret), Size}; + + return ret; + } + + uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, + unsigned SectionID, StringRef SectionName, + bool IsReadOnly) override { + DEBUG(dbgs() << "FLO: allocating data section : " << SectionName + << " with size " << Size << ", alignment " + << Alignment << "\n"); + auto ret = SectionMemoryManager::allocateDataSection( + Size, Alignment, SectionID, SectionName, IsReadOnly); + + SectionAddressInfo[SectionName] = {reinterpret_cast(ret), Size}; + + return ret; + } + 
+ // Tell EE that we guarantee we don't need stubs. + bool allowStubAllocation() const override { return false; } + + bool finalizeMemory(std::string *ErrMsg = nullptr) override { + DEBUG(dbgs() << "FLO: finalizeMemory()\n"); + return SectionMemoryManager::finalizeMemory(ErrMsg); + } +}; + +/// Create BinaryContext for a given architecture \p ArchName and +/// triple \p TripleName. +static std::unique_ptr CreateBinaryContext( + std::string ArchName, + std::string TripleName, const DataReader &DR) { + + std::string Error; + + std::unique_ptr TheTriple = llvm::make_unique(TripleName); + const Target *TheTarget = TargetRegistry::lookupTarget(ArchName, + *TheTriple, + Error); + if (!TheTarget) { + errs() << "FLO: " << Error; + return nullptr; + } + + std::unique_ptr MRI( + TheTarget->createMCRegInfo(TripleName)); + if (!MRI) { + errs() << "error: no register info for target " << TripleName << "\n"; + return nullptr; + } + + // Set up disassembler. + std::unique_ptr AsmInfo( + TheTarget->createMCAsmInfo(*MRI, TripleName)); + if (!AsmInfo) { + errs() << "error: no assembly info for target " << TripleName << "\n"; + return nullptr; + } + + std::unique_ptr STI( + TheTarget->createMCSubtargetInfo(TripleName, "", "")); + if (!STI) { + errs() << "error: no subtarget info for target " << TripleName << "\n"; + return nullptr; + } + + std::unique_ptr MII(TheTarget->createMCInstrInfo()); + if (!MII) { + errs() << "error: no instruction info for target " << TripleName << "\n"; + return nullptr; + } + + std::unique_ptr MOFI = + llvm::make_unique(); + std::unique_ptr Ctx = + llvm::make_unique(AsmInfo.get(), MRI.get(), MOFI.get()); + MOFI->InitMCObjectFileInfo(*TheTriple, Reloc::Default, + CodeModel::Default, *Ctx); + + std::unique_ptr DisAsm( + TheTarget->createMCDisassembler(*STI, *Ctx)); + + if (!DisAsm) { + errs() << "error: no disassembler for target " << TripleName << "\n"; + return nullptr; + } + + std::unique_ptr MIA( + TheTarget->createMCInstrAnalysis(MII.get())); + if 
(!MIA) { + errs() << "error: failed to create instruction analysis for target" + << TripleName << "\n"; + return nullptr; + } + + int AsmPrinterVariant = AsmInfo->getAssemblerDialect(); + std::unique_ptr InstructionPrinter( + TheTarget->createMCInstPrinter(Triple(TripleName), AsmPrinterVariant, + *AsmInfo, *MII, *MRI)); + if (!InstructionPrinter) { + errs() << "error: no instruction printer for target " << TripleName + << '\n'; + return nullptr; + } + InstructionPrinter->setPrintImmHex(true); + + std::unique_ptr MCE( + TheTarget->createMCCodeEmitter(*MII, *MRI, *Ctx)); + + // Make sure we don't miss any output on core dumps. + outs().SetUnbuffered(); + errs().SetUnbuffered(); + dbgs().SetUnbuffered(); + + auto BC = + llvm::make_unique(std::move(Ctx), + std::move(TheTriple), + TheTarget, + TripleName, + std::move(MCE), + std::move(MOFI), + std::move(AsmInfo), + std::move(MII), + std::move(STI), + std::move(InstructionPrinter), + std::move(MIA), + std::move(MRI), + std::move(DisAsm), + DR); + + return BC; +} + +RewriteInstance::RewriteInstance(ELFObjectFileBase *File, + const DataReader &DR) + : File(File), BC(CreateBinaryContext("x86-64", "x86_64-unknown-linux", DR)), + DwCtx(new DWARFContextInMemory(*File)) {} + +RewriteInstance::~RewriteInstance() {} + +void RewriteInstance::run() { + if (!BC) { + errs() << "failed to create a binary context\n"; + return; + } + + readSymbolTable(); + readSpecialSections(); + disassembleFunctions(); + runOptimizationPasses(); + emitFunctions(); + + // Copy input file to output + std::error_code EC; + Out = llvm::make_unique(opts::OutputFilename, EC, + sys::fs::F_None, 0777); + check_error(EC, "cannot create output executable file"); + Out->os() << File->getData(); + + // Rewrite optimized functions back to this output + rewriteFile(); +} + +namespace { + +// Helper function to map a random memory address to a file offset. Returns 0 if +// this address cannot be mapped back to the file. 
+uint64_t discoverFileOffset(ELFObjectFileBase *File, uint64_t MemAddr) { + for (const auto &Section : File->sections()) { + uint64_t SecAddress = Section.getAddress(); + uint64_t Size = Section.getSize(); + if (MemAddr < SecAddress || + SecAddress + Size <= MemAddr) + continue; + + StringRef SectionContents; + check_error(Section.getContents(SectionContents), + "cannot get section contents"); + uint64_t SecFileOffset = SectionContents.data() - File->getData().data(); + uint64_t MemAddrSecOffset = MemAddr - SecAddress; + return SecFileOffset + MemAddrSecOffset; + } + return 0ULL; +} + +} // anonymous namespace + +void RewriteInstance::readSymbolTable() { + std::string FileSymbolName; + + FileSymRefs.clear(); + BinaryFunctions.clear(); + BC->GlobalAddresses.clear(); + + // For local symbols we want to keep track of associated FILE symbol for + // disambiguation by name. + for (const SymbolRef &Symbol : File->symbols()) { + // Keep undefined symbols for pretty printing? + if (Symbol.getFlags() & SymbolRef::SF_Undefined) + continue; + + ErrorOr Name = Symbol.getName(); + check_error(Name.getError(), "cannot get symbol name"); + + if (*Name == "__flo_storage") { + ExtraStorage.Addr = Symbol.getValue(); + ExtraStorage.BumpPtr = ExtraStorage.Addr; + ExtraStorage.FileOffset = discoverFileOffset(File, ExtraStorage.Addr); + assert(ExtraStorage.FileOffset != 0 && "Corrupt __flo_storage symbol"); + + FileSymRefs[ExtraStorage.Addr] = Symbol; + continue; + } + if (*Name == "__flo_storage_end") { + ExtraStorage.AddrEnd = Symbol.getValue(); + continue; + } + + if (Symbol.getType() == SymbolRef::ST_File) { + // Could be used for local symbol disambiguation. 
+ FileSymbolName = *Name; + continue; + } + + ErrorOr AddressOrErr = Symbol.getAddress(); + check_error(AddressOrErr.getError(), "cannot get symbol address"); + uint64_t Address = *AddressOrErr; + if (Address == 0) { + if (Symbol.getType() == SymbolRef::ST_Function) + errs() << "FLO-WARNING: function with 0 address seen\n"; + continue; + } + + FileSymRefs[Address] = Symbol; + + // There's nothing horribly wrong with anonymous symbols, but let's + // ignore them for now. + if (Name->empty()) + continue; + + // Disambiguate all local symbols before adding to symbol table. + // Since we don't know if we'll see a global with the same name, + // always modify the local name. + std::string UniqueName; + if (Symbol.getFlags() & SymbolRef::SF_Global) { + assert(BC->GlobalSymbols.find(*Name) == BC->GlobalSymbols.end() && + "global name not unique"); + UniqueName = *Name; + /// It's possible we are seeing a globalized local. LLVM might treat it as + /// local if it has a "private global" prefix, e.g. ".L". Thus we have to + /// change the prefix to enforce global scope of the symbol. + if (StringRef(UniqueName) + .startswith(BC->AsmInfo->getPrivateGlobalPrefix())) + UniqueName = "PG." + UniqueName; + } else { + unsigned LocalCount = 1; + std::string LocalName = (*Name).str() + "/" + FileSymbolName + "/"; + + if ((*Name).startswith(BC->AsmInfo->getPrivateGlobalPrefix())) { + LocalName = "PG." + LocalName; + } + + while (BC->GlobalSymbols.find(LocalName + std::to_string(LocalCount)) != + BC->GlobalSymbols.end()) { + ++LocalCount; + } + UniqueName = LocalName + std::to_string(LocalCount); + } + + // Add the name to global symbols map. + BC->GlobalSymbols[UniqueName] = Address; + + // Add to the reverse map. There could multiple names at the same address. + BC->GlobalAddresses.emplace(std::make_pair(Address, UniqueName)); + + // Only consider ST_Function symbols for functions. 
Although this + // assumption could be broken by assembly functions for which the type + // could be wrong, we skip such entries till the support for + // assembly is implemented. + if (Symbol.getType() != SymbolRef::ST_Function) + continue; + + // TODO: populate address map with PLT entries for better readability. + + // Ignore function with 0 size for now (possibly coming from assembly). + auto SymbolSize = ELFSymbolRef(Symbol).getSize(); + if (SymbolSize == 0) + continue; + + ErrorOr SectionOrErr = Symbol.getSection(); + check_error(SectionOrErr.getError(), "cannot get symbol section"); + section_iterator Section = *SectionOrErr; + if (Section == File->section_end()) { + // Could be an absolute symbol. Could record for pretty printing. + continue; + } + + // Create the function and add to the map. + BinaryFunctions.emplace( + Address, + BinaryFunction(UniqueName, Symbol, *Section, Address, + SymbolSize, *BC) + ); + } + ExtraStorage.Size = ExtraStorage.AddrEnd - ExtraStorage.Addr; +} + +void RewriteInstance::readSpecialSections() { + // Process special sections. + StringRef FrameHdrContents; + for (const auto &Section : File->sections()) { + StringRef SectionName; + check_error(Section.getName(SectionName), "cannot get section name"); + StringRef SectionContents; + check_error(Section.getContents(SectionContents), + "cannot get section contents"); + ArrayRef SectionData( + reinterpret_cast(SectionContents.data()), + Section.getSize()); + + if (SectionName == ".gcc_except_table") { + readLSDA(SectionData, *BC); + LSDAData = SectionData; + LSDAAddress = Section.getAddress(); + } + if (SectionName == ".eh_frame_hdr") { + FrameHdrAddress = Section.getAddress(); + FrameHdrContents = SectionContents; + FrameHdrAlign = Section.getAlignment(); + } + } + + FrameHdrCopy = + std::vector(FrameHdrContents.begin(), FrameHdrContents.end()); + // Process debug sections. 
+ EHFrame = DwCtx->getEHFrame(); + if (opts::DumpEHFrame) { + EHFrame->dump(outs()); + } + CFIRdWrt.reset(new CFIReaderWriter(*EHFrame, FrameHdrAddress, FrameHdrCopy)); + if (!EHFrame->ParseError.empty()) { + errs() << "FLO-WARNING: EHFrame reader failed with message \"" + << EHFrame->ParseError << "\"\n"; + } +} + +void RewriteInstance::disassembleFunctions() { + // Disassemble every function and build it's control flow graph. + TotalScore = 0; + for (auto &BFI : BinaryFunctions) { + BinaryFunction &Function = BFI.second; + + if (!opts::shouldProcess(Function.getName())) { + DEBUG(dbgs() << "FLO: skipping processing function " << Function.getName() + << " per user request.\n"); + continue; + } + + SectionRef Section = Function.getSection(); + assert(Section.containsSymbol(Function.getSymbol()) && + "symbol not in section"); + + // When could it happen? + if (!Section.isText() || Section.isVirtual() || !Section.getSize()) { + DEBUG(dbgs() << "FLO: corresponding section non-executable or empty " + << "for function " << Function.getName()); + continue; + } + + // Set the proper maximum size value after the whole symbol table + // has been processed. + auto SymRefI = FileSymRefs.upper_bound(Function.getAddress()); + if (SymRefI != FileSymRefs.end()) { + auto MaxSize = SymRefI->first - Function.getAddress(); + if (MaxSize < Function.getSize()) { + DEBUG(dbgs() << "FLO: symbol seen in the middle of the function " + << Function.getName() << ". Skipping.\n"); + Function.setSimple(false); + continue; + } + Function.setMaxSize(MaxSize); + } + + StringRef SectionContents; + check_error(Section.getContents(SectionContents), + "cannot get section contents"); + + assert(SectionContents.size() == Section.getSize() && + "section size mismatch"); + + // Function offset from the section start. + auto FunctionOffset = Function.getAddress() - Section.getAddress(); + + // Offset of the function in the file. 
+ Function.setFileOffset( + SectionContents.data() - File->getData().data() + FunctionOffset); + + ArrayRef FunctionData( + reinterpret_cast + (SectionContents.data()) + FunctionOffset, + Function.getSize()); + + if (!Function.disassemble(FunctionData)) + continue; + + if (opts::PrintAll || opts::PrintDisasm) + Function.print(errs(), "after disassembly"); + + if (!Function.isSimple()) + continue; + + // Fill in CFI information for this function + if (EHFrame->ParseError.empty() && Function.isSimple()) { + CFIRdWrt->fillCFIInfoFor(Function); + if (Function.getLSDAAddress() != 0) + Function.setSimple(false); + } + + // Parse LSDA. + if (Function.getLSDAAddress() != 0) + Function.parseLSDA(LSDAData, LSDAAddress); + + if (!Function.buildCFG()) + continue; + + if (opts::PrintAll || opts::PrintCFG) + Function.print(errs(), "after building cfg"); + + TotalScore += Function.getFunctionScore(); + + } // Iterate over all functions +} + +void RewriteInstance::runOptimizationPasses() { + // Run optimization passes. + // + // FIXME: use real optimization passes. + bool NagUser = true; + if (opts::ReorderBlocks != "" && + opts::ReorderBlocks != "disable" && + opts::ReorderBlocks != "none" && + opts::ReorderBlocks != "branch-predictor" && + opts::ReorderBlocks != "cache") { + errs() << "FLO: Unrecognized block reordering priority \"" + << opts::ReorderBlocks << "\".\n"; + exit(1); + } + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + + if (!opts::shouldProcess(Function.getName())) + continue; + + if (!Function.isSimple()) + continue; + + // Detect and eliminate unreachable basic blocks. We could have those + // filled with nops and they are used for alignment. + // + // FIXME: this wouldn't work with C++ exceptions until we implement + // support for those as there will be "invisible" edges + // in the graph. 
+ if (opts::EliminateUnreachable && Function.layout_size() > 0) { + if (NagUser) { + outs() + << "FLO-WARNING: Using -eliminate-unreachable is experimental and " + "unsafe for exceptions\n"; + NagUser = false; + } + + std::stack Stack; + std::map Reachable; + BinaryBasicBlock *Entry = *Function.layout_begin(); + Stack.push(Entry); + Reachable[Entry] = true; + // Determine reachable BBs from the entry point + while (!Stack.empty()) { + auto BB = Stack.top(); + Stack.pop(); + for (auto Succ : BB->successors()) { + if (Reachable[Succ]) + continue; + Reachable[Succ] = true; + Stack.push(Succ); + } + } + + auto Count = Function.eraseDeadBBs(Reachable); + if (Count) { + DEBUG(dbgs() << "FLO: Removed " << Count + << " dead basic block(s) in function " + << Function.getName() << '\n'); + } + + if (opts::PrintAll || opts::PrintUCE) + Function.print(errs(), "after unreachable code elimination"); + } + + if (opts::ReorderBlocks != "disable") { + bool ShouldSplit = opts::SplitFunctions && + (Function.getFunctionScore() * 1000) > TotalScore; + if (opts::ReorderBlocks == "branch-predictor") { + BFI.second.optimizeLayout(BinaryFunction::HP_BRANCH_PREDICTOR, + ShouldSplit); + } else if (opts::ReorderBlocks == "cache") { + BFI.second.optimizeLayout(BinaryFunction::HP_CACHE_UTILIZATION, + ShouldSplit); + } else { + BFI.second.optimizeLayout(BinaryFunction::HP_NONE, ShouldSplit); + } + if (opts::PrintAll || opts::PrintReordered) + Function.print(errs(), "after reordering blocks"); + } + + // Post-processing passes. + // FIXME: Check EH handlers correctly in presence of indirect calls + // Function.updateEHRanges(); + // if (opts::PrintAll || opts::PrintEHRanges) { + // Function.print(errs(), "after updating EH ranges"); + // } + + // After optimizations, fix the CFI state + if (!Function.fixCFIState()) + Function.setSimple(false); + } +} + +namespace { + +// Helper function to emit the contents of a function via a MCStreamer object. 
+void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, + BinaryContext &BC, bool EmitColdPart, bool HasExtraStorage) { + // Define a helper to decode and emit CFI instructions at a given point in a + // BB + auto emitCFIInstr = [&Streamer](MCCFIInstruction &CFIInstr) { + switch (CFIInstr.getOperation()) { + default: + llvm_unreachable("Unexpected instruction"); + case MCCFIInstruction::OpDefCfaOffset: + Streamer.EmitCFIDefCfaOffset(CFIInstr.getOffset()); + break; + case MCCFIInstruction::OpAdjustCfaOffset: + Streamer.EmitCFIAdjustCfaOffset(CFIInstr.getOffset()); + break; + case MCCFIInstruction::OpDefCfa: + Streamer.EmitCFIDefCfa(CFIInstr.getRegister(), CFIInstr.getOffset()); + break; + case MCCFIInstruction::OpDefCfaRegister: + Streamer.EmitCFIDefCfaRegister(CFIInstr.getRegister()); + break; + case MCCFIInstruction::OpOffset: + Streamer.EmitCFIOffset(CFIInstr.getRegister(), CFIInstr.getOffset()); + break; + case MCCFIInstruction::OpRegister: + Streamer.EmitCFIRegister(CFIInstr.getRegister(), + CFIInstr.getRegister2()); + break; + case MCCFIInstruction::OpRelOffset: + Streamer.EmitCFIRelOffset(CFIInstr.getRegister(), CFIInstr.getOffset()); + break; + case MCCFIInstruction::OpUndefined: + Streamer.EmitCFIUndefined(CFIInstr.getRegister()); + break; + case MCCFIInstruction::OpRememberState: + Streamer.EmitCFIRememberState(); + break; + case MCCFIInstruction::OpRestoreState: + Streamer.EmitCFIRestoreState(); + break; + case MCCFIInstruction::OpRestore: + Streamer.EmitCFIRestore(CFIInstr.getRegister()); + break; + case MCCFIInstruction::OpSameValue: + Streamer.EmitCFISameValue(CFIInstr.getRegister()); + break; + } + }; + + // No need for human readability? + // FIXME: what difference does it make in reality? + // Ctx.setUseNamesOnTempLabels(false); + + // Emit function start + + // Each fuction is emmitted into its own section. + MCSectionELF *FunctionSection = + EmitColdPart + ? 
BC.Ctx->getELFSection( + Function.getCodeSectionName().str().append(".cold"), + ELF::SHT_PROGBITS, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC) + : BC.Ctx->getELFSection(Function.getCodeSectionName(), + ELF::SHT_PROGBITS, + ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); + + MCSection *Section = FunctionSection; + Streamer.SwitchSection(Section); + + Streamer.EmitCodeAlignment(Function.getAlignment()); + + if (!EmitColdPart) { + MCSymbol *FunctionSymbol = BC.Ctx->getOrCreateSymbol(Function.getName()); + Streamer.EmitSymbolAttribute(FunctionSymbol, MCSA_ELF_TypeFunction); + Streamer.EmitLabel(FunctionSymbol); + } else { + MCSymbol *FunctionSymbol = + BC.Ctx->getOrCreateSymbol(Twine(Function.getName()).concat(".cold")); + Streamer.EmitSymbolAttribute(FunctionSymbol, MCSA_ELF_TypeFunction); + Streamer.EmitLabel(FunctionSymbol); + } + + // Emit CFI start + if (Function.hasCFI() && HasExtraStorage) { + Streamer.EmitCFIStartProc(/*IsSimple=*/false); + if (Function.getPersonalityFunction() != nullptr) { + Streamer.EmitCFIPersonality(Function.getPersonalityFunction(), + Function.getPersonalityEncoding()); + } + // Emit CFI instructions relative to the CIE + for (auto &CFIInstr : Function.cie()) { + // Ignore these CIE CFI insns because LLVM will already emit this. + switch (CFIInstr.getOperation()) { + default: + break; + case MCCFIInstruction::OpDefCfa: + if (CFIInstr.getRegister() == 7 && CFIInstr.getOffset() == 8) + continue; + break; + case MCCFIInstruction::OpOffset: + if (CFIInstr.getRegister() == 16 && CFIInstr.getOffset() == -8) + continue; + break; + } + emitCFIInstr(CFIInstr); + } + } + + // Emit code. + for (auto BB : Function.layout()) { + if (EmitColdPart != BB->isCold()) + continue; + if (BB->getAlignment() > 1) + Streamer.EmitCodeAlignment(BB->getAlignment()); + Streamer.EmitLabel(BB->getLabel()); + for (const auto &Instr : *BB) { + // Handle pseudo instructions. 
+ if (BC.MIA->isEHLabel(Instr)) { + assert(Instr.getNumOperands() == 1 && Instr.getOperand(0).isExpr() && + "bad EH_LABEL instruction"); + auto Label = &(cast(Instr.getOperand(0).getExpr()) + ->getSymbol()); + Streamer.EmitLabel(const_cast(Label)); + continue; + } + if (!BC.MIA->isCFI(Instr)) { + Streamer.EmitInstruction(Instr, *BC.STI); + continue; + } + if (HasExtraStorage) + emitCFIInstr(*Function.getCFIFor(Instr)); + } + } + + // Emit CFI end + if (Function.hasCFI() && HasExtraStorage) + Streamer.EmitCFIEndProc(); + + // TODO: is there any use in emiting end of function? + // Perhaps once we have a support for C++ exceptions. + // auto FunctionEndLabel = Ctx.createTempSymbol("func_end"); + // Streamer.EmitLabel(FunctionEndLabel); + // Streamer.emitELFSize(FunctionSymbol, MCExpr()); +} + +template +std::vector singletonSet(T t) { + std::vector Vec; + Vec.push_back(std::move(t)); + return Vec; +} + +} // anonymous namespace + +void RewriteInstance::emitFunctions() { + std::error_code EC; + + // This is an object file, which we keep for debugging purposes. + // Once we decide it's useless, we should create it in memory. + std::unique_ptr TempOut = + llvm::make_unique(opts::OutputFilename + ".o", + EC, sys::fs::F_None); + check_error(EC, "cannot create output object file"); + + std::unique_ptr BOS = + make_unique(TempOut->os()); + raw_pwrite_stream *OS = BOS.get(); + + // Implicitly MCObjectStreamer takes ownership of MCAsmBackend (MAB) + // and MCCodeEmitter (MCE). ~MCObjectStreamer() will delete these + // two instances. 
+ auto MCE = BC->TheTarget->createMCCodeEmitter(*BC->MII, *BC->MRI, *BC->Ctx); + auto MAB = BC->TheTarget->createMCAsmBackend(*BC->MRI, BC->TripleName, ""); + std::unique_ptr Streamer( + BC->TheTarget->createMCObjectStreamer(*BC->TheTriple, + *BC->Ctx, + *MAB, + *OS, + MCE, + *BC->STI, + /* RelaxAll */ false, + /* DWARFMustBeAtTheEnd */ false)); + + Streamer->InitSections(false); + + bool HasEHFrame = false; + bool NoSpaceWarning = false; + // Output functions one by one. + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + + if (!Function.isSimple()) + continue; + + if (!opts::shouldProcess(Function.getName())) + continue; + + DEBUG(dbgs() << "FLO: generating code for function \"" << Function.getName() + << "\"\n"); + + if (Function.hasCFI()) { + if (ExtraStorage.Size != 0) + HasEHFrame = true; + else + NoSpaceWarning = true; + } + + emitFunction(*Streamer, Function, *BC.get(), + /*EmitColdPart=*/false, + /*HasExtraStorage=*/ExtraStorage.Size != 0); + + if (Function.isSplit()) + emitFunction(*Streamer, Function, *BC.get(), + /*EmitColdPart=*/true, + /*HasExtraStorage=*/ExtraStorage.Size != 0); + } + if (NoSpaceWarning) { + errs() << "FLO-WARNING: missing __flo_storage in this binary. No " + << "extra space left to allocate the new .eh_frame\n"; + } + + Streamer->Finish(); + + // Get output object as ObjectFile. + std::unique_ptr ObjectMemBuffer = + MemoryBuffer::getMemBuffer(BOS->str(), "in-memory object file", false); + ErrorOr> ObjOrErr = + object::ObjectFile::createObjectFile(ObjectMemBuffer->getMemBufferRef()); + check_error(ObjOrErr.getError(), "error creating in-memory object"); + + auto EFMM = new ExecutableFileMemoryManager(); + SectionMM.reset(EFMM); + + // FIXME: use notifyObjectLoaded() to remap sections. + + DEBUG(dbgs() << "Creating OLT\n"); + // Run ObjectLinkingLayer() with custom memory manager and symbol resolver. 
+ orc::ObjectLinkingLayer<> OLT; + + auto Resolver = orc::createLambdaResolver( + [&](const std::string &Name) { + DEBUG(dbgs() << "FLO: looking for " << Name << "\n"); + auto I = BC->GlobalSymbols.find(Name); + if (I == BC->GlobalSymbols.end()) + return RuntimeDyld::SymbolInfo(nullptr); + return RuntimeDyld::SymbolInfo(I->second, + JITSymbolFlags::None); + }, + [](const std::string &S) { + DEBUG(dbgs() << "FLO: resolving " << S << "\n"); + return nullptr; + } + ); + // FIXME: + auto ObjectsHandle = OLT.addObjectSet( + singletonSet(std::move(ObjOrErr.get())), + SectionMM.get(), + std::move(Resolver)); + //OLT.takeOwnershipOfBuffers(ObjectsHandle, ); + + // Map every function/section current address in memory to that in + // the output binary. + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + if (!Function.isSimple()) + continue; + + auto SAI = EFMM->SectionAddressInfo.find(Function.getCodeSectionName()); + if (SAI != EFMM->SectionAddressInfo.end()) { + DEBUG(dbgs() << "FLO: mapping 0x" << Twine::utohexstr(SAI->second.first) + << " to 0x" << Twine::utohexstr(Function.getAddress()) + << '\n'); + OLT.mapSectionAddress(ObjectsHandle, + reinterpret_cast(SAI->second.first), + Function.getAddress()); + Function.setImageAddress(SAI->second.first); + Function.setImageSize(SAI->second.second); + } else { + errs() << "FLO: cannot remap function " << Function.getName() << "\n"; + FailedAddresses.emplace_back(Function.getAddress()); + } + + if (!Function.isSplit()) + continue; + + SAI = EFMM->SectionAddressInfo.find( + Function.getCodeSectionName().str().append(".cold")); + if (SAI != EFMM->SectionAddressInfo.end()) { + // Align at a 16-byte boundary + ExtraStorage.BumpPtr = (ExtraStorage.BumpPtr + 15) & ~(15ULL); + + DEBUG(dbgs() << "FLO: mapping 0x" << Twine::utohexstr(SAI->second.first) + << " to 0x" << Twine::utohexstr(ExtraStorage.BumpPtr) + << " with size " << Twine::utohexstr(SAI->second.second) + << '\n'); + OLT.mapSectionAddress(ObjectsHandle, + 
reinterpret_cast(SAI->second.first), + ExtraStorage.BumpPtr); + Function.setColdImageAddress(SAI->second.first); + Function.setColdImageSize(SAI->second.second); + Function.setColdFileOffset(ExtraStorage.BumpPtr - ExtraStorage.Addr + + ExtraStorage.FileOffset); + ExtraStorage.BumpPtr += SAI->second.second; + } else { + errs() << "FLO: cannot remap function " << Function.getName() << "\n"; + FailedAddresses.emplace_back(Function.getAddress()); + } + } + // Map .eh_frame + NewEhFrameAddress = 0; + NewEhFrameOffset = 0; + if (HasEHFrame) { + auto SAI = EFMM->SectionAddressInfo.find(".eh_frame"); + if (SAI != EFMM->SectionAddressInfo.end()) { + // Align at an 8-byte boundary + ExtraStorage.BumpPtr = (ExtraStorage.BumpPtr + 7) & ~(7ULL); + DEBUG(dbgs() << "FLO: mapping 0x" << Twine::utohexstr(SAI->second.first) + << " to 0x" << Twine::utohexstr(ExtraStorage.BumpPtr) + << '\n'); + NewEhFrameAddress = ExtraStorage.BumpPtr; + NewEhFrameOffset = + ExtraStorage.BumpPtr - ExtraStorage.Addr + ExtraStorage.FileOffset; + OLT.mapSectionAddress(ObjectsHandle, + reinterpret_cast(SAI->second.first), + ExtraStorage.BumpPtr); + ExtraStorage.BumpPtr += SAI->second.second; + NewEhFrameContents = + StringRef(reinterpret_cast(SAI->second.first), + SAI->second.second); + } else { + errs() << "FLO: cannot remap .eh_frame\n"; + } + } + if (ExtraStorage.BumpPtr - ExtraStorage.Addr > ExtraStorage.Size) { + errs() << format( + "FLO fatal error: __flo_storage in this binary has not enough free " + "space (required %d bytes, available %d bytes).\n", + ExtraStorage.BumpPtr - ExtraStorage.Addr, ExtraStorage.Size); + exit(1); + } + + OLT.emitAndFinalize(ObjectsHandle); + TempOut->keep(); +} + +namespace { + +// Helper to locate EH_FRAME_HDR segment, specialized for 64-bit LE ELF +bool patchEhFrameHdrSegment(const ELFFile *Obj, raw_pwrite_stream *OS, + uint64_t Offset, uint64_t Addr, uint64_t Size) { + for (const auto &Phdr : Obj->program_headers()) { + if (Phdr.p_type != ELF::PT_GNU_EH_FRAME) + 
continue; + uint64_t OffsetLoc = (uintptr_t)&Phdr.p_offset - (uintptr_t)Obj->base(); + uint64_t VAddrLoc = (uintptr_t)&Phdr.p_vaddr - (uintptr_t)Obj->base(); + uint64_t PAddrLoc = (uintptr_t)&Phdr.p_paddr - (uintptr_t)Obj->base(); + uint64_t FileSzLoc = (uintptr_t)&Phdr.p_filesz - (uintptr_t)Obj->base(); + uint64_t MemSzLoc = (uintptr_t)&Phdr.p_memsz - (uintptr_t)Obj->base(); + char Buffer[8]; + // Update Offset + support::ulittle64_t::ref(Buffer + 0) = Offset; + OS->pwrite(Buffer, 8, OffsetLoc); + support::ulittle64_t::ref(Buffer + 0) = Addr; + OS->pwrite(Buffer, 8, VAddrLoc); + OS->pwrite(Buffer, 8, PAddrLoc); + support::ulittle64_t::ref(Buffer + 0) = Size; + OS->pwrite(Buffer, 8, FileSzLoc); + OS->pwrite(Buffer, 8, MemSzLoc); + return true; + } + return false; +} + +} // anonymous namespace + +void RewriteInstance::rewriteFile() { + // FIXME: is there a less painful way to obtain assembler/writer? + auto MCE = BC->TheTarget->createMCCodeEmitter(*BC->MII, *BC->MRI, *BC->Ctx); + auto MAB = BC->TheTarget->createMCAsmBackend(*BC->MRI, BC->TripleName, ""); + std::unique_ptr Streamer( + BC->TheTarget->createMCObjectStreamer(*BC->TheTriple, + *BC->Ctx, + *MAB, + Out->os(), + MCE, + *BC->STI, + /* RelaxAll */ false, + /* DWARFMustBeAtTheEnd */ false)); + auto &Writer = static_cast(Streamer.get()) + ->getAssembler() + .getWriter(); + + // Print _flo_storage area stats for debug + DEBUG(dbgs() << format("INFO: __flo_storage address = 0x%x file offset = " + "0x%x total size = 0x%x\n", + ExtraStorage.Addr, ExtraStorage.FileOffset, + ExtraStorage.Size)); + + // Overwrite function in the output file. 
+ uint64_t CountOverwrittenFunctions = 0; + uint64_t OverwrittenScore = 0; + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + + if (Function.getImageAddress() == 0 || Function.getImageSize() == 0) + continue; + if (Function.isSplit() && (Function.getColdImageAddress() == 0 || + Function.getColdImageSize() == 0)) + continue; + + if (Function.getImageSize() > Function.getMaxSize()) { + errs() << "FLO-WARNING: new function size (0x" + << Twine::utohexstr(Function.getImageSize()) + << ") is larger than maximum allowed size (0x" + << Twine::utohexstr(Function.getMaxSize()) + << ") for function " << Function.getName() << '\n'; + FailedAddresses.emplace_back(Function.getAddress()); + continue; + } + + OverwrittenScore += Function.getFunctionScore(); + // Overwrite function in the output file. + outs() << "FLO: rewriting function \"" << Function.getName() << "\"\n"; + Out->os().pwrite(reinterpret_cast(Function.getImageAddress()), + Function.getImageSize(), Function.getFileOffset()); + + // Write nops at the end of the function. 
+ auto Pos = Out->os().tell(); + Out->os().seek(Function.getFileOffset() + Function.getImageSize()); + MAB->writeNopData(Function.getMaxSize() - Function.getImageSize(), + &Writer); + Out->os().seek(Pos); + + if (!Function.isSplit()) { + ++CountOverwrittenFunctions; + if (opts::MaxFunctions && + CountOverwrittenFunctions == opts::MaxFunctions) { + outs() << "FLO: maximum number of functions reached\n"; + break; + } + continue; + } + + // Write cold part + outs() << "FLO: rewriting function \"" << Function.getName() + << "\" (cold part)\n"; + Out->os().pwrite(reinterpret_cast(Function.getColdImageAddress()), + Function.getColdImageSize(), Function.getColdFileOffset()); + + ++CountOverwrittenFunctions; + if (opts::MaxFunctions && CountOverwrittenFunctions == opts::MaxFunctions) { + outs() << "FLO: maximum number of functions reached\n"; + break; + } + } + if (NewEhFrameContents.size()) { + outs() << "FLO: writing a new .eh_frame_hdr\n"; + if (FrameHdrAlign > 1) + ExtraStorage.BumpPtr = + (ExtraStorage.BumpPtr + FrameHdrAlign - 1) & ~(FrameHdrAlign - 1); + std::sort(FailedAddresses.begin(), FailedAddresses.end()); + CFIRdWrt->rewriteHeaderFor(NewEhFrameContents, NewEhFrameAddress, + ExtraStorage.BumpPtr, FailedAddresses); + if (ExtraStorage.BumpPtr - ExtraStorage.Addr - ExtraStorage.Size < + FrameHdrCopy.size()) { + errs() << "FLO fatal error: __flo_storage in this binary has not enough " + "free space\n"; + exit(1); + } + + uint64_t HdrFileOffset = + ExtraStorage.BumpPtr - ExtraStorage.Addr + ExtraStorage.FileOffset; + Out->os().pwrite(FrameHdrCopy.data(), FrameHdrCopy.size(), HdrFileOffset); + outs() << "FLO: patching EH_FRAME program segment to reflect new " + ".eh_frame_hdr\n"; + if (auto ELF64LEFile = dyn_cast(File)) { + auto Obj = ELF64LEFile->getELFFile(); + if (!patchEhFrameHdrSegment(Obj, &Out->os(), HdrFileOffset, + ExtraStorage.BumpPtr, FrameHdrCopy.size())) { + outs() << "FAILED to patch program segment!\n"; + } + } else { + outs() << "FLO-ERROR: program 
segment NOT patched -- I don't know how to " + "handle this object file!\n"; + } + outs() << "FLO: writing a new .eh_frame\n"; + Out->os().pwrite(NewEhFrameContents.data(), NewEhFrameContents.size(), + NewEhFrameOffset); + } + + outs() << "FLO: " << CountOverwrittenFunctions + << " out of " << BinaryFunctions.size() + << " functions were overwritten.\n"; + + if (TotalScore != 0) { + double Coverage = OverwrittenScore / (double)TotalScore * 100.0; + outs() << format("FLO: Rewritten functions cover %.2lf", Coverage) + << "% of the execution count of simple functions of this binary.\n"; + } + + // TODO: we should find a way to mark the binary as optimized by us. + Out->keep(); +} diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h new file mode 100644 index 000000000000..862641e20094 --- /dev/null +++ b/bolt/RewriteInstance.h @@ -0,0 +1,142 @@ +//===--- RewriteInstance.h - Interface for machine-level function ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Interface to control an instance of a binary rewriting process. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_FLO_REWRITE_INSTANCE_H +#define LLVM_TOOLS_LLVM_FLO_REWRITE_INSTANCE_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/ObjectFile.h" +#include + +namespace llvm { + +class DWARFContext; +class DWARFFrame; +class SectionMemoryManager; +class tool_output_file; + +namespace flo { + +class BinaryContext; +class BinaryFunction; +class CFIReaderWriter; +class DataReader; + +/// This class encapsulates all data necessary to carry on binary reading, +/// disassembly, CFG building, BB reordering (among other binary-level +/// optimizations) and rewriting. 
It also has the logic to coordinate such +/// events. +class RewriteInstance { +public: + RewriteInstance(llvm::object::ELFObjectFileBase *File, const DataReader &DR); + ~RewriteInstance(); + + /// Run all the necessary steps to read, optimize and rewrite the binary. + void run(); + + /// Populate array of binary functions and file symbols from file symbol + /// table. + void readSymbolTable(); + + /// Read .eh_frame, .eh_frame_hdr and .gcc_except_table sections for exception + /// and stack unwinding information. + void readSpecialSections(); + + /// Disassemble each function in the binary and associate it with a + /// BinaryFunction object, preparing all information necessary for binary + /// optimization. + void disassembleFunctions(); + + /// Run optimizations that operate at the binary, or post-linker, level. + void runOptimizationPasses(); + + /// Write all functions to an intermediary object file, map virtual to real + /// addresses and link this object file, resolving all relocations and + /// performing final relaxation. + void emitFunctions(); + + /// Rewrite back all functions (hopefully optimized) that fit in the original + /// memory footprint for that function. If the function is now larger and does + /// not fit in the binary, reject it and preserve the original version of the + /// function. If we couldn't understand the function for some reason in + /// disassembleFunctions(), also preserve the original version. + void rewriteFile(); + +private: + /// An instance of the input binary we are processing, externally owned. + llvm::object::ELFObjectFileBase *File; + + std::unique_ptr BC; + std::unique_ptr DwCtx; + std::unique_ptr CFIRdWrt; + // Our in-memory intermediary object file where we hold final code for + // rewritten functions. + std::unique_ptr SectionMM; + // Our output file where we mix original code from the input binary and + // optimized code for selected functions. 
+ std::unique_ptr Out; + + /// Represent free space we have in the binary to write extra bytes. This free + /// space is pre-delimited in the binary via a linker script that allocates + /// space and inserts a new symbol __flo_storage in the binary. We also use + /// the symbol __flo_storage_end to delimit the end of the contiguous space in + /// the binary where it is safe for us to write new content. We use this extra + /// space for the following activities: + /// + /// * Writing new .eh_frame entries for functions we changed the layout + /// * Writing a new .eh_frame_hdr to allow us to expand the number of + /// .eh_frame entries (FDEs). Note we also keep the old .eh_frame in the + /// binary instact for functions we don't touch. + /// * Writing cold basic blocks + /// + struct BlobTy { + uint64_t Addr; + uint64_t FileOffset; + uint64_t Size; + uint64_t AddrEnd; + /// BumpPtr is a trivial way to keep track of space utilization in this blob + uint64_t BumpPtr; + }; + BlobTy ExtraStorage{0, 0, 0, 0, 0}; + + /// Store all non-zero symbols in this map for a quick address lookup. + std::map FileSymRefs; + + /// Store all functions seen in the binary, sorted by address. + std::map BinaryFunctions; + + /// Exception handling and stack unwinding information in this binary. + ArrayRef LSDAData; + uint64_t LSDAAddress{0}; + std::vector FrameHdrCopy; + uint64_t FrameHdrAddress{0}; + uint64_t FrameHdrAlign{1}; + const llvm::DWARFFrame *EHFrame{nullptr}; + StringRef NewEhFrameContents; + uint64_t NewEhFrameAddress{0}; + uint64_t NewEhFrameOffset{0}; + + // Keep track of functions we fail to write in the binary. We need to avoid + // rewriting CFI info for these functions. + std::vector FailedAddresses; + + /// Total hotness score according to profiling data for this binary. 
+ uint64_t TotalScore{0}; + +}; + +} // namespace flo +} // namespace llvm + +#endif diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-flo.cpp index e82be39c9dc6..5a336b59bb1e 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-flo.cpp @@ -13,48 +13,15 @@ // //===----------------------------------------------------------------------===// -#include "BinaryBasicBlock.h" -#include "BinaryContext.h" -#include "BinaryFunction.h" #include "DataReader.h" -#include "Exceptions.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/DebugInfo/DWARF/DWARFContext.h" -#include "llvm/ExecutionEngine/Orc/LambdaResolver.h" -#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" -#include "llvm/ExecutionEngine/RTDyldMemoryManager.h" -#include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCDisassembler.h" -#include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCInstrAnalysis.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCObjectFileInfo.h" -#include "llvm/MC/MCObjectStreamer.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSection.h" -#include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Object/ELFObjectFile.h" -#include "llvm/Object/ObjectFile.h" -#include "llvm/Support/Casting.h" +#include "RewriteInstance.h" +#include "llvm/Object/Binary.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/Errc.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Signals.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/ToolOutputFile.h" -#include "llvm/Target/TargetMachine.h" -#include -#include -#include -#include #undef DEBUG_TYPE #define DEBUG_TYPE "flo" @@ -71,106 +38,10 @@ InputFilename(cl::Positional, cl::desc(""), 
cl::Required); static cl::opt InputDataFilename("data", cl::desc(""), cl::Optional); -static cl::opt -OutputFilename("o", cl::desc(""), cl::Required); - -static cl::list -FunctionNames("funcs", - cl::CommaSeparated, - cl::desc("list of functions to optimize"), - cl::value_desc("func1,func2,func3,...")); - -static cl::list -SkipFunctionNames("skip_funcs", - cl::CommaSeparated, - cl::desc("list of functions to skip"), - cl::value_desc("func1,func2,func3,...")); - -static cl::opt -MaxFunctions("max_funcs", - cl::desc("maximum # of functions to overwrite"), - cl::Optional); - -static cl::opt -EliminateUnreachable("eliminate-unreachable", - cl::desc("eliminate unreachable code"), - cl::Optional); - -static cl::opt -SplitFunctions("split-functions", - cl::desc("split functions into hot and cold distinct regions"), - cl::Optional); - -static cl::opt ReorderBlocks( - "reorder-blocks", - cl::desc("redo basic block layout based on profiling data with a specific " - "priority (none, branch-predictor or cache)"), - cl::value_desc("priority"), cl::init("disable")); - static cl::opt DumpData("dump-data", cl::desc("dump parsed flo data and exit (debugging)"), cl::Hidden); -static cl::opt -DumpEHFrame("dump-eh-frame", cl::desc("dump parsed .eh_frame (debugging)"), - cl::Hidden); - -static cl::opt -PrintAll("print-all", cl::desc("print functions after each stage"), - cl::Hidden); - -static cl::opt -PrintCFG("print-cfg", cl::desc("print functions after CFG construction"), - cl::Hidden); - -static cl::opt -PrintUCE("print-uce", - cl::desc("print functions after unreachable code elimination"), - cl::Hidden); - -static cl::opt -PrintDisasm("print-disasm", cl::desc("print function after disassembly"), - cl::Hidden); - -static cl::opt -PrintEHRanges("print-eh-ranges", - cl::desc("print function with updated exception ranges"), - cl::Hidden); - -static cl::opt -PrintReordered("print-reordered", - cl::desc("print functions after layout optimization"), - cl::Hidden); - - -// Check against 
lists of functions from options if we should -// optimize the function with a given name. -bool shouldProcess(StringRef FunctionName) { - bool IsValid = true; - if (!FunctionNames.empty()) { - IsValid = false; - for (auto &Name : FunctionNames) { - if (FunctionName == Name) { - IsValid = true; - break; - } - } - } - if (!IsValid) - return false; - - if (!SkipFunctionNames.empty()) { - for (auto &Name : SkipFunctionNames) { - if (FunctionName == Name) { - IsValid = false; - break; - } - } - } - - return IsValid; -} - } // namespace opts static StringRef ToolName; @@ -181,1058 +52,6 @@ static void report_error(StringRef Message, std::error_code EC) { exit(1); } -static void check_error(std::error_code EC, StringRef Message) { - if (!EC) - return; - report_error(Message, EC); -} - -template -static std::vector singletonSet(T t) { - std::vector Vec; - Vec.push_back(std::move(t)); - return Vec; -} - -/// Class responsible for allocating and managing code and data sections. -class ExecutableFileMemoryManager : public SectionMemoryManager { -public: - - // Keep [section name] -> [allocated address, size] map for later remapping. 
- std::map> SectionAddressInfo; - - ExecutableFileMemoryManager() {} - - uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, - unsigned SectionID, - StringRef SectionName) override { - auto ret = - SectionMemoryManager::allocateCodeSection(Size, Alignment, SectionID, - SectionName); - DEBUG(dbgs() << "FLO: allocating code section : " << SectionName - << " with size " << Size << ", alignment " << Alignment - << " at 0x" << ret << "\n"); - - SectionAddressInfo[SectionName] = {reinterpret_cast(ret), Size}; - - return ret; - } - - uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, - unsigned SectionID, StringRef SectionName, - bool IsReadOnly) override { - DEBUG(dbgs() << "FLO: allocating data section : " << SectionName - << " with size " << Size << ", alignment " - << Alignment << "\n"); - auto ret = SectionMemoryManager::allocateDataSection( - Size, Alignment, SectionID, SectionName, IsReadOnly); - - SectionAddressInfo[SectionName] = {reinterpret_cast(ret), Size}; - - return ret; - } - - // Tell EE that we guarantee we don't need stubs. - bool allowStubAllocation() const override { return false; } - - bool finalizeMemory(std::string *ErrMsg = nullptr) override { - DEBUG(dbgs() << "FLO: finalizeMemory()\n"); - return SectionMemoryManager::finalizeMemory(ErrMsg); - } -}; - -/// Create BinaryContext for a given architecture \p ArchName and -/// triple \p TripleName. -static std::unique_ptr CreateBinaryContext( - std::string ArchName, - std::string TripleName, const DataReader &DR) { - - std::string Error; - - std::unique_ptr TheTriple = llvm::make_unique(TripleName); - const Target *TheTarget = TargetRegistry::lookupTarget(ArchName, - *TheTriple, - Error); - if (!TheTarget) { - errs() << ToolName << ": " << Error; - return nullptr; - } - - std::unique_ptr MRI( - TheTarget->createMCRegInfo(TripleName)); - if (!MRI) { - errs() << "error: no register info for target " << TripleName << "\n"; - return nullptr; - } - - // Set up disassembler. 
- std::unique_ptr AsmInfo( - TheTarget->createMCAsmInfo(*MRI, TripleName)); - if (!AsmInfo) { - errs() << "error: no assembly info for target " << TripleName << "\n"; - return nullptr; - } - - std::unique_ptr STI( - TheTarget->createMCSubtargetInfo(TripleName, "", "")); - if (!STI) { - errs() << "error: no subtarget info for target " << TripleName << "\n"; - return nullptr; - } - - std::unique_ptr MII(TheTarget->createMCInstrInfo()); - if (!MII) { - errs() << "error: no instruction info for target " << TripleName << "\n"; - return nullptr; - } - - std::unique_ptr MOFI = - llvm::make_unique(); - std::unique_ptr Ctx = - llvm::make_unique(AsmInfo.get(), MRI.get(), MOFI.get()); - MOFI->InitMCObjectFileInfo(*TheTriple, Reloc::Default, - CodeModel::Default, *Ctx); - - std::unique_ptr DisAsm( - TheTarget->createMCDisassembler(*STI, *Ctx)); - - if (!DisAsm) { - errs() << "error: no disassembler for target " << TripleName << "\n"; - return nullptr; - } - - std::unique_ptr MIA( - TheTarget->createMCInstrAnalysis(MII.get())); - if (!MIA) { - errs() << "error: failed to create instruction analysis for target" - << TripleName << "\n"; - return nullptr; - } - - int AsmPrinterVariant = AsmInfo->getAssemblerDialect(); - std::unique_ptr InstructionPrinter( - TheTarget->createMCInstPrinter(Triple(TripleName), AsmPrinterVariant, - *AsmInfo, *MII, *MRI)); - if (!InstructionPrinter) { - errs() << "error: no instruction printer for target " << TripleName - << '\n'; - return nullptr; - } - InstructionPrinter->setPrintImmHex(true); - - auto MCE = TheTarget->createMCCodeEmitter(*MII, *MRI, *Ctx); - - auto MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, ""); - - // Make sure we don't miss any output on core dumps. 
- outs().SetUnbuffered(); - errs().SetUnbuffered(); - dbgs().SetUnbuffered(); - - auto BC = - llvm::make_unique(std::move(Ctx), - std::move(TheTriple), - TheTarget, - MCE, - std::move(MOFI), - std::move(AsmInfo), - std::move(MII), - std::move(STI), - std::move(InstructionPrinter), - std::move(MIA), - std::move(MRI), - std::move(DisAsm), - MAB, - DR); - - return BC; -} - -// Helper function to map a random memory address to a file offset. Returns 0 if -// this address cannot be mapped back to the file. -static uint64_t discoverFileOffset(ELFObjectFileBase *File, uint64_t MemAddr) { - for (const auto &Section : File->sections()) { - uint64_t SecAddress = Section.getAddress(); - uint64_t Size = Section.getSize(); - if (MemAddr < SecAddress || - SecAddress + Size <= MemAddr) - continue; - - StringRef SectionContents; - check_error(Section.getContents(SectionContents), - "cannot get section contents"); - uint64_t SecFileOffset = SectionContents.data() - File->getData().data(); - uint64_t MemAddrSecOffset = MemAddr - SecAddress; - return SecFileOffset + MemAddrSecOffset; - } - return 0ULL; -} - -// Helper function to emit the contents of a function via a MCStreamer object. 
-static void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, - BinaryContext &BC, bool EmitColdPart, - bool HasExtraStorage) { - // Define a helper to decode and emit CFI instructions at a given point in a - // BB - auto emitCFIInstr = [&Streamer](MCCFIInstruction &CFIInstr) { - switch (CFIInstr.getOperation()) { - default: - llvm_unreachable("Unexpected instruction"); - case MCCFIInstruction::OpDefCfaOffset: - Streamer.EmitCFIDefCfaOffset(CFIInstr.getOffset()); - break; - case MCCFIInstruction::OpAdjustCfaOffset: - Streamer.EmitCFIAdjustCfaOffset(CFIInstr.getOffset()); - break; - case MCCFIInstruction::OpDefCfa: - Streamer.EmitCFIDefCfa(CFIInstr.getRegister(), CFIInstr.getOffset()); - break; - case MCCFIInstruction::OpDefCfaRegister: - Streamer.EmitCFIDefCfaRegister(CFIInstr.getRegister()); - break; - case MCCFIInstruction::OpOffset: - Streamer.EmitCFIOffset(CFIInstr.getRegister(), CFIInstr.getOffset()); - break; - case MCCFIInstruction::OpRegister: - Streamer.EmitCFIRegister(CFIInstr.getRegister(), - CFIInstr.getRegister2()); - break; - case MCCFIInstruction::OpRelOffset: - Streamer.EmitCFIRelOffset(CFIInstr.getRegister(), CFIInstr.getOffset()); - break; - case MCCFIInstruction::OpUndefined: - Streamer.EmitCFIUndefined(CFIInstr.getRegister()); - break; - case MCCFIInstruction::OpRememberState: - Streamer.EmitCFIRememberState(); - break; - case MCCFIInstruction::OpRestoreState: - Streamer.EmitCFIRestoreState(); - break; - case MCCFIInstruction::OpRestore: - Streamer.EmitCFIRestore(CFIInstr.getRegister()); - break; - case MCCFIInstruction::OpSameValue: - Streamer.EmitCFISameValue(CFIInstr.getRegister()); - break; - } - }; - - // No need for human readability? - // FIXME: what difference does it make in reality? - // Ctx.setUseNamesOnTempLabels(false); - - // Emit function start - - // Each fuction is emmitted into its own section. - MCSectionELF *FunctionSection = - EmitColdPart - ? 
BC.Ctx->getELFSection( - Function.getCodeSectionName().str().append(".cold"), - ELF::SHT_PROGBITS, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC) - : BC.Ctx->getELFSection(Function.getCodeSectionName(), - ELF::SHT_PROGBITS, - ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); - - MCSection *Section = FunctionSection; - Streamer.SwitchSection(Section); - - Streamer.EmitCodeAlignment(Function.getAlignment()); - - if (!EmitColdPart) { - MCSymbol *FunctionSymbol = BC.Ctx->getOrCreateSymbol(Function.getName()); - Streamer.EmitSymbolAttribute(FunctionSymbol, MCSA_ELF_TypeFunction); - Streamer.EmitLabel(FunctionSymbol); - } else { - MCSymbol *FunctionSymbol = - BC.Ctx->getOrCreateSymbol(Twine(Function.getName()).concat(".cold")); - Streamer.EmitSymbolAttribute(FunctionSymbol, MCSA_ELF_TypeFunction); - Streamer.EmitLabel(FunctionSymbol); - } - - // Emit CFI start - if (Function.hasCFI() && HasExtraStorage) { - Streamer.EmitCFIStartProc(/*IsSimple=*/false); - if (Function.getPersonalityFunction() != nullptr) { - Streamer.EmitCFIPersonality(Function.getPersonalityFunction(), - Function.getPersonalityEncoding()); - } - // Emit CFI instructions relative to the CIE - for (auto &CFIInstr : Function.cie()) { - // Ignore these CIE CFI insns because LLVM will already emit this. - switch (CFIInstr.getOperation()) { - default: - break; - case MCCFIInstruction::OpDefCfa: - if (CFIInstr.getRegister() == 7 && CFIInstr.getOffset() == 8) - continue; - break; - case MCCFIInstruction::OpOffset: - if (CFIInstr.getRegister() == 16 && CFIInstr.getOffset() == -8) - continue; - break; - } - emitCFIInstr(CFIInstr); - } - } - - // Emit code. - for (auto BB : Function.layout()) { - if (EmitColdPart != BB->isCold()) - continue; - if (BB->getAlignment() > 1) - Streamer.EmitCodeAlignment(BB->getAlignment()); - Streamer.EmitLabel(BB->getLabel()); - for (const auto &Instr : *BB) { - // Handle pseudo instructions. 
- if (BC.MIA->isEHLabel(Instr)) { - assert(Instr.getNumOperands() == 1 && Instr.getOperand(0).isExpr() && - "bad EH_LABEL instruction"); - auto Label = &(cast(Instr.getOperand(0).getExpr()) - ->getSymbol()); - Streamer.EmitLabel(const_cast(Label)); - continue; - } - if (!BC.MIA->isCFI(Instr)) { - Streamer.EmitInstruction(Instr, *BC.STI); - continue; - } - if (HasExtraStorage) - emitCFIInstr(*Function.getCFIFor(Instr)); - } - } - - // Emit CFI end - if (Function.hasCFI() && HasExtraStorage) - Streamer.EmitCFIEndProc(); - - // TODO: is there any use in emiting end of function? - // Perhaps once we have a support for C++ exceptions. - // auto FunctionEndLabel = Ctx.createTempSymbol("func_end"); - // Streamer.EmitLabel(FunctionEndLabel); - // Streamer.emitELFSize(FunctionSymbol, MCExpr()); -} - -// Helper to locate EH_FRAME_HDR segment, specialized for 64-bit LE ELF -static bool patchEhFrameHdrSegment(const ELFFile *Obj, - raw_pwrite_stream *OS, uint64_t Offset, - uint64_t Addr, uint64_t Size) { - for (const auto &Phdr : Obj->program_headers()) { - if (Phdr.p_type != ELF::PT_GNU_EH_FRAME) - continue; - uint64_t OffsetLoc = (uintptr_t)&Phdr.p_offset - (uintptr_t)Obj->base(); - uint64_t VAddrLoc = (uintptr_t)&Phdr.p_vaddr - (uintptr_t)Obj->base(); - uint64_t PAddrLoc = (uintptr_t)&Phdr.p_paddr - (uintptr_t)Obj->base(); - uint64_t FileSzLoc = (uintptr_t)&Phdr.p_filesz - (uintptr_t)Obj->base(); - uint64_t MemSzLoc = (uintptr_t)&Phdr.p_memsz - (uintptr_t)Obj->base(); - char Buffer[8]; - // Update Offset - support::ulittle64_t::ref(Buffer + 0) = Offset; - OS->pwrite(Buffer, 8, OffsetLoc); - support::ulittle64_t::ref(Buffer + 0) = Addr; - OS->pwrite(Buffer, 8, VAddrLoc); - OS->pwrite(Buffer, 8, PAddrLoc); - support::ulittle64_t::ref(Buffer + 0) = Size; - OS->pwrite(Buffer, 8, FileSzLoc); - OS->pwrite(Buffer, 8, MemSzLoc); - return true; - } - return false; -} - -template static -void OptimizeFile(ELFObjectFile *File, const DataReader &DR) { - - // FIXME: there should be some 
way to extract arch and triple information - // from the file. - std::unique_ptr BC = - std::move(CreateBinaryContext("x86-64", "x86_64-unknown-linux", DR)); - if (!BC) { - errs() << "failed to create a binary context\n"; - return; - } - - // Store all non-zero symbols in this map for a quick address lookup. - std::map FileSymRefs; - - // Entry point to the binary. - // - // Note: this is ELF header entry point, but we could have more entry points - // from constructors etc. - BinaryFunction *EntryPointFunction{nullptr}; - - struct BlobTy { - uint64_t Addr; - uint64_t FileOffset; - uint64_t Size; - uint64_t AddrEnd; - uint64_t BumpPtr; - }; - BlobTy ExtraStorage = {0ULL, 0ULL, 0ULL, 0ULL, 0ULL}; - - // Populate array of binary functions and file symbols - // from file symbol table. - // - // For local symbols we want to keep track of associated FILE symbol for - // disambiguation by name. - std::map BinaryFunctions; - std::string FileSymbolName; - for (const SymbolRef &Symbol : File->symbols()) { - // Keep undefined symbols for pretty printing? - if (Symbol.getFlags() & SymbolRef::SF_Undefined) - continue; - - ErrorOr Name = Symbol.getName(); - check_error(Name.getError(), "cannot get symbol name"); - - if (*Name == "__flo_storage") { - ExtraStorage.Addr = Symbol.getValue(); - ExtraStorage.BumpPtr = ExtraStorage.Addr; - ExtraStorage.FileOffset = discoverFileOffset(File, ExtraStorage.Addr); - assert(ExtraStorage.FileOffset != 0 && "Corrupt __flo_storage symbol"); - - FileSymRefs[ExtraStorage.Addr] = Symbol; - continue; - } - if (*Name == "__flo_storage_end") { - ExtraStorage.AddrEnd = Symbol.getValue(); - continue; - } - - if (Symbol.getType() == SymbolRef::ST_File) { - // Could be used for local symbol disambiguation. 
- FileSymbolName = *Name; - continue; - } - - ErrorOr AddressOrErr = Symbol.getAddress(); - check_error(AddressOrErr.getError(), "cannot get symbol address"); - uint64_t Address = *AddressOrErr; - if (Address == 0) { - if (Symbol.getType() == SymbolRef::ST_Function) - errs() << "FLO-WARNING: function with 0 address seen\n"; - continue; - } - - FileSymRefs[Address] = Symbol; - - // There's nothing horribly wrong with anonymous symbols, but let's - // ignore them for now. - if (Name->empty()) - continue; - - // Disambiguate all local symbols before adding to symbol table. - // Since we don't know if we'll see a global with the same name, - // always modify the local name. - std::string UniqueName; - if (Symbol.getFlags() & SymbolRef::SF_Global) { - assert(BC->GlobalSymbols.find(*Name) == BC->GlobalSymbols.end() && - "global name not unique"); - UniqueName = *Name; - /// It's possible we are seeing a globalized local. LLVM might treat it as - /// local if it has a "private global" prefix, e.g. ".L". Thus we have to - /// change the prefix to enforce global scope of the symbol. - if (StringRef(UniqueName) - .startswith(BC->AsmInfo->getPrivateGlobalPrefix())) - UniqueName = "PG." + UniqueName; - } else { - unsigned LocalCount = 1; - std::string LocalName = (*Name).str() + "/" + FileSymbolName + "/"; - - if ((*Name).startswith(BC->AsmInfo->getPrivateGlobalPrefix())) { - LocalName = "PG." + LocalName; - } - - while (BC->GlobalSymbols.find(LocalName + std::to_string(LocalCount)) != - BC->GlobalSymbols.end()) { - ++LocalCount; - } - UniqueName = LocalName + std::to_string(LocalCount); - } - - // Add the name to global symbols map. - BC->GlobalSymbols[UniqueName] = Address; - - // Add to the reverse map. There could multiple names at the same address. - BC->GlobalAddresses.emplace(std::make_pair(Address, UniqueName)); - - // Only consider ST_Function symbols for functions. 
Although this - // assumption could be broken by assembly functions for which the type - // could be wrong, we skip such entries till the support for - // assembly is implemented. - if (Symbol.getType() != SymbolRef::ST_Function) - continue; - - // TODO: populate address map with PLT entries for better readability. - - // Ignore function with 0 size for now (possibly coming from assembly). - auto SymbolSize = ELFSymbolRef(Symbol).getSize(); - if (SymbolSize == 0) - continue; - - ErrorOr SectionOrErr = Symbol.getSection(); - check_error(SectionOrErr.getError(), "cannot get symbol section"); - section_iterator Section = *SectionOrErr; - if (Section == File->section_end()) { - // Could be an absolute symbol. Could record for pretty printing. - continue; - } - - // Create the function and add to the map. - BinaryFunctions.emplace( - Address, - BinaryFunction(UniqueName, Symbol, *Section, Address, - SymbolSize, *BC) - ); - } - ExtraStorage.Size = ExtraStorage.AddrEnd - ExtraStorage.Addr; - - ArrayRef LSDAData; - uint64_t LSDAAddress{0}; - - // Process special sections. - uint64_t FrameHdrAddress = 0ULL; - uint64_t FrameHdrAlign = 1; - StringRef FrameHdrContents; - for (const auto &Section : File->sections()) { - StringRef SectionName; - check_error(Section.getName(SectionName), "cannot get section name"); - StringRef SectionContents; - check_error(Section.getContents(SectionContents), - "cannot get section contents"); - ArrayRef SectionData( - reinterpret_cast(SectionContents.data()), - Section.getSize()); - - if (SectionName == ".gcc_except_table") { - readLSDA(SectionData, *BC); - LSDAData = SectionData; - LSDAAddress = Section.getAddress(); - } - if (SectionName == ".eh_frame_hdr") { - FrameHdrAddress = Section.getAddress(); - FrameHdrContents = SectionContents; - FrameHdrAlign = Section.getAlignment(); - } - } - - std::vector FrameHdrCopy(FrameHdrContents.begin(), - FrameHdrContents.end()); - // Process debug sections. 
- std::unique_ptr DwCtx(new DWARFContextInMemory(*File)); - const DWARFFrame &EHFrame = *DwCtx->getEHFrame(); - if (opts::DumpEHFrame) { - EHFrame.dump(outs()); - } - CFIReaderWriter CFIRdWrt(EHFrame, FrameHdrAddress, FrameHdrCopy); - if (!EHFrame.ParseError.empty()) { - errs() << "FLO-WARNING: EHFrame reader failed with message \"" - << EHFrame.ParseError << "\"\n"; - } - - // Disassemble every function and build it's control flow graph. - uint64_t TotalScore = 0; - for (auto &BFI : BinaryFunctions) { - BinaryFunction &Function = BFI.second; - - if (!opts::shouldProcess(Function.getName())) { - DEBUG(dbgs() << "FLO: skipping processing function " << Function.getName() - << " per user request.\n"); - continue; - } - - SectionRef Section = Function.getSection(); - assert(Section.containsSymbol(Function.getSymbol()) && - "symbol not in section"); - - // When could it happen? - if (!Section.isText() || Section.isVirtual() || !Section.getSize()) { - DEBUG(dbgs() << "FLO: corresponding section non-executable or empty " - << "for function " << Function.getName()); - continue; - } - - // Set the proper maximum size value after the whole symbol table - // has been processed. - auto SymRefI = FileSymRefs.upper_bound(Function.getAddress()); - if (SymRefI != FileSymRefs.end()) { - auto MaxSize = SymRefI->first - Function.getAddress(); - if (MaxSize < Function.getSize()) { - DEBUG(dbgs() << "FLO: symbol seen in the middle of the function " - << Function.getName() << ". Skipping.\n"); - Function.setSimple(false); - continue; - } - Function.setMaxSize(MaxSize); - } - - StringRef SectionContents; - check_error(Section.getContents(SectionContents), - "cannot get section contents"); - - assert(SectionContents.size() == Section.getSize() && - "section size mismatch"); - - // Function offset from the section start. - auto FunctionOffset = Function.getAddress() - Section.getAddress(); - - // Offset of the function in the file. 
- Function.setFileOffset( - SectionContents.data() - File->getData().data() + FunctionOffset); - - ArrayRef FunctionData( - reinterpret_cast - (SectionContents.data()) + FunctionOffset, - Function.getSize()); - - if (!Function.disassemble(FunctionData)) - continue; - - if (opts::PrintAll || opts::PrintDisasm) - Function.print(errs(), "after disassembly"); - - if (!Function.isSimple()) - continue; - - // Fill in CFI information for this function - if (EHFrame.ParseError.empty() && Function.isSimple()) { - CFIRdWrt.fillCFIInfoFor(Function); - if (Function.getLSDAAddress() != 0) - Function.setSimple(false); - } - - // Parse LSDA. - if (Function.getLSDAAddress() != 0) - Function.parseLSDA(LSDAData, LSDAAddress); - - if (!Function.buildCFG()) - continue; - - if (opts::PrintAll || opts::PrintCFG) - Function.print(errs(), "after building cfg"); - - TotalScore += Function.getFunctionScore(); - - } // Iterate over all functions - - // Run optimization passes. - // - // FIXME: use real optimization passes. - bool NagUser = true; - if (opts::ReorderBlocks != "" && - opts::ReorderBlocks != "disable" && - opts::ReorderBlocks != "none" && - opts::ReorderBlocks != "branch-predictor" && - opts::ReorderBlocks != "cache") { - errs() << ToolName << ": Unrecognized block reordering priority \"" - << opts::ReorderBlocks << "\".\n"; - exit(1); - } - for (auto &BFI : BinaryFunctions) { - auto &Function = BFI.second; - - if (!opts::shouldProcess(Function.getName())) - continue; - - if (!Function.isSimple()) - continue; - - // Detect and eliminate unreachable basic blocks. We could have those - // filled with nops and they are used for alignment. - // - // FIXME: this wouldn't work with C++ exceptions until we implement - // support for those as there will be "invisible" edges - // in the graph. 
- if (opts::EliminateUnreachable && Function.layout_size() > 0) { - if (NagUser) { - outs() - << "FLO-WARNING: Using -eliminate-unreachable is experimental and " - "unsafe for exceptions\n"; - NagUser = false; - } - - std::stack Stack; - std::map Reachable; - BinaryBasicBlock *Entry = *Function.layout_begin(); - Stack.push(Entry); - Reachable[Entry] = true; - // Determine reachable BBs from the entry point - while (!Stack.empty()) { - auto BB = Stack.top(); - Stack.pop(); - for (auto Succ : BB->successors()) { - if (Reachable[Succ]) - continue; - Reachable[Succ] = true; - Stack.push(Succ); - } - } - - auto Count = Function.eraseDeadBBs(Reachable); - if (Count) { - DEBUG(dbgs() << "FLO: Removed " << Count - << " dead basic block(s) in function " - << Function.getName() << '\n'); - } - - if (opts::PrintAll || opts::PrintUCE) - Function.print(errs(), "after unreachable code elimination"); - } - - if (opts::ReorderBlocks != "disable") { - bool ShouldSplit = opts::SplitFunctions && - (Function.getFunctionScore() * 1000) > TotalScore; - if (opts::ReorderBlocks == "branch-predictor") { - BFI.second.optimizeLayout(BinaryFunction::HP_BRANCH_PREDICTOR, - ShouldSplit); - } else if (opts::ReorderBlocks == "cache") { - BFI.second.optimizeLayout(BinaryFunction::HP_CACHE_UTILIZATION, - ShouldSplit); - } else { - BFI.second.optimizeLayout(BinaryFunction::HP_NONE, ShouldSplit); - } - if (opts::PrintAll || opts::PrintReordered) - Function.print(errs(), "after reordering blocks"); - } - - // Post-processing passes. - // FIXME: Check EH handlers correctly in presence of indirect calls - // Function.updateEHRanges(); - // if (opts::PrintAll || opts::PrintEHRanges) { - // Function.print(errs(), "after updating EH ranges"); - // } - - // After optimizations, fix the CFI state - if (!Function.fixCFIState()) - Function.setSimple(false); - } - - std::error_code EC; - - // This is an object file, which we keep for debugging purposes. 
- // Once we decide it's useless, we should create it in memory. - std::unique_ptr Out = - llvm::make_unique(opts::OutputFilename + ".o", - EC, sys::fs::F_None); - check_error(EC, "cannot create output object file"); - - std::unique_ptr RealOut = - llvm::make_unique(opts::OutputFilename, - EC, - sys::fs::F_None, - 0777); - check_error(EC, "cannot create output executable file"); - - // Copy input file. - RealOut->os() << File->getData(); - - std::unique_ptr BOS = - make_unique(Out->os()); - raw_pwrite_stream *OS = BOS.get(); - - // Implicitly MCObjectStreamer takes ownership of MCAsmBackend (MAB) - // and MCCodeEmitter (MCE). ~MCObjectStreamer() will delete these - // two instances. - std::unique_ptr Streamer( - BC->TheTarget->createMCObjectStreamer(*BC->TheTriple, - *BC->Ctx, - *BC->MAB, - *OS, - BC->MCE, - *BC->STI, - /* RelaxAll */ false, - /* DWARFMustBeAtTheEnd */ false)); - - Streamer->InitSections(false); - - bool HasEHFrame = false; - bool NoSpaceWarning = false; - // Output functions one by one. - for (auto &BFI : BinaryFunctions) { - auto &Function = BFI.second; - - if (!Function.isSimple()) - continue; - - if (!opts::shouldProcess(Function.getName())) - continue; - - DEBUG(dbgs() << "FLO: generating code for function \"" << Function.getName() - << "\"\n"); - - if (Function.hasCFI()) { - if (ExtraStorage.Size != 0) - HasEHFrame = true; - else - NoSpaceWarning = true; - } - - emitFunction(*Streamer, Function, *BC.get(), - /*EmitColdPart=*/false, - /*HasExtraStorage=*/ExtraStorage.Size != 0); - - if (Function.isSplit()) - emitFunction(*Streamer, Function, *BC.get(), - /*EmitColdPart=*/true, - /*HasExtraStorage=*/ExtraStorage.Size != 0); - } - if (NoSpaceWarning) { - errs() << "FLO-WARNING: missing __flo_storage in this binary. No " - << "extra space left to allocate the new .eh_frame\n"; - } - - Streamer->Finish(); - - // Get output object as ObjectFile. 
- std::unique_ptr ObjectMemBuffer = - MemoryBuffer::getMemBuffer(BOS->str(), "in-memory object file", false); - ErrorOr> ObjOrErr = - object::ObjectFile::createObjectFile(ObjectMemBuffer->getMemBufferRef()); - check_error(ObjOrErr.getError(), "error creating in-memory object"); - - std::unique_ptr - EFMM(new ExecutableFileMemoryManager()); - - // FIXME: use notifyObjectLoaded() to remap sections. - - DEBUG(dbgs() << "Creating OLT\n"); - // Run ObjectLinkingLayer() with custom memory manager and symbol resolver. - orc::ObjectLinkingLayer<> OLT; - - auto Resolver = orc::createLambdaResolver( - [&](const std::string &Name) { - DEBUG(dbgs() << "FLO: looking for " << Name << "\n"); - auto I = BC->GlobalSymbols.find(Name); - if (I == BC->GlobalSymbols.end()) - return RuntimeDyld::SymbolInfo(nullptr); - return RuntimeDyld::SymbolInfo(I->second, - JITSymbolFlags::None); - }, - [](const std::string &S) { - DEBUG(dbgs() << "FLO: resolving " << S << "\n"); - return nullptr; - } - ); - // FIXME: - auto ObjectsHandle = OLT.addObjectSet( - singletonSet(std::move(ObjOrErr.get())), - EFMM.get(), - //std::move(EFMM), - std::move(Resolver)); - //OLT.takeOwnershipOfBuffers(ObjectsHandle, ); - - // Fow now on, keep track of functions we fail to write in the binary. We need - // to avoid rewriting CFI info for these functions. - std::vector FailedAddresses; - - // Map every function/section current address in memory to that in - // the output binary. 
- for (auto &BFI : BinaryFunctions) { - auto &Function = BFI.second; - if (!Function.isSimple()) - continue; - - auto SAI = EFMM->SectionAddressInfo.find(Function.getCodeSectionName()); - if (SAI != EFMM->SectionAddressInfo.end()) { - DEBUG(dbgs() << "FLO: mapping 0x" << Twine::utohexstr(SAI->second.first) - << " to 0x" << Twine::utohexstr(Function.getAddress()) - << '\n'); - OLT.mapSectionAddress(ObjectsHandle, - reinterpret_cast(SAI->second.first), - Function.getAddress()); - Function.setImageAddress(SAI->second.first); - Function.setImageSize(SAI->second.second); - } else { - errs() << "FLO: cannot remap function " << Function.getName() << "\n"; - FailedAddresses.emplace_back(Function.getAddress()); - } - - if (!Function.isSplit()) - continue; - - SAI = EFMM->SectionAddressInfo.find( - Function.getCodeSectionName().str().append(".cold")); - if (SAI != EFMM->SectionAddressInfo.end()) { - // Align at a 16-byte boundary - ExtraStorage.BumpPtr = (ExtraStorage.BumpPtr + 15) & ~(15ULL); - - DEBUG(dbgs() << "FLO: mapping 0x" << Twine::utohexstr(SAI->second.first) - << " to 0x" << Twine::utohexstr(ExtraStorage.BumpPtr) - << " with size " << Twine::utohexstr(SAI->second.second) - << '\n'); - OLT.mapSectionAddress(ObjectsHandle, - reinterpret_cast(SAI->second.first), - ExtraStorage.BumpPtr); - Function.setColdImageAddress(SAI->second.first); - Function.setColdImageSize(SAI->second.second); - Function.setColdFileOffset(ExtraStorage.BumpPtr - ExtraStorage.Addr + - ExtraStorage.FileOffset); - ExtraStorage.BumpPtr += SAI->second.second; - } else { - errs() << "FLO: cannot remap function " << Function.getName() << "\n"; - FailedAddresses.emplace_back(Function.getAddress()); - } - } - // Map .eh_frame - StringRef NewEhFrameContents; - uint64_t NewEhFrameAddress = 0; - uint64_t NewEhFrameOffset = 0; - if (HasEHFrame) { - auto SAI = EFMM->SectionAddressInfo.find(".eh_frame"); - if (SAI != EFMM->SectionAddressInfo.end()) { - // Align at an 8-byte boundary - ExtraStorage.BumpPtr = 
(ExtraStorage.BumpPtr + 7) & ~(7ULL); - DEBUG(dbgs() << "FLO: mapping 0x" << Twine::utohexstr(SAI->second.first) - << " to 0x" << Twine::utohexstr(ExtraStorage.BumpPtr) - << '\n'); - NewEhFrameAddress = ExtraStorage.BumpPtr; - NewEhFrameOffset = - ExtraStorage.BumpPtr - ExtraStorage.Addr + ExtraStorage.FileOffset; - OLT.mapSectionAddress(ObjectsHandle, - reinterpret_cast(SAI->second.first), - ExtraStorage.BumpPtr); - ExtraStorage.BumpPtr += SAI->second.second; - NewEhFrameContents = - StringRef(reinterpret_cast(SAI->second.first), - SAI->second.second); - } else { - errs() << "FLO: cannot remap .eh_frame\n"; - } - } - if (ExtraStorage.BumpPtr - ExtraStorage.Addr > ExtraStorage.Size) { - errs() << format( - "FLO fatal error: __flo_storage in this binary has not enough free " - "space (required %d bytes, available %d bytes).\n", - ExtraStorage.BumpPtr - ExtraStorage.Addr, ExtraStorage.Size); - exit(1); - } - - OLT.emitAndFinalize(ObjectsHandle); - - // FIXME: is there a less painful way to obtain assembler/writer? - auto &Writer = - static_cast(Streamer.get())->getAssembler().getWriter(); - Writer.setStream(RealOut->os()); - - // Print _flo_storage area stats for debug - DEBUG(dbgs() << format("INFO: __flo_storage address = 0x%x file offset = " - "0x%x total size = 0x%x\n", - ExtraStorage.Addr, ExtraStorage.FileOffset, - ExtraStorage.Size)); - - // Overwrite function in the output file. 
- uint64_t CountOverwrittenFunctions = 0; - uint64_t OverwrittenScore = 0; - for (auto &BFI : BinaryFunctions) { - auto &Function = BFI.second; - - if (Function.getImageAddress() == 0 || Function.getImageSize() == 0) - continue; - if (Function.isSplit() && (Function.getColdImageAddress() == 0 || - Function.getColdImageSize() == 0)) - continue; - - if (Function.getImageSize() > Function.getMaxSize()) { - errs() << "FLO-WARNING: new function size (0x" - << Twine::utohexstr(Function.getImageSize()) - << ") is larger than maximum allowed size (0x" - << Twine::utohexstr(Function.getMaxSize()) - << ") for function " << Function.getName() << '\n'; - FailedAddresses.emplace_back(Function.getAddress()); - continue; - } - - OverwrittenScore += Function.getFunctionScore(); - // Overwrite function in the output file. - outs() << "FLO: rewriting function \"" << Function.getName() << "\"\n"; - RealOut->os().pwrite( - reinterpret_cast(Function.getImageAddress()), - Function.getImageSize(), - Function.getFileOffset()); - - // Write nops at the end of the function. 
- auto Pos = RealOut->os().tell(); - RealOut->os().seek(Function.getFileOffset() + Function.getImageSize()); - BC->MAB->writeNopData(Function.getMaxSize() - Function.getImageSize(), - &Writer); - RealOut->os().seek(Pos); - - if (!Function.isSplit()) { - ++CountOverwrittenFunctions; - if (opts::MaxFunctions && - CountOverwrittenFunctions == opts::MaxFunctions) { - outs() << "FLO: maximum number of functions reached\n"; - break; - } - continue; - } - - // Write cold part - outs() << "FLO: rewriting function \"" << Function.getName() - << "\" (cold part)\n"; - RealOut->os().pwrite( - reinterpret_cast(Function.getColdImageAddress()), - Function.getColdImageSize(), Function.getColdFileOffset()); - - ++CountOverwrittenFunctions; - if (opts::MaxFunctions && CountOverwrittenFunctions == opts::MaxFunctions) { - outs() << "FLO: maximum number of functions reached\n"; - break; - } - } - if (NewEhFrameContents.size()) { - outs() << "FLO: writing a new .eh_frame_hdr\n"; - if (FrameHdrAlign > 1) - ExtraStorage.BumpPtr = - (ExtraStorage.BumpPtr + FrameHdrAlign - 1) & ~(FrameHdrAlign - 1); - if (ExtraStorage.BumpPtr - ExtraStorage.Addr - ExtraStorage.Size < - FrameHdrContents.size()) { - errs() << "FLO fatal error: __flo_storage in this binary has not enough " - "free space\n"; - exit(1); - } - std::sort(FailedAddresses.begin(), FailedAddresses.end()); - CFIRdWrt.rewriteHeaderFor(NewEhFrameContents, NewEhFrameAddress, - ExtraStorage.BumpPtr, FailedAddresses); - uint64_t HdrFileOffset = - ExtraStorage.BumpPtr - ExtraStorage.Addr + ExtraStorage.FileOffset; - RealOut->os().pwrite(FrameHdrCopy.data(), FrameHdrCopy.size(), - HdrFileOffset); - outs() << "FLO: patching EH_FRAME program segment to reflect new " - ".eh_frame_hdr\n"; - auto Obj = File->getELFFile(); - if (!patchEhFrameHdrSegment(Obj, &RealOut->os(), HdrFileOffset, - ExtraStorage.BumpPtr, FrameHdrCopy.size())) { - outs() << "FAILED to patch program segment!\n"; - } - outs() << "FLO: writing a new .eh_frame\n"; - 
RealOut->os().pwrite(NewEhFrameContents.data(), NewEhFrameContents.size(), - NewEhFrameOffset); - } - - if (EntryPointFunction) { - DEBUG(dbgs() << "FLO: entry point function is " - << EntryPointFunction->getName() << '\n'); - } else { - DEBUG(dbgs() << "FLO: no entry point function was set\n"); - } - - outs() << "FLO: " << CountOverwrittenFunctions - << " out of " << BinaryFunctions.size() - << " functions were overwritten.\n"; - - if (TotalScore != 0) { - double Coverage = OverwrittenScore / (double)TotalScore * 100.0; - outs() << format("FLO: Rewritten functions cover %.2lf", Coverage) - << "% of the execution count of simple functions of this binary.\n"; - } - - // TODO: we should find a way to mark the binary as optimized by us. - - Out->keep(); - RealOut->keep(); -} - int main(int argc, char **argv) { // Print a stack trace if we signal out. sys::PrintStackTraceOnErrorSignal(); @@ -1283,8 +102,9 @@ int main(int argc, char **argv) { report_error(opts::InputFilename, EC); Binary &Binary = *BinaryOrErr.get().getBinary(); - if (auto *e = dyn_cast(&Binary)) { - OptimizeFile(e, *DR.get()); + if (auto *e = dyn_cast(&Binary)) { + RewriteInstance RI(e, *DR.get()); + RI.run(); } else { report_error(opts::InputFilename, object_error::invalid_file_type); } From 0b9e70b59dd2f5ce9becf91c6ddf135576a8aaa4 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 24 Nov 2015 09:29:41 -0800 Subject: [PATCH 050/904] Change function splitting to be a two-pass process Summary: This patch builds upon the previous patch to create a two-pass process to function splitting. We first perform the full rewriting pipeline to discover which functions need splitting. Afterwards, we restart the pipeline with those functions annotated to be split. 
(cherry picked from commit 167488fb952ad076f2f0905cd0182ff051f01368) --- bolt/RewriteInstance.cpp | 47 ++++++++++++++++++++++++++++++++++++++-- bolt/RewriteInstance.h | 15 +++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index fb17584f6034..eabdf1adb279 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -330,6 +330,20 @@ RewriteInstance::RewriteInstance(ELFObjectFileBase *File, RewriteInstance::~RewriteInstance() {} +void RewriteInstance::reset() { + BinaryFunctions.clear(); + FileSymRefs.clear(); + auto &DR = BC->DR; + BC = CreateBinaryContext("x86-64", "x86_64-unknown-linux", DR); + DwCtx.reset(new DWARFContextInMemory(*File)); + CFIRdWrt.reset(nullptr); + SectionMM.reset(nullptr); + Out.reset(nullptr); + EHFrame = nullptr; + FailedAddresses.clear(); + TotalScore = 0; +} + void RewriteInstance::run() { if (!BC) { errs() << "failed to create a binary context\n"; @@ -342,6 +356,17 @@ void RewriteInstance::run() { runOptimizationPasses(); emitFunctions(); + if (opts::SplitFunctions && splitLargeFunctions()) { + // Emit again because now some functions have been split + outs() << "FLO: split-functions: starting pass 2...\n"; + reset(); + readSymbolTable(); + readSpecialSections(); + disassembleFunctions(); + runOptimizationPasses(); + emitFunctions(); + } + // Copy input file to output std::error_code EC; Out = llvm::make_unique(opts::OutputFilename, EC, @@ -689,8 +714,8 @@ void RewriteInstance::runOptimizationPasses() { } if (opts::ReorderBlocks != "disable") { - bool ShouldSplit = opts::SplitFunctions && - (Function.getFunctionScore() * 1000) > TotalScore; + bool ShouldSplit = ToSplit.find(BFI.first) != ToSplit.end(); + if (opts::ReorderBlocks == "branch-predictor") { BFI.second.optimizeLayout(BinaryFunction::HP_BRANCH_PREDICTOR, ShouldSplit); @@ -1063,6 +1088,24 @@ void RewriteInstance::emitFunctions() { TempOut->keep(); } +bool 
RewriteInstance::splitLargeFunctions() { + bool Changed = false; + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + + // Ignore this function if we failed to map it to the output binary + if (Function.getImageAddress() == 0 || Function.getImageSize() == 0) + continue; + + if (Function.getImageSize() <= Function.getMaxSize()) + continue; + + ToSplit.insert(BFI.first); + Changed = true; + } + return Changed; +} + namespace { // Helper to locate EH_FRAME_HDR segment, specialized for 64-bit LE ELF diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 862641e20094..037b23b3f09d 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -18,6 +18,7 @@ #include "llvm/Object/ELFObjectFile.h" #include "llvm/Object/ObjectFile.h" #include +#include namespace llvm { @@ -42,6 +43,10 @@ class RewriteInstance { RewriteInstance(llvm::object::ELFObjectFileBase *File, const DataReader &DR); ~RewriteInstance(); + /// Reset all state except for split hints. Used to run a second pass with + /// function splitting information. + void reset(); + /// Run all the necessary steps to read, optimize and rewrite the binary. void run(); @@ -66,6 +71,13 @@ class RewriteInstance { /// performing final relaxation. void emitFunctions(); + /// Check which functions became larger than their original version and + /// annotate function splitting information. + /// + /// Returns true if any function was annotated, requiring us to perform a + /// second pass to emit those functions in two parts. + bool splitLargeFunctions(); + /// Rewrite back all functions (hopefully optimized) that fit in the original /// memory footprint for that function. If the function is now larger and does /// not fit in the binary, reject it and preserve the original version of the @@ -131,6 +143,9 @@ class RewriteInstance { // rewriting CFI info for these functions. std::vector FailedAddresses; + // Keep track of which functions to split in a second pass. 
+ std::set ToSplit; + /// Total hotness score according to profiling data for this binary. uint64_t TotalScore{0}; From a188e3a9d1236a0366c921bb26e16022bbcb7a16 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 24 Nov 2015 13:55:44 -0800 Subject: [PATCH 051/904] Don't choke on DW_CFA_def_cfa_expression and friends Summary: Our CFI parser in the LLVM library was giving up on parsing all CFI instructions when finding a single instruction with expression operands. Yet, all gcc-4.9 binaries seem to have at least one CFI instruction with expression operands (DW_CFA_def_cfa_expression). This patch fixes this and makes DebugInfo continue to parse other instructions, even though it does not completely parse DWARF expressions yet. However, this seems to be enough to allow llvm-flo to process gcc-4.9 binaries because the FDEs with DWARF expressions are linked to the PLT region, and not to functions that we process. If we ever try to read a function whose CFI depends on DWARF expression, which is unlikely, llvm-flo will assert. 
(cherry picked from commit 77f7faa5300f282e8f390ce597bda3e18e4f3196) --- bolt/Exceptions.cpp | 2 -- bolt/RewriteInstance.cpp | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 41033dd48f4a..241470b2d60d 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -657,8 +657,6 @@ void CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { case DW_CFA_val_expression: llvm_unreachable("DWARF CFA expressions unimplemented"); break; - dbgs() << "DW_CFA_val_expression"; - break; case DW_CFA_MIPS_advance_loc8: llvm_unreachable("DW_CFA_MIPS_advance_loc unimplemented"); break; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index eabdf1adb279..9dc7ac8597c9 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -556,8 +556,9 @@ void RewriteInstance::readSpecialSections() { } CFIRdWrt.reset(new CFIReaderWriter(*EHFrame, FrameHdrAddress, FrameHdrCopy)); if (!EHFrame->ParseError.empty()) { - errs() << "FLO-WARNING: EHFrame reader failed with message \"" + errs() << "FLO-ERROR: EHFrame reader failed with message \"" << EHFrame->ParseError << "\"\n"; + exit(1); } } From 8854f5b5b8998e7dfbd1f64e8b7679bf9a34f96a Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Thu, 3 Dec 2015 09:45:18 -0800 Subject: [PATCH 052/904] Turns off basic block alignment by default Summary: We found out that the insertion of extra nops to preserve alignment of some loop bodies do not pay off the increased function size, since this extra size may inhibit us from rewriting a reordered version of this function. 
(cherry picked from commit b2ff1f8ef4319cc7b2d5e316b3399c539e00a27f) --- bolt/RewriteInstance.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 9dc7ac8597c9..3e8a8483cd05 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -95,6 +95,10 @@ static cl::opt ReorderBlocks( "priority (none, branch-predictor or cache)"), cl::value_desc("priority"), cl::init("disable")); +static cl::opt AlignBlocks("align-blocks", + cl::desc("try to align BBs inserting nops"), + cl::Optional); + static cl::opt DumpEHFrame("dump-eh-frame", cl::desc("dump parsed .eh_frame (debugging)"), cl::Hidden); @@ -856,7 +860,7 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, for (auto BB : Function.layout()) { if (EmitColdPart != BB->isCold()) continue; - if (BB->getAlignment() > 1) + if (opts::AlignBlocks && BB->getAlignment() > 1) Streamer.EmitCodeAlignment(BB->getAlignment()); Streamer.EmitLabel(BB->getLabel()); for (const auto &Instr : *BB) { From 8edf4e9da014aa514f82ae974220b8d5fcc85247 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Thu, 3 Dec 2015 13:29:52 -0800 Subject: [PATCH 053/904] Don't touch functions whose internal BBs are targets of interprocedural branches Summary: In a test binary, we found 8 cases where code in a function A would jump to the middle of another function B. In this case, we cannot reorder function B because this would change instruction offsets and break the program. This is pretty rare but can happen in code written in assembly. 
(cherry picked from commit 2e6bd18ca4766c43aef68226afcf3fddd7d753bd) --- bolt/BinaryContext.h | 4 ++++ bolt/BinaryFunction.cpp | 6 +----- bolt/RewriteInstance.cpp | 19 +++++++++++++++++++ 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 0bfddceef431..6f20615d5db4 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -30,6 +30,7 @@ #include "llvm/Support/TargetRegistry.h" #include #include +#include #include #include @@ -51,6 +52,9 @@ class BinaryContext { // [address] -> [name1], [name2], ... std::multimap GlobalAddresses; + // Set of addresses we cannot relocate because we have a direct branch to it. + std::set InterproceduralBranchTargets; + std::unique_ptr Ctx; std::unique_ptr TheTriple; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index ea119a220562..9098cff69cc5 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -357,7 +357,6 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { << Twine::utohexstr(AbsoluteInstrAddr) << " in function " << getName() << "\n"); IsSimple = false; - break; } } @@ -374,6 +373,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { TargetSymbol = LI->second; } } else { + BC.InterproceduralBranchTargets.insert(InstructionTarget); if (!IsCall && Size == 2) { errs() << "FLO-WARNING: relaxed tail call detected at 0x" << Twine::utohexstr(AbsoluteInstrAddr) @@ -394,7 +394,6 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { errs() << "FLO-WARNING: Function \":" << getName() << "\" has call to address zero. Ignoring it.\n"; IsSimple = false; - break; } } } @@ -419,13 +418,11 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // latter case. if (MIA->isIndirectBranch(Instruction)) { IsSimple = false; - break; } // Indirect call. 
We only need to fix it if the operand is RIP-relative if (MIA->hasRIPOperand(Instruction)) { if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { IsSimple = false; - break; } } } @@ -433,7 +430,6 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { if (MIA->hasRIPOperand(Instruction)) { if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { IsSimple = false; - break; } } } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 3e8a8483cd05..27e5ebfde62d 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -651,6 +651,25 @@ void RewriteInstance::disassembleFunctions() { TotalScore += Function.getFunctionScore(); } // Iterate over all functions + + // Mark all functions with internal addresses serving as interprocedural + // branch targets as not simple -- pretty rare but can happen in code + // written in assembly. + // TODO: #9301815 + for (auto Addr : BC->InterproceduralBranchTargets) { + // Check if this address is internal to some function we are reordering + auto I = BinaryFunctions.upper_bound(Addr); + if (I == BinaryFunctions.begin()) + continue; + BinaryFunction &Func = (--I)->second; + uint64_t Offset = Addr - I->first; + if (Offset == 0 || Offset >= Func.getSize()) + continue; + errs() << "FLO-WARNING: Function " << Func.getName() + << " has internal BBs that are target of a branch located in " + "another function. We will not process this function.\n"; + Func.setSimple(false); + } } void RewriteInstance::runOptimizationPasses() { From 52bed5a1d925d34f09d7d9f384b0e946227cc5b5 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 15 Dec 2015 17:06:27 -0800 Subject: [PATCH 054/904] Turn EH ranges support back on. Summary: Changed the way EH info is stored/extracted from call instruction. Make sure indirect calls work. 
(cherry picked from commit adce1a20d82a10771125ae86c792f1baad9e60ff) --- bolt/BinaryFunction.cpp | 21 +++++++++++---------- bolt/BinaryFunction.h | 3 +++ bolt/Exceptions.cpp | 23 ++++++++++------------- bolt/RewriteInstance.cpp | 19 +++++++++++-------- 4 files changed, 35 insertions(+), 31 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 9098cff69cc5..412ce06222cd 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -142,16 +142,17 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, if (BC.MIA->isCall(Instruction)) { if (BC.MIA->isTailCall(Instruction)) OS << " # TAILCALL "; - // FIXME: Print EH handlers correctly in presence of indirect calls -// if (Instruction.getNumOperands() > 1) { -// OS << " # handler: "; -// if (Instruction.getOperand(1).isExpr()) -// OS << cast(Instruction.getOperand(1).getExpr())-> -// getSymbol(); -// else -// OS << '0'; -// OS << "; action: " << Instruction.getOperand(2).getImm(); -// } + if (BC.MIA->isInvoke(Instruction)) { + const MCSymbol *LP; + uint64_t Action; + std::tie(LP, Action) = BC.MIA->getEHInfo(Instruction); + OS << " # handler: "; + if (LP) + OS << *LP; + else + OS << '0'; + OS << "; action: " << Action; + } } OS << "\n"; // In case we need MCInst printer: diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 7f996bf51c49..05f7286d5421 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -650,6 +650,9 @@ class BinaryFunction { /// Update exception handling ranges for the function. void updateEHRanges(); + /// Return true if the function has exception handling tables. 
+ bool hasEHRanges() const { return !CallSites.empty(); } + virtual ~BinaryFunction() {} }; diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 241470b2d60d..ce103df5b6e9 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -126,14 +126,15 @@ void readLSDA(ArrayRef LSDAData, BinaryContext &BC) { errs() << "TType End = " << TTypeEnd << '\n'; } - // Table to store list of indices in type table. Entries are uleb128s values. + // Table to store list of indices in type table. Entries are uleb128 values. auto TypeIndexTableStart = Ptr + TTypeEnd; // Offset past the last decoded index. intptr_t MaxTypeIndexTableOffset = 0; - // The actual type info table starts at the same location, but grows in - // different direction. Encoding is different too (TTypeEncoding). + // The actual type info table starts at the same location as index table, + // but grows in a different direction. It also uses a different encoding - + // specified by TTypeEncoding. auto TypeTableStart = reinterpret_cast(Ptr + TTypeEnd); uint8_t CallSiteEncoding = *Ptr++; @@ -254,7 +255,7 @@ void readLSDA(ArrayRef LSDAData, BinaryContext &BC) { void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, uint64_t LSDASectionAddress) { - assert(CurrentState == State::Disassembled && "unexpecrted function state"); + assert(CurrentState == State::Disassembled && "unexpected function state"); if (!getLSDAAddress()) return; @@ -286,7 +287,7 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, errs() << "TType End = " << TTypeEnd << '\n'; } - // Table to store list of indices in type table. Entries are uleb128s values. + // Table to store list of indices in type table. Entries are uleb128 values. auto TypeIndexTableStart = Ptr + TTypeEnd; // Offset past the last decoded index. @@ -453,20 +454,16 @@ void BinaryFunction::updateEHRanges() { continue; // Instruction can throw an exception that should be handled. 
- bool Throws = Instr.getNumOperands() > 1; + bool Throws = BC.MIA->isInvoke(Instr); // Ignore the call if it's a continuation of a no-throw gap. if (!Throws && !StartRange) continue; // Extract exception handling information from the instruction. - const MCSymbol *LP = - Throws ? (Instr.getOperand(1).isExpr() - ? &(cast( - Instr.getOperand(1).getExpr())->getSymbol()) - : nullptr) - : nullptr; - uint64_t Action = Throws ? Instr.getOperand(2).getImm() : 0; + const MCSymbol *LP = nullptr; + uint64_t Action = 0; + std::tie(LP, Action) = BC.MIA->getEHInfo(Instr); // No action if the exception handler has not changed. if (Throws && diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 27e5ebfde62d..4035e46062e8 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -634,8 +634,6 @@ void RewriteInstance::disassembleFunctions() { // Fill in CFI information for this function if (EHFrame->ParseError.empty() && Function.isSimple()) { CFIRdWrt->fillCFIInfoFor(Function); - if (Function.getLSDAAddress() != 0) - Function.setSimple(false); } // Parse LSDA. @@ -754,13 +752,18 @@ void RewriteInstance::runOptimizationPasses() { } // Post-processing passes. - // FIXME: Check EH handlers correctly in presence of indirect calls - // Function.updateEHRanges(); - // if (opts::PrintAll || opts::PrintEHRanges) { - // Function.print(errs(), "after updating EH ranges"); - // } - // After optimizations, fix the CFI state + // Update exception handling information. + Function.updateEHRanges(); + if (opts::PrintAll || opts::PrintEHRanges) { + Function.print(errs(), "after updating EH ranges"); + } + + // TODO: add complete EH ranges support. + if (Function.hasEHRanges()) + Function.setSimple(false); + + // Fix the CFI state. 
if (!Function.fixCFIState()) Function.setSimple(false); } From 498097743e29025f4b1d593b5f9982b704f6009a Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 16 Dec 2015 17:56:49 -0800 Subject: [PATCH 055/904] Ignore functions referencing symbol at 0x0. Summary: Binary code could be weird. It could include calls to address 0 and reference data at 0 (e.g. with lea on x86). LLVM JIT fatals while resolving relocations against symbols at address 0x0. For now we will stop emitting such code, i.e. we'll skip functions. (cherry picked from commit 7f5535ca39a7ed8b39ca26148545f040bff74b94) --- bolt/BinaryFunction.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 412ce06222cd..030f6b7ca54a 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -75,6 +75,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, << "\n Offset : 0x" << Twine::utohexstr(FileOffset) << "\n Section : " << SectionName << "\n Orc Section : " << getCodeSectionName() + << "\n LSDA : 0x" << Twine::utohexstr(getLSDAAddress()) << "\n IsSimple : " << IsSimple << "\n IsSplit : " << IsSplit << "\n BB Count : " << BasicBlocksLayout.size(); @@ -289,7 +290,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { Labels[0] = Ctx->createTempSymbol("BB0", false); auto handleRIPOperand = - [this](MCInst &Instruction, uint64_t Address, uint64_t Size) -> bool { + [&](MCInst &Instruction, uint64_t Address, uint64_t Size) -> bool { uint64_t TargetAddress{0}; MCSymbol *TargetSymbol{nullptr}; if (!BC.MIA->evaluateRIPOperand(Instruction, Address, Size, @@ -302,6 +303,11 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { return false; } // FIXME: check that the address is in data, not in code. + if (TargetAddress == 0) { + errs() << "FLO-WARNING: rip-relative operand is zero in function " + << getName() << ". 
Ignoring function.\n"; + return false; + } TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, "DATAat"); BC.MIA->replaceRIPOperandDisp( Instruction, MCOperand::createExpr(MCSymbolRefExpr::create( @@ -392,8 +398,12 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { TargetSymbol = BC.getOrCreateGlobalSymbol(InstructionTarget, "FUNCat"); if (InstructionTarget == 0) { - errs() << "FLO-WARNING: Function \":" << getName() - << "\" has call to address zero. Ignoring it.\n"; + // We actually see calls to address 0 because of the weak symbols + // from the libraries. In reality more often than not it is + // unreachable code, but we don't know it and have to emit calls + // to 0 which make LLVM JIT unhappy. + errs() << "FLO-WARNING: Function " << getName() + << " has a call to address zero. Ignoring function.\n"; IsSimple = false; } } From 81d7fcc241619df3f72238f618808a91bc0efdad Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 17 Dec 2015 12:59:15 -0800 Subject: [PATCH 056/904] Code/comments cleanup. Summary: Consolidate cold function info under cold FragmentInfo. Minor code and comment mods to LSDA handling. (cherry picked from commit 9f846722468d724455e5689452fbd48c9c88eded) --- bolt/BinaryFunction.h | 78 ++++++------ bolt/Exceptions.cpp | 250 +++++++++------------------------------ bolt/RewriteInstance.cpp | 33 +++--- 3 files changed, 114 insertions(+), 247 deletions(-) diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 05f7286d5421..1db0ec0662a5 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -80,7 +80,7 @@ class BinaryFunction { /// Name of the function as we know it. std::string Name; - /// Symbol associated with this function. + /// Symbol associated with this function in the input. SymbolRef Symbol; /// Containing section @@ -95,7 +95,6 @@ class BinaryFunction { /// Offset in the file. uint64_t FileOffset{0}; - uint64_t ColdFileOffset{0}; /// Maximum size this function is allowed to have. 
uint64_t MaxSize{std::numeric_limits::max()}; @@ -114,11 +113,9 @@ class BinaryFunction { /// The address for the code for this function in codegen memory. uint64_t ImageAddress{0}; - uint64_t ColdImageAddress{0}; /// The size of the code in memory. uint64_t ImageSize{0}; - uint64_t ColdImageSize{0}; /// Name for the section this function code should reside in. std::string CodeSectionName; @@ -134,6 +131,9 @@ class BinaryFunction { /// function' LSDA (exception handling). ArrayRef LSDATables; + /// Offset into LSDATables where type tables start. + uint64_t LSDATablesTypeOffset{0}; + /// Original LSDA address for the function. uint64_t LSDAAddress{0}; @@ -240,6 +240,9 @@ class BinaryFunction { // This vector is indexed by BB index. std::vector BBCFIState; + /// Symbol in the output. + const MCSymbol *OutputSymbol; + public: typedef BasicBlockListType::iterator iterator; @@ -321,11 +324,6 @@ class BinaryFunction { return Name; } - /// Return symbol associated with the function start. - SymbolRef getSymbol() const { - return Symbol; - } - /// Return containing file section. SectionRef getSection() const { return Section; @@ -341,10 +339,6 @@ class BinaryFunction { return FileOffset; } - uint64_t getColdFileOffset() const { - return ColdFileOffset; - } - /// Return (original) size of the function. uint64_t getSize() const { return Size; @@ -355,6 +349,11 @@ class BinaryFunction { return MaxSize; } + /// Return MC symbol associtated with the function in the output object. + const MCSymbol *getOutputSymbol() const { + return OutputSymbol; + } + /// Return internal section name for this function. 
StringRef getCodeSectionName() const { assert(!CodeSectionName.empty() && "no section name for function"); @@ -492,13 +491,13 @@ class BinaryFunction { return *this; } - BinaryFunction &setColdFileOffset(uint64_t Offset) { - ColdFileOffset = Offset; + BinaryFunction &setMaxSize(uint64_t Size) { + MaxSize = Size; return *this; } - BinaryFunction &setMaxSize(uint64_t Size) { - MaxSize = Size; + BinaryFunction &setOutputSymbol(const MCSymbol *Symbol) { + OutputSymbol = Symbol; return *this; } @@ -531,39 +530,21 @@ class BinaryFunction { return *this; } - BinaryFunction &setColdImageAddress(uint64_t Address) { - ColdImageAddress = Address; - return *this; - } - /// Return the address of this function' image in memory. uint64_t getImageAddress() const { return ImageAddress; } - uint64_t getColdImageAddress() const { - return ColdImageAddress; - } - BinaryFunction &setImageSize(uint64_t Size) { ImageSize = Size; return *this; } - BinaryFunction &setColdImageSize(uint64_t Size) { - ColdImageSize = Size; - return *this; - } - /// Return the size of this function' image in memory. uint64_t getImageSize() const { return ImageSize; } - uint64_t getColdImageSize() const { - return ColdImageSize; - } - /// Set the profile data for the number of times the function was called. BinaryFunction &setExecutionCount(uint64_t Count) { ExecutionCount = Count; @@ -653,7 +634,34 @@ class BinaryFunction { /// Return true if the function has exception handling tables. bool hasEHRanges() const { return !CallSites.empty(); } + /// Emit exception handling ranges for the function. + void emitLSDA(); + virtual ~BinaryFunction() {} + + /// Info for fragmented functions. 
+ class FragmentInfo { + private: + uint64_t ImageAddress{0}; + uint64_t ImageSize{0}; + uint64_t FileOffset{0}; + const MCSymbol *OutputSymbol{nullptr}; + public: + uint64_t getImageAddress() const { return ImageAddress; } + uint64_t getImageSize() const { return ImageSize; } + uint64_t getFileOffset() const { return FileOffset; } + const MCSymbol *getOutputSymbol() const { return OutputSymbol; } + + void setImageAddress(uint64_t Address) { ImageAddress = Address; } + void setImageSize(uint64_t Size) { ImageSize = Size; } + void setFileOffset(uint64_t Offset) { FileOffset = Offset; } + void setOutputSymbol(const MCSymbol *Symbol) { OutputSymbol = Symbol; } + }; + + /// Cold fragment of the function. + FragmentInfo ColdFragment; + + FragmentInfo &cold() { return ColdFragment; } }; inline raw_ostream &operator<<(raw_ostream &OS, diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index ce103df5b6e9..f6cfef135e57 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -28,10 +28,6 @@ #undef DEBUG_TYPE #define DEBUG_TYPE "flo-exceptions" -STATISTIC(NumLSDAs, "Number of all LSDAs"); -STATISTIC(NumTrivialLSDAs, - "Number of LSDAs with single call site without landing pad or action"); - using namespace llvm::dwarf; namespace llvm { @@ -46,30 +42,29 @@ PrintExceptions("print-exceptions", } // namespace opts -// readLSDA is reading and dumping the whole .gcc_exception_table section -// at once. +// Read and dump the .gcc_exception_table section entry. // -// .gcc_except_table section contains a set of Language-Specific Data Areas -// which are basically exception handling tables. One LSDA per function. -// One important observation - you can't actually tell which function LSDA -// refers to, and most addresses are relative to the function start. So you -// have to start with parsing .eh_frame entries that refers to LSDA to obtain -// a function context. 
+// .gcc_except_table section contains a set of Language-Specific Data Areas - +// a fancy name for exception handling tables. There's one LSDA entry per +// function. However, we can't actually tell which function LSDA refers to +// unless we parse .eh_frame entry that refers to the LSDA. +// Then inside LSDA most addresses are encoded relative to the function start, +// so we need the function context in order to get to real addresses. // -// The best visual representation of the tables comprising LSDA and relationship -// between them is illustrated at: +// The best visual representation of the tables comprising LSDA and +// relationships between them is illustrated at: // http://mentorembedded.github.io/cxx-abi/exceptions.pdf // Keep in mind that GCC implementation deviates slightly from that document. // // To summarize, there are 4 tables in LSDA: call site table, actions table, -// types table, and types index table (indirection). The main table contains -// call site entries. Each call site includes a range that can throw an exception, -// a handler (landing pad), and a reference to an entry in the action table. -// A handler and/or action could be 0. An action entry is in fact a head -// of a list of actions associated with a call site and an action table contains -// all such lists (it could be optimize to share list tails). Each action could be -// either to catch an exception of a given type, to perform a cleanup, or to -// propagate an exception after filtering it out (e.g. to make sure function +// types table, and types index table (for indirection). The main table contains +// call site entries. Each call site includes a PC range that can throw an +// exception, a handler (landing pad), and a reference to an entry in the action +// table. The handler and/or action could be 0. The action entry is a head +// of a list of actions associated with a call site. The action table contains +// all such lists (it could be optimized to share list tails). 
Each action could +// be either to catch an exception of a given type, to perform a cleanup, or to +// propagate the exception after filtering it out (e.g. to make sure function // exception specification is not violated). Catch action contains a reference // to an entry in the type table, and filter action refers to an entry in the // type index table to encode a set of types to filter. @@ -82,177 +77,20 @@ PrintExceptions("print-exceptions", // these tables is encoded in LSDA header. Sizes for both of the tables are not // included anywhere. // -// For the purpose of rewriting exception handling tables, we can reuse action -// table, types table, and type index table in a binary format when type -// references are hard-coded absolute addresses. We still have to parse all the -// table to determine their size. We have to parse call site table and associate -// discovered information with actual call instructions and landing pad blocks. -void readLSDA(ArrayRef LSDAData, BinaryContext &BC) { - const uint8_t *Ptr = LSDAData.data(); - - while (Ptr < LSDAData.data() + LSDAData.size()) { - uint8_t LPStartEncoding = *Ptr++; - // Some of LSDAs are aligned while other are not. We use the hack below - // to work around 0-filled alignment. However it could also mean - // DW_EH_PE_absptr format. - // - // FIXME: the proper way to parse these tables is to get the pointer - // from .eh_frame and parse one entry at a time. 
- while (!LPStartEncoding) - LPStartEncoding = *Ptr++; - if (opts::PrintExceptions) { - errs() << "[LSDA at 0x" - << Twine::utohexstr(reinterpret_cast(Ptr-1)) << "]:\n"; - } - - ++NumLSDAs; - bool IsTrivial = true; - - uintptr_t LPStart = 0; - if (LPStartEncoding != DW_EH_PE_omit) { - LPStart = readEncodedPointer(Ptr, LPStartEncoding); - } - - uint8_t TTypeEncoding = *Ptr++; - uintptr_t TTypeEnd = 0; - if (TTypeEncoding != DW_EH_PE_omit) { - TTypeEnd = readULEB128(Ptr); - } - - if (opts::PrintExceptions) { - errs() << "LPStart Encoding = " << (unsigned)LPStartEncoding << '\n'; - errs() << "LPStart = 0x" << Twine::utohexstr(LPStart) << '\n'; - errs() << "TType Encoding = " << (unsigned)TTypeEncoding << '\n'; - errs() << "TType End = " << TTypeEnd << '\n'; - } - - // Table to store list of indices in type table. Entries are uleb128 values. - auto TypeIndexTableStart = Ptr + TTypeEnd; - - // Offset past the last decoded index. - intptr_t MaxTypeIndexTableOffset = 0; - - // The actual type info table starts at the same location as index table, - // but grows in a different direction. It also uses a different encoding - - // specified by TTypeEncoding. 
- auto TypeTableStart = reinterpret_cast(Ptr + TTypeEnd); - - uint8_t CallSiteEncoding = *Ptr++; - uint32_t CallSiteTableLength = readULEB128(Ptr); - const uint8_t *CallSiteTableStart = Ptr; - const uint8_t *CallSiteTableEnd = CallSiteTableStart + CallSiteTableLength; - const uint8_t *CallSitePtr = CallSiteTableStart; - const uint8_t *ActionTableStart = CallSiteTableEnd; - - if (opts::PrintExceptions) { - errs() << "CallSite Encoding = " << (unsigned)CallSiteEncoding << '\n'; - errs() << "CallSite table length = " << CallSiteTableLength << '\n'; - errs() << '\n'; - } - - unsigned NumCallSites = 0; - while (CallSitePtr < CallSiteTableEnd) { - ++NumCallSites; - uintptr_t Start = readEncodedPointer(CallSitePtr, CallSiteEncoding); - uintptr_t Length = readEncodedPointer(CallSitePtr, CallSiteEncoding); - uintptr_t LandingPad = readEncodedPointer(CallSitePtr, CallSiteEncoding); - - uintptr_t ActionEntry = readULEB128(CallSitePtr); - uint64_t RangeBase = 0; - if (opts::PrintExceptions) { - errs() << "Call Site: [0x" << Twine::utohexstr(RangeBase + Start) - << ", 0x" << Twine::utohexstr(RangeBase + Start + Length) - << "); landing pad: 0x" << Twine::utohexstr(LPStart + LandingPad) - << "; action entry: 0x" << Twine::utohexstr(ActionEntry) << "\n"; - } - if (ActionEntry != 0) { - auto printType = [&] (int Index, raw_ostream &OS) { - assert(Index > 0 && "only positive indices are valid"); - assert(TTypeEncoding == DW_EH_PE_udata4 && - "only udata4 supported for TTypeEncoding"); - auto TypeAddress = *(TypeTableStart - Index); - if (TypeAddress == 0) { - OS << ""; - return; - } - auto NI = BC.GlobalAddresses.find(TypeAddress); - if (NI != BC.GlobalAddresses.end()) { - OS << NI->second; - } else { - OS << "0x" << Twine::utohexstr(TypeAddress); - } - }; - if (opts::PrintExceptions) - errs() << " actions: "; - const uint8_t *ActionPtr = ActionTableStart + ActionEntry - 1; - long long ActionType; - long long ActionNext; - auto Sep = ""; - do { - ActionType = 
readSLEB128(ActionPtr); - auto Self = ActionPtr; - ActionNext = readSLEB128(ActionPtr); - if (opts::PrintExceptions) - errs() << Sep << "(" << ActionType << ", " << ActionNext << ") "; - if (ActionType == 0) { - if (opts::PrintExceptions) - errs() << "cleanup"; - } else if (ActionType > 0) { - // It's an index into a type table. - if (opts::PrintExceptions) { - errs() << "catch type "; - printType(ActionType, errs()); - } - } else { // ActionType < 0 - if (opts::PrintExceptions) - errs() << "filter exception types "; - auto TSep = ""; - // ActionType is a negative byte offset into uleb128-encoded table - // of indices with base 1. - // E.g. -1 means offset 0, -2 is offset 1, etc. The indices are - // encoded using uleb128 so we cannot directly dereference them. - auto TypeIndexTablePtr = TypeIndexTableStart - ActionType - 1; - while (auto Index = readULEB128(TypeIndexTablePtr)) { - if (opts::PrintExceptions) { - errs() << TSep; - printType(Index, errs()); - TSep = ", "; - } - } - MaxTypeIndexTableOffset = - std::max(MaxTypeIndexTableOffset, - TypeIndexTablePtr - TypeIndexTableStart); - } - - Sep = "; "; - - ActionPtr = Self + ActionNext; - } while (ActionNext); - if (opts::PrintExceptions) - errs() << '\n'; - } - - if (LandingPad != 0 || ActionEntry != 0) - IsTrivial = false; - } - Ptr = CallSiteTableEnd; - - if (NumCallSites > 1) - IsTrivial = false; - - if (IsTrivial) - ++NumTrivialLSDAs; - - if (opts::PrintExceptions) - errs() << '\n'; - - if (CallSiteTableLength == 0 || TTypeEnd == 0) - continue; - - Ptr = TypeIndexTableStart + MaxTypeIndexTableOffset; - } -} - +// For the purpose of rewriting exception handling tables, we can reuse action, +// types, and type index tables in their original binary format. +// This is only possible when type references are encoded as absolute addresses. +// We still have to parse all the tables to determine their sizes. 
Then we have +// to parse the call site table and associate discovered information with +// actual call instructions and landing pad blocks. +// +// Ideally we should be able to re-write LSDA in-place, without the need to +// allocate a new space for it. Sadly there's no guarantee that the new call +// site table will be the same size as GCC uses uleb encodings for PC offsets. +// +// For split function re-writing we would need to split LSDA too. +// +// Note: some functions have LSDA entries with 0 call site entries. void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, uint64_t LSDASectionAddress) { assert(CurrentState == State::Disassembled && "unexpected function state"); @@ -281,6 +119,8 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, } if (opts::PrintExceptions) { + errs() << "[LSDA at 0x" << Twine::utohexstr(getLSDAAddress()) + << " for function " << getName() << "]:\n"; errs() << "LPStart Encoding = " << (unsigned)LPStartEncoding << '\n'; errs() << "LPStart = 0x" << Twine::utohexstr(LPStart) << '\n'; errs() << "TType Encoding = " << (unsigned)TTypeEncoding << '\n'; @@ -311,14 +151,15 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, } unsigned NumCallSites = 0; + uint64_t RangeBase = getAddress(); while (CallSitePtr < CallSiteTableEnd) { ++NumCallSites; + uintptr_t Start = readEncodedPointer(CallSitePtr, CallSiteEncoding); uintptr_t Length = readEncodedPointer(CallSitePtr, CallSiteEncoding); uintptr_t LandingPad = readEncodedPointer(CallSitePtr, CallSiteEncoding); - uintptr_t ActionEntry = readULEB128(CallSitePtr); - uint64_t RangeBase = getAddress(); + if (opts::PrintExceptions) { errs() << "Call Site: [0x" << Twine::utohexstr(RangeBase + Start) << ", 0x" << Twine::utohexstr(RangeBase + Start + Length) @@ -346,6 +187,10 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, do { auto &Instruction = II->second; if (BC.MIA->isCall(Instruction)) { + assert(!BC.MIA->isInvoke(Instruction) && + "overlapping exception ranges 
detected"); + // Add extra operands to a call instruction making it an invoke from + // now on. if (LPSymbol) { Instruction.addOperand(MCOperand::createExpr( MCSymbolRefExpr::create(LPSymbol, @@ -404,7 +249,7 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, // ActionType is a negative byte offset into uleb128-encoded table // of indices with base 1. // E.g. -1 means offset 0, -2 is offset 1, etc. The indices are - // encoded using uleb128 so we cannot directly dereference them. + // encoded using uleb128 thus we cannot directly dereference them. auto TypeIndexTablePtr = TypeIndexTableStart - ActionType - 1; while (auto Index = readULEB128(TypeIndexTablePtr)) { if (opts::PrintExceptions) { @@ -428,6 +273,16 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, } if (opts::PrintExceptions) errs() << '\n'; + + assert(reinterpret_cast(MaxTypeIndexTableOffset) <= + LSDASectionData.data() + LSDASectionData.size() && + "LSDA entry has crossed section boundary"); + + LSDATables = + ArrayRef(ActionTableStart, + reinterpret_cast(MaxTypeIndexTableOffset)); + LSDATablesTypeOffset = + reinterpret_cast(TypeTableStart) - ActionTableStart; } void BinaryFunction::updateEHRanges() { @@ -819,7 +674,8 @@ void CFIReaderWriter::rewriteHeaderFor(StringRef EHFrame, DEBUG(dbgs() << format("CFIReaderWriter: Patching .eh_frame_hdr contents " "@offset %08x with new FDE ptr %08x\n", FDEPtrOffset, RealOffset + NewFrameHdrAddress)); - support::ulittle32_t::ref(FrameHdrContents.data() + FDEPtrOffset) = RealOffset; + support::ulittle32_t::ref(FrameHdrContents.data() + FDEPtrOffset) = + RealOffset; } // Add new entries (for cold function parts) uint64_t ExtraEntries = 0; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 4035e46062e8..061781baa75a 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -540,7 +540,6 @@ void RewriteInstance::readSpecialSections() { Section.getSize()); if (SectionName == ".gcc_except_table") { - 
readLSDA(SectionData, *BC); LSDAData = SectionData; LSDAAddress = Section.getAddress(); } @@ -579,13 +578,14 @@ void RewriteInstance::disassembleFunctions() { } SectionRef Section = Function.getSection(); - assert(Section.containsSymbol(Function.getSymbol()) && - "symbol not in section"); - - // When could it happen? + assert(Section.getAddress() <= Function.getAddress() && + Section.getAddress() + Section.getSize() + >= Function.getAddress() + Function.getSize() && + "wrong section for function"); if (!Section.isText() || Section.isVirtual() || !Section.getSize()) { - DEBUG(dbgs() << "FLO: corresponding section non-executable or empty " - << "for function " << Function.getName()); + // When could it happen? + errs() << "FLO: corresponding section is non-executable or empty " + << "for function " << Function.getName(); continue; } @@ -845,11 +845,13 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, MCSymbol *FunctionSymbol = BC.Ctx->getOrCreateSymbol(Function.getName()); Streamer.EmitSymbolAttribute(FunctionSymbol, MCSA_ELF_TypeFunction); Streamer.EmitLabel(FunctionSymbol); + Function.setOutputSymbol(FunctionSymbol); } else { MCSymbol *FunctionSymbol = BC.Ctx->getOrCreateSymbol(Twine(Function.getName()).concat(".cold")); Streamer.EmitSymbolAttribute(FunctionSymbol, MCSA_ELF_TypeFunction); Streamer.EmitLabel(FunctionSymbol); + Function.cold().setOutputSymbol(FunctionSymbol); } // Emit CFI start @@ -1068,10 +1070,10 @@ void RewriteInstance::emitFunctions() { OLT.mapSectionAddress(ObjectsHandle, reinterpret_cast(SAI->second.first), ExtraStorage.BumpPtr); - Function.setColdImageAddress(SAI->second.first); - Function.setColdImageSize(SAI->second.second); - Function.setColdFileOffset(ExtraStorage.BumpPtr - ExtraStorage.Addr + - ExtraStorage.FileOffset); + Function.cold().setImageAddress(SAI->second.first); + Function.cold().setImageSize(SAI->second.second); + Function.cold().setFileOffset(ExtraStorage.BumpPtr - ExtraStorage.Addr + + 
ExtraStorage.FileOffset); ExtraStorage.BumpPtr += SAI->second.second; } else { errs() << "FLO: cannot remap function " << Function.getName() << "\n"; @@ -1194,8 +1196,8 @@ void RewriteInstance::rewriteFile() { if (Function.getImageAddress() == 0 || Function.getImageSize() == 0) continue; - if (Function.isSplit() && (Function.getColdImageAddress() == 0 || - Function.getColdImageSize() == 0)) + if (Function.isSplit() && (Function.cold().getImageAddress() == 0 || + Function.cold().getImageSize() == 0)) continue; if (Function.getImageSize() > Function.getMaxSize()) { @@ -1234,8 +1236,9 @@ void RewriteInstance::rewriteFile() { // Write cold part outs() << "FLO: rewriting function \"" << Function.getName() << "\" (cold part)\n"; - Out->os().pwrite(reinterpret_cast(Function.getColdImageAddress()), - Function.getColdImageSize(), Function.getColdFileOffset()); + Out->os().pwrite(reinterpret_cast(Function.cold().getImageAddress()), + Function.cold().getImageSize(), + Function.cold().getFileOffset()); ++CountOverwrittenFunctions; if (opts::MaxFunctions && CountOverwrittenFunctions == opts::MaxFunctions) { From 578ca624995caf00227950e2b3578c1cb93b02e3 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 18 Dec 2015 20:26:44 -0800 Subject: [PATCH 057/904] Fix issues with some CFI instructions with gcc 4.9. Summary: Fixes some issues discovered after hhvm switched to gcc 4.9. Add support for DW_CFA_GNU_args_size instruction. Allow CFI instruction after the last instruction in a function. Reverse conditions of assert for DW_CFA_set_loc. (cherry picked from commit c278bbdd9c4d624c3b333ca2a1737bfe0c4922fb) --- bolt/BinaryFunction.h | 6 ++++++ bolt/Exceptions.cpp | 12 ++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 1db0ec0662a5..974e87b8dfd5 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -450,6 +450,12 @@ class BinaryFunction { // with NOPs and then reorder it away. 
// We fix this by moving the CFI instruction just before any NOPs. auto I = Instructions.lower_bound(Offset); + if (I == Instructions.end() && Offset == getSize()) { + // Sometimes compiler issues restore_state after all instructions + // in the function (even after nop). + --I; + Offset = I->first; + } assert(I->first == Offset && "CFI pointing to unknown instruction"); if (I == Instructions.begin()) { CIEFrameInstructions.emplace_back(std::forward(Inst)); diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index f6cfef135e57..4c46059425e6 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -447,8 +447,8 @@ void CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { Offset, MCCFIInstruction::createRestore(nullptr, Instr.Ops[0])); break; case DW_CFA_set_loc: - assert(Instr.Ops[0] < Address && "set_loc out of function bounds"); - assert(Instr.Ops[0] > Address + Function.getSize() && + assert(Instr.Ops[0] >= Address && "set_loc out of function bounds"); + assert(Instr.Ops[0] <= Address + Function.getSize() && "set_loc out of function bounds"); Offset = Instr.Ops[0] - Address; break; @@ -500,6 +500,11 @@ void CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { Offset, MCCFIInstruction::createDefCfaOffset( nullptr, -(DataAlignment * int64_t(Instr.Ops[0])))); break; + case DW_CFA_GNU_args_size: + Function.addCFIInstruction( + Offset, + MCCFIInstruction::createGnuArgsSize(nullptr, Instr.Ops[0])); + break; case DW_CFA_val_offset_sf: case DW_CFA_val_offset: llvm_unreachable("DWARF val_offset() unimplemented"); @@ -512,11 +517,10 @@ void CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { case DW_CFA_MIPS_advance_loc8: llvm_unreachable("DW_CFA_MIPS_advance_loc unimplemented"); break; - case DW_CFA_GNU_args_size: case DW_CFA_GNU_window_save: case DW_CFA_lo_user: case DW_CFA_hi_user: - llvm_unreachable("DW_CFA_GNU_* and DW_CFA_*_use unimplemented"); + llvm_unreachable("DW_CFA_GNU_* and DW_CFA_*_user unimplemented"); break; 
default: llvm_unreachable("Unrecognized CFI instruction"); From e10893cecb036ad1d5cdbed08b9a71b90f9c5c2a Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 18 Dec 2015 17:00:46 -0800 Subject: [PATCH 058/904] Write updated LSDA's. Summary: Write new exception ranges tables (LSDA's) into the output file. (cherry picked from commit 4eab37e2c965aab3135cf1f9d3304aabef750195) --- bolt/BinaryFunction.cpp | 5 + bolt/BinaryFunction.h | 52 +++++++-- bolt/Exceptions.cpp | 133 ++++++++++++++++++++--- bolt/RewriteInstance.cpp | 228 +++++++++++++++++++++------------------ bolt/RewriteInstance.h | 70 ++++++++++-- 5 files changed, 355 insertions(+), 133 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 030f6b7ca54a..d3dd85043da1 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -32,6 +32,8 @@ namespace llvm { namespace flo { +uint64_t BinaryFunction::Count = 0; + BinaryBasicBlock * BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) { if (Offset > Size) @@ -68,6 +70,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, StringRef SectionName; Section.getName(SectionName); OS << "Binary Function \"" << getName() << "\" " << Annotation << " {" + << "\n Number : " << FunctionNumber << "\n State : " << CurrentState << "\n Address : 0x" << Twine::utohexstr(Address) << "\n Size : 0x" << Twine::utohexstr(Size) @@ -119,6 +122,8 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, case MCCFIInstruction::OpUndefined: OS << "OpUndefined"; break; case MCCFIInstruction::OpRegister: OS << "OpRegister"; break; case MCCFIInstruction::OpWindowSave: OS << "OpWindowSave"; break; + case MCCFIInstruction::OpGnuArgsSize: OS << "OpGnuArgsSize"; break; + default: OS << "Op#" << Operation; break; } }; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 974e87b8dfd5..31a2e06a3c44 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -127,13 +127,6 @@ class BinaryFunction { /// 
according to profile data). -1 if the score has not been calculated yet. int64_t FunctionScore{-1}; - /// Binary blob reprsenting action, type, and type index tables for this - /// function' LSDA (exception handling). - ArrayRef LSDATables; - - /// Offset into LSDATables where type tables start. - uint64_t LSDATablesTypeOffset{0}; - /// Original LSDA address for the function. uint64_t LSDAAddress{0}; @@ -217,6 +210,14 @@ class BinaryFunction { }; std::vector CallSites; + /// Binary blobs reprsenting action, type, and type index tables for this + /// function' LSDA (exception handling). + ArrayRef LSDAActionAndTypeTables; + ArrayRef LSDATypeIndexTable; + + /// Marking for the beginning of language-specific data area for the function. + MCSymbol *LSDASymbol{nullptr}; + /// Map to discover which CFIs are attached to a given instruction offset. /// Maps an instruction offset into a FrameInstructions offset. /// This is only relevant to the buildCFG phase and is discarded afterwards. @@ -243,6 +244,12 @@ class BinaryFunction { /// Symbol in the output. const MCSymbol *OutputSymbol; + /// Unique number associated with the function. + uint64_t FunctionNumber; + + /// Count the number of functions created. + static uint64_t Count; + public: typedef BasicBlockListType::iterator iterator; @@ -300,7 +307,9 @@ class BinaryFunction { BinaryFunction(std::string Name, SymbolRef Symbol, SectionRef Section, uint64_t Address, uint64_t Size, BinaryContext &BC) : Name(Name), Symbol(Symbol), Section(Section), Address(Address), - Size(Size), BC(BC), CodeSectionName(".text." + Name) {} + Size(Size), BC(BC), CodeSectionName(".text." + Name), + FunctionNumber(++Count) + {} /// Perform optimal code layout based on edge frequencies making necessary /// adjustments to instructions at the end of basic blocks. @@ -382,6 +391,11 @@ class BinaryFunction { return !FrameInstructions.empty() || !CIEFrameInstructions.empty(); } + /// Return unique number associated with the function. 
+ uint64_t getFunctionNumber() const { + return FunctionNumber; + } + /// Return true if the given address \p PC is inside the function body. bool containsAddress(uint64_t PC) const { return Address <= PC && PC < Address + Size; @@ -563,6 +577,12 @@ class BinaryFunction { return *this; } + /// Set LSDA symbol for the function. + BinaryFunction &setLSDASymbol(MCSymbol *Symbol) { + LSDASymbol = Symbol; + return *this; + } + /// Return the profile information about the number of times /// the function was executed. /// @@ -576,6 +596,20 @@ class BinaryFunction { return LSDAAddress; } + /// Return symbol pointing to function's LSDA. + MCSymbol *getLSDASymbol() { + if (LSDASymbol) + return LSDASymbol; + if (CallSites.empty()) + return nullptr; + + LSDASymbol = + BC.Ctx->getOrCreateSymbol(Twine("GCC_except_table") + + Twine::utohexstr(getFunctionNumber())); + + return LSDASymbol; + } + /// Disassemble function from raw data \p FunctionData. /// If successful, this function will populate the list of instructions /// for this function together with offsets from the function start @@ -641,7 +675,7 @@ class BinaryFunction { bool hasEHRanges() const { return !CallSites.empty(); } /// Emit exception handling ranges for the function. - void emitLSDA(); + void emitLSDA(MCStreamer *Streamer); virtual ~BinaryFunction() {} diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 4c46059425e6..30676a481522 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -18,10 +18,12 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" +#include "llvm/Support/LEB128.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -170,14 +172,20 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, // Create a handler entry if necessary. 
MCSymbol *LPSymbol{nullptr}; if (LandingPad) { - auto Label = Labels.find(LandingPad); - if (Label != Labels.end()) { - LPSymbol = Label->second; + if (Instructions.find(LandingPad) == Instructions.end()) { + errs() << "FLO-WARNING: landing pad " << Twine::utohexstr(LandingPad) + << " not pointing to an instruction in function " + << getName() << " - ignoring.\n"; } else { - LPSymbol = BC.Ctx->createTempSymbol("LP", true); - Labels[LandingPad] = LPSymbol; + auto Label = Labels.find(LandingPad); + if (Label != Labels.end()) { + LPSymbol = Label->second; + } else { + LPSymbol = BC.Ctx->createTempSymbol("LP", true); + Labels[LandingPad] = LPSymbol; + } + LandingPads.insert(LPSymbol); } - LandingPads.insert(LPSymbol); } // Mark all call instructions in the range. @@ -246,7 +254,7 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, if (opts::PrintExceptions) errs() << "filter exception types "; auto TSep = ""; - // ActionType is a negative byte offset into uleb128-encoded table + // ActionType is a negative *byte* offset into *uleb128-encoded* table // of indices with base 1. // E.g. -1 means offset 0, -2 is offset 1, etc. The indices are // encoded using uleb128 thus we cannot directly dereference them. @@ -274,15 +282,18 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, if (opts::PrintExceptions) errs() << '\n'; - assert(reinterpret_cast(MaxTypeIndexTableOffset) <= + assert(TypeIndexTableStart + MaxTypeIndexTableOffset <= LSDASectionData.data() + LSDASectionData.size() && "LSDA entry has crossed section boundary"); - LSDATables = - ArrayRef(ActionTableStart, - reinterpret_cast(MaxTypeIndexTableOffset)); - LSDATablesTypeOffset = - reinterpret_cast(TypeTableStart) - ActionTableStart; + if (TTypeEnd) { + // TypeIndexTableStart is a alias for TypeTableStart. 
+ LSDAActionAndTypeTables = + ArrayRef(ActionTableStart, + TypeIndexTableStart - ActionTableStart); + LSDATypeIndexTable = + ArrayRef(TypeIndexTableStart, MaxTypeIndexTableOffset); + } } void BinaryFunction::updateEHRanges() { @@ -383,6 +394,102 @@ void BinaryFunction::updateEHRanges() { } } +// The code is based on EHStreamer::emitExceptionTable(). +void BinaryFunction::emitLSDA(MCStreamer *Streamer) { + if (CallSites.empty()) { + return; + } + + assert(!isSplit() && "split functions are not supported yet"); + + // Calculate callsite table size. Size of each callsite entry is: + // + // sizeof(start) + sizeof(length) + sizeof(LP) + sizeof(uleb128(action)) + // + // or + // + // sizeof(dwarf::DW_EH_PE_udata4) * 3 + sizeof(uleb128(action)) + uint64_t CallSiteTableLength = CallSites.size() * 4 * 3; + for(const auto &CallSite : CallSites) { + CallSiteTableLength+= getULEB128Size(CallSite.Action); + } + + Streamer->SwitchSection(BC.MOFI->getLSDASection()); + + // When we read we make sure only the following encoding is supported. + constexpr unsigned TTypeEncoding = dwarf::DW_EH_PE_udata4; + + // Type tables have to be aligned at 4 bytes. + Streamer->EmitValueToAlignment(4); + + // Emit the LSDA label. + auto LSDASymbol = getLSDASymbol(); + assert(LSDASymbol && "no LSDA symbol set"); + Streamer->EmitLabel(LSDASymbol); + + // Emit the LSDA header. + Streamer->EmitIntValue(dwarf::DW_EH_PE_omit, 1); // LPStart format + Streamer->EmitIntValue(TTypeEncoding, 1); // TType format + + // See the comment in EHStreamer::emitExceptionTable() on how we use + // uleb128 encoding (which can use variable number of bytes to encode the same + // value) to ensure type info table is properly aligned at 4 bytes without + // iteratively messing with sizes of the tables. 
+ unsigned CallSiteTableLengthSize = getULEB128Size(CallSiteTableLength); + unsigned TTypeBaseOffset = + sizeof(int8_t) + // Call site format + CallSiteTableLengthSize + // Call site table length size + CallSiteTableLength + // Call site table length + LSDAActionAndTypeTables.size(); // Actions + Types size + unsigned TTypeBaseOffsetSize = getULEB128Size(TTypeBaseOffset); + unsigned TotalSize = + sizeof(int8_t) + // LPStart format + sizeof(int8_t) + // TType format + TTypeBaseOffsetSize + // TType base offset size + TTypeBaseOffset; // TType base offset + unsigned SizeAlign = (4 - TotalSize) & 3; + + // Account for any extra padding that will be added to the call site table + // length. + Streamer->EmitULEB128IntValue(TTypeBaseOffset, SizeAlign); + + // Emit the landing pad call site table. + Streamer->EmitIntValue(dwarf::DW_EH_PE_udata4, 1); + Streamer->EmitULEB128IntValue(CallSiteTableLength); + + for (const auto &CallSite : CallSites) { + + const MCSymbol *BeginLabel = CallSite.Start; + const MCSymbol *EndLabel = CallSite.End; + + assert(BeginLabel && "start EH label expected"); + assert(EndLabel && "end EH label expected"); + + Streamer->emitAbsoluteSymbolDiff(BeginLabel, getOutputSymbol(), 4); + Streamer->emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 4); + + if (!CallSite.LP) { + Streamer->EmitIntValue(0, 4); + } else { + Streamer->emitAbsoluteSymbolDiff(CallSite.LP, getOutputSymbol(), 4); + } + + Streamer->EmitULEB128IntValue(CallSite.Action); + } + + // Write out action, type, and type index tables at the end. + // + // There's no need to change the original format we saw on input + // unless we are doing a function splitting in which case we can + // perhaps split and optimize the tables. 
+ for(auto const &Byte : LSDAActionAndTypeTables) { + Streamer->EmitIntValue(Byte, 1); + } + for(auto const &Byte : LSDATypeIndexTable) { + Streamer->EmitIntValue(Byte, 1); + } +} + const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0; const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 061781baa75a..2a500c492114 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -10,12 +10,12 @@ //===----------------------------------------------------------------------===// -#include "RewriteInstance.h" #include "BinaryBasicBlock.h" #include "BinaryContext.h" #include "BinaryFunction.h" #include "DataReader.h" #include "Exceptions.h" +#include "RewriteInstance.h" #include "llvm/ADT/STLExtras.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" @@ -133,12 +133,15 @@ PrintReordered("print-reordered", // Check against lists of functions from options if we should // optimize the function with a given name. -bool shouldProcess(StringRef FunctionName) { +bool shouldProcess(const BinaryFunction &Function) { + if (MaxFunctions && Function.getFunctionNumber() > MaxFunctions) + return false; + bool IsValid = true; if (!FunctionNames.empty()) { IsValid = false; for (auto &Name : FunctionNames) { - if (FunctionName == Name) { + if (Function.getName() == Name) { IsValid = true; break; } @@ -149,7 +152,7 @@ bool shouldProcess(StringRef FunctionName) { if (!SkipFunctionNames.empty()) { for (auto &Name : SkipFunctionNames) { - if (FunctionName == Name) { + if (Function.getName() == Name) { IsValid = false; break; } @@ -174,52 +177,39 @@ static void check_error(std::error_code EC, StringRef Message) { report_error(Message, EC); } -/// Class responsible for allocating and managing code and data sections. -class ExecutableFileMemoryManager : public SectionMemoryManager { -public: - - // Keep [section name] -> [allocated address, size] map for later remapping. 
- std::map> SectionAddressInfo; - - ExecutableFileMemoryManager() {} - - uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, - unsigned SectionID, - StringRef SectionName) override { - auto ret = - SectionMemoryManager::allocateCodeSection(Size, Alignment, SectionID, - SectionName); - DEBUG(dbgs() << "FLO: allocating code section : " << SectionName - << " with size " << Size << ", alignment " << Alignment - << " at 0x" << ret << "\n"); - - SectionAddressInfo[SectionName] = {reinterpret_cast(ret), Size}; - - return ret; +uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size, + unsigned Alignment, + unsigned SectionID, + StringRef SectionName, + bool IsCode, + bool IsReadOnly) { + uint8_t *ret; + if (IsCode) { + ret = SectionMemoryManager::allocateCodeSection(Size, Alignment, + SectionID, SectionName); + } else { + ret = SectionMemoryManager::allocateDataSection(Size, Alignment, + SectionID, SectionName, + IsReadOnly); } - uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, - unsigned SectionID, StringRef SectionName, - bool IsReadOnly) override { - DEBUG(dbgs() << "FLO: allocating data section : " << SectionName - << " with size " << Size << ", alignment " - << Alignment << "\n"); - auto ret = SectionMemoryManager::allocateDataSection( - Size, Alignment, SectionID, SectionName, IsReadOnly); - - SectionAddressInfo[SectionName] = {reinterpret_cast(ret), Size}; + DEBUG(dbgs() << "FLO: allocating " << (IsCode ? "code" : "data") + << " section : " << SectionName + << " with size " << Size << ", alignment " << Alignment + << " at 0x" << ret << "\n"); - return ret; - } + SectionMapInfo[SectionName] = SectionInfo(reinterpret_cast(ret), + Size, + Alignment, + IsCode); - // Tell EE that we guarantee we don't need stubs. 
- bool allowStubAllocation() const override { return false; } + return ret; +} - bool finalizeMemory(std::string *ErrMsg = nullptr) override { - DEBUG(dbgs() << "FLO: finalizeMemory()\n"); - return SectionMemoryManager::finalizeMemory(ErrMsg); - } -}; +bool ExecutableFileMemoryManager::finalizeMemory(std::string *ErrMsg) { + DEBUG(dbgs() << "FLO: finalizeMemory()\n"); + return SectionMemoryManager::finalizeMemory(ErrMsg); +} /// Create BinaryContext for a given architecture \p ArchName and /// triple \p TripleName. @@ -571,7 +561,7 @@ void RewriteInstance::disassembleFunctions() { for (auto &BFI : BinaryFunctions) { BinaryFunction &Function = BFI.second; - if (!opts::shouldProcess(Function.getName())) { + if (!opts::shouldProcess(Function)) { DEBUG(dbgs() << "FLO: skipping processing function " << Function.getName() << " per user request.\n"); continue; @@ -687,7 +677,7 @@ void RewriteInstance::runOptimizationPasses() { for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; - if (!opts::shouldProcess(Function.getName())) + if (!opts::shouldProcess(Function)) continue; if (!Function.isSimple()) @@ -759,10 +749,6 @@ void RewriteInstance::runOptimizationPasses() { Function.print(errs(), "after updating EH ranges"); } - // TODO: add complete EH ranges support. - if (Function.hasEHRanges()) - Function.setSimple(false); - // Fix the CFI state. 
if (!Function.fixCFIState()) Function.setSimple(false); @@ -817,6 +803,9 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, case MCCFIInstruction::OpSameValue: Streamer.EmitCFISameValue(CFIInstr.getRegister()); break; + case MCCFIInstruction::OpGnuArgsSize: + Streamer.EmitCFIGnuArgsSize(CFIInstr.getOffset()); + break; } }; @@ -861,6 +850,12 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, Streamer.EmitCFIPersonality(Function.getPersonalityFunction(), Function.getPersonalityEncoding()); } + if (Function.getLSDASymbol()) { + Streamer.EmitCFILsda(Function.getLSDASymbol(), + BC.MOFI->getLSDAEncoding()); + } else { + Streamer.EmitCFILsda(0, dwarf::DW_EH_PE_omit); + } // Emit CFI instructions relative to the CIE for (auto &CFIInstr : Function.cie()) { // Ignore these CIE CFI insns because LLVM will already emit this. @@ -910,6 +905,9 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, if (Function.hasCFI() && HasExtraStorage) Streamer.EmitCFIEndProc(); + // Emit LSDA before anything else? + Function.emitLSDA(&Streamer); + // TODO: is there any use in emiting end of function? // Perhaps once we have a support for C++ exceptions. // auto FunctionEndLabel = Ctx.createTempSymbol("func_end"); @@ -957,7 +955,6 @@ void RewriteInstance::emitFunctions() { Streamer->InitSections(false); - bool HasEHFrame = false; bool NoSpaceWarning = false; // Output functions one by one. 
for (auto &BFI : BinaryFunctions) { @@ -966,16 +963,14 @@ void RewriteInstance::emitFunctions() { if (!Function.isSimple()) continue; - if (!opts::shouldProcess(Function.getName())) + if (!opts::shouldProcess(Function)) continue; DEBUG(dbgs() << "FLO: generating code for function \"" << Function.getName() - << "\"\n"); + << "\" : " << Function.getFunctionNumber() << '\n'); if (Function.hasCFI()) { - if (ExtraStorage.Size != 0) - HasEHFrame = true; - else + if (ExtraStorage.Size == 0) NoSpaceWarning = true; } @@ -1039,16 +1034,17 @@ void RewriteInstance::emitFunctions() { if (!Function.isSimple()) continue; - auto SAI = EFMM->SectionAddressInfo.find(Function.getCodeSectionName()); - if (SAI != EFMM->SectionAddressInfo.end()) { - DEBUG(dbgs() << "FLO: mapping 0x" << Twine::utohexstr(SAI->second.first) + auto SMII = EFMM->SectionMapInfo.find(Function.getCodeSectionName()); + if (SMII != EFMM->SectionMapInfo.end()) { + DEBUG(dbgs() << "FLO: mapping 0x" + << Twine::utohexstr(SMII->second.AllocAddress) << " to 0x" << Twine::utohexstr(Function.getAddress()) << '\n'); OLT.mapSectionAddress(ObjectsHandle, - reinterpret_cast(SAI->second.first), + reinterpret_cast(SMII->second.AllocAddress), Function.getAddress()); - Function.setImageAddress(SAI->second.first); - Function.setImageSize(SAI->second.second); + Function.setImageAddress(SMII->second.AllocAddress); + Function.setImageSize(SMII->second.Size); } else { errs() << "FLO: cannot remap function " << Function.getName() << "\n"; FailedAddresses.emplace_back(Function.getAddress()); @@ -1057,54 +1053,59 @@ void RewriteInstance::emitFunctions() { if (!Function.isSplit()) continue; - SAI = EFMM->SectionAddressInfo.find( + SMII = EFMM->SectionMapInfo.find( Function.getCodeSectionName().str().append(".cold")); - if (SAI != EFMM->SectionAddressInfo.end()) { + if (SMII != EFMM->SectionMapInfo.end()) { // Align at a 16-byte boundary - ExtraStorage.BumpPtr = (ExtraStorage.BumpPtr + 15) & ~(15ULL); - - DEBUG(dbgs() << "FLO: mapping 
0x" << Twine::utohexstr(SAI->second.first) + ExtraStorage.BumpPtr = RoundUpToAlignment(ExtraStorage.BumpPtr, 16); + DEBUG(dbgs() << "FLO: mapping 0x" + << Twine::utohexstr(SMII->second.AllocAddress) << " to 0x" << Twine::utohexstr(ExtraStorage.BumpPtr) - << " with size " << Twine::utohexstr(SAI->second.second) + << " with size " << Twine::utohexstr(SMII->second.Size) << '\n'); OLT.mapSectionAddress(ObjectsHandle, - reinterpret_cast(SAI->second.first), + reinterpret_cast(SMII->second.AllocAddress), ExtraStorage.BumpPtr); - Function.cold().setImageAddress(SAI->second.first); - Function.cold().setImageSize(SAI->second.second); + Function.cold().setImageAddress(SMII->second.AllocAddress); + Function.cold().setImageSize(SMII->second.Size); Function.cold().setFileOffset(ExtraStorage.BumpPtr - ExtraStorage.Addr + ExtraStorage.FileOffset); - ExtraStorage.BumpPtr += SAI->second.second; + ExtraStorage.BumpPtr += SMII->second.Size; } else { errs() << "FLO: cannot remap function " << Function.getName() << "\n"; FailedAddresses.emplace_back(Function.getAddress()); } } - // Map .eh_frame - NewEhFrameAddress = 0; - NewEhFrameOffset = 0; - if (HasEHFrame) { - auto SAI = EFMM->SectionAddressInfo.find(".eh_frame"); - if (SAI != EFMM->SectionAddressInfo.end()) { - // Align at an 8-byte boundary - ExtraStorage.BumpPtr = (ExtraStorage.BumpPtr + 7) & ~(7ULL); - DEBUG(dbgs() << "FLO: mapping 0x" << Twine::utohexstr(SAI->second.first) + + // Map special sections to their addresses in the output image. + // + // TODO: perhaps we should process all the allocated sections here? 
+ std::vector Sections = { ".eh_frame", ".gcc_except_table" }; + for(auto &SectionName : Sections) { + auto SMII = EFMM->SectionMapInfo.find(SectionName); + if (SMII != EFMM->SectionMapInfo.end()) { + SectionInfo &SI = SMII->second; + ExtraStorage.BumpPtr = RoundUpToAlignment(ExtraStorage.BumpPtr, + SI.Alignment); + DEBUG(dbgs() << "FLO: mapping 0x" + << Twine::utohexstr(SI.AllocAddress) << " to 0x" << Twine::utohexstr(ExtraStorage.BumpPtr) << '\n'); - NewEhFrameAddress = ExtraStorage.BumpPtr; - NewEhFrameOffset = - ExtraStorage.BumpPtr - ExtraStorage.Addr + ExtraStorage.FileOffset; + OLT.mapSectionAddress(ObjectsHandle, - reinterpret_cast(SAI->second.first), + reinterpret_cast(SI.AllocAddress), ExtraStorage.BumpPtr); - ExtraStorage.BumpPtr += SAI->second.second; - NewEhFrameContents = - StringRef(reinterpret_cast(SAI->second.first), - SAI->second.second); + + SI.FileAddress = ExtraStorage.BumpPtr; + SI.FileOffset = ExtraStorage.BumpPtr - ExtraStorage.Addr + + ExtraStorage.FileOffset; + + ExtraStorage.BumpPtr += SI.Size; } else { - errs() << "FLO: cannot remap .eh_frame\n"; + errs() << "FLO: cannot remap " << SectionName << '\n'; } } + if (ExtraStorage.BumpPtr - ExtraStorage.Addr > ExtraStorage.Size) { errs() << format( "FLO fatal error: __flo_storage in this binary has not enough free " @@ -1129,6 +1130,10 @@ bool RewriteInstance::splitLargeFunctions() { if (Function.getImageSize() <= Function.getMaxSize()) continue; + // Don't split functions with exception ranges. + if (Function.hasEHRanges()) + continue; + ToSplit.insert(BFI.first); Changed = true; } @@ -1246,14 +1251,27 @@ void RewriteInstance::rewriteFile() { break; } } - if (NewEhFrameContents.size()) { + + outs() << "FLO: " << CountOverwrittenFunctions + << " out of " << BinaryFunctions.size() + << " functions were overwritten.\n"; + + // If .eh_frame is present it requires special handling. 
+ auto SMII = SectionMM->SectionMapInfo.find(".eh_frame"); + if (SMII != SectionMM->SectionMapInfo.end()) { + auto &EHFrameSI = SMII->second; outs() << "FLO: writing a new .eh_frame_hdr\n"; - if (FrameHdrAlign > 1) + if (FrameHdrAlign > 1) { ExtraStorage.BumpPtr = - (ExtraStorage.BumpPtr + FrameHdrAlign - 1) & ~(FrameHdrAlign - 1); + RoundUpToAlignment(ExtraStorage.BumpPtr, FrameHdrAlign); + } std::sort(FailedAddresses.begin(), FailedAddresses.end()); - CFIRdWrt->rewriteHeaderFor(NewEhFrameContents, NewEhFrameAddress, - ExtraStorage.BumpPtr, FailedAddresses); + CFIRdWrt->rewriteHeaderFor( + StringRef(reinterpret_cast(EHFrameSI.AllocAddress), + EHFrameSI.Size), + EHFrameSI.FileAddress, + ExtraStorage.BumpPtr, + FailedAddresses); if (ExtraStorage.BumpPtr - ExtraStorage.Addr - ExtraStorage.Size < FrameHdrCopy.size()) { errs() << "FLO fatal error: __flo_storage in this binary has not enough " @@ -1276,14 +1294,20 @@ void RewriteInstance::rewriteFile() { outs() << "FLO-ERROR: program segment NOT patched -- I don't know how to " "handle this object file!\n"; } - outs() << "FLO: writing a new .eh_frame\n"; - Out->os().pwrite(NewEhFrameContents.data(), NewEhFrameContents.size(), - NewEhFrameOffset); } - outs() << "FLO: " << CountOverwrittenFunctions - << " out of " << BinaryFunctions.size() - << " functions were overwritten.\n"; + // Write all non-code sections. + for(auto &SMII : SectionMM->SectionMapInfo) { + SectionInfo &SI = SMII.second; + if (SI.IsCode) + continue; + outs() << "FLO: writing new section " << SMII.first << '\n'; + Out->os().pwrite(reinterpret_cast(SI.AllocAddress), + SI.Size, + SI.FileOffset); + + // Update ELF section header. 
+ } if (TotalScore != 0) { double Coverage = OverwrittenScore / (double)TotalScore * 100.0; diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 037b23b3f09d..49573f8dac08 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -15,6 +15,7 @@ #define LLVM_TOOLS_LLVM_FLO_REWRITE_INSTANCE_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/Object/ELFObjectFile.h" #include "llvm/Object/ObjectFile.h" #include @@ -24,7 +25,6 @@ namespace llvm { class DWARFContext; class DWARFFrame; -class SectionMemoryManager; class tool_output_file; namespace flo { @@ -34,6 +34,58 @@ class BinaryFunction; class CFIReaderWriter; class DataReader; +/// Section information for mapping and re-writing. +struct SectionInfo { + uint64_t AllocAddress; /// Current location of the section in memory. + uint64_t Size; /// Section size. + unsigned Alignment; /// Alignment of the section. + uint64_t FileAddress{0}; /// Address in the output file. + uint64_t FileOffset{0}; /// Offset in the output file. + bool IsCode{false}; /// Does this section contain code. + + SectionInfo(uint64_t Address = 0, uint64_t Size = 0, unsigned Alignment = 0, + bool IsCode = false) + : AllocAddress(Address), Size(Size), Alignment(Alignment), + IsCode(IsCode) {} +}; + +/// Class responsible for allocating and managing code and data sections. +class ExecutableFileMemoryManager : public SectionMemoryManager { +private: + uint8_t *allocateSection(intptr_t Size, + unsigned Alignment, + unsigned SectionID, + StringRef SectionName, + bool IsCode, + bool IsReadOnly); + +public: + + // Keep [section name] -> [section info] map for later remapping. 
+ std::map SectionMapInfo; + + ExecutableFileMemoryManager() {} + + uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, + unsigned SectionID, + StringRef SectionName) override { + return allocateSection(Size, Alignment, SectionID, SectionName, + /*IsCode=*/true, true); + } + + uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, + unsigned SectionID, StringRef SectionName, + bool IsReadOnly) override { + return allocateSection(Size, Alignment, SectionID, SectionName, + /*IsCode=*/false, IsReadOnly); + } + + // Tell EE that we guarantee we don't need stubs. + bool allowStubAllocation() const override { return false; } + + bool finalizeMemory(std::string *ErrMsg = nullptr) override; +}; + /// This class encapsulates all data necessary to carry on binary reading, /// disassembly, CFG building, BB reordering (among other binary-level /// optimizations) and rewriting. It also has the logic to coordinate such @@ -92,11 +144,11 @@ class RewriteInstance { std::unique_ptr BC; std::unique_ptr DwCtx; std::unique_ptr CFIRdWrt; - // Our in-memory intermediary object file where we hold final code for - // rewritten functions. - std::unique_ptr SectionMM; - // Our output file where we mix original code from the input binary and - // optimized code for selected functions. + /// Our in-memory intermediary object file where we hold final code for + /// rewritten functions. + std::unique_ptr SectionMM; + /// Our output file where we mix original code from the input binary and + /// optimized code for selected functions. std::unique_ptr Out; /// Represent free space we have in the binary to write extra bytes. This free @@ -139,11 +191,11 @@ class RewriteInstance { uint64_t NewEhFrameAddress{0}; uint64_t NewEhFrameOffset{0}; - // Keep track of functions we fail to write in the binary. We need to avoid - // rewriting CFI info for these functions. + /// Keep track of functions we fail to write in the binary. 
We need to avoid + /// rewriting CFI info for these functions. std::vector FailedAddresses; - // Keep track of which functions to split in a second pass. + /// Keep track of which functions to split in a second pass. std::set ToSplit; /// Total hotness score according to profiling data for this binary. From bcb2fd7c908f44c3c63f55fa725ab4788fc8813c Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 13 Jan 2016 17:19:40 -0800 Subject: [PATCH 059/904] Added an option to reverse original basic blocks order. Summary: Modified processing of "-reorder-blocks=" option and added an option to reverse original basic blocks order for testing purposes. (cherry picked from commit 32afa50cbd6074807e771badb040878637c242b3) --- bolt/BinaryFunction.cpp | 39 +++++++++++++++++++++++++------- bolt/BinaryFunction.h | 35 +++++++++++++++++------------ bolt/RewriteInstance.cpp | 48 ++++++++++++++++++++-------------------- 3 files changed, 76 insertions(+), 46 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index d3dd85043da1..f9bfb8ae3732 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -933,13 +933,30 @@ bool BinaryFunction::fixCFIState() { return true; } -void BinaryFunction::optimizeLayout(HeuristicPriority Priority, bool Split) { - // Bail if no profiling information or if empty - if (getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE || - BasicBlocksLayout.empty()) { +void BinaryFunction::modifyLayout(LayoutType Type, bool Split) { + if (BasicBlocksLayout.empty() || Type == LT_NONE) + return; + + if (Type == LT_REVERSE) { + BasicBlockOrderType ReverseOrder; + auto FirstBB = BasicBlocksLayout.front(); + ReverseOrder.push_back(FirstBB); + for(auto RBBI = BasicBlocksLayout.rbegin(); *RBBI != FirstBB; ++RBBI) + ReverseOrder.push_back(*RBBI); + BasicBlocksLayout.swap(ReverseOrder); + + if (Split) + splitFunction(); + + fixBranches(); + return; } + // Cannot do optimal layout without profile. 
+ if (getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE) + return; + // Work on optimal solution if problem is small enough if (BasicBlocksLayout.size() <= FUNC_SIZE_THRESHOLD) return solveOptimalLayout(Split); @@ -1062,14 +1079,14 @@ void BinaryFunction::optimizeLayout(HeuristicPriority Priority, bool Split) { AvgFreq[I] = Freq; } - switch(Priority) { - case HP_NONE: { + switch(Type) { + case LT_OPTIMIZE: { for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) if (!Clusters[I].empty()) Order.push_back(I); break; } - case HP_BRANCH_PREDICTOR: { + case LT_OPTIMIZE_BRANCH: { // Do a topological sort for clusters, prioritizing frequently-executed BBs // during the traversal. std::stack Stack; @@ -1137,7 +1154,7 @@ void BinaryFunction::optimizeLayout(HeuristicPriority Priority, bool Split) { }); break; } - case HP_CACHE_UTILIZATION: { + case LT_OPTIMIZE_CACHE: { // Order clusters based on average instruction execution frequency for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) if (!Clusters[I].empty()) @@ -1151,6 +1168,8 @@ void BinaryFunction::optimizeLayout(HeuristicPriority Priority, bool Split) { break; } + default: + llvm_unreachable("unexpected layout type"); } BasicBlocksLayout.clear(); @@ -1349,6 +1368,10 @@ void BinaryFunction::fixBranches() { // Case 3a: If the taken branch goes to the next block in the new layout, // invert this conditional branch logic so we can make this a fallthrough. 
if (TBB == FT && !HotColdBorder) { + if (OldFT == nullptr) { + errs() << "FLO-ERROR: malfromed CFG for function " << getName() + << " in basic block " << BB->getName() << '\n'; + } assert(OldFT != nullptr && "malformed CFG"); if (!MIA->reverseBranchCondition(*CondBranch, OldFT, BC.Ctx.get())) llvm_unreachable("Target does not support reversing branches"); diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 31a2e06a3c44..8ca6446f1ca4 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -53,17 +53,24 @@ class BinaryFunction { Assembled, /// Function has been assembled in memory }; - // Choose which strategy should the block layout heuristic prioritize when - // facing conflicting goals. - enum HeuristicPriority : char { - HP_NONE = 0, - // HP_BRANCH_PREDICTOR is an implementation of what is suggested in Pettis' - // paper (PLDI '90) about block reordering, trying to minimize branch - // mispredictions. - HP_BRANCH_PREDICTOR, - // HP_CACHE_UTILIZATION pigbacks on the idea from Ispike paper (CGO '04) - // that suggests putting frequently executed chains first in the layout. - HP_CACHE_UTILIZATION, + /// Choose which strategy should the block layout heuristic prioritize when + /// facing conflicting goals. + enum LayoutType : char { + /// LT_NONE - do not change layout of basic blocks + LT_NONE = 0, /// no reordering + /// LT_REVERSE - reverse the order of basic blocks, meant for testing + /// purposes. The first basic block is left intact and the rest are + /// put in the reverse order. + LT_REVERSE, + /// LT_OPTIMIZE - optimize layout of basic blocks based on profile. + LT_OPTIMIZE, + /// LT_OPTIMIZE_BRANCH is an implementation of what is suggested in Pettis' + /// paper (PLDI '90) about block reordering, trying to minimize branch + /// mispredictions. + LT_OPTIMIZE_BRANCH, + /// LT_OPTIMIZE_CACHE pigbacks on the idea from Ispike paper (CGO '04) + /// that suggests putting frequently executed chains first in the layout. 
+ LT_OPTIMIZE_CACHE, }; static constexpr uint64_t COUNT_NO_PROFILE = @@ -311,9 +318,9 @@ class BinaryFunction { FunctionNumber(++Count) {} - /// Perform optimal code layout based on edge frequencies making necessary - /// adjustments to instructions at the end of basic blocks. - void optimizeLayout(HeuristicPriority Priority, bool Split); + /// Modify code layout making necessary adjustments to instructions at the + /// end of basic blocks. + void modifyLayout(LayoutType Type, bool Split); /// Dynamic programming implementation for the TSP, applied to BB layout. Find /// the optimal way to maximize weight during a path traversing all BBs. In diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 2a500c492114..cc39ae2e8105 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -89,11 +89,29 @@ SplitFunctions("split-functions", cl::desc("split functions into hot and cold distinct regions"), cl::Optional); -static cl::opt ReorderBlocks( +static cl::opt ReorderBlocks( "reorder-blocks", - cl::desc("redo basic block layout based on profiling data with a specific " - "priority (none, branch-predictor or cache)"), - cl::value_desc("priority"), cl::init("disable")); + cl::desc("change layout of basic blocks in a function"), + cl::init(BinaryFunction::LT_NONE), + cl::values(clEnumValN(BinaryFunction::LT_NONE, + "none", + "do not reorder basic blocks"), + clEnumValN(BinaryFunction::LT_REVERSE, + "reverse", + "layout blocks in reverse order"), + clEnumValN(BinaryFunction::LT_OPTIMIZE, + "normal", + "perform optimal layout based on profile"), + clEnumValN(BinaryFunction::LT_OPTIMIZE_BRANCH, + "branch-predictor", + "perform optimal layout prioritizing branch " + "predictions"), + clEnumValN(BinaryFunction::LT_OPTIMIZE_CACHE, + "cache", + "perform optimal layout prioritizing I-cache " + "behavior"), + clEnumValEnd)); + static cl::opt AlignBlocks("align-blocks", cl::desc("try to align BBs inserting nops"), @@ -665,15 +683,6 @@ void 
RewriteInstance::runOptimizationPasses() { // // FIXME: use real optimization passes. bool NagUser = true; - if (opts::ReorderBlocks != "" && - opts::ReorderBlocks != "disable" && - opts::ReorderBlocks != "none" && - opts::ReorderBlocks != "branch-predictor" && - opts::ReorderBlocks != "cache") { - errs() << "FLO: Unrecognized block reordering priority \"" - << opts::ReorderBlocks << "\".\n"; - exit(1); - } for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; @@ -725,18 +734,9 @@ void RewriteInstance::runOptimizationPasses() { Function.print(errs(), "after unreachable code elimination"); } - if (opts::ReorderBlocks != "disable") { + if (opts::ReorderBlocks != BinaryFunction::LT_NONE) { bool ShouldSplit = ToSplit.find(BFI.first) != ToSplit.end(); - - if (opts::ReorderBlocks == "branch-predictor") { - BFI.second.optimizeLayout(BinaryFunction::HP_BRANCH_PREDICTOR, - ShouldSplit); - } else if (opts::ReorderBlocks == "cache") { - BFI.second.optimizeLayout(BinaryFunction::HP_CACHE_UTILIZATION, - ShouldSplit); - } else { - BFI.second.optimizeLayout(BinaryFunction::HP_NONE, ShouldSplit); - } + BFI.second.modifyLayout(opts::ReorderBlocks, ShouldSplit); if (opts::PrintAll || opts::PrintReordered) Function.print(errs(), "after reordering blocks"); } From f52363dbfa8daa4970866f8e40cb4112468b3eca Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Sat, 16 Jan 2016 14:58:22 -0800 Subject: [PATCH 060/904] Handle more CFI cases and some. Summary: * Update CFI state for larger range of functions to increase coverage. * Issue more warnings indicating reasons for skipping functions. * Print top called functions in the binary. 
(cherry picked from commit 52149b74ff29187746ad2e4c554d923ca45e018b) --- bolt/BinaryFunction.cpp | 175 ++++++++++++++++++++++----------------- bolt/RewriteInstance.cpp | 44 ++++++++-- 2 files changed, 140 insertions(+), 79 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index f9bfb8ae3732..76cbbf05e1d4 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -197,6 +197,10 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, if (BBExecCount != BinaryBasicBlock::COUNT_NO_PROFILE) { OS << " Exec Count : " << BBExecCount << "\n"; } + if (!BBCFIState.empty()) { + unsigned BBIndex = BB - &*BasicBlocks.begin(); + OS << " CFI State : " << BBCFIState[BBIndex] << '\n'; + } if (!BB->Predecessors.empty()) { OS << " Predecessors: "; auto Sep = ""; @@ -331,13 +335,19 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { nulls(), nulls())) { // Ignore this function. Skip to the next one. + errs() << "FLO-WARNING: unable to disassemble instruction at offset 0x" + << Twine::utohexstr(Offset) << " (address 0x" + << Twine::utohexstr(getAddress() + Offset) << ") in function " + << getName() << '\n'; IsSimple = false; break; } if (MIA->isUnsupported(Instruction)) { - DEBUG(dbgs() << "FLO: unsupported instruction seen. 
Skipping function " - << getName() << ".\n"); + errs() << "FLO-WARNING: unsupported instruction seen at offset 0x" + << Twine::utohexstr(Offset) << " (address 0x" + << Twine::utohexstr(getAddress() + Offset) << ") in function " + << getName() << '\n'; IsSimple = false; break; } @@ -365,9 +375,9 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { TargetSymbol = Ctx->getOrCreateSymbol(getName()); } else { // Possibly an old-style PIC code - DEBUG(dbgs() << "FLO: internal call detected at 0x" - << Twine::utohexstr(AbsoluteInstrAddr) - << " in function " << getName() << "\n"); + errs() << "FLO: internal call detected at 0x" + << Twine::utohexstr(AbsoluteInstrAddr) + << " in function " << getName() << ". Skipping.\n"; IsSimple = false; } } @@ -433,11 +443,17 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // Should be an indirect call or an indirect branch. Bail out on the // latter case. if (MIA->isIndirectBranch(Instruction)) { + DEBUG(dbgs() << "FLO-WARNING: indirect branch detected at 0x" + << Twine::utohexstr(AbsoluteInstrAddr) + << ". Skipping function " << getName() << ".\n"); IsSimple = false; } // Indirect call. We only need to fix it if the operand is RIP-relative if (MIA->hasRIPOperand(Instruction)) { if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { + errs() << "FLO-WARNING: cannot handle RIP operand at 0x" + << Twine::utohexstr(AbsoluteInstrAddr) + << ". Skipping function " << getName() << ".\n"; IsSimple = false; } } @@ -445,6 +461,9 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { } else { if (MIA->hasRIPOperand(Instruction)) { if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { + errs() << "FLO-WARNING: cannot handle RIP operand at 0x" + << Twine::utohexstr(AbsoluteInstrAddr) + << ". 
Skipping function " << getName() << ".\n"; IsSimple = false; } } @@ -829,22 +848,32 @@ bool BinaryFunction::fixCFIState() { return true; assert(FromState < ToState); + std::vector NewCFIs; + uint32_t NestedLevel = 0; for (uint32_t CurState = FromState; CurState < ToState; ++CurState) { MCCFIInstruction *Instr = &FrameInstructions[CurState]; - if (Instr->getOperation() == MCCFIInstruction::OpRememberState || - Instr->getOperation() == MCCFIInstruction::OpRestoreState) { - // TODO: If in replaying the CFI instructions to reach this state we - // have state stack instructions, we could still work out the logic - // to extract only the necessary instructions to reach this state - // without using the state stack. Not sure if it is worth the effort - // because this happens rarely. - errs() << "FLO-WARNING: CFI rewriter expected state " << ToState - << " but found " << FromState << " instead (@ " << getName() - << "). Giving up this function.\n"; - return false; - } - InsertIt = - addCFIPseudo(InBB, InsertIt, Instr - &*FrameInstructions.begin()); + if (Instr->getOperation() == MCCFIInstruction::OpRememberState) + ++NestedLevel; + if (!NestedLevel) + NewCFIs.push_back(CurState); + if (Instr->getOperation() == MCCFIInstruction::OpRestoreState) + --NestedLevel; + } + + // TODO: If in replaying the CFI instructions to reach this state we + // have state stack instructions, we could still work out the logic + // to extract only the necessary instructions to reach this state + // without using the state stack. Not sure if it is worth the effort + // because this happens rarely. 
+ if (NestedLevel != 0) { + errs() << "FLO-WARNING: CFI rewriter detected nested CFI state while " + << " replaying CFI instructions for BB " << InBB->getName() + << " in function " << getName() << '\n'; + return false; + } + + for(auto CFI : NewCFIs) { + InsertIt = addCFIPseudo(InBB, InsertIt, CFI); ++InsertIt; } @@ -863,66 +892,64 @@ bool BinaryFunction::fixCFIState() { BB->IsCold != BasicBlocksLayout[I - 1]->IsCold) State = 0; - // Check if state is what this BB expect it to be at its entry point - if (BBCFIState[BBIndex] != State) { - // Need to recover the correct state - if (BBCFIState[BBIndex] < State) { - // In this case, State is currently higher than what this BB expect it - // to be. To solve this, we need to insert a CFI instruction to remember - // the old state at function entry, then another CFI instruction to - // restore it at the entry of this BB and replay CFI instructions to - // reach the desired state. - uint32_t OldState = BBCFIState[BBIndex]; - // Remember state at function entry point (our reference state). - BinaryBasicBlock::const_iterator InsertIt = EntryBB->begin(); - while (InsertIt != EntryBB->end() && BC.MIA->isCFI(*InsertIt)) - ++InsertIt; - addCFIPseudo(EntryBB, InsertIt, FrameInstructions.size()); - FrameInstructions.emplace_back( - MCCFIInstruction::createRememberState(nullptr)); - // Restore state - InsertIt = addCFIPseudo(BB, BB->begin(), FrameInstructions.size()); + // We need to recover the correct state if it doesn't match expected + // state at BB entry point. + if (BBCFIState[BBIndex] < State) { + // In this case, State is currently higher than what this BB expect it + // to be. To solve this, we need to insert a CFI instruction to remember + // the old state at function entry, then another CFI instruction to + // restore it at the entry of this BB and replay CFI instructions to + // reach the desired state. + uint32_t OldState = BBCFIState[BBIndex]; + // Remember state at function entry point (our reference state). 
+ BinaryBasicBlock::const_iterator InsertIt = EntryBB->begin(); + while (InsertIt != EntryBB->end() && BC.MIA->isCFI(*InsertIt)) ++InsertIt; - FrameInstructions.emplace_back( - MCCFIInstruction::createRestoreState(nullptr)); - if (!replayCFIInstrs(0, OldState, BB, InsertIt)) - return false; - // Check if we messed up the stack in this process - int StackOffset = 0; - for (BinaryBasicBlock *CurBB : BasicBlocksLayout) { - if (CurBB == BB) - break; - for (auto &Instr : *CurBB) { - if (MCCFIInstruction *CFI = getCFIFor(Instr)) { - if (CFI->getOperation() == MCCFIInstruction::OpRememberState) - ++StackOffset; - if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) - --StackOffset; - } + addCFIPseudo(EntryBB, InsertIt, FrameInstructions.size()); + FrameInstructions.emplace_back( + MCCFIInstruction::createRememberState(nullptr)); + // Restore state + InsertIt = addCFIPseudo(BB, BB->begin(), FrameInstructions.size()); + ++InsertIt; + FrameInstructions.emplace_back( + MCCFIInstruction::createRestoreState(nullptr)); + if (!replayCFIInstrs(0, OldState, BB, InsertIt)) + return false; + // Check if we messed up the stack in this process + int StackOffset = 0; + for (BinaryBasicBlock *CurBB : BasicBlocksLayout) { + if (CurBB == BB) + break; + for (auto &Instr : *CurBB) { + if (MCCFIInstruction *CFI = getCFIFor(Instr)) { + if (CFI->getOperation() == MCCFIInstruction::OpRememberState) + ++StackOffset; + if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) + --StackOffset; } } - auto Pos = BB->begin(); - while (MCCFIInstruction *CFI = getCFIFor(*Pos++)) { - if (CFI->getOperation() == MCCFIInstruction::OpRememberState) - ++StackOffset; - if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) - --StackOffset; - } + } + auto Pos = BB->begin(); + while (MCCFIInstruction *CFI = getCFIFor(*Pos++)) { + if (CFI->getOperation() == MCCFIInstruction::OpRememberState) + ++StackOffset; + if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) + --StackOffset; + } - if 
(StackOffset != 0) { - errs() << " FLO-WARNING: not possible to remember/recover state" - << "without corrupting CFI state stack in function " - << getName() << "\n"; - return false; - } - } else { - // If BBCFIState[BBIndex] > State, it means we are behind in the - // state. Just emit all instructions to reach this state at the - // beginning of this BB. If this sequence of instructions involve - // remember state or restore state, bail out. - if (!replayCFIInstrs(State, BBCFIState[BBIndex], BB, BB->begin())) - return false; + if (StackOffset != 0) { + errs() << " FLO-WARNING: not possible to remember/recover state" + << "without corrupting CFI state stack in function " + << getName() << "\n"; + return false; } + } else if (BBCFIState[BBIndex] > State) { + // If BBCFIState[BBIndex] > State, it means we are behind in the + // state. Just emit all instructions to reach this state at the + // beginning of this BB. If this sequence of instructions involve + // remember state or restore state, bail out. + if (!replayCFIInstrs(State, BBCFIState[BBIndex], BB, BB->begin())) + return false; } State = BBCFIState[BBIndex + 1]; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index cc39ae2e8105..b46b631b5b87 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -603,8 +603,8 @@ void RewriteInstance::disassembleFunctions() { if (SymRefI != FileSymRefs.end()) { auto MaxSize = SymRefI->first - Function.getAddress(); if (MaxSize < Function.getSize()) { - DEBUG(dbgs() << "FLO: symbol seen in the middle of the function " - << Function.getName() << ". Skipping.\n"); + errs() << "FLO-WARNING: symbol seen in the middle of the function " + << Function.getName() << ". Skipping.\n"; Function.setSimple(false); continue; } @@ -676,6 +676,38 @@ void RewriteInstance::disassembleFunctions() { "another function. 
We will not process this function.\n"; Func.setSimple(false); } + + uint64_t NumSimpleFunctions{0}; + std::vector ProfiledFunctions; + for (auto &BFI : BinaryFunctions) { + if (!BFI.second.isSimple()) + continue; + ++NumSimpleFunctions; + if (BFI.second.getExecutionCount() != BinaryFunction::COUNT_NO_PROFILE) + ProfiledFunctions.push_back(&BFI.second); + } + + errs() << "FLO-INFO: " << ProfiledFunctions.size() << " functions out of " + << NumSimpleFunctions + << " simple functions (" + << format("%.f", + ProfiledFunctions.size() / + (float) NumSimpleFunctions * 100.0) + << "%) have non-empty execution profile.\n"; + + if (ProfiledFunctions.size() > 10) { + errs() << "FLO-INFO: top called functions are:\n"; + std::sort(ProfiledFunctions.begin(), ProfiledFunctions.end(), + [](BinaryFunction *A, BinaryFunction *B) { + return B->getExecutionCount() < A->getExecutionCount(); + } + ); + auto SFI = ProfiledFunctions.begin(); + for(int i = 0; i < 50 && SFI != ProfiledFunctions.end(); ++SFI, ++i) { + errs() << " " << (*SFI)->getName() << " : " + << (*SFI)->getExecutionCount() << '\n'; + } + } } void RewriteInstance::runOptimizationPasses() { @@ -745,13 +777,15 @@ void RewriteInstance::runOptimizationPasses() { // Update exception handling information. Function.updateEHRanges(); - if (opts::PrintAll || opts::PrintEHRanges) { + if (opts::PrintAll || opts::PrintEHRanges) Function.print(errs(), "after updating EH ranges"); - } // Fix the CFI state. - if (!Function.fixCFIState()) + if (!Function.fixCFIState()) { + errs() << "FLO-WARNING: unable to fix CFI state for function " + << Function.getName() << ". Skipping.\n"; Function.setSimple(false); + } } } From 21d401682ed49451847821f9a9066d8431095559 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 19 Jan 2016 00:20:06 -0800 Subject: [PATCH 061/904] Don't create empty basic blocks. Fix CFI bug. Summary: Some basic blocks were created empty because they only contained alignment nop's. 
Ignore such nop's before basic block gets created. Fixed intermittent aborts related to CFI update. (cherry picked from commit 0574657b78cd8f9f6e274b3b29cbe2426ec49490) --- bolt/BinaryFunction.cpp | 20 +++++++++++--------- bolt/RewriteInstance.cpp | 14 ++++++++------ 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 76cbbf05e1d4..ee79308f4c11 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -542,6 +542,13 @@ bool BinaryFunction::buildCFG() { InsertBB = addBasicBlock(LI->first, LI->second, /* DeriveAlignment = */ IsLastInstrNop); } + // Ignore nops. We use nops to derive alignment of the next basic block. + // It will not always work, as some blocks are naturally aligned, but + // it's just part of heuristic for block alignment. + if (MIA->isNoop(InstrInfo.second)) { + IsLastInstrNop = true; + continue; + } if (!InsertBB) { // It must be a fallthrough or unreachable code. Create a new block unless // we see an unconditional branch following a conditional one. @@ -561,13 +568,6 @@ bool BinaryFunction::buildCFG() { // Add associated CFI pseudos in the first offset (0) addCFIPlaceholders(0, InsertBB); } - // Ignore nops. We use nops to derive alignment of the next basic block. - // It will not always work, as some blocks are naturally aligned, but - // it's just part of heuristic for block alignment. 
- if (MIA->isNoop(InstrInfo.second)) { - IsLastInstrNop = true; - continue; - } IsLastInstrNop = false; InsertBB->addInstruction(InstrInfo.second); @@ -930,16 +930,18 @@ bool BinaryFunction::fixCFIState() { } } auto Pos = BB->begin(); - while (MCCFIInstruction *CFI = getCFIFor(*Pos++)) { + while (Pos != BB->end() && BC.MIA->isCFI(*Pos)) { + auto CFI = getCFIFor(*Pos); if (CFI->getOperation() == MCCFIInstruction::OpRememberState) ++StackOffset; if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) --StackOffset; + ++Pos; } if (StackOffset != 0) { errs() << " FLO-WARNING: not possible to remember/recover state" - << "without corrupting CFI state stack in function " + << " without corrupting CFI state stack in function " << getName() << "\n"; return false; } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index b46b631b5b87..71d2a9b33626 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -690,7 +690,7 @@ void RewriteInstance::disassembleFunctions() { errs() << "FLO-INFO: " << ProfiledFunctions.size() << " functions out of " << NumSimpleFunctions << " simple functions (" - << format("%.f", + << format("%.1f", ProfiledFunctions.size() / (float) NumSimpleFunctions * 100.0) << "%) have non-empty execution profile.\n"; @@ -775,17 +775,19 @@ void RewriteInstance::runOptimizationPasses() { // Post-processing passes. - // Update exception handling information. - Function.updateEHRanges(); - if (opts::PrintAll || opts::PrintEHRanges) - Function.print(errs(), "after updating EH ranges"); - // Fix the CFI state. if (!Function.fixCFIState()) { errs() << "FLO-WARNING: unable to fix CFI state for function " << Function.getName() << ". Skipping.\n"; Function.setSimple(false); + continue; } + + // Update exception handling information. 
+ Function.updateEHRanges(); + if (opts::PrintAll || opts::PrintEHRanges) + Function.print(errs(), "after updating EH ranges"); + } } From 5a26daf08f601ad160097296ba0c51732e0fbd8b Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 22 Jan 2016 16:45:39 -0800 Subject: [PATCH 062/904] Allow to partially split functions with exceptions. Summary: We could split functions with exceptions even without creating a new exception handling table. This limits us to only move basic blocks that never throw, and are not a start of a landing pad. (cherry picked from commit 00a987f7248e53ff1f72d2dd224e5b36261a837c) --- bolt/BinaryBasicBlock.h | 10 +++++ bolt/BinaryFunction.cpp | 92 ++++++++++++++++++++++++++++++++-------- bolt/BinaryFunction.h | 26 +++++++++--- bolt/Exceptions.cpp | 29 ++++++------- bolt/RewriteInstance.cpp | 14 +++--- 5 files changed, 126 insertions(+), 45 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 88d87a937f58..2230e3aa8699 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -55,6 +55,10 @@ class BinaryBasicBlock { /// this BB will be allocated outside its parent function. bool IsCold{false}; + /// Indicates if any of instructions in this basic block could throw + /// an exception. + bool CanThrow{false}; + /// Vector of all instructions in the block. std::vector Instructions; @@ -237,6 +241,12 @@ class BinaryBasicBlock { return IsCold; } + /// Return true if any of instructions in this basic block can throw + /// an exception. 
+ bool canThrow() const { + return CanThrow; + } + bool eraseInstruction(MCInst *Inst) { auto I = Instructions.end(); auto B = Instructions.begin(); diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index ee79308f4c11..905aa97483e6 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1077,16 +1077,6 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool Split) { ClusterEdges[I][J] += Weight[elmt]; } } - DEBUG(for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { - dbgs() << "Cluster number " << I << ": "; - auto Sep = ""; - for (auto BB : Clusters[I]) { - dbgs() << Sep << BB->getName(); - Sep = ", "; - } - dbgs() << "\n"; - }); - std::vector Order; // Cluster layout order // Here we have 3 conflicting goals as to how to layout clusters. If we want @@ -1103,11 +1093,24 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool Split) { double Freq = 0.0; for (auto BB : Clusters[I]) { if (!BB->empty() && BB->size() != BB->getNumPseudos()) - Freq += BB->getExecutionCount() / (BB->size() - BB->getNumPseudos()); + Freq += ((double) BB->getExecutionCount()) / + (BB->size() - BB->getNumPseudos()); } AvgFreq[I] = Freq; } + DEBUG( + for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { + errs() << "Cluster number " << I << " (frequency: " << AvgFreq[I] << ") : "; + auto Sep = ""; + for (auto BB : Clusters[I]) { + errs() << Sep << BB->getName(); + Sep = ", "; + } + errs() << "\n"; + }; + ); + switch(Type) { case LT_OPTIMIZE: { for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) @@ -1201,6 +1204,17 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool Split) { llvm_unreachable("unexpected layout type"); } + DEBUG( + errs() << "New cluster order: "; + auto Sep = ""; + for(auto O : Order) { + errs() << Sep << O; + Sep = ", "; + } + errs() << '\n'; + ); + + BasicBlocksLayout.clear(); for (auto I : Order) { auto &Cluster = Clusters[I]; @@ -1454,14 +1468,56 @@ void BinaryFunction::splitFunction() { return; 
assert(BasicBlocksLayout.size() > 0); + // Separate hot from cold - for (auto I = BasicBlocksLayout.rbegin(), E = BasicBlocksLayout.rend(); - I != E; ++I) { - BinaryBasicBlock *BB = *I; - if (BB->getExecutionCount() != 0) - break; - BB->IsCold = true; - IsSplit = true; + if (!hasEHRanges()) { + for (auto I = BasicBlocksLayout.rbegin(), E = BasicBlocksLayout.rend(); + I != E; ++I) { + BinaryBasicBlock *BB = *I; + if (BB->getExecutionCount() != 0) + break; + BB->IsCold = true; + IsSplit = true; + } + } else { + // We cannot move a block that can throw since exception-handling + // runtime cannot deal with split functions. However, if we can guarantee + // that the block never throws, it is safe to move the block to + // decrease the size of the function. + // + // We also cannot move landing pads (or rather entry points for landing + // pads) for the same reason. + for (auto &BB : BasicBlocks) { + if (LandingPads.find(BB.getLabel()) != LandingPads.end()) { + BB.CanThrow = true; + continue; + } + for (auto &Instr : BB) { + if (BC.MIA->isInvoke(Instr)) { + BB.CanThrow = true; + break; + } + } + } + std::stable_sort(BasicBlocksLayout.begin(), BasicBlocksLayout.end(), + [&] (BinaryBasicBlock *A, BinaryBasicBlock *B) { + if (A->getExecutionCount() != 0 || B->getExecutionCount() != 0) + return false; + bool CouldMoveA = !A->canThrow(); + bool CouldMoveB = !B->canThrow(); + return CouldMoveA < CouldMoveB; + }); + + for (auto I = BasicBlocksLayout.rbegin(), E = BasicBlocksLayout.rend(); + I != E; ++I) { + BinaryBasicBlock *BB = *I; + if (BB->getExecutionCount() != 0) + break; + if (BB->canThrow()) + break; + BB->IsCold = true; + IsSplit = true; + } } } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 8ca6446f1ca4..6adabf491d73 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -113,6 +113,13 @@ class BinaryFunction { /// flow graph and re-assemble. 
bool IsSimple{true}; + /// True if this function needs to be emitted in two separate parts, one for + /// the hot basic blocks and another for the cold basic blocks. + bool IsSplit{false}; + + /// Indicate if this function has associated exception handling metadata. + bool HasEHRanges{false}; + MCSymbol *PersonalityFunction{nullptr}; uint8_t PersonalityEncoding{dwarf::DW_EH_PE_sdata4 | dwarf::DW_EH_PE_pcrel}; @@ -140,10 +147,6 @@ class BinaryFunction { /// Landing pads for the function. std::set LandingPads; - /// True if this function needs to be emitted in two separate parts, one for - /// the hot basic blocks and another for the cold basic blocks. - bool IsSplit{false}; - /// Release storage used by instructions. BinaryFunction &clearInstructions() { InstrMapType TempMap; @@ -251,6 +254,9 @@ class BinaryFunction { /// Symbol in the output. const MCSymbol *OutputSymbol; + /// Symbol at the end of the function. + MCSymbol *FunctionEndLabel{nullptr}; + /// Unique number associated with the function. uint64_t FunctionNumber; @@ -370,6 +376,15 @@ class BinaryFunction { return OutputSymbol; } + /// Return MC symbol associtated with the end of the function. + MCSymbol *getFunctionEndLabel() { + assert(BC.Ctx && "cannot be called with empty context"); + if (!FunctionEndLabel) { + FunctionEndLabel = BC.Ctx->createTempSymbol("func_end", true); + } + return FunctionEndLabel; + } + /// Return internal section name for this function. 
StringRef getCodeSectionName() const { assert(!CodeSectionName.empty() && "no section name for function"); @@ -417,6 +432,7 @@ class BinaryFunction { BinaryBasicBlock *addBasicBlock(uint64_t Offset, MCSymbol *Label, bool DeriveAlignment = false) { assert(!getBasicBlockAtOffset(Offset) && "basic block already exists"); + assert(BC.Ctx && "cannot be called with empty context"); if (!Label) Label = BC.Ctx->createTempSymbol("BB", true); BasicBlocks.emplace_back(BinaryBasicBlock(Label, Offset)); @@ -679,7 +695,7 @@ class BinaryFunction { void updateEHRanges(); /// Return true if the function has exception handling tables. - bool hasEHRanges() const { return !CallSites.empty(); } + bool hasEHRanges() const { return HasEHRanges; } /// Emit exception handling ranges for the function. void emitLSDA(MCStreamer *Streamer); diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 30676a481522..1418222ec849 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -152,11 +152,9 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, errs() << '\n'; } - unsigned NumCallSites = 0; + HasEHRanges = CallSitePtr < CallSiteTableEnd; uint64_t RangeBase = getAddress(); while (CallSitePtr < CallSiteTableEnd) { - ++NumCallSites; - uintptr_t Start = readEncodedPointer(CallSitePtr, CallSiteEncoding); uintptr_t Length = readEncodedPointer(CallSitePtr, CallSiteEncoding); uintptr_t LandingPad = readEncodedPointer(CallSitePtr, CallSiteEncoding); @@ -339,11 +337,18 @@ void BinaryFunction::updateEHRanges() { continue; // Same symbol is used for the beginning and the end of the range. - const MCSymbol *EHSymbol = BC.Ctx->createTempSymbol("EH", true); - MCInst EHLabel; - BC.MIA->createEHLabel(EHLabel, EHSymbol, BC.Ctx.get()); - II = BB->Instructions.insert(II, EHLabel); - ++II; + MCSymbol *EHSymbol{nullptr}; + if (BB->isCold()) { + // If we see a label in the cold block, it means we have to close + // the range using function end symbol. 
+ EHSymbol = getFunctionEndLabel(); + } else { + EHSymbol = BC.Ctx->createTempSymbol("EH", true); + MCInst EHLabel; + BC.MIA->createEHLabel(EHLabel, EHSymbol, BC.Ctx.get()); + II = BB->Instructions.insert(II, EHLabel); + ++II; + } // At this point we could be in the one of the following states: // @@ -384,11 +389,7 @@ void BinaryFunction::updateEHRanges() { // Check if we need to close the range. if (StartRange) { assert(!EndRange && "unexpected end of range"); - EndRange = BC.Ctx->createTempSymbol("EH", true); - MCInst EHLabel; - BC.MIA->createEHLabel(EHLabel, EndRange, BC.Ctx.get()); - BasicBlocksLayout.back()->Instructions.emplace_back(EHLabel); - + EndRange = getFunctionEndLabel(); CallSites.emplace_back(CallSite{StartRange, EndRange, PreviousEH.LP, PreviousEH.Action}); } @@ -400,8 +401,6 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer) { return; } - assert(!isSplit() && "split functions are not supported yet"); - // Calculate callsite table size. Size of each callsite entry is: // // sizeof(start) + sizeof(length) + sizeof(LP) + sizeof(uleb128(action)) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 71d2a9b33626..efe39c94f5e7 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -703,7 +703,7 @@ void RewriteInstance::disassembleFunctions() { } ); auto SFI = ProfiledFunctions.begin(); - for(int i = 0; i < 50 && SFI != ProfiledFunctions.end(); ++SFI, ++i) { + for(int i = 0; i < 100 && SFI != ProfiledFunctions.end(); ++SFI, ++i) { errs() << " " << (*SFI)->getName() << " : " << (*SFI)->getExecutionCount() << '\n'; } @@ -886,7 +886,7 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, Streamer.EmitCFIPersonality(Function.getPersonalityFunction(), Function.getPersonalityEncoding()); } - if (Function.getLSDASymbol()) { + if (!EmitColdPart && Function.getLSDASymbol()) { Streamer.EmitCFILsda(Function.getLSDASymbol(), BC.MOFI->getLSDAEncoding()); } else { @@ -941,8 +941,12 @@ void emitFunction(MCStreamer 
&Streamer, BinaryFunction &Function, if (Function.hasCFI() && HasExtraStorage) Streamer.EmitCFIEndProc(); + if (!EmitColdPart && Function.getFunctionEndLabel()) + Streamer.EmitLabel(Function.getFunctionEndLabel()); + // Emit LSDA before anything else? - Function.emitLSDA(&Streamer); + if (!EmitColdPart) + Function.emitLSDA(&Streamer); // TODO: is there any use in emiting end of function? // Perhaps once we have a support for C++ exceptions. @@ -1166,10 +1170,6 @@ bool RewriteInstance::splitLargeFunctions() { if (Function.getImageSize() <= Function.getMaxSize()) continue; - // Don't split functions with exception ranges. - if (Function.hasEHRanges()) - continue; - ToSplit.insert(BFI.first); Changed = true; } From 59e70e5752f2df34293059a33292b4dbbddf5dde Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 26 Jan 2016 16:03:58 -0800 Subject: [PATCH 063/904] Fix a bug with outlining first basic block. Summary: We should never outline the first basic block. Also add an option to accept a file with the list of functions to optimize. (cherry picked from commit fa0e42fb442b1003ecaa114d128c97eec8091b9b) --- bolt/BinaryBasicBlock.h | 12 +++++------- bolt/BinaryFunction.cpp | 15 +++++++++------ bolt/RewriteInstance.cpp | 17 +++++++++++++++++ 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 2230e3aa8699..056557faec64 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -55,9 +55,8 @@ class BinaryBasicBlock { /// this BB will be allocated outside its parent function. bool IsCold{false}; - /// Indicates if any of instructions in this basic block could throw - /// an exception. - bool CanThrow{false}; + /// Indicates if the block could be outlined. + bool CanOutline{true}; /// Vector of all instructions in the block. 
std::vector Instructions; @@ -241,10 +240,9 @@ class BinaryBasicBlock { return IsCold; } - /// Return true if any of instructions in this basic block can throw - /// an exception. - bool canThrow() const { - return CanThrow; + /// Return true if the block could be outlined. + bool canOutline() const { + return CanOutline; } bool eraseInstruction(MCInst *Inst) { diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 905aa97483e6..6fb9865674c9 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1487,14 +1487,19 @@ void BinaryFunction::splitFunction() { // // We also cannot move landing pads (or rather entry points for landing // pads) for the same reason. + // + // Never move the first basic block. + BasicBlocks.front().CanOutline = false; for (auto &BB : BasicBlocks) { + if (!BB.CanOutline) + continue; if (LandingPads.find(BB.getLabel()) != LandingPads.end()) { - BB.CanThrow = true; + BB.CanOutline = false; continue; } for (auto &Instr : BB) { if (BC.MIA->isInvoke(Instr)) { - BB.CanThrow = true; + BB.CanOutline = false; break; } } @@ -1503,9 +1508,7 @@ void BinaryFunction::splitFunction() { [&] (BinaryBasicBlock *A, BinaryBasicBlock *B) { if (A->getExecutionCount() != 0 || B->getExecutionCount() != 0) return false; - bool CouldMoveA = !A->canThrow(); - bool CouldMoveB = !B->canThrow(); - return CouldMoveA < CouldMoveB; + return A->canOutline() < B->canOutline(); }); for (auto I = BasicBlocksLayout.rbegin(), E = BasicBlocksLayout.rend(); @@ -1513,7 +1516,7 @@ void BinaryFunction::splitFunction() { BinaryBasicBlock *BB = *I; if (BB->getExecutionCount() != 0) break; - if (BB->canThrow()) + if (!BB->canOutline()) break; BB->IsCold = true; IsSplit = true; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index efe39c94f5e7..305af7fb0f1d 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -47,6 +47,7 @@ #include "llvm/Support/ToolOutputFile.h" #include "llvm/Target/TargetMachine.h" #include +#include 
#include #include @@ -68,6 +69,10 @@ FunctionNames("funcs", cl::desc("list of functions to optimize"), cl::value_desc("func1,func2,func3,...")); +static cl::opt +FunctionNamesFile("funcs_file", + cl::desc("file with list of functions to optimize")); + static cl::list SkipFunctionNames("skip_funcs", cl::CommaSeparated, @@ -155,6 +160,15 @@ bool shouldProcess(const BinaryFunction &Function) { if (MaxFunctions && Function.getFunctionNumber() > MaxFunctions) return false; + if (!FunctionNamesFile.empty()) { + std::ifstream FuncsFile(FunctionNamesFile, std::ios::in); + std::string FuncName; + while (std::getline(FuncsFile, FuncName)) { + FunctionNames.push_back(FuncName); + } + FunctionNamesFile = ""; + } + bool IsValid = true; if (!FunctionNames.empty()) { IsValid = false; @@ -911,6 +925,9 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, } } + assert(!Function.begin()->isCold() && + "first basic block should never be cold"); + // Emit code. for (auto BB : Function.layout()) { if (EmitColdPart != BB->isCold()) From 0350f3190d648b156cda95849cf481c1bfcd27c1 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 26 Jan 2016 17:53:08 -0800 Subject: [PATCH 064/904] Enable REPNZ prefix support. Summary: I didn't see a case where REPNZ were not disassembled/reassembled properly. (cherry picked from commit 90cfdc35dec4f8eb83906ac1692024ac9ecd073c) --- bolt/BinaryBasicBlock.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 056557faec64..f93526dbacb8 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -240,7 +240,10 @@ class BinaryBasicBlock { return IsCold; } - /// Return true if the block could be outlined. + /// Return true if the block can be outlined. At the moment we disallow + /// outlining of blocks that can potentially throw exceptions or are + /// the beginning of a landing pad. The entry basic block also can + /// never be outlined. 
bool canOutline() const { return CanOutline; } From 2c3128b633db0ddd481ab97620605e4147788dbb Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 21 Jan 2016 14:18:30 -0800 Subject: [PATCH 065/904] Preserve layout of basic blocks with 0 profile counts. Summary: Preserve original layout for basic blocks that have 0 execution count. Since we don't optimize for size, it's better to rely on the original input order. (cherry picked from commit 7286a095caeed1452251c3fbcf367ed693406557) --- bolt/BinaryFunction.cpp | 69 +++++++++++++++++++++++++--------------- bolt/BinaryFunction.h | 9 +++++- bolt/RewriteInstance.cpp | 2 +- 3 files changed, 53 insertions(+), 27 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 6fb9865674c9..a7d431c79836 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -20,6 +20,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstPrinter.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include @@ -32,6 +33,13 @@ namespace llvm { namespace flo { +namespace opts { + +static cl::opt +PrintClusters("print-clusters", cl::desc("print clusters"), cl::Optional); + +} // namespace opts + uint64_t BinaryFunction::Count = 0; BinaryBasicBlock * @@ -198,8 +206,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << " Exec Count : " << BBExecCount << "\n"; } if (!BBCFIState.empty()) { - unsigned BBIndex = BB - &*BasicBlocks.begin(); - OS << " CFI State : " << BBCFIState[BBIndex] << '\n'; + OS << " CFI State : " << BBCFIState[getIndex(BB)] << '\n'; } if (!BB->Predecessors.empty()) { OS << " Predecessors: "; @@ -884,7 +891,7 @@ bool BinaryFunction::fixCFIState() { BinaryBasicBlock *EntryBB = *BasicBlocksLayout.begin(); for (uint32_t I = 0, E = BasicBlocksLayout.size(); I != E; ++I) { BinaryBasicBlock *BB = BasicBlocksLayout[I]; - uint32_t BBIndex = BB - &*BasicBlocks.begin(); + uint32_t 
BBIndex = getIndex(BB); // Hot-cold border: check if this is the first BB to be allocated in a cold // region (a different function). If yes, we need to reset the CFI state. @@ -1001,7 +1008,19 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool Split) { std::map Weight; // Define a comparison function to establish SWO between edges - auto Comp = [&Weight](EdgeTy A, EdgeTy B) { return Weight[A] < Weight[B]; }; + auto Comp = [&] (EdgeTy A, EdgeTy B) { + // With equal weights, prioritize branches with lower index + // source/destination. This helps to keep original block order for blocks + // when optimal order cannot be deducted from a profile. + if (Weight[A] == Weight[B]) { + uint32_t ASrcBBIndex = getIndex(A.first); + uint32_t BSrcBBIndex = getIndex(B.first); + if (ASrcBBIndex != BSrcBBIndex) + return ASrcBBIndex > BSrcBBIndex; + return getIndex(A.second) > getIndex(B.second); + } + return Weight[A] < Weight[B]; + }; std::priority_queue, decltype(Comp)> Queue(Comp); typedef std::vector ClusterTy; @@ -1089,7 +1108,7 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool Split) { // should put clusters in descending order of hotness. 
std::vector AvgFreq; AvgFreq.resize(Clusters.size(), 0.0); - for (uint32_t I = 1, E = Clusters.size(); I < E; ++I) { + for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { double Freq = 0.0; for (auto BB : Clusters[I]) { if (!BB->empty() && BB->size() != BB->getNumPseudos()) @@ -1099,17 +1118,18 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool Split) { AvgFreq[I] = Freq; } - DEBUG( - for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { - errs() << "Cluster number " << I << " (frequency: " << AvgFreq[I] << ") : "; - auto Sep = ""; - for (auto BB : Clusters[I]) { - errs() << Sep << BB->getName(); - Sep = ", "; - } - errs() << "\n"; - }; - ); + if (opts::PrintClusters) { + for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { + errs() << "Cluster number " << I << " (frequency: " << AvgFreq[I] + << ") : "; + auto Sep = ""; + for (auto BB : Clusters[I]) { + errs() << Sep << BB->getName(); + Sep = ", "; + } + errs() << "\n"; + }; + } switch(Type) { case LT_OPTIMIZE: { @@ -1204,16 +1224,15 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool Split) { llvm_unreachable("unexpected layout type"); } - DEBUG( - errs() << "New cluster order: "; - auto Sep = ""; - for(auto O : Order) { - errs() << Sep << O; - Sep = ", "; + if (opts::PrintClusters) { + errs() << "New cluster order: "; + auto Sep = ""; + for(auto O : Order) { + errs() << Sep << O; + Sep = ", "; + } + errs() << '\n'; } - errs() << '\n'; - ); - BasicBlocksLayout.clear(); for (auto I : Order) { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 6adabf491d73..e0ea39afe26d 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -339,7 +339,14 @@ class BinaryFunction { /// View CFG in graphviz program void viewGraph(); - /// Basic block iterator + /// Get basic block index assuming it belongs to this function. 
+ unsigned getIndex(const BinaryBasicBlock *BB) const { + assert(BB >= &BasicBlocks.front() && "wrong basic block"); + unsigned I = BB - &BasicBlocks.front(); + assert(I < BasicBlocks.size() && "wrong basic block"); + return I; + } + /// Return the name of the function as extracted from the binary file. StringRef getName() const { diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 305af7fb0f1d..1821381882fa 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -157,7 +157,7 @@ PrintReordered("print-reordered", // Check against lists of functions from options if we should // optimize the function with a given name. bool shouldProcess(const BinaryFunction &Function) { - if (MaxFunctions && Function.getFunctionNumber() > MaxFunctions) + if (opts::MaxFunctions && Function.getFunctionNumber() > opts::MaxFunctions) return false; if (!FunctionNamesFile.empty()) { From a928f1a025020e91de64f8f02fa82e6cadbd369f Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 5 Feb 2016 14:42:04 -0800 Subject: [PATCH 066/904] Rename binary optimizer to BOLT. Summary: BOLT - Binary Optimization and Layout Tool replaces FLO. I'm keeping .fdata extension for "feedback data". 
(cherry picked from commit 0c85fce10d9d04add8d9789a0d15be745be35e26) --- bolt/BinaryBasicBlock.cpp | 6 +- bolt/BinaryBasicBlock.h | 8 +-- bolt/BinaryContext.cpp | 4 +- bolt/BinaryContext.h | 8 +-- bolt/BinaryFunction.cpp | 34 ++++++------ bolt/BinaryFunction.h | 8 +-- bolt/CMakeLists.txt | 4 +- bolt/DataReader.cpp | 8 +-- bolt/DataReader.h | 12 ++-- bolt/Exceptions.cpp | 10 ++-- bolt/Exceptions.h | 8 +-- bolt/LLVMBuild.txt | 4 +- bolt/RewriteInstance.cpp | 82 ++++++++++++++-------------- bolt/RewriteInstance.h | 8 +-- bolt/{llvm-flo.cpp => llvm-bolt.cpp} | 14 ++--- 15 files changed, 109 insertions(+), 109 deletions(-) rename bolt/{llvm-flo.cpp => llvm-bolt.cpp} (89%) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 225b04f02eaa..c92a40a8a3fb 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -20,10 +20,10 @@ #include #undef DEBUG_TYPE -#define DEBUG_TYPE "flo" +#define DEBUG_TYPE "bolt" namespace llvm { -namespace flo { +namespace bolt { bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) { return LHS.Offset < RHS.Offset; @@ -63,5 +63,5 @@ void BinaryBasicBlock::removePredecessor(BinaryBasicBlock *Pred) { Predecessors.erase(I); } -} // namespace flo +} // namespace bolt } // namespace llvm diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index f93526dbacb8..d588b318a53d 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_FLO_BINARY_BASIC_BLOCK_H -#define LLVM_TOOLS_LLVM_FLO_BINARY_BASIC_BLOCK_H +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_BASIC_BLOCK_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_BASIC_BLOCK_H #include "llvm/ADT/StringRef.h" #include "llvm/ADT/ilist.h" @@ -28,7 +28,7 @@ #include namespace llvm { -namespace flo { +namespace bolt { class BinaryFunction; @@ -274,7 +274,7 @@ class BinaryBasicBlock { bool operator<(const BinaryBasicBlock &LHS, const 
BinaryBasicBlock &RHS); -} // namespace flo +} // namespace bolt } // namespace llvm #endif diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index eb0f6dd6ef72..88e784ff7963 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -15,7 +15,7 @@ #include "llvm/MC/MCSymbol.h" namespace llvm { -namespace flo { +namespace bolt { MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, Twine Prefix) { @@ -43,5 +43,5 @@ MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, return Symbol; } -} // namespace flo +} // namespace bolt } // namespace llvm diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 6f20615d5db4..f7d817aa1fe0 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_FLO_BINARY_CONTEXT_H -#define LLVM_TOOLS_LLVM_FLO_BINARY_CONTEXT_H +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H #include "llvm/ADT/Triple.h" #include "llvm/MC/MCAsmBackend.h" @@ -35,7 +35,7 @@ #include namespace llvm { -namespace flo { +namespace bolt { class DataReader; @@ -123,7 +123,7 @@ class BinaryContext { MCSymbol *getOrCreateGlobalSymbol(uint64_t Address, Twine Prefix); }; -} // namespace flo +} // namespace bolt } // namespace llvm #endif diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index a7d431c79836..7607d7e9ed34 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -28,10 +28,10 @@ #include #undef DEBUG_TYPE -#define DEBUG_TYPE "flo" +#define DEBUG_TYPE "bolt" namespace llvm { -namespace flo { +namespace bolt { namespace opts { @@ -311,7 +311,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { MCSymbol *TargetSymbol{nullptr}; if (!BC.MIA->evaluateRIPOperand(Instruction, Address, Size, TargetAddress)) { - DEBUG(dbgs() << "FLO: rip-relative operand could not be evaluated:\n"; + DEBUG(dbgs() << "BOLT: 
rip-relative operand could not be evaluated:\n"; BC.InstPrinter->printInst(&Instruction, dbgs(), "", *BC.STI); dbgs() << '\n'; Instruction.dump_pretty(dbgs(), BC.InstPrinter.get()); @@ -320,7 +320,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { } // FIXME: check that the address is in data, not in code. if (TargetAddress == 0) { - errs() << "FLO-WARNING: rip-relative operand is zero in function " + errs() << "BOLT-WARNING: rip-relative operand is zero in function " << getName() << ". Ignoring function.\n"; return false; } @@ -342,7 +342,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { nulls(), nulls())) { // Ignore this function. Skip to the next one. - errs() << "FLO-WARNING: unable to disassemble instruction at offset 0x" + errs() << "BOLT-WARNING: unable to disassemble instruction at offset 0x" << Twine::utohexstr(Offset) << " (address 0x" << Twine::utohexstr(getAddress() + Offset) << ") in function " << getName() << '\n'; @@ -351,7 +351,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { } if (MIA->isUnsupported(Instruction)) { - errs() << "FLO-WARNING: unsupported instruction seen at offset 0x" + errs() << "BOLT-WARNING: unsupported instruction seen at offset 0x" << Twine::utohexstr(Offset) << " (address 0x" << Twine::utohexstr(getAddress() + Offset) << ") in function " << getName() << '\n'; @@ -382,7 +382,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { TargetSymbol = Ctx->getOrCreateSymbol(getName()); } else { // Possibly an old-style PIC code - errs() << "FLO: internal call detected at 0x" + errs() << "BOLT: internal call detected at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << " in function " << getName() << ". 
Skipping.\n"; IsSimple = false; @@ -404,7 +404,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { } else { BC.InterproceduralBranchTargets.insert(InstructionTarget); if (!IsCall && Size == 2) { - errs() << "FLO-WARNING: relaxed tail call detected at 0x" + errs() << "BOLT-WARNING: relaxed tail call detected at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ". Code size will be increased.\n"; } @@ -424,7 +424,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // from the libraries. In reality more often than not it is // unreachable code, but we don't know it and have to emit calls // to 0 which make LLVM JIT unhappy. - errs() << "FLO-WARNING: Function " << getName() + errs() << "BOLT-WARNING: Function " << getName() << " has a call to address zero. Ignoring function.\n"; IsSimple = false; } @@ -450,7 +450,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // Should be an indirect call or an indirect branch. Bail out on the // latter case. if (MIA->isIndirectBranch(Instruction)) { - DEBUG(dbgs() << "FLO-WARNING: indirect branch detected at 0x" + DEBUG(dbgs() << "BOLT-WARNING: indirect branch detected at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ". Skipping function " << getName() << ".\n"); IsSimple = false; @@ -458,7 +458,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // Indirect call. We only need to fix it if the operand is RIP-relative if (MIA->hasRIPOperand(Instruction)) { if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { - errs() << "FLO-WARNING: cannot handle RIP operand at 0x" + errs() << "BOLT-WARNING: cannot handle RIP operand at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ". 
Skipping function " << getName() << ".\n"; IsSimple = false; @@ -468,7 +468,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { } else { if (MIA->hasRIPOperand(Instruction)) { if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { - errs() << "FLO-WARNING: cannot handle RIP operand at 0x" + errs() << "BOLT-WARNING: cannot handle RIP operand at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ". Skipping function " << getName() << ".\n"; IsSimple = false; @@ -765,7 +765,7 @@ void BinaryFunction::inferFallThroughCounts() { DEBUG({ if (BBExecCount < ReportedBranches) dbgs() - << "FLO-WARNING: Fall-through inference is slightly inconsistent. " + << "BOLT-WARNING: Fall-through inference is slightly inconsistent. " "exec frequency is less than the outgoing edges frequency (" << BBExecCount << " < " << ReportedBranches << ") for BB at offset 0x" @@ -873,7 +873,7 @@ bool BinaryFunction::fixCFIState() { // without using the state stack. Not sure if it is worth the effort // because this happens rarely. if (NestedLevel != 0) { - errs() << "FLO-WARNING: CFI rewriter detected nested CFI state while " + errs() << "BOLT-WARNING: CFI rewriter detected nested CFI state while " << " replaying CFI instructions for BB " << InBB->getName() << " in function " << getName() << '\n'; return false; @@ -947,7 +947,7 @@ bool BinaryFunction::fixCFIState() { } if (StackOffset != 0) { - errs() << " FLO-WARNING: not possible to remember/recover state" + errs() << " BOLT-WARNING: not possible to remember/recover state" << " without corrupting CFI state stack in function " << getName() << "\n"; return false; @@ -1431,7 +1431,7 @@ void BinaryFunction::fixBranches() { // invert this conditional branch logic so we can make this a fallthrough. 
if (TBB == FT && !HotColdBorder) { if (OldFT == nullptr) { - errs() << "FLO-ERROR: malfromed CFG for function " << getName() + errs() << "BOLT-ERROR: malfromed CFG for function " << getName() << " in basic block " << BB->getName() << '\n'; } assert(OldFT != nullptr && "malformed CFG"); @@ -1543,5 +1543,5 @@ void BinaryFunction::splitFunction() { } } -} // namespace flo +} // namespace bolt } // namespace llvm diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index e0ea39afe26d..ffb295ed55d4 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -14,8 +14,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_FLO_BINARY_FUNCTION_H -#define LLVM_TOOLS_LLVM_FLO_BINARY_FUNCTION_H +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_H #include "BinaryBasicBlock.h" #include "BinaryContext.h" @@ -39,7 +39,7 @@ using namespace llvm::object; namespace llvm { -namespace flo { +namespace bolt { /// BinaryFunction is a representation of machine-level function. // @@ -747,7 +747,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, return OS; } -} // namespace flo +} // namespace bolt } // namespace llvm #endif diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 4c6fcd50b9c2..2ee858fe5b85 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -11,8 +11,8 @@ set(LLVM_LINK_COMPONENTS Support ) -add_llvm_tool(llvm-flo - llvm-flo.cpp +add_llvm_tool(llvm-bolt + llvm-bolt.cpp BinaryBasicBlock.cpp BinaryContext.cpp BinaryFunction.cpp diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index a0db59b401d2..85e700e513fa 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// // -// This family of functions reads profile data written by the perf2flo -// utility and stores it in memory for llvm-flo consumption. 
+// This family of functions reads profile data written by the perf2bolt +// utility and stores it in memory for llvm-bolt consumption. // //===----------------------------------------------------------------------===// @@ -16,7 +16,7 @@ #include "DataReader.h" namespace llvm { -namespace flo { +namespace bolt { ErrorOr FuncBranchData::getBranch(uint64_t From, uint64_t To) const { @@ -50,7 +50,7 @@ DataReader::readPerfData(StringRef Path, raw_ostream &Diag) { } void DataReader::reportError(StringRef ErrorMsg) { - Diag << "Error reading flo data input file: line " << Line << ", column " + Diag << "Error reading bolt data input file: line " << Line << ", column " << Col << ": " << ErrorMsg << '\n'; } diff --git a/bolt/DataReader.h b/bolt/DataReader.h index a47bcf3ab324..a5b711737ed0 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -7,13 +7,13 @@ // //===----------------------------------------------------------------------===// // -// This family of functions reads profile data written by the perf2flo -// utility and stores it in memory for llvm-flo consumption. +// This family of functions reads profile data written by the perf2bolt +// utility and stores it in memory for llvm-bolt consumption. // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_FLO_DATA_READER_H -#define LLVM_TOOLS_LLVM_FLO_DATA_READER_H +#ifndef LLVM_TOOLS_LLVM_BOLT_DATA_READER_H +#define LLVM_TOOLS_LLVM_BOLT_DATA_READER_H #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringMap.h" @@ -24,7 +24,7 @@ #include "llvm/Support/raw_ostream.h" namespace llvm { -namespace flo { +namespace bolt { struct Location { bool IsSymbol; @@ -75,7 +75,7 @@ class DataReader { static ErrorOr> readPerfData(StringRef Path, raw_ostream &Diag); - /// Parses the input flo data file into internal data structures. We expect + /// Parses the input bolt data file into internal data structures. We expect /// the file format to follow the syntax below. 
/// /// diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 1418222ec849..c9355030218c 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -28,12 +28,12 @@ #include "llvm/Support/raw_ostream.h" #undef DEBUG_TYPE -#define DEBUG_TYPE "flo-exceptions" +#define DEBUG_TYPE "bolt-exceptions" using namespace llvm::dwarf; namespace llvm { -namespace flo { +namespace bolt { namespace opts { @@ -171,7 +171,7 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, MCSymbol *LPSymbol{nullptr}; if (LandingPad) { if (Instructions.find(LandingPad) == Instructions.end()) { - errs() << "FLO-WARNING: landing pad " << Twine::utohexstr(LandingPad) + errs() << "BOLT-WARNING: landing pad " << Twine::utohexstr(LandingPad) << " not pointing to an instruction in function " << getName() << " - ignoring.\n"; } else { @@ -500,7 +500,7 @@ void CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { const FDE &CurFDE = *I->second; if (Function.getSize() != CurFDE.getAddressRange()) { - errs() << "FLO-WARNING: CFI information size mismatch for function \"" + errs() << "BOLT-WARNING: CFI information size mismatch for function \"" << Function.getName() << "\"" << format(": Function size is %dB, CFI covers " "%dB\n", @@ -821,5 +821,5 @@ void CFIReaderWriter::rewriteHeaderFor(StringRef EHFrame, } } -} // namespace flo +} // namespace bolt } // namespace llvm diff --git a/bolt/Exceptions.h b/bolt/Exceptions.h index 9fd3212eb99f..3e98a3949e9a 100644 --- a/bolt/Exceptions.h +++ b/bolt/Exceptions.h @@ -9,8 +9,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_FLO_EXCEPTIONS_H -#define LLVM_TOOLS_LLVM_FLO_EXCEPTIONS_H +#ifndef LLVM_TOOLS_LLVM_BOLT_EXCEPTIONS_H +#define LLVM_TOOLS_LLVM_BOLT_EXCEPTIONS_H #include "BinaryContext.h" #include "llvm/ADT/ArrayRef.h" @@ -19,7 +19,7 @@ #include namespace llvm { -namespace flo { +namespace bolt { class BinaryFunction; @@ -58,7 +58,7 @@ class CFIReaderWriter { FDEsMap 
FDEs; }; -} // namespace flo +} // namespace bolt } // namespace llvm #endif diff --git a/bolt/LLVMBuild.txt b/bolt/LLVMBuild.txt index eb8a2efe4cd9..26a77a1b3eea 100644 --- a/bolt/LLVMBuild.txt +++ b/bolt/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./tools/llvm-flo/LLVMBuild.txt ---------------------------*- Conf -*--===; +;===- ./tools/llvm-bolt/LLVMBuild.txt ---------------------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -17,6 +17,6 @@ [component_0] type = Tool -name = llvm-flo +name = llvm-bolt parent = Tools required_libraries = MC MCDisassembler MCParser Object all-targets diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 1821381882fa..b17e1d427408 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -52,11 +52,11 @@ #include #undef DEBUG_TYPE -#define DEBUG_TYPE "flo" +#define DEBUG_TYPE "bolt" using namespace llvm; using namespace object; -using namespace flo; +using namespace bolt; namespace opts { @@ -199,7 +199,7 @@ bool shouldProcess(const BinaryFunction &Function) { static void report_error(StringRef Message, std::error_code EC) { assert(EC); - errs() << "FLO: '" << Message << "': " << EC.message() << ".\n"; + errs() << "BOLT-ERROR: '" << Message << "': " << EC.message() << ".\n"; exit(1); } @@ -225,7 +225,7 @@ uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size, IsReadOnly); } - DEBUG(dbgs() << "FLO: allocating " << (IsCode ? "code" : "data") + DEBUG(dbgs() << "BOLT: allocating " << (IsCode ? 
"code" : "data") << " section : " << SectionName << " with size " << Size << ", alignment " << Alignment << " at 0x" << ret << "\n"); @@ -239,7 +239,7 @@ uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size, } bool ExecutableFileMemoryManager::finalizeMemory(std::string *ErrMsg) { - DEBUG(dbgs() << "FLO: finalizeMemory()\n"); + DEBUG(dbgs() << "BOLT: finalizeMemory()\n"); return SectionMemoryManager::finalizeMemory(ErrMsg); } @@ -256,7 +256,7 @@ static std::unique_ptr CreateBinaryContext( *TheTriple, Error); if (!TheTarget) { - errs() << "FLO: " << Error; + errs() << "BOLT: " << Error; return nullptr; } @@ -384,7 +384,7 @@ void RewriteInstance::run() { if (opts::SplitFunctions && splitLargeFunctions()) { // Emit again because now some functions have been split - outs() << "FLO: split-functions: starting pass 2...\n"; + outs() << "BOLT: split-functions: starting pass 2...\n"; reset(); readSymbolTable(); readSpecialSections(); @@ -470,7 +470,7 @@ void RewriteInstance::readSymbolTable() { uint64_t Address = *AddressOrErr; if (Address == 0) { if (Symbol.getType() == SymbolRef::ST_Function) - errs() << "FLO-WARNING: function with 0 address seen\n"; + errs() << "BOLT-WARNING: function with 0 address seen\n"; continue; } @@ -581,7 +581,7 @@ void RewriteInstance::readSpecialSections() { } CFIRdWrt.reset(new CFIReaderWriter(*EHFrame, FrameHdrAddress, FrameHdrCopy)); if (!EHFrame->ParseError.empty()) { - errs() << "FLO-ERROR: EHFrame reader failed with message \"" + errs() << "BOLT-ERROR: EHFrame reader failed with message \"" << EHFrame->ParseError << "\"\n"; exit(1); } @@ -594,7 +594,7 @@ void RewriteInstance::disassembleFunctions() { BinaryFunction &Function = BFI.second; if (!opts::shouldProcess(Function)) { - DEBUG(dbgs() << "FLO: skipping processing function " << Function.getName() + DEBUG(dbgs() << "BOLT: skipping processing function " << Function.getName() << " per user request.\n"); continue; } @@ -606,7 +606,7 @@ void 
RewriteInstance::disassembleFunctions() { "wrong section for function"); if (!Section.isText() || Section.isVirtual() || !Section.getSize()) { // When could it happen? - errs() << "FLO: corresponding section is non-executable or empty " + errs() << "BOLT: corresponding section is non-executable or empty " << "for function " << Function.getName(); continue; } @@ -617,7 +617,7 @@ void RewriteInstance::disassembleFunctions() { if (SymRefI != FileSymRefs.end()) { auto MaxSize = SymRefI->first - Function.getAddress(); if (MaxSize < Function.getSize()) { - errs() << "FLO-WARNING: symbol seen in the middle of the function " + errs() << "BOLT-WARNING: symbol seen in the middle of the function " << Function.getName() << ". Skipping.\n"; Function.setSimple(false); continue; @@ -685,7 +685,7 @@ void RewriteInstance::disassembleFunctions() { uint64_t Offset = Addr - I->first; if (Offset == 0 || Offset >= Func.getSize()) continue; - errs() << "FLO-WARNING: Function " << Func.getName() + errs() << "BOLT-WARNING: Function " << Func.getName() << " has internal BBs that are target of a branch located in " "another function. 
We will not process this function.\n"; Func.setSimple(false); @@ -701,7 +701,7 @@ void RewriteInstance::disassembleFunctions() { ProfiledFunctions.push_back(&BFI.second); } - errs() << "FLO-INFO: " << ProfiledFunctions.size() << " functions out of " + errs() << "BOLT-INFO: " << ProfiledFunctions.size() << " functions out of " << NumSimpleFunctions << " simple functions (" << format("%.1f", @@ -710,7 +710,7 @@ void RewriteInstance::disassembleFunctions() { << "%) have non-empty execution profile.\n"; if (ProfiledFunctions.size() > 10) { - errs() << "FLO-INFO: top called functions are:\n"; + errs() << "BOLT-INFO: top called functions are:\n"; std::sort(ProfiledFunctions.begin(), ProfiledFunctions.end(), [](BinaryFunction *A, BinaryFunction *B) { return B->getExecutionCount() < A->getExecutionCount(); @@ -747,7 +747,7 @@ void RewriteInstance::runOptimizationPasses() { if (opts::EliminateUnreachable && Function.layout_size() > 0) { if (NagUser) { outs() - << "FLO-WARNING: Using -eliminate-unreachable is experimental and " + << "BOLT-WARNING: Using -eliminate-unreachable is experimental and " "unsafe for exceptions\n"; NagUser = false; } @@ -771,7 +771,7 @@ void RewriteInstance::runOptimizationPasses() { auto Count = Function.eraseDeadBBs(Reachable); if (Count) { - DEBUG(dbgs() << "FLO: Removed " << Count + DEBUG(dbgs() << "BOLT: Removed " << Count << " dead basic block(s) in function " << Function.getName() << '\n'); } @@ -791,7 +791,7 @@ void RewriteInstance::runOptimizationPasses() { // Fix the CFI state. if (!Function.fixCFIState()) { - errs() << "FLO-WARNING: unable to fix CFI state for function " + errs() << "BOLT-WARNING: unable to fix CFI state for function " << Function.getName() << ". 
Skipping.\n"; Function.setSimple(false); continue; @@ -1023,7 +1023,7 @@ void RewriteInstance::emitFunctions() { if (!opts::shouldProcess(Function)) continue; - DEBUG(dbgs() << "FLO: generating code for function \"" << Function.getName() + DEBUG(dbgs() << "BOLT: generating code for function \"" << Function.getName() << "\" : " << Function.getFunctionNumber() << '\n'); if (Function.hasCFI()) { @@ -1041,7 +1041,7 @@ void RewriteInstance::emitFunctions() { /*HasExtraStorage=*/ExtraStorage.Size != 0); } if (NoSpaceWarning) { - errs() << "FLO-WARNING: missing __flo_storage in this binary. No " + errs() << "BOLT-WARNING: missing __flo_storage in this binary. No " << "extra space left to allocate the new .eh_frame\n"; } @@ -1065,7 +1065,7 @@ void RewriteInstance::emitFunctions() { auto Resolver = orc::createLambdaResolver( [&](const std::string &Name) { - DEBUG(dbgs() << "FLO: looking for " << Name << "\n"); + DEBUG(dbgs() << "BOLT: looking for " << Name << "\n"); auto I = BC->GlobalSymbols.find(Name); if (I == BC->GlobalSymbols.end()) return RuntimeDyld::SymbolInfo(nullptr); @@ -1073,7 +1073,7 @@ void RewriteInstance::emitFunctions() { JITSymbolFlags::None); }, [](const std::string &S) { - DEBUG(dbgs() << "FLO: resolving " << S << "\n"); + DEBUG(dbgs() << "BOLT: resolving " << S << "\n"); return nullptr; } ); @@ -1093,7 +1093,7 @@ void RewriteInstance::emitFunctions() { auto SMII = EFMM->SectionMapInfo.find(Function.getCodeSectionName()); if (SMII != EFMM->SectionMapInfo.end()) { - DEBUG(dbgs() << "FLO: mapping 0x" + DEBUG(dbgs() << "BOLT: mapping 0x" << Twine::utohexstr(SMII->second.AllocAddress) << " to 0x" << Twine::utohexstr(Function.getAddress()) << '\n'); @@ -1103,7 +1103,7 @@ void RewriteInstance::emitFunctions() { Function.setImageAddress(SMII->second.AllocAddress); Function.setImageSize(SMII->second.Size); } else { - errs() << "FLO: cannot remap function " << Function.getName() << "\n"; + errs() << "BOLT: cannot remap function " << Function.getName() << "\n"; 
FailedAddresses.emplace_back(Function.getAddress()); } @@ -1115,7 +1115,7 @@ void RewriteInstance::emitFunctions() { if (SMII != EFMM->SectionMapInfo.end()) { // Align at a 16-byte boundary ExtraStorage.BumpPtr = RoundUpToAlignment(ExtraStorage.BumpPtr, 16); - DEBUG(dbgs() << "FLO: mapping 0x" + DEBUG(dbgs() << "BOLT: mapping 0x" << Twine::utohexstr(SMII->second.AllocAddress) << " to 0x" << Twine::utohexstr(ExtraStorage.BumpPtr) << " with size " << Twine::utohexstr(SMII->second.Size) @@ -1129,7 +1129,7 @@ void RewriteInstance::emitFunctions() { ExtraStorage.FileOffset); ExtraStorage.BumpPtr += SMII->second.Size; } else { - errs() << "FLO: cannot remap function " << Function.getName() << "\n"; + errs() << "BOLT: cannot remap function " << Function.getName() << "\n"; FailedAddresses.emplace_back(Function.getAddress()); } } @@ -1144,7 +1144,7 @@ void RewriteInstance::emitFunctions() { SectionInfo &SI = SMII->second; ExtraStorage.BumpPtr = RoundUpToAlignment(ExtraStorage.BumpPtr, SI.Alignment); - DEBUG(dbgs() << "FLO: mapping 0x" + DEBUG(dbgs() << "BOLT: mapping 0x" << Twine::utohexstr(SI.AllocAddress) << " to 0x" << Twine::utohexstr(ExtraStorage.BumpPtr) << '\n'); @@ -1159,13 +1159,13 @@ void RewriteInstance::emitFunctions() { ExtraStorage.BumpPtr += SI.Size; } else { - errs() << "FLO: cannot remap " << SectionName << '\n'; + errs() << "BOLT: cannot remap " << SectionName << '\n'; } } if (ExtraStorage.BumpPtr - ExtraStorage.Addr > ExtraStorage.Size) { errs() << format( - "FLO fatal error: __flo_storage in this binary has not enough free " + "BOLT fatal error: __flo_storage in this binary has not enough free " "space (required %d bytes, available %d bytes).\n", ExtraStorage.BumpPtr - ExtraStorage.Addr, ExtraStorage.Size); exit(1); @@ -1259,7 +1259,7 @@ void RewriteInstance::rewriteFile() { continue; if (Function.getImageSize() > Function.getMaxSize()) { - errs() << "FLO-WARNING: new function size (0x" + errs() << "BOLT-WARNING: new function size (0x" << 
Twine::utohexstr(Function.getImageSize()) << ") is larger than maximum allowed size (0x" << Twine::utohexstr(Function.getMaxSize()) @@ -1270,7 +1270,7 @@ void RewriteInstance::rewriteFile() { OverwrittenScore += Function.getFunctionScore(); // Overwrite function in the output file. - outs() << "FLO: rewriting function \"" << Function.getName() << "\"\n"; + outs() << "BOLT: rewriting function \"" << Function.getName() << "\"\n"; Out->os().pwrite(reinterpret_cast(Function.getImageAddress()), Function.getImageSize(), Function.getFileOffset()); @@ -1285,14 +1285,14 @@ void RewriteInstance::rewriteFile() { ++CountOverwrittenFunctions; if (opts::MaxFunctions && CountOverwrittenFunctions == opts::MaxFunctions) { - outs() << "FLO: maximum number of functions reached\n"; + outs() << "BOLT: maximum number of functions reached\n"; break; } continue; } // Write cold part - outs() << "FLO: rewriting function \"" << Function.getName() + outs() << "BOLT: rewriting function \"" << Function.getName() << "\" (cold part)\n"; Out->os().pwrite(reinterpret_cast(Function.cold().getImageAddress()), Function.cold().getImageSize(), @@ -1300,12 +1300,12 @@ void RewriteInstance::rewriteFile() { ++CountOverwrittenFunctions; if (opts::MaxFunctions && CountOverwrittenFunctions == opts::MaxFunctions) { - outs() << "FLO: maximum number of functions reached\n"; + outs() << "BOLT: maximum number of functions reached\n"; break; } } - outs() << "FLO: " << CountOverwrittenFunctions + outs() << "BOLT: " << CountOverwrittenFunctions << " out of " << BinaryFunctions.size() << " functions were overwritten.\n"; @@ -1313,7 +1313,7 @@ void RewriteInstance::rewriteFile() { auto SMII = SectionMM->SectionMapInfo.find(".eh_frame"); if (SMII != SectionMM->SectionMapInfo.end()) { auto &EHFrameSI = SMII->second; - outs() << "FLO: writing a new .eh_frame_hdr\n"; + outs() << "BOLT: writing a new .eh_frame_hdr\n"; if (FrameHdrAlign > 1) { ExtraStorage.BumpPtr = RoundUpToAlignment(ExtraStorage.BumpPtr, FrameHdrAlign); 
@@ -1327,7 +1327,7 @@ void RewriteInstance::rewriteFile() { FailedAddresses); if (ExtraStorage.BumpPtr - ExtraStorage.Addr - ExtraStorage.Size < FrameHdrCopy.size()) { - errs() << "FLO fatal error: __flo_storage in this binary has not enough " + errs() << "BOLT fatal error: __flo_storage in this binary has not enough " "free space\n"; exit(1); } @@ -1335,7 +1335,7 @@ void RewriteInstance::rewriteFile() { uint64_t HdrFileOffset = ExtraStorage.BumpPtr - ExtraStorage.Addr + ExtraStorage.FileOffset; Out->os().pwrite(FrameHdrCopy.data(), FrameHdrCopy.size(), HdrFileOffset); - outs() << "FLO: patching EH_FRAME program segment to reflect new " + outs() << "BOLT: patching EH_FRAME program segment to reflect new " ".eh_frame_hdr\n"; if (auto ELF64LEFile = dyn_cast(File)) { auto Obj = ELF64LEFile->getELFFile(); @@ -1344,7 +1344,7 @@ void RewriteInstance::rewriteFile() { outs() << "FAILED to patch program segment!\n"; } } else { - outs() << "FLO-ERROR: program segment NOT patched -- I don't know how to " + outs() << "BOLT-ERROR: program segment NOT patched -- I don't know how to " "handle this object file!\n"; } } @@ -1354,7 +1354,7 @@ void RewriteInstance::rewriteFile() { SectionInfo &SI = SMII.second; if (SI.IsCode) continue; - outs() << "FLO: writing new section " << SMII.first << '\n'; + outs() << "BOLT: writing new section " << SMII.first << '\n'; Out->os().pwrite(reinterpret_cast(SI.AllocAddress), SI.Size, SI.FileOffset); @@ -1364,7 +1364,7 @@ void RewriteInstance::rewriteFile() { if (TotalScore != 0) { double Coverage = OverwrittenScore / (double)TotalScore * 100.0; - outs() << format("FLO: Rewritten functions cover %.2lf", Coverage) + outs() << format("BOLT: Rewritten functions cover %.2lf", Coverage) << "% of the execution count of simple functions of this binary.\n"; } diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 49573f8dac08..1a7a3f94b055 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -11,8 +11,8 @@ // 
//===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_FLO_REWRITE_INSTANCE_H -#define LLVM_TOOLS_LLVM_FLO_REWRITE_INSTANCE_H +#ifndef LLVM_TOOLS_LLVM_BOLT_REWRITE_INSTANCE_H +#define LLVM_TOOLS_LLVM_BOLT_REWRITE_INSTANCE_H #include "llvm/ADT/ArrayRef.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" @@ -27,7 +27,7 @@ class DWARFContext; class DWARFFrame; class tool_output_file; -namespace flo { +namespace bolt { class BinaryContext; class BinaryFunction; @@ -203,7 +203,7 @@ class RewriteInstance { }; -} // namespace flo +} // namespace bolt } // namespace llvm #endif diff --git a/bolt/llvm-flo.cpp b/bolt/llvm-bolt.cpp similarity index 89% rename from bolt/llvm-flo.cpp rename to bolt/llvm-bolt.cpp index 5a336b59bb1e..1c504ff00e3b 100644 --- a/bolt/llvm-flo.cpp +++ b/bolt/llvm-bolt.cpp @@ -1,4 +1,4 @@ -//===-- llvm-flo.cpp - Feedback-directed layout optimizer -----------------===// +//===-- llvm-bolt.cpp - Feedback-directed layout optimizer ----------------===// // // The LLVM Compiler Infrastructure // @@ -24,11 +24,11 @@ #include "llvm/Support/TargetRegistry.h" #undef DEBUG_TYPE -#define DEBUG_TYPE "flo" +#define DEBUG_TYPE "bolt" using namespace llvm; using namespace object; -using namespace flo; +using namespace bolt; namespace opts { @@ -39,7 +39,7 @@ static cl::opt InputDataFilename("data", cl::desc(""), cl::Optional); static cl::opt -DumpData("dump-data", cl::desc("dump parsed flo data and exit (debugging)"), +DumpData("dump-data", cl::desc("dump parsed bolt data and exit (debugging)"), cl::Hidden); } // namespace opts @@ -79,14 +79,14 @@ int main(int argc, char **argv) { if (!sys::fs::exists(opts::InputFilename)) report_error(opts::InputFilename, errc::no_such_file_or_directory); - std::unique_ptr DR(new DataReader(errs())); + std::unique_ptr DR(new DataReader(errs())); if (!opts::InputDataFilename.empty()) { if (!sys::fs::exists(opts::InputDataFilename)) report_error(opts::InputDataFilename, 
errc::no_such_file_or_directory); - // Attempt to read input flo data + // Attempt to read input bolt data auto ReaderOrErr = - flo::DataReader::readPerfData(opts::InputDataFilename, errs()); + bolt::DataReader::readPerfData(opts::InputDataFilename, errs()); if (std::error_code EC = ReaderOrErr.getError()) report_error(opts::InputDataFilename, EC); DR.reset(ReaderOrErr.get().release()); From 153dc6cb9b2753b8bef7fead362606330d95e404 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 8 Feb 2016 10:08:28 -0800 Subject: [PATCH 067/904] Keep intermediate .o file only under -keep-tmp option. Summary: We use intermediate .o file for debugging purposes, but there's no reason to generate it by default. Only do it if "-keep-tmp" is specified. (cherry picked from commit e5cd147351656470f34fbfbaab90539401904ebf) --- bolt/RewriteInstance.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index b17e1d427408..ab11f4c50d0d 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -153,6 +153,10 @@ PrintReordered("print-reordered", cl::desc("print functions after layout optimization"), cl::Hidden); +static cl::opt +KeepTmp("keep-tmp", + cl::desc("preserve intermediate .o file"), + cl::Hidden); // Check against lists of functions from options if we should // optimize the function with a given name. @@ -1172,7 +1176,9 @@ void RewriteInstance::emitFunctions() { } OLT.emitAndFinalize(ObjectsHandle); - TempOut->keep(); + + if (opts::KeepTmp) + TempOut->keep(); } bool RewriteInstance::splitLargeFunctions() { From 3dfaa52211246118eb0acf516480af4ff42d15af Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 8 Feb 2016 10:02:48 -0800 Subject: [PATCH 068/904] Drop requirement for __flo_storage in the input binary. Summary: We used to require pre-allocated space in the input binary so that we can write extra sections in there (.eh_frame, .eh_frame_hdr, .gcc_except_table, etc.). 
With this diff there's no further need for pre-allocated storage as we create a new segment and can use as much space as needed. There are certain limitations on where the new segment could be allocated, and as a result the size of the file may increase. There's currently a limitation if the binary size is close to 4GB we cannot allocate new segment prior to that and as a result we require debug info to be stripped to reduce the file size. The fix is in progress. (cherry picked from commit c2cc66fb4186261437160da17e9e9954dbe3dc4b) --- bolt/RewriteInstance.cpp | 377 ++++++++++++++++++++++----------------- bolt/RewriteInstance.h | 59 +++--- 2 files changed, 247 insertions(+), 189 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index ab11f4c50d0d..b58ebfb81ccf 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -237,7 +237,8 @@ uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size, SectionMapInfo[SectionName] = SectionInfo(reinterpret_cast(ret), Size, Alignment, - IsCode); + IsCode, + IsReadOnly); return ret; } @@ -374,12 +375,86 @@ void RewriteInstance::reset() { TotalScore = 0; } +void RewriteInstance::discoverStorage() { + auto ELF64LEFile = dyn_cast(File); + if (!ELF64LEFile) { + errs() << "BOLT-ERROR: only 64-bit LE ELF binaries are supported\n"; + exit(1); + } + + auto Obj = ELF64LEFile->getELFFile(); + + // Alignment should be the size of a page. + unsigned Align = 0x200000; + + // Discover important addresses in the binary. + + // This is where the first segment and ELF header were allocated. 
+ uint64_t FirstAllocAddress = std::numeric_limits::max(); + + NextAvailableAddress = 0; + for (const auto &Phdr : Obj->program_headers()) { + if (Phdr.p_type == ELF::PT_LOAD) { + FirstAllocAddress = std::min(FirstAllocAddress, + static_cast(Phdr.p_vaddr)); + NextAvailableAddress = std::max(NextAvailableAddress, + Phdr.p_vaddr + Phdr.p_memsz); + } + } + + assert(NextAvailableAddress && "no PT_LOAD pheader seen"); + + errs() << "BOLT-INFO: first alloc address is 0x" + << Twine::utohexstr(FirstAllocAddress) << '\n'; + + NextAvailableAddress = RoundUpToAlignment(NextAvailableAddress, Align); + + // Earliest available offset is the size of the input file. + uint64_t NextAvailableOffset = Obj->size(); + NextAvailableOffset = RoundUpToAlignment(NextAvailableOffset, Align); + + // This is where the black magic happens. Creating PHDR table in a segment + // other than that containing ELF header is tricky. Some loaders and/or + // parts of loaders will apply e_phoff from ELF header assuming both are in + // the same segment, while others will do the proper calculation. + // We create the new PHDR table in such a way that both of the methods + // of loading and locating the table work. There's a slight file size + // overhead because of that. + + if (NextAvailableOffset <= NextAvailableAddress - FirstAllocAddress) { + NextAvailableOffset = NextAvailableAddress - FirstAllocAddress; + } else { + NextAvailableAddress = NextAvailableOffset + FirstAllocAddress; + } + + assert(NextAvailableOffset == NextAvailableAddress - FirstAllocAddress && + "PHDR table address calculation error"); + + errs() << "BOLT-INFO: creating new program header table at address 0x" + << Twine::utohexstr(NextAvailableAddress) << '\n'; + + PHDRTableAddress = NextAvailableAddress; + PHDRTableOffset = NextAvailableOffset; + + // Reserve the space for 3 extra pheaders. 
+ unsigned Phnum = Obj->getHeader()->e_phnum; + Phnum += 3; + + NextAvailableAddress += Phnum * sizeof(ELFFile::Elf_Phdr); + NextAvailableOffset += Phnum * sizeof(ELFFile::Elf_Phdr); + + // TODO: insert alignment here if needed. + NewTextSegmentAddress = NextAvailableAddress; + NewTextSegmentOffset = NextAvailableOffset; +} + void RewriteInstance::run() { if (!BC) { errs() << "failed to create a binary context\n"; return; } + discoverStorage(); readSymbolTable(); readSpecialSections(); disassembleFunctions(); @@ -390,6 +465,7 @@ void RewriteInstance::run() { // Emit again because now some functions have been split outs() << "BOLT: split-functions: starting pass 2...\n"; reset(); + discoverStorage(); readSymbolTable(); readSpecialSections(); disassembleFunctions(); @@ -408,30 +484,6 @@ void RewriteInstance::run() { rewriteFile(); } -namespace { - -// Helper function to map a random memory address to a file offset. Returns 0 if -// this address cannot be mapped back to the file. -uint64_t discoverFileOffset(ELFObjectFileBase *File, uint64_t MemAddr) { - for (const auto &Section : File->sections()) { - uint64_t SecAddress = Section.getAddress(); - uint64_t Size = Section.getSize(); - if (MemAddr < SecAddress || - SecAddress + Size <= MemAddr) - continue; - - StringRef SectionContents; - check_error(Section.getContents(SectionContents), - "cannot get section contents"); - uint64_t SecFileOffset = SectionContents.data() - File->getData().data(); - uint64_t MemAddrSecOffset = MemAddr - SecAddress; - return SecFileOffset + MemAddrSecOffset; - } - return 0ULL; -} - -} // anonymous namespace - void RewriteInstance::readSymbolTable() { std::string FileSymbolName; @@ -449,20 +501,6 @@ void RewriteInstance::readSymbolTable() { ErrorOr Name = Symbol.getName(); check_error(Name.getError(), "cannot get symbol name"); - if (*Name == "__flo_storage") { - ExtraStorage.Addr = Symbol.getValue(); - ExtraStorage.BumpPtr = ExtraStorage.Addr; - ExtraStorage.FileOffset = 
discoverFileOffset(File, ExtraStorage.Addr); - assert(ExtraStorage.FileOffset != 0 && "Corrupt __flo_storage symbol"); - - FileSymRefs[ExtraStorage.Addr] = Symbol; - continue; - } - if (*Name == "__flo_storage_end") { - ExtraStorage.AddrEnd = Symbol.getValue(); - continue; - } - if (Symbol.getType() == SymbolRef::ST_File) { // Could be used for local symbol disambiguation. FileSymbolName = *Name; @@ -549,7 +587,6 @@ void RewriteInstance::readSymbolTable() { SymbolSize, *BC) ); } - ExtraStorage.Size = ExtraStorage.AddrEnd - ExtraStorage.Addr; } void RewriteInstance::readSpecialSections() { @@ -605,7 +642,7 @@ void RewriteInstance::disassembleFunctions() { SectionRef Section = Function.getSection(); assert(Section.getAddress() <= Function.getAddress() && - Section.getAddress() + Section.getSize() + Section.getAddress() + Section.getSize() >= Function.getAddress() + Function.getSize() && "wrong section for function"); if (!Section.isText() || Section.isVirtual() || !Section.getSize()) { @@ -813,7 +850,7 @@ namespace { // Helper function to emit the contents of a function via a MCStreamer object. 
void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, - BinaryContext &BC, bool EmitColdPart, bool HasExtraStorage) { + BinaryContext &BC, bool EmitColdPart) { // Define a helper to decode and emit CFI instructions at a given point in a // BB auto emitCFIInstr = [&Streamer](MCCFIInstruction &CFIInstr) { @@ -898,7 +935,7 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, } // Emit CFI start - if (Function.hasCFI() && HasExtraStorage) { + if (Function.hasCFI()) { Streamer.EmitCFIStartProc(/*IsSimple=*/false); if (Function.getPersonalityFunction() != nullptr) { Streamer.EmitCFIPersonality(Function.getPersonalityFunction(), @@ -953,13 +990,12 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, Streamer.EmitInstruction(Instr, *BC.STI); continue; } - if (HasExtraStorage) - emitCFIInstr(*Function.getCFIFor(Instr)); + emitCFIInstr(*Function.getCFIFor(Instr)); } } // Emit CFI end - if (Function.hasCFI() && HasExtraStorage) + if (Function.hasCFI()) Streamer.EmitCFIEndProc(); if (!EmitColdPart && Function.getFunctionEndLabel()) @@ -991,7 +1027,7 @@ void RewriteInstance::emitFunctions() { // This is an object file, which we keep for debugging purposes. // Once we decide it's useless, we should create it in memory. std::unique_ptr TempOut = - llvm::make_unique(opts::OutputFilename + ".o", + llvm::make_unique(opts::OutputFilename + ".bolt.o", EC, sys::fs::F_None); check_error(EC, "cannot create output object file"); @@ -1016,7 +1052,6 @@ void RewriteInstance::emitFunctions() { Streamer->InitSections(false); - bool NoSpaceWarning = false; // Output functions one by one. 
for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; @@ -1027,30 +1062,22 @@ void RewriteInstance::emitFunctions() { if (!opts::shouldProcess(Function)) continue; - DEBUG(dbgs() << "BOLT: generating code for function \"" << Function.getName() - << "\" : " << Function.getFunctionNumber() << '\n'); + DEBUG(dbgs() << "BOLT: generating code for function \"" + << Function.getName() << "\" : " + << Function.getFunctionNumber() << '\n'); - if (Function.hasCFI()) { - if (ExtraStorage.Size == 0) - NoSpaceWarning = true; - } - - emitFunction(*Streamer, Function, *BC.get(), - /*EmitColdPart=*/false, - /*HasExtraStorage=*/ExtraStorage.Size != 0); + emitFunction(*Streamer, Function, *BC.get(), /*EmitColdPart=*/false); if (Function.isSplit()) - emitFunction(*Streamer, Function, *BC.get(), - /*EmitColdPart=*/true, - /*HasExtraStorage=*/ExtraStorage.Size != 0); - } - if (NoSpaceWarning) { - errs() << "BOLT-WARNING: missing __flo_storage in this binary. No " - << "extra space left to allocate the new .eh_frame\n"; + emitFunction(*Streamer, Function, *BC.get(), /*EmitColdPart=*/true); } Streamer->Finish(); + ////////////////////////////////////////////////////////////////////////////// + // Assign addresses to new functions/sections. + ////////////////////////////////////////////////////////////////////////////// + // Get output object as ObjectFile. std::unique_ptr ObjectMemBuffer = MemoryBuffer::getMemBuffer(BOS->str(), "in-memory object file", false); @@ -1061,9 +1088,7 @@ void RewriteInstance::emitFunctions() { auto EFMM = new ExecutableFileMemoryManager(); SectionMM.reset(EFMM); - // FIXME: use notifyObjectLoaded() to remap sections. - DEBUG(dbgs() << "Creating OLT\n"); // Run ObjectLinkingLayer() with custom memory manager and symbol resolver. 
orc::ObjectLinkingLayer<> OLT; @@ -1081,12 +1106,12 @@ void RewriteInstance::emitFunctions() { return nullptr; } ); - // FIXME: auto ObjectsHandle = OLT.addObjectSet( singletonSet(std::move(ObjOrErr.get())), SectionMM.get(), std::move(Resolver)); - //OLT.takeOwnershipOfBuffers(ObjectsHandle, ); + + // FIXME: use notifyObjectLoaded() to remap sections. // Map every function/section current address in memory to that in // the output binary. @@ -1117,21 +1142,20 @@ void RewriteInstance::emitFunctions() { SMII = EFMM->SectionMapInfo.find( Function.getCodeSectionName().str().append(".cold")); if (SMII != EFMM->SectionMapInfo.end()) { - // Align at a 16-byte boundary - ExtraStorage.BumpPtr = RoundUpToAlignment(ExtraStorage.BumpPtr, 16); + // Cold fragments are aligned at 16 bytes. + NextAvailableAddress = RoundUpToAlignment(NextAvailableAddress, 16); DEBUG(dbgs() << "BOLT: mapping 0x" << Twine::utohexstr(SMII->second.AllocAddress) - << " to 0x" << Twine::utohexstr(ExtraStorage.BumpPtr) + << " to 0x" << Twine::utohexstr(NextAvailableAddress) << " with size " << Twine::utohexstr(SMII->second.Size) << '\n'); OLT.mapSectionAddress(ObjectsHandle, reinterpret_cast(SMII->second.AllocAddress), - ExtraStorage.BumpPtr); + NextAvailableAddress); Function.cold().setImageAddress(SMII->second.AllocAddress); Function.cold().setImageSize(SMII->second.Size); - Function.cold().setFileOffset(ExtraStorage.BumpPtr - ExtraStorage.Addr + - ExtraStorage.FileOffset); - ExtraStorage.BumpPtr += SMII->second.Size; + Function.cold().setFileOffset(getFileOffsetFor(NextAvailableAddress)); + NextAvailableAddress += SMII->second.Size; } else { errs() << "BOLT: cannot remap function " << Function.getName() << "\n"; FailedAddresses.emplace_back(Function.getAddress()); @@ -1146,35 +1170,26 @@ void RewriteInstance::emitFunctions() { auto SMII = EFMM->SectionMapInfo.find(SectionName); if (SMII != EFMM->SectionMapInfo.end()) { SectionInfo &SI = SMII->second; - ExtraStorage.BumpPtr = 
RoundUpToAlignment(ExtraStorage.BumpPtr, + NextAvailableAddress = RoundUpToAlignment(NextAvailableAddress, SI.Alignment); DEBUG(dbgs() << "BOLT: mapping 0x" << Twine::utohexstr(SI.AllocAddress) - << " to 0x" << Twine::utohexstr(ExtraStorage.BumpPtr) + << " to 0x" << Twine::utohexstr(NextAvailableAddress) << '\n'); OLT.mapSectionAddress(ObjectsHandle, reinterpret_cast(SI.AllocAddress), - ExtraStorage.BumpPtr); + NextAvailableAddress); - SI.FileAddress = ExtraStorage.BumpPtr; - SI.FileOffset = ExtraStorage.BumpPtr - ExtraStorage.Addr + - ExtraStorage.FileOffset; + SI.FileAddress = NextAvailableAddress; + SI.FileOffset = getFileOffsetFor(NextAvailableAddress); - ExtraStorage.BumpPtr += SI.Size; + NextAvailableAddress += SI.Size; } else { errs() << "BOLT: cannot remap " << SectionName << '\n'; } } - if (ExtraStorage.BumpPtr - ExtraStorage.Addr > ExtraStorage.Size) { - errs() << format( - "BOLT fatal error: __flo_storage in this binary has not enough free " - "space (required %d bytes, available %d bytes).\n", - ExtraStorage.BumpPtr - ExtraStorage.Addr, ExtraStorage.Size); - exit(1); - } - OLT.emitAndFinalize(ObjectsHandle); if (opts::KeepTmp) @@ -1199,38 +1214,82 @@ bool RewriteInstance::splitLargeFunctions() { return Changed; } -namespace { - -// Helper to locate EH_FRAME_HDR segment, specialized for 64-bit LE ELF -bool patchEhFrameHdrSegment(const ELFFile *Obj, raw_pwrite_stream *OS, - uint64_t Offset, uint64_t Addr, uint64_t Size) { - for (const auto &Phdr : Obj->program_headers()) { - if (Phdr.p_type != ELF::PT_GNU_EH_FRAME) - continue; - uint64_t OffsetLoc = (uintptr_t)&Phdr.p_offset - (uintptr_t)Obj->base(); - uint64_t VAddrLoc = (uintptr_t)&Phdr.p_vaddr - (uintptr_t)Obj->base(); - uint64_t PAddrLoc = (uintptr_t)&Phdr.p_paddr - (uintptr_t)Obj->base(); - uint64_t FileSzLoc = (uintptr_t)&Phdr.p_filesz - (uintptr_t)Obj->base(); - uint64_t MemSzLoc = (uintptr_t)&Phdr.p_memsz - (uintptr_t)Obj->base(); - char Buffer[8]; - // Update Offset - 
support::ulittle64_t::ref(Buffer + 0) = Offset; - OS->pwrite(Buffer, 8, OffsetLoc); - support::ulittle64_t::ref(Buffer + 0) = Addr; - OS->pwrite(Buffer, 8, VAddrLoc); - OS->pwrite(Buffer, 8, PAddrLoc); - support::ulittle64_t::ref(Buffer + 0) = Size; - OS->pwrite(Buffer, 8, FileSzLoc); - OS->pwrite(Buffer, 8, MemSzLoc); - return true; +void RewriteInstance::patchELF() { + auto ELF64LEFile = dyn_cast(File); + if (!ELF64LEFile) { + errs() << "BOLT-ERROR: only 64-bit LE ELF binaries are supported\n"; + exit(1); + } + auto Obj = ELF64LEFile->getELFFile(); + auto &OS = Out->os(); + OS.seek(PHDRTableOffset); + + errs() << "BOLT-INFO: writing new program headers at offset 0x" + << Twine::utohexstr(PHDRTableOffset) << '\n'; + + auto Ehdr = Obj->getHeader(); + unsigned Phnum = Ehdr->e_phnum; + + // FIXME: this will depend on the number of segements we plan to write. + Phnum += 1; + + // Copy existing program headers with modifications. + for (auto &Phdr : Obj->program_headers()) { + if (Phdr.p_type == ELF::PT_PHDR) { + auto NewPhdr = Phdr; + NewPhdr.p_offset = PHDRTableOffset; + NewPhdr.p_vaddr = PHDRTableAddress; + NewPhdr.p_paddr = PHDRTableAddress; + NewPhdr.p_filesz = sizeof(NewPhdr) * Phnum; + NewPhdr.p_memsz = sizeof(NewPhdr) * Phnum; + OS.write(reinterpret_cast(&NewPhdr), sizeof(NewPhdr)); + } else if (Phdr.p_type == ELF::PT_GNU_EH_FRAME) { + auto NewPhdr = Phdr; + NewPhdr.p_offset = EHFrameHdrSecInfo.FileOffset; + NewPhdr.p_vaddr = EHFrameHdrSecInfo.FileAddress; + NewPhdr.p_paddr = EHFrameHdrSecInfo.FileAddress; + NewPhdr.p_filesz = EHFrameHdrSecInfo.Size; + NewPhdr.p_memsz = EHFrameHdrSecInfo.Size; + OS.write(reinterpret_cast(&NewPhdr), sizeof(NewPhdr)); + } else { + OS.write(reinterpret_cast(&Phdr), sizeof(Phdr)); + } } - return false; -} -} // anonymous namespace + NewTextSegmentSize = NextAvailableAddress - NewTextSegmentAddress; + + // Alignment should be the size of a page. 
+ unsigned Align = 0x200000; + + // Add new pheaders + ELFFile::Elf_Phdr NewTextPhdr; + NewTextPhdr.p_type = ELF::PT_LOAD; + NewTextPhdr.p_offset = PHDRTableOffset; + NewTextPhdr.p_vaddr = PHDRTableAddress; + NewTextPhdr.p_paddr = PHDRTableAddress; + NewTextPhdr.p_filesz = NewTextSegmentSize; + NewTextPhdr.p_memsz = NewTextSegmentSize; + NewTextPhdr.p_flags = ELF::PF_R | ELF::PF_X; + NewTextPhdr.p_align = Align; + + OS.write(reinterpret_cast(&NewTextPhdr), sizeof(NewTextPhdr)); + + // Fix ELF header. + uint64_t PhoffLoc = (uintptr_t)&Ehdr->e_phoff - (uintptr_t)Obj->base(); + uint64_t PhnumLoc = (uintptr_t)&Ehdr->e_phnum - (uintptr_t)Obj->base(); + char Buffer[8]; + support::ulittle64_t::ref(Buffer + 0) = PHDRTableOffset; + OS.pwrite(Buffer, 8, PhoffLoc); + support::ulittle16_t::ref(Buffer + 0) = Phnum; + OS.pwrite(Buffer, 2, PhnumLoc); + + // FIXME: Update _end in .dynamic + +} void RewriteInstance::rewriteFile() { - // FIXME: is there a less painful way to obtain assembler/writer? + // We obtain an asm-specific writer so that we can emit nops in an + // architecture-specific way at the end of the function. auto MCE = BC->TheTarget->createMCCodeEmitter(*BC->MII, *BC->MRI, *BC->Ctx); auto MAB = BC->TheTarget->createMCAsmBackend(*BC->MRI, BC->TripleName, ""); std::unique_ptr Streamer( @@ -1246,11 +1305,10 @@ void RewriteInstance::rewriteFile() { ->getAssembler() .getWriter(); - // Print _flo_storage area stats for debug - DEBUG(dbgs() << format("INFO: __flo_storage address = 0x%x file offset = " - "0x%x total size = 0x%x\n", - ExtraStorage.Addr, ExtraStorage.FileOffset, - ExtraStorage.Size)); + // Make sure output stream has enough space. + auto Offset = Out->os().seek(getFileOffsetFor(NextAvailableAddress)); + assert(Offset == getFileOffsetFor(NextAvailableAddress) && + "error resizing output file"); // Overwrite function in the output file. 
uint64_t CountOverwrittenFunctions = 0; @@ -1304,6 +1362,8 @@ void RewriteInstance::rewriteFile() { Function.cold().getImageSize(), Function.cold().getFileOffset()); + // FIXME: write nops after cold part too. + ++CountOverwrittenFunctions; if (opts::MaxFunctions && CountOverwrittenFunctions == opts::MaxFunctions) { outs() << "BOLT: maximum number of functions reached\n"; @@ -1315,59 +1375,50 @@ void RewriteInstance::rewriteFile() { << " out of " << BinaryFunctions.size() << " functions were overwritten.\n"; + // Write all non-code sections. + for(auto &SMII : SectionMM->SectionMapInfo) { + SectionInfo &SI = SMII.second; + if (SI.IsCode) + continue; + outs() << "BOLT: writing new section " << SMII.first << '\n'; + Out->os().pwrite(reinterpret_cast(SI.AllocAddress), + SI.Size, + SI.FileOffset); + } + // If .eh_frame is present it requires special handling. auto SMII = SectionMM->SectionMapInfo.find(".eh_frame"); if (SMII != SectionMM->SectionMapInfo.end()) { - auto &EHFrameSI = SMII->second; + auto &EHFrameSecInfo = SMII->second; outs() << "BOLT: writing a new .eh_frame_hdr\n"; if (FrameHdrAlign > 1) { - ExtraStorage.BumpPtr = - RoundUpToAlignment(ExtraStorage.BumpPtr, FrameHdrAlign); + NextAvailableAddress = + RoundUpToAlignment(NextAvailableAddress, FrameHdrAlign); } + + EHFrameHdrSecInfo.FileAddress = NextAvailableAddress; + EHFrameHdrSecInfo.FileOffset = getFileOffsetFor(NextAvailableAddress); + std::sort(FailedAddresses.begin(), FailedAddresses.end()); CFIRdWrt->rewriteHeaderFor( - StringRef(reinterpret_cast(EHFrameSI.AllocAddress), - EHFrameSI.Size), - EHFrameSI.FileAddress, - ExtraStorage.BumpPtr, + StringRef(reinterpret_cast(EHFrameSecInfo.AllocAddress), + EHFrameSecInfo.Size), + EHFrameSecInfo.FileAddress, + EHFrameHdrSecInfo.FileAddress, FailedAddresses); - if (ExtraStorage.BumpPtr - ExtraStorage.Addr - ExtraStorage.Size < - FrameHdrCopy.size()) { - errs() << "BOLT fatal error: __flo_storage in this binary has not enough " - "free space\n"; - exit(1); - } 
- uint64_t HdrFileOffset = - ExtraStorage.BumpPtr - ExtraStorage.Addr + ExtraStorage.FileOffset; - Out->os().pwrite(FrameHdrCopy.data(), FrameHdrCopy.size(), HdrFileOffset); - outs() << "BOLT: patching EH_FRAME program segment to reflect new " - ".eh_frame_hdr\n"; - if (auto ELF64LEFile = dyn_cast(File)) { - auto Obj = ELF64LEFile->getELFFile(); - if (!patchEhFrameHdrSegment(Obj, &Out->os(), HdrFileOffset, - ExtraStorage.BumpPtr, FrameHdrCopy.size())) { - outs() << "FAILED to patch program segment!\n"; - } - } else { - outs() << "BOLT-ERROR: program segment NOT patched -- I don't know how to " - "handle this object file!\n"; - } - } + EHFrameHdrSecInfo.Size = FrameHdrCopy.size(); - // Write all non-code sections. - for(auto &SMII : SectionMM->SectionMapInfo) { - SectionInfo &SI = SMII.second; - if (SI.IsCode) - continue; - outs() << "BOLT: writing new section " << SMII.first << '\n'; - Out->os().pwrite(reinterpret_cast(SI.AllocAddress), - SI.Size, - SI.FileOffset); + assert(Out->os().tell() == EHFrameHdrSecInfo.FileOffset && + "offset mismatch"); + Out->os().write(FrameHdrCopy.data(), EHFrameHdrSecInfo.Size); - // Update ELF section header. + NextAvailableAddress += EHFrameHdrSecInfo.Size; } + // Update ELF book-keeping info. + patchELF(); + if (TotalScore != 0) { double Coverage = OverwrittenScore / (double)TotalScore * 100.0; outs() << format("BOLT: Rewritten functions cover %.2lf", Coverage) diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 1a7a3f94b055..f29aee4f6c8a 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -39,14 +39,15 @@ struct SectionInfo { uint64_t AllocAddress; /// Current location of the section in memory. uint64_t Size; /// Section size. unsigned Alignment; /// Alignment of the section. - uint64_t FileAddress{0}; /// Address in the output file. + uint64_t FileAddress{0}; /// Address for the output file (final address). uint64_t FileOffset{0}; /// Offset in the output file. 
- bool IsCode{false}; /// Does this section contain code. + bool IsCode{false}; /// Does this section contain code? + bool IsReadOnly{false}; /// Is the section read-only? SectionInfo(uint64_t Address = 0, uint64_t Size = 0, unsigned Alignment = 0, - bool IsCode = false) + bool IsCode = false, bool IsReadOnly = false) : AllocAddress(Address), Size(Size), Alignment(Alignment), - IsCode(IsCode) {} + IsCode(IsCode), IsReadOnly(IsReadOnly) {} }; /// Class responsible for allocating and managing code and data sections. @@ -137,6 +138,22 @@ class RewriteInstance { /// disassembleFunctions(), also preserve the original version. void rewriteFile(); +private: + + /// Detect storage available in the binary for allocating new sections. + void discoverStorage(); + + /// Patch ELF book-keeping info. + void patchELF(); + + /// Return file offset corresponding to a given virtual address. + uint64_t getFileOffsetFor(uint64_t Address) { + assert(Address >= NewTextSegmentAddress && + "address in not in the new text segment"); + return Address - NewTextSegmentAddress + NewTextSegmentOffset; + } + + private: /// An instance of the input binary we are processing, externally owned. llvm::object::ELFObjectFileBase *File; @@ -151,28 +168,18 @@ class RewriteInstance { /// optimized code for selected functions. std::unique_ptr Out; - /// Represent free space we have in the binary to write extra bytes. This free - /// space is pre-delimited in the binary via a linker script that allocates - /// space and inserts a new symbol __flo_storage in the binary. We also use - /// the symbol __flo_storage_end to delimit the end of the contiguous space in - /// the binary where it is safe for us to write new content. We use this extra - /// space for the following activities: - /// - /// * Writing new .eh_frame entries for functions we changed the layout - /// * Writing a new .eh_frame_hdr to allow us to expand the number of - /// .eh_frame entries (FDEs). 
Note we also keep the old .eh_frame in the - /// binary instact for functions we don't touch. - /// * Writing cold basic blocks - /// - struct BlobTy { - uint64_t Addr; - uint64_t FileOffset; - uint64_t Size; - uint64_t AddrEnd; - /// BumpPtr is a trivial way to keep track of space utilization in this blob - uint64_t BumpPtr; - }; - BlobTy ExtraStorage{0, 0, 0, 0, 0}; + uint64_t PHDRTableAddress{0}; + uint64_t PHDRTableOffset{0}; + + /// New code segment info. + uint64_t NewTextSegmentAddress{0}; + uint64_t NewTextSegmentOffset{0}; + uint64_t NewTextSegmentSize{0}; + + /// Track next available address in the new text segment. + uint64_t NextAvailableAddress{0}; + + SectionInfo EHFrameHdrSecInfo; /// Store all non-zero symbols in this map for a quick address lookup. std::map FileSymRefs; From 764073a63643049f52e4e65c142a4b7df54d702c Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 12 Feb 2016 19:01:53 -0800 Subject: [PATCH 069/904] Add an option to use PT_GNU_STACK for new segment. Summary: Added an option to reuse existing program header entry. This option allows for bfd tools like strip and objcopy to operate on the optimized binary without destroying it. Also, all new sections are now properly marked in ELF. 
(cherry picked from commit de1316e83c6923b0a9338f4f9afedeaa94f9c13e) --- bolt/BinaryFunction.cpp | 6 +- bolt/Exceptions.cpp | 8 +- bolt/RewriteInstance.cpp | 318 +++++++++++++++++++++++++++++---------- bolt/RewriteInstance.h | 25 ++- 4 files changed, 264 insertions(+), 93 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 7607d7e9ed34..b1aa3101b529 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -879,7 +879,7 @@ bool BinaryFunction::fixCFIState() { return false; } - for(auto CFI : NewCFIs) { + for (auto CFI : NewCFIs) { InsertIt = addCFIPseudo(InBB, InsertIt, CFI); ++InsertIt; } @@ -977,7 +977,7 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool Split) { BasicBlockOrderType ReverseOrder; auto FirstBB = BasicBlocksLayout.front(); ReverseOrder.push_back(FirstBB); - for(auto RBBI = BasicBlocksLayout.rbegin(); *RBBI != FirstBB; ++RBBI) + for (auto RBBI = BasicBlocksLayout.rbegin(); *RBBI != FirstBB; ++RBBI) ReverseOrder.push_back(*RBBI); BasicBlocksLayout.swap(ReverseOrder); @@ -1227,7 +1227,7 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool Split) { if (opts::PrintClusters) { errs() << "New cluster order: "; auto Sep = ""; - for(auto O : Order) { + for (auto O : Order) { errs() << Sep << O; Sep = ", "; } diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index c9355030218c..7446227e21c2 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -310,7 +310,7 @@ void BinaryFunction::updateEHRanges() { // If previous call can throw, this is its exception handler. 
EHInfo PreviousEH = {nullptr, 0}; - for(auto &BB : BasicBlocksLayout) { + for (auto &BB : BasicBlocksLayout) { for (auto II = BB->begin(); II != BB->end(); ++II) { auto Instr = *II; @@ -409,7 +409,7 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer) { // // sizeof(dwarf::DW_EH_PE_udata4) * 3 + sizeof(uleb128(action)) uint64_t CallSiteTableLength = CallSites.size() * 4 * 3; - for(const auto &CallSite : CallSites) { + for (const auto &CallSite : CallSites) { CallSiteTableLength+= getULEB128Size(CallSite.Action); } @@ -481,10 +481,10 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer) { // There's no need to change the original format we saw on input // unless we are doing a function splitting in which case we can // perhaps split and optimize the tables. - for(auto const &Byte : LSDAActionAndTypeTables) { + for (auto const &Byte : LSDAActionAndTypeTables) { Streamer->EmitIntValue(Byte, 1); } - for(auto const &Byte : LSDATypeIndexTable) { + for (auto const &Byte : LSDATypeIndexTable) { Streamer->EmitIntValue(Byte, 1); } } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index b58ebfb81ccf..5a4ce1a53d6d 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -94,7 +94,8 @@ SplitFunctions("split-functions", cl::desc("split functions into hot and cold distinct regions"), cl::Optional); -static cl::opt ReorderBlocks( +static cl::opt +ReorderBlocks( "reorder-blocks", cl::desc("change layout of basic blocks in a function"), cl::init(BinaryFunction::LT_NONE), @@ -118,13 +119,18 @@ static cl::opt ReorderBlocks( clEnumValEnd)); -static cl::opt AlignBlocks("align-blocks", - cl::desc("try to align BBs inserting nops"), - cl::Optional); +static cl::opt +AlignBlocks("align-blocks", + cl::desc("try to align BBs inserting nops"), + cl::Optional); + +static cl::opt +UseGnuStack("use-gnu-stack", + cl::desc("use GNU_STACK program header for new segment")); static cl::opt DumpEHFrame("dump-eh-frame", cl::desc("dump parsed .eh_frame (debugging)"), - 
cl::Hidden); + cl::Hidden); static cl::opt PrintAll("print-all", cl::desc("print functions after each stage"), @@ -384,8 +390,6 @@ void RewriteInstance::discoverStorage() { auto Obj = ELF64LEFile->getELFFile(); - // Alignment should be the size of a page. - unsigned Align = 0x200000; // Discover important addresses in the binary. @@ -393,57 +397,64 @@ void RewriteInstance::discoverStorage() { uint64_t FirstAllocAddress = std::numeric_limits::max(); NextAvailableAddress = 0; + uint64_t NextAvailableOffset = 0; for (const auto &Phdr : Obj->program_headers()) { if (Phdr.p_type == ELF::PT_LOAD) { FirstAllocAddress = std::min(FirstAllocAddress, static_cast(Phdr.p_vaddr)); NextAvailableAddress = std::max(NextAvailableAddress, Phdr.p_vaddr + Phdr.p_memsz); + NextAvailableOffset = std::max(NextAvailableOffset, + Phdr.p_offset + Phdr.p_filesz); } } - assert(NextAvailableAddress && "no PT_LOAD pheader seen"); + assert(NextAvailableAddress && NextAvailableOffset && + "no PT_LOAD pheader seen"); errs() << "BOLT-INFO: first alloc address is 0x" << Twine::utohexstr(FirstAllocAddress) << '\n'; - NextAvailableAddress = RoundUpToAlignment(NextAvailableAddress, Align); - - // Earliest available offset is the size of the input file. - uint64_t NextAvailableOffset = Obj->size(); - NextAvailableOffset = RoundUpToAlignment(NextAvailableOffset, Align); - - // This is where the black magic happens. Creating PHDR table in a segment - // other than that containing ELF header is tricky. Some loaders and/or - // parts of loaders will apply e_phoff from ELF header assuming both are in - // the same segment, while others will do the proper calculation. - // We create the new PHDR table in such a way that both of the methods - // of loading and locating the table work. There's a slight file size - // overhead because of that. 
- - if (NextAvailableOffset <= NextAvailableAddress - FirstAllocAddress) { - NextAvailableOffset = NextAvailableAddress - FirstAllocAddress; - } else { - NextAvailableAddress = NextAvailableOffset + FirstAllocAddress; - } + FirstNonAllocatableOffset = NextAvailableOffset; + + NextAvailableAddress = RoundUpToAlignment(NextAvailableAddress, PageAlign); + NextAvailableOffset = RoundUpToAlignment(NextAvailableOffset, PageAlign); + + if (!opts::UseGnuStack) { + // This is where the black magic happens. Creating PHDR table in a segment + // other than that containing ELF header is tricky. Some loaders and/or + // parts of loaders will apply e_phoff from ELF header assuming both are in + // the same segment, while others will do the proper calculation. + // We create the new PHDR table in such a way that both of the methods + // of loading and locating the table work. There's a slight file size + // overhead because of that. + if (NextAvailableOffset <= NextAvailableAddress - FirstAllocAddress) { + NextAvailableOffset = NextAvailableAddress - FirstAllocAddress; + } else { + NextAvailableAddress = NextAvailableOffset + FirstAllocAddress; + } + assert(NextAvailableOffset == NextAvailableAddress - FirstAllocAddress && + "PHDR table address calculation error"); - assert(NextAvailableOffset == NextAvailableAddress - FirstAllocAddress && - "PHDR table address calculation error"); + errs() << "BOLT-INFO: creating new program header table at address 0x" + << Twine::utohexstr(NextAvailableAddress) << ", offset 0x" + << Twine::utohexstr(NextAvailableOffset) << '\n'; - errs() << "BOLT-INFO: creating new program header table at address 0x" - << Twine::utohexstr(NextAvailableAddress) << '\n'; + PHDRTableAddress = NextAvailableAddress; + PHDRTableOffset = NextAvailableOffset; - PHDRTableAddress = NextAvailableAddress; - PHDRTableOffset = NextAvailableOffset; + // Reserve space for 3 extra pheaders. 
+ unsigned Phnum = Obj->getHeader()->e_phnum; + Phnum += 3; - // Reserve the space for 3 extra pheaders. - unsigned Phnum = Obj->getHeader()->e_phnum; - Phnum += 3; + NextAvailableAddress += Phnum * sizeof(ELFFile::Elf_Phdr); + NextAvailableOffset += Phnum * sizeof(ELFFile::Elf_Phdr); + } - NextAvailableAddress += Phnum * sizeof(ELFFile::Elf_Phdr); - NextAvailableOffset += Phnum * sizeof(ELFFile::Elf_Phdr); + // Align at cache line. + NextAvailableAddress = RoundUpToAlignment(NextAvailableAddress, 64); + NextAvailableOffset = RoundUpToAlignment(NextAvailableOffset, 64); - // TODO: insert alignment here if needed. NewTextSegmentAddress = NextAvailableAddress; NewTextSegmentOffset = NextAvailableOffset; } @@ -758,7 +769,7 @@ void RewriteInstance::disassembleFunctions() { } ); auto SFI = ProfiledFunctions.begin(); - for(int i = 0; i < 100 && SFI != ProfiledFunctions.end(); ++SFI, ++i) { + for (int i = 0; i < 100 && SFI != ProfiledFunctions.end(); ++SFI, ++i) { errs() << " " << (*SFI)->getName() << " : " << (*SFI)->getExecutionCount() << '\n'; } @@ -1115,6 +1126,7 @@ void RewriteInstance::emitFunctions() { // Map every function/section current address in memory to that in // the output binary. + uint64_t NewTextSectionStartAddress = NextAvailableAddress; for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; if (!Function.isSimple()) @@ -1161,12 +1173,24 @@ void RewriteInstance::emitFunctions() { FailedAddresses.emplace_back(Function.getAddress()); } } + // Add the new text section aggregating all existing code sections. + auto NewTextSectionSize = NextAvailableAddress - NewTextSectionStartAddress; + if (NewTextSectionSize) { + SectionMM->SectionMapInfo[".bolt.text"] = + SectionInfo(0, + NewTextSectionSize, + 16, + true /*IsCode*/, + true /*IsReadOnly*/, + NewTextSectionStartAddress, + getFileOffsetFor(NewTextSectionStartAddress)); + } // Map special sections to their addresses in the output image. 
// // TODO: perhaps we should process all the allocated sections here? std::vector Sections = { ".eh_frame", ".gcc_except_table" }; - for(auto &SectionName : Sections) { + for (auto &SectionName : Sections) { auto SMII = EFMM->SectionMapInfo.find(SectionName); if (SMII != EFMM->SectionMapInfo.end()) { SectionInfo &SI = SMII->second; @@ -1222,69 +1246,201 @@ void RewriteInstance::patchELF() { } auto Obj = ELF64LEFile->getELFFile(); auto &OS = Out->os(); - OS.seek(PHDRTableOffset); - errs() << "BOLT-INFO: writing new program headers at offset 0x" - << Twine::utohexstr(PHDRTableOffset) << '\n'; + // Write/re-write program headers. + unsigned Phnum = Obj->getHeader()->e_phnum; + if (PHDRTableOffset) { + // Writing new pheader table. + Phnum += 1; // only adding one new segment + // Segment size includes the size of the PHDR area. + NewTextSegmentSize = NextAvailableAddress - PHDRTableAddress; + } else { + assert(!PHDRTableAddress && "unexpected address for program header table"); - auto Ehdr = Obj->getHeader(); - unsigned Phnum = Ehdr->e_phnum; + // Update existing table. + PHDRTableOffset = Obj->getHeader()->e_phoff; + NewTextSegmentSize = NextAvailableAddress - NewTextSegmentAddress; + } + OS.seek(PHDRTableOffset); - // FIXME: this will depend on the number of segements we plan to write. - Phnum += 1; + bool ModdedGnuStack = false; + bool AddedSegment = false; // Copy existing program headers with modifications. 
for (auto &Phdr : Obj->program_headers()) { - if (Phdr.p_type == ELF::PT_PHDR) { - auto NewPhdr = Phdr; + auto NewPhdr = Phdr; + if (PHDRTableAddress && Phdr.p_type == ELF::PT_PHDR) { NewPhdr.p_offset = PHDRTableOffset; NewPhdr.p_vaddr = PHDRTableAddress; NewPhdr.p_paddr = PHDRTableAddress; NewPhdr.p_filesz = sizeof(NewPhdr) * Phnum; NewPhdr.p_memsz = sizeof(NewPhdr) * Phnum; - OS.write(reinterpret_cast(&NewPhdr), sizeof(NewPhdr)); } else if (Phdr.p_type == ELF::PT_GNU_EH_FRAME) { - auto NewPhdr = Phdr; + auto SMII = SectionMM->SectionMapInfo.find(".eh_frame_hdr"); + assert(SMII != SectionMM->SectionMapInfo.end() && + ".eh_frame_hdr could not be found for PT_GNU_EH_FRAME"); + auto &EHFrameHdrSecInfo = SMII->second; NewPhdr.p_offset = EHFrameHdrSecInfo.FileOffset; NewPhdr.p_vaddr = EHFrameHdrSecInfo.FileAddress; NewPhdr.p_paddr = EHFrameHdrSecInfo.FileAddress; NewPhdr.p_filesz = EHFrameHdrSecInfo.Size; NewPhdr.p_memsz = EHFrameHdrSecInfo.Size; - OS.write(reinterpret_cast(&NewPhdr), sizeof(NewPhdr)); - } else { - OS.write(reinterpret_cast(&Phdr), sizeof(Phdr)); + } else if (opts::UseGnuStack && Phdr.p_type == ELF::PT_GNU_STACK) { + NewPhdr.p_type = ELF::PT_LOAD; + NewPhdr.p_offset = NewTextSegmentOffset; + NewPhdr.p_vaddr = NewTextSegmentAddress; + NewPhdr.p_paddr = NewTextSegmentAddress; + NewPhdr.p_filesz = NewTextSegmentSize; + NewPhdr.p_memsz = NewTextSegmentSize; + NewPhdr.p_flags = ELF::PF_X | ELF::PF_R; + NewPhdr.p_align = PageAlign; + ModdedGnuStack = true; + } else if (!opts::UseGnuStack && Phdr.p_type == ELF::PT_DYNAMIC) { + // Insert new pheader + ELFFile::Elf_Phdr NewTextPhdr; + NewTextPhdr.p_type = ELF::PT_LOAD; + NewTextPhdr.p_offset = PHDRTableOffset; + NewTextPhdr.p_vaddr = PHDRTableAddress; + NewTextPhdr.p_paddr = PHDRTableAddress; + NewTextPhdr.p_filesz = NewTextSegmentSize; + NewTextPhdr.p_memsz = NewTextSegmentSize; + NewTextPhdr.p_flags = ELF::PF_X | ELF::PF_R; + NewTextPhdr.p_align = PageAlign; + OS.write(reinterpret_cast(&NewTextPhdr), + 
sizeof(NewTextPhdr)); + AddedSegment = true; } + OS.write(reinterpret_cast(&NewPhdr), sizeof(NewPhdr)); } - NewTextSegmentSize = NextAvailableAddress - NewTextSegmentAddress; + assert((!opts::UseGnuStack || ModdedGnuStack) && + "could not find GNU_STACK program header to modify"); - // Alignment should be the size of a page. - unsigned Align = 0x200000; + assert((opts::UseGnuStack || AddedSegment) && + "could not add program header for the new segment"); - // Add new pheaders - ELFFile::Elf_Phdr NewTextPhdr; - NewTextPhdr.p_type = ELF::PT_LOAD; - NewTextPhdr.p_offset = PHDRTableOffset; - NewTextPhdr.p_vaddr = PHDRTableAddress; - NewTextPhdr.p_paddr = PHDRTableAddress; - NewTextPhdr.p_filesz = NewTextSegmentSize; - NewTextPhdr.p_memsz = NewTextSegmentSize; - NewTextPhdr.p_flags = ELF::PF_R | ELF::PF_X; - NewTextPhdr.p_align = Align; + // Copy original non-allocatable contents and update section offsets. + uint64_t NextAvailableOffset = getFileOffsetFor(NextAvailableAddress); + assert(NextAvailableOffset >= FirstNonAllocatableOffset && + "next available offset calculation failure"); - OS.write(reinterpret_cast(&NewTextPhdr), sizeof(NewTextPhdr)); + // Re-write using this offset delta. + uint64_t OffsetDelta = NextAvailableOffset - FirstNonAllocatableOffset; - // Fix ELF header. - uint64_t PhoffLoc = (uintptr_t)&Ehdr->e_phoff - (uintptr_t)Obj->base(); - uint64_t PhnumLoc = (uintptr_t)&Ehdr->e_phnum - (uintptr_t)Obj->base(); - char Buffer[8]; - support::ulittle64_t::ref(Buffer + 0) = PHDRTableOffset; - OS.pwrite(Buffer, 8, PhoffLoc); - support::ulittle16_t::ref(Buffer + 0) = Phnum; - OS.pwrite(Buffer, 2, PhnumLoc); + // Make sure offset delta is a multiple of alignment; + OffsetDelta = RoundUpToAlignment(OffsetDelta, MaxNonAllocAlign); + NextAvailableOffset = FirstNonAllocatableOffset + OffsetDelta; + + // FIXME: only write up to SHDR table. 
+ OS.seek(NextAvailableOffset); + OS << File->getData().drop_front(FirstNonAllocatableOffset); + + bool SeenNonAlloc = false; + uint64_t ExtraDelta = 0; // for dynamically adjusting delta + unsigned NumNewSections = 0; + + // Update section table. Note that the section table itself has shifted. + OS.seek(Obj->getHeader()->e_shoff + OffsetDelta); + for (auto &Section : Obj->sections()) { + // Always ignore this section. + if (Section.sh_type == ELF::SHT_NULL) { + OS.write(reinterpret_cast(&Section), sizeof(Section)); + continue; + } + + auto NewSection = Section; + uint64_t SectionLoc = (uintptr_t)&Section - (uintptr_t)Obj->base(); + + ErrorOr SectionName = Obj->getSectionName(&Section); + check_error(SectionName.getError(), "cannot get section name"); + + if (!(Section.sh_flags & ELF::SHF_ALLOC)) { + if (!SeenNonAlloc) { + + // This is where we place all our new sections. + + std::vector SectionsToRewrite; + for (auto &SMII : SectionMM->SectionMapInfo) { + SectionInfo &SI = SMII.second; + if (SI.IsCode && SMII.first != ".bolt.text") + continue; + errs() << "BOLT-INFO: re-writing section header for " + << SMII.first << '\n'; + auto NewSection = Section; + NewSection.sh_name = SI.ShName; + NewSection.sh_type = ELF::SHT_PROGBITS; + NewSection.sh_addr = SI.FileAddress; + NewSection.sh_offset = SI.FileOffset; + NewSection.sh_size = SI.Size; + NewSection.sh_entsize = 0; + NewSection.sh_flags = ELF::SHF_ALLOC | ELF::SHF_EXECINSTR; + NewSection.sh_link = 0; + NewSection.sh_info = 0; + NewSection.sh_addralign = SI.Alignment; + SectionsToRewrite.emplace_back(NewSection); + } + + // Do actual writing after sorting out. 
+ OS.seek(SectionLoc + OffsetDelta); + std::stable_sort(SectionsToRewrite.begin(), SectionsToRewrite.end(), + [] (decltype(Section) A, decltype(Section) B) { + return A.sh_offset < B.sh_offset; + }); + for (auto &SI : SectionsToRewrite) { + OS.write(reinterpret_cast(&SI), + sizeof(SI)); + } + + NumNewSections = SectionsToRewrite.size(); + ExtraDelta += sizeof(Section) * NumNewSections; + + SeenNonAlloc = true; + } + + assert(Section.sh_addralign <= MaxNonAllocAlign && + "unexpected alignment for non-allocatable section"); + assert(Section.sh_offset >= FirstNonAllocatableOffset && + "bad offset for non-allocatable section"); + + NewSection.sh_offset = Section.sh_offset + OffsetDelta; + + if (Section.sh_offset > Obj->getHeader()->e_shoff) { + // The section is going to be shifted. + NewSection.sh_offset = NewSection.sh_offset + ExtraDelta; + } + + if (Section.sh_link) + NewSection.sh_link = Section.sh_link + NumNewSections; + + } else if (*SectionName == ".bss") { + NewSection.sh_offset = NewTextSegmentOffset; + } + + auto SMII = SectionMM->SectionMapInfo.find(*SectionName); + if (SMII != SectionMM->SectionMapInfo.end()) { + auto &SecInfo = SMII->second; + SecInfo.ShName = Section.sh_name; + } + + OS.write(reinterpret_cast(&NewSection), sizeof(NewSection)); + } + + // Write all the sections past the section table again as they are shifted. + auto OffsetPastShdrTable = Obj->getHeader()->e_shoff + + Obj->getHeader()->e_shnum * sizeof(ELFFile::Elf_Shdr); + OS.seek(OffsetPastShdrTable + OffsetDelta + ExtraDelta); + OS << File->getData().drop_front(OffsetPastShdrTable); // FIXME: Update _end in .dynamic + // Fix ELF header. 
+ auto NewEhdr = *Obj->getHeader(); + NewEhdr.e_phoff = PHDRTableOffset; + NewEhdr.e_phnum = Phnum; + NewEhdr.e_shoff = NewEhdr.e_shoff + OffsetDelta; + NewEhdr.e_shnum = NewEhdr.e_shnum + NumNewSections; + NewEhdr.e_shstrndx = NewEhdr.e_shstrndx + NumNewSections; + OS.pwrite(reinterpret_cast(&NewEhdr), sizeof(NewEhdr), 0); } void RewriteInstance::rewriteFile() { @@ -1305,7 +1461,8 @@ void RewriteInstance::rewriteFile() { ->getAssembler() .getWriter(); - // Make sure output stream has enough space. + // Make sure output stream has enough reserved space, otherwise + // pwrite() will fail. auto Offset = Out->os().seek(getFileOffsetFor(NextAvailableAddress)); assert(Offset == getFileOffsetFor(NextAvailableAddress) && "error resizing output file"); @@ -1376,7 +1533,7 @@ void RewriteInstance::rewriteFile() { << " functions were overwritten.\n"; // Write all non-code sections. - for(auto &SMII : SectionMM->SectionMapInfo) { + for (auto &SMII : SectionMM->SectionMapInfo) { SectionInfo &SI = SMII.second; if (SI.IsCode) continue; @@ -1396,6 +1553,7 @@ void RewriteInstance::rewriteFile() { RoundUpToAlignment(NextAvailableAddress, FrameHdrAlign); } + SectionInfo EHFrameHdrSecInfo; EHFrameHdrSecInfo.FileAddress = NextAvailableAddress; EHFrameHdrSecInfo.FileOffset = getFileOffsetFor(NextAvailableAddress); @@ -1413,6 +1571,8 @@ void RewriteInstance::rewriteFile() { "offset mismatch"); Out->os().write(FrameHdrCopy.data(), EHFrameHdrSecInfo.Size); + SectionMM->SectionMapInfo[".eh_frame_hdr"] = EHFrameHdrSecInfo; + NextAvailableAddress += EHFrameHdrSecInfo.Size; } diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index f29aee4f6c8a..f0ba0b2921eb 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -39,15 +39,18 @@ struct SectionInfo { uint64_t AllocAddress; /// Current location of the section in memory. uint64_t Size; /// Section size. unsigned Alignment; /// Alignment of the section. 
- uint64_t FileAddress{0}; /// Address for the output file (final address). - uint64_t FileOffset{0}; /// Offset in the output file. bool IsCode{false}; /// Does this section contain code? bool IsReadOnly{false}; /// Is the section read-only? + uint64_t FileAddress{0}; /// Address for the output file (final address). + uint64_t FileOffset{0}; /// Offset in the output file. + uint64_t ShName{0}; /// Name offset in section header string table. SectionInfo(uint64_t Address = 0, uint64_t Size = 0, unsigned Alignment = 0, - bool IsCode = false, bool IsReadOnly = false) + bool IsCode = false, bool IsReadOnly = false, + uint64_t FileAddress = 0, uint64_t FileOffset = 0) : AllocAddress(Address), Size(Size), Alignment(Alignment), - IsCode(IsCode), IsReadOnly(IsReadOnly) {} + IsCode(IsCode), IsReadOnly(IsReadOnly), FileAddress(FileAddress), + FileOffset(FileOffset) {} }; /// Class responsible for allocating and managing code and data sections. @@ -140,6 +143,12 @@ class RewriteInstance { private: + /// Huge page size used for alignment. + static constexpr unsigned PageAlign = 0x200000; + + /// Maximum alignment for non-allocatable section. + static constexpr unsigned MaxNonAllocAlign = 16; + /// Detect storage available in the binary for allocating new sections. void discoverStorage(); @@ -168,6 +177,9 @@ class RewriteInstance { /// optimized code for selected functions. std::unique_ptr Out; + /// Offset in the input file where non-allocatable sections start. + uint64_t FirstNonAllocatableOffset{0}; + uint64_t PHDRTableAddress{0}; uint64_t PHDRTableOffset{0}; @@ -179,7 +191,8 @@ class RewriteInstance { /// Track next available address in the new text segment. uint64_t NextAvailableAddress{0}; - SectionInfo EHFrameHdrSecInfo; + /// Information on sections to re-write in the binary. + std::map SectionsToRewrite; /// Store all non-zero symbols in this map for a quick address lookup. 
std::map FileSymRefs; @@ -195,8 +208,6 @@ class RewriteInstance { uint64_t FrameHdrAlign{1}; const llvm::DWARFFrame *EHFrame{nullptr}; StringRef NewEhFrameContents; - uint64_t NewEhFrameAddress{0}; - uint64_t NewEhFrameOffset{0}; /// Keep track of functions we fail to write in the binary. We need to avoid /// rewriting CFI info for these functions. From 4cdf35a78e5e3679a9dc0145b5f072b68cf9bab0 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 22 Feb 2016 18:25:43 -0800 Subject: [PATCH 070/904] Don't abort on unknown CFI instructions. Summary: If we see an unknown CFI instruction, skip processing the function containing it instead of aborting execution. (cherry picked from commit c30215bc8d7f9f0ca7ad948dc289345dc7f844d0) --- bolt/Exceptions.cpp | 34 +++++++++++++++++++++------------- bolt/Exceptions.h | 2 +- bolt/RewriteInstance.cpp | 9 +++++++-- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 7446227e21c2..fd5e05067d7b 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -492,11 +492,11 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer) { const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0; const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f; -void CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { +bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { uint64_t Address = Function.getAddress(); auto I = FDEs.find(Address); if (I == FDEs.end()) - return; + return true; const FDE &CurFDE = *I->second; if (Function.getSize() != CurFDE.getAddressRange()) { @@ -613,33 +613,41 @@ void CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { break; case DW_CFA_val_offset_sf: case DW_CFA_val_offset: - llvm_unreachable("DWARF val_offset() unimplemented"); - break; + errs() << "BOLT-WARNING: DWARF val_offset() unimplemented\n"; + return false; case DW_CFA_expression: case DW_CFA_def_cfa_expression: case DW_CFA_val_expression: - llvm_unreachable("DWARF 
CFA expressions unimplemented"); - break; + errs() << "BOLT-WARNING: DWARF CFA expressions unimplemented\n"; + return false; case DW_CFA_MIPS_advance_loc8: - llvm_unreachable("DW_CFA_MIPS_advance_loc unimplemented"); - break; + errs() << "BOLT-WARNING: DW_CFA_MIPS_advance_loc unimplemented\n"; + return false; case DW_CFA_GNU_window_save: case DW_CFA_lo_user: case DW_CFA_hi_user: - llvm_unreachable("DW_CFA_GNU_* and DW_CFA_*_user unimplemented"); - break; + errs() << + "BOLT-WARNING: DW_CFA_GNU_* and DW_CFA_*_user unimplemented\n"; + return false; default: - llvm_unreachable("Unrecognized CFI instruction"); + errs() << "BOLT-WARNING: Unrecognized CFI instruction\n"; + return false; } + + return true; }; for (const FrameEntry::Instruction &Instr : *(CurFDE.getLinkedCIE())) { - decodeFrameInstruction(Instr); + if (!decodeFrameInstruction(Instr)) + return false; } for (const FrameEntry::Instruction &Instr : CurFDE) { - decodeFrameInstruction(Instr); + if (!decodeFrameInstruction(Instr)) + return false; } + + return true; } void CFIReaderWriter::rewriteHeaderFor(StringRef EHFrame, diff --git a/bolt/Exceptions.h b/bolt/Exceptions.h index 3e98a3949e9a..52f298d632d7 100644 --- a/bolt/Exceptions.h +++ b/bolt/Exceptions.h @@ -44,7 +44,7 @@ class CFIReaderWriter { using FDEsMap = std::map; - void fillCFIInfoFor(BinaryFunction &Function) const; + bool fillCFIInfoFor(BinaryFunction &Function) const; // Include a new EHFrame, updating the .eh_frame_hdr void rewriteHeaderFor(StringRef EHFrame, uint64_t EHFrameAddress, diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 5a4ce1a53d6d..414a1502269c 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -706,8 +706,13 @@ void RewriteInstance::disassembleFunctions() { continue; // Fill in CFI information for this function - if (EHFrame->ParseError.empty() && Function.isSimple()) { - CFIRdWrt->fillCFIInfoFor(Function); + if (EHFrame->ParseError.empty()) { + if (!CFIRdWrt->fillCFIInfoFor(Function)) { 
+ errs() << "BOLT-WARNING: unable to fill CFI for function " + << Function.getName() << '\n'; + Function.setSimple(false); + continue; + } } // Parse LSDA. From 8e5cb586882882061625531e50471b1e87662397 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 22 Feb 2016 16:49:26 -0800 Subject: [PATCH 071/904] Always split functions under '-split-functions=1' option. Summary: Force the splitting of the function into hot/cold even when the function fits into original slot. This reduces BOLT optimization time by 50% without affecting hhvm performance. (cherry picked from commit f3e00577effcc7e54cbfb004c80f1510f89bdb90) --- bolt/RewriteInstance.cpp | 33 +-------------------------------- bolt/RewriteInstance.h | 10 ---------- 2 files changed, 1 insertion(+), 42 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 414a1502269c..3eda5951b936 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -472,18 +472,6 @@ void RewriteInstance::run() { runOptimizationPasses(); emitFunctions(); - if (opts::SplitFunctions && splitLargeFunctions()) { - // Emit again because now some functions have been split - outs() << "BOLT: split-functions: starting pass 2...\n"; - reset(); - discoverStorage(); - readSymbolTable(); - readSpecialSections(); - disassembleFunctions(); - runOptimizationPasses(); - emitFunctions(); - } - // Copy input file to output std::error_code EC; Out = llvm::make_unique(opts::OutputFilename, EC, @@ -838,8 +826,7 @@ void RewriteInstance::runOptimizationPasses() { } if (opts::ReorderBlocks != BinaryFunction::LT_NONE) { - bool ShouldSplit = ToSplit.find(BFI.first) != ToSplit.end(); - BFI.second.modifyLayout(opts::ReorderBlocks, ShouldSplit); + BFI.second.modifyLayout(opts::ReorderBlocks, opts::SplitFunctions); if (opts::PrintAll || opts::PrintReordered) Function.print(errs(), "after reordering blocks"); } @@ -1225,24 +1212,6 @@ void RewriteInstance::emitFunctions() { TempOut->keep(); } -bool 
RewriteInstance::splitLargeFunctions() { - bool Changed = false; - for (auto &BFI : BinaryFunctions) { - auto &Function = BFI.second; - - // Ignore this function if we failed to map it to the output binary - if (Function.getImageAddress() == 0 || Function.getImageSize() == 0) - continue; - - if (Function.getImageSize() <= Function.getMaxSize()) - continue; - - ToSplit.insert(BFI.first); - Changed = true; - } - return Changed; -} - void RewriteInstance::patchELF() { auto ELF64LEFile = dyn_cast(File); if (!ELF64LEFile) { diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index f0ba0b2921eb..2ad4efed2cb2 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -127,13 +127,6 @@ class RewriteInstance { /// performing final relaxation. void emitFunctions(); - /// Check which functions became larger than their original version and - /// annotate function splitting information. - /// - /// Returns true if any function was annotated, requiring us to perform a - /// second pass to emit those functions in two parts. - bool splitLargeFunctions(); - /// Rewrite back all functions (hopefully optimized) that fit in the original /// memory footprint for that function. If the function is now larger and does /// not fit in the binary, reject it and preserve the original version of the @@ -213,9 +206,6 @@ class RewriteInstance { /// rewriting CFI info for these functions. std::vector FailedAddresses; - /// Keep track of which functions to split in a second pass. - std::set ToSplit; - /// Total hotness score according to profiling data for this binary. uint64_t TotalScore{0}; From 65cd4c63bb5f65b7794e35ad85ef98e8e948d815 Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Thu, 25 Feb 2016 16:57:07 -0800 Subject: [PATCH 072/904] BOLT: Read and tie .debug_line info to IR. Summary: Reads information in the DWARF .debug_line section using LLVM and tie every MCInst to one line of a line table from the input binary. 
Subsequent diffs will update this information to match the final binary layout and output updated line tables. (cherry picked from commit b36bb4eb1df7060c6899c4162e30bdf5d31022ef) --- bolt/BinaryContext.cpp | 7 +++ bolt/BinaryContext.h | 24 ++++++- bolt/BinaryFunction.cpp | 115 ++++++++++++++++++++++++++++++++-- bolt/BinaryFunction.h | 19 +++++- bolt/CMakeLists.txt | 1 + bolt/DebugLineTableRowRef.cpp | 21 +++++++ bolt/DebugLineTableRowRef.h | 63 +++++++++++++++++++ bolt/RewriteInstance.cpp | 38 ++++++----- bolt/RewriteInstance.h | 1 - 9 files changed, 264 insertions(+), 25 deletions(-) create mode 100644 bolt/DebugLineTableRowRef.cpp create mode 100644 bolt/DebugLineTableRowRef.h diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 88e784ff7963..46ef7b046109 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -43,5 +43,12 @@ MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, return Symbol; } + +void BinaryContext::buildOffsetToDWARFCompileUnitMap() { + for (const auto &CU : DwCtx->compile_units()) { + OffsetToDwarfCU[CU->getOffset()] = CU.get(); + } +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index f7d817aa1fe0..444492bf0d49 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -15,6 +15,8 @@ #define LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H #include "llvm/ADT/Triple.h" +#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeEmitter.h" @@ -55,8 +57,14 @@ class BinaryContext { // Set of addresses we cannot relocate because we have a direct branch to it. std::set InterproceduralBranchTargets; + // Map from offset in the .debug_info section of the binary the + // DWARF Compilation Unit that starts at that offset. 
+ std::map OffsetToDwarfCU; + std::unique_ptr Ctx; + std::unique_ptr DwCtx; + std::unique_ptr TheTriple; const Target *TheTarget; @@ -86,6 +94,7 @@ class BinaryContext { const DataReader &DR; BinaryContext(std::unique_ptr Ctx, + std::unique_ptr DwCtx, std::unique_ptr TheTriple, const Target *TheTarget, std::string TripleName, @@ -98,8 +107,10 @@ class BinaryContext { std::unique_ptr MIA, std::unique_ptr MRI, std::unique_ptr DisAsm, - const DataReader &DR) : + const DataReader &DR, + bool LoadDebugContext) : Ctx(std::move(Ctx)), + DwCtx(std::move(DwCtx)), TheTriple(std::move(TheTriple)), TheTarget(TheTarget), TripleName(TripleName), @@ -112,7 +123,11 @@ class BinaryContext { MIA(std::move(MIA)), MRI(std::move(MRI)), DisAsm(std::move(DisAsm)), - DR(DR) {} + DR(DR) { + if (LoadDebugContext) { + buildOffsetToDWARFCompileUnitMap(); + } + } ~BinaryContext() {} @@ -121,6 +136,11 @@ class BinaryContext { /// If there are multiple symbols registered at the \p Address, then /// return the first one. 
MCSymbol *getOrCreateGlobalSymbol(uint64_t Address, Twine Prefix); + +private: + // Iterates over all DWARF compilation units and maps their offset in the + // binary to themselves in OffsetDwarfCUMap + void buildOffsetToDWARFCompileUnitMap(); }; } // namespace bolt diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index b1aa3101b529..aa1febbab114 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -13,7 +13,9 @@ #include "BinaryBasicBlock.h" #include "BinaryFunction.h" #include "DataReader.h" +#include "DebugLineTableRowRef.h" #include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -38,8 +40,35 @@ namespace opts { static cl::opt PrintClusters("print-clusters", cl::desc("print clusters"), cl::Optional); +static cl::opt +PrintDebugInfo("print-debug-info", + cl::desc("print debug info when printing functions"), + cl::Hidden); + } // namespace opts +namespace { + +// Finds which DWARF compile unit owns an address in the executable by +// querying .debug_aranges. +DWARFCompileUnit *FindCompileUnitForAddress(uint64_t Address, + const BinaryContext &BC) { + auto DebugAranges = BC.DwCtx->getDebugAranges(); + if (!DebugAranges) + return nullptr; + + uint32_t CompileUnitIndex = DebugAranges->findAddress(Address); + + auto It = BC.OffsetToDwarfCU.find(CompileUnitIndex); + if (It == BC.OffsetToDwarfCU.end()) { + return nullptr; + } else { + return It->second; + } +} + +} // namespace + uint64_t BinaryFunction::Count = 0; BinaryBasicBlock * @@ -135,6 +164,15 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, } }; + // Used in printInstruction below to print debug line information. + DWARFCompileUnit *Unit = nullptr; + const DWARFDebugLine::LineTable *LineTable = nullptr; + + if (opts::PrintDebugInfo) { + Unit = FindCompileUnitForAddress(getAddress(), BC); + LineTable = Unit ? 
BC.DwCtx->getLineTableForUnit(Unit) : nullptr; + } + auto printInstruction = [&](const MCInst &Instruction) { if (BC.MIA->isEHLabel(Instruction)) { OS << " EH_LABEL: " @@ -168,6 +206,21 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << "; action: " << Action; } } + if (opts::PrintDebugInfo && LineTable) { + auto RowRef = DebugLineTableRowRef::fromSMLoc(Instruction.getLoc()); + + if (RowRef != DebugLineTableRowRef::NULL_ROW) { + const auto &Row = LineTable->Rows[RowRef.RowIndex]; + OS << " # debug line " + << LineTable->Prologue.FileNames[Row.File - 1].Name + << ":" << Row.Line; + + if (Row.Column) { + OS << ":" << Row.Column; + } + } + } + OS << "\n"; // In case we need MCInst printer: // Instr.dump_pretty(OS, InstructionPrinter.get()); @@ -294,12 +347,18 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << "End of Function \"" << getName() << "\"\n\n"; } -bool BinaryFunction::disassemble(ArrayRef FunctionData) { +bool BinaryFunction::disassemble(ArrayRef FunctionData, + bool ExtractDebugLineData) { assert(FunctionData.size() == getSize() && "function size does not match raw data size"); auto &Ctx = BC.Ctx; auto &MIA = BC.MIA; + DWARFCompileUnit *CompileUnit = nullptr; + + if (ExtractDebugLineData) { + CompileUnit = FindCompileUnitForAddress(getAddress(), BC); + } // Insert a label at the beginning of the function. This will be our first // basic block. @@ -335,16 +394,18 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { for (uint64_t Offset = 0; IsSimple && (Offset < getSize()); ) { MCInst Instruction; uint64_t Size; + uint64_t AbsoluteInstrAddr = getAddress() + Offset; + if (!BC.DisAsm->getInstruction(Instruction, Size, FunctionData.slice(Offset), - getAddress() + Offset, + AbsoluteInstrAddr, nulls(), nulls())) { // Ignore this function. Skip to the next one. 
errs() << "BOLT-WARNING: unable to disassemble instruction at offset 0x" << Twine::utohexstr(Offset) << " (address 0x" - << Twine::utohexstr(getAddress() + Offset) << ") in function " + << Twine::utohexstr(AbsoluteInstrAddr) << ") in function " << getName() << '\n'; IsSimple = false; break; @@ -353,13 +414,12 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { if (MIA->isUnsupported(Instruction)) { errs() << "BOLT-WARNING: unsupported instruction seen at offset 0x" << Twine::utohexstr(Offset) << " (address 0x" - << Twine::utohexstr(getAddress() + Offset) << ") in function " + << Twine::utohexstr(AbsoluteInstrAddr) << ") in function " << getName() << '\n'; IsSimple = false; break; } - uint64_t AbsoluteInstrAddr = getAddress() + Offset; if (MIA->isBranch(Instruction) || MIA->isCall(Instruction)) { uint64_t InstructionTarget = 0; if (MIA->evaluateBranch(Instruction, @@ -476,6 +536,12 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { } } + if (CompileUnit) { + Instruction.setLoc( + findDebugLineInformationForInstructionAt(AbsoluteInstrAddr, + CompileUnit)); + } + addInstruction(Offset, std::move(Instruction)); Offset += Size; @@ -491,6 +557,45 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { return true; } +SMLoc +BinaryFunction::findDebugLineInformationForInstructionAt( + uint64_t Address, + DWARFCompileUnit *Unit) { + // We use the pointer in SMLoc to store an instance of DebugLineTableRowRef, + // which occupies 64 bits. Thus, we can only proceed if the struct fits into + // the pointer itself. 
+ assert( + sizeof(decltype(SMLoc().getPointer())) >= sizeof(DebugLineTableRowRef) && + "Cannot fit instruction debug line information into SMLoc's pointer"); + + const DWARFDebugLine::LineTable *LineTable = + BC.DwCtx->getLineTableForUnit(Unit); + + SMLoc NullResult = DebugLineTableRowRef::NULL_ROW.toSMLoc(); + + if (!LineTable) { + return NullResult; + } + + uint32_t RowIndex = LineTable->lookupAddress(Address); + + if (RowIndex == LineTable->UnknownRowIndex) { + return NullResult; + } + + assert(RowIndex < LineTable->Rows.size() && + "Line Table lookup returned invalid index."); + + decltype(SMLoc().getPointer()) Ptr; + DebugLineTableRowRef *InstructionLocation = + reinterpret_cast(&Ptr); + + InstructionLocation->DwCompileUnitIndex = Unit->getOffset(); + InstructionLocation->RowIndex = RowIndex; + + return SMLoc::getFromPointer(Ptr); +} + bool BinaryFunction::buildCFG() { auto &MIA = BC.MIA; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index ffb295ed55d4..1096a30ba0a3 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -58,7 +58,7 @@ class BinaryFunction { enum LayoutType : char { /// LT_NONE - do not change layout of basic blocks LT_NONE = 0, /// no reordering - /// LT_REVERSE - reverse the order of basic blocks, meant for testing + /// LT_REVERSE - reverse the order of basic blocks, meant for testing /// purposes. The first basic block is left intact and the rest are /// put in the reverse order. LT_REVERSE, @@ -186,6 +186,14 @@ class BinaryFunction { return *this; } + /// Gets debug line information for the instruction located at the given + /// address in the original binary. The SMLoc's pointer is used + /// to point to this information, which is represented by a + /// DebugLineTableRowRef. The returned pointer is null if no debug line + /// information for this instruction was found. 
+ SMLoc findDebugLineInformationForInstructionAt(uint64_t Address, + DWARFCompileUnit *Unit); + const BinaryBasicBlock * getOriginalLayoutSuccessor(const BinaryBasicBlock *BB) const; @@ -434,7 +442,7 @@ class BinaryFunction { /// function and append it to the end of list of blocks. /// If \p DeriveAlignment is true, set the alignment of the block based /// on the alignment of the existing offset. - /// + /// /// Returns NULL if basic block already exists at the \p Offset. BinaryBasicBlock *addBasicBlock(uint64_t Offset, MCSymbol *Label, bool DeriveAlignment = false) { @@ -648,6 +656,10 @@ class BinaryFunction { /// /// \p FunctionData is the set bytes representing the function body. /// + /// \p ExtractDebugLineData is a flag indicating whether DWARF .debug_line + /// information should be looked up and tied to each disassembled + /// instruction. + /// /// The Function should be properly initialized before this function /// is called. I.e. function address and size should be set. /// @@ -655,7 +667,8 @@ class BinaryFunction { /// state to State:Disassembled. /// /// Returns false if disassembly failed. - bool disassemble(ArrayRef FunctionData); + bool disassemble(ArrayRef FunctionData, + bool ExtractDebugLineData = false); /// Builds a list of basic blocks with successor and predecessor info. 
/// diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 2ee858fe5b85..7b25d1e27984 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -17,6 +17,7 @@ add_llvm_tool(llvm-bolt BinaryContext.cpp BinaryFunction.cpp DataReader.cpp + DebugLineTableRowRef.cpp Exceptions.cpp RewriteInstance.cpp ) diff --git a/bolt/DebugLineTableRowRef.cpp b/bolt/DebugLineTableRowRef.cpp new file mode 100644 index 000000000000..83ed5158e6cc --- /dev/null +++ b/bolt/DebugLineTableRowRef.cpp @@ -0,0 +1,21 @@ +//===--- DebugLineTableRowRef.cpp - Identifies a row in a .debug_line table ==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "DebugLineTableRowRef.h" + + +namespace llvm { +namespace bolt { + +const DebugLineTableRowRef DebugLineTableRowRef::NULL_ROW{-1U, -1U}; + +} // namespace bolt +} // namespace llvm diff --git a/bolt/DebugLineTableRowRef.h b/bolt/DebugLineTableRowRef.h new file mode 100644 index 000000000000..66c1be5c43ff --- /dev/null +++ b/bolt/DebugLineTableRowRef.h @@ -0,0 +1,63 @@ +//===--- DebugLineTableRowRef.h - Identifies a row in a .debug_line table -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Class that references a row in a DWARFDebugLine::LineTable by the DWARF +// Context index of the DWARF Compile Unit that owns the Line Table and the row +// index. This is tied to our IR during disassembly so that we can later update +// .debug_line information. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_DEBUGLINETABLEROWREF_H +#define LLVM_TOOLS_LLVM_BOLT_DEBUGLINETABLEROWREF_H + +#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" +#include "llvm/Support/SMLoc.h" + +namespace llvm { +namespace bolt { + +struct DebugLineTableRowRef { + uint32_t DwCompileUnitIndex; + uint32_t RowIndex; + + const static DebugLineTableRowRef NULL_ROW; + + bool operator==(const DebugLineTableRowRef &Rhs) const { + return DwCompileUnitIndex == Rhs.DwCompileUnitIndex && + RowIndex == Rhs.RowIndex; + } + + bool operator!=(const DebugLineTableRowRef &Rhs) const { + return !(*this == Rhs); + } + + static DebugLineTableRowRef fromSMLoc(const SMLoc &Loc) { + union { + decltype(Loc.getPointer()) Ptr; + DebugLineTableRowRef Ref; + } U; + U.Ptr = Loc.getPointer(); + return U.Ref; + } + + SMLoc toSMLoc() const { + union { + decltype(SMLoc().getPointer()) Ptr; + DebugLineTableRowRef Ref; + } U; + U.Ref = *this; + return SMLoc::getFromPointer(U.Ptr); + } +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 3eda5951b936..546f69ca6855 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -94,6 +94,11 @@ SplitFunctions("split-functions", cl::desc("split functions into hot and cold distinct regions"), cl::Optional); +static cl::opt +UpdateDebugSections("update-debug-sections", + cl::desc("update DWARF debug sections of the executable"), + cl::Optional); + static cl::opt ReorderBlocks( "reorder-blocks", @@ -258,7 +263,9 @@ bool ExecutableFileMemoryManager::finalizeMemory(std::string *ErrMsg) { /// triple \p TripleName. 
static std::unique_ptr CreateBinaryContext( std::string ArchName, - std::string TripleName, const DataReader &DR) { + std::string TripleName, + const DataReader &DR, + std::unique_ptr DwCtx) { std::string Error; @@ -343,6 +350,7 @@ static std::unique_ptr CreateBinaryContext( auto BC = llvm::make_unique(std::move(Ctx), + std::move(DwCtx), std::move(TheTriple), TheTarget, TripleName, @@ -355,15 +363,18 @@ static std::unique_ptr CreateBinaryContext( std::move(MIA), std::move(MRI), std::move(DisAsm), - DR); + DR, + opts::UpdateDebugSections); return BC; } RewriteInstance::RewriteInstance(ELFObjectFileBase *File, const DataReader &DR) - : File(File), BC(CreateBinaryContext("x86-64", "x86_64-unknown-linux", DR)), - DwCtx(new DWARFContextInMemory(*File)) {} + : File(File), + BC(CreateBinaryContext("x86-64", "x86_64-unknown-linux", DR, + std::unique_ptr(new DWARFContextInMemory(*File)))) +{ } RewriteInstance::~RewriteInstance() {} @@ -371,8 +382,8 @@ void RewriteInstance::reset() { BinaryFunctions.clear(); FileSymRefs.clear(); auto &DR = BC->DR; - BC = CreateBinaryContext("x86-64", "x86_64-unknown-linux", DR); - DwCtx.reset(new DWARFContextInMemory(*File)); + BC = CreateBinaryContext("x86-64", "x86_64-unknown-linux", DR, + std::unique_ptr(new DWARFContextInMemory(*File))); CFIRdWrt.reset(nullptr); SectionMM.reset(nullptr); Out.reset(nullptr); @@ -615,7 +626,7 @@ void RewriteInstance::readSpecialSections() { FrameHdrCopy = std::vector(FrameHdrContents.begin(), FrameHdrContents.end()); // Process debug sections. 
- EHFrame = DwCtx->getEHFrame(); + EHFrame = BC->DwCtx->getEHFrame(); if (opts::DumpEHFrame) { EHFrame->dump(outs()); } @@ -684,11 +695,11 @@ void RewriteInstance::disassembleFunctions() { (SectionContents.data()) + FunctionOffset, Function.getSize()); - if (!Function.disassemble(FunctionData)) + if (!Function.disassemble(FunctionData, opts::UpdateDebugSections)) continue; if (opts::PrintAll || opts::PrintDisasm) - Function.print(errs(), "after disassembly"); + Function.print(errs(), "after disassembly", true); if (!Function.isSimple()) continue; @@ -711,7 +722,7 @@ void RewriteInstance::disassembleFunctions() { continue; if (opts::PrintAll || opts::PrintCFG) - Function.print(errs(), "after building cfg"); + Function.print(errs(), "after building cfg", true); TotalScore += Function.getFunctionScore(); @@ -822,13 +833,13 @@ void RewriteInstance::runOptimizationPasses() { } if (opts::PrintAll || opts::PrintUCE) - Function.print(errs(), "after unreachable code elimination"); + Function.print(errs(), "after unreachable code elimination", true); } if (opts::ReorderBlocks != BinaryFunction::LT_NONE) { BFI.second.modifyLayout(opts::ReorderBlocks, opts::SplitFunctions); if (opts::PrintAll || opts::PrintReordered) - Function.print(errs(), "after reordering blocks"); + Function.print(errs(), "after reordering blocks", true); } // Post-processing passes. @@ -844,8 +855,7 @@ void RewriteInstance::runOptimizationPasses() { // Update exception handling information. 
Function.updateEHRanges(); if (opts::PrintAll || opts::PrintEHRanges) - Function.print(errs(), "after updating EH ranges"); - + Function.print(errs(), "after updating EH ranges", true); } } diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 2ad4efed2cb2..8c63ed7345e2 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -161,7 +161,6 @@ class RewriteInstance { llvm::object::ELFObjectFileBase *File; std::unique_ptr BC; - std::unique_ptr DwCtx; std::unique_ptr CFIRdWrt; /// Our in-memory intermediary object file where we hold final code for /// rewritten functions. From f7b5cae960d07d0d8e487e8301ca57f50cd1c999 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 3 Mar 2016 10:13:11 -0800 Subject: [PATCH 073/904] Extending support for non-allocatable sections. Summary: The is a set of changes that allow modification of non-allocatable sections in ELF binary. Primarily for the purpose of updating debug info. Extend LLVM interface to allow processing relocations in non-allocatable sections. This allows to produce .debug* sections with resolved relocations against generated code. Extend BOLT rewriting framework to allow appending contents to non-allocatable sections in the binary. Re-worked ELF binary rewriting to support the above and to allow future extensions (e.g. new section names). 
(cherry picked from commit a48b0da7ca77bd168cc37842ae7bbd8d7bb3f114) --- bolt/RewriteInstance.cpp | 337 ++++++++++++++++++++++++++------------- bolt/RewriteInstance.h | 26 ++- 2 files changed, 249 insertions(+), 114 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 546f69ca6855..632ff12ca73b 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -254,11 +254,41 @@ uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size, return ret; } +void ExecutableFileMemoryManager::recordNoteSection( + const uint8_t *Data, + uintptr_t Size, + unsigned Alignment, + unsigned SectionID, + StringRef SectionName) { + DEBUG(dbgs() << "BOLT: note section " + << SectionName + << " with size " << Size << ", alignment " << Alignment + << " at 0x" + << Twine::utohexstr(reinterpret_cast(Data)) << '\n'); + if (SectionName == ".debug_line") { + // We need to make a copy of the section contents if we'll need it for + // a future reference. + uint8_t *p = new uint8_t[Size]; + memcpy(p, Data, Size); + NoteSectionInfo[SectionName] = SectionInfo(reinterpret_cast(p), + Size, + Alignment, + /*IsCode=*/false, + /*IsReadOnly*/true); + } +} + bool ExecutableFileMemoryManager::finalizeMemory(std::string *ErrMsg) { DEBUG(dbgs() << "BOLT: finalizeMemory()\n"); return SectionMemoryManager::finalizeMemory(ErrMsg); } +ExecutableFileMemoryManager::~ExecutableFileMemoryManager() { + for (auto &SII : NoteSectionInfo) { + delete[] reinterpret_cast(SII.second.AllocAddress); + } +} + /// Create BinaryContext for a given architecture \p ArchName and /// triple \p TripleName. 
static std::unique_ptr CreateBinaryContext( @@ -371,10 +401,10 @@ static std::unique_ptr CreateBinaryContext( RewriteInstance::RewriteInstance(ELFObjectFileBase *File, const DataReader &DR) - : File(File), + : InputFile(File), BC(CreateBinaryContext("x86-64", "x86_64-unknown-linux", DR, - std::unique_ptr(new DWARFContextInMemory(*File)))) -{ } + std::unique_ptr(new DWARFContextInMemory(*InputFile)))) { +} RewriteInstance::~RewriteInstance() {} @@ -383,7 +413,7 @@ void RewriteInstance::reset() { FileSymRefs.clear(); auto &DR = BC->DR; BC = CreateBinaryContext("x86-64", "x86_64-unknown-linux", DR, - std::unique_ptr(new DWARFContextInMemory(*File))); + std::unique_ptr(new DWARFContextInMemory(*InputFile))); CFIRdWrt.reset(nullptr); SectionMM.reset(nullptr); Out.reset(nullptr); @@ -393,17 +423,13 @@ void RewriteInstance::reset() { } void RewriteInstance::discoverStorage() { - auto ELF64LEFile = dyn_cast(File); + auto ELF64LEFile = dyn_cast(InputFile); if (!ELF64LEFile) { errs() << "BOLT-ERROR: only 64-bit LE ELF binaries are supported\n"; exit(1); } - auto Obj = ELF64LEFile->getELFFile(); - - // Discover important addresses in the binary. - // This is where the first segment and ELF header were allocated. uint64_t FirstAllocAddress = std::numeric_limits::max(); @@ -439,6 +465,9 @@ void RewriteInstance::discoverStorage() { // We create the new PHDR table in such a way that both of the methods // of loading and locating the table work. There's a slight file size // overhead because of that. + // + // NB: bfd's strip command cannot do the above and will corrupt the + // binary during the process of stripping non-allocatable sections. if (NextAvailableOffset <= NextAvailableAddress - FirstAllocAddress) { NextAvailableOffset = NextAvailableAddress - FirstAllocAddress; } else { @@ -483,14 +512,14 @@ void RewriteInstance::run() { runOptimizationPasses(); emitFunctions(); - // Copy input file to output + // Copy allocatable part of the input. 
std::error_code EC; Out = llvm::make_unique(opts::OutputFilename, EC, sys::fs::F_None, 0777); check_error(EC, "cannot create output executable file"); - Out->os() << File->getData(); + Out->os() << InputFile->getData().substr(0, FirstNonAllocatableOffset); - // Rewrite optimized functions back to this output + // Rewrite allocatable contents and copy non-allocatable parts with mods. rewriteFile(); } @@ -503,7 +532,7 @@ void RewriteInstance::readSymbolTable() { // For local symbols we want to keep track of associated FILE symbol for // disambiguation by name. - for (const SymbolRef &Symbol : File->symbols()) { + for (const SymbolRef &Symbol : InputFile->symbols()) { // Keep undefined symbols for pretty printing? if (Symbol.getFlags() & SymbolRef::SF_Undefined) continue; @@ -585,7 +614,7 @@ void RewriteInstance::readSymbolTable() { ErrorOr SectionOrErr = Symbol.getSection(); check_error(SectionOrErr.getError(), "cannot get symbol section"); section_iterator Section = *SectionOrErr; - if (Section == File->section_end()) { + if (Section == InputFile->section_end()) { // Could be an absolute symbol. Could record for pretty printing. continue; } @@ -602,7 +631,7 @@ void RewriteInstance::readSymbolTable() { void RewriteInstance::readSpecialSections() { // Process special sections. 
StringRef FrameHdrContents; - for (const auto &Section : File->sections()) { + for (const auto &Section : InputFile->sections()) { StringRef SectionName; check_error(Section.getName(SectionName), "cannot get section name"); StringRef SectionContents; @@ -645,8 +674,8 @@ void RewriteInstance::disassembleFunctions() { BinaryFunction &Function = BFI.second; if (!opts::shouldProcess(Function)) { - DEBUG(dbgs() << "BOLT: skipping processing function " << Function.getName() - << " per user request.\n"); + DEBUG(dbgs() << "BOLT: skipping processing function " + << Function.getName() << " per user request.\n"); continue; } @@ -688,7 +717,7 @@ void RewriteInstance::disassembleFunctions() { // Offset of the function in the file. Function.setFileOffset( - SectionContents.data() - File->getData().data() + FunctionOffset); + SectionContents.data() - InputFile->getData().data() + FunctionOffset); ArrayRef FunctionData( reinterpret_cast @@ -1122,7 +1151,8 @@ void RewriteInstance::emitFunctions() { auto ObjectsHandle = OLT.addObjectSet( singletonSet(std::move(ObjOrErr.get())), SectionMM.get(), - std::move(Resolver)); + std::move(Resolver), + /* ProcessAllSections = */true); // FIXME: use notifyObjectLoaded() to remap sections. @@ -1222,8 +1252,8 @@ void RewriteInstance::emitFunctions() { TempOut->keep(); } -void RewriteInstance::patchELF() { - auto ELF64LEFile = dyn_cast(File); +void RewriteInstance::patchELFPHDRTable() { + auto ELF64LEFile = dyn_cast(InputFile); if (!ELF64LEFile) { errs() << "BOLT-ERROR: only 64-bit LE ELF binaries are supported\n"; exit(1); @@ -1232,7 +1262,7 @@ void RewriteInstance::patchELF() { auto &OS = Out->os(); // Write/re-write program headers. - unsigned Phnum = Obj->getHeader()->e_phnum; + Phnum = Obj->getHeader()->e_phnum; if (PHDRTableOffset) { // Writing new pheader table. 
Phnum += 1; // only adding one new segment @@ -1240,7 +1270,6 @@ void RewriteInstance::patchELF() { NewTextSegmentSize = NextAvailableAddress - PHDRTableAddress; } else { assert(!PHDRTableAddress && "unexpected address for program header table"); - // Update existing table. PHDRTableOffset = Obj->getHeader()->e_phoff; NewTextSegmentSize = NextAvailableAddress - NewTextSegmentAddress; @@ -1302,29 +1331,109 @@ void RewriteInstance::patchELF() { assert((opts::UseGnuStack || AddedSegment) && "could not add program header for the new segment"); +} + +void RewriteInstance::rewriteNoteSections() { + auto ELF64LEFile = dyn_cast(InputFile); + if (!ELF64LEFile) { + errs() << "BOLT-ERROR: only 64-bit LE ELF binaries are supported\n"; + exit(1); + } + auto Obj = ELF64LEFile->getELFFile(); + auto &OS = Out->os(); - // Copy original non-allocatable contents and update section offsets. uint64_t NextAvailableOffset = getFileOffsetFor(NextAvailableAddress); assert(NextAvailableOffset >= FirstNonAllocatableOffset && "next available offset calculation failure"); + OS.seek(NextAvailableOffset); + + // Copy over non-allocatable section contents and update file offsets. + for (auto &Section : Obj->sections()) { + if (Section.sh_type == ELF::SHT_NULL) + continue; + if (Section.sh_flags & ELF::SHF_ALLOC) + continue; - // Re-write using this offset delta. - uint64_t OffsetDelta = NextAvailableOffset - FirstNonAllocatableOffset; + // Insert padding as needed. + if (Section.sh_addralign > 1) { + auto Padding = OffsetToAlignment(NextAvailableOffset, + Section.sh_addralign); + const unsigned char ZeroByte{0}; + for (unsigned I = 0; I < Padding; ++I) + OS.write(ZeroByte); - // Make sure offset delta is a multiple of alignment; - OffsetDelta = RoundUpToAlignment(OffsetDelta, MaxNonAllocAlign); - NextAvailableOffset = FirstNonAllocatableOffset + OffsetDelta; + NextAvailableOffset += Padding; - // FIXME: only write up to SHDR table. 
- OS.seek(NextAvailableOffset); - OS << File->getData().drop_front(FirstNonAllocatableOffset); + assert(Section.sh_size % Section.sh_addralign == 0 && + "section size does not match section alignment"); + } + + // Copy over section contents. + auto Size = Section.sh_size; + OS << InputFile->getData().substr(Section.sh_offset, Size); + + // Address of extension to the section. + uint64_t Address{0}; + + // Append new section contents if available. + ErrorOr SectionName = Obj->getSectionName(&Section); + check_error(SectionName.getError(), "cannot get section name"); + + auto SII = SectionMM->NoteSectionInfo.find(*SectionName); + if (SII != SectionMM->NoteSectionInfo.end()) { + auto &SI = SII->second; + assert(SI.Alignment <= Section.sh_addralign && + "alignment exceeds value in file"); + outs() << "BOLT: appending contents to section " << *SectionName << '\n'; + // Write section extension. + Address = SI.AllocAddress; + OS.write(reinterpret_cast(Address), SI.Size); + Size += SI.Size; + } + + // Set/modify section info. + SectionMM->NoteSectionInfo[*SectionName] = + SectionInfo(Address, + Size, + Section.sh_addralign, + /*IsCode=*/false, + /*IsReadOnly=*/false, + /*FileAddress=*/0, + NextAvailableOffset); + + NextAvailableOffset += Size; + } +} + +// Rewrite section header table inserting new entries as needed. The sections +// header table size itself may affect the offsets of other sections, +// so we are placing it at the end of the binary. +// +// As we rewrite entries we need to track how many sections were inserted +// as it changes the sh_link value. +// +// The following are assumptoins about file modifications: +// * There are no modifications done to existing allocatable sections. +// * All new allocatable sections are written emmediately after existing +// allocatable sections. +// * There could be modifications done to non-allocatable sections, e.g. +// size could be increased. +// * New non-allocatable sections are added to the end of the file. 
+void RewriteInstance::patchELFSectionHeaderTable() { + auto ELF64LEFile = dyn_cast(InputFile); + if (!ELF64LEFile) { + errs() << "BOLT-ERROR: only 64-bit LE ELF binaries are supported\n"; + exit(1); + } + auto Obj = ELF64LEFile->getELFFile(); + using Elf_Shdr = std::remove_pointer::type::Elf_Shdr; + + auto &OS = Out->os(); - bool SeenNonAlloc = false; - uint64_t ExtraDelta = 0; // for dynamically adjusting delta - unsigned NumNewSections = 0; + auto SHTOffset = OS.tell(); - // Update section table. Note that the section table itself has shifted. - OS.seek(Obj->getHeader()->e_shoff + OffsetDelta); + // Copy over entries for original allocatable sections with minor + // modifications (e.g. name). for (auto &Section : Obj->sections()) { // Always ignore this section. if (Section.sh_type == ELF::SHT_NULL) { @@ -1332,96 +1441,102 @@ void RewriteInstance::patchELF() { continue; } - auto NewSection = Section; - uint64_t SectionLoc = (uintptr_t)&Section - (uintptr_t)Obj->base(); + // Break at first non-allocatable section. + if (!(Section.sh_flags & ELF::SHF_ALLOC)) + break; ErrorOr SectionName = Obj->getSectionName(&Section); check_error(SectionName.getError(), "cannot get section name"); - if (!(Section.sh_flags & ELF::SHF_ALLOC)) { - if (!SeenNonAlloc) { + auto NewSection = Section; + if (*SectionName == ".bss") { + // .bss section offset matches that of the next section. + NewSection.sh_offset = NewTextSegmentOffset; + } - // This is where we place all our new sections. 
+ auto SMII = SectionMM->SectionMapInfo.find(*SectionName); + if (SMII != SectionMM->SectionMapInfo.end()) { + auto &SecInfo = SMII->second; + SecInfo.ShName = Section.sh_name; + } - std::vector SectionsToRewrite; - for (auto &SMII : SectionMM->SectionMapInfo) { - SectionInfo &SI = SMII.second; - if (SI.IsCode && SMII.first != ".bolt.text") - continue; - errs() << "BOLT-INFO: re-writing section header for " - << SMII.first << '\n'; - auto NewSection = Section; - NewSection.sh_name = SI.ShName; - NewSection.sh_type = ELF::SHT_PROGBITS; - NewSection.sh_addr = SI.FileAddress; - NewSection.sh_offset = SI.FileOffset; - NewSection.sh_size = SI.Size; - NewSection.sh_entsize = 0; - NewSection.sh_flags = ELF::SHF_ALLOC | ELF::SHF_EXECINSTR; - NewSection.sh_link = 0; - NewSection.sh_info = 0; - NewSection.sh_addralign = SI.Alignment; - SectionsToRewrite.emplace_back(NewSection); - } + OS.write(reinterpret_cast(&NewSection), sizeof(NewSection)); + } - // Do actual writing after sorting out. - OS.seek(SectionLoc + OffsetDelta); - std::stable_sort(SectionsToRewrite.begin(), SectionsToRewrite.end(), - [] (decltype(Section) A, decltype(Section) B) { - return A.sh_offset < B.sh_offset; - }); - for (auto &SI : SectionsToRewrite) { - OS.write(reinterpret_cast(&SI), - sizeof(SI)); - } + // Create entries for new allocatable sections. + std::vector SectionsToRewrite; + for (auto &SMII : SectionMM->SectionMapInfo) { + SectionInfo &SI = SMII.second; + // Ignore function sections. 
+ if (SI.IsCode && SMII.first != ".bolt.text") + continue; + errs() << "BOLT-INFO: writing section header for " + << SMII.first << '\n'; + Elf_Shdr NewSection; + NewSection.sh_name = SI.ShName; + NewSection.sh_type = ELF::SHT_PROGBITS; + NewSection.sh_addr = SI.FileAddress; + NewSection.sh_offset = SI.FileOffset; + NewSection.sh_size = SI.Size; + NewSection.sh_entsize = 0; + NewSection.sh_flags = ELF::SHF_ALLOC | ELF::SHF_EXECINSTR; + NewSection.sh_link = 0; + NewSection.sh_info = 0; + NewSection.sh_addralign = SI.Alignment; + SectionsToRewrite.emplace_back(NewSection); + } - NumNewSections = SectionsToRewrite.size(); - ExtraDelta += sizeof(Section) * NumNewSections; + // Write section header entries for new allocatable sections in offset order. + std::stable_sort(SectionsToRewrite.begin(), SectionsToRewrite.end(), + [] (Elf_Shdr A, Elf_Shdr B) { + return A.sh_offset < B.sh_offset; + }); + for (auto &SI : SectionsToRewrite) { + OS.write(reinterpret_cast(&SI), + sizeof(SI)); + } - SeenNonAlloc = true; - } + auto NumNewSections = SectionsToRewrite.size(); - assert(Section.sh_addralign <= MaxNonAllocAlign && - "unexpected alignment for non-allocatable section"); - assert(Section.sh_offset >= FirstNonAllocatableOffset && - "bad offset for non-allocatable section"); + // Copy over entries for non-allocatable sections performing necessary + // adjustements. + for (auto &Section : Obj->sections()) { + if (Section.sh_type == ELF::SHT_NULL) + continue; + if (Section.sh_flags & ELF::SHF_ALLOC) + continue; - NewSection.sh_offset = Section.sh_offset + OffsetDelta; + ErrorOr SectionName = Obj->getSectionName(&Section); + check_error(SectionName.getError(), "cannot get section name"); - if (Section.sh_offset > Obj->getHeader()->e_shoff) { - // The section is going to be shifted. 
- NewSection.sh_offset = NewSection.sh_offset + ExtraDelta; - } + auto SII = SectionMM->NoteSectionInfo.find(*SectionName); + assert(SII != SectionMM->NoteSectionInfo.end() && + "missing section info for non-allocatable section"); - if (Section.sh_link) - NewSection.sh_link = Section.sh_link + NumNewSections; + auto NewSection = Section; + NewSection.sh_offset = SII->second.FileOffset; + NewSection.sh_size = SII->second.Size; - } else if (*SectionName == ".bss") { - NewSection.sh_offset = NewTextSegmentOffset; - } + // Adjust sh_link for sections that use it. + if (Section.sh_link) + NewSection.sh_link = Section.sh_link + NumNewSections; - auto SMII = SectionMM->SectionMapInfo.find(*SectionName); - if (SMII != SectionMM->SectionMapInfo.end()) { - auto &SecInfo = SMII->second; - SecInfo.ShName = Section.sh_name; + // Adjust sh_info for relocation sections. + if (Section.sh_type == ELF::SHT_REL || Section.sh_type == ELF::SHT_RELA) { + if (Section.sh_info) + NewSection.sh_info = Section.sh_info + NumNewSections; } OS.write(reinterpret_cast(&NewSection), sizeof(NewSection)); } - // Write all the sections past the section table again as they are shifted. - auto OffsetPastShdrTable = Obj->getHeader()->e_shoff + - Obj->getHeader()->e_shnum * sizeof(ELFFile::Elf_Shdr); - OS.seek(OffsetPastShdrTable + OffsetDelta + ExtraDelta); - OS << File->getData().drop_front(OffsetPastShdrTable); - // FIXME: Update _end in .dynamic // Fix ELF header. auto NewEhdr = *Obj->getHeader(); NewEhdr.e_phoff = PHDRTableOffset; NewEhdr.e_phnum = Phnum; - NewEhdr.e_shoff = NewEhdr.e_shoff + OffsetDelta; + NewEhdr.e_shoff = SHTOffset; NewEhdr.e_shnum = NewEhdr.e_shnum + NumNewSections; NewEhdr.e_shstrndx = NewEhdr.e_shstrndx + NumNewSections; OS.pwrite(reinterpret_cast(&NewEhdr), sizeof(NewEhdr), 0); @@ -1512,9 +1627,15 @@ void RewriteInstance::rewriteFile() { } } + // Print function statistics. 
outs() << "BOLT: " << CountOverwrittenFunctions << " out of " << BinaryFunctions.size() << " functions were overwritten.\n"; + if (TotalScore != 0) { + double Coverage = OverwrittenScore / (double)TotalScore * 100.0; + outs() << format("BOLT: Rewritten functions cover %.2lf", Coverage) + << "% of the execution count of simple functions of this binary.\n"; + } // Write all non-code sections. for (auto &SMII : SectionMM->SectionMapInfo) { @@ -1560,14 +1681,14 @@ void RewriteInstance::rewriteFile() { NextAvailableAddress += EHFrameHdrSecInfo.Size; } - // Update ELF book-keeping info. - patchELF(); + // Patch program header table. + patchELFPHDRTable(); - if (TotalScore != 0) { - double Coverage = OverwrittenScore / (double)TotalScore * 100.0; - outs() << format("BOLT: Rewritten functions cover %.2lf", Coverage) - << "% of the execution count of simple functions of this binary.\n"; - } + // Copy non-allocatable sections once allocatable part is finished. + rewriteNoteSections(); + + // Update ELF book-keeping info. + patchELFSectionHeaderTable(); // TODO: we should find a way to mark the binary as optimized by us. Out->keep(); diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 8c63ed7345e2..5221d1f7c8ea 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -65,11 +65,16 @@ class ExecutableFileMemoryManager : public SectionMemoryManager { public: - // Keep [section name] -> [section info] map for later remapping. + /// Keep [section name] -> [section info] map for later remapping. std::map SectionMapInfo; + /// Information about non-allocatable sections. 
+ std::map NoteSectionInfo; + ExecutableFileMemoryManager() {} + ~ExecutableFileMemoryManager(); + uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, unsigned SectionID, StringRef SectionName) override { @@ -84,6 +89,10 @@ class ExecutableFileMemoryManager : public SectionMemoryManager { /*IsCode=*/false, IsReadOnly); } + void recordNoteSection(const uint8_t *Data, uintptr_t Size, + unsigned Alignment, unsigned SectionID, + StringRef SectionName) override; + // Tell EE that we guarantee we don't need stubs. bool allowStubAllocation() const override { return false; } @@ -139,14 +148,17 @@ class RewriteInstance { /// Huge page size used for alignment. static constexpr unsigned PageAlign = 0x200000; - /// Maximum alignment for non-allocatable section. - static constexpr unsigned MaxNonAllocAlign = 16; - - /// Detect storage available in the binary for allocating new sections. + /// Detect addresses and offsets available in the binary for allocating + /// new sections. void discoverStorage(); + /// Rewrite non-allocatable sections with modifications. + void rewriteNoteSections(); + /// Patch ELF book-keeping info. void patchELF(); + void patchELFPHDRTable(); + void patchELFSectionHeaderTable(); /// Return file offset corresponding to a given virtual address. uint64_t getFileOffsetFor(uint64_t Address) { @@ -158,7 +170,7 @@ class RewriteInstance { private: /// An instance of the input binary we are processing, externally owned. - llvm::object::ELFObjectFileBase *File; + llvm::object::ELFObjectFileBase *InputFile; std::unique_ptr BC; std::unique_ptr CFIRdWrt; @@ -172,8 +184,10 @@ class RewriteInstance { /// Offset in the input file where non-allocatable sections start. uint64_t FirstNonAllocatableOffset{0}; + /// Information about program header table. uint64_t PHDRTableAddress{0}; uint64_t PHDRTableOffset{0}; + unsigned Phnum{0}; /// New code segment info. 
uint64_t NewTextSegmentAddress{0}; From 1eed26d1bef379e888dfc767d35ca666ab4f58d4 Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Wed, 2 Mar 2016 18:40:10 -0800 Subject: [PATCH 074/904] Write updated .debug_line information to temp file Summary: Writes .debug_line section by setting the state in MCContext that LLVM needs to produce and output the line tables. This basically consists of setting the current location and compile unit offset. This makes LLVM output .debug_line in the temporary file, but not yet in the generated ELF file. Also computes the line table offsets for each compile unit and saves them into BinaryContext. Added an option to print these offsets. (cherry picked from commit 6b68b78d29b8470d5445077ab5471e67e4aea88b) --- bolt/BinaryContext.h | 4 ++ bolt/RewriteInstance.cpp | 105 +++++++++++++++++++++++++++++++++++++++ bolt/RewriteInstance.h | 4 ++ 3 files changed, 113 insertions(+) diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 444492bf0d49..dc1b8bcf70b6 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -61,6 +61,10 @@ class BinaryContext { // DWARF Compilation Unit that starts at that offset. std::map OffsetToDwarfCU; + // Maps each compile unit to the offset of its .debug_line line table in the + // output file. 
+ std::map CompileUnitLineTableOffset; + std::unique_ptr Ctx; std::unique_ptr DwCtx; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 632ff12ca73b..d4356ecb0f7d 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -14,10 +14,12 @@ #include "BinaryContext.h" #include "BinaryFunction.h" #include "DataReader.h" +#include "DebugLineTableRowRef.h" #include "Exceptions.h" #include "RewriteInstance.h" #include "llvm/ADT/STLExtras.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" #include "llvm/ExecutionEngine/RTDyldMemoryManager.h" @@ -25,6 +27,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInstPrinter.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" @@ -396,6 +399,27 @@ static std::unique_ptr CreateBinaryContext( DR, opts::UpdateDebugSections); + if (opts::UpdateDebugSections) { + // Populate MCContext with DWARF files. + for (const auto &CU : BC->DwCtx->compile_units()) { + const auto CUID = CU->getOffset(); + auto LineTable = BC->DwCtx->getLineTableForUnit(CU.get()); + const auto &FileNames = LineTable->Prologue.FileNames; + for (size_t I = 0, Size = FileNames.size(); I != Size; ++I) { + // Dir indexes start at 1, as DWARF file numbers, and a dir index 0 + // means empty dir. + const char *Dir = FileNames[I].DirIdx ? 
+ LineTable->Prologue.IncludeDirectories[FileNames[I].DirIdx - 1] : + ""; + BC->Ctx->getDwarfFile( + Dir, + FileNames[I].Name, + I + 1, + CUID); + } + } + } + return BC; } @@ -959,6 +983,10 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); MCSection *Section = FunctionSection; + + Section->setHasInstructions(true); + BC.Ctx->addGenDwarfSection(Section); + Streamer.SwitchSection(Section); Streamer.EmitCodeAlignment(Function.getAlignment()); @@ -1029,6 +1057,29 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, continue; } if (!BC.MIA->isCFI(Instr)) { + if (opts::UpdateDebugSections) { + auto RowReference = DebugLineTableRowRef::fromSMLoc(Instr.getLoc()); + if (auto CompileUnit = BC.OffsetToDwarfCU[RowReference.DwCompileUnitIndex]) { + + auto OriginalLineTable = + BC.DwCtx->getLineTableForUnit( + CompileUnit); + const auto &OriginalRow = OriginalLineTable->Rows[RowReference.RowIndex]; + + BC.Ctx->setCurrentDwarfLoc( + OriginalRow.File, + OriginalRow.Line, + OriginalRow.Column, + (DWARF2_FLAG_IS_STMT * OriginalRow.IsStmt) | + (DWARF2_FLAG_BASIC_BLOCK * OriginalRow.BasicBlock) | + (DWARF2_FLAG_PROLOGUE_END * OriginalRow.PrologueEnd) | + (DWARF2_FLAG_EPILOGUE_BEGIN * OriginalRow.EpilogueBegin), + OriginalRow.Isa, + OriginalRow.Discriminator); + BC.Ctx->setDwarfCompileUnitID(CompileUnit->getOffset()); + } + } + Streamer.EmitInstruction(Instr, *BC.STI); continue; } @@ -1120,6 +1171,11 @@ void RewriteInstance::emitFunctions() { // Assign addresses to new functions/sections. ////////////////////////////////////////////////////////////////////////////// + if (opts::UpdateDebugSections) { + // Compute offsets of tables in .debug_line for each compile unit. + computeLineTableOffsets(); + } + // Get output object as ObjectFile. 
std::unique_ptr ObjectMemBuffer = MemoryBuffer::getMemBuffer(BOS->str(), "in-memory object file", false); @@ -1693,3 +1749,52 @@ void RewriteInstance::rewriteFile() { // TODO: we should find a way to mark the binary as optimized by us. Out->keep(); } + +void RewriteInstance::computeLineTableOffsets() { + const auto LineSection = + BC->Ctx->getObjectFileInfo()->getDwarfLineSection(); + auto CurrentFragment = LineSection->begin(); + uint32_t CurrentOffset = 0; + uint32_t Offset = 0; + + // Line tables are stored in MCContext in ascending order of offset in the + // output file, thus we can compute all table's offset by passing through + // each fragment at most once, continuing from the last CU's beginning + // instead of from the first fragment. + for (const auto &CUIDLineTablePair : BC->Ctx->getMCDwarfLineTables()) { + auto Label = CUIDLineTablePair.second.getLabel(); + + if (!Label) + continue; + + auto Fragment = Label->getFragment(); + + while (&*CurrentFragment != Fragment) { + switch (CurrentFragment->getKind()) { + case MCFragment::FT_Dwarf: + Offset += cast(*CurrentFragment) + .getContents().size() - CurrentOffset; + break; + case MCFragment::FT_Data: + Offset += cast(*CurrentFragment) + .getContents().size() - CurrentOffset; + break; + default: + llvm_unreachable(".debug_line section shouldn't contain other types " + "of fragments."); + } + + ++CurrentFragment; + CurrentOffset = 0; + } + + Offset += Label->getOffset() - CurrentOffset; + CurrentOffset = Label->getOffset(); + + auto CompileUnit = BC->OffsetToDwarfCU[CUIDLineTablePair.first]; + BC->CompileUnitLineTableOffset[CompileUnit] = Offset; + + DEBUG(errs() << "BOLT-DEBUG: CU " << CUIDLineTablePair.first + << " has line table at " << Offset << "\n"); + } +} diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 5221d1f7c8ea..abcb3d8115d0 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -160,6 +160,10 @@ class RewriteInstance { void patchELFPHDRTable(); void 
patchELFSectionHeaderTable(); + /// Computes output .debug_line line table offsets for each compile unit, and + /// stores them into BinaryContext::CompileUnitLineTableOffset. + void computeLineTableOffsets(); + /// Return file offset corresponding to a given virtual address. uint64_t getFileOffsetFor(uint64_t Address) { assert(Address >= NewTextSegmentAddress && From ae5470aee039a0b951cc4b0150e8039ee4530559 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 10 Mar 2016 23:03:17 -0800 Subject: [PATCH 075/904] Proper skipping of unsupported CFI instructions. Summary: Skip DW_CFA_expression and DW_CFA_val_expression instructions properly, according to DWARF spec. If CFI range does not match function range skip that function. (cherry picked from commit 49ed9c738d09323e71108ada599376c26757bdf5) --- bolt/Exceptions.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index fd5e05067d7b..c746cf6dca95 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -505,6 +505,7 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { << format(": Function size is %dB, CFI covers " "%dB\n", Function.getSize(), CurFDE.getAddressRange()); + return false; } Function.setLSDAAddress(CurFDE.getLSDAAddress()); From f8c55fb22f7ab1071e23a867572d8a1b2c61ac58 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 9 Mar 2016 16:06:41 -0800 Subject: [PATCH 076/904] Update stmt_list value to point to new .debug_line offset. Summary: After we add new line number information we have to update stmt_list offsets in .debug_info. For this I had to add a primitive relocations support for non-allocatable sections we are copying from input file. Also enabled functionality to process relocations in non-allocatable sections that LLVM is generating, such as .debug_line. I thought we already had it, but apparently it didn't work, at least not for ELF binaries. 
(cherry picked from commit a77bd2cf7b86b9290f84ac3e59e8fbe9950a946b) --- bolt/BinaryContext.h | 3 ++ bolt/RewriteInstance.cpp | 81 +++++++++++++++++++++++++++++----------- bolt/RewriteInstance.h | 19 ++++++++-- 3 files changed, 79 insertions(+), 24 deletions(-) diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index dc1b8bcf70b6..27e66ca625b3 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -65,6 +65,9 @@ class BinaryContext { // output file. std::map CompileUnitLineTableOffset; + /// Maps DWARF CUID to offset of stmt_list attribute in .debug_info. + std::map LineTableOffsetCUMap; + std::unique_ptr Ctx; std::unique_ptr DwCtx; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index d4356ecb0f7d..70cf45ff00fb 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -257,7 +257,7 @@ uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size, return ret; } -void ExecutableFileMemoryManager::recordNoteSection( +uint8_t *ExecutableFileMemoryManager::recordNoteSection( const uint8_t *Data, uintptr_t Size, unsigned Alignment, @@ -271,13 +271,19 @@ void ExecutableFileMemoryManager::recordNoteSection( if (SectionName == ".debug_line") { // We need to make a copy of the section contents if we'll need it for // a future reference. 
- uint8_t *p = new uint8_t[Size]; - memcpy(p, Data, Size); - NoteSectionInfo[SectionName] = SectionInfo(reinterpret_cast(p), - Size, - Alignment, - /*IsCode=*/false, - /*IsReadOnly*/true); + uint8_t *DataCopy = new uint8_t[Size]; + memcpy(DataCopy, Data, Size); + NoteSectionInfo[SectionName] = + SectionInfo(reinterpret_cast(DataCopy), + Size, + Alignment, + /*IsCode=*/false, + /*IsReadOnly*/true); + return DataCopy; + } else { + DEBUG(dbgs() << "BOLT-DEBUG: ignoring section " << SectionName + << " in recordNoteSection()\n"); + return nullptr; } } @@ -404,6 +410,8 @@ static std::unique_ptr CreateBinaryContext( for (const auto &CU : BC->DwCtx->compile_units()) { const auto CUID = CU->getOffset(); auto LineTable = BC->DwCtx->getLineTableForUnit(CU.get()); + auto LineTableOffset = + BC->DwCtx->getAttrFieldOffsetForUnit(CU.get(), dwarf::DW_AT_stmt_list); const auto &FileNames = LineTable->Prologue.FileNames; for (size_t I = 0, Size = FileNames.size(); I != Size; ++I) { // Dir indexes start at 1, as DWARF file numbers, and a dir index 0 @@ -417,6 +425,9 @@ static std::unique_ptr CreateBinaryContext( I + 1, CUID); } + if (LineTableOffset) { + BC->LineTableOffsetCUMap[CUID] = LineTableOffset; + } } } @@ -668,11 +679,12 @@ void RewriteInstance::readSpecialSections() { if (SectionName == ".gcc_except_table") { LSDAData = SectionData; LSDAAddress = Section.getAddress(); - } - if (SectionName == ".eh_frame_hdr") { + } else if (SectionName == ".eh_frame_hdr") { FrameHdrAddress = Section.getAddress(); FrameHdrContents = SectionContents; FrameHdrAlign = Section.getAlignment(); + } else if (SectionName == ".debug_line") { + DebugLineSize = Section.getSize(); } } @@ -1171,6 +1183,9 @@ void RewriteInstance::emitFunctions() { // Assign addresses to new functions/sections. 
////////////////////////////////////////////////////////////////////////////// + auto EFMM = new ExecutableFileMemoryManager(); + SectionMM.reset(EFMM); + if (opts::UpdateDebugSections) { // Compute offsets of tables in .debug_line for each compile unit. computeLineTableOffsets(); @@ -1183,10 +1198,6 @@ void RewriteInstance::emitFunctions() { object::ObjectFile::createObjectFile(ObjectMemBuffer->getMemBufferRef()); check_error(ObjOrErr.getError(), "error creating in-memory object"); - auto EFMM = new ExecutableFileMemoryManager(); - SectionMM.reset(EFMM); - - // Run ObjectLinkingLayer() with custom memory manager and symbol resolver. orc::ObjectLinkingLayer<> OLT; @@ -1431,20 +1442,40 @@ void RewriteInstance::rewriteNoteSections() { // Address of extension to the section. uint64_t Address{0}; - // Append new section contents if available. + // Perform section post-processing. ErrorOr SectionName = Obj->getSectionName(&Section); check_error(SectionName.getError(), "cannot get section name"); - auto SII = SectionMM->NoteSectionInfo.find(*SectionName); if (SII != SectionMM->NoteSectionInfo.end()) { auto &SI = SII->second; assert(SI.Alignment <= Section.sh_addralign && "alignment exceeds value in file"); - outs() << "BOLT: appending contents to section " << *SectionName << '\n'; + // Write section extension. 
Address = SI.AllocAddress; - OS.write(reinterpret_cast(Address), SI.Size); - Size += SI.Size; + if (Address) { + DEBUG(dbgs() << "BOLT: appending contents to section " + << *SectionName << '\n'); + OS.write(reinterpret_cast(Address), SI.Size); + Size += SI.Size; + } + + if (!SI.PendingRelocs.empty()) { + DEBUG(dbgs() << "BOLT-DEBUG: processing relocs for section " + << *SectionName << '\n'); + for (auto &Reloc : SI.PendingRelocs) { + DEBUG(dbgs() << "BOLT-DEBUG: writing value " + << Twine::utohexstr(Reloc.Value) + << " of size " << (unsigned)Reloc.Size + << " at offset " + << Twine::utohexstr(Reloc.Offset) << '\n'); + assert(Reloc.Size == 4 && + "only relocations of size 4 are supported at the moment"); + OS.pwrite(reinterpret_cast(&Reloc.Value), + Reloc.Size, + NextAvailableOffset + Reloc.Offset); + } + } } // Set/modify section info. @@ -1794,7 +1825,15 @@ void RewriteInstance::computeLineTableOffsets() { auto CompileUnit = BC->OffsetToDwarfCU[CUIDLineTablePair.first]; BC->CompileUnitLineTableOffset[CompileUnit] = Offset; - DEBUG(errs() << "BOLT-DEBUG: CU " << CUIDLineTablePair.first - << " has line table at " << Offset << "\n"); + auto LTOI = BC->LineTableOffsetCUMap.find(CUIDLineTablePair.first); + if (LTOI != BC->LineTableOffsetCUMap.end()) { + DEBUG(dbgs() << "BOLT-DEBUG: adding relocation for stmt_list " + << "in .debug_info\n"); + auto &SI = SectionMM->NoteSectionInfo[".debug_info"]; + SI.PendingRelocs.emplace_back( + SectionInfo::Reloc{LTOI->second, 4, 0, Offset + DebugLineSize}); + } + DEBUG(dbgs() << "BOLT-DEBUG: CU " << CUIDLineTablePair.first + << " has line table at " << Offset << "\n"); } } diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index abcb3d8115d0..3d2a2ca5b110 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -45,6 +45,16 @@ struct SectionInfo { uint64_t FileOffset{0}; /// Offset in the output file. uint64_t ShName{0}; /// Name offset in section header string table. 
+ struct Reloc { + uint32_t Offset; + uint8_t Size; + uint8_t Type; // unused atm + uint32_t Value; + }; + + /// Pending relocations for the section. + std::vector PendingRelocs; + SectionInfo(uint64_t Address = 0, uint64_t Size = 0, unsigned Alignment = 0, bool IsCode = false, bool IsReadOnly = false, uint64_t FileAddress = 0, uint64_t FileOffset = 0) @@ -89,9 +99,9 @@ class ExecutableFileMemoryManager : public SectionMemoryManager { /*IsCode=*/false, IsReadOnly); } - void recordNoteSection(const uint8_t *Data, uintptr_t Size, - unsigned Alignment, unsigned SectionID, - StringRef SectionName) override; + uint8_t *recordNoteSection(const uint8_t *Data, uintptr_t Size, + unsigned Alignment, unsigned SectionID, + StringRef SectionName) override; // Tell EE that we guarantee we don't need stubs. bool allowStubAllocation() const override { return false; } @@ -223,6 +233,9 @@ class RewriteInstance { /// rewriting CFI info for these functions. std::vector FailedAddresses; + /// Size of the .debug_line section on input. + uint32_t DebugLineSize{0}; + /// Total hotness score according to profiling data for this binary. uint64_t TotalScore{0}; From 2cb083ef779c503a36dff65397ec1a61e08fab11 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 11 Mar 2016 11:09:34 -0800 Subject: [PATCH 077/904] Check function data in symbol table against data in .eh_frame. Summary: At the moment we rely solely on the symbol table information to discover function boundaries. However, similar information is contained in .eh_frame. Verify that the information from these two sources is consistent, and if it's not, then skip processing the functions with conflicting information. 
(cherry picked from commit 5aeebc84ef83ef79d788831df7b4cd9b7ef45f19) --- bolt/BinaryFunction.h | 13 +++++++------ bolt/Exceptions.h | 14 ++++++++++++-- bolt/RewriteInstance.cpp | 41 +++++++++++++++++++++++++++++++++++++--- bolt/RewriteInstance.h | 6 +++--- 4 files changed, 60 insertions(+), 14 deletions(-) diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 1096a30ba0a3..2069c262923c 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -109,10 +109,6 @@ class BinaryFunction { /// Alignment requirements for the function. uint64_t Alignment{1}; - /// False if the function is too complex to reconstruct its control - /// flow graph and re-assemble. - bool IsSimple{true}; - /// True if this function needs to be emitted in two separate parts, one for /// the hot basic blocks and another for the cold basic blocks. bool IsSplit{false}; @@ -125,6 +121,10 @@ class BinaryFunction { BinaryContext &BC; + /// False if the function is too complex to reconstruct its control + /// flow graph and re-assemble. + bool IsSimple{true}; + /// The address for the code for this function in codegen memory. uint64_t ImageAddress{0}; @@ -326,9 +326,10 @@ class BinaryFunction { BinaryFunction(std::string Name, SymbolRef Symbol, SectionRef Section, - uint64_t Address, uint64_t Size, BinaryContext &BC) : + uint64_t Address, uint64_t Size, BinaryContext &BC, + bool IsSimple = true) : Name(Name), Symbol(Symbol), Section(Section), Address(Address), - Size(Size), BC(BC), CodeSectionName(".text." + Name), + Size(Size), BC(BC), IsSimple(IsSimple), CodeSectionName(".text." 
+ Name), FunctionNumber(++Count) {} diff --git a/bolt/Exceptions.h b/bolt/Exceptions.h index 52f298d632d7..ee7bc8db6019 100644 --- a/bolt/Exceptions.h +++ b/bolt/Exceptions.h @@ -42,8 +42,6 @@ class CFIReaderWriter { } } - using FDEsMap = std::map; - bool fillCFIInfoFor(BinaryFunction &Function) const; // Include a new EHFrame, updating the .eh_frame_hdr @@ -51,6 +49,18 @@ class CFIReaderWriter { uint64_t NewFrameHdrAddress, ArrayRef FailedAddresses); + using FDEsMap = std::map; + using fde_iterator = FDEsMap::const_iterator; + + /// Get all FDEs discovered by this reader. + iterator_range fdes() const { + return iterator_range(FDEs.begin(), FDEs.end()); + } + + const FDEsMap &getFDEs() const { + return FDEs; + } + private: const DWARFFrame &EHFrame; uint64_t FrameHdrAddress; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 70cf45ff00fb..23eef9cee3ca 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -540,9 +540,10 @@ void RewriteInstance::run() { return; } + // Main "loop". discoverStorage(); - readSymbolTable(); readSpecialSections(); + discoverFileObjects(); disassembleFunctions(); runOptimizationPasses(); emitFunctions(); @@ -558,7 +559,7 @@ void RewriteInstance::run() { rewriteFile(); } -void RewriteInstance::readSymbolTable() { +void RewriteInstance::discoverFileObjects() { std::string FileSymbolName; FileSymRefs.clear(); @@ -654,11 +655,45 @@ void RewriteInstance::readSymbolTable() { continue; } + // Checkout for conflicts with function data from FDEs. + bool IsSimple = true; + auto FDEI = CFIRdWrt->getFDEs().lower_bound(Address); + if (FDEI != CFIRdWrt->getFDEs().end()) { + auto &FDE = *FDEI->second; + if (FDEI->first != Address) { + // There's no matching starting address in FDE. Make sure the previous + // FDE does not contain this address. 
+ if (FDEI != CFIRdWrt->getFDEs().begin()) { + --FDEI; + auto &PrevFDE = *FDEI->second; + auto PrevStart = PrevFDE.getInitialLocation(); + auto PrevLength = PrevFDE.getAddressRange(); + if (Address > PrevStart && Address < PrevStart + PrevLength) { + errs() << "BOLT-WARNING: function " << UniqueName + << " is in conflict with FDE [" + << Twine::utohexstr(PrevStart) << ", " + << Twine::utohexstr(PrevStart + PrevLength) + << "). Skipping.\n"; + IsSimple = false; + } + } + } else if (FDE.getAddressRange() != SymbolSize) { + // Function addresses match but sizes differ. + errs() << "BOLT-WARNING: sizes differ for function " << UniqueName + << ". FDE : " << FDE.getAddressRange() + << "; symbol table : " << SymbolSize << ". Skipping.\n"; + + // Create maximum size non-simple function. + IsSimple = false; + SymbolSize = std::max(SymbolSize, FDE.getAddressRange()); + } + } + // Create the function and add to the map. BinaryFunctions.emplace( Address, BinaryFunction(UniqueName, Symbol, *Section, Address, - SymbolSize, *BC) + SymbolSize, *BC, IsSimple) ); } } diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 3d2a2ca5b110..edab55ada203 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -125,9 +125,9 @@ class RewriteInstance { /// Run all the necessary steps to read, optimize and rewrite the binary. void run(); - /// Populate array of binary functions and file symbols from file symbol - /// table. - void readSymbolTable(); + /// Populate array of binary functions and other objects of interest + /// from meta data in the file. + void discoverFileObjects(); /// Read .eh_frame, .eh_frame_hdr and .gcc_except_table sections for exception /// and stack unwinding information. From 46a56a729dc4b36d8c4f1e80adacf43a77ef4bec Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Fri, 11 Mar 2016 11:30:30 -0800 Subject: [PATCH 078/904] Write updated .debug_aranges section after optimizations. 
Summary: Write the .debug_aranges section after optimizations to the output binary. Each function generates at least one range and at most two (one extra for its cold part). The writing is done manually because LLVM's implementation is tied to the output of .debug_info (see EmitGenDwarfInfo and EmitGenDwarfARanges in lib/MC/MCDwarf.cpp), which we don't want to trigger right now. (cherry picked from commit 2b36c8b50f3ecc785fb2cf3fd1c988f476bb9fc8) --- bolt/CMakeLists.txt | 1 + bolt/DebugArangesWriter.cpp | 72 ++++++++++++++++++++++++++++++++++ bolt/DebugArangesWriter.h | 47 ++++++++++++++++++++++ bolt/RewriteInstance.cpp | 78 ++++++++++++++++++++++++++++++++++--- bolt/RewriteInstance.h | 18 +++++++++ 5 files changed, 211 insertions(+), 5 deletions(-) create mode 100644 bolt/DebugArangesWriter.cpp create mode 100644 bolt/DebugArangesWriter.h diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 7b25d1e27984..3d97144787dc 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -17,6 +17,7 @@ add_llvm_tool(llvm-bolt BinaryContext.cpp BinaryFunction.cpp DataReader.cpp + DebugArangesWriter.cpp DebugLineTableRowRef.cpp Exceptions.cpp RewriteInstance.cpp diff --git a/bolt/DebugArangesWriter.cpp b/bolt/DebugArangesWriter.cpp new file mode 100644 index 000000000000..44564cdbad09 --- /dev/null +++ b/bolt/DebugArangesWriter.cpp @@ -0,0 +1,72 @@ +//===--- DebugArangesWriter.h - Writes the .debug_aranges DWARF section ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "DebugArangesWriter.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCObjectWriter.h" + + +namespace llvm { +namespace bolt { + +void DebugArangesWriter::AddRange(uint32_t CompileUnitOffset, + uint64_t Address, + uint64_t Size) { + CUAddressRanges[CompileUnitOffset].push_back(std::make_pair(Address, Size)); +} + +void DebugArangesWriter::Write(MCObjectWriter *Writer) const { + // For reference on the format of the .debug_aranges section, see the DWARF4 + // specification, section 6.1.4 Lookup by Address + // http://www.dwarfstd.org/doc/DWARF4.pdf + for (const auto &CUOffsetAddressRangesPair : CUAddressRanges) { + uint64_t Offset = CUOffsetAddressRangesPair.first; + const auto &AddressRanges = CUOffsetAddressRangesPair.second; + + // Emit header. + + // Size of this set: 8 (size of the header) + 4 (padding after header) + // + 2*sizeof(uint64_t) bytes for each of the ranges, plus an extra + // pair of uint64_t's for the terminating, zero-length range. + // Does not include size field itself. + uint64_t Size = 8 + 4 + 2*sizeof(uint64_t) * (AddressRanges.size() + 1); + + // Header field #1: set size. + Writer->writeLE32(Size); + + // Header field #2: version number, 2 as per the specification. + Writer->writeLE16(2); + + // Header field #3: debug info offset of the correspondent compile unit. + Writer->writeLE32(Offset); + + // Header field #4: address size. + // 8 since we only write ELF64 binaries for now. + Writer->write8(8); + + // Header field #5: segment size of target architecture. + Writer->write8(0); + + // Padding before address table - 4 bytes in the 64-bit-pointer case. + Writer->writeLE32(0); + + // Emit address ranges. + for (const auto &Range : AddressRanges) { + Writer->writeLE64(Range.first); + Writer->writeLE64(Range.second); + } + + // Emit terminating address range (offset 0, length 0). 
+ Writer->writeLE64(0); + Writer->writeLE64(0); + } +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/DebugArangesWriter.h b/bolt/DebugArangesWriter.h new file mode 100644 index 000000000000..a975bf090909 --- /dev/null +++ b/bolt/DebugArangesWriter.h @@ -0,0 +1,47 @@ +//===--- DebugArangesWriter.h - Writes the .debug_aranges DWARF section ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Class that serializes a .debug_aranges section of a binary. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_DEBUGARANGESWRITER_H +#define LLVM_TOOLS_LLVM_BOLT_DEBUGARANGESWRITER_H + +#include +#include +#include + +namespace llvm { + +class MCObjectWriter; + +namespace bolt { + +class DebugArangesWriter { +public: + DebugArangesWriter() = default; + + /// Adds a range to the .debug_arange section. + void AddRange(uint32_t CompileUnitOffset, uint64_t Address, uint64_t Size); + + /// Writes .debug_aranges with the added ranges to the MCObjectWriter. + void Write(MCObjectWriter *Writer) const; + +private: + // Map from compile unit offset to the list of address intervals that belong + // to that compile unit. Each interval is a pair + // (first address, interval size). 
+ std::map>> CUAddressRanges; +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 23eef9cee3ca..90bc9e69be02 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1277,6 +1277,11 @@ void RewriteInstance::emitFunctions() { Function.getAddress()); Function.setImageAddress(SMII->second.AllocAddress); Function.setImageSize(SMII->second.Size); + + if (opts::UpdateDebugSections) { + addDebugArangesEntry(Function.getAddress(), Function.getAddress(), + Function.getSize()); + } } else { errs() << "BOLT: cannot remap function " << Function.getName() << "\n"; FailedAddresses.emplace_back(Function.getAddress()); @@ -1301,12 +1306,23 @@ void RewriteInstance::emitFunctions() { Function.cold().setImageAddress(SMII->second.AllocAddress); Function.cold().setImageSize(SMII->second.Size); Function.cold().setFileOffset(getFileOffsetFor(NextAvailableAddress)); + + if (opts::UpdateDebugSections) { + addDebugArangesEntry(Function.getAddress(), NextAvailableAddress, + Function.cold().getImageSize()); + } + NextAvailableAddress += SMII->second.Size; } else { errs() << "BOLT: cannot remap function " << Function.getName() << "\n"; FailedAddresses.emplace_back(Function.getAddress()); } } + + // After collecting rewritten function addresses, generate the contents of + // .debug_aranges. + generateDebugAranges(); + // Add the new text section aggregating all existing code sections. 
auto NewTextSectionSize = NextAvailableAddress - NewTextSectionStartAddress; if (NewTextSectionSize) { @@ -1354,6 +1370,50 @@ void RewriteInstance::emitFunctions() { TempOut->keep(); } +void RewriteInstance::addDebugArangesEntry(uint64_t OriginalFunctionAddress, + uint64_t RangeBegin, + uint64_t RangeSize) { + if (auto DebugAranges = BC->DwCtx->getDebugAranges()) { + uint32_t CUOffset = DebugAranges->findAddress(OriginalFunctionAddress); + if (CUOffset != -1U) { + ArangesWriter.AddRange(CUOffset, RangeBegin, RangeSize); + } + } +} + +void RewriteInstance::generateDebugAranges() { + // Get the address of all non-simple functions and add them intact to aranges. + // Simple functions are rewritten and have their .debug_aranges entries added + // during rewriting. + for (const auto &BFI : BinaryFunctions) { + const auto &Function = BFI.second; + if (!Function.isSimple()) { + addDebugArangesEntry(Function.getAddress(), Function.getAddress(), + Function.getSize()); + } + } + + SmallVector ArangesBuffer; + raw_svector_ostream OS(ArangesBuffer); + + auto MAB = BC->TheTarget->createMCAsmBackend(*BC->MRI, BC->TripleName, ""); + auto Writer = MAB->createObjectWriter(OS); + + ArangesWriter.Write(Writer); + const auto &DebugArangesContents = OS.str(); + + // Free'd by SectionMM. + uint8_t *SectionData = new uint8_t[DebugArangesContents.size()]; + memcpy(SectionData, DebugArangesContents.data(), DebugArangesContents.size()); + + SectionMM->NoteSectionInfo[".debug_aranges"] = SectionInfo( + reinterpret_cast(SectionData), + DebugArangesContents.size(), + /*Alignment=*/0, + /*IsCode=*/false, + /*IsReadOnly=*/true); +} + void RewriteInstance::patchELFPHDRTable() { auto ELF64LEFile = dyn_cast(InputFile); if (!ELF64LEFile) { @@ -1470,16 +1530,23 @@ void RewriteInstance::rewriteNoteSections() { "section size does not match section alignment"); } - // Copy over section contents. 
- auto Size = Section.sh_size; - OS << InputFile->getData().substr(Section.sh_offset, Size); + ErrorOr SectionName = Obj->getSectionName(&Section); + check_error(SectionName.getError(), "cannot get section name"); + + // Copy over section contents unless it's .debug_aranges, which shall be + // overwritten if -update-debug-sections is passed. + uint64_t Size = 0; + + if (*SectionName != ".debug_aranges" || !opts::UpdateDebugSections) { + Size = Section.sh_size; + OS << InputFile->getData().substr(Section.sh_offset, Size); + } // Address of extension to the section. uint64_t Address{0}; // Perform section post-processing. - ErrorOr SectionName = Obj->getSectionName(&Section); - check_error(SectionName.getError(), "cannot get section name"); + auto SII = SectionMM->NoteSectionInfo.find(*SectionName); if (SII != SectionMM->NoteSectionInfo.end()) { auto &SI = SII->second; @@ -1678,6 +1745,7 @@ void RewriteInstance::rewriteFile() { *BC->STI, /* RelaxAll */ false, /* DWARFMustBeAtTheEnd */ false)); + auto &Writer = static_cast(Streamer.get()) ->getAssembler() .getWriter(); diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index edab55ada203..7a452994dda7 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -14,6 +14,7 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_REWRITE_INSTANCE_H #define LLVM_TOOLS_LLVM_BOLT_REWRITE_INSTANCE_H +#include "DebugArangesWriter.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/Object/ELFObjectFile.h" @@ -174,6 +175,19 @@ class RewriteInstance { /// stores them into BinaryContext::CompileUnitLineTableOffset. void computeLineTableOffsets(); + /// Adds an entry to be saved in the .debug_aranges section. + /// \p OriginalFunctionAddress function's address in the original binary, + /// used for compile unit lookup. + /// \p RangeBegin first address of the address range being added. + /// \p RangeSie size in bytes of the address range. 
+ void addDebugArangesEntry(uint64_t OriginalFunctionAddress, + uint64_t RangeBegin, + uint64_t RangeSize); + + /// Generate the contents of the output .debug_aranges section based on the + /// added entries. + void generateDebugAranges(); + /// Return file offset corresponding to a given virtual address. uint64_t getFileOffsetFor(uint64_t Address) { assert(Address >= NewTextSegmentAddress && @@ -220,6 +234,10 @@ class RewriteInstance { /// Store all functions seen in the binary, sorted by address. std::map BinaryFunctions; + /// Stores and serializes information that will be put into + /// the .debug_aranges DWARF section. + DebugArangesWriter ArangesWriter; + /// Exception handling and stack unwinding information in this binary. ArrayRef LSDAData; uint64_t LSDAAddress{0}; From 61c819b76cab7f41a4436afc641f8354d41b76d0 Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Mon, 14 Mar 2016 11:40:52 -0700 Subject: [PATCH 079/904] Fix default line number information for instructions. Summary: The line number information generated from a null pointer was actually valid, which caused new instructions without the line number information set to have a valid and wrong line number reference. This diff fixes this by making the null pointer be assigned to an invalid line number row. 
(cherry picked from commit d29160188723ffc80c85716ef5e7818cc0887224) --- bolt/BinaryFunction.cpp | 4 ++-- bolt/DebugLineTableRowRef.cpp | 2 +- bolt/DebugLineTableRowRef.h | 3 ++- bolt/RewriteInstance.cpp | 9 +++++++-- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index aa1febbab114..878ace291216 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -210,7 +210,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, auto RowRef = DebugLineTableRowRef::fromSMLoc(Instruction.getLoc()); if (RowRef != DebugLineTableRowRef::NULL_ROW) { - const auto &Row = LineTable->Rows[RowRef.RowIndex]; + const auto &Row = LineTable->Rows[RowRef.RowIndex - 1]; OS << " # debug line " << LineTable->Prologue.FileNames[Row.File - 1].Name << ":" << Row.Line; @@ -591,7 +591,7 @@ BinaryFunction::findDebugLineInformationForInstructionAt( reinterpret_cast(&Ptr); InstructionLocation->DwCompileUnitIndex = Unit->getOffset(); - InstructionLocation->RowIndex = RowIndex; + InstructionLocation->RowIndex = RowIndex + 1; return SMLoc::getFromPointer(Ptr); } diff --git a/bolt/DebugLineTableRowRef.cpp b/bolt/DebugLineTableRowRef.cpp index 83ed5158e6cc..d8db983516c0 100644 --- a/bolt/DebugLineTableRowRef.cpp +++ b/bolt/DebugLineTableRowRef.cpp @@ -15,7 +15,7 @@ namespace llvm { namespace bolt { -const DebugLineTableRowRef DebugLineTableRowRef::NULL_ROW{-1U, -1U}; +const DebugLineTableRowRef DebugLineTableRowRef::NULL_ROW{0, 0}; } // namespace bolt } // namespace llvm diff --git a/bolt/DebugLineTableRowRef.h b/bolt/DebugLineTableRowRef.h index 66c1be5c43ff..5af011798421 100644 --- a/bolt/DebugLineTableRowRef.h +++ b/bolt/DebugLineTableRowRef.h @@ -10,7 +10,8 @@ // Class that references a row in a DWARFDebugLine::LineTable by the DWARF // Context index of the DWARF Compile Unit that owns the Line Table and the row // index. 
This is tied to our IR during disassembly so that we can later update -// .debug_line information. +// .debug_line information. The RowIndex has a base of 1, which means a RowIndex +// of 1 maps to the first row of the line table and a RowIndex of 0 is invalid. // //===----------------------------------------------------------------------===// diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 90bc9e69be02..b19d8ec699dc 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1106,12 +1106,17 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, if (!BC.MIA->isCFI(Instr)) { if (opts::UpdateDebugSections) { auto RowReference = DebugLineTableRowRef::fromSMLoc(Instr.getLoc()); - if (auto CompileUnit = BC.OffsetToDwarfCU[RowReference.DwCompileUnitIndex]) { + if (RowReference != DebugLineTableRowRef::NULL_ROW) { + auto CompileUnit = + BC.OffsetToDwarfCU[RowReference.DwCompileUnitIndex]; + assert(CompileUnit && + "Invalid CU offset set in instruction debug info."); auto OriginalLineTable = BC.DwCtx->getLineTableForUnit( CompileUnit); - const auto &OriginalRow = OriginalLineTable->Rows[RowReference.RowIndex]; + const auto &OriginalRow = + OriginalLineTable->Rows[RowReference.RowIndex - 1]; BC.Ctx->setCurrentDwarfLoc( OriginalRow.File, From c56619db219625410b1c4d0b379e8b57aba10264 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 14 Mar 2016 18:48:05 -0700 Subject: [PATCH 080/904] Refactor existing debugging code. Summary: Almost NFC. Isolate code for updating debug info. 
(cherry picked from commit e78749b1559e5209f5aa29cdad88a04722d1d2ee) --- bolt/BinaryContext.cpp | 25 +++++++++- bolt/BinaryContext.h | 16 ++----- bolt/BinaryFunction.h | 5 ++ bolt/RewriteInstance.cpp | 99 +++++++++++++++++----------------------- bolt/RewriteInstance.h | 18 ++++++-- 5 files changed, 87 insertions(+), 76 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 46ef7b046109..77f1afe49d41 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -43,11 +43,32 @@ MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, return Symbol; } - -void BinaryContext::buildOffsetToDWARFCompileUnitMap() { +void BinaryContext::preprocessDebugInfo() { + // Iterate over all DWARF compilation units and map their offset in the + // binary to themselves in OffsetDwarfCUMap for (const auto &CU : DwCtx->compile_units()) { OffsetToDwarfCU[CU->getOffset()] = CU.get(); } + + // Populate MCContext with DWARF files. + for (const auto &CU : DwCtx->compile_units()) { + const auto CUID = CU->getOffset(); + auto LineTable = DwCtx->getLineTableForUnit(CU.get()); + const auto &FileNames = LineTable->Prologue.FileNames; + for (size_t I = 0, Size = FileNames.size(); I != Size; ++I) { + // Dir indexes start at 1, as DWARF file numbers, and a dir index 0 + // means empty dir. + const char *Dir = FileNames[I].DirIdx ? 
+ LineTable->Prologue.IncludeDirectories[FileNames[I].DirIdx - 1] : + ""; + Ctx->getDwarfFile(Dir, FileNames[I].Name, I + 1, CUID); + } + + auto LineTableOffset = + DwCtx->getAttrFieldOffsetForUnit(CU.get(), dwarf::DW_AT_stmt_list); + if (LineTableOffset) + LineTableOffsetCUMap[CUID] = LineTableOffset; + } } } // namespace bolt diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 27e66ca625b3..612cf132e243 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -114,8 +114,7 @@ class BinaryContext { std::unique_ptr MIA, std::unique_ptr MRI, std::unique_ptr DisAsm, - const DataReader &DR, - bool LoadDebugContext) : + const DataReader &DR) : Ctx(std::move(Ctx)), DwCtx(std::move(DwCtx)), TheTriple(std::move(TheTriple)), @@ -130,11 +129,7 @@ class BinaryContext { MIA(std::move(MIA)), MRI(std::move(MRI)), DisAsm(std::move(DisAsm)), - DR(DR) { - if (LoadDebugContext) { - buildOffsetToDWARFCompileUnitMap(); - } - } + DR(DR) {} ~BinaryContext() {} @@ -144,10 +139,9 @@ class BinaryContext { /// return the first one. MCSymbol *getOrCreateGlobalSymbol(uint64_t Address, Twine Prefix); -private: - // Iterates over all DWARF compilation units and maps their offset in the - // binary to themselves in OffsetDwarfCUMap - void buildOffsetToDWARFCompileUnitMap(); + /// Populate some internal data structures with debug info. + void preprocessDebugInfo(); + }; } // namespace bolt diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 2069c262923c..95cff6662aa5 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -726,16 +726,19 @@ class BinaryFunction { /// Info for fragmented functions. 
class FragmentInfo { private: + uint64_t Address{0}; uint64_t ImageAddress{0}; uint64_t ImageSize{0}; uint64_t FileOffset{0}; const MCSymbol *OutputSymbol{nullptr}; public: + uint64_t getAddress() const { return Address; } uint64_t getImageAddress() const { return ImageAddress; } uint64_t getImageSize() const { return ImageSize; } uint64_t getFileOffset() const { return FileOffset; } const MCSymbol *getOutputSymbol() const { return OutputSymbol; } + void setAddress(uint64_t VAddress) { Address = VAddress; } void setImageAddress(uint64_t Address) { ImageAddress = Address; } void setImageSize(uint64_t Size) { ImageSize = Size; } void setFileOffset(uint64_t Offset) { FileOffset = Offset; } @@ -746,6 +749,8 @@ class BinaryFunction { FragmentInfo ColdFragment; FragmentInfo &cold() { return ColdFragment; } + + const FragmentInfo &cold() const { return ColdFragment; } }; inline raw_ostream &operator<<(raw_ostream &OS, diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index b19d8ec699dc..ab84d52910b2 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -402,34 +402,7 @@ static std::unique_ptr CreateBinaryContext( std::move(MIA), std::move(MRI), std::move(DisAsm), - DR, - opts::UpdateDebugSections); - - if (opts::UpdateDebugSections) { - // Populate MCContext with DWARF files. - for (const auto &CU : BC->DwCtx->compile_units()) { - const auto CUID = CU->getOffset(); - auto LineTable = BC->DwCtx->getLineTableForUnit(CU.get()); - auto LineTableOffset = - BC->DwCtx->getAttrFieldOffsetForUnit(CU.get(), dwarf::DW_AT_stmt_list); - const auto &FileNames = LineTable->Prologue.FileNames; - for (size_t I = 0, Size = FileNames.size(); I != Size; ++I) { - // Dir indexes start at 1, as DWARF file numbers, and a dir index 0 - // means empty dir. - const char *Dir = FileNames[I].DirIdx ? 
- LineTable->Prologue.IncludeDirectories[FileNames[I].DirIdx - 1] : - ""; - BC->Ctx->getDwarfFile( - Dir, - FileNames[I].Name, - I + 1, - CUID); - } - if (LineTableOffset) { - BC->LineTableOffsetCUMap[CUID] = LineTableOffset; - } - } - } + DR); return BC; } @@ -543,10 +516,12 @@ void RewriteInstance::run() { // Main "loop". discoverStorage(); readSpecialSections(); + readDebugInfo(); discoverFileObjects(); disassembleFunctions(); runOptimizationPasses(); emitFunctions(); + updateDebugInfo(); // Copy allocatable part of the input. std::error_code EC; @@ -738,6 +713,13 @@ void RewriteInstance::readSpecialSections() { } } +void RewriteInstance::readDebugInfo() { + if (!opts::UpdateDebugSections) + return; + + BC->preprocessDebugInfo(); +} + void RewriteInstance::disassembleFunctions() { // Disassemble every function and build it's control flow graph. TotalScore = 0; @@ -1282,11 +1264,6 @@ void RewriteInstance::emitFunctions() { Function.getAddress()); Function.setImageAddress(SMII->second.AllocAddress); Function.setImageSize(SMII->second.Size); - - if (opts::UpdateDebugSections) { - addDebugArangesEntry(Function.getAddress(), Function.getAddress(), - Function.getSize()); - } } else { errs() << "BOLT: cannot remap function " << Function.getName() << "\n"; FailedAddresses.emplace_back(Function.getAddress()); @@ -1308,15 +1285,11 @@ void RewriteInstance::emitFunctions() { OLT.mapSectionAddress(ObjectsHandle, reinterpret_cast(SMII->second.AllocAddress), NextAvailableAddress); + Function.cold().setAddress(NextAvailableAddress); Function.cold().setImageAddress(SMII->second.AllocAddress); Function.cold().setImageSize(SMII->second.Size); Function.cold().setFileOffset(getFileOffsetFor(NextAvailableAddress)); - if (opts::UpdateDebugSections) { - addDebugArangesEntry(Function.getAddress(), NextAvailableAddress, - Function.cold().getImageSize()); - } - NextAvailableAddress += SMII->second.Size; } else { errs() << "BOLT: cannot remap function " << Function.getName() << "\n"; @@ 
-1324,10 +1297,6 @@ void RewriteInstance::emitFunctions() { } } - // After collecting rewritten function addresses, generate the contents of - // .debug_aranges. - generateDebugAranges(); - // Add the new text section aggregating all existing code sections. auto NewTextSectionSize = NextAvailableAddress - NewTextSectionStartAddress; if (NewTextSectionSize) { @@ -1375,28 +1344,33 @@ void RewriteInstance::emitFunctions() { TempOut->keep(); } -void RewriteInstance::addDebugArangesEntry(uint64_t OriginalFunctionAddress, - uint64_t RangeBegin, - uint64_t RangeSize) { - if (auto DebugAranges = BC->DwCtx->getDebugAranges()) { - uint32_t CUOffset = DebugAranges->findAddress(OriginalFunctionAddress); - if (CUOffset != -1U) { - ArangesWriter.AddRange(CUOffset, RangeBegin, RangeSize); +void RewriteInstance::updateFunctionRanges() { + auto addDebugArangesEntry = [&](uint64_t OriginalFunctionAddress, + uint64_t RangeBegin, + uint64_t RangeSize) { + if (auto DebugAranges = BC->DwCtx->getDebugAranges()) { + uint32_t CUOffset = DebugAranges->findAddress(OriginalFunctionAddress); + if (CUOffset != -1U) + ArangesWriter.AddRange(CUOffset, RangeBegin, RangeSize); } - } -} + }; -void RewriteInstance::generateDebugAranges() { - // Get the address of all non-simple functions and add them intact to aranges. - // Simple functions are rewritten and have their .debug_aranges entries added - // during rewriting. for (const auto &BFI : BinaryFunctions) { const auto &Function = BFI.second; - if (!Function.isSimple()) { - addDebugArangesEntry(Function.getAddress(), Function.getAddress(), - Function.getSize()); + // Use either new (image) or original size for the function range. + addDebugArangesEntry(Function.getAddress(), + Function.getAddress(), + Function.isSimple() ? 
Function.getImageSize() + : Function.getSize()); + if (Function.isSimple() && Function.cold().getImageSize()) { + addDebugArangesEntry(Function.getAddress(), + Function.cold().getAddress(), + Function.cold().getImageSize()); } } +} + +void RewriteInstance::generateDebugAranges() { SmallVector ArangesBuffer; raw_svector_ostream OS(ArangesBuffer); @@ -1945,3 +1919,12 @@ void RewriteInstance::computeLineTableOffsets() { << " has line table at " << Offset << "\n"); } } + +void RewriteInstance::updateDebugInfo() { + if (!opts::UpdateDebugSections) + return; + + updateFunctionRanges(); + + generateDebugAranges(); +} diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 7a452994dda7..32bf3a377c4b 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -134,6 +134,9 @@ class RewriteInstance { /// and stack unwinding information. void readSpecialSections(); + /// Read information from debug sections. + void readDebugInfo(); + /// Disassemble each function in the binary and associate it with a /// BinaryFunction object, preparing all information necessary for binary /// optimization. @@ -147,6 +150,9 @@ class RewriteInstance { /// performing final relaxation. void emitFunctions(); + /// Update debug information in the file for re-written code. + void updateDebugInfo(); + /// Rewrite back all functions (hopefully optimized) that fit in the original /// memory footprint for that function. If the function is now larger and does /// not fit in the binary, reject it and preserve the original version of the @@ -175,14 +181,17 @@ class RewriteInstance { /// stores them into BinaryContext::CompileUnitLineTableOffset. void computeLineTableOffsets(); - /// Adds an entry to be saved in the .debug_aranges section. + /// Adds an entry to be saved in the .debug_aranges/.debug_ranges section. /// \p OriginalFunctionAddress function's address in the original binary, /// used for compile unit lookup. /// \p RangeBegin first address of the address range being added. 
/// \p RangeSie size in bytes of the address range. - void addDebugArangesEntry(uint64_t OriginalFunctionAddress, - uint64_t RangeBegin, - uint64_t RangeSize); + void addDebugRangesEntry(uint64_t OriginalFunctionAddress, + uint64_t RangeBegin, + uint64_t RangeSize); + + /// Update internal function ranges after functions have been written. + void updateFunctionRanges(); /// Generate the contents of the output .debug_aranges section based on the /// added entries. @@ -195,7 +204,6 @@ class RewriteInstance { return Address - NewTextSegmentAddress + NewTextSegmentOffset; } - private: /// An instance of the input binary we are processing, externally owned. llvm::object::ELFObjectFileBase *InputFile; From e4db8f74ec58d6ef2bd3d667ea0a112d5053df38 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 14 Mar 2016 19:04:23 -0700 Subject: [PATCH 081/904] Update DW_AT_ranges for CU when it exists. Summary: If CU has DW_AT_ranges update the value. Note that it does not create DW_AT_ranges attribute. 
(cherry picked from commit 8598e8e33ffe64d1668f62a91f63e0086c7e738d) --- bolt/DebugArangesWriter.cpp | 23 +++++++++++- bolt/DebugArangesWriter.h | 15 +++++++- bolt/RewriteInstance.cpp | 74 +++++++++++++++++++++++++++---------- bolt/RewriteInstance.h | 8 ++-- 4 files changed, 95 insertions(+), 25 deletions(-) diff --git a/bolt/DebugArangesWriter.cpp b/bolt/DebugArangesWriter.cpp index 44564cdbad09..1c19c40df23e 100644 --- a/bolt/DebugArangesWriter.cpp +++ b/bolt/DebugArangesWriter.cpp @@ -21,7 +21,28 @@ void DebugArangesWriter::AddRange(uint32_t CompileUnitOffset, CUAddressRanges[CompileUnitOffset].push_back(std::make_pair(Address, Size)); } -void DebugArangesWriter::Write(MCObjectWriter *Writer) const { +void DebugArangesWriter::WriteRangesSection(MCObjectWriter *Writer) { + uint32_t SectionOffset = 0; + for (const auto &CUOffsetAddressRangesPair : CUAddressRanges) { + uint64_t CUOffset = CUOffsetAddressRangesPair.first; + RangesSectionOffsetCUMap[CUOffset] = SectionOffset; + const auto &AddressRanges = CUOffsetAddressRangesPair.second; + + // Write all entries. + for (auto &Range : AddressRanges) { + Writer->writeLE64(Range.first); + Writer->writeLE64(Range.first + Range.second); + } + + // Finish with 0 entry. + Writer->writeLE64(0); + Writer->writeLE64(0); + + SectionOffset += AddressRanges.size() * 16 + 16; + } +} + +void DebugArangesWriter::WriteArangesSection(MCObjectWriter *Writer) const { // For reference on the format of the .debug_aranges section, see the DWARF4 // specification, section 6.1.4 Lookup by Address // http://www.dwarfstd.org/doc/DWARF4.pdf diff --git a/bolt/DebugArangesWriter.h b/bolt/DebugArangesWriter.h index a975bf090909..228722845f12 100644 --- a/bolt/DebugArangesWriter.h +++ b/bolt/DebugArangesWriter.h @@ -31,14 +31,27 @@ class DebugArangesWriter { /// Adds a range to the .debug_arange section. 
void AddRange(uint32_t CompileUnitOffset, uint64_t Address, uint64_t Size); + using RangesCUMapType = std::map; + /// Writes .debug_aranges with the added ranges to the MCObjectWriter. - void Write(MCObjectWriter *Writer) const; + void WriteArangesSection(MCObjectWriter *Writer) const; + + /// Writes .debug_ranges with the added ranges to the MCObjectWriter. + void WriteRangesSection(MCObjectWriter *Writer); + + /// Return mapping of CUs to offsets in .debug_ranges. + const RangesCUMapType &getRangesOffsetCUMap() const { + return RangesSectionOffsetCUMap; + } private: // Map from compile unit offset to the list of address intervals that belong // to that compile unit. Each interval is a pair // (first address, interval size). std::map>> CUAddressRanges; + + /// When writing data to .debug_ranges remember offset per CU. + RangesCUMapType RangesSectionOffsetCUMap; }; } // namespace bolt diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index ab84d52910b2..a9e0444149f4 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -695,6 +695,8 @@ void RewriteInstance::readSpecialSections() { FrameHdrAlign = Section.getAlignment(); } else if (SectionName == ".debug_line") { DebugLineSize = Section.getSize(); + } else if (SectionName == ".debug_ranges") { + DebugRangesSize = Section.getSize(); } } @@ -1370,27 +1372,36 @@ void RewriteInstance::updateFunctionRanges() { } } -void RewriteInstance::generateDebugAranges() { +void RewriteInstance::generateDebugRanges() { + using RangeType = enum { RANGES, ARANGES }; + for (int IntRT = RANGES; IntRT <= ARANGES; ++IntRT) { + RangeType RT = static_cast(IntRT); + const char *SectionName = (RT == RANGES) ? 
".debug_ranges" + : ".debug_aranges"; + SmallVector RangesBuffer; + raw_svector_ostream OS(RangesBuffer); - SmallVector ArangesBuffer; - raw_svector_ostream OS(ArangesBuffer); + auto MAB = BC->TheTarget->createMCAsmBackend(*BC->MRI, BC->TripleName, ""); + auto Writer = MAB->createObjectWriter(OS); - auto MAB = BC->TheTarget->createMCAsmBackend(*BC->MRI, BC->TripleName, ""); - auto Writer = MAB->createObjectWriter(OS); - - ArangesWriter.Write(Writer); - const auto &DebugArangesContents = OS.str(); - - // Free'd by SectionMM. - uint8_t *SectionData = new uint8_t[DebugArangesContents.size()]; - memcpy(SectionData, DebugArangesContents.data(), DebugArangesContents.size()); - - SectionMM->NoteSectionInfo[".debug_aranges"] = SectionInfo( - reinterpret_cast(SectionData), - DebugArangesContents.size(), - /*Alignment=*/0, - /*IsCode=*/false, - /*IsReadOnly=*/true); + if (RT == RANGES) { + ArangesWriter.WriteRangesSection(Writer); + } else { + ArangesWriter.WriteArangesSection(Writer); + } + const auto &DebugRangesContents = OS.str(); + + // Free'd by SectionMM. 
+ uint8_t *SectionData = new uint8_t[DebugRangesContents.size()]; + memcpy(SectionData, DebugRangesContents.data(), DebugRangesContents.size()); + + SectionMM->NoteSectionInfo[SectionName] = SectionInfo( + reinterpret_cast(SectionData), + DebugRangesContents.size(), + /*Alignment=*/0, + /*IsCode=*/false, + /*IsReadOnly=*/true); + } } void RewriteInstance::patchELFPHDRTable() { @@ -1926,5 +1937,28 @@ void RewriteInstance::updateDebugInfo() { updateFunctionRanges(); - generateDebugAranges(); + generateDebugRanges(); + + auto &DebugInfoSI = SectionMM->NoteSectionInfo[".debug_info"]; + for (const auto &CU : BC->DwCtx->compile_units()) { + const auto CUID = CU->getOffset(); + + // Update DW_AT_ranges + auto RangesFieldOffset = + BC->DwCtx->getAttrFieldOffsetForUnit(CU.get(), dwarf::DW_AT_ranges); + if (RangesFieldOffset) { + DEBUG(dbgs() << "BOLT-DEBUG: adding relocation for DW_AT_ranges " + << "in .debug_info\n"); + const auto RSOI = ArangesWriter.getRangesOffsetCUMap().find(CUID); + if (RSOI != ArangesWriter.getRangesOffsetCUMap().end()) { + auto Offset = RSOI->second; + DebugInfoSI.PendingRelocs.emplace_back( + SectionInfo::Reloc{RangesFieldOffset, 4, 0, + Offset + DebugRangesSize}); + } else { + DEBUG(dbgs() << "BOLT-DEBUG: no .debug_ranges entry found for CU " + << CUID << '\n'); + } + } + } } diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 32bf3a377c4b..e6d76cb2ed83 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -193,9 +193,8 @@ class RewriteInstance { /// Update internal function ranges after functions have been written. void updateFunctionRanges(); - /// Generate the contents of the output .debug_aranges section based on the - /// added entries. - void generateDebugAranges(); + /// Generate new contents for .debug_ranges and .debug_aranges section. + void generateDebugRanges(); /// Return file offset corresponding to a given virtual address. 
uint64_t getFileOffsetFor(uint64_t Address) { @@ -262,6 +261,9 @@ class RewriteInstance { /// Size of the .debug_line section on input. uint32_t DebugLineSize{0}; + /// Size of the .debug_ranges section on input. + uint32_t DebugRangesSize{0}; + /// Total hotness score according to profiling data for this binary. uint64_t TotalScore{0}; From 8e609a798d01a5a7a70b2b2a1ef17d457ac90b2e Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Tue, 15 Mar 2016 16:22:04 -0700 Subject: [PATCH 082/904] Write only minimal .debug_line information. Summary: We used to output .debug_line information for every instruction, but because of the way gdb (and probably lldb as of llvm::DWARFDebugLine::LineTable::findAddress) queries the line table it's not necessary to output information for two instructions if they follow each other and map to the same source line. By not repeating this information we generate a bit less .debug_line data. (cherry picked from commit e0b122e9f6ceec16829628a1db4bb67d135bfd47) --- bolt/RewriteInstance.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index a9e0444149f4..366557b8f39e 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1077,6 +1077,11 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, if (opts::AlignBlocks && BB->getAlignment() > 1) Streamer.EmitCodeAlignment(BB->getAlignment()); Streamer.EmitLabel(BB->getLabel()); + // Remember last .debug_line entry emitted so that we don't repeat them in + // subsequent instructions, as gdb can figure it out by looking at the + // previous instruction with available line number info. + SMLoc LastLocSeen; + for (const auto &Instr : *BB) { // Handle pseudo instructions. 
if (BC.MIA->isEHLabel(Instr)) { @@ -1090,7 +1095,8 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, if (!BC.MIA->isCFI(Instr)) { if (opts::UpdateDebugSections) { auto RowReference = DebugLineTableRowRef::fromSMLoc(Instr.getLoc()); - if (RowReference != DebugLineTableRowRef::NULL_ROW) { + if (RowReference != DebugLineTableRowRef::NULL_ROW && + Instr.getLoc().getPointer() != LastLocSeen.getPointer()) { auto CompileUnit = BC.OffsetToDwarfCU[RowReference.DwCompileUnitIndex]; assert(CompileUnit && @@ -1113,6 +1119,7 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, OriginalRow.Isa, OriginalRow.Discriminator); BC.Ctx->setDwarfCompileUnitID(CompileUnit->getOffset()); + LastLocSeen = Instr.getLoc(); } } From 0b2e2ad7bcf3323164cf27fbf14095fc62985d27 Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Wed, 16 Mar 2016 18:08:29 -0700 Subject: [PATCH 083/904] Update subroutine address ranges in binary. Summary: [WIP] Update DWARF info for function address ranges. This diff currently does not work for unknown reasons, but I'm describing here what's the current state. According to both llvm-dwarf and readelf our output seems correct, but GDB does not interpret it as expected. All details go below in hope I missed something. I couldn't actually track the whole change that introduced support for what we need in gdb yet, but I think I can get to it (2007-12-04: Support lexical blocks and function bodies that occupy non-contiguous address ranges). I have reasons to believe gdb at least at some point supported it (non-contiguous address ranges). The set of introduced changes was basically this: - After disassembly, iterate over the DIEs in .debug_info and find the ones that correspond to each BinaryFunction.
- Refactor DebugArangesWriter to also write addresses of functions to .debug_ranges and track the offsets of function address ranges there - Add some infrastructure to facilitate patching the binary in simple ways (BinaryPatcher.h) - In RewriteInstance, after writing .debug_ranges already with function address ranges, for each function do: -- Find the abbreviation corresponding to the function -- Patch .debug_abbrev to replace DW_AT_low_pc with DW_AT_ranges and DW_AT_high_pc with DW_AT_producer (I'll explain this hack below). Also patch the corresponding forms to DW_FORM_sec_offset and DW_FORM_string (null-terminated in-place string). -- Patch debug_info with the .debug_ranges offset in place of the first 4 bytes of DW_AT_low_pc (DW_AT_ranges only occupies 4 bytes whereas low_pc occupies 8), and write an arbitrary string in-place in the other 12 bytes that were the 4 MSB of low_pc and the 8 bytes of high_pc before the patch. This depends on low_pc and high_pc being put consecutively by the compiler, but it serves to validate the idea. I tried another way of doing it that does not rely on this but it didn't work either and I believe the reason for either not working is the same (and still unknown, but unrelated to them. I might be wrong though, and if I find yet another way of doing it I may try it). The other way was to use a form of DW_FORM_data8 for the section offset. This is disallowed by the specification, but I doubt gdb validates this, as it's just easier to store it as 64-bit anyway as this is even necessary to support 64-bit DWARF (which is not what gcc generates by default apparently). I still need to make changes to the diff to make it production-ready, but first I want to figure out why it doesn't work as expected. By looking at the output of llvm-dwarfdump or readelf, all of .debug_ranges, .debug_abbrev and .debug_info seem to have been correctly updated. However, gdb seems to have serious problems with what we write. 
(In fact, readelf --debug-dump=Ranges shows some funny warning messages of the form ("Warning: There is a hole [0x100 - 0x120] in .debug_ranges"), but I played around with this and it seems it's just because no compile unit was using these ranges. Changing .debug_info apparently changes these warnings, so they seem to be unrelated to the section itself. Also looking at the hex dump of the section doesn't help, as everything seems fine. llvm-dwarfdump doesn't say anything. So I think .debug_ranges is fine.) The result is that gdb not only doesn't show the function name as we wanted, but it also stops showing line number information. Apparently it's not reading/interpreting the address ranges at all, and so the functions now have no associated address ranges, only the symbol value which allows one to put a breakpoint in the function, but not to show source code. As this left me without more ideas of what to try to feed gdb with, I believe the most promising next trial is to try to debug gdb itself, unless someone spots anything I missed. I found where the interesting part of the code lies for this case (gdb/dwarf2read.c and some other related files, but mainly that one). It seems in some parts gdb uses DW_AT_ranges for only getting its lowest and highest addresses and setting that as low_pc and high_pc (see dwarf2_get_pc_bounds in gdb's code and where it's called). I really hope this is not actually the case for function address ranges. I'll investigate this further. Otherwise I don't think any changes we make will make it work as initially intended, as we'll simply need gdb to support it and in that case it doesn't. 
(cherry picked from commit 1c76c2187c1beecd53acc893a117c58c5f1e9064) --- bolt/BinaryContext.cpp | 26 ++++++++ bolt/BinaryContext.h | 5 ++ bolt/BinaryFunction.h | 37 +++++++++++ bolt/BinaryPatcher.cpp | 109 ++++++++++++++++++++++++++++++++ bolt/BinaryPatcher.h | 101 ++++++++++++++++++++++++++++++ bolt/CMakeLists.txt | 1 + bolt/DebugArangesWriter.cpp | 58 +++++++++++------ bolt/DebugArangesWriter.h | 15 ++++- bolt/RewriteInstance.cpp | 120 +++++++++++++++++++++++++++++++++--- bolt/RewriteInstance.h | 12 ++++ 10 files changed, 455 insertions(+), 29 deletions(-) create mode 100644 bolt/BinaryPatcher.cpp create mode 100644 bolt/BinaryPatcher.h diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 77f1afe49d41..d73c5d4e78a6 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -10,6 +10,7 @@ //===----------------------------------------------------------------------===// #include "BinaryContext.h" +#include "BinaryFunction.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSymbol.h" @@ -71,5 +72,30 @@ void BinaryContext::preprocessDebugInfo() { } } +void BinaryContext::preprocessFunctionDebugInfo( + std::map &BinaryFunctions) { + // For each CU, iterate over its children DIEs and match subroutine DIEs to + // BinaryFunctions. 
+ for (const auto &CU : DwCtx->compile_units()) { + const auto *UnitDIE = CU->getUnitDIE(false); + if (!UnitDIE->hasChildren()) + continue; + + for (auto ChildDIE = UnitDIE->getFirstChild(); + ChildDIE != nullptr && !ChildDIE->isNULL(); + ChildDIE = ChildDIE->getSibling()) { + if (ChildDIE->isSubprogramDIE()) { + uint64_t LowPC, HighPC; + if (ChildDIE->getLowAndHighPC(CU.get(), LowPC, HighPC)) { + auto It = BinaryFunctions.find(LowPC); + if (It != BinaryFunctions.end()) { + It->second.setSubprocedureDIE(CU.get(), ChildDIE); + } + } + } + } + } +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 612cf132e243..ba74045f13ca 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -39,6 +39,7 @@ namespace llvm { namespace bolt { +class BinaryFunction; class DataReader; class BinaryContext { @@ -142,6 +143,10 @@ class BinaryContext { /// Populate some internal data structures with debug info. void preprocessDebugInfo(); + /// Populate internal data structures with debug info that depends on + /// disassembled functions. + void preprocessFunctionDebugInfo( + std::map &BinaryFunctions); }; } // namespace bolt diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 95cff6662aa5..069a08034820 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -39,6 +39,10 @@ using namespace llvm::object; namespace llvm { + +class DWARFCompileUnit; +class DWARFDebugInfoEntryMinimal; + namespace bolt { /// BinaryFunction is a representation of machine-level function. @@ -147,6 +151,16 @@ class BinaryFunction { /// Landing pads for the function. std::set LandingPads; + /// Associated DIE in the .debug_info section. + const DWARFDebugInfoEntryMinimal *SubprocedureDIE{nullptr}; + + /// DWARF Unit that contains the DIE of this function. + const DWARFCompileUnit *DIECompileUnit{nullptr}; + + /// Offset of this function's address ranges in the .debug_ranges section of + /// the output binary. 
+ uint32_t AddressRangesOffset; + /// Release storage used by instructions. BinaryFunction &clearInstructions() { InstrMapType TempMap; @@ -622,6 +636,14 @@ class BinaryFunction { return *this; } + /// Sets the function's address ranges list offset in .debug_ranges. + void setAddressRangesOffset(uint32_t Offset) { + AddressRangesOffset = Offset; + } + + /// Returns the offset of the function's address ranges in .debug_ranges. + uint32_t getAddressRangesOffset() const { return AddressRangesOffset; } + /// Return the profile information about the number of times /// the function was executed. /// @@ -721,6 +743,21 @@ class BinaryFunction { /// Emit exception handling ranges for the function. void emitLSDA(MCStreamer *Streamer); + /// Sets the associated .debug_info entry. + void setSubprocedureDIE(const DWARFCompileUnit *Unit, + const DWARFDebugInfoEntryMinimal *DIE) { + DIECompileUnit = Unit; + SubprocedureDIE = DIE; + } + + const DWARFDebugInfoEntryMinimal *getSubprocedureDIE() const { + return SubprocedureDIE; + } + + const DWARFCompileUnit *getSubprocedureDIECompileUnit() const { + return DIECompileUnit; + } + virtual ~BinaryFunction() {} /// Info for fragmented functions. diff --git a/bolt/BinaryPatcher.cpp b/bolt/BinaryPatcher.cpp new file mode 100644 index 000000000000..8af6018f3a9b --- /dev/null +++ b/bolt/BinaryPatcher.cpp @@ -0,0 +1,109 @@ +//===--- BinaryPatcher.h - Classes for modifying sections of the binary --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "BinaryPatcher.h" +#include +#include + +namespace llvm { +namespace bolt { + +void SimpleBinaryPatcher::addBinaryPatch(uint32_t Offset, + const std::string &NewValue) { + Patches.emplace_back(std::make_pair(Offset, NewValue)); +} + +void SimpleBinaryPatcher::addBytePatch(uint32_t Offset, uint8_t Value) { + Patches.emplace_back(std::make_pair(Offset, std::string(1, Value))); +} + +void SimpleBinaryPatcher::addLEPatch(uint32_t Offset, uint64_t NewValue, + size_t ByteSize) { + std::string LE64(ByteSize, 0); + for (size_t I = 0; I < ByteSize; ++I) { + LE64[I] = NewValue & 0xff; + NewValue >>= 8; + } + Patches.emplace_back(std::make_pair(Offset, LE64)); +} + +void SimpleBinaryPatcher::addLE64Patch(uint32_t Offset, uint64_t NewValue) { + addLEPatch(Offset, NewValue, 8); +} + +void SimpleBinaryPatcher::addLE32Patch(uint32_t Offset, uint32_t NewValue) { + addLEPatch(Offset, NewValue, 4); +} + +void SimpleBinaryPatcher::patchBinary(std::string &BinaryContents) { + for (const auto &Patch : Patches) { + uint32_t Offset = Patch.first; + const std::string &ByteSequence = Patch.second; + assert(Offset + ByteSequence.size() <= BinaryContents.size() && + "Applied patch runs over binary size."); + for (uint64_t I = 0, Size = ByteSequence.size(); I < Size; ++I) { + BinaryContents[Offset + I] = ByteSequence[I]; + } + } +} + +void DebugAbbrevPatcher::addAttributePatch(const DWARFUnit *Unit, + uint32_t AbbrevCode, + uint16_t AttrTag, + uint8_t NewAttrTag, + uint8_t NewAttrForm) { + assert(Unit && "No compile unit specified."); + Patches[Unit].push_back( + AbbrevAttrPatch{AbbrevCode, AttrTag, NewAttrTag, NewAttrForm}); +} + +void DebugAbbrevPatcher::patchBinary(std::string &Contents) { + SimpleBinaryPatcher Patcher; + + for (const auto &UnitPatchesPair : Patches) { + const auto *Unit = 
UnitPatchesPair.first; + const auto *UnitAbbreviations = Unit->getAbbreviations(); + assert(UnitAbbreviations && + "Compile unit doesn't have associated abbreviations."); + const auto &UnitPatches = UnitPatchesPair.second; + for (const auto &AttrPatch : UnitPatches) { + const auto *AbbreviationDeclaration = + UnitAbbreviations->getAbbreviationDeclaration(AttrPatch.Code); + assert(AbbreviationDeclaration && "No abbreviation with given code."); + const auto *Attribute = AbbreviationDeclaration->findAttribute( + AttrPatch.Attr); + + if (!Attribute) { + errs() << "Attribute " << AttrPatch.Attr << " does not occur in " + << " abbrev " << AttrPatch.Code << " of CU " << Unit->getOffset() + << " in decl@" << AbbreviationDeclaration + << " and index = " << AbbreviationDeclaration->findAttributeIndex(AttrPatch.Attr) + << "\n"; + errs() << "Look at the abbrev:\n"; + AbbreviationDeclaration->dump(errs()); + + assert(Attribute && "Specified attribute doesn't occur in abbreviation."); + } + // Because we're only handling standard values (i.e. no DW_FORM_GNU_* or + // DW_AT_APPLE_*), they are all small (< 128) and encoded in a single + // byte in ULEB128, otherwise it'll be more tricky as we may need to + // grow or shrink the section. + Patcher.addBytePatch(Attribute->AttrOffset, + AttrPatch.NewAttr); + Patcher.addBytePatch(Attribute->FormOffset, + AttrPatch.NewForm); + } + } + Patcher.patchBinary(Contents); +} + +} // namespace llvm +} // namespace bolt diff --git a/bolt/BinaryPatcher.h b/bolt/BinaryPatcher.h new file mode 100644 index 000000000000..4fe1fe4e1e0a --- /dev/null +++ b/bolt/BinaryPatcher.h @@ -0,0 +1,101 @@ +//===--- BinaryPatcher.h - Classes for modifying sections of the binary --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// Interfaces for applying small modifications to parts of a binary file. Some +// specializations facilitate the modification of specific ELF/DWARF sections. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_PATCHER_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_PATCHER_H + +#include "llvm/DebugInfo/DWARF/DWARFUnit.h" +#include +#include +#include + +namespace llvm { +namespace bolt { + +/// Abstract interface for classes that apply modifications to a binary string. +class BinaryPatcher { +public: + virtual ~BinaryPatcher() {} + /// Applies in-place modifications to the binary string \p BinaryContents . + virtual void patchBinary(std::string &BinaryContents) = 0; +}; + +/// Applies simple modifications to a binary string, such as directly replacing +/// the contents of a certain portion with a string or an integer. +class SimpleBinaryPatcher : public BinaryPatcher { +private: + std::vector> Patches; + + /// Adds a patch to replace the contents of \p ByteSize bytes with the integer + /// \p NewValue encoded in little-endian, with the least-significant byte + /// being written at the offset \p Offset . + void addLEPatch(uint32_t Offset, uint64_t NewValue, size_t ByteSize); + +public: + ~SimpleBinaryPatcher() {} + + /// Adds a patch to replace the contents of the binary string starting at the + /// specified \p Offset with the string \p NewValue. + void addBinaryPatch(uint32_t Offset, const std::string &NewValue); + + /// Adds a patch to replace the contents of a single byte of the string, at + /// the offset \p Offset, with the value \Value . + void addBytePatch(uint32_t Offset, uint8_t Value); + + /// Adds a patch to put the integer \p NewValue encoded as a 64-bit + /// little-endian value at offset \p Offset. 
+ void addLE64Patch(uint32_t Offset, uint64_t NewValue); + + /// Adds a patch to put the integer \p NewValue encoded as a 32-bit + /// little-endian value at offset \p Offset. + void addLE32Patch(uint32_t Offset, uint32_t NewValue); + + void patchBinary(std::string &BinaryContents) override; +}; + +/// Apply small modifications to the .debug_abbrev DWARF section. +class DebugAbbrevPatcher : public BinaryPatcher { +private: + /// Patch of changing one attribute to another. + struct AbbrevAttrPatch { + uint32_t Code; // Code of abbreviation to be modified. + uint16_t Attr; // ID of attribute to be replaced. + uint8_t NewAttr; // ID of the new attribute. + uint8_t NewForm; // Form of the new attribute. + }; + + std::map> Patches; + +public: + ~DebugAbbrevPatcher() { } + /// Adds a patch to change an attribute of an abbreviation that belongs to + /// \p Unit to another attribute. + /// \p AbbrevCode code of the abbreviation to be modified. + /// \p AttrTag ID of the attribute to be replaced. + /// \p NewAttrTag ID of the new attribute. + /// \p NewAttrForm Form of the new attribute. + /// We only handle standard forms, that are encoded in a single byte. 
+ void addAttributePatch(const DWARFUnit *Unit, + uint32_t AbbrevCode, + uint16_t AttrTag, + uint8_t NewAttrTag, + uint8_t NewAttrForm); + + void patchBinary(std::string &Contents) override; +}; + +} // namespace llvm +} // namespace bolt + +#endif diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 3d97144787dc..44611bc50916 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -16,6 +16,7 @@ add_llvm_tool(llvm-bolt BinaryBasicBlock.cpp BinaryContext.cpp BinaryFunction.cpp + BinaryPatcher.cpp DataReader.cpp DebugArangesWriter.cpp DebugLineTableRowRef.cpp diff --git a/bolt/DebugArangesWriter.cpp b/bolt/DebugArangesWriter.cpp index 1c19c40df23e..680eae0d4b4f 100644 --- a/bolt/DebugArangesWriter.cpp +++ b/bolt/DebugArangesWriter.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "DebugArangesWriter.h" +#include "BinaryFunction.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCObjectWriter.h" @@ -21,24 +22,49 @@ void DebugArangesWriter::AddRange(uint32_t CompileUnitOffset, CUAddressRanges[CompileUnitOffset].push_back(std::make_pair(Address, Size)); } +void DebugArangesWriter::AddRange(BinaryFunction &BF, + uint64_t Address, + uint64_t Size) { + FunctionAddressRanges[&BF].push_back(std::make_pair(Address, Size)); +} + +namespace { + +// Writes address ranges to Writer as pairs of 64-bit (address, size). +// If RelativeRange is true, assumes the address range to be written must be of +// the form (begin address, range size), otherwise (begin address, end address). +// Terminates the list by writing a pair of two zeroes. +// Returns the number of written bytes. +uint32_t WriteAddressRanges( + MCObjectWriter *Writer, + const std::vector> &AddressRanges, + bool RelativeRange) { + // Write entries. + for (auto &Range : AddressRanges) { + Writer->writeLE64(Range.first); + Writer->writeLE64((!RelativeRange) * Range.first + Range.second); + } + // Finish with 0 entries. 
+ Writer->writeLE64(0); + Writer->writeLE64(0); + return AddressRanges.size() * 16 + 16; +} + +} // namespace + void DebugArangesWriter::WriteRangesSection(MCObjectWriter *Writer) { uint32_t SectionOffset = 0; for (const auto &CUOffsetAddressRangesPair : CUAddressRanges) { uint64_t CUOffset = CUOffsetAddressRangesPair.first; RangesSectionOffsetCUMap[CUOffset] = SectionOffset; const auto &AddressRanges = CUOffsetAddressRangesPair.second; + SectionOffset += WriteAddressRanges(Writer, AddressRanges, false); + } - // Write all entries. - for (auto &Range : AddressRanges) { - Writer->writeLE64(Range.first); - Writer->writeLE64(Range.first + Range.second); - } - - // Finish with 0 entry. - Writer->writeLE64(0); - Writer->writeLE64(0); - - SectionOffset += AddressRanges.size() * 16 + 16; + for (const auto &BFAddressRangesPair : FunctionAddressRanges) { + BFAddressRangesPair.first->setAddressRangesOffset(SectionOffset); + const auto &AddressRanges = BFAddressRangesPair.second; + SectionOffset += WriteAddressRanges(Writer, AddressRanges, false); } } @@ -77,15 +103,7 @@ void DebugArangesWriter::WriteArangesSection(MCObjectWriter *Writer) const { // Padding before address table - 4 bytes in the 64-bit-pointer case. Writer->writeLE32(0); - // Emit address ranges. - for (const auto &Range : AddressRanges) { - Writer->writeLE64(Range.first); - Writer->writeLE64(Range.second); - } - - // Emit terminating address range (offset 0, length 0). - Writer->writeLE64(0); - Writer->writeLE64(0); + WriteAddressRanges(Writer, AddressRanges, true); } } diff --git a/bolt/DebugArangesWriter.h b/bolt/DebugArangesWriter.h index 228722845f12..e28122c66064 100644 --- a/bolt/DebugArangesWriter.h +++ b/bolt/DebugArangesWriter.h @@ -24,6 +24,8 @@ class MCObjectWriter; namespace bolt { +class BinaryFunction; + class DebugArangesWriter { public: DebugArangesWriter() = default; @@ -31,6 +33,11 @@ class DebugArangesWriter { /// Adds a range to the .debug_arange section. 
void AddRange(uint32_t CompileUnitOffset, uint64_t Address, uint64_t Size); + /// Adds an address range that belongs to a given BinaryFunction. + /// When .debug_ranges is written, the offset of the range corresponding + /// to the function will be set using BF->setAddressRangesOffset(). + void AddRange(BinaryFunction &BF, uint64_t Address, uint64_t Size); + using RangesCUMapType = std::map; /// Writes .debug_aranges with the added ranges to the MCObjectWriter. @@ -48,7 +55,13 @@ class DebugArangesWriter { // Map from compile unit offset to the list of address intervals that belong // to that compile unit. Each interval is a pair // (first address, interval size). - std::map>> CUAddressRanges; + std::map>> + CUAddressRanges; + + // Map from BinaryFunction to the list of address intervals that belong + // to that function, represented like CUAddressRanges. + std::map>> + FunctionAddressRanges; /// When writing data to .debug_ranges remember offset per CU. RangesCUMapType RangesSectionOffsetCUMap; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 366557b8f39e..c682666fa28e 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/DWARF/DWARFDebugLine.h" +#include "llvm/DebugInfo/DWARF/DWARFFormValue.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" #include "llvm/ExecutionEngine/RTDyldMemoryManager.h" @@ -43,6 +44,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Dwarf.h" #include "llvm/Support/Errc.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/TargetSelect.h" @@ -516,9 +518,10 @@ void RewriteInstance::run() { // Main "loop". 
discoverStorage(); readSpecialSections(); - readDebugInfo(); discoverFileObjects(); + readDebugInfo(); disassembleFunctions(); + readFunctionDebugInfo(); runOptimizationPasses(); emitFunctions(); updateDebugInfo(); @@ -722,6 +725,13 @@ void RewriteInstance::readDebugInfo() { BC->preprocessDebugInfo(); } +void RewriteInstance::readFunctionDebugInfo() { + if (!opts::UpdateDebugSections) + return; + + BC->preprocessFunctionDebugInfo(BinaryFunctions); +} + void RewriteInstance::disassembleFunctions() { // Disassemble every function and build it's control flow graph. TotalScore = 0; @@ -1364,17 +1374,22 @@ void RewriteInstance::updateFunctionRanges() { } }; - for (const auto &BFI : BinaryFunctions) { - const auto &Function = BFI.second; + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; // Use either new (image) or original size for the function range. + auto Size = Function.isSimple() ? Function.getImageSize() + : Function.getSize(); addDebugArangesEntry(Function.getAddress(), Function.getAddress(), - Function.isSimple() ? Function.getImageSize() - : Function.getSize()); + Size); + ArangesWriter.AddRange(Function, Function.getAddress(), Size); if (Function.isSimple() && Function.cold().getImageSize()) { addDebugArangesEntry(Function.getAddress(), Function.cold().getAddress(), Function.cold().getImageSize()); + ArangesWriter.AddRange(Function, + Function.cold().getAddress(), + Function.cold().getImageSize()); } } } @@ -1536,7 +1551,12 @@ void RewriteInstance::rewriteNoteSections() { if (*SectionName != ".debug_aranges" || !opts::UpdateDebugSections) { Size = Section.sh_size; - OS << InputFile->getData().substr(Section.sh_offset, Size); + std::string Data = InputFile->getData().substr(Section.sh_offset, Size); + auto SectionPatchersIt = SectionPatchers.find(*SectionName); + if (SectionPatchersIt != SectionPatchers.end()) { + (*SectionPatchersIt->second).patchBinary(Data); + } + OS << Data; } // Address of extension to the section. 
@@ -1954,8 +1974,8 @@ void RewriteInstance::updateDebugInfo() { auto RangesFieldOffset = BC->DwCtx->getAttrFieldOffsetForUnit(CU.get(), dwarf::DW_AT_ranges); if (RangesFieldOffset) { - DEBUG(dbgs() << "BOLT-DEBUG: adding relocation for DW_AT_ranges " - << "in .debug_info\n"); + DEBUG(dbgs() << "BOLT-DEBUG: adding relocation for DW_AT_ranges for " + << "compile unit in .debug_info\n"); const auto RSOI = ArangesWriter.getRangesOffsetCUMap().find(CUID); if (RSOI != ArangesWriter.getRangesOffsetCUMap().end()) { auto Offset = RSOI->second; @@ -1968,4 +1988,88 @@ void RewriteInstance::updateDebugInfo() { } } } + + updateDWARFSubprogramAddressRanges(); +} + +void RewriteInstance::updateDWARFSubprogramAddressRanges() { + auto AbbrevPatcher = llvm::make_unique(); + auto DebugInfoPatcher = llvm::make_unique(); + + // For each simple function, we update its pointer in .debug_info to point to + // its uptated address ranges. If the function was contiguous, also update its + // abbreviation. + for (const auto &BFI : BinaryFunctions) { + const auto &Function = BFI.second; + if (!Function.isSimple()) { + continue; + } + auto FunctionDIE = Function.getSubprocedureDIE(); + // If we didn't find the DIE associated to the function or the DIE doesn't + // have an abbreviation, give up on this function. + if (!(FunctionDIE && FunctionDIE->getAbbreviationDeclarationPtr())) + continue; + auto DebugRangesOffset = Function.getAddressRangesOffset() + + DebugRangesSize; + const auto *AbbreviationDecl = FunctionDIE->getAbbreviationDeclarationPtr(); + assert(AbbreviationDecl && + "Function DIE doesn't have an abbreviation: not supported yet."); + auto AbbrevCode = AbbreviationDecl->getCode(); + const auto *Unit = Function.getSubprocedureDIECompileUnit(); + + if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_ranges) != -1U) { + // Case 1: The function was already non-contiguous and had DW_AT_ranges. + // In this case we simply need to update the value of DW_AT_ranges. 
+ DWARFFormValue FormValue; + uint32_t RangesOffset = -1U; + FunctionDIE->getAttributeValue(Unit, dwarf::DW_AT_ranges, FormValue, + &RangesOffset); + DebugInfoPatcher->addLE32Patch(RangesOffset, DebugRangesOffset); + } else { + // Case 2: The function has both DW_AT_low_pc and DW_AT_high_pc. + // We require the compiler to put both attributes one after the other + // for our approach to work. low_pc and high_pc both occupy 8 bytes + // as we're dealing with a 64-bit ELF. We basically change low_pc to + // DW_AT_ranges and high_pc to DW_AT_producer. ranges spans only 4 bytes + // in 32-bit DWARF, which we assume to be used, which leaves us with 12 + // more bytes. We then set the value of DW_AT_producer as an arbitrary + // 12-byte string that fills the remaining space and leaves the rest of + // the abbreviation layout unchanged. + if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_low_pc) != -1U && + AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_high_pc) != -1U) { + uint32_t LowPCOffset = -1U; + uint32_t HighPCOffset = -1U; + DWARFFormValue FormValue; + FunctionDIE->getAttributeValue(Unit, dwarf::DW_AT_low_pc, FormValue, + &LowPCOffset); + FunctionDIE->getAttributeValue(Unit, dwarf::DW_AT_high_pc, FormValue, + &HighPCOffset); + + AbbrevPatcher->addAttributePatch(Unit, + AbbrevCode, + dwarf::DW_AT_low_pc, + dwarf::DW_AT_ranges, + dwarf::DW_FORM_sec_offset); + AbbrevPatcher->addAttributePatch(Unit, + AbbrevCode, + dwarf::DW_AT_high_pc, + dwarf::DW_AT_producer, + dwarf::DW_FORM_string); + assert(LowPCOffset != -1U && LowPCOffset + 8 == HighPCOffset && + "We depend on the compiler putting high_pc right after low_pc."); + DebugInfoPatcher->addLE32Patch(LowPCOffset, DebugRangesOffset); + std::string ProducerString{"LLVM-BOLT"}; + ProducerString.resize(12, ' '); + ProducerString.back() = '\0'; + + DebugInfoPatcher->addBinaryPatch(LowPCOffset + 4, ProducerString); + } else { + DEBUG(errs() << "BOLT-WARNING: Cannot update ranges for function " + << 
Function.getName() << "\n"); + } + } + } + + SectionPatchers[".debug_abbrev"].reset(AbbrevPatcher.release()); + SectionPatchers[".debug_info"].reset(DebugInfoPatcher.release()); } diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index e6d76cb2ed83..83965ef3daa6 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -14,6 +14,7 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_REWRITE_INSTANCE_H #define LLVM_TOOLS_LLVM_BOLT_REWRITE_INSTANCE_H +#include "BinaryPatcher.h" #include "DebugArangesWriter.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" @@ -137,6 +138,10 @@ class RewriteInstance { /// Read information from debug sections. void readDebugInfo(); + /// Read information from debug sections that depends on disassembled + /// functions. + void readFunctionDebugInfo(); + /// Disassemble each function in the binary and associate it with a /// BinaryFunction object, preparing all information necessary for binary /// optimization. @@ -196,6 +201,9 @@ class RewriteInstance { /// Generate new contents for .debug_ranges and .debug_aranges section. void generateDebugRanges(); + /// Patches the binary for function address ranges to be updated. + void updateDWARFSubprogramAddressRanges(); + /// Return file offset corresponding to a given virtual address. uint64_t getFileOffsetFor(uint64_t Address) { assert(Address >= NewTextSegmentAddress && @@ -245,6 +253,10 @@ class RewriteInstance { /// the .debug_aranges DWARF section. DebugArangesWriter ArangesWriter; + /// Patchers used to apply simple changes to sections of the input binary. + /// Maps section name -> patcher. + std::map> SectionPatchers; + /// Exception handling and stack unwinding information in this binary. ArrayRef LSDAData; uint64_t LSDAAddress{0}; From 81d4d3a102e57c59e4a7e9f17d7c45b46a055c73 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 28 Mar 2016 11:06:28 -0700 Subject: [PATCH 084/904] Populate function execution count while parsing fdata. 
Summary: Populate function execution count while parsing fdata. Before we used a quadratic algorithm to populate the execution count (had to iterate over *all* branches for every single function). Ignore non-symbol to non-symbol branches while parsing fdata. These changes combined drop HHVM processing time from 4 minutes 53 seconds down to 2 minutes 9 seconds on my devserver. Test case had to be modified since it contained irrelevant branches from PLT to libc. (cherry picked from commit a2a63218cca8b23c7641709a92c40da8fa9cdefe) --- bolt/BinaryFunction.cpp | 3 ++- bolt/DataReader.cpp | 52 ++++++++++++++++++++--------------------- bolt/DataReader.h | 8 +++---- 3 files changed, 32 insertions(+), 31 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 878ace291216..be04495a882c 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -604,7 +604,8 @@ bool BinaryFunction::buildCFG() { if (std::error_code EC = BranchDataOrErr.getError()) { DEBUG(dbgs() << "no branch data found for \"" << getName() << "\"\n"); } else { - ExecutionCount = BC.DR.countBranchesTo(getName()); + if (!BranchDataOrErr.get().Data.empty()) + ExecutionCount = BranchDataOrErr.get().ExecutionCount; } if (!isSimple()) diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index 85e700e513fa..48565385a99c 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -27,16 +27,6 @@ ErrorOr FuncBranchData::getBranch(uint64_t From, return make_error_code(llvm::errc::invalid_argument); } -uint64_t -FuncBranchData::countBranchesTo(StringRef FuncName) const { - uint64_t TotalCount = 0; - for (const auto &I : Data) { - if (I.To.Offset == 0 && I.To.Name == FuncName) - TotalCount += I.Branches; - } - return TotalCount; -} - ErrorOr> DataReader::readPerfData(StringRef Path, raw_ostream &Diag) { ErrorOr> MB = @@ -161,6 +151,18 @@ bool DataReader::hasData() { } std::error_code DataReader::parse() { + auto GetOrCreateFuncEntry = [&](StringRef Name) { + auto I = 
FuncsMap.find(Name); + if (I == FuncsMap.end()) { + bool success; + std::tie(I, success) = FuncsMap.insert( + std::make_pair(Name, FuncBranchData(Name, + FuncBranchData::ContainerTy()))); + assert(success && "unexpected result of insert"); + } + return I; + }; + Col = 0; Line = 1; while (hasData()) { @@ -171,16 +173,22 @@ std::error_code DataReader::parse() { Line += 1; BranchInfo BI = Res.get(); - StringRef Name = BI.From.Name; - auto I = FuncsMap.find(Name); - if (I == FuncsMap.end()) { - FuncBranchData::ContainerTy Cont; - Cont.emplace_back(std::move(BI)); - FuncsMap.insert( - std::make_pair(Name, FuncBranchData(Name, std::move(Cont)))); + + // Ignore branches not involving known location. + if (!BI.From.IsSymbol && !BI.To.IsSymbol) continue; - } + + auto I = GetOrCreateFuncEntry(BI.From.Name); I->getValue().Data.emplace_back(std::move(BI)); + + // If destination is the function start - update execution count. + // NB: the data is skewed since we cannot tell tail recursion from + // branches to the function start. 
+ if (BI.To.IsSymbol && BI.To.Offset == 0) { + I = GetOrCreateFuncEntry(BI.To.Name); + I->getValue().ExecutionCount += BI.Branches; + } + } return std::error_code(); } @@ -194,14 +202,6 @@ DataReader::getFuncBranchData(StringRef FuncName) const { return I->getValue(); } -uint64_t DataReader::countBranchesTo(StringRef FuncName) const { - uint64_t TotalCount = 0; - for (const auto &KV : FuncsMap) { - TotalCount += KV.getValue().countBranchesTo(FuncName); - } - return TotalCount; -} - void DataReader::dump() const { for (const auto &Func : FuncsMap) { for (const auto &BI : Func.getValue().Data) { diff --git a/bolt/DataReader.h b/bolt/DataReader.h index a5b711737ed0..9a780b2020ed 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -46,18 +46,19 @@ struct BranchInfo { Branches(Branches) {} }; -class FuncBranchData { -public: +struct FuncBranchData { typedef std::vector ContainerTy; StringRef Name; ContainerTy Data; + /// Total execution count for the function. + int64_t ExecutionCount{0}; + FuncBranchData(StringRef Name, ContainerTy Data) : Name(Name), Data(std::move(Data)) {} ErrorOr getBranch(uint64_t From, uint64_t To) const; - uint64_t countBranchesTo(StringRef FuncName) const; }; //===----------------------------------------------------------------------===// @@ -94,7 +95,6 @@ class DataReader { std::error_code parse(); ErrorOr getFuncBranchData(StringRef FuncName) const; - uint64_t countBranchesTo(StringRef FuncName) const; /// Dumps the entire data structures parsed. Used for debugging. void dump() const; From 8dc6de636fdcad2dca42d621a5bd50d493831956 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 28 Mar 2016 22:39:48 -0700 Subject: [PATCH 085/904] Speedup section remapping. Summary: Before this diff LLVM used to iterate over all sections to find the one with an address we want to remap. Since we have extremely large number of section this process is highly inefficient. 
Instead we add a new interface to remap a section with a given ID (which effectively is an index into an array of sections), and pass the ID instead of the address. This cuts down the processing time of hhvm binary by 10 seconds, and brings the total processing time to a little under 2 minutes. (cherry picked from commit 98ae2e0aa6d81bab26b79cdef82ecb76d1d65353) --- bolt/RewriteInstance.cpp | 21 +++++++++++++-------- bolt/RewriteInstance.h | 10 ++++++---- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index c682666fa28e..364cad9bc551 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -254,7 +254,10 @@ uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size, Size, Alignment, IsCode, - IsReadOnly); + IsReadOnly, + 0, + 0, + SectionID); return ret; } @@ -280,7 +283,10 @@ uint8_t *ExecutableFileMemoryManager::recordNoteSection( Size, Alignment, /*IsCode=*/false, - /*IsReadOnly*/true); + /*IsReadOnly*/true, + 0, + 0, + SectionID); return DataCopy; } else { DEBUG(dbgs() << "BOLT-DEBUG: ignoring section " << SectionName @@ -1279,8 +1285,8 @@ void RewriteInstance::emitFunctions() { << " to 0x" << Twine::utohexstr(Function.getAddress()) << '\n'); OLT.mapSectionAddress(ObjectsHandle, - reinterpret_cast(SMII->second.AllocAddress), - Function.getAddress()); + SMII->second.SectionID, + Function.getAddress()); Function.setImageAddress(SMII->second.AllocAddress); Function.setImageSize(SMII->second.Size); } else { @@ -1302,8 +1308,8 @@ void RewriteInstance::emitFunctions() { << " with size " << Twine::utohexstr(SMII->second.Size) << '\n'); OLT.mapSectionAddress(ObjectsHandle, - reinterpret_cast(SMII->second.AllocAddress), - NextAvailableAddress); + SMII->second.SectionID, + NextAvailableAddress); Function.cold().setAddress(NextAvailableAddress); Function.cold().setImageAddress(SMII->second.AllocAddress); Function.cold().setImageSize(SMII->second.Size); @@ -1345,9 +1351,8 @@ void 
RewriteInstance::emitFunctions() { << '\n'); OLT.mapSectionAddress(ObjectsHandle, - reinterpret_cast(SI.AllocAddress), + SI.SectionID, NextAvailableAddress); - SI.FileAddress = NextAvailableAddress; SI.FileOffset = getFileOffsetFor(NextAvailableAddress); diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 83965ef3daa6..2ce706436393 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -46,6 +46,7 @@ struct SectionInfo { uint64_t FileAddress{0}; /// Address for the output file (final address). uint64_t FileOffset{0}; /// Offset in the output file. uint64_t ShName{0}; /// Name offset in section header string table. + unsigned SectionID{0}; /// Unique ID used for address mapping. struct Reloc { uint32_t Offset; @@ -59,10 +60,11 @@ struct SectionInfo { SectionInfo(uint64_t Address = 0, uint64_t Size = 0, unsigned Alignment = 0, bool IsCode = false, bool IsReadOnly = false, - uint64_t FileAddress = 0, uint64_t FileOffset = 0) - : AllocAddress(Address), Size(Size), Alignment(Alignment), - IsCode(IsCode), IsReadOnly(IsReadOnly), FileAddress(FileAddress), - FileOffset(FileOffset) {} + uint64_t FileAddress = 0, uint64_t FileOffset = 0, + unsigned SectionID = 0) + : AllocAddress(Address), Size(Size), Alignment(Alignment), IsCode(IsCode), + IsReadOnly(IsReadOnly), FileAddress(FileAddress), FileOffset(FileOffset), + SectionID(SectionID) {} }; /// Class responsible for allocating and managing code and data sections. From 1f0a8aec4f413d57fe7338e78dbfa63ca78f01e2 Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Mon, 28 Mar 2016 17:45:22 -0700 Subject: [PATCH 086/904] Update DWARF lexical blocks address ranges. Summary: Updates DWARF lexical blocks address ranges in the output binary after optimizations. This is similar to updating function address ranges except that the ranges representation needs to be more general, since address ranges can begin or end in the middle of a basic block. 
The following changes were made: - Added a data structure for iterating over the basic blocks that intersect an address range: BasicBlockTable.h - Added some more bookkeeping in BinaryBasicBlock. Basically, I needed to keep track of the block's size in the input binary as well as its address in the output binary. This information is mostly set by BinaryFunction after disassembly. - Added a representation for address ranges relative to basic blocks (BasicBlockOffsetRanges.h). Will also serve for location lists. - Added a representation for Lexical Blocks (LexicalBlock.h) - Small refactorings in DebugArangesWriter: -- Renamed to DebugRangesSectionsWriter since it also writes .debug_ranges -- Refactored it not to depend on BinaryFunction but instead on anything that can be assined an aoffset in .debug_ranges (added an interface for that) - Iterate over the DIE tree during initialization to find lexical blocks in .debug_info (BinaryContext.cpp) - Added patches to .debug_abbrev and .debug_info in RewriteInstance to update lexical blocks attributes (in fact, this part is very similar to what was done to function address ranges and I just refactored/reused that code) - Added small test case (lexical_blocks_address_ranges_debug.test) (cherry picked from commit 468c9eaf250d50450b910790f6b2f49737a4b0f2) --- bolt/BasicBlockOffsetRanges.cpp | 70 ++++++ bolt/BasicBlockOffsetRanges.h | 58 +++++ bolt/BinaryBasicBlock.cpp | 1 - bolt/BinaryBasicBlock.h | 37 ++- bolt/BinaryContext.cpp | 62 +++++ bolt/BinaryContext.h | 6 +- bolt/BinaryFunction.cpp | 12 +- bolt/BinaryFunction.h | 13 +- bolt/CMakeLists.txt | 3 +- ...iter.cpp => DebugRangesSectionsWriter.cpp} | 26 +-- ...esWriter.h => DebugRangesSectionsWriter.h} | 29 ++- bolt/LexicalBlock.h | 69 ++++++ bolt/RewriteInstance.cpp | 218 +++++++++++------- bolt/RewriteInstance.h | 27 ++- 14 files changed, 512 insertions(+), 119 deletions(-) create mode 100644 bolt/BasicBlockOffsetRanges.cpp create mode 100644 bolt/BasicBlockOffsetRanges.h 
rename bolt/{DebugArangesWriter.cpp => DebugRangesSectionsWriter.cpp} (80%) rename bolt/{DebugArangesWriter.h => DebugRangesSectionsWriter.h} (64%) create mode 100644 bolt/LexicalBlock.h diff --git a/bolt/BasicBlockOffsetRanges.cpp b/bolt/BasicBlockOffsetRanges.cpp new file mode 100644 index 000000000000..9cb507facd9e --- /dev/null +++ b/bolt/BasicBlockOffsetRanges.cpp @@ -0,0 +1,70 @@ +//===- BasicBlockOffsetRanges.cpp - list of address ranges relative to BBs ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "BasicBlockOffsetRanges.h" +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" + +namespace llvm { +namespace bolt { + +void BasicBlockOffsetRanges::addAddressRange(BinaryFunction &Function, + uint64_t BeginAddress, + uint64_t EndAddress) { + auto FirstBB = Function.getBasicBlockContainingOffset( + BeginAddress - Function.getAddress()); + assert(FirstBB && "No basic blocks in the function intersect given range."); + + for (auto I = Function.getIndex(FirstBB), S = Function.size(); I != S; ++I) { + auto BB = Function.getBasicBlockAtIndex(I); + uint64_t BBAddress = Function.getAddress() + BB->getOffset(); + if (BBAddress >= EndAddress) + break; + + uint64_t InternalAddressRangeBegin = std::max(BBAddress, BeginAddress); + assert(BB->getFunction() == &Function && + "Mismatching functions.\n"); + uint64_t InternalAddressRangeEnd = + std::min(BBAddress + Function.getBasicBlockOriginalSize(BB), + EndAddress); + + AddressRanges.push_back( + BBAddressRange{ + BB, + static_cast(InternalAddressRangeBegin - BBAddress), + static_cast(InternalAddressRangeEnd - BBAddress)}); + } +} + +std::vector> +BasicBlockOffsetRanges::getAbsoluteAddressRanges() const { + std::vector> 
AbsoluteRanges; + for (const auto &BBAddressRange : AddressRanges) { + auto BBOutputAddressRange = + BBAddressRange.BasicBlock->getOutputAddressRange(); + uint64_t NewRangeBegin = BBOutputAddressRange.first + + BBAddressRange.RangeBeginOffset; + // If the end offset pointed to the end of the basic block, then we set + // the new end range to cover the whole basic block as the BB's size + // might have increased. + auto BBFunction = BBAddressRange.BasicBlock->getFunction(); + uint64_t NewRangeEnd = + (BBAddressRange.RangeEndOffset == + BBFunction->getBasicBlockOriginalSize(BBAddressRange.BasicBlock)) + ? BBOutputAddressRange.second + : (BBOutputAddressRange.first + BBAddressRange.RangeEndOffset); + AbsoluteRanges.emplace_back(NewRangeBegin, NewRangeEnd); + } + return AbsoluteRanges; +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/BasicBlockOffsetRanges.h b/bolt/BasicBlockOffsetRanges.h new file mode 100644 index 000000000000..f9221ff617e8 --- /dev/null +++ b/bolt/BasicBlockOffsetRanges.h @@ -0,0 +1,58 @@ +//===--- BasicBlockOffsetRanges.h - list of address ranges relative to BBs ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Represents a list of address ranges where addresses are relative to the +// beginning of basic blocks. Useful for converting address ranges in the input +// binary to equivalent ranges after optimizations take place. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_BASIC_BLOCK_OFFSET_RANGES_H +#define LLVM_TOOLS_LLVM_BOLT_BASIC_BLOCK_OFFSET_RANGES_H + +#include +#include +#include + +namespace llvm { +namespace bolt { + +class BinaryFunction; +class BinaryBasicBlock; + +class BasicBlockOffsetRanges { +private: + /// An address range inside one basic block. + struct BBAddressRange { + const BinaryBasicBlock *BasicBlock; + /// Beginning of the range counting from BB's start address. + uint16_t RangeBeginOffset; + /// (Exclusive) end of the range counting from BB's start address. + uint16_t RangeEndOffset; + }; + + std::vector AddressRanges; + +public: + /// Add range [BeginAddress, EndAddress) to the address ranges list. + /// \p Function is the function that contains the given address range. + void addAddressRange(BinaryFunction &Function, + uint64_t BeginAddress, + uint64_t EndAddress); + + /// Returns the list of absolute addresses calculated using the output address + /// of the basic blocks, i.e. the input ranges updated after basic block + /// addresses might have changed. 
+ std::vector> getAbsoluteAddressRanges() const; +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index c92a40a8a3fb..0aafc67d053d 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -10,7 +10,6 @@ //===----------------------------------------------------------------------===// #include "BinaryBasicBlock.h" -#include "BinaryFunction.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index d588b318a53d..4eb742cb6a2e 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -26,6 +26,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include +#include namespace llvm { namespace bolt { @@ -39,6 +40,15 @@ class BinaryBasicBlock { /// Label associated with the block. MCSymbol *Label{nullptr}; + /// Function that owns this basic block. + BinaryFunction *Function; + + /// Label associated with the end of the block in the output binary. + MCSymbol *EndLabel{nullptr}; + + /// [Begin, End) address range for this block in the output binary. + std::pair OutputAddressRange{0, 0}; + /// Original offset in the function. uint64_t Offset{std::numeric_limits::max()}; @@ -80,8 +90,9 @@ class BinaryBasicBlock { explicit BinaryBasicBlock( MCSymbol *Label, + BinaryFunction *Function, uint64_t Offset = std::numeric_limits::max()) - : Label(Label), Offset(Offset) {} + : Label(Label), Function(Function), Offset(Offset) {} explicit BinaryBasicBlock(uint64_t Offset) : Offset(Offset) {} @@ -261,6 +272,30 @@ class BinaryBasicBlock { return false; } + /// Sets the symbol pointing to the end of the BB in the output binary. + void setEndLabel(MCSymbol *Symbol) { + EndLabel = Symbol; + } + + /// Gets the symbol pointing to the end of the BB in the output binary. 
+ MCSymbol *getEndLabel() const { + return EndLabel; + } + + /// Sets the memory address range of this BB in the output binary. + void setOutputAddressRange(std::pair Range) { + OutputAddressRange = Range; + } + + /// Gets the memory address range of this BB in the output binary. + std::pair getOutputAddressRange() const { + return OutputAddressRange; + } + + BinaryFunction *getFunction() const { + return Function; + } + private: /// Adds predecessor to the BB. Most likely you don't need to call this. diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index d73c5d4e78a6..00e4ffce1ea4 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -12,12 +12,15 @@ #include "BinaryContext.h" #include "BinaryFunction.h" #include "llvm/ADT/Twine.h" +#include "llvm/DebugInfo/DWARF/DWARFUnit.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSymbol.h" namespace llvm { namespace bolt { +BinaryContext::~BinaryContext() { } + MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, Twine Prefix) { MCSymbol *Symbol{nullptr}; @@ -44,6 +47,59 @@ MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, return Symbol; } +} // namespace bolt +} // namespace llvm + +namespace { + +using namespace llvm; +using namespace bolt; + +/// Returns the binary function that contains a given address in the input +/// binary, or nullptr if none does. +BinaryFunction *getBinaryFunctionContainingAddress( + uint64_t Address, + std::map &BinaryFunctions) { + auto It = BinaryFunctions.upper_bound(Address); + if (It != BinaryFunctions.begin()) { + --It; + if (It->first + It->second.getSize() > Address) { + return &It->second; + } + } + return nullptr; +} + +// Traverses the DIE tree in a recursive depth-first search and finds lexical +// blocks, saving them in LexicalBlocks. 
+void findLexicalBlocks(const DWARFCompileUnit *Unit, + const DWARFDebugInfoEntryMinimal *DIE, + std::map &Functions, + std::vector &LexicalBlocks) { + if (DIE->getTag() == dwarf::DW_TAG_lexical_block) { + LexicalBlocks.emplace_back(Unit, DIE); + auto &LB = LexicalBlocks.back(); + for (const auto &Range : DIE->getAddressRanges(Unit)) { + if (auto *Function = getBinaryFunctionContainingAddress(Range.first, + Functions)) { + if (Function->isSimple()) { + LB.addAddressRange(*Function, Range.first, Range.second); + } + } + } + } + + // Recursively visit each child. + for (auto Child = DIE->getFirstChild(); Child; Child = Child->getSibling()) { + findLexicalBlocks(Unit, Child, Functions, LexicalBlocks); + } +} + +} // namespace + +namespace llvm { +namespace bolt { + void BinaryContext::preprocessDebugInfo() { // Iterate over all DWARF compilation units and map their offset in the // binary to themselves in OffsetDwarfCUMap @@ -95,6 +151,12 @@ void BinaryContext::preprocessFunctionDebugInfo( } } } + + // Iterate over DIE trees finding lexical blocks. + for (const auto &CU : DwCtx->compile_units()) { + findLexicalBlocks(CU.get(), CU->getUnitDIE(false), BinaryFunctions, + LexicalBlocks); + } } } // namespace bolt diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index ba74045f13ca..8c838b3c2235 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -14,6 +14,7 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H #define LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H +#include "LexicalBlock.h" #include "llvm/ADT/Triple.h" #include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" @@ -69,6 +70,9 @@ class BinaryContext { /// Maps DWARF CUID to offset of stmt_list attribute in .debug_info. std::map LineTableOffsetCUMap; + /// List of DWARF lexical blocks in .debug_info. 
+ std::vector LexicalBlocks; + std::unique_ptr Ctx; std::unique_ptr DwCtx; @@ -132,7 +136,7 @@ class BinaryContext { DisAsm(std::move(DisAsm)), DR(DR) {} - ~BinaryContext() {} + ~BinaryContext(); /// Return a global symbol registered at a given \p Address. If no symbol /// exists, create one with unique name using \p Prefix. diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index be04495a882c..0637340377d2 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -87,6 +87,16 @@ BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) { return &(*--I); } +size_t +BinaryFunction::getBasicBlockOriginalSize(const BinaryBasicBlock *BB) const { + auto Index = getIndex(BB); + if (Index + 1 == BasicBlocks.size()) { + return Size - BB->getOffset(); + } else { + return BasicBlocks[Index + 1].getOffset() - BB->getOffset(); + } +} + unsigned BinaryFunction::eraseDeadBBs( std::map &ToPreserve) { BasicBlockOrderType NewLayout; @@ -704,7 +714,7 @@ bool BinaryFunction::buildCFG() { } } - // Set the basic block layout to the original order + // Set the basic block layout to the original order. for (auto &BB : BasicBlocks) { BasicBlocksLayout.emplace_back(&BB); } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 069a08034820..403efeb6699b 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -19,6 +19,7 @@ #include "BinaryBasicBlock.h" #include "BinaryContext.h" +#include "DebugRangesSectionsWriter.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/ilist.h" #include "llvm/MC/MCCodeEmitter.h" @@ -48,7 +49,7 @@ namespace bolt { /// BinaryFunction is a representation of machine-level function. // /// We use the term "Binary" as "Machine" was already taken. 
-class BinaryFunction { +class BinaryFunction : public AddressRangesOwner { public: enum class State : char { Empty = 0, /// Function body is empty @@ -370,6 +371,11 @@ class BinaryFunction { return I; } + /// Returns the n-th basic block in this function in its original layout, or + /// nullptr if n >= size(). + const BinaryBasicBlock * getBasicBlockAtIndex(unsigned Index) const { + return &BasicBlocks.at(Index); + } /// Return the name of the function as extracted from the binary file. StringRef getName() const { @@ -465,7 +471,7 @@ class BinaryFunction { assert(BC.Ctx && "cannot be called with empty context"); if (!Label) Label = BC.Ctx->createTempSymbol("BB", true); - BasicBlocks.emplace_back(BinaryBasicBlock(Label, Offset)); + BasicBlocks.emplace_back(BinaryBasicBlock(Label, this, Offset)); auto BB = &BasicBlocks.back(); @@ -758,6 +764,9 @@ class BinaryFunction { return DIECompileUnit; } + /// Returns the size of the basic block in the original binary. + size_t getBasicBlockOriginalSize(const BinaryBasicBlock *BB) const; + virtual ~BinaryFunction() {} /// Info for fragmented functions. 
diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 44611bc50916..c66cb442fbfc 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -13,13 +13,14 @@ set(LLVM_LINK_COMPONENTS add_llvm_tool(llvm-bolt llvm-bolt.cpp + BasicBlockOffsetRanges.cpp BinaryBasicBlock.cpp BinaryContext.cpp BinaryFunction.cpp BinaryPatcher.cpp DataReader.cpp - DebugArangesWriter.cpp DebugLineTableRowRef.cpp + DebugRangesSectionsWriter.cpp Exceptions.cpp RewriteInstance.cpp ) diff --git a/bolt/DebugArangesWriter.cpp b/bolt/DebugRangesSectionsWriter.cpp similarity index 80% rename from bolt/DebugArangesWriter.cpp rename to bolt/DebugRangesSectionsWriter.cpp index 680eae0d4b4f..97838d47f00e 100644 --- a/bolt/DebugArangesWriter.cpp +++ b/bolt/DebugRangesSectionsWriter.cpp @@ -1,4 +1,4 @@ -//===--- DebugArangesWriter.h - Writes the .debug_aranges DWARF section ---===// +//===-- DebugRangesSectionsWriter.h - Writes DWARF address ranges sections -==// // // The LLVM Compiler Infrastructure // @@ -7,25 +7,24 @@ // //===----------------------------------------------------------------------===// -#include "DebugArangesWriter.h" +#include "DebugRangesSectionsWriter.h" #include "BinaryFunction.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCObjectWriter.h" - namespace llvm { namespace bolt { -void DebugArangesWriter::AddRange(uint32_t CompileUnitOffset, - uint64_t Address, - uint64_t Size) { +void DebugRangesSectionsWriter::AddRange(uint32_t CompileUnitOffset, + uint64_t Address, + uint64_t Size) { CUAddressRanges[CompileUnitOffset].push_back(std::make_pair(Address, Size)); } -void DebugArangesWriter::AddRange(BinaryFunction &BF, - uint64_t Address, - uint64_t Size) { - FunctionAddressRanges[&BF].push_back(std::make_pair(Address, Size)); +void DebugRangesSectionsWriter::AddRange(AddressRangesOwner *BF, + uint64_t Address, + uint64_t Size) { + ObjectAddressRanges[BF].push_back(std::make_pair(Address, Size)); } namespace { @@ -52,7 +51,7 @@ uint32_t WriteAddressRanges( } // namespace 
-void DebugArangesWriter::WriteRangesSection(MCObjectWriter *Writer) { +void DebugRangesSectionsWriter::WriteRangesSection(MCObjectWriter *Writer) { uint32_t SectionOffset = 0; for (const auto &CUOffsetAddressRangesPair : CUAddressRanges) { uint64_t CUOffset = CUOffsetAddressRangesPair.first; @@ -61,14 +60,15 @@ void DebugArangesWriter::WriteRangesSection(MCObjectWriter *Writer) { SectionOffset += WriteAddressRanges(Writer, AddressRanges, false); } - for (const auto &BFAddressRangesPair : FunctionAddressRanges) { + for (const auto &BFAddressRangesPair : ObjectAddressRanges) { BFAddressRangesPair.first->setAddressRangesOffset(SectionOffset); const auto &AddressRanges = BFAddressRangesPair.second; SectionOffset += WriteAddressRanges(Writer, AddressRanges, false); } } -void DebugArangesWriter::WriteArangesSection(MCObjectWriter *Writer) const { +void +DebugRangesSectionsWriter::WriteArangesSection(MCObjectWriter *Writer) const { // For reference on the format of the .debug_aranges section, see the DWARF4 // specification, section 6.1.4 Lookup by Address // http://www.dwarfstd.org/doc/DWARF4.pdf diff --git a/bolt/DebugArangesWriter.h b/bolt/DebugRangesSectionsWriter.h similarity index 64% rename from bolt/DebugArangesWriter.h rename to bolt/DebugRangesSectionsWriter.h index e28122c66064..dd1455ab11af 100644 --- a/bolt/DebugArangesWriter.h +++ b/bolt/DebugRangesSectionsWriter.h @@ -1,4 +1,4 @@ -//===--- DebugArangesWriter.h - Writes the .debug_aranges DWARF section ---===// +//===-- DebugRangesSectionsWriter.h - Writes DWARF address ranges sections -==// // // The LLVM Compiler Infrastructure // @@ -7,12 +7,12 @@ // //===----------------------------------------------------------------------===// // -// Class that serializes a .debug_aranges section of a binary. +// Class that serializes the .debug_ranges and .debug_aranges sections. 
// //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_BOLT_DEBUGARANGESWRITER_H -#define LLVM_TOOLS_LLVM_BOLT_DEBUGARANGESWRITER_H +#ifndef LLVM_TOOLS_LLVM_BOLT_DEBUG_RANGES_SECTIONS_WRITER_H +#define LLVM_TOOLS_LLVM_BOLT_DEBUG_RANGES_SECTIONS_WRITER_H #include #include @@ -24,19 +24,26 @@ class MCObjectWriter; namespace bolt { -class BinaryFunction; +/// Abstract interface for classes that represent objects that have +/// associated address ranges in .debug_ranges. These address ranges can +/// be serialized by DebugRangesSectionsWriter which notifies the object +/// of where in the section its address ranges list was written. +class AddressRangesOwner { +public: + virtual void setAddressRangesOffset(uint32_t Offset) = 0; +}; -class DebugArangesWriter { +class DebugRangesSectionsWriter { public: - DebugArangesWriter() = default; + DebugRangesSectionsWriter() = default; /// Adds a range to the .debug_arange section. void AddRange(uint32_t CompileUnitOffset, uint64_t Address, uint64_t Size); - /// Adds an address range that belongs to a given BinaryFunction. + /// Adds an address range that belongs to a given object. /// When .debug_ranges is written, the offset of the range corresponding /// to the function will be set using BF->setAddressRangesOffset(). - void AddRange(BinaryFunction &BF, uint64_t Address, uint64_t Size); + void AddRange(AddressRangesOwner *ARO, uint64_t Address, uint64_t Size); using RangesCUMapType = std::map; @@ -60,8 +67,8 @@ class DebugArangesWriter { // Map from BinaryFunction to the list of address intervals that belong // to that function, represented like CUAddressRanges. - std::map>> - FunctionAddressRanges; + std::map>> + ObjectAddressRanges; /// When writing data to .debug_ranges remember offset per CU. 
RangesCUMapType RangesSectionOffsetCUMap; diff --git a/bolt/LexicalBlock.h b/bolt/LexicalBlock.h new file mode 100644 index 000000000000..a7740f13563e --- /dev/null +++ b/bolt/LexicalBlock.h @@ -0,0 +1,69 @@ +//===--- LexicalBlock.h - DWARF lexical blocks ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Represents DWARF lexical blocks, maintaining their list of address ranges to +// be updated in the output debugging information. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_LEXICAL_BLOCK_H +#define LLVM_TOOLS_LLVM_BOLT_LEXICAL_BLOCK_H + +#include "DebugRangesSectionsWriter.h" +#include "BasicBlockOffsetRanges.h" + +namespace llvm { + +class DWARFCompileUnit; +class DWARFDebugInfoEntryMinimal; + +namespace bolt { + +class BasicBlockTable; +class BinaryBasicBlock; +class BinaryFunction; + +class LexicalBlock : public AddressRangesOwner { +public: + LexicalBlock(const DWARFCompileUnit *CU, + const DWARFDebugInfoEntryMinimal *DIE) + : CU(CU), DIE(DIE) { } + + // Add range [BeginAddress, EndAddress) to lexical block. 
+ void addAddressRange(BinaryFunction &Function, + uint64_t BeginAddress, + uint64_t EndAddress) { + BBOffsetRanges.addAddressRange(Function, BeginAddress, EndAddress); + } + + std::vector> getAbsoluteAddressRanges() const { + return BBOffsetRanges.getAbsoluteAddressRanges(); + } + + void setAddressRangesOffset(uint32_t Offset) { AddressRangesOffset = Offset; } + + uint32_t getAddressRangesOffset() const { return AddressRangesOffset; } + + const DWARFCompileUnit *getCompileUnit() const { return CU; } + const DWARFDebugInfoEntryMinimal *getDIE() const { return DIE; } + +private: + const DWARFCompileUnit *CU; + const DWARFDebugInfoEntryMinimal *DIE; + + BasicBlockOffsetRanges BBOffsetRanges; + + // Offset of the address ranges of this block in the output .debug_ranges. + uint32_t AddressRangesOffset; +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 364cad9bc551..87d0aa958e4e 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -25,6 +25,7 @@ #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" #include "llvm/ExecutionEngine/RTDyldMemoryManager.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler.h" @@ -541,6 +542,7 @@ void RewriteInstance::run() { // Rewrite allocatable contents and copy non-allocatable parts with mods. 
rewriteFile(); + } void RewriteInstance::discoverFileObjects() { @@ -1144,6 +1146,10 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, } emitCFIInstr(*Function.getCFIFor(Instr)); } + + MCSymbol *BBEndLabel = BC.Ctx->createTempSymbol(); + BB->setEndLabel(BBEndLabel); + Streamer.EmitLabel(BBEndLabel); } // Emit CFI end @@ -1362,6 +1368,26 @@ void RewriteInstance::emitFunctions() { } } + MCAsmLayout Layout( + static_cast(Streamer.get())->getAssembler()); + + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + for (auto &BB : Function) { + if (!(BB.getLabel()->isDefined(false) && + BB.getEndLabel() && BB.getEndLabel()->isDefined(false))) { + continue; + } + uint64_t BaseAddress = (BB.isCold() ? Function.cold().getAddress() + : Function.getAddress()); + uint64_t BeginAddress = + BaseAddress + Layout.getSymbolOffset(*BB.getLabel()); + uint64_t EndAddress = + BaseAddress + Layout.getSymbolOffset(*BB.getEndLabel()); + BB.setOutputAddressRange(std::make_pair(BeginAddress, EndAddress)); + } + } + OLT.emitAndFinalize(ObjectsHandle); if (opts::KeepTmp) @@ -1375,7 +1401,7 @@ void RewriteInstance::updateFunctionRanges() { if (auto DebugAranges = BC->DwCtx->getDebugAranges()) { uint32_t CUOffset = DebugAranges->findAddress(OriginalFunctionAddress); if (CUOffset != -1U) - ArangesWriter.AddRange(CUOffset, RangeBegin, RangeSize); + RangesSectionsWriter.AddRange(CUOffset, RangeBegin, RangeSize); } }; @@ -1387,14 +1413,14 @@ void RewriteInstance::updateFunctionRanges() { addDebugArangesEntry(Function.getAddress(), Function.getAddress(), Size); - ArangesWriter.AddRange(Function, Function.getAddress(), Size); + RangesSectionsWriter.AddRange(&Function, Function.getAddress(), Size); if (Function.isSimple() && Function.cold().getImageSize()) { addDebugArangesEntry(Function.getAddress(), Function.cold().getAddress(), Function.cold().getImageSize()); - ArangesWriter.AddRange(Function, - Function.cold().getAddress(), - Function.cold().getImageSize()); + 
RangesSectionsWriter.AddRange(&Function, + Function.cold().getAddress(), + Function.cold().getImageSize()); } } } @@ -1412,9 +1438,9 @@ void RewriteInstance::generateDebugRanges() { auto Writer = MAB->createObjectWriter(OS); if (RT == RANGES) { - ArangesWriter.WriteRangesSection(Writer); + RangesSectionsWriter.WriteRangesSection(Writer); } else { - ArangesWriter.WriteArangesSection(Writer); + RangesSectionsWriter.WriteArangesSection(Writer); } const auto &DebugRangesContents = OS.str(); @@ -1906,6 +1932,15 @@ void RewriteInstance::rewriteFile() { Out->keep(); } +void RewriteInstance::updateLexicalBlocksAddresses() { + for (auto &LB : BC->LexicalBlocks) { + for (const auto &Range : LB.getAbsoluteAddressRanges()) { + RangesSectionsWriter.AddRange(&LB, Range.first, + Range.second - Range.first); + } + } +} + void RewriteInstance::computeLineTableOffsets() { const auto LineSection = BC->Ctx->getObjectFileInfo()->getDwarfLineSection(); @@ -1969,6 +2004,8 @@ void RewriteInstance::updateDebugInfo() { updateFunctionRanges(); + updateLexicalBlocksAddresses(); + generateDebugRanges(); auto &DebugInfoSI = SectionMM->NoteSectionInfo[".debug_info"]; @@ -1981,8 +2018,8 @@ void RewriteInstance::updateDebugInfo() { if (RangesFieldOffset) { DEBUG(dbgs() << "BOLT-DEBUG: adding relocation for DW_AT_ranges for " << "compile unit in .debug_info\n"); - const auto RSOI = ArangesWriter.getRangesOffsetCUMap().find(CUID); - if (RSOI != ArangesWriter.getRangesOffsetCUMap().end()) { + const auto RSOI = RangesSectionsWriter.getRangesOffsetCUMap().find(CUID); + if (RSOI != RangesSectionsWriter.getRangesOffsetCUMap().end()) { auto Offset = RSOI->second; DebugInfoSI.PendingRelocs.emplace_back( SectionInfo::Reloc{RangesFieldOffset, 4, 0, @@ -1994,87 +2031,104 @@ void RewriteInstance::updateDebugInfo() { } } - updateDWARFSubprogramAddressRanges(); + updateDWARFAddressRanges(); } -void RewriteInstance::updateDWARFSubprogramAddressRanges() { - auto AbbrevPatcher = llvm::make_unique(); - auto 
DebugInfoPatcher = llvm::make_unique(); +void RewriteInstance::updateDWARFAddressRanges() { + SectionPatchers[".debug_abbrev"] = llvm::make_unique(); + SectionPatchers[".debug_info"] = llvm::make_unique(); - // For each simple function, we update its pointer in .debug_info to point to - // its uptated address ranges. If the function was contiguous, also update its - // abbreviation. + // Update address ranges of simple functions. for (const auto &BFI : BinaryFunctions) { const auto &Function = BFI.second; - if (!Function.isSimple()) { - continue; + if (Function.isSimple()) { + updateDWARFObjectAddressRanges( + Function.getAddressRangesOffset() + DebugRangesSize, + Function.getSubprocedureDIECompileUnit(), + Function.getSubprocedureDIE()); } - auto FunctionDIE = Function.getSubprocedureDIE(); - // If we didn't find the DIE associated to the function or the DIE doesn't - // have an abbreviation, give up on this function. - if (!(FunctionDIE && FunctionDIE->getAbbreviationDeclarationPtr())) - continue; - auto DebugRangesOffset = Function.getAddressRangesOffset() + - DebugRangesSize; - const auto *AbbreviationDecl = FunctionDIE->getAbbreviationDeclarationPtr(); - assert(AbbreviationDecl && - "Function DIE doesn't have an abbreviation: not supported yet."); - auto AbbrevCode = AbbreviationDecl->getCode(); - const auto *Unit = Function.getSubprocedureDIECompileUnit(); - - if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_ranges) != -1U) { - // Case 1: The function was already non-contiguous and had DW_AT_ranges. - // In this case we simply need to update the value of DW_AT_ranges. + } + + // Update address ranges of lexical blocks. 
+ for (const auto &LB : BC->LexicalBlocks) { + updateDWARFObjectAddressRanges( + LB.getAddressRangesOffset() + DebugRangesSize, + LB.getCompileUnit(), + LB.getDIE()); + } +} + +void RewriteInstance::updateDWARFObjectAddressRanges( + uint32_t DebugRangesOffset, + const DWARFUnit *Unit, + const DWARFDebugInfoEntryMinimal *DIE) { + + // Some objects don't have an associated DIE and cannot be updated (such as + // compiler-generated functions). + if (!DIE) { + return; + } + + auto DebugInfoPatcher = + static_cast(SectionPatchers[".debug_info"].get()); + auto AbbrevPatcher = + static_cast(SectionPatchers[".debug_abbrev"].get()); + + assert(DebugInfoPatcher && AbbrevPatcher && "Patchers not initialized."); + + const auto *AbbreviationDecl = DIE->getAbbreviationDeclarationPtr(); + assert(AbbreviationDecl && + "Object's DIE doesn't have an abbreviation: not supported yet."); + auto AbbrevCode = AbbreviationDecl->getCode(); + + if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_ranges) != -1U) { + // Case 1: The object was already non-contiguous and had DW_AT_ranges. + // In this case we simply need to update the value of DW_AT_ranges. + DWARFFormValue FormValue; + uint32_t AttrOffset = -1U; + DIE->getAttributeValue(Unit, dwarf::DW_AT_ranges, FormValue, &AttrOffset); + DebugInfoPatcher->addLE32Patch(AttrOffset, DebugRangesOffset); + } else { + // Case 2: The object has both DW_AT_low_pc and DW_AT_high_pc. + // We require the compiler to put both attributes one after the other + // for our approach to work. low_pc and high_pc both occupy 8 bytes + // as we're dealing with a 64-bit ELF. We basically change low_pc to + // DW_AT_ranges and high_pc to DW_AT_producer. ranges spans only 4 bytes + // in 32-bit DWARF, which we assume to be used, which leaves us with 12 + // more bytes. We then set the value of DW_AT_producer as an arbitrary + // 12-byte string that fills the remaining space and leaves the rest of + // the abbreviation layout unchanged. 
+ if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_low_pc) != -1U && + AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_high_pc) != -1U) { + uint32_t LowPCOffset = -1U; + uint32_t HighPCOffset = -1U; DWARFFormValue FormValue; - uint32_t RangesOffset = -1U; - FunctionDIE->getAttributeValue(Unit, dwarf::DW_AT_ranges, FormValue, - &RangesOffset); - DebugInfoPatcher->addLE32Patch(RangesOffset, DebugRangesOffset); + DIE->getAttributeValue(Unit, dwarf::DW_AT_low_pc, FormValue, + &LowPCOffset); + DIE->getAttributeValue(Unit, dwarf::DW_AT_high_pc, FormValue, + &HighPCOffset); + + AbbrevPatcher->addAttributePatch(Unit, + AbbrevCode, + dwarf::DW_AT_low_pc, + dwarf::DW_AT_ranges, + dwarf::DW_FORM_sec_offset); + AbbrevPatcher->addAttributePatch(Unit, + AbbrevCode, + dwarf::DW_AT_high_pc, + dwarf::DW_AT_producer, + dwarf::DW_FORM_string); + assert(LowPCOffset != -1U && LowPCOffset + 8 == HighPCOffset && + "We depend on the compiler putting high_pc right after low_pc."); + DebugInfoPatcher->addLE32Patch(LowPCOffset, DebugRangesOffset); + std::string ProducerString{"LLVM-BOLT"}; + ProducerString.resize(12, ' '); + ProducerString.back() = '\0'; + + DebugInfoPatcher->addBinaryPatch(LowPCOffset + 4, ProducerString); } else { - // Case 2: The function has both DW_AT_low_pc and DW_AT_high_pc. - // We require the compiler to put both attributes one after the other - // for our approach to work. low_pc and high_pc both occupy 8 bytes - // as we're dealing with a 64-bit ELF. We basically change low_pc to - // DW_AT_ranges and high_pc to DW_AT_producer. ranges spans only 4 bytes - // in 32-bit DWARF, which we assume to be used, which leaves us with 12 - // more bytes. We then set the value of DW_AT_producer as an arbitrary - // 12-byte string that fills the remaining space and leaves the rest of - // the abbreviation layout unchanged. 
- if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_low_pc) != -1U && - AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_high_pc) != -1U) { - uint32_t LowPCOffset = -1U; - uint32_t HighPCOffset = -1U; - DWARFFormValue FormValue; - FunctionDIE->getAttributeValue(Unit, dwarf::DW_AT_low_pc, FormValue, - &LowPCOffset); - FunctionDIE->getAttributeValue(Unit, dwarf::DW_AT_high_pc, FormValue, - &HighPCOffset); - - AbbrevPatcher->addAttributePatch(Unit, - AbbrevCode, - dwarf::DW_AT_low_pc, - dwarf::DW_AT_ranges, - dwarf::DW_FORM_sec_offset); - AbbrevPatcher->addAttributePatch(Unit, - AbbrevCode, - dwarf::DW_AT_high_pc, - dwarf::DW_AT_producer, - dwarf::DW_FORM_string); - assert(LowPCOffset != -1U && LowPCOffset + 8 == HighPCOffset && - "We depend on the compiler putting high_pc right after low_pc."); - DebugInfoPatcher->addLE32Patch(LowPCOffset, DebugRangesOffset); - std::string ProducerString{"LLVM-BOLT"}; - ProducerString.resize(12, ' '); - ProducerString.back() = '\0'; - - DebugInfoPatcher->addBinaryPatch(LowPCOffset + 4, ProducerString); - } else { - DEBUG(errs() << "BOLT-WARNING: Cannot update ranges for function " - << Function.getName() << "\n"); - } + DEBUG(errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x" + << Twine::utohexstr(DIE->getOffset()) << "\n"); } } - - SectionPatchers[".debug_abbrev"].reset(AbbrevPatcher.release()); - SectionPatchers[".debug_info"].reset(DebugInfoPatcher.release()); } diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 2ce706436393..7871f7e960cf 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -15,7 +15,7 @@ #define LLVM_TOOLS_LLVM_BOLT_REWRITE_INSTANCE_H #include "BinaryPatcher.h" -#include "DebugArangesWriter.h" +#include "DebugRangesSectionsWriter.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/Object/ELFObjectFile.h" @@ -200,11 +200,26 @@ class RewriteInstance { /// Update internal function ranges after functions have been 
written. void updateFunctionRanges(); + /// Update lexical blocks ranges after optimizations. + void updateLexicalBlocksAddresses(); + /// Generate new contents for .debug_ranges and .debug_aranges section. void generateDebugRanges(); - /// Patches the binary for function address ranges to be updated. - void updateDWARFSubprogramAddressRanges(); + /// Patches the binary for DWARF address ranges (e.g. in functions and lexical + /// blocks) to be updated. + void updateDWARFAddressRanges(); + + /// Patches the binary for an object's address ranges to be updated. + /// The object can be a anything that has associated address ranges via either + /// DW_AT_low/high_pc or DW_AT_ranges (i.e. functions, lexical blocks, etc). + /// \p DebugRangesOffset is the offset in .debug_ranges of the object's + /// new address ranges in the output binary. + /// \p Unit Compile uniit the object belongs to. + /// \p DIE is the object's DIE in the input binary. + void updateDWARFObjectAddressRanges(uint32_t DebugRangesOffset, + const DWARFUnit *Unit, + const DWARFDebugInfoEntryMinimal *DIE); /// Return file offset corresponding to a given virtual address. uint64_t getFileOffsetFor(uint64_t Address) { @@ -251,9 +266,9 @@ class RewriteInstance { /// Store all functions seen in the binary, sorted by address. std::map BinaryFunctions; - /// Stores and serializes information that will be put into - /// the .debug_aranges DWARF section. - DebugArangesWriter ArangesWriter; + /// Stores and serializes information that will be put into the .debug_ranges + /// and .debug_aranges DWARF sections. + DebugRangesSectionsWriter RangesSectionsWriter; /// Patchers used to apply simple changes to sections of the input binary. /// Maps section name -> patcher. From cf93b9a22ea6f1fdafa75352a07de580f0ec35c5 Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Fri, 1 Apr 2016 15:09:34 -0700 Subject: [PATCH 087/904] Don't skip non-simple functions on function address ranges update. 
Summary: This fixes a problem in which bolt was generating a malformed .debug_info section on the bzip2 binary. The bug was the following: - A simple and a non-simple function shared an abbreviation - The abbreviation was patched to contain DW_AT_ranges because of the simple function - The non-simple function's data was not updated, but then it didn't match the layout expected by the abbreviation anymore And because we were already creating an address ranges list in .debug_ranges even for non-simple functions, it doesn't make sense not to use it anyway. (cherry picked from commit bf6861c8e5136c5e8424ab7afab4a46d5fa933e3) --- bolt/RewriteInstance.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 87d0aa958e4e..c3499a8437ba 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2038,15 +2038,13 @@ void RewriteInstance::updateDWARFAddressRanges() { SectionPatchers[".debug_abbrev"] = llvm::make_unique(); SectionPatchers[".debug_info"] = llvm::make_unique(); - // Update address ranges of simple functions. + // Update address ranges of functions. for (const auto &BFI : BinaryFunctions) { const auto &Function = BFI.second; - if (Function.isSimple()) { - updateDWARFObjectAddressRanges( - Function.getAddressRangesOffset() + DebugRangesSize, - Function.getSubprocedureDIECompileUnit(), - Function.getSubprocedureDIE()); - } + updateDWARFObjectAddressRanges( + Function.getAddressRangesOffset() + DebugRangesSize, + Function.getSubprocedureDIECompileUnit(), + Function.getSubprocedureDIE()); } // Update address ranges of lexical blocks. From f5a2093bab8dda032e29a01d01d9c62841b7c6df Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 31 Mar 2016 16:38:49 -0700 Subject: [PATCH 088/904] Re-enable conditional function splitting under an option.
Summary: Add a parameter value to "-split-functions=" option to allow splitting only when the function is too large to fit: 0 - never split 1 - split if too large to fit 2 - always split We may use this option when the profile data is not very precise. In that case excessive splitting may increase iTLB misses. (cherry picked from commit f5ea08dfff83b6832472292590c838eae58e7d4e) --- bolt/BinaryFunction.h | 9 ++++++- bolt/RewriteInstance.cpp | 51 +++++++++++++++++++++++++++++++++++++--- bolt/RewriteInstance.h | 10 ++++++++ 3 files changed, 66 insertions(+), 4 deletions(-) diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 403efeb6699b..6d1231ea3b19 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -58,6 +58,13 @@ class BinaryFunction : public AddressRangesOwner { Assembled, /// Function has been assembled in memory }; + /// Settings for splitting function bodies into hot/cold partitions. + enum SplittingType : char { + ST_NONE = 0, /// Do not split functions + ST_LARGE = 1, /// Only split functions that exceed maximum size + ST_ALL =2, /// Split all functions + }; + /// Choose which strategy should the block layout heuristic prioritize when /// facing conflicting goals. enum LayoutType : char { @@ -73,7 +80,7 @@ class BinaryFunction : public AddressRangesOwner { /// paper (PLDI '90) about block reordering, trying to minimize branch /// mispredictions. LT_OPTIMIZE_BRANCH, - /// LT_OPTIMIZE_CACHE pigbacks on the idea from Ispike paper (CGO '04) + /// LT_OPTIMIZE_CACHE piggybacks on the idea from Ispike paper (CGO '04) /// that suggests putting frequently executed chains first in the layout. 
LT_OPTIMIZE_CACHE, }; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index c3499a8437ba..d42518402e98 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -95,9 +95,17 @@ EliminateUnreachable("eliminate-unreachable", cl::desc("eliminate unreachable code"), cl::Optional); -static cl::opt +static cl::opt SplitFunctions("split-functions", - cl::desc("split functions into hot and cold distinct regions"), + cl::desc("split functions into hot and cold regions"), + cl::init(BinaryFunction::ST_NONE), + cl::values(clEnumValN(BinaryFunction::ST_NONE, "0", + "do not split any function"), + clEnumValN(BinaryFunction::ST_LARGE, "1", + "split if function is too large to fit"), + clEnumValN(BinaryFunction::ST_ALL, "2", + "split all functions"), + clEnumValEnd), cl::Optional); static cl::opt @@ -531,6 +539,22 @@ void RewriteInstance::run() { readFunctionDebugInfo(); runOptimizationPasses(); emitFunctions(); + + if (opts::SplitFunctions == BinaryFunction::ST_LARGE && + splitLargeFunctions()) { + // Emit again because now some functions have been split + outs() << "BOLT: split-functions: starting pass 2...\n"; + reset(); + discoverStorage(); + readSpecialSections(); + discoverFileObjects(); + readDebugInfo(); + disassembleFunctions(); + readFunctionDebugInfo(); + runOptimizationPasses(); + emitFunctions(); + } + updateDebugInfo(); // Copy allocatable part of the input. 
@@ -939,7 +963,10 @@ void RewriteInstance::runOptimizationPasses() { } if (opts::ReorderBlocks != BinaryFunction::LT_NONE) { - BFI.second.modifyLayout(opts::ReorderBlocks, opts::SplitFunctions); + bool ShouldSplit = + (opts::SplitFunctions == BinaryFunction::ST_ALL) || + ToSplit.find(BFI.first) != ToSplit.end(); + BFI.second.modifyLayout(opts::ReorderBlocks, ShouldSplit); if (opts::PrintAll || opts::PrintReordered) Function.print(errs(), "after reordering blocks", true); } @@ -1394,6 +1421,24 @@ void RewriteInstance::emitFunctions() { TempOut->keep(); } +bool RewriteInstance::splitLargeFunctions() { + bool Changed = false; + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + + // Ignore this function if we failed to map it to the output binary + if (Function.getImageAddress() == 0 || Function.getImageSize() == 0) + continue; + + if (Function.getImageSize() <= Function.getMaxSize()) + continue; + + ToSplit.insert(BFI.first); + Changed = true; + } + return Changed; +} + void RewriteInstance::updateFunctionRanges() { auto addDebugArangesEntry = [&](uint64_t OriginalFunctionAddress, uint64_t RangeBegin, diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 7871f7e960cf..dde0c4fc0dcd 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -160,6 +160,13 @@ class RewriteInstance { /// Update debug information in the file for re-written code. void updateDebugInfo(); + /// Check which functions became larger than their original version and + /// annotate function splitting information. + /// + /// Returns true if any function was annotated, requiring us to perform a + /// second pass to emit those functions in two parts. + bool splitLargeFunctions(); + /// Rewrite back all functions (hopefully optimized) that fit in the original /// memory footprint for that function. 
If the function is now larger and does /// not fit in the binary, reject it and preserve the original version of the /// function. @@ -293,6 +300,9 @@ class RewriteInstance { /// Size of the .debug_ranges section on input. uint32_t DebugRangesSize{0}; + /// Keep track of which functions to split in a second pass. + std::set ToSplit; + /// Total hotness score according to profiling data for this binary. uint64_t TotalScore{0}; From 97a874bf9bc1c13375d38ccb0a6118c28053be9d Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Fri, 1 Apr 2016 11:37:28 -0700 Subject: [PATCH 089/904] Update DWARF location lists after optimization. Summary: Update DWARF location lists in .debug_loc and pointers to them in .debug_info so that gdb can print variables which change location during their lifetime. The following changes were made: - Refactored BasicBlockOffsetRanges to allow ranges to be tied to binary information (so that we can reuse it for location lists) - Implemented range compression optimization in BasicBlockOffsetRanges (needed otherwise too much data was being generated). 
- Added representation for location lists (LocationList.h, BinaryContext.h) - Implemented .debug_loc serializer that keeps the updated offsets (DebugLocWriter.{h,cpp}) - After disassembly, traverse entries in .debug_loc and save them in context (BinaryContext.cpp) - After optimizations, serialize .debug_loc and update pointers in .debug_info (RewriteInstance.cpp) (cherry picked from commit b619f915c673eb9f8d4e13cb08541c845f00836f) --- bolt/BasicBlockOffsetRanges.cpp | 38 +++++++++++--- bolt/BasicBlockOffsetRanges.h | 36 ++++++++----- bolt/BinaryContext.cpp | 15 ++++++ bolt/BinaryContext.h | 5 ++ bolt/CMakeLists.txt | 1 + bolt/DebugLocWriter.cpp | 45 +++++++++++++++++ bolt/DebugLocWriter.h | 53 ++++++++++++++++++++ bolt/LexicalBlock.h | 11 +++- bolt/LocationList.h | 61 ++++++++++++++++++++++ bolt/RewriteInstance.cpp | 89 +++++++++++++++++++++++++++++++-- bolt/RewriteInstance.h | 13 +++++ 11 files changed, 344 insertions(+), 23 deletions(-) create mode 100644 bolt/DebugLocWriter.cpp create mode 100644 bolt/DebugLocWriter.h create mode 100644 bolt/LocationList.h diff --git a/bolt/BasicBlockOffsetRanges.cpp b/bolt/BasicBlockOffsetRanges.cpp index 9cb507facd9e..445da2bc4ef1 100644 --- a/bolt/BasicBlockOffsetRanges.cpp +++ b/bolt/BasicBlockOffsetRanges.cpp @@ -12,13 +12,15 @@ #include "BasicBlockOffsetRanges.h" #include "BinaryBasicBlock.h" #include "BinaryFunction.h" +#include namespace llvm { namespace bolt { void BasicBlockOffsetRanges::addAddressRange(BinaryFunction &Function, uint64_t BeginAddress, - uint64_t EndAddress) { + uint64_t EndAddress, + const BinaryData *Data) { auto FirstBB = Function.getBasicBlockContainingOffset( BeginAddress - Function.getAddress()); assert(FirstBB && "No basic blocks in the function intersect given range."); @@ -40,13 +42,14 @@ void BasicBlockOffsetRanges::addAddressRange(BinaryFunction &Function, BBAddressRange{ BB, static_cast(InternalAddressRangeBegin - BBAddress), - static_cast(InternalAddressRangeEnd - BBAddress)}); + 
static_cast(InternalAddressRangeEnd - BBAddress), + Data}); } } -std::vector> +std::vector BasicBlockOffsetRanges::getAbsoluteAddressRanges() const { - std::vector> AbsoluteRanges; + std::vector AbsoluteRanges; for (const auto &BBAddressRange : AddressRanges) { auto BBOutputAddressRange = BBAddressRange.BasicBlock->getOutputAddressRange(); @@ -61,9 +64,32 @@ BasicBlockOffsetRanges::getAbsoluteAddressRanges() const { BBFunction->getBasicBlockOriginalSize(BBAddressRange.BasicBlock)) ? BBOutputAddressRange.second : (BBOutputAddressRange.first + BBAddressRange.RangeEndOffset); - AbsoluteRanges.emplace_back(NewRangeBegin, NewRangeEnd); + AbsoluteRanges.emplace_back(AbsoluteRange{NewRangeBegin, NewRangeEnd, + BBAddressRange.Data}); } - return AbsoluteRanges; + if (AbsoluteRanges.empty()) { + return AbsoluteRanges; + } + // Merge adjacent ranges that have the same data. + std::sort(AbsoluteRanges.begin(), AbsoluteRanges.end(), + [](const AbsoluteRange &A, const AbsoluteRange &B) { + return A.Begin < B.Begin; + }); + decltype(AbsoluteRanges) MergedRanges; + + MergedRanges.emplace_back(AbsoluteRanges[0]); + for (unsigned I = 1, S = AbsoluteRanges.size(); I != S; ++I) { + // If this range complements the last one and they point to the same + // (possibly null) data, merge them instead of creating another one. 
+ if (AbsoluteRanges[I].Begin == MergedRanges.back().End && + AbsoluteRanges[I].Data == MergedRanges.back().Data) { + MergedRanges.back().End = AbsoluteRanges[I].End; + } else { + MergedRanges.emplace_back(AbsoluteRanges[I]); + } + } + + return MergedRanges; } } // namespace bolt diff --git a/bolt/BasicBlockOffsetRanges.h b/bolt/BasicBlockOffsetRanges.h index f9221ff617e8..51dac4dc9e50 100644 --- a/bolt/BasicBlockOffsetRanges.h +++ b/bolt/BasicBlockOffsetRanges.h @@ -16,7 +16,9 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_BASIC_BLOCK_OFFSET_RANGES_H #define LLVM_TOOLS_LLVM_BOLT_BASIC_BLOCK_OFFSET_RANGES_H +#include "llvm/ADT/SmallVector.h" #include +#include #include #include @@ -27,6 +29,26 @@ class BinaryFunction; class BinaryBasicBlock; class BasicBlockOffsetRanges { +public: + typedef SmallVectorImpl BinaryData; + struct AbsoluteRange { + uint64_t Begin; + uint64_t End; + const BinaryData *Data; + }; + + /// Add range [BeginAddress, EndAddress) to the address ranges list. + /// \p Function is the function that contains the given address range. + void addAddressRange(BinaryFunction &Function, + uint64_t BeginAddress, + uint64_t EndAddress, + const BinaryData *Data = nullptr); + + /// Returns the list of absolute addresses calculated using the output address + /// of the basic blocks, i.e. the input ranges updated after basic block + /// addresses might have changed, together with the data associated to them. + std::vector getAbsoluteAddressRanges() const; + private: /// An address range inside one basic block. struct BBAddressRange { @@ -35,21 +57,11 @@ class BasicBlockOffsetRanges { uint16_t RangeBeginOffset; /// (Exclusive) end of the range counting from BB's start address. uint16_t RangeEndOffset; + /// Binary data associated with this range. + const BinaryData *Data; }; std::vector AddressRanges; - -public: - /// Add range [BeginAddress, EndAddress) to the address ranges list. - /// \p Function is the function that contains the given address range. 
- void addAddressRange(BinaryFunction &Function, - uint64_t BeginAddress, - uint64_t EndAddress); - - /// Returns the list of absolute addresses calculated using the output address - /// of the basic blocks, i.e. the input ranges updated after basic block - /// addresses might have changed. - std::vector> getAbsoluteAddressRanges() const; }; } // namespace bolt diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 00e4ffce1ea4..1cbca4e193b1 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -157,6 +157,21 @@ void BinaryContext::preprocessFunctionDebugInfo( findLexicalBlocks(CU.get(), CU->getUnitDIE(false), BinaryFunctions, LexicalBlocks); } + + // Iterate over location lists and save them in LocationLists. + auto DebugLoc = DwCtx->getDebugLoc(); + for (const auto &DebugLocEntry : DebugLoc->getLocationLists()) { + LocationLists.emplace_back(DebugLocEntry.Offset); + auto &LocationList = LocationLists.back(); + for (const auto &Location : DebugLocEntry.Entries) { + auto *Function = getBinaryFunctionContainingAddress(Location.Begin, + BinaryFunctions); + if (Function && Function->isSimple()) { + LocationList.addLocation(&Location.Loc, *Function, Location.Begin, + Location.End); + } + } + } } } // namespace bolt diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 8c838b3c2235..d93ecff598d1 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -15,6 +15,7 @@ #define LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H #include "LexicalBlock.h" +#include "LocationList.h" #include "llvm/ADT/Triple.h" #include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" @@ -36,6 +37,7 @@ #include #include #include +#include namespace llvm { namespace bolt { @@ -73,6 +75,9 @@ class BinaryContext { /// List of DWARF lexical blocks in .debug_info. std::vector LexicalBlocks; + /// List of DWARF location lists in .debug_loc. 
+ std::vector LocationLists; + std::unique_ptr Ctx; std::unique_ptr DwCtx; diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index c66cb442fbfc..7f7343a41c06 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -20,6 +20,7 @@ add_llvm_tool(llvm-bolt BinaryPatcher.cpp DataReader.cpp DebugLineTableRowRef.cpp + DebugLocWriter.cpp DebugRangesSectionsWriter.cpp Exceptions.cpp RewriteInstance.cpp diff --git a/bolt/DebugLocWriter.cpp b/bolt/DebugLocWriter.cpp new file mode 100644 index 000000000000..e2c0e84dbbf0 --- /dev/null +++ b/bolt/DebugLocWriter.cpp @@ -0,0 +1,45 @@ +//===-- DebugLocWriter.cpp - Writes the DWARF .debug_loc section. ----------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "DebugLocWriter.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCObjectWriter.h" +#include + +namespace llvm { +namespace bolt { + +void DebugLocWriter::write(const LocationList &LocList, + MCObjectWriter *Writer) { + // Reference: DWARF 4 specification section 7.7.3. + UpdatedOffsets[LocList.getOriginalOffset()] = SectionOffset; + auto AbsoluteRanges = LocList.getAbsoluteAddressRanges(); + + for (const auto &Entry : LocList.getAbsoluteAddressRanges()) { + Writer->writeLE64(Entry.Begin); + Writer->writeLE64(Entry.End); + assert(Entry.Data && "Entry with null location expression."); + Writer->writeLE16(Entry.Data->size()); + + // Need to convert binary data from unsigned char to char. 
+ Writer->writeBytes( + StringRef(reinterpret_cast(Entry.Data->data()), + Entry.Data->size())); + + SectionOffset += 2 * 8 + 2 + Entry.Data->size(); + } + Writer->writeLE64(0); + Writer->writeLE64(0); + SectionOffset += 2 * 8; +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/DebugLocWriter.h b/bolt/DebugLocWriter.h new file mode 100644 index 000000000000..c0c60fd8c9ee --- /dev/null +++ b/bolt/DebugLocWriter.h @@ -0,0 +1,53 @@ +//===-- DebugLocWriter.h - Writes the DWARF .debug_loc section -------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Class that serializes the .debug_loc section given LocationLists. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_DEBUG_LOC_WRITER_H +#define LLVM_TOOLS_LLVM_BOLT_DEBUG_LOC_WRITER_H + +#include "LocationList.h" +#include +#include + +namespace llvm { + +class MCObjectWriter; + +namespace bolt { + +class DebugLocWriter { +public: + /// Writes the given location list to the writer. + void write(const LocationList &LocList, MCObjectWriter *Writer); + + using UpdatedOffsetMapType = std::map; + + /// Returns mapping from offsets in the input .debug_loc to offsets in the + /// output .debug_loc section with the corresponding updated location list + /// entry. + const UpdatedOffsetMapType &getUpdatedLocationListOffsets() const { + return UpdatedOffsets; + } + +private: + /// Current offset in the section (updated as new entries are written). + uint32_t SectionOffset{0}; + + /// Map from input offsets to output offsets for location lists that were + /// updated, generated after write(). 
+ UpdatedOffsetMapType UpdatedOffsets; +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/LexicalBlock.h b/bolt/LexicalBlock.h index a7740f13563e..fd085e62b277 100644 --- a/bolt/LexicalBlock.h +++ b/bolt/LexicalBlock.h @@ -35,7 +35,7 @@ class LexicalBlock : public AddressRangesOwner { const DWARFDebugInfoEntryMinimal *DIE) : CU(CU), DIE(DIE) { } - // Add range [BeginAddress, EndAddress) to lexical block. + /// Add range [BeginAddress, EndAddress) to lexical block. void addAddressRange(BinaryFunction &Function, uint64_t BeginAddress, uint64_t EndAddress) { @@ -43,7 +43,14 @@ class LexicalBlock : public AddressRangesOwner { } std::vector> getAbsoluteAddressRanges() const { - return BBOffsetRanges.getAbsoluteAddressRanges(); + auto AddressRangesWithData = BBOffsetRanges.getAbsoluteAddressRanges(); + std::vector> AddressRanges( + AddressRangesWithData.size()); + for (unsigned I = 0, S = AddressRanges.size(); I != S; ++I) { + AddressRanges[I] = std::make_pair(AddressRangesWithData[I].Begin, + AddressRangesWithData[I].End); + } + return AddressRanges; } void setAddressRangesOffset(uint32_t Offset) { AddressRangesOffset = Offset; } diff --git a/bolt/LocationList.h b/bolt/LocationList.h new file mode 100644 index 000000000000..7cf4fab14507 --- /dev/null +++ b/bolt/LocationList.h @@ -0,0 +1,61 @@ +//===--- LocationList.h - DWARF location lists ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Represents DWARF location lists, maintaining their list of location +// expressions and the address ranges in which they are valid to be updated in +// the output debugging information. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_LOCATION_LIST_H +#define LLVM_TOOLS_LLVM_BOLT_LOCATION_LIST_H + +#include "BasicBlockOffsetRanges.h" + +namespace llvm { + +class DWARFCompileUnit; +class DWARFDebugInfoEntryMinimal; + +namespace bolt { + +class BinaryBasicBlock; + +class LocationList { +public: + LocationList(uint32_t Offset) : DebugLocOffset(Offset) { } + + /// Add a location expression that is valid in [BeginAddress, EndAddress) + /// within Function to location list. + void addLocation(const BasicBlockOffsetRanges::BinaryData *Expression, + BinaryFunction &Function, + uint64_t BeginAddress, + uint64_t EndAddress) { + BBOffsetRanges.addAddressRange(Function, BeginAddress, EndAddress, + Expression); + } + + std::vector + getAbsoluteAddressRanges() const { + return BBOffsetRanges.getAbsoluteAddressRanges(); + } + + uint32_t getOriginalOffset() const { return DebugLocOffset; } + +private: + BasicBlockOffsetRanges BBOffsetRanges; + + // Offset of this location list in the input .debug_loc section. + uint32_t DebugLocOffset; +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index d42518402e98..77592c515443 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -732,6 +732,8 @@ void RewriteInstance::readSpecialSections() { DebugLineSize = Section.getSize(); } else if (SectionName == ".debug_ranges") { DebugRangesSize = Section.getSize(); + } else if (SectionName == ".debug_loc") { + DebugLocSize = Section.getSize(); } } @@ -1502,6 +1504,85 @@ void RewriteInstance::generateDebugRanges() { } } +void RewriteInstance::updateLocationLists() { + // Write new contents to .debug_loc. 
+ SmallVector DebugLocBuffer; + raw_svector_ostream OS(DebugLocBuffer); + + auto MAB = BC->TheTarget->createMCAsmBackend(*BC->MRI, BC->TripleName, ""); + auto Writer = MAB->createObjectWriter(OS); + + DebugLocWriter LocationListsWriter; + + for (const auto &Loc : BC->LocationLists) { + LocationListsWriter.write(Loc, Writer); + } + + const auto &DebugLocContents = OS.str(); + + // Free'd by SectionMM. + uint8_t *SectionData = new uint8_t[DebugLocContents.size()]; + memcpy(SectionData, DebugLocContents.data(), DebugLocContents.size()); + + SectionMM->NoteSectionInfo[".debug_loc"] = SectionInfo( + reinterpret_cast(SectionData), + DebugLocContents.size(), + /*Alignment=*/0, + /*IsCode=*/false, + /*IsReadOnly=*/true); + + // For each CU, update pointers into .debug_loc. + for (const auto &CU : BC->DwCtx->compile_units()) { + updateLocationListPointers( + CU.get(), + CU->getUnitDIE(false), + LocationListsWriter.getUpdatedLocationListOffsets()); + } +} + +void RewriteInstance::updateLocationListPointers( + const DWARFUnit *Unit, + const DWARFDebugInfoEntryMinimal *DIE, + const std::map &UpdatedOffsets) { + // Stop if we're in a non-simple function, which will not be rewritten. + auto Tag = DIE->getTag(); + if (Tag == dwarf::DW_TAG_subprogram) { + uint64_t LowPC = -1ULL, HighPC = -1ULL; + DIE->getLowAndHighPC(Unit, LowPC, HighPC); + if (LowPC != -1ULL) { + auto It = BinaryFunctions.find(LowPC); + if (It != BinaryFunctions.end() && !It->second.isSimple()) + return; + } + } + // If the DIE has a DW_AT_location attribute with a section offset, update it. 
+ DWARFFormValue Value; + uint32_t AttrOffset; + if (DIE->getAttributeValue(Unit, dwarf::DW_AT_location, Value, &AttrOffset) && + (Value.isFormClass(DWARFFormValue::FC_Constant) || + Value.isFormClass(DWARFFormValue::FC_SectionOffset))) { + uint64_t DebugLocOffset = -1ULL; + if (Value.isFormClass(DWARFFormValue::FC_SectionOffset)) { + DebugLocOffset = Value.getAsSectionOffset().getValue(); + } else if (Value.isFormClass(DWARFFormValue::FC_Constant)) { // DWARF 3 + DebugLocOffset = Value.getAsUnsignedConstant().getValue(); + } + + auto It = UpdatedOffsets.find(DebugLocOffset); + if (It != UpdatedOffsets.end()) { + auto DebugInfoPatcher = + static_cast( + SectionPatchers[".debug_info"].get()); + DebugInfoPatcher->addLE32Patch(AttrOffset, It->second + DebugLocSize); + } + } + + // Recursively visit children. + for (auto Child = DIE->getFirstChild(); Child; Child = Child->getSibling()) { + updateLocationListPointers(Unit, Child, UpdatedOffsets); + } +} + void RewriteInstance::patchELFPHDRTable() { auto ELF64LEFile = dyn_cast(InputFile); if (!ELF64LEFile) { @@ -2047,12 +2128,17 @@ void RewriteInstance::updateDebugInfo() { if (!opts::UpdateDebugSections) return; + SectionPatchers[".debug_abbrev"] = llvm::make_unique(); + SectionPatchers[".debug_info"] = llvm::make_unique(); + updateFunctionRanges(); updateLexicalBlocksAddresses(); generateDebugRanges(); + updateLocationLists(); + auto &DebugInfoSI = SectionMM->NoteSectionInfo[".debug_info"]; for (const auto &CU : BC->DwCtx->compile_units()) { const auto CUID = CU->getOffset(); @@ -2080,9 +2166,6 @@ void RewriteInstance::updateDebugInfo() { } void RewriteInstance::updateDWARFAddressRanges() { - SectionPatchers[".debug_abbrev"] = llvm::make_unique(); - SectionPatchers[".debug_info"] = llvm::make_unique(); - // Update address ranges of functions. 
for (const auto &BFI : BinaryFunctions) { const auto &Function = BFI.second; diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index dde0c4fc0dcd..ba8710d34ad8 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -15,6 +15,7 @@ #define LLVM_TOOLS_LLVM_BOLT_REWRITE_INSTANCE_H #include "BinaryPatcher.h" +#include "DebugLocWriter.h" #include "DebugRangesSectionsWriter.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" @@ -210,6 +211,9 @@ class RewriteInstance { /// Update lexical blocks ranges after optimizations. void updateLexicalBlocksAddresses(); + /// Generate new contents for .debug_loc. + void updateLocationLists(); + /// Generate new contents for .debug_ranges and .debug_aranges section. void generateDebugRanges(); @@ -228,6 +232,12 @@ class RewriteInstance { const DWARFUnit *Unit, const DWARFDebugInfoEntryMinimal *DIE); + /// Updates pointers in .debug_info to location lists in .debug_loc. + void updateLocationListPointers( + const DWARFUnit *Unit, + const DWARFDebugInfoEntryMinimal *DIE, + const std::map &UpdatedOffsets); + /// Return file offset corresponding to a given virtual address. uint64_t getFileOffsetFor(uint64_t Address) { assert(Address >= NewTextSegmentAddress && @@ -297,6 +307,9 @@ class RewriteInstance { /// Size of the .debug_line section on input. uint32_t DebugLineSize{0}; + /// Size of the .debug_loc section in input. + uint32_t DebugLocSize{0}; + /// Size of the .debug_ranges section on input. uint32_t DebugRangesSize{0}; From 8234ed8b74def08c7f176f90a6fa6c091eb4135b Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 6 Apr 2016 18:03:44 -0700 Subject: [PATCH 090/904] Only set output ranges when updating dbg info. Summary: Save processing time by setting output ranges when needed. 
(cherry picked from commit e514f900f943eccfd82b8f27478aa64c3d1e8618) --- bolt/RewriteInstance.cpp | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 77592c515443..7e977562de5b 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1397,23 +1397,25 @@ void RewriteInstance::emitFunctions() { } } - MCAsmLayout Layout( - static_cast(Streamer.get())->getAssembler()); - - for (auto &BFI : BinaryFunctions) { - auto &Function = BFI.second; - for (auto &BB : Function) { - if (!(BB.getLabel()->isDefined(false) && - BB.getEndLabel() && BB.getEndLabel()->isDefined(false))) { - continue; + if (opts::UpdateDebugSections) { + MCAsmLayout Layout( + static_cast(Streamer.get())->getAssembler()); + + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + for (auto &BB : Function) { + if (!(BB.getLabel()->isDefined(false) && + BB.getEndLabel() && BB.getEndLabel()->isDefined(false))) { + continue; + } + uint64_t BaseAddress = (BB.isCold() ? Function.cold().getAddress() + : Function.getAddress()); + uint64_t BeginAddress = + BaseAddress + Layout.getSymbolOffset(*BB.getLabel()); + uint64_t EndAddress = + BaseAddress + Layout.getSymbolOffset(*BB.getEndLabel()); + BB.setOutputAddressRange(std::make_pair(BeginAddress, EndAddress)); } - uint64_t BaseAddress = (BB.isCold() ? Function.cold().getAddress() - : Function.getAddress()); - uint64_t BeginAddress = - BaseAddress + Layout.getSymbolOffset(*BB.getLabel()); - uint64_t EndAddress = - BaseAddress + Layout.getSymbolOffset(*BB.getEndLabel()); - BB.setOutputAddressRange(std::make_pair(BeginAddress, EndAddress)); } } From 9f2f73d4b770d03d0e7cd4ed5ac24a3874956d8a Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Tue, 5 Apr 2016 19:35:45 -0700 Subject: [PATCH 091/904] Emit debug line information for non-simple functions. 
Summary: Non-simple functions aren't emitted, and thus didn't have line number information emitted. This diff emits it for those functions by extending LLVM's generation of the line number program to allow for absolute addresses (it is wholly symbolic), then iterating over the relevant line tables from the input and appending entries with absolute addresses to the line tables to be emitted. This still leaves the simple but not overwritten functions unhandled (there were 48 in HHVM in my last run). However, I think that to fix them we'd need another pass, since by the time we realize a simple function won't fit, debug line info was already written to the output. (cherry picked from commit 6b7733d458602b678a024cecb065dc0778650e5c) --- bolt/DebugRangesSectionsWriter.h | 7 ++++ bolt/RewriteInstance.cpp | 65 +++++++++++++++++++++++++++++++- bolt/RewriteInstance.h | 4 ++ 3 files changed, 75 insertions(+), 1 deletion(-) diff --git a/bolt/DebugRangesSectionsWriter.h b/bolt/DebugRangesSectionsWriter.h index dd1455ab11af..9e3dacc8a4ff 100644 --- a/bolt/DebugRangesSectionsWriter.h +++ b/bolt/DebugRangesSectionsWriter.h @@ -53,6 +53,13 @@ class DebugRangesSectionsWriter { /// Writes .debug_ranges with the added ranges to the MCObjectWriter. void WriteRangesSection(MCObjectWriter *Writer); + /// Resets the writer to a clear state. + void reset() { + CUAddressRanges.clear(); + ObjectAddressRanges.clear(); + RangesSectionOffsetCUMap.clear(); + } + /// Return mapping of CUs to offsets in .debug_ranges. 
const RangesCUMapType &getRangesOffsetCUMap() const { return RangesSectionOffsetCUMap; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 7e977562de5b..aa143dc69592 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -444,6 +444,7 @@ void RewriteInstance::reset() { Out.reset(nullptr); EHFrame = nullptr; FailedAddresses.clear(); + RangesSectionsWriter.reset(); TotalScore = 0; } @@ -566,7 +567,6 @@ void RewriteInstance::run() { // Rewrite allocatable contents and copy non-allocatable parts with mods. rewriteFile(); - } void RewriteInstance::discoverFileObjects() { @@ -1259,6 +1259,8 @@ void RewriteInstance::emitFunctions() { emitFunction(*Streamer, Function, *BC.get(), /*EmitColdPart=*/true); } + updateDebugLineInfoForNonSimpleFunctions(); + Streamer->Finish(); ////////////////////////////////////////////////////////////////////////////// @@ -1940,6 +1942,7 @@ void RewriteInstance::rewriteFile() { if (Function.getImageAddress() == 0 || Function.getImageSize() == 0) continue; + if (Function.isSplit() && (Function.cold().getImageAddress() == 0 || Function.cold().getImageSize() == 0)) continue; @@ -2260,3 +2263,63 @@ void RewriteInstance::updateDWARFObjectAddressRanges( } } } + +void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { + if (!opts::UpdateDebugSections) + return; + + auto DebugAranges = BC->DwCtx->getDebugAranges(); + assert(DebugAranges && "Need .debug_aranges in the input file."); + + for (auto It : BinaryFunctions) { + const auto &Function = It.second; + + if (Function.isSimple()) + continue; + + uint64_t Address = It.first; + uint32_t CUOffset = DebugAranges->findAddress(Address); + if (CUOffset == -1U) { + DEBUG(errs() << "BOLT-DEBUG: Function does not belong to any compile unit" + << "in .debug_aranges: " << Function.getName() << "\n"); + continue; + } + auto Unit = BC->OffsetToDwarfCU[CUOffset]; + auto LineTable = BC->DwCtx->getLineTableForUnit(Unit); + assert(LineTable && "CU without .debug_line 
info."); + + std::vector Results; + MCSectionELF *FunctionSection = + BC->Ctx->getELFSection(Function.getCodeSectionName(), + ELF::SHT_PROGBITS, + ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); + + if (LineTable->lookupAddressRange(Address, Function.getSize(), Results)) { + for (auto RowIndex : Results) { + const auto &Row = LineTable->Rows[RowIndex]; + BC->Ctx->setCurrentDwarfLoc( + Row.File, + Row.Line, + Row.Column, + (DWARF2_FLAG_IS_STMT * Row.IsStmt) | + (DWARF2_FLAG_BASIC_BLOCK * Row.BasicBlock) | + (DWARF2_FLAG_PROLOGUE_END * Row.PrologueEnd) | + (DWARF2_FLAG_EPILOGUE_BEGIN * Row.EpilogueBegin), + Row.Isa, + Row.Discriminator, + Row.Address); + + auto Loc = BC->Ctx->getCurrentDwarfLoc(); + BC->Ctx->clearDwarfLocSeen(); + + auto &OutputLineTable = + BC->Ctx->getMCDwarfLineTable(CUOffset).getMCLineSections(); + OutputLineTable.addLineEntry(MCLineEntry{nullptr, Loc}, + FunctionSection); + } + } else { + DEBUG(errs() << "BOLT-DEBUG: Function " << Function.getName() + << " has no associated line number information.\n"); + } + } +} diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index ba8710d34ad8..9e5dc55dee8b 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -168,6 +168,10 @@ class RewriteInstance { /// second pass to emit those functions in two parts. bool splitLargeFunctions(); + /// Updates debug line information for non-simple functions, which are not + /// rewritten. + void updateDebugLineInfoForNonSimpleFunctions(); + /// Rewrite back all functions (hopefully optimized) that fit in the original /// memory footprint for that function. If the function is now larger and does /// not fit in the binary, reject it and preserve the original version of the From 780951f7691a74b2eea42ab443e07b9801e0b145 Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Fri, 8 Apr 2016 11:55:42 -0700 Subject: [PATCH 092/904] Fix behavior with multiple functions with same address. 
Summary: We were updating only one DIE per function, but because the Linker Script may map multiple functions to the same address this would cause us to generate invalid debug info (as some DIEs weren't updated but their abbreviations were changed). (cherry picked from commit e9366cdfc4b9a4f336df0280901c263e745446e4) --- bolt/BinaryContext.cpp | 2 +- bolt/BinaryFunction.h | 24 ++++++++++-------------- bolt/RewriteInstance.cpp | 10 ++++++---- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 1cbca4e193b1..c3065e023f48 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -145,7 +145,7 @@ void BinaryContext::preprocessFunctionDebugInfo( if (ChildDIE->getLowAndHighPC(CU.get(), LowPC, HighPC)) { auto It = BinaryFunctions.find(LowPC); if (It != BinaryFunctions.end()) { - It->second.setSubprocedureDIE(CU.get(), ChildDIE); + It->second.addSubprocedureDIE(CU.get(), ChildDIE); } } } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 6d1231ea3b19..c97d166d0191 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -36,6 +36,7 @@ #include "llvm/Support/raw_ostream.h" #include #include +#include using namespace llvm::object; @@ -159,11 +160,11 @@ class BinaryFunction : public AddressRangesOwner { /// Landing pads for the function. std::set LandingPads; - /// Associated DIE in the .debug_info section. - const DWARFDebugInfoEntryMinimal *SubprocedureDIE{nullptr}; - - /// DWARF Unit that contains the DIE of this function. - const DWARFCompileUnit *DIECompileUnit{nullptr}; + /// Associated DIEs in the .debug_info section with their respective CUs. + /// There can be multiple because of identical code folding performed by + /// the Linker Script. + std::vector> SubprocedureDIEs; /// Offset of this function's address ranges in the .debug_ranges section of /// the output binary. 
@@ -757,18 +758,13 @@ class BinaryFunction : public AddressRangesOwner { void emitLSDA(MCStreamer *Streamer); /// Sets the associated .debug_info entry. - void setSubprocedureDIE(const DWARFCompileUnit *Unit, + void addSubprocedureDIE(const DWARFCompileUnit *Unit, const DWARFDebugInfoEntryMinimal *DIE) { - DIECompileUnit = Unit; - SubprocedureDIE = DIE; - } - - const DWARFDebugInfoEntryMinimal *getSubprocedureDIE() const { - return SubprocedureDIE; + SubprocedureDIEs.emplace_back(DIE, Unit); } - const DWARFCompileUnit *getSubprocedureDIECompileUnit() const { - return DIECompileUnit; + const decltype(SubprocedureDIEs) &getSubprocedureDIEs() const { + return SubprocedureDIEs; } /// Returns the size of the basic block in the original binary. diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index aa143dc69592..216f4bd72d93 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2174,10 +2174,12 @@ void RewriteInstance::updateDWARFAddressRanges() { // Update address ranges of functions. for (const auto &BFI : BinaryFunctions) { const auto &Function = BFI.second; - updateDWARFObjectAddressRanges( - Function.getAddressRangesOffset() + DebugRangesSize, - Function.getSubprocedureDIECompileUnit(), - Function.getSubprocedureDIE()); + for (const auto DIECompileUnitPair : Function.getSubprocedureDIEs()) { + updateDWARFObjectAddressRanges( + Function.getAddressRangesOffset() + DebugRangesSize, + DIECompileUnitPair.second, + DIECompileUnitPair.first); + } } // Update address ranges of lexical blocks. From 693e49aa726074b0359aeafc6ba100c9f61ed0a9 Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Fri, 8 Apr 2016 16:24:38 -0700 Subject: [PATCH 093/904] Update unmatched and nested subprogram DIEs. 
Summary: readelf was showing some errors because we weren't updating DIEs that were not shallow in the DIE tree, or DIEs of functions with addresses we don't recognize (mostly functions with address 0, which could have been removed by the Linker Script but still have debugging information there). These DIEs need to be updated because their abbreviations are patched. (cherry picked from commit 275de36e62d0aa5ade6c440b2672286abc1de083) --- bolt/BinaryContext.cpp | 47 ++++++++++++++++++------------ bolt/BinaryContext.h | 11 +++++++ bolt/DebugRangesSectionsWriter.cpp | 8 +++++ bolt/DebugRangesSectionsWriter.h | 7 +++++ bolt/RewriteInstance.cpp | 8 +++++ 5 files changed, 63 insertions(+), 18 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index c3065e023f48..58ae61d1c891 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -95,6 +95,32 @@ void findLexicalBlocks(const DWARFCompileUnit *Unit, } } +// Recursively finds DWARF DW_TAG_subprogram DIEs and match them with +// BinaryFunctions. Record DIEs for unknown subprograms (mostly functions that +// are never called and removed from the binary) in Unknown. 
+void findSubprograms(const DWARFCompileUnit *Unit, + const DWARFDebugInfoEntryMinimal *DIE, + std::map &BinaryFunctions, + BinaryContext::DIECompileUnitVector &Unknown) { + if (DIE->isSubprogramDIE()) { + uint64_t LowPC, HighPC; + if (DIE->getLowAndHighPC(Unit, LowPC, HighPC)) { + auto It = BinaryFunctions.find(LowPC); + if (It != BinaryFunctions.end()) { + It->second.addSubprocedureDIE(Unit, DIE); + } else { + Unknown.emplace_back(DIE, Unit); + } + } + } + + for (auto ChildDIE = DIE->getFirstChild(); + ChildDIE != nullptr && !ChildDIE->isNULL(); + ChildDIE = ChildDIE->getSibling()) { + findSubprograms(Unit, ChildDIE, BinaryFunctions, Unknown); + } +} + } // namespace namespace llvm { @@ -130,26 +156,11 @@ void BinaryContext::preprocessDebugInfo() { void BinaryContext::preprocessFunctionDebugInfo( std::map &BinaryFunctions) { - // For each CU, iterate over its children DIEs and match subroutine DIEs to + // For each CU, iterate over its children DIEs and match subprogram DIEs to // BinaryFunctions. for (const auto &CU : DwCtx->compile_units()) { - const auto *UnitDIE = CU->getUnitDIE(false); - if (!UnitDIE->hasChildren()) - continue; - - for (auto ChildDIE = UnitDIE->getFirstChild(); - ChildDIE != nullptr && !ChildDIE->isNULL(); - ChildDIE = ChildDIE->getSibling()) { - if (ChildDIE->isSubprogramDIE()) { - uint64_t LowPC, HighPC; - if (ChildDIE->getLowAndHighPC(CU.get(), LowPC, HighPC)) { - auto It = BinaryFunctions.find(LowPC); - if (It != BinaryFunctions.end()) { - It->second.addSubprocedureDIE(CU.get(), ChildDIE); - } - } - } - } + findSubprograms(CU.get(), CU->getUnitDIE(false), BinaryFunctions, + UnknownFunctions); } // Iterate over DIE trees finding lexical blocks. 
diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index d93ecff598d1..bffdccb23413 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -40,6 +40,9 @@ #include namespace llvm { + +class DWARFDebugInfoEntryMinimal; + namespace bolt { class BinaryFunction; @@ -78,6 +81,14 @@ class BinaryContext { /// List of DWARF location lists in .debug_loc. std::vector LocationLists; + using DIECompileUnitVector = + std::vector> ; + + /// List of subprocedure DIEs that have addresses that don't match any + /// function, along with their CU. + DIECompileUnitVector UnknownFunctions; + std::unique_ptr Ctx; std::unique_ptr DwCtx; diff --git a/bolt/DebugRangesSectionsWriter.cpp b/bolt/DebugRangesSectionsWriter.cpp index 97838d47f00e..1e88b7038d27 100644 --- a/bolt/DebugRangesSectionsWriter.cpp +++ b/bolt/DebugRangesSectionsWriter.cpp @@ -65,6 +65,14 @@ void DebugRangesSectionsWriter::WriteRangesSection(MCObjectWriter *Writer) { const auto &AddressRanges = BFAddressRangesPair.second; SectionOffset += WriteAddressRanges(Writer, AddressRanges, false); } + + // Write an empty address list to be used for objects with unknown address + // ranges. + EmptyRangesListOffset = SectionOffset; + SectionOffset += WriteAddressRanges( + Writer, + std::vector>{}, + false); } void diff --git a/bolt/DebugRangesSectionsWriter.h b/bolt/DebugRangesSectionsWriter.h index 9e3dacc8a4ff..b6331f71981d 100644 --- a/bolt/DebugRangesSectionsWriter.h +++ b/bolt/DebugRangesSectionsWriter.h @@ -65,6 +65,10 @@ class DebugRangesSectionsWriter { return RangesSectionOffsetCUMap; } + /// Returns an offset of an empty address ranges list that is always written + /// to .debug_ranges + uint32_t getEmptyRangesListOffset() const { return EmptyRangesListOffset; } + private: // Map from compile unit offset to the list of address intervals that belong // to that compile unit. 
Each interval is a pair @@ -77,6 +81,9 @@ class DebugRangesSectionsWriter { std::map>> ObjectAddressRanges; + // Offset of an empty address ranges list. + uint32_t EmptyRangesListOffset; + /// When writing data to .debug_ranges remember offset per CU. RangesCUMapType RangesSectionOffsetCUMap; }; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 216f4bd72d93..5ca1f397df58 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2182,6 +2182,14 @@ void RewriteInstance::updateDWARFAddressRanges() { } } + // Update address ranges of DIEs with addresses that don't match functions. + for (auto &DIECompileUnitPair : BC->UnknownFunctions) { + updateDWARFObjectAddressRanges( + RangesSectionsWriter.getEmptyRangesListOffset(), + DIECompileUnitPair.second, + DIECompileUnitPair.first); + } + // Update address ranges of lexical blocks. for (const auto &LB : BC->LexicalBlocks) { updateDWARFObjectAddressRanges( From d8b664d604ffbcfcef3a59be253033ecbdd6549f Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 8 Apr 2016 19:30:27 -0700 Subject: [PATCH 094/904] Option to pass a file with list of functions to skip. Summary: Take "-skip_funcs_file=" option and don't process any function listed in the . 
(cherry picked from commit b11d66c6f347e658461fc8a92c1ef46358f6a0b8) --- bolt/RewriteInstance.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 5ca1f397df58..316f37888af8 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -85,6 +85,10 @@ SkipFunctionNames("skip_funcs", cl::desc("list of functions to skip"), cl::value_desc("func1,func2,func3,...")); +static cl::opt +SkipFunctionNamesFile("skip_funcs_file", + cl::desc("file with list of functions to skip")); + static cl::opt MaxFunctions("max_funcs", cl::desc("maximum # of functions to overwrite"), @@ -189,14 +193,22 @@ bool shouldProcess(const BinaryFunction &Function) { if (opts::MaxFunctions && Function.getFunctionNumber() > opts::MaxFunctions) return false; - if (!FunctionNamesFile.empty()) { + auto populateFunctionNames = [](cl::opt &FunctionNamesFile, + cl::list &FunctionNames) { + assert(!FunctionNamesFile.empty() && "unexpected empty file name"); std::ifstream FuncsFile(FunctionNamesFile, std::ios::in); std::string FuncName; while (std::getline(FuncsFile, FuncName)) { FunctionNames.push_back(FuncName); } FunctionNamesFile = ""; - } + }; + + if (!FunctionNamesFile.empty()) + populateFunctionNames(FunctionNamesFile, FunctionNames); + + if (!SkipFunctionNamesFile.empty()) + populateFunctionNames(SkipFunctionNamesFile, SkipFunctionNames); bool IsValid = true; if (!FunctionNames.empty()) { From 981196b0210288e25c3f583cccf860b637d04ef7 Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Tue, 12 Apr 2016 11:41:03 -0700 Subject: [PATCH 095/904] Update address ranges of inlined functions and try/catch blocks. Summary: Update address ranges of inlined functions and try/catch blocks. This was missing and lead gdb to show weird information in a core dump we inspected because of the several nestings of inline in the call stack. 
This is very similar to Lexical Blocks, so the change is to basically generalize that code to do the same for DW_AT_try_block, DW_AT_catch_block and DW_AT_inlined_subroutine. (cherry picked from commit 575882a5cebb4fe2de405420a5e36c0c6ffd6a73) --- ...icalBlock.h => AddressRangesDWARFObject.h} | 16 +++++----- bolt/BinaryContext.cpp | 32 +++++++++++-------- bolt/BinaryContext.h | 8 +++-- bolt/RewriteInstance.cpp | 21 ++++++------ bolt/RewriteInstance.h | 4 +-- 5 files changed, 45 insertions(+), 36 deletions(-) rename bolt/{LexicalBlock.h => AddressRangesDWARFObject.h} (79%) diff --git a/bolt/LexicalBlock.h b/bolt/AddressRangesDWARFObject.h similarity index 79% rename from bolt/LexicalBlock.h rename to bolt/AddressRangesDWARFObject.h index fd085e62b277..97ee3a43b5cb 100644 --- a/bolt/LexicalBlock.h +++ b/bolt/AddressRangesDWARFObject.h @@ -1,4 +1,4 @@ -//===--- LexicalBlock.h - DWARF lexical blocks ----------------------------===// +//===--- AddressRangesDWARFObject.h - DWARF Entities with address ranges --===// // // The LLVM Compiler Infrastructure // @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_BOLT_LEXICAL_BLOCK_H -#define LLVM_TOOLS_LLVM_BOLT_LEXICAL_BLOCK_H +#ifndef LLVM_TOOLS_LLVM_BOLT_ADDRESS_RANGES_DWARF_OBJECT_H +#define LLVM_TOOLS_LLVM_BOLT_ADDRESS_RANGES_DWARF_OBJECT_H #include "DebugRangesSectionsWriter.h" #include "BasicBlockOffsetRanges.h" @@ -29,13 +29,13 @@ class BasicBlockTable; class BinaryBasicBlock; class BinaryFunction; -class LexicalBlock : public AddressRangesOwner { +class AddressRangesDWARFObject : public AddressRangesOwner { public: - LexicalBlock(const DWARFCompileUnit *CU, - const DWARFDebugInfoEntryMinimal *DIE) + AddressRangesDWARFObject(const DWARFCompileUnit *CU, + const DWARFDebugInfoEntryMinimal *DIE) : CU(CU), DIE(DIE) { } - /// Add range [BeginAddress, EndAddress) to lexical block. + /// Add range [BeginAddress, EndAddress) to this object. 
void addAddressRange(BinaryFunction &Function, uint64_t BeginAddress, uint64_t EndAddress) { @@ -66,7 +66,7 @@ class LexicalBlock : public AddressRangesOwner { BasicBlockOffsetRanges BBOffsetRanges; - // Offset of the address ranges of this block in the output .debug_ranges. + // Offset of the address ranges of this object in the output .debug_ranges. uint32_t AddressRangesOffset; }; diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 58ae61d1c891..601a0e6ff247 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -71,19 +71,25 @@ BinaryFunction *getBinaryFunctionContainingAddress( } // Traverses the DIE tree in a recursive depth-first search and finds lexical -// blocks, saving them in LexicalBlocks. -void findLexicalBlocks(const DWARFCompileUnit *Unit, - const DWARFDebugInfoEntryMinimal *DIE, - std::map &Functions, - std::vector &LexicalBlocks) { - if (DIE->getTag() == dwarf::DW_TAG_lexical_block) { - LexicalBlocks.emplace_back(Unit, DIE); - auto &LB = LexicalBlocks.back(); +// blocks and instances of inlined subroutines, saving them in +// AddressRangesObjects. +void findAddressRangesObjects( + const DWARFCompileUnit *Unit, + const DWARFDebugInfoEntryMinimal *DIE, + std::map &Functions, + std::vector &AddressRangesObjects) { + auto Tag = DIE->getTag(); + if (Tag == dwarf::DW_TAG_lexical_block || + Tag == dwarf::DW_TAG_inlined_subroutine || + Tag == dwarf::DW_TAG_try_block || + Tag == dwarf::DW_TAG_catch_block) { + AddressRangesObjects.emplace_back(Unit, DIE); + auto &Object = AddressRangesObjects.back(); for (const auto &Range : DIE->getAddressRanges(Unit)) { if (auto *Function = getBinaryFunctionContainingAddress(Range.first, Functions)) { if (Function->isSimple()) { - LB.addAddressRange(*Function, Range.first, Range.second); + Object.addAddressRange(*Function, Range.first, Range.second); } } } @@ -91,7 +97,7 @@ void findLexicalBlocks(const DWARFCompileUnit *Unit, // Recursively visit each child. 
for (auto Child = DIE->getFirstChild(); Child; Child = Child->getSibling()) { - findLexicalBlocks(Unit, Child, Functions, LexicalBlocks); + findAddressRangesObjects(Unit, Child, Functions, AddressRangesObjects); } } @@ -163,10 +169,10 @@ void BinaryContext::preprocessFunctionDebugInfo( UnknownFunctions); } - // Iterate over DIE trees finding lexical blocks. + // Iterate over DIE trees finding objects that contain address ranges. for (const auto &CU : DwCtx->compile_units()) { - findLexicalBlocks(CU.get(), CU->getUnitDIE(false), BinaryFunctions, - LexicalBlocks); + findAddressRangesObjects(CU.get(), CU->getUnitDIE(false), BinaryFunctions, + AddressRangesObjects); } // Iterate over location lists and save them in LocationLists. diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index bffdccb23413..24d18af5fd26 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -14,7 +14,7 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H #define LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H -#include "LexicalBlock.h" +#include "AddressRangesDWARFObject.h" #include "LocationList.h" #include "llvm/ADT/Triple.h" #include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" @@ -75,8 +75,10 @@ class BinaryContext { /// Maps DWARF CUID to offset of stmt_list attribute in .debug_info. std::map LineTableOffsetCUMap; - /// List of DWARF lexical blocks in .debug_info. - std::vector LexicalBlocks; + /// List of DWARF entries in .debug_info that have address ranges to be + /// updated. These include lexical blocks (DW_TAG_lexical_block) and concrete + /// instances of inlined subroutines (DW_TAG_inlined_subroutine). + std::vector AddressRangesObjects; /// List of DWARF location lists in .debug_loc. 
std::vector LocationLists; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 316f37888af8..5fbeca2ed1f9 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2075,10 +2075,10 @@ void RewriteInstance::rewriteFile() { Out->keep(); } -void RewriteInstance::updateLexicalBlocksAddresses() { - for (auto &LB : BC->LexicalBlocks) { - for (const auto &Range : LB.getAbsoluteAddressRanges()) { - RangesSectionsWriter.AddRange(&LB, Range.first, +void RewriteInstance::updateAddressRangesObjects() { + for (auto &Obj : BC->AddressRangesObjects) { + for (const auto &Range : Obj.getAbsoluteAddressRanges()) { + RangesSectionsWriter.AddRange(&Obj, Range.first, Range.second - Range.first); } } @@ -2150,7 +2150,7 @@ void RewriteInstance::updateDebugInfo() { updateFunctionRanges(); - updateLexicalBlocksAddresses(); + updateAddressRangesObjects(); generateDebugRanges(); @@ -2202,12 +2202,13 @@ void RewriteInstance::updateDWARFAddressRanges() { DIECompileUnitPair.first); } - // Update address ranges of lexical blocks. - for (const auto &LB : BC->LexicalBlocks) { + // Update address ranges of DWARF block objects (lexical/try/catch blocks, + // inlined subroutine instances, etc). + for (const auto &Obj : BC->AddressRangesObjects) { updateDWARFObjectAddressRanges( - LB.getAddressRangesOffset() + DebugRangesSize, - LB.getCompileUnit(), - LB.getDIE()); + Obj.getAddressRangesOffset() + DebugRangesSize, + Obj.getCompileUnit(), + Obj.getDIE()); } } diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 9e5dc55dee8b..927c8cf56954 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -212,8 +212,8 @@ class RewriteInstance { /// Update internal function ranges after functions have been written. void updateFunctionRanges(); - /// Update lexical blocks ranges after optimizations. - void updateLexicalBlocksAddresses(); + /// Update objects with address ranges after optimization. 
+ void updateAddressRangesObjects(); /// Generate new contents for .debug_loc. void updateLocationLists(); From a55c4e337c34a50444147c8ec2ccf90a0ab7ca6a Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Mon, 11 Apr 2016 17:46:18 -0700 Subject: [PATCH 096/904] Fix debugging info for simple functions that we fail to rewrite. Summary: Simple functions which we fail to rewrite after optimizations were having wrong debugging information because the latter would reflect the optimized version of the function. There are only 48 functions (at this time) in this situation in the HHVM binary. The simple fix is to add another full pass. Another more complicated path, which will be more efficient, is to reset only the BinaryContext and emit again, but then we need to recreate all symbols in the new MCContext and update the pointers. I started taking this path but it started getting too complicated for only those 48 functions (needed to create a new map of global symbols, recreate landing pads - which needed to have the internal intermediate labels in the functions kept to be updated too, etc). Because the overhead is quite large (another full emission pass - around 4m30s here) and the impact is small I put this behind a new command-line flag which is off by default: -fix-debuginfo-large-functions. 
(cherry picked from commit 3996c36996d990c14b3ba8dd44f4457127f8dbac) --- bolt/RewriteInstance.cpp | 58 +++++++++++++++++++++++++++++++++------- bolt/RewriteInstance.h | 7 ++--- 2 files changed, 52 insertions(+), 13 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 5fbeca2ed1f9..92df056531a9 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -117,6 +117,12 @@ UpdateDebugSections("update-debug-sections", cl::desc("update DWARF debug sections of the executable"), cl::Optional); +static cl::opt +FixDebugInfoLargeFunctions("fix-debuginfo-large-functions", + cl::desc("do another pass if we encounter large " + "functions, to correct their debug info."), + cl::Optional); + static cl::opt ReorderBlocks( "reorder-blocks", @@ -543,6 +549,8 @@ void RewriteInstance::run() { return; } + unsigned PassNumber = 1; + // Main "loop". discoverStorage(); readSpecialSections(); @@ -554,9 +562,10 @@ void RewriteInstance::run() { emitFunctions(); if (opts::SplitFunctions == BinaryFunction::ST_LARGE && - splitLargeFunctions()) { + checkLargeFunctions()) { + ++PassNumber; // Emit again because now some functions have been split - outs() << "BOLT: split-functions: starting pass 2...\n"; + outs() << "BOLT: split-functions: starting pass " << PassNumber << "...\n"; reset(); discoverStorage(); readSpecialSections(); @@ -568,6 +577,36 @@ void RewriteInstance::run() { emitFunctions(); } + // Emit functions again ignoring functions which still didn't fit in their + // original space, so that we don't generate incorrect debugging information + // for them (information that would reflect the optimized version). 
+ if (opts::UpdateDebugSections && opts::FixDebugInfoLargeFunctions && + checkLargeFunctions()) { + ++PassNumber; + outs() << "BOLT: starting pass (ignoring large functions)" + << PassNumber << "...\n"; + reset(); + discoverStorage(); + readSpecialSections(); + discoverFileObjects(); + readDebugInfo(); + disassembleFunctions(); + + for (uint64_t Address : LargeFunctions) { + auto FunctionIt = BinaryFunctions.find(Address); + assert(FunctionIt != BinaryFunctions.end() && + "Invalid large function address."); + errs() << "BOLT-WARNING: Function " << FunctionIt->second.getName() + << " is larger than it's orginal size: emitting again marking it " + << "as not simple.\n"; + FunctionIt->second.setSimple(false); + } + + readFunctionDebugInfo(); + runOptimizationPasses(); + emitFunctions(); + } + updateDebugInfo(); // Copy allocatable part of the input. @@ -979,7 +1018,7 @@ void RewriteInstance::runOptimizationPasses() { if (opts::ReorderBlocks != BinaryFunction::LT_NONE) { bool ShouldSplit = (opts::SplitFunctions == BinaryFunction::ST_ALL) || - ToSplit.find(BFI.first) != ToSplit.end(); + LargeFunctions.find(BFI.first) != LargeFunctions.end(); BFI.second.modifyLayout(opts::ReorderBlocks, ShouldSplit); if (opts::PrintAll || opts::PrintReordered) Function.print(errs(), "after reordering blocks", true); @@ -1439,8 +1478,8 @@ void RewriteInstance::emitFunctions() { TempOut->keep(); } -bool RewriteInstance::splitLargeFunctions() { - bool Changed = false; +bool RewriteInstance::checkLargeFunctions() { + LargeFunctions.clear(); for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; @@ -1451,10 +1490,9 @@ bool RewriteInstance::splitLargeFunctions() { if (Function.getImageSize() <= Function.getMaxSize()) continue; - ToSplit.insert(BFI.first); - Changed = true; + LargeFunctions.insert(BFI.first); } - return Changed; + return !LargeFunctions.empty(); } void RewriteInstance::updateFunctionRanges() { @@ -2281,8 +2319,8 @@ void RewriteInstance::updateDWARFObjectAddressRanges( 
DebugInfoPatcher->addBinaryPatch(LowPCOffset + 4, ProducerString); } else { - DEBUG(errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x" - << Twine::utohexstr(DIE->getOffset()) << "\n"); + errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x" + << Twine::utohexstr(DIE->getOffset()) << "\n"; } } } diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 927c8cf56954..d9375fa22a9d 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -166,7 +166,7 @@ class RewriteInstance { /// /// Returns true if any function was annotated, requiring us to perform a /// second pass to emit those functions in two parts. - bool splitLargeFunctions(); + bool checkLargeFunctions(); /// Updates debug line information for non-simple functions, which are not /// rewritten. @@ -317,8 +317,9 @@ class RewriteInstance { /// Size of the .debug_ranges section on input. uint32_t DebugRangesSize{0}; - /// Keep track of which functions to split in a second pass. - std::set ToSplit; + /// Keep track of which functions didn't fit in their original space in the + /// last emission, so that we may either decide to split or not optimize them. + std::set LargeFunctions; /// Total hotness score according to profiling data for this binary. uint64_t TotalScore{0}; From 4639dc9b50d717bf0bf920811a650d4a1b3b78e6 Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Thu, 7 Apr 2016 15:06:43 -0700 Subject: [PATCH 097/904] Group debugging info representation and serialization code. Summary: Moved the classes related to representing and serializing DWARF entities into a single header, DebugData.h. 
(cherry picked from commit dd7ef6b283fcfa6f6d8d6576a78c8769d8669c92) --- bolt/AddressRangesDWARFObject.h | 76 ------ bolt/BasicBlockOffsetRanges.cpp | 96 -------- bolt/BasicBlockOffsetRanges.h | 70 ------ bolt/BinaryContext.h | 9 +- bolt/BinaryFunction.cpp | 1 - bolt/BinaryFunction.h | 2 +- bolt/BinaryPatcher.cpp | 109 --------- bolt/BinaryPatcher.h | 101 -------- bolt/CMakeLists.txt | 6 +- bolt/DebugData.cpp | 306 +++++++++++++++++++++++++ bolt/DebugData.h | 355 +++++++++++++++++++++++++++++ bolt/DebugLineTableRowRef.cpp | 21 -- bolt/DebugLineTableRowRef.h | 64 ------ bolt/DebugLocWriter.cpp | 45 ---- bolt/DebugLocWriter.h | 53 ----- bolt/DebugRangesSectionsWriter.cpp | 119 ---------- bolt/DebugRangesSectionsWriter.h | 94 -------- bolt/LocationList.h | 61 ----- bolt/RewriteInstance.cpp | 1 - bolt/RewriteInstance.h | 4 +- 20 files changed, 668 insertions(+), 925 deletions(-) delete mode 100644 bolt/AddressRangesDWARFObject.h delete mode 100644 bolt/BasicBlockOffsetRanges.cpp delete mode 100644 bolt/BasicBlockOffsetRanges.h delete mode 100644 bolt/BinaryPatcher.cpp delete mode 100644 bolt/BinaryPatcher.h create mode 100644 bolt/DebugData.cpp create mode 100644 bolt/DebugData.h delete mode 100644 bolt/DebugLineTableRowRef.cpp delete mode 100644 bolt/DebugLineTableRowRef.h delete mode 100644 bolt/DebugLocWriter.cpp delete mode 100644 bolt/DebugLocWriter.h delete mode 100644 bolt/DebugRangesSectionsWriter.cpp delete mode 100644 bolt/DebugRangesSectionsWriter.h delete mode 100644 bolt/LocationList.h diff --git a/bolt/AddressRangesDWARFObject.h b/bolt/AddressRangesDWARFObject.h deleted file mode 100644 index 97ee3a43b5cb..000000000000 --- a/bolt/AddressRangesDWARFObject.h +++ /dev/null @@ -1,76 +0,0 @@ -//===--- AddressRangesDWARFObject.h - DWARF Entities with address ranges --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// Represents DWARF lexical blocks, maintaining their list of address ranges to -// be updated in the output debugging information. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_BOLT_ADDRESS_RANGES_DWARF_OBJECT_H -#define LLVM_TOOLS_LLVM_BOLT_ADDRESS_RANGES_DWARF_OBJECT_H - -#include "DebugRangesSectionsWriter.h" -#include "BasicBlockOffsetRanges.h" - -namespace llvm { - -class DWARFCompileUnit; -class DWARFDebugInfoEntryMinimal; - -namespace bolt { - -class BasicBlockTable; -class BinaryBasicBlock; -class BinaryFunction; - -class AddressRangesDWARFObject : public AddressRangesOwner { -public: - AddressRangesDWARFObject(const DWARFCompileUnit *CU, - const DWARFDebugInfoEntryMinimal *DIE) - : CU(CU), DIE(DIE) { } - - /// Add range [BeginAddress, EndAddress) to this object. - void addAddressRange(BinaryFunction &Function, - uint64_t BeginAddress, - uint64_t EndAddress) { - BBOffsetRanges.addAddressRange(Function, BeginAddress, EndAddress); - } - - std::vector> getAbsoluteAddressRanges() const { - auto AddressRangesWithData = BBOffsetRanges.getAbsoluteAddressRanges(); - std::vector> AddressRanges( - AddressRangesWithData.size()); - for (unsigned I = 0, S = AddressRanges.size(); I != S; ++I) { - AddressRanges[I] = std::make_pair(AddressRangesWithData[I].Begin, - AddressRangesWithData[I].End); - } - return AddressRanges; - } - - void setAddressRangesOffset(uint32_t Offset) { AddressRangesOffset = Offset; } - - uint32_t getAddressRangesOffset() const { return AddressRangesOffset; } - - const DWARFCompileUnit *getCompileUnit() const { return CU; } - const DWARFDebugInfoEntryMinimal *getDIE() const { return DIE; } - -private: - const DWARFCompileUnit *CU; - const DWARFDebugInfoEntryMinimal *DIE; - - BasicBlockOffsetRanges BBOffsetRanges; - - // Offset of the address ranges of this object in the output .debug_ranges. 
- uint32_t AddressRangesOffset; -}; - -} // namespace bolt -} // namespace llvm - -#endif diff --git a/bolt/BasicBlockOffsetRanges.cpp b/bolt/BasicBlockOffsetRanges.cpp deleted file mode 100644 index 445da2bc4ef1..000000000000 --- a/bolt/BasicBlockOffsetRanges.cpp +++ /dev/null @@ -1,96 +0,0 @@ -//===- BasicBlockOffsetRanges.cpp - list of address ranges relative to BBs ===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#include "BasicBlockOffsetRanges.h" -#include "BinaryBasicBlock.h" -#include "BinaryFunction.h" -#include - -namespace llvm { -namespace bolt { - -void BasicBlockOffsetRanges::addAddressRange(BinaryFunction &Function, - uint64_t BeginAddress, - uint64_t EndAddress, - const BinaryData *Data) { - auto FirstBB = Function.getBasicBlockContainingOffset( - BeginAddress - Function.getAddress()); - assert(FirstBB && "No basic blocks in the function intersect given range."); - - for (auto I = Function.getIndex(FirstBB), S = Function.size(); I != S; ++I) { - auto BB = Function.getBasicBlockAtIndex(I); - uint64_t BBAddress = Function.getAddress() + BB->getOffset(); - if (BBAddress >= EndAddress) - break; - - uint64_t InternalAddressRangeBegin = std::max(BBAddress, BeginAddress); - assert(BB->getFunction() == &Function && - "Mismatching functions.\n"); - uint64_t InternalAddressRangeEnd = - std::min(BBAddress + Function.getBasicBlockOriginalSize(BB), - EndAddress); - - AddressRanges.push_back( - BBAddressRange{ - BB, - static_cast(InternalAddressRangeBegin - BBAddress), - static_cast(InternalAddressRangeEnd - BBAddress), - Data}); - } -} - -std::vector -BasicBlockOffsetRanges::getAbsoluteAddressRanges() const { - std::vector AbsoluteRanges; - for (const auto &BBAddressRange : 
AddressRanges) { - auto BBOutputAddressRange = - BBAddressRange.BasicBlock->getOutputAddressRange(); - uint64_t NewRangeBegin = BBOutputAddressRange.first + - BBAddressRange.RangeBeginOffset; - // If the end offset pointed to the end of the basic block, then we set - // the new end range to cover the whole basic block as the BB's size - // might have increased. - auto BBFunction = BBAddressRange.BasicBlock->getFunction(); - uint64_t NewRangeEnd = - (BBAddressRange.RangeEndOffset == - BBFunction->getBasicBlockOriginalSize(BBAddressRange.BasicBlock)) - ? BBOutputAddressRange.second - : (BBOutputAddressRange.first + BBAddressRange.RangeEndOffset); - AbsoluteRanges.emplace_back(AbsoluteRange{NewRangeBegin, NewRangeEnd, - BBAddressRange.Data}); - } - if (AbsoluteRanges.empty()) { - return AbsoluteRanges; - } - // Merge adjacent ranges that have the same data. - std::sort(AbsoluteRanges.begin(), AbsoluteRanges.end(), - [](const AbsoluteRange &A, const AbsoluteRange &B) { - return A.Begin < B.Begin; - }); - decltype(AbsoluteRanges) MergedRanges; - - MergedRanges.emplace_back(AbsoluteRanges[0]); - for (unsigned I = 1, S = AbsoluteRanges.size(); I != S; ++I) { - // If this range complements the last one and they point to the same - // (possibly null) data, merge them instead of creating another one. 
- if (AbsoluteRanges[I].Begin == MergedRanges.back().End && - AbsoluteRanges[I].Data == MergedRanges.back().Data) { - MergedRanges.back().End = AbsoluteRanges[I].End; - } else { - MergedRanges.emplace_back(AbsoluteRanges[I]); - } - } - - return MergedRanges; -} - -} // namespace bolt -} // namespace llvm diff --git a/bolt/BasicBlockOffsetRanges.h b/bolt/BasicBlockOffsetRanges.h deleted file mode 100644 index 51dac4dc9e50..000000000000 --- a/bolt/BasicBlockOffsetRanges.h +++ /dev/null @@ -1,70 +0,0 @@ -//===--- BasicBlockOffsetRanges.h - list of address ranges relative to BBs ===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Represents a list of address ranges where addresses are relative to the -// beginning of basic blocks. Useful for converting address ranges in the input -// binary to equivalent ranges after optimizations take place. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_BOLT_BASIC_BLOCK_OFFSET_RANGES_H -#define LLVM_TOOLS_LLVM_BOLT_BASIC_BLOCK_OFFSET_RANGES_H - -#include "llvm/ADT/SmallVector.h" -#include -#include -#include -#include - -namespace llvm { -namespace bolt { - -class BinaryFunction; -class BinaryBasicBlock; - -class BasicBlockOffsetRanges { -public: - typedef SmallVectorImpl BinaryData; - struct AbsoluteRange { - uint64_t Begin; - uint64_t End; - const BinaryData *Data; - }; - - /// Add range [BeginAddress, EndAddress) to the address ranges list. - /// \p Function is the function that contains the given address range. - void addAddressRange(BinaryFunction &Function, - uint64_t BeginAddress, - uint64_t EndAddress, - const BinaryData *Data = nullptr); - - /// Returns the list of absolute addresses calculated using the output address - /// of the basic blocks, i.e. 
the input ranges updated after basic block - /// addresses might have changed, together with the data associated to them. - std::vector getAbsoluteAddressRanges() const; - -private: - /// An address range inside one basic block. - struct BBAddressRange { - const BinaryBasicBlock *BasicBlock; - /// Beginning of the range counting from BB's start address. - uint16_t RangeBeginOffset; - /// (Exclusive) end of the range counting from BB's start address. - uint16_t RangeEndOffset; - /// Binary data associated with this range. - const BinaryData *Data; - }; - - std::vector AddressRanges; -}; - -} // namespace bolt -} // namespace llvm - -#endif diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 24d18af5fd26..70e5e4b7ce94 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -14,8 +14,7 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H #define LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H -#include "AddressRangesDWARFObject.h" -#include "LocationList.h" +#include "DebugData.h" #include "llvm/ADT/Triple.h" #include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" @@ -75,14 +74,14 @@ class BinaryContext { /// Maps DWARF CUID to offset of stmt_list attribute in .debug_info. std::map LineTableOffsetCUMap; + /// List of DWARF location lists in .debug_loc. + std::vector LocationLists; + /// List of DWARF entries in .debug_info that have address ranges to be /// updated. These include lexical blocks (DW_TAG_lexical_block) and concrete /// instances of inlined subroutines (DW_TAG_inlined_subroutine). std::vector AddressRangesObjects; - /// List of DWARF location lists in .debug_loc. 
- std::vector LocationLists; - using DIECompileUnitVector = std::vector> ; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 0637340377d2..0f77c756e36c 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -13,7 +13,6 @@ #include "BinaryBasicBlock.h" #include "BinaryFunction.h" #include "DataReader.h" -#include "DebugLineTableRowRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/MC/MCAsmInfo.h" diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index c97d166d0191..de1c0fbabe08 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -19,7 +19,7 @@ #include "BinaryBasicBlock.h" #include "BinaryContext.h" -#include "DebugRangesSectionsWriter.h" +#include "DebugData.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/ilist.h" #include "llvm/MC/MCCodeEmitter.h" diff --git a/bolt/BinaryPatcher.cpp b/bolt/BinaryPatcher.cpp deleted file mode 100644 index 8af6018f3a9b..000000000000 --- a/bolt/BinaryPatcher.cpp +++ /dev/null @@ -1,109 +0,0 @@ -//===--- BinaryPatcher.h - Classes for modifying sections of the binary --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#include "BinaryPatcher.h" -#include -#include - -namespace llvm { -namespace bolt { - -void SimpleBinaryPatcher::addBinaryPatch(uint32_t Offset, - const std::string &NewValue) { - Patches.emplace_back(std::make_pair(Offset, NewValue)); -} - -void SimpleBinaryPatcher::addBytePatch(uint32_t Offset, uint8_t Value) { - Patches.emplace_back(std::make_pair(Offset, std::string(1, Value))); -} - -void SimpleBinaryPatcher::addLEPatch(uint32_t Offset, uint64_t NewValue, - size_t ByteSize) { - std::string LE64(ByteSize, 0); - for (size_t I = 0; I < ByteSize; ++I) { - LE64[I] = NewValue & 0xff; - NewValue >>= 8; - } - Patches.emplace_back(std::make_pair(Offset, LE64)); -} - -void SimpleBinaryPatcher::addLE64Patch(uint32_t Offset, uint64_t NewValue) { - addLEPatch(Offset, NewValue, 8); -} - -void SimpleBinaryPatcher::addLE32Patch(uint32_t Offset, uint32_t NewValue) { - addLEPatch(Offset, NewValue, 4); -} - -void SimpleBinaryPatcher::patchBinary(std::string &BinaryContents) { - for (const auto &Patch : Patches) { - uint32_t Offset = Patch.first; - const std::string &ByteSequence = Patch.second; - assert(Offset + ByteSequence.size() <= BinaryContents.size() && - "Applied patch runs over binary size."); - for (uint64_t I = 0, Size = ByteSequence.size(); I < Size; ++I) { - BinaryContents[Offset + I] = ByteSequence[I]; - } - } -} - -void DebugAbbrevPatcher::addAttributePatch(const DWARFUnit *Unit, - uint32_t AbbrevCode, - uint16_t AttrTag, - uint8_t NewAttrTag, - uint8_t NewAttrForm) { - assert(Unit && "No compile unit specified."); - Patches[Unit].push_back( - AbbrevAttrPatch{AbbrevCode, AttrTag, NewAttrTag, NewAttrForm}); -} - -void DebugAbbrevPatcher::patchBinary(std::string &Contents) { - SimpleBinaryPatcher Patcher; - - for (const auto &UnitPatchesPair : Patches) { - const auto *Unit = 
UnitPatchesPair.first; - const auto *UnitAbbreviations = Unit->getAbbreviations(); - assert(UnitAbbreviations && - "Compile unit doesn't have associated abbreviations."); - const auto &UnitPatches = UnitPatchesPair.second; - for (const auto &AttrPatch : UnitPatches) { - const auto *AbbreviationDeclaration = - UnitAbbreviations->getAbbreviationDeclaration(AttrPatch.Code); - assert(AbbreviationDeclaration && "No abbreviation with given code."); - const auto *Attribute = AbbreviationDeclaration->findAttribute( - AttrPatch.Attr); - - if (!Attribute) { - errs() << "Attribute " << AttrPatch.Attr << " does not occur in " - << " abbrev " << AttrPatch.Code << " of CU " << Unit->getOffset() - << " in decl@" << AbbreviationDeclaration - << " and index = " << AbbreviationDeclaration->findAttributeIndex(AttrPatch.Attr) - << "\n"; - errs() << "Look at the abbrev:\n"; - AbbreviationDeclaration->dump(errs()); - - assert(Attribute && "Specified attribute doesn't occur in abbreviation."); - } - // Because we're only handling standard values (i.e. no DW_FORM_GNU_* or - // DW_AT_APPLE_*), they are all small (< 128) and encoded in a single - // byte in ULEB128, otherwise it'll be more tricky as we may need to - // grow or shrink the section. - Patcher.addBytePatch(Attribute->AttrOffset, - AttrPatch.NewAttr); - Patcher.addBytePatch(Attribute->FormOffset, - AttrPatch.NewForm); - } - } - Patcher.patchBinary(Contents); -} - -} // namespace llvm -} // namespace bolt diff --git a/bolt/BinaryPatcher.h b/bolt/BinaryPatcher.h deleted file mode 100644 index 4fe1fe4e1e0a..000000000000 --- a/bolt/BinaryPatcher.h +++ /dev/null @@ -1,101 +0,0 @@ -//===--- BinaryPatcher.h - Classes for modifying sections of the binary --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// Interfaces for applying small modifications to parts of a binary file. Some -// specializations facilitate the modification of specific ELF/DWARF sections. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_PATCHER_H -#define LLVM_TOOLS_LLVM_BOLT_BINARY_PATCHER_H - -#include "llvm/DebugInfo/DWARF/DWARFUnit.h" -#include -#include -#include - -namespace llvm { -namespace bolt { - -/// Abstract interface for classes that apply modifications to a binary string. -class BinaryPatcher { -public: - virtual ~BinaryPatcher() {} - /// Applies in-place modifications to the binary string \p BinaryContents . - virtual void patchBinary(std::string &BinaryContents) = 0; -}; - -/// Applies simple modifications to a binary string, such as directly replacing -/// the contents of a certain portion with a string or an integer. -class SimpleBinaryPatcher : public BinaryPatcher { -private: - std::vector> Patches; - - /// Adds a patch to replace the contents of \p ByteSize bytes with the integer - /// \p NewValue encoded in little-endian, with the least-significant byte - /// being written at the offset \p Offset . - void addLEPatch(uint32_t Offset, uint64_t NewValue, size_t ByteSize); - -public: - ~SimpleBinaryPatcher() {} - - /// Adds a patch to replace the contents of the binary string starting at the - /// specified \p Offset with the string \p NewValue. - void addBinaryPatch(uint32_t Offset, const std::string &NewValue); - - /// Adds a patch to replace the contents of a single byte of the string, at - /// the offset \p Offset, with the value \Value . - void addBytePatch(uint32_t Offset, uint8_t Value); - - /// Adds a patch to put the integer \p NewValue encoded as a 64-bit - /// little-endian value at offset \p Offset. 
- void addLE64Patch(uint32_t Offset, uint64_t NewValue); - - /// Adds a patch to put the integer \p NewValue encoded as a 32-bit - /// little-endian value at offset \p Offset. - void addLE32Patch(uint32_t Offset, uint32_t NewValue); - - void patchBinary(std::string &BinaryContents) override; -}; - -/// Apply small modifications to the .debug_abbrev DWARF section. -class DebugAbbrevPatcher : public BinaryPatcher { -private: - /// Patch of changing one attribute to another. - struct AbbrevAttrPatch { - uint32_t Code; // Code of abbreviation to be modified. - uint16_t Attr; // ID of attribute to be replaced. - uint8_t NewAttr; // ID of the new attribute. - uint8_t NewForm; // Form of the new attribute. - }; - - std::map> Patches; - -public: - ~DebugAbbrevPatcher() { } - /// Adds a patch to change an attribute of an abbreviation that belongs to - /// \p Unit to another attribute. - /// \p AbbrevCode code of the abbreviation to be modified. - /// \p AttrTag ID of the attribute to be replaced. - /// \p NewAttrTag ID of the new attribute. - /// \p NewAttrForm Form of the new attribute. - /// We only handle standard forms, that are encoded in a single byte. 
- void addAttributePatch(const DWARFUnit *Unit, - uint32_t AbbrevCode, - uint16_t AttrTag, - uint8_t NewAttrTag, - uint8_t NewAttrForm); - - void patchBinary(std::string &Contents) override; -}; - -} // namespace llvm -} // namespace bolt - -#endif diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 7f7343a41c06..3e33a968a599 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -13,15 +13,11 @@ set(LLVM_LINK_COMPONENTS add_llvm_tool(llvm-bolt llvm-bolt.cpp - BasicBlockOffsetRanges.cpp BinaryBasicBlock.cpp BinaryContext.cpp BinaryFunction.cpp - BinaryPatcher.cpp DataReader.cpp - DebugLineTableRowRef.cpp - DebugLocWriter.cpp - DebugRangesSectionsWriter.cpp + DebugData.cpp Exceptions.cpp RewriteInstance.cpp ) diff --git a/bolt/DebugData.cpp b/bolt/DebugData.cpp new file mode 100644 index 000000000000..79ea84dd816c --- /dev/null +++ b/bolt/DebugData.cpp @@ -0,0 +1,306 @@ +//===- DebugData.cpp - Representation and writing of debugging information. ==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "DebugData.h" +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCObjectWriter.h" +#include +#include + +namespace llvm { +namespace bolt { + +const DebugLineTableRowRef DebugLineTableRowRef::NULL_ROW{0, 0}; + +void BasicBlockOffsetRanges::addAddressRange(BinaryFunction &Function, + uint64_t BeginAddress, + uint64_t EndAddress, + const BinaryData *Data) { + auto FirstBB = Function.getBasicBlockContainingOffset( + BeginAddress - Function.getAddress()); + assert(FirstBB && "No basic blocks in the function intersect given range."); + + for (auto I = Function.getIndex(FirstBB), S = Function.size(); I != S; ++I) { + auto BB = Function.getBasicBlockAtIndex(I); + uint64_t BBAddress = Function.getAddress() + BB->getOffset(); + if (BBAddress >= EndAddress) + break; + + uint64_t InternalAddressRangeBegin = std::max(BBAddress, BeginAddress); + assert(BB->getFunction() == &Function && + "Mismatching functions.\n"); + uint64_t InternalAddressRangeEnd = + std::min(BBAddress + Function.getBasicBlockOriginalSize(BB), + EndAddress); + + AddressRanges.push_back( + BBAddressRange{ + BB, + static_cast(InternalAddressRangeBegin - BBAddress), + static_cast(InternalAddressRangeEnd - BBAddress), + Data}); + } +} + +std::vector +BasicBlockOffsetRanges::getAbsoluteAddressRanges() const { + std::vector AbsoluteRanges; + for (const auto &BBAddressRange : AddressRanges) { + auto BBOutputAddressRange = + BBAddressRange.BasicBlock->getOutputAddressRange(); + uint64_t NewRangeBegin = BBOutputAddressRange.first + + BBAddressRange.RangeBeginOffset; + // If the end offset pointed to the end of the basic block, then we set + // the new end range to cover the whole basic block as the BB's size + // might have increased. 
+ auto BBFunction = BBAddressRange.BasicBlock->getFunction(); + uint64_t NewRangeEnd = + (BBAddressRange.RangeEndOffset == + BBFunction->getBasicBlockOriginalSize(BBAddressRange.BasicBlock)) + ? BBOutputAddressRange.second + : (BBOutputAddressRange.first + BBAddressRange.RangeEndOffset); + AbsoluteRanges.emplace_back(AbsoluteRange{NewRangeBegin, NewRangeEnd, + BBAddressRange.Data}); + } + if (AbsoluteRanges.empty()) { + return AbsoluteRanges; + } + // Merge adjacent ranges that have the same data. + std::sort(AbsoluteRanges.begin(), AbsoluteRanges.end(), + [](const AbsoluteRange &A, const AbsoluteRange &B) { + return A.Begin < B.Begin; + }); + decltype(AbsoluteRanges) MergedRanges; + + MergedRanges.emplace_back(AbsoluteRanges[0]); + for (unsigned I = 1, S = AbsoluteRanges.size(); I != S; ++I) { + // If this range complements the last one and they point to the same + // (possibly null) data, merge them instead of creating another one. + if (AbsoluteRanges[I].Begin == MergedRanges.back().End && + AbsoluteRanges[I].Data == MergedRanges.back().Data) { + MergedRanges.back().End = AbsoluteRanges[I].End; + } else { + MergedRanges.emplace_back(AbsoluteRanges[I]); + } + } + + return MergedRanges; +} + +void DebugRangesSectionsWriter::AddRange(uint32_t CompileUnitOffset, + uint64_t Address, + uint64_t Size) { + CUAddressRanges[CompileUnitOffset].push_back(std::make_pair(Address, Size)); +} + +void DebugRangesSectionsWriter::AddRange(AddressRangesOwner *BF, + uint64_t Address, + uint64_t Size) { + ObjectAddressRanges[BF].push_back(std::make_pair(Address, Size)); +} + +namespace { + +// Writes address ranges to Writer as pairs of 64-bit (address, size). +// If RelativeRange is true, assumes the address range to be written must be of +// the form (begin address, range size), otherwise (begin address, end address). +// Terminates the list by writing a pair of two zeroes. +// Returns the number of written bytes. 
+uint32_t WriteAddressRanges( + MCObjectWriter *Writer, + const std::vector> &AddressRanges, + bool RelativeRange) { + // Write entries. + for (auto &Range : AddressRanges) { + Writer->writeLE64(Range.first); + Writer->writeLE64((!RelativeRange) * Range.first + Range.second); + } + // Finish with 0 entries. + Writer->writeLE64(0); + Writer->writeLE64(0); + return AddressRanges.size() * 16 + 16; +} + +} // namespace + +void DebugRangesSectionsWriter::WriteRangesSection(MCObjectWriter *Writer) { + uint32_t SectionOffset = 0; + for (const auto &CUOffsetAddressRangesPair : CUAddressRanges) { + uint64_t CUOffset = CUOffsetAddressRangesPair.first; + RangesSectionOffsetCUMap[CUOffset] = SectionOffset; + const auto &AddressRanges = CUOffsetAddressRangesPair.second; + SectionOffset += WriteAddressRanges(Writer, AddressRanges, false); + } + + for (const auto &BFAddressRangesPair : ObjectAddressRanges) { + BFAddressRangesPair.first->setAddressRangesOffset(SectionOffset); + const auto &AddressRanges = BFAddressRangesPair.second; + SectionOffset += WriteAddressRanges(Writer, AddressRanges, false); + } + + // Write an empty address list to be used for objects with unknown address + // ranges. + EmptyRangesListOffset = SectionOffset; + SectionOffset += WriteAddressRanges( + Writer, + std::vector>{}, + false); +} + +void +DebugRangesSectionsWriter::WriteArangesSection(MCObjectWriter *Writer) const { + // For reference on the format of the .debug_aranges section, see the DWARF4 + // specification, section 6.1.4 Lookup by Address + // http://www.dwarfstd.org/doc/DWARF4.pdf + for (const auto &CUOffsetAddressRangesPair : CUAddressRanges) { + uint64_t Offset = CUOffsetAddressRangesPair.first; + const auto &AddressRanges = CUOffsetAddressRangesPair.second; + + // Emit header. + + // Size of this set: 8 (size of the header) + 4 (padding after header) + // + 2*sizeof(uint64_t) bytes for each of the ranges, plus an extra + // pair of uint64_t's for the terminating, zero-length range. 
+ // Does not include size field itself. + uint64_t Size = 8 + 4 + 2*sizeof(uint64_t) * (AddressRanges.size() + 1); + + // Header field #1: set size. + Writer->writeLE32(Size); + + // Header field #2: version number, 2 as per the specification. + Writer->writeLE16(2); + + // Header field #3: debug info offset of the correspondent compile unit. + Writer->writeLE32(Offset); + + // Header field #4: address size. + // 8 since we only write ELF64 binaries for now. + Writer->write8(8); + + // Header field #5: segment size of target architecture. + Writer->write8(0); + + // Padding before address table - 4 bytes in the 64-bit-pointer case. + Writer->writeLE32(0); + + WriteAddressRanges(Writer, AddressRanges, true); + } +} + +void DebugLocWriter::write(const LocationList &LocList, + MCObjectWriter *Writer) { + // Reference: DWARF 4 specification section 7.7.3. + UpdatedOffsets[LocList.getOriginalOffset()] = SectionOffset; + auto AbsoluteRanges = LocList.getAbsoluteAddressRanges(); + + for (const auto &Entry : LocList.getAbsoluteAddressRanges()) { + Writer->writeLE64(Entry.Begin); + Writer->writeLE64(Entry.End); + assert(Entry.Data && "Entry with null location expression."); + Writer->writeLE16(Entry.Data->size()); + + // Need to convert binary data from unsigned char to char. 
+ Writer->writeBytes( + StringRef(reinterpret_cast(Entry.Data->data()), + Entry.Data->size())); + + SectionOffset += 2 * 8 + 2 + Entry.Data->size(); + } + Writer->writeLE64(0); + Writer->writeLE64(0); + SectionOffset += 2 * 8; +} + +void SimpleBinaryPatcher::addBinaryPatch(uint32_t Offset, + const std::string &NewValue) { + Patches.emplace_back(std::make_pair(Offset, NewValue)); +} + +void SimpleBinaryPatcher::addBytePatch(uint32_t Offset, uint8_t Value) { + Patches.emplace_back(std::make_pair(Offset, std::string(1, Value))); +} + +void SimpleBinaryPatcher::addLEPatch(uint32_t Offset, uint64_t NewValue, + size_t ByteSize) { + std::string LE64(ByteSize, 0); + for (size_t I = 0; I < ByteSize; ++I) { + LE64[I] = NewValue & 0xff; + NewValue >>= 8; + } + Patches.emplace_back(std::make_pair(Offset, LE64)); +} + +void SimpleBinaryPatcher::addLE64Patch(uint32_t Offset, uint64_t NewValue) { + addLEPatch(Offset, NewValue, 8); +} + +void SimpleBinaryPatcher::addLE32Patch(uint32_t Offset, uint32_t NewValue) { + addLEPatch(Offset, NewValue, 4); +} + +void SimpleBinaryPatcher::patchBinary(std::string &BinaryContents) { + for (const auto &Patch : Patches) { + uint32_t Offset = Patch.first; + const std::string &ByteSequence = Patch.second; + assert(Offset + ByteSequence.size() <= BinaryContents.size() && + "Applied patch runs over binary size."); + for (uint64_t I = 0, Size = ByteSequence.size(); I < Size; ++I) { + BinaryContents[Offset + I] = ByteSequence[I]; + } + } +} + +void DebugAbbrevPatcher::addAttributePatch(const DWARFUnit *Unit, + uint32_t AbbrevCode, + uint16_t AttrTag, + uint8_t NewAttrTag, + uint8_t NewAttrForm) { + assert(Unit && "No compile unit specified."); + Patches[Unit].push_back( + AbbrevAttrPatch{AbbrevCode, AttrTag, NewAttrTag, NewAttrForm}); +} + +void DebugAbbrevPatcher::patchBinary(std::string &Contents) { + SimpleBinaryPatcher Patcher; + + for (const auto &UnitPatchesPair : Patches) { + const auto *Unit = UnitPatchesPair.first; + const auto 
*UnitAbbreviations = Unit->getAbbreviations(); + assert(UnitAbbreviations && + "Compile unit doesn't have associated abbreviations."); + const auto &UnitPatches = UnitPatchesPair.second; + for (const auto &AttrPatch : UnitPatches) { + const auto *AbbreviationDeclaration = + UnitAbbreviations->getAbbreviationDeclaration(AttrPatch.Code); + assert(AbbreviationDeclaration && "No abbreviation with given code."); + const auto *Attribute = AbbreviationDeclaration->findAttribute( + AttrPatch.Attr); + + assert(Attribute && "Specified attribute doesn't occur in abbreviation."); + // Because we're only handling standard values (i.e. no DW_FORM_GNU_* or + // DW_AT_APPLE_*), they are all small (< 128) and encoded in a single + // byte in ULEB128, otherwise it'll be more tricky as we may need to + // grow or shrink the section. + Patcher.addBytePatch(Attribute->AttrOffset, + AttrPatch.NewAttr); + Patcher.addBytePatch(Attribute->FormOffset, + AttrPatch.NewForm); + } + } + Patcher.patchBinary(Contents); +} + + + +} // namespace bolt +} // namespace llvm diff --git a/bolt/DebugData.h b/bolt/DebugData.h new file mode 100644 index 000000000000..7152a4515cd0 --- /dev/null +++ b/bolt/DebugData.h @@ -0,0 +1,355 @@ +//===-- DebugData.h - Representation and writing of debugging information. -==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Classes that represent and serialize DWARF-related entities. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_DEBUG_DATA_H +#define LLVM_TOOLS_LLVM_BOLT_DEBUG_DATA_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" +#include "llvm/Support/SMLoc.h" +#include +#include +#include +#include + +namespace llvm { + +class DWARFCompileUnit; +class DWARFDebugInfoEntryMinimal; +class MCObjectWriter; + +namespace bolt { + +class BasicBlockTable; +class BinaryBasicBlock; +class BinaryFunction; + +/// References a row in a DWARFDebugLine::LineTable by the DWARF +/// Context index of the DWARF Compile Unit that owns the Line Table and the row +/// index. This is tied to our IR during disassembly so that we can later update +/// .debug_line information. RowIndex has a base of 1, which means a RowIndex +/// of 1 maps to the first row of the line table and a RowIndex of 0 is invalid. +struct DebugLineTableRowRef { + uint32_t DwCompileUnitIndex; + uint32_t RowIndex; + + const static DebugLineTableRowRef NULL_ROW; + + bool operator==(const DebugLineTableRowRef &Rhs) const { + return DwCompileUnitIndex == Rhs.DwCompileUnitIndex && + RowIndex == Rhs.RowIndex; + } + + bool operator!=(const DebugLineTableRowRef &Rhs) const { + return !(*this == Rhs); + } + + static DebugLineTableRowRef fromSMLoc(const SMLoc &Loc) { + union { + decltype(Loc.getPointer()) Ptr; + DebugLineTableRowRef Ref; + } U; + U.Ptr = Loc.getPointer(); + return U.Ref; + } + + SMLoc toSMLoc() const { + union { + decltype(SMLoc().getPointer()) Ptr; + DebugLineTableRowRef Ref; + } U; + U.Ref = *this; + return SMLoc::getFromPointer(U.Ptr); + } +}; + +/// Represents a list of address ranges where addresses are relative to the +/// beginning of basic blocks. Useful for converting address ranges in the input +/// binary to equivalent ranges after optimizations take place. 
+class BasicBlockOffsetRanges { +public: + typedef SmallVectorImpl BinaryData; + struct AbsoluteRange { + uint64_t Begin; + uint64_t End; + const BinaryData *Data; + }; + + /// Add range [BeginAddress, EndAddress) to the address ranges list. + /// \p Function is the function that contains the given address range. + void addAddressRange(BinaryFunction &Function, + uint64_t BeginAddress, + uint64_t EndAddress, + const BinaryData *Data = nullptr); + + /// Returns the list of absolute addresses calculated using the output address + /// of the basic blocks, i.e. the input ranges updated after basic block + /// addresses might have changed, together with the data associated to them. + std::vector getAbsoluteAddressRanges() const; + +private: + /// An address range inside one basic block. + struct BBAddressRange { + const BinaryBasicBlock *BasicBlock; + /// Beginning of the range counting from BB's start address. + uint16_t RangeBeginOffset; + /// (Exclusive) end of the range counting from BB's start address. + uint16_t RangeEndOffset; + /// Binary data associated with this range. + const BinaryData *Data; + }; + + std::vector AddressRanges; +}; + +/// Abstract interface for classes that represent objects that have +/// associated address ranges in .debug_ranges. These address ranges can +/// be serialized by DebugRangesSectionsWriter which notifies the object +/// of where in the section its address ranges list was written. +class AddressRangesOwner { +public: + virtual void setAddressRangesOffset(uint32_t Offset) = 0; +}; + +/// Represents DWARF entities that have generic address ranges, maintaining +/// their address ranges to be updated on the output debugging information. +class AddressRangesDWARFObject : public AddressRangesOwner { +public: + AddressRangesDWARFObject(const DWARFCompileUnit *CU, + const DWARFDebugInfoEntryMinimal *DIE) + : CU(CU), DIE(DIE) { } + + /// Add range [BeginAddress, EndAddress) to this object. 
+ void addAddressRange(BinaryFunction &Function, + uint64_t BeginAddress, + uint64_t EndAddress) { + BBOffsetRanges.addAddressRange(Function, BeginAddress, EndAddress); + } + + std::vector> getAbsoluteAddressRanges() const { + auto AddressRangesWithData = BBOffsetRanges.getAbsoluteAddressRanges(); + std::vector> AddressRanges( + AddressRangesWithData.size()); + for (unsigned I = 0, S = AddressRanges.size(); I != S; ++I) { + AddressRanges[I] = std::make_pair(AddressRangesWithData[I].Begin, + AddressRangesWithData[I].End); + } + return AddressRanges; + } + + void setAddressRangesOffset(uint32_t Offset) { AddressRangesOffset = Offset; } + + uint32_t getAddressRangesOffset() const { return AddressRangesOffset; } + + const DWARFCompileUnit *getCompileUnit() const { return CU; } + const DWARFDebugInfoEntryMinimal *getDIE() const { return DIE; } + +private: + const DWARFCompileUnit *CU; + const DWARFDebugInfoEntryMinimal *DIE; + + BasicBlockOffsetRanges BBOffsetRanges; + + // Offset of the address ranges of this object in the output .debug_ranges. + uint32_t AddressRangesOffset; +}; + + + +/// Represents DWARF location lists, maintaining their list of location +/// expressions and the address ranges in which they are valid to be updated in +/// the output debugging information. +class LocationList { +public: + LocationList(uint32_t Offset) : DebugLocOffset(Offset) { } + + /// Add a location expression that is valid in [BeginAddress, EndAddress) + /// within Function to location list. 
+ void addLocation(const BasicBlockOffsetRanges::BinaryData *Expression, + BinaryFunction &Function, + uint64_t BeginAddress, + uint64_t EndAddress) { + BBOffsetRanges.addAddressRange(Function, BeginAddress, EndAddress, + Expression); + } + + std::vector + getAbsoluteAddressRanges() const { + return BBOffsetRanges.getAbsoluteAddressRanges(); + } + + uint32_t getOriginalOffset() const { return DebugLocOffset; } + +private: + BasicBlockOffsetRanges BBOffsetRanges; + + // Offset of this location list in the input .debug_loc section. + uint32_t DebugLocOffset; +}; + +/// Serializes the .debug_ranges and .debug_aranges DWARF sections. +class DebugRangesSectionsWriter { +public: + DebugRangesSectionsWriter() = default; + + /// Adds a range to the .debug_arange section. + void AddRange(uint32_t CompileUnitOffset, uint64_t Address, uint64_t Size); + + /// Adds an address range that belongs to a given object. + /// When .debug_ranges is written, the offset of the range corresponding + /// to the function will be set using BF->setAddressRangesOffset(). + void AddRange(AddressRangesOwner *ARO, uint64_t Address, uint64_t Size); + + using RangesCUMapType = std::map; + + /// Writes .debug_aranges with the added ranges to the MCObjectWriter. + void WriteArangesSection(MCObjectWriter *Writer) const; + + /// Writes .debug_ranges with the added ranges to the MCObjectWriter. + void WriteRangesSection(MCObjectWriter *Writer); + + /// Resets the writer to a clear state. + void reset() { + CUAddressRanges.clear(); + ObjectAddressRanges.clear(); + RangesSectionOffsetCUMap.clear(); + } + + /// Return mapping of CUs to offsets in .debug_ranges. 
+ const RangesCUMapType &getRangesOffsetCUMap() const { + return RangesSectionOffsetCUMap; + } + + /// Returns an offset of an empty address ranges list that is always written + /// to .debug_ranges + uint32_t getEmptyRangesListOffset() const { return EmptyRangesListOffset; } + +private: + // Map from compile unit offset to the list of address intervals that belong + // to that compile unit. Each interval is a pair + // (first address, interval size). + std::map>> + CUAddressRanges; + + // Map from BinaryFunction to the list of address intervals that belong + // to that function, represented like CUAddressRanges. + std::map>> + ObjectAddressRanges; + + // Offset of an empty address ranges list. + uint32_t EmptyRangesListOffset; + + /// When writing data to .debug_ranges remember offset per CU. + RangesCUMapType RangesSectionOffsetCUMap; +}; + +/// Serializes the .debug_loc DWARF section with LocationLists. +class DebugLocWriter { +public: + /// Writes the given location list to the writer. + void write(const LocationList &LocList, MCObjectWriter *Writer); + + using UpdatedOffsetMapType = std::map; + + /// Returns mapping from offsets in the input .debug_loc to offsets in the + /// output .debug_loc section with the corresponding updated location list + /// entry. + const UpdatedOffsetMapType &getUpdatedLocationListOffsets() const { + return UpdatedOffsets; + } + +private: + /// Current offset in the section (updated as new entries are written). + uint32_t SectionOffset{0}; + + /// Map from input offsets to output offsets for location lists that were + /// updated, generated after write(). + UpdatedOffsetMapType UpdatedOffsets; +}; + +/// Abstract interface for classes that apply modifications to a binary string. +class BinaryPatcher { +public: + virtual ~BinaryPatcher() {} + /// Applies in-place modifications to the binary string \p BinaryContents . 
+ virtual void patchBinary(std::string &BinaryContents) = 0; +}; + +/// Applies simple modifications to a binary string, such as directly replacing +/// the contents of a certain portion with a string or an integer. +class SimpleBinaryPatcher : public BinaryPatcher { +private: + std::vector> Patches; + + /// Adds a patch to replace the contents of \p ByteSize bytes with the integer + /// \p NewValue encoded in little-endian, with the least-significant byte + /// being written at the offset \p Offset . + void addLEPatch(uint32_t Offset, uint64_t NewValue, size_t ByteSize); + +public: + ~SimpleBinaryPatcher() {} + + /// Adds a patch to replace the contents of the binary string starting at the + /// specified \p Offset with the string \p NewValue. + void addBinaryPatch(uint32_t Offset, const std::string &NewValue); + + /// Adds a patch to replace the contents of a single byte of the string, at + /// the offset \p Offset, with the value \Value . + void addBytePatch(uint32_t Offset, uint8_t Value); + + /// Adds a patch to put the integer \p NewValue encoded as a 64-bit + /// little-endian value at offset \p Offset. + void addLE64Patch(uint32_t Offset, uint64_t NewValue); + + /// Adds a patch to put the integer \p NewValue encoded as a 32-bit + /// little-endian value at offset \p Offset. + void addLE32Patch(uint32_t Offset, uint32_t NewValue); + + void patchBinary(std::string &BinaryContents) override; +}; + +/// Apply small modifications to the .debug_abbrev DWARF section. +class DebugAbbrevPatcher : public BinaryPatcher { +private: + /// Patch of changing one attribute to another. + struct AbbrevAttrPatch { + uint32_t Code; // Code of abbreviation to be modified. + uint16_t Attr; // ID of attribute to be replaced. + uint8_t NewAttr; // ID of the new attribute. + uint8_t NewForm; // Form of the new attribute. 
+ }; + + std::map> Patches; + +public: + ~DebugAbbrevPatcher() { } + /// Adds a patch to change an attribute of an abbreviation that belongs to + /// \p Unit to another attribute. + /// \p AbbrevCode code of the abbreviation to be modified. + /// \p AttrTag ID of the attribute to be replaced. + /// \p NewAttrTag ID of the new attribute. + /// \p NewAttrForm Form of the new attribute. + /// We only handle standard forms, that are encoded in a single byte. + void addAttributePatch(const DWARFUnit *Unit, + uint32_t AbbrevCode, + uint16_t AttrTag, + uint8_t NewAttrTag, + uint8_t NewAttrForm); + + void patchBinary(std::string &Contents) override; +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/DebugLineTableRowRef.cpp b/bolt/DebugLineTableRowRef.cpp deleted file mode 100644 index d8db983516c0..000000000000 --- a/bolt/DebugLineTableRowRef.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===--- DebugLineTableRowRef.cpp - Identifies a row in a .debug_line table ==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#include "DebugLineTableRowRef.h" - - -namespace llvm { -namespace bolt { - -const DebugLineTableRowRef DebugLineTableRowRef::NULL_ROW{0, 0}; - -} // namespace bolt -} // namespace llvm diff --git a/bolt/DebugLineTableRowRef.h b/bolt/DebugLineTableRowRef.h deleted file mode 100644 index 5af011798421..000000000000 --- a/bolt/DebugLineTableRowRef.h +++ /dev/null @@ -1,64 +0,0 @@ -//===--- DebugLineTableRowRef.h - Identifies a row in a .debug_line table -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// Class that references a row in a DWARFDebugLine::LineTable by the DWARF -// Context index of the DWARF Compile Unit that owns the Line Table and the row -// index. This is tied to our IR during disassembly so that we can later update -// .debug_line information. The RowIndex has a base of 1, which means a RowIndex -// of 1 maps to the first row of the line table and a RowIndex of 0 is invalid. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_BOLT_DEBUGLINETABLEROWREF_H -#define LLVM_TOOLS_LLVM_BOLT_DEBUGLINETABLEROWREF_H - -#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" -#include "llvm/Support/SMLoc.h" - -namespace llvm { -namespace bolt { - -struct DebugLineTableRowRef { - uint32_t DwCompileUnitIndex; - uint32_t RowIndex; - - const static DebugLineTableRowRef NULL_ROW; - - bool operator==(const DebugLineTableRowRef &Rhs) const { - return DwCompileUnitIndex == Rhs.DwCompileUnitIndex && - RowIndex == Rhs.RowIndex; - } - - bool operator!=(const DebugLineTableRowRef &Rhs) const { - return !(*this == Rhs); - } - - static DebugLineTableRowRef fromSMLoc(const SMLoc &Loc) { - union { - decltype(Loc.getPointer()) Ptr; - DebugLineTableRowRef Ref; - } U; - U.Ptr = Loc.getPointer(); - return U.Ref; - } - - SMLoc toSMLoc() const { - union { - decltype(SMLoc().getPointer()) Ptr; - DebugLineTableRowRef Ref; - } U; - U.Ref = *this; - return SMLoc::getFromPointer(U.Ptr); - } -}; - -} // namespace bolt -} // namespace llvm - -#endif diff --git a/bolt/DebugLocWriter.cpp b/bolt/DebugLocWriter.cpp deleted file mode 100644 index e2c0e84dbbf0..000000000000 --- a/bolt/DebugLocWriter.cpp +++ /dev/null @@ -1,45 +0,0 @@ -//===-- DebugLocWriter.cpp - Writes the DWARF .debug_loc section. ----------==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#include "DebugLocWriter.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCObjectWriter.h" -#include - -namespace llvm { -namespace bolt { - -void DebugLocWriter::write(const LocationList &LocList, - MCObjectWriter *Writer) { - // Reference: DWARF 4 specification section 7.7.3. - UpdatedOffsets[LocList.getOriginalOffset()] = SectionOffset; - auto AbsoluteRanges = LocList.getAbsoluteAddressRanges(); - - for (const auto &Entry : LocList.getAbsoluteAddressRanges()) { - Writer->writeLE64(Entry.Begin); - Writer->writeLE64(Entry.End); - assert(Entry.Data && "Entry with null location expression."); - Writer->writeLE16(Entry.Data->size()); - - // Need to convert binary data from unsigned char to char. - Writer->writeBytes( - StringRef(reinterpret_cast(Entry.Data->data()), - Entry.Data->size())); - - SectionOffset += 2 * 8 + 2 + Entry.Data->size(); - } - Writer->writeLE64(0); - Writer->writeLE64(0); - SectionOffset += 2 * 8; -} - -} // namespace bolt -} // namespace llvm diff --git a/bolt/DebugLocWriter.h b/bolt/DebugLocWriter.h deleted file mode 100644 index c0c60fd8c9ee..000000000000 --- a/bolt/DebugLocWriter.h +++ /dev/null @@ -1,53 +0,0 @@ -//===-- DebugLocWriter.h - Writes the DWARF .debug_loc section -------------==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Class that serializes the .debug_loc section given LocationLists. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_BOLT_DEBUG_LOC_WRITER_H -#define LLVM_TOOLS_LLVM_BOLT_DEBUG_LOC_WRITER_H - -#include "LocationList.h" -#include -#include - -namespace llvm { - -class MCObjectWriter; - -namespace bolt { - -class DebugLocWriter { -public: - /// Writes the given location list to the writer. - void write(const LocationList &LocList, MCObjectWriter *Writer); - - using UpdatedOffsetMapType = std::map; - - /// Returns mapping from offsets in the input .debug_loc to offsets in the - /// output .debug_loc section with the corresponding updated location list - /// entry. - const UpdatedOffsetMapType &getUpdatedLocationListOffsets() const { - return UpdatedOffsets; - } - -private: - /// Current offset in the section (updated as new entries are written). - uint32_t SectionOffset{0}; - - /// Map from input offsets to output offsets for location lists that were - /// updated, generated after write(). - UpdatedOffsetMapType UpdatedOffsets; -}; - -} // namespace bolt -} // namespace llvm - -#endif diff --git a/bolt/DebugRangesSectionsWriter.cpp b/bolt/DebugRangesSectionsWriter.cpp deleted file mode 100644 index 1e88b7038d27..000000000000 --- a/bolt/DebugRangesSectionsWriter.cpp +++ /dev/null @@ -1,119 +0,0 @@ -//===-- DebugRangesSectionsWriter.h - Writes DWARF address ranges sections -==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "DebugRangesSectionsWriter.h" -#include "BinaryFunction.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCObjectWriter.h" - -namespace llvm { -namespace bolt { - -void DebugRangesSectionsWriter::AddRange(uint32_t CompileUnitOffset, - uint64_t Address, - uint64_t Size) { - CUAddressRanges[CompileUnitOffset].push_back(std::make_pair(Address, Size)); -} - -void DebugRangesSectionsWriter::AddRange(AddressRangesOwner *BF, - uint64_t Address, - uint64_t Size) { - ObjectAddressRanges[BF].push_back(std::make_pair(Address, Size)); -} - -namespace { - -// Writes address ranges to Writer as pairs of 64-bit (address, size). -// If RelativeRange is true, assumes the address range to be written must be of -// the form (begin address, range size), otherwise (begin address, end address). -// Terminates the list by writing a pair of two zeroes. -// Returns the number of written bytes. -uint32_t WriteAddressRanges( - MCObjectWriter *Writer, - const std::vector> &AddressRanges, - bool RelativeRange) { - // Write entries. - for (auto &Range : AddressRanges) { - Writer->writeLE64(Range.first); - Writer->writeLE64((!RelativeRange) * Range.first + Range.second); - } - // Finish with 0 entries. 
- Writer->writeLE64(0); - Writer->writeLE64(0); - return AddressRanges.size() * 16 + 16; -} - -} // namespace - -void DebugRangesSectionsWriter::WriteRangesSection(MCObjectWriter *Writer) { - uint32_t SectionOffset = 0; - for (const auto &CUOffsetAddressRangesPair : CUAddressRanges) { - uint64_t CUOffset = CUOffsetAddressRangesPair.first; - RangesSectionOffsetCUMap[CUOffset] = SectionOffset; - const auto &AddressRanges = CUOffsetAddressRangesPair.second; - SectionOffset += WriteAddressRanges(Writer, AddressRanges, false); - } - - for (const auto &BFAddressRangesPair : ObjectAddressRanges) { - BFAddressRangesPair.first->setAddressRangesOffset(SectionOffset); - const auto &AddressRanges = BFAddressRangesPair.second; - SectionOffset += WriteAddressRanges(Writer, AddressRanges, false); - } - - // Write an empty address list to be used for objects with unknown address - // ranges. - EmptyRangesListOffset = SectionOffset; - SectionOffset += WriteAddressRanges( - Writer, - std::vector>{}, - false); -} - -void -DebugRangesSectionsWriter::WriteArangesSection(MCObjectWriter *Writer) const { - // For reference on the format of the .debug_aranges section, see the DWARF4 - // specification, section 6.1.4 Lookup by Address - // http://www.dwarfstd.org/doc/DWARF4.pdf - for (const auto &CUOffsetAddressRangesPair : CUAddressRanges) { - uint64_t Offset = CUOffsetAddressRangesPair.first; - const auto &AddressRanges = CUOffsetAddressRangesPair.second; - - // Emit header. - - // Size of this set: 8 (size of the header) + 4 (padding after header) - // + 2*sizeof(uint64_t) bytes for each of the ranges, plus an extra - // pair of uint64_t's for the terminating, zero-length range. - // Does not include size field itself. - uint64_t Size = 8 + 4 + 2*sizeof(uint64_t) * (AddressRanges.size() + 1); - - // Header field #1: set size. - Writer->writeLE32(Size); - - // Header field #2: version number, 2 as per the specification. 
- Writer->writeLE16(2); - - // Header field #3: debug info offset of the correspondent compile unit. - Writer->writeLE32(Offset); - - // Header field #4: address size. - // 8 since we only write ELF64 binaries for now. - Writer->write8(8); - - // Header field #5: segment size of target architecture. - Writer->write8(0); - - // Padding before address table - 4 bytes in the 64-bit-pointer case. - Writer->writeLE32(0); - - WriteAddressRanges(Writer, AddressRanges, true); - } -} - -} // namespace bolt -} // namespace llvm diff --git a/bolt/DebugRangesSectionsWriter.h b/bolt/DebugRangesSectionsWriter.h deleted file mode 100644 index b6331f71981d..000000000000 --- a/bolt/DebugRangesSectionsWriter.h +++ /dev/null @@ -1,94 +0,0 @@ -//===-- DebugRangesSectionsWriter.h - Writes DWARF address ranges sections -==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Class that serializes the .debug_ranges and .debug_aranges sections. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_BOLT_DEBUG_RANGES_SECTIONS_WRITER_H -#define LLVM_TOOLS_LLVM_BOLT_DEBUG_RANGES_SECTIONS_WRITER_H - -#include -#include -#include - -namespace llvm { - -class MCObjectWriter; - -namespace bolt { - -/// Abstract interface for classes that represent objects that have -/// associated address ranges in .debug_ranges. These address ranges can -/// be serialized by DebugRangesSectionsWriter which notifies the object -/// of where in the section its address ranges list was written. -class AddressRangesOwner { -public: - virtual void setAddressRangesOffset(uint32_t Offset) = 0; -}; - -class DebugRangesSectionsWriter { -public: - DebugRangesSectionsWriter() = default; - - /// Adds a range to the .debug_arange section. 
- void AddRange(uint32_t CompileUnitOffset, uint64_t Address, uint64_t Size); - - /// Adds an address range that belongs to a given object. - /// When .debug_ranges is written, the offset of the range corresponding - /// to the function will be set using BF->setAddressRangesOffset(). - void AddRange(AddressRangesOwner *ARO, uint64_t Address, uint64_t Size); - - using RangesCUMapType = std::map; - - /// Writes .debug_aranges with the added ranges to the MCObjectWriter. - void WriteArangesSection(MCObjectWriter *Writer) const; - - /// Writes .debug_ranges with the added ranges to the MCObjectWriter. - void WriteRangesSection(MCObjectWriter *Writer); - - /// Resets the writer to a clear state. - void reset() { - CUAddressRanges.clear(); - ObjectAddressRanges.clear(); - RangesSectionOffsetCUMap.clear(); - } - - /// Return mapping of CUs to offsets in .debug_ranges. - const RangesCUMapType &getRangesOffsetCUMap() const { - return RangesSectionOffsetCUMap; - } - - /// Returns an offset of an empty address ranges list that is always written - /// to .debug_ranges - uint32_t getEmptyRangesListOffset() const { return EmptyRangesListOffset; } - -private: - // Map from compile unit offset to the list of address intervals that belong - // to that compile unit. Each interval is a pair - // (first address, interval size). - std::map>> - CUAddressRanges; - - // Map from BinaryFunction to the list of address intervals that belong - // to that function, represented like CUAddressRanges. - std::map>> - ObjectAddressRanges; - - // Offset of an empty address ranges list. - uint32_t EmptyRangesListOffset; - - /// When writing data to .debug_ranges remember offset per CU. 
- RangesCUMapType RangesSectionOffsetCUMap; -}; - -} // namespace bolt -} // namespace llvm - -#endif diff --git a/bolt/LocationList.h b/bolt/LocationList.h deleted file mode 100644 index 7cf4fab14507..000000000000 --- a/bolt/LocationList.h +++ /dev/null @@ -1,61 +0,0 @@ -//===--- LocationList.h - DWARF location lists ----------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Represents DWARF location lists, maintaining their list of location -// expressions and the address ranges in which they are valid to be updated in -// the output debugging information. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_BOLT_LOCATION_LIST_H -#define LLVM_TOOLS_LLVM_BOLT_LOCATION_LIST_H - -#include "BasicBlockOffsetRanges.h" - -namespace llvm { - -class DWARFCompileUnit; -class DWARFDebugInfoEntryMinimal; - -namespace bolt { - -class BinaryBasicBlock; - -class LocationList { -public: - LocationList(uint32_t Offset) : DebugLocOffset(Offset) { } - - /// Add a location expression that is valid in [BeginAddress, EndAddress) - /// within Function to location list. - void addLocation(const BasicBlockOffsetRanges::BinaryData *Expression, - BinaryFunction &Function, - uint64_t BeginAddress, - uint64_t EndAddress) { - BBOffsetRanges.addAddressRange(Function, BeginAddress, EndAddress, - Expression); - } - - std::vector - getAbsoluteAddressRanges() const { - return BBOffsetRanges.getAbsoluteAddressRanges(); - } - - uint32_t getOriginalOffset() const { return DebugLocOffset; } - -private: - BasicBlockOffsetRanges BBOffsetRanges; - - // Offset of this location list in the input .debug_loc section. 
- uint32_t DebugLocOffset; -}; - -} // namespace bolt -} // namespace llvm - -#endif diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 92df056531a9..5d8e7cc71970 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -14,7 +14,6 @@ #include "BinaryContext.h" #include "BinaryFunction.h" #include "DataReader.h" -#include "DebugLineTableRowRef.h" #include "Exceptions.h" #include "RewriteInstance.h" #include "llvm/ADT/STLExtras.h" diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index d9375fa22a9d..85cc84ced20c 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -14,9 +14,7 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_REWRITE_INSTANCE_H #define LLVM_TOOLS_LLVM_BOLT_REWRITE_INSTANCE_H -#include "BinaryPatcher.h" -#include "DebugLocWriter.h" -#include "DebugRangesSectionsWriter.h" +#include "DebugData.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/Object/ELFObjectFile.h" From 662a355a33d8a2eb161b4657a7a65de59bc9f1e5 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 19 Apr 2016 22:00:29 -0700 Subject: [PATCH 098/904] Special handling for GNU_args_size call frame instruction. Summary: GNU_args_size is a special kind of CFI that tells runtime to adjust %rsp when control is passed to a landing pad. It is used for annotating call instructions that pass (extra) parameters on the stack and there's a corresponding landing pad. It is also special in a way that its value is not handled by DW_CFA_remember_state/DW_CFA_restore_state instruction sequence that we utilize to restore the state after block re-ordering. This diff adds association of call instructions with GNU_args_size value when it's used. If the function does not use GNU_args_size, there is no overhead. Otherwise, we regenerate GNU_args_size instruction during code emission, i.e. after all optimizations and block-reordering. 
(cherry picked from commit 0fd858dec5b10420feb3640f1f3b917b67c48911) --- bolt/BinaryFunction.cpp | 62 +++++++++++++++++++++++++++++---- bolt/BinaryFunction.h | 43 +++++++++++++++++------ bolt/Exceptions.cpp | 1 + bolt/RewriteInstance.cpp | 74 +++++++++++++++++++++++----------------- 4 files changed, 130 insertions(+), 50 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 0f77c756e36c..ca359385987b 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -213,6 +213,9 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, else OS << '0'; OS << "; action: " << Action; + auto GnuArgsSize = BC.MIA->getGnuArgsSize(Instruction); + if (GnuArgsSize >= 0) + OS << "; GNU_args_size = " << GnuArgsSize; } } if (opts::PrintDebugInfo && LineTable) { @@ -825,6 +828,9 @@ bool BinaryFunction::buildCFG() { // Update the state. CurrentState = State::CFG; + // Annotate invoke instructions with GNU_args_size data. + propagateGnuArgsSizeInfo(); + return true; } @@ -939,15 +945,13 @@ void BinaryFunction::annotateCFIState() { ++HighestState; if (CFI->getOperation() == MCCFIInstruction::OpRememberState) { StateStack.push(State); - continue; - } - if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) { + } else if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) { assert(!StateStack.empty() && "Corrupt CFI stack"); State = StateStack.top(); StateStack.pop(); - continue; + } else if (CFI->getOperation() != MCCFIInstruction::OpGnuArgsSize) { + State = HighestState; } - State = HighestState; } } @@ -995,8 +999,12 @@ bool BinaryFunction::fixCFIState() { } for (auto CFI : NewCFIs) { - InsertIt = addCFIPseudo(InBB, InsertIt, CFI); - ++InsertIt; + // Ignore GNU_args_size instructions. 
+ if (FrameInstructions[CFI].getOperation() != + MCCFIInstruction::OpGnuArgsSize) { + InsertIt = addCFIPseudo(InBB, InsertIt, CFI); + ++InsertIt; + } } return true; @@ -1658,5 +1666,45 @@ void BinaryFunction::splitFunction() { } } +void BinaryFunction::propagateGnuArgsSizeInfo() { + assert(CurrentState == State::CFG && "unexpected function state"); + + if (!hasEHRanges() || !usesGnuArgsSize()) + return; + + // The current value of DW_CFA_GNU_args_size affects all following + // invoke instructions untill the next CFI overrides it. + // It is important to iterate basic blocks in the original order when + // assigning the value. + uint64_t CurrentGnuArgsSize = 0; + for (auto &BB : BasicBlocks) { + for (auto II = BB.begin(); II != BB.end(); ) { + auto &Instr = *II; + if (BC.MIA->isCFI(Instr)) { + auto CFI = getCFIFor(Instr); + if (CFI->getOperation() == MCCFIInstruction::OpGnuArgsSize) { + CurrentGnuArgsSize = CFI->getOffset(); + // Delete DW_CFA_GNU_args_size instructions and only regenerate + // during the final code emission. The information is embedded + // inside call instructions. + II = BB.Instructions.erase(II); + } else { + ++II; + } + continue; + } + + if (BC.MIA->isInvoke(Instr)) { + // Add the value of GNU_args_size as an extra operand if landing pad + // is non-emptry. + if (BC.MIA->getEHInfo(Instr).first) { + Instr.addOperand(MCOperand::createImm(CurrentGnuArgsSize)); + } + } + ++II; + } + } +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index de1c0fbabe08..3df5247adb06 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -122,13 +122,6 @@ class BinaryFunction : public AddressRangesOwner { /// Alignment requirements for the function. uint64_t Alignment{1}; - /// True if this function needs to be emitted in two separate parts, one for - /// the hot basic blocks and another for the cold basic blocks. 
- bool IsSplit{false}; - - /// Indicate if this function has associated exception handling metadata. - bool HasEHRanges{false}; - MCSymbol *PersonalityFunction{nullptr}; uint8_t PersonalityEncoding{dwarf::DW_EH_PE_sdata4 | dwarf::DW_EH_PE_pcrel}; @@ -138,6 +131,16 @@ class BinaryFunction : public AddressRangesOwner { /// flow graph and re-assemble. bool IsSimple{true}; + /// True if this function needs to be emitted in two separate parts, one for + /// the hot basic blocks and another for the cold basic blocks. + bool IsSplit{false}; + + /// Indicate if this function has associated exception handling metadata. + bool HasEHRanges{false}; + + /// True if the function uses DW_CFA_GNU_args_size CFIs. + bool UsesGnuArgsSize{false}; + /// The address for the code for this function in codegen memory. uint64_t ImageAddress{0}; @@ -440,10 +443,21 @@ class BinaryFunction : public AddressRangesOwner { return IsSimple; } + /// Return true if the function body is non-contiguous. bool isSplit() const { return IsSplit; } + /// Return true if the function has exception handling tables. + bool hasEHRanges() const { + return HasEHRanges; + } + + /// Return true if the function uses DW_CFA_GNU_args_size CFIs. + bool usesGnuArgsSize() const { + return UsesGnuArgsSize; + } + MCSymbol *getPersonalityFunction() const { return PersonalityFunction; } @@ -531,7 +545,8 @@ class BinaryFunction : public AddressRangesOwner { // with NOPs and then reorder it away. // We fix this by moving the CFI instruction just before any NOPs. auto I = Instructions.lower_bound(Offset); - if (I == Instructions.end() && Offset == getSize()) { + if (Offset == getSize()) { + assert(I == Instructions.end() && "unexpected iterator value"); // Sometimes compiler issues restore_state after all instructions // in the function (even after nop). 
--I; @@ -593,6 +608,11 @@ class BinaryFunction : public AddressRangesOwner { return *this; } + BinaryFunction &setUsesGnuArgsSize(bool Uses = true) { + UsesGnuArgsSize = Uses; + return *this; + } + BinaryFunction &setPersonalityFunction(uint64_t Addr) { PersonalityFunction = BC.getOrCreateGlobalSymbol(Addr, "FUNCat"); return *this; @@ -737,6 +757,10 @@ class BinaryFunction : public AddressRangesOwner { /// is corrupted. If it is unable to fix it, it returns false. bool fixCFIState(); + /// Associate DW_CFA_GNU_args_size info with invoke instructions + /// (call instructions with non-empty landing pad). + void propagateGnuArgsSizeInfo(); + /// Traverse the CFG checking branches, inverting their condition, removing or /// adding jumps based on a new layout order. void fixBranches(); @@ -751,9 +775,6 @@ class BinaryFunction : public AddressRangesOwner { /// Update exception handling ranges for the function. void updateEHRanges(); - /// Return true if the function has exception handling tables. - bool hasEHRanges() const { return HasEHRanges; } - /// Emit exception handling ranges for the function. void emitLSDA(MCStreamer *Streamer); diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index c746cf6dca95..2734d4e2c389 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -611,6 +611,7 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { Function.addCFIInstruction( Offset, MCCFIInstruction::createGnuArgsSize(nullptr, Instr.Ops[0])); + Function.setUsesGnuArgsSize(); break; case DW_CFA_val_offset_sf: case DW_CFA_val_offset: diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 5d8e7cc71970..1650d220aa27 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1168,6 +1168,7 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, "first basic block should never be cold"); // Emit code. 
+ int64_t CurrentGnuArgsSize = 0; for (auto BB : Function.layout()) { if (EmitColdPart != BB->isCold()) continue; @@ -1189,41 +1190,50 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, Streamer.EmitLabel(const_cast(Label)); continue; } - if (!BC.MIA->isCFI(Instr)) { - if (opts::UpdateDebugSections) { - auto RowReference = DebugLineTableRowRef::fromSMLoc(Instr.getLoc()); - if (RowReference != DebugLineTableRowRef::NULL_ROW && - Instr.getLoc().getPointer() != LastLocSeen.getPointer()) { - auto CompileUnit = - BC.OffsetToDwarfCU[RowReference.DwCompileUnitIndex]; - assert(CompileUnit && - "Invalid CU offset set in instruction debug info."); - - auto OriginalLineTable = - BC.DwCtx->getLineTableForUnit( - CompileUnit); - const auto &OriginalRow = - OriginalLineTable->Rows[RowReference.RowIndex - 1]; - - BC.Ctx->setCurrentDwarfLoc( - OriginalRow.File, - OriginalRow.Line, - OriginalRow.Column, - (DWARF2_FLAG_IS_STMT * OriginalRow.IsStmt) | - (DWARF2_FLAG_BASIC_BLOCK * OriginalRow.BasicBlock) | - (DWARF2_FLAG_PROLOGUE_END * OriginalRow.PrologueEnd) | - (DWARF2_FLAG_EPILOGUE_BEGIN * OriginalRow.EpilogueBegin), - OriginalRow.Isa, - OriginalRow.Discriminator); - BC.Ctx->setDwarfCompileUnitID(CompileUnit->getOffset()); - LastLocSeen = Instr.getLoc(); - } + if (BC.MIA->isCFI(Instr)) { + emitCFIInstr(*Function.getCFIFor(Instr)); + continue; + } + if (opts::UpdateDebugSections) { + auto RowReference = DebugLineTableRowRef::fromSMLoc(Instr.getLoc()); + if (RowReference != DebugLineTableRowRef::NULL_ROW && + Instr.getLoc().getPointer() != LastLocSeen.getPointer()) { + auto CompileUnit = + BC.OffsetToDwarfCU[RowReference.DwCompileUnitIndex]; + assert(CompileUnit && + "Invalid CU offset set in instruction debug info."); + + auto OriginalLineTable = + BC.DwCtx->getLineTableForUnit( + CompileUnit); + const auto &OriginalRow = + OriginalLineTable->Rows[RowReference.RowIndex - 1]; + + BC.Ctx->setCurrentDwarfLoc( + OriginalRow.File, + OriginalRow.Line, + 
OriginalRow.Column, + (DWARF2_FLAG_IS_STMT * OriginalRow.IsStmt) | + (DWARF2_FLAG_BASIC_BLOCK * OriginalRow.BasicBlock) | + (DWARF2_FLAG_PROLOGUE_END * OriginalRow.PrologueEnd) | + (DWARF2_FLAG_EPILOGUE_BEGIN * OriginalRow.EpilogueBegin), + OriginalRow.Isa, + OriginalRow.Discriminator); + BC.Ctx->setDwarfCompileUnitID(CompileUnit->getOffset()); + LastLocSeen = Instr.getLoc(); } + } - Streamer.EmitInstruction(Instr, *BC.STI); - continue; + // Emit GNU_args_size CFIs as necessary. + if (Function.usesGnuArgsSize() && BC.MIA->isInvoke(Instr)) { + auto NewGnuArgsSize = BC.MIA->getGnuArgsSize(Instr); + if (NewGnuArgsSize >= 0 && NewGnuArgsSize != CurrentGnuArgsSize) { + CurrentGnuArgsSize = NewGnuArgsSize; + Streamer.EmitCFIGnuArgsSize(CurrentGnuArgsSize); + } } - emitCFIInstr(*Function.getCFIFor(Instr)); + + Streamer.EmitInstruction(Instr, *BC.STI); } MCSymbol *BBEndLabel = BC.Ctx->createTempSymbol(); From 3ab1bea28ec8ac75e7a97b90466fe5fe45349837 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 20 Apr 2016 15:31:11 -0700 Subject: [PATCH 099/904] Changed splitting options and fixed sorting. Summary: Splitting option now has different meanings/values. Since landing pads are mostly always cold/frozen, we should split them before anything else (we still check the execution count is 0). That's value '1'. Everything else goes on top of that and has increased value (2 - large functions, 3 - everything). Sorting was non-deterministic and somewhat broken for functions with EH ranges. Fixed that and added '-split-all-cold' option to outline all 0-count blocks. Fixed compilation of test cases. After my last commit the binaries were linked to wrong source files (i.e. debug info). Had to rebuild the binaries from updated sources. 
(cherry picked from commit 32f218b2aa057896a9ca3bf8de94ff7bc3c0b35d) --- bolt/BinaryFunction.cpp | 82 ++++++++++++++++++++++------------------ bolt/BinaryFunction.h | 6 ++- bolt/RewriteInstance.cpp | 12 ++++-- 3 files changed, 58 insertions(+), 42 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index ca359385987b..843f3ced3c73 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -36,6 +36,11 @@ namespace bolt { namespace opts { +static cl::opt +AgressiveSplitting("split-all-cold", + cl::desc("outline as many cold basic blocks as possible"), + cl::Optional); + static cl::opt PrintClusters("print-clusters", cl::desc("print clusters"), cl::Optional); @@ -1611,34 +1616,26 @@ void BinaryFunction::splitFunction() { assert(BasicBlocksLayout.size() > 0); - // Separate hot from cold - if (!hasEHRanges()) { - for (auto I = BasicBlocksLayout.rbegin(), E = BasicBlocksLayout.rend(); - I != E; ++I) { - BinaryBasicBlock *BB = *I; - if (BB->getExecutionCount() != 0) - break; - BB->IsCold = true; - IsSplit = true; + // Never outline the first basic block. + BasicBlocks.front().CanOutline = false; + for (auto &BB : BasicBlocks) { + if (!BB.CanOutline) + continue; + if (BB.getExecutionCount() != 0) { + BB.CanOutline = false; + continue; } - } else { - // We cannot move a block that can throw since exception-handling - // runtime cannot deal with split functions. However, if we can guarantee - // that the block never throws, it is safe to move the block to - // decrease the size of the function. - // - // We also cannot move landing pads (or rather entry points for landing - // pads) for the same reason. - // - // Never move the first basic block. - BasicBlocks.front().CanOutline = false; - for (auto &BB : BasicBlocks) { - if (!BB.CanOutline) - continue; + if (hasEHRanges()) { + // We cannot move landing pads (or rather entry points for landing + // pads). 
if (LandingPads.find(BB.getLabel()) != LandingPads.end()) { BB.CanOutline = false; continue; } + // We cannot move a block that can throw since exception-handling + // runtime cannot deal with split functions. However, if we can guarantee + // that the block never throws, it is safe to move the block to + // decrease the size of the function. for (auto &Instr : BB) { if (BC.MIA->isInvoke(Instr)) { BB.CanOutline = false; @@ -1646,23 +1643,36 @@ void BinaryFunction::splitFunction() { } } } + } + + if (opts::AgressiveSplitting) { + // All blocks with 0 count that we can move go to the end of the function. std::stable_sort(BasicBlocksLayout.begin(), BasicBlocksLayout.end(), [&] (BinaryBasicBlock *A, BinaryBasicBlock *B) { - if (A->getExecutionCount() != 0 || B->getExecutionCount() != 0) - return false; return A->canOutline() < B->canOutline(); }); + } else if (hasEHRanges()) { + // Typically functions with exception handling have landing pads at the end. + // We cannot move beginning of landing pads, but we can move 0-count blocks + // comprising landing pads to the end and thus facilitating splitting. 
+ auto FirstLP = BasicBlocksLayout.begin(); + while (LandingPads.find((*FirstLP)->getLabel()) != LandingPads.end()) + ++FirstLP; + + std::stable_sort(FirstLP, BasicBlocksLayout.end(), + [&] (BinaryBasicBlock *A, BinaryBasicBlock *B) { + return A->canOutline() < B->canOutline(); + }); + } - for (auto I = BasicBlocksLayout.rbegin(), E = BasicBlocksLayout.rend(); - I != E; ++I) { - BinaryBasicBlock *BB = *I; - if (BB->getExecutionCount() != 0) - break; - if (!BB->canOutline()) - break; - BB->IsCold = true; - IsSplit = true; - } + // Separate hot from cold + for (auto I = BasicBlocksLayout.rbegin(), E = BasicBlocksLayout.rend(); + I != E; ++I) { + BinaryBasicBlock *BB = *I; + if (!BB->canOutline()) + break; + BB->IsCold = true; + IsSplit = true; } } @@ -1673,7 +1683,7 @@ void BinaryFunction::propagateGnuArgsSizeInfo() { return; // The current value of DW_CFA_GNU_args_size affects all following - // invoke instructions untill the next CFI overrides it. + // invoke instructions until the next CFI overrides it. // It is important to iterate basic blocks in the original order when // assigning the value. uint64_t CurrentGnuArgsSize = 0; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 3df5247adb06..721a5598ee59 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -62,8 +62,10 @@ class BinaryFunction : public AddressRangesOwner { /// Settings for splitting function bodies into hot/cold partitions. enum SplittingType : char { ST_NONE = 0, /// Do not split functions - ST_LARGE = 1, /// Only split functions that exceed maximum size - ST_ALL =2, /// Split all functions + ST_EH, /// Split blocks comprising landing pads + ST_LARGE, /// Split functions that exceed maximum size in addition + /// to landing pads. 
+ ST_ALL, /// Split all functions }; /// Choose which strategy should the block layout heuristic prioritize when diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 1650d220aa27..02390eec4457 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -104,9 +104,11 @@ SplitFunctions("split-functions", cl::init(BinaryFunction::ST_NONE), cl::values(clEnumValN(BinaryFunction::ST_NONE, "0", "do not split any function"), - clEnumValN(BinaryFunction::ST_LARGE, "1", - "split if function is too large to fit"), - clEnumValN(BinaryFunction::ST_ALL, "2", + clEnumValN(BinaryFunction::ST_EH, "1", + "split all landing pads"), + clEnumValN(BinaryFunction::ST_LARGE, "2", + "also split if function too large to fit"), + clEnumValN(BinaryFunction::ST_ALL, "3", "split all functions"), clEnumValEnd), cl::Optional); @@ -1017,7 +1019,9 @@ void RewriteInstance::runOptimizationPasses() { if (opts::ReorderBlocks != BinaryFunction::LT_NONE) { bool ShouldSplit = (opts::SplitFunctions == BinaryFunction::ST_ALL) || - LargeFunctions.find(BFI.first) != LargeFunctions.end(); + (opts::SplitFunctions == BinaryFunction::ST_EH && + Function.hasEHRanges()) || + (LargeFunctions.find(BFI.first) != LargeFunctions.end()); BFI.second.modifyLayout(opts::ReorderBlocks, ShouldSplit); if (opts::PrintAll || opts::PrintReordered) Function.print(errs(), "after reordering blocks", true); From f4b48b3a13ddcb57844e108c08257aa9e9990370 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 8 Apr 2016 12:18:06 -0700 Subject: [PATCH 100/904] Tool to merge .fdata files. Summary: merge-fdata tool takes multiple .fdata files and outputs to stdout combined fdata. Takes about 2 seconds per each additional .fdata file with hhvm production data. 
(cherry picked from commit 574323b086be91ff3e777bbedcfffee0a7c793d2) --- bolt/BinaryFunction.h | 2 +- bolt/CMakeLists.txt | 2 + bolt/DataReader.h | 39 +++++++- bolt/LLVMBuild.txt | 3 + bolt/llvm-bolt.cpp | 2 +- bolt/merge-fdata/CMakeLists.txt | 6 ++ bolt/merge-fdata/LLVMBuild.txt | 21 ++++ bolt/merge-fdata/Makefile | 19 ++++ bolt/merge-fdata/merge-fdata.cpp | 167 +++++++++++++++++++++++++++++++ 9 files changed, 258 insertions(+), 3 deletions(-) create mode 100644 bolt/merge-fdata/CMakeLists.txt create mode 100644 bolt/merge-fdata/LLVMBuild.txt create mode 100644 bolt/merge-fdata/Makefile create mode 100644 bolt/merge-fdata/merge-fdata.cpp diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 721a5598ee59..8188ecbe829c 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -501,7 +501,7 @@ class BinaryFunction : public AddressRangesOwner { if (DeriveAlignment) { uint64_t DerivedAlignment = Offset & (1 + ~Offset); - BB->setAlignment(std::min(DerivedAlignment, uint64_t(16))); + BB->setAlignment(std::min(DerivedAlignment, uint64_t(32))); } return BB; diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 3e33a968a599..f506fb154e71 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -1,3 +1,5 @@ +add_subdirectory(merge-fdata) + set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} CodeGen diff --git a/bolt/DataReader.h b/bolt/DataReader.h index 9a780b2020ed..fb0aa6ee7f20 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -33,6 +33,24 @@ struct Location { Location(bool IsSymbol, StringRef Name, uint64_t Offset) : IsSymbol(IsSymbol), Name(Name), Offset(Offset) {} + + bool operator==(const Location &RHS) const { + return IsSymbol == RHS.IsSymbol && + Name == RHS.Name && + Offset == RHS.Offset; + } + + bool operator<(const Location &RHS) const { + if (IsSymbol < RHS.IsSymbol) + return true; + + if (Name < RHS.Name) + return true; + + return IsSymbol == RHS.IsSymbol && + Name == RHS.Name && + Offset < RHS.Offset; + } }; struct 
BranchInfo { @@ -44,6 +62,21 @@ struct BranchInfo { BranchInfo(Location From, Location To, int64_t Mispreds, int64_t Branches) : From(std::move(From)), To(std::move(To)), Mispreds(Mispreds), Branches(Branches) {} + + bool operator==(const BranchInfo &RHS) const { + return From == RHS.From && + To == RHS.To; + } + + bool operator<(const BranchInfo &RHS) const { + if (From < RHS.From) + return true; + + if (From == RHS.From) + return (To < RHS.To); + + return false; + } }; struct FuncBranchData { @@ -96,6 +129,10 @@ class DataReader { ErrorOr getFuncBranchData(StringRef FuncName) const; + using FuncsMapType = StringMap; + + FuncsMapType &getAllFuncsData() { return FuncsMap; } + /// Dumps the entire data structures parsed. Used for debugging. void dump() const; @@ -115,7 +152,7 @@ class DataReader { StringRef ParsingBuf; unsigned Line; unsigned Col; - StringMap FuncsMap; + FuncsMapType FuncsMap; static const char FieldSeparator = ' '; }; diff --git a/bolt/LLVMBuild.txt b/bolt/LLVMBuild.txt index 26a77a1b3eea..5432794ea143 100644 --- a/bolt/LLVMBuild.txt +++ b/bolt/LLVMBuild.txt @@ -15,6 +15,9 @@ ; ;===------------------------------------------------------------------------===; +[common] +subdirectories = merge-fdata + [component_0] type = Tool name = llvm-bolt diff --git a/bolt/llvm-bolt.cpp b/bolt/llvm-bolt.cpp index 1c504ff00e3b..e1a5ad80c693 100644 --- a/bolt/llvm-bolt.cpp +++ b/bolt/llvm-bolt.cpp @@ -72,7 +72,7 @@ int main(int argc, char **argv) { cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion); cl::ParseCommandLineOptions(argc, argv, - "llvm feedback-directed layout optimizer\n"); + "BOLT - Binary Optimization and Layout Tool\n"); ToolName = argv[0]; diff --git a/bolt/merge-fdata/CMakeLists.txt b/bolt/merge-fdata/CMakeLists.txt new file mode 100644 index 000000000000..b36df1fb0577 --- /dev/null +++ b/bolt/merge-fdata/CMakeLists.txt @@ -0,0 +1,6 @@ +set(LLVM_LINK_COMPONENTS Support) + +add_llvm_executable(merge-fdata + merge-fdata.cpp 
+ ../DataReader.cpp +) diff --git a/bolt/merge-fdata/LLVMBuild.txt b/bolt/merge-fdata/LLVMBuild.txt new file mode 100644 index 000000000000..39bc693fb469 --- /dev/null +++ b/bolt/merge-fdata/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./tools/llvm-bolt/merge-fdata/LLVMBuild.txt --------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Tool +name = merge-fdata +parent = llvm-bolt diff --git a/bolt/merge-fdata/Makefile b/bolt/merge-fdata/Makefile new file mode 100644 index 000000000000..41687a01b91b --- /dev/null +++ b/bolt/merge-fdata/Makefile @@ -0,0 +1,19 @@ +##===- tools/lli/Makefile ------------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL := ../../.. 
+TOOLNAME := merge-fdata + +include $(LEVEL)/Makefile.config + +LINK_COMPONENTS := support + +SOURCES := merge-fdata.cpp ../DataReader.cpp + +include $(LLVM_SRC_ROOT)/Makefile.rules diff --git a/bolt/merge-fdata/merge-fdata.cpp b/bolt/merge-fdata/merge-fdata.cpp new file mode 100644 index 000000000000..477dc71fa6f3 --- /dev/null +++ b/bolt/merge-fdata/merge-fdata.cpp @@ -0,0 +1,167 @@ +//===-- merge-fdata.cpp - Tool for merging profile in fdata format --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// merge-fdata 1.fdata 2.fdata 3.fdata > merged.fdata +// +//===----------------------------------------------------------------------===// + +#include "../DataReader.h" +#include "llvm/Object/Binary.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/Signals.h" +#include "llvm/Support/StringPool.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; +using namespace object; +using namespace bolt; + +namespace opts { + +static cl::list +InputDataFilenames(cl::Positional, + cl::CommaSeparated, + cl::desc(" []..."), + cl::OneOrMore); + +} // namespace opts + +static StringRef ToolName; + +static void report_error(StringRef Message, std::error_code EC) { + assert(EC); + errs() << ToolName << ": '" << Message << "': " << EC.message() << ".\n"; + exit(1); +} + +int main(int argc, char **argv) { + // Print a stack trace if we signal out. + sys::PrintStackTraceOnErrorSignal(); + PrettyStackTraceProgram X(argc, argv); + + llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. + + cl::ParseCommandLineOptions(argc, argv, + "merge fdata into a single file"); + + ToolName = argv[0]; + + // All merged data. 
+ DataReader::FuncsMapType MergedFunctionsData; + + // Merged functions data has to replace strings refs with strings from the + // pool. + StringPool MergedStringPool; + + // Temporary storage for all strings so they don't get destroyed. + std::vector AllStrings; + + // Copy branch info replacing string references with internal storage + // references. + auto CopyBranchInfo = [&](const BranchInfo &BI, + std::vector &BIData) { + auto FromNamePtr = MergedStringPool.intern(BI.From.Name); + auto ToNamePtr = MergedStringPool.intern(BI.To.Name); + BIData.emplace_back(BranchInfo(Location(BI.From.IsSymbol, + *FromNamePtr, + BI.From.Offset), + Location(BI.To.IsSymbol, + *ToNamePtr, + BI.To.Offset), + BI.Mispreds, + BI.Branches)); + AllStrings.emplace_back(FromNamePtr); // keep the reference + AllStrings.emplace_back(ToNamePtr); // keep the reference + }; + + for (auto &InputDataFilename : opts::InputDataFilenames) { + if (!sys::fs::exists(InputDataFilename)) + report_error(InputDataFilename, errc::no_such_file_or_directory); + + errs() << "Merging data from " << InputDataFilename << "...\n"; + + // Attempt to read input bolt data + auto ReaderOrErr = + bolt::DataReader::readPerfData(InputDataFilename, errs()); + if (std::error_code EC = ReaderOrErr.getError()) + report_error(InputDataFilename, EC); + + for(auto &FI : ReaderOrErr.get()->getAllFuncsData()) { + auto MI = MergedFunctionsData.find(FI.second.Name); + if (MI != MergedFunctionsData.end()) { + std::vector TmpBI; + for (auto &BI : FI.second.Data) { + // Find and merge a corresponding entry or copy data. + auto TI = std::lower_bound(MI->second.Data.begin(), + MI->second.Data.end(), + BI); + if (TI != MI->second.Data.end() && *TI == BI) { + TI->Branches += BI.Branches; + TI->Mispreds += BI.Mispreds; + } else { + CopyBranchInfo(BI, TmpBI); + } + } + // Merge in the temp vector making sure it doesn't contain duplicates. 
+ std::sort(TmpBI.begin(), TmpBI.end()); + BranchInfo *PrevBI = nullptr; + for (auto &BI : TmpBI) { + if (PrevBI && *PrevBI == BI) { + PrevBI->Branches += BI.Branches; + PrevBI->Mispreds += BI.Mispreds; + } else { + MI->second.Data.emplace_back(BI); + PrevBI = &MI->second.Data.back(); + } + } + std::sort(MI->second.Data.begin(), MI->second.Data.end()); + } else { + auto NamePtr = MergedStringPool.intern(FI.second.Name); + AllStrings.emplace_back(NamePtr); // keep the ref + bool Success; + std::tie(MI, Success) = MergedFunctionsData.insert( + std::make_pair(*NamePtr, + FuncBranchData(*NamePtr, + FuncBranchData::ContainerTy()))); + // Copy with string conversion while eliminating duplicates. + std::sort(FI.second.Data.begin(), FI.second.Data.end()); + BranchInfo *PrevBI = nullptr; + for (auto &BI : FI.second.Data) { + if (PrevBI && *PrevBI == BI) { + PrevBI->Branches += BI.Branches; + PrevBI->Mispreds += BI.Mispreds; + } else { + CopyBranchInfo(BI, MI->second.Data); + PrevBI = &MI->second.Data.back(); + } + } + } + } + } + + // Print all the data in the original format + for (auto &FDI : MergedFunctionsData) { + for (auto &BD : FDI.second.Data) { + outs() << BD.From.IsSymbol << " " << FDI.first() << " " + << Twine::utohexstr(BD.From.Offset) << " " + << BD.To.IsSymbol << " " << BD.To.Name << " " + << Twine::utohexstr(BD.To.Offset) << " " + << BD.Mispreds << " " << BD.Branches << '\n'; + } + } + + errs() << "All data merged successfully.\n"; + + AllStrings.clear(); + + return EXIT_SUCCESS; +} From 8e39efcf6b7a72786a199dbfcffead22cb63472c Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 25 Apr 2016 22:13:12 -0700 Subject: [PATCH 101/904] Fix ninja install-* for BOLT utilities. Summary: Make sure we can install all tools needed for processing BOLT .fdata files such as perf2bolt, merge-fdata, etc. 
(cherry picked from commit 3b9a809190437b515ae4ae5593167f71e734c8b6) --- bolt/merge-fdata/CMakeLists.txt | 2 +- bolt/merge-fdata/merge-fdata.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/merge-fdata/CMakeLists.txt b/bolt/merge-fdata/CMakeLists.txt index b36df1fb0577..9adf4a1248a1 100644 --- a/bolt/merge-fdata/CMakeLists.txt +++ b/bolt/merge-fdata/CMakeLists.txt @@ -1,6 +1,6 @@ set(LLVM_LINK_COMPONENTS Support) -add_llvm_executable(merge-fdata +add_llvm_tool(merge-fdata merge-fdata.cpp ../DataReader.cpp ) diff --git a/bolt/merge-fdata/merge-fdata.cpp b/bolt/merge-fdata/merge-fdata.cpp index 477dc71fa6f3..cab715648faf 100644 --- a/bolt/merge-fdata/merge-fdata.cpp +++ b/bolt/merge-fdata/merge-fdata.cpp @@ -95,7 +95,7 @@ int main(int argc, char **argv) { if (std::error_code EC = ReaderOrErr.getError()) report_error(InputDataFilename, EC); - for(auto &FI : ReaderOrErr.get()->getAllFuncsData()) { + for (auto &FI : ReaderOrErr.get()->getAllFuncsData()) { auto MI = MergedFunctionsData.find(FI.second.Name); if (MI != MergedFunctionsData.end()) { std::vector TmpBI; From 5a7e60e3dfbc51f24b965410cfc357a49940d90b Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 21 Apr 2016 09:54:33 -0700 Subject: [PATCH 102/904] Option to break in given functions. Summary: Added option "-break-funcs=func1,func2,...." to coredump in any given function by introducing ud2 sequence at the beginning of the function. Useful for debugging and validating stack traces. Also renamed options containing "_" to use "-" instead. Also run hhvm test with "-update-debug-sections". 
(cherry picked from commit 5231aa7cf775c1799167c0e57096a804ea994b01) --- bolt/RewriteInstance.cpp | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 02390eec4457..8ec2e402af1c 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -68,6 +68,13 @@ namespace opts { static cl::opt OutputFilename("o", cl::desc(""), cl::Required); +static cl::list +BreakFunctionNames("break-funcs", + cl::CommaSeparated, + cl::desc("list of functions to core dump on (debugging)"), + cl::value_desc("func1,func2,func3,..."), + cl::Hidden); + static cl::list FunctionNames("funcs", cl::CommaSeparated, @@ -75,21 +82,21 @@ FunctionNames("funcs", cl::value_desc("func1,func2,func3,...")); static cl::opt -FunctionNamesFile("funcs_file", +FunctionNamesFile("funcs-file", cl::desc("file with list of functions to optimize")); static cl::list -SkipFunctionNames("skip_funcs", +SkipFunctionNames("skip-funcs", cl::CommaSeparated, cl::desc("list of functions to skip"), cl::value_desc("func1,func2,func3,...")); static cl::opt -SkipFunctionNamesFile("skip_funcs_file", +SkipFunctionNamesFile("skip-funcs-file", cl::desc("file with list of functions to skip")); static cl::opt -MaxFunctions("max_funcs", +MaxFunctions("max-funcs", cl::desc("maximum # of functions to overwrite"), cl::Optional); @@ -1171,6 +1178,16 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, assert(!Function.begin()->isCold() && "first basic block should never be cold"); + // Emit UD2 at the beginning if requested by user. + if (!opts::BreakFunctionNames.empty()) { + for (auto &Name : opts::BreakFunctionNames) { + if (Function.getName() == Name) { + Streamer.EmitIntValue(0x0B0F, 2); // UD2: 0F 0B + break; + } + } + } + // Emit code. 
int64_t CurrentGnuArgsSize = 0; for (auto BB : Function.layout()) { From 0e74981854b772ff972f6535913495b490599e61 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 26 Apr 2016 23:42:39 -0700 Subject: [PATCH 103/904] Fix for functions in different segments. Summary: In a test binary some functions are placed in a segment preceding the segment containing .text section. As a result, we were miscalculating maximum function size as the calculation was based on addresses only. This diff fixes the calculation by checking if symbol after function belongs to the same section. If it does not, then we set the maximum function size based on the size of the containing section and not on the address distance to the next symbol. (cherry picked from commit 91f9061b0e30f90f6a4666dcf03ff844a5851dd5) --- bolt/RewriteInstance.cpp | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 8ec2e402af1c..0114b489356f 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -853,7 +853,25 @@ void RewriteInstance::disassembleFunctions() { // has been processed. auto SymRefI = FileSymRefs.upper_bound(Function.getAddress()); if (SymRefI != FileSymRefs.end()) { - auto MaxSize = SymRefI->first - Function.getAddress(); + uint64_t MaxSize; + auto SectionIter = *SymRefI->second.getSection(); + if (SectionIter != InputFile->section_end() && + *SectionIter == Function.getSection()) { + MaxSize = SymRefI->first - Function.getAddress(); + } else { + // Function runs till the end of the containing section assuming + // the section does not run over the next symbol. 
+ uint64_t SectionEnd = Function.getSection().getAddress() + + Function.getSection().getSize(); + if (SectionEnd > SymRefI->first) { + errs() << "BOLT-WARNING: symbol after " << Function.getName() + << " should not be in the same section.\n"; + MaxSize = 0; + } else { + MaxSize = SectionEnd - Function.getAddress(); + } + } + if (MaxSize < Function.getSize()) { errs() << "BOLT-WARNING: symbol seen in the middle of the function " << Function.getName() << ". Skipping.\n"; From baae7a1323bdffca21ff26a54c665dcebd76fcd2 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 27 Apr 2016 18:06:18 -0700 Subject: [PATCH 104/904] Make merge-fdata generate smaller .fdata files. Summary: A lot of the space in the merged .fdata is taken by branches to and from [heap], which is jitted code. On different machines, or during different runs, jitted addresses are all different. We don't use these addresses, but we need branch info to get accurate function call counts. This diff treats all [heap] addresses the same, resulting in a simplified merged file. The size of the compressed file decreased from 70MB to 8MB. 
(cherry picked from commit 302fe17febffbf82f1045fd0918f7ec8dac059ef) --- bolt/DataReader.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bolt/DataReader.h b/bolt/DataReader.h index fb0aa6ee7f20..b5e570779ea7 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -37,7 +37,7 @@ struct Location { bool operator==(const Location &RHS) const { return IsSymbol == RHS.IsSymbol && Name == RHS.Name && - Offset == RHS.Offset; + (Name == "[heap]" || Offset == RHS.Offset); } bool operator<(const Location &RHS) const { @@ -49,6 +49,7 @@ struct Location { return IsSymbol == RHS.IsSymbol && Name == RHS.Name && + Name != "[heap]" && Offset < RHS.Offset; } }; From 259fcf38f6b91eb65271ba61dd8f2cdc5072a30a Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Thu, 28 Apr 2016 12:55:35 -0700 Subject: [PATCH 105/904] Fix "Cannot update ranges for DIE at offset" error messages. Summary: Fix the error message by not printing it :) Explanation: a previous diff accidentally removed this error message from within the DEBUG macro, and it's expected that we'll have a bunch of them since a lot of the DIEs we try to update are empty or meaningless. For instance (and mainly), there is a huge number of lexical block DIEs with no attributes in .debug_info. In the first phase of collecting debugging info, we store the offsets of all these DIEs, only later to realize that we cannot update their address ranges because they have none. A better fix would be to check this earlier and not store offsets of DIEs we cannot update to begin with. 
(cherry picked from commit d06f7f5817b4c0accfe2f6409a8a816ca1ee27b8) --- bolt/RewriteInstance.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 0114b489356f..88ec95ca9cb8 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2367,8 +2367,8 @@ void RewriteInstance::updateDWARFObjectAddressRanges( DebugInfoPatcher->addBinaryPatch(LowPCOffset + 4, ProducerString); } else { - errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x" - << Twine::utohexstr(DIE->getOffset()) << "\n"; + DEBUG(errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x" + << Twine::utohexstr(DIE->getOffset()) << "\n"); } } } From 2bad085673c6bb145b3d724b59d659e58b238fbb Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Fri, 15 Apr 2016 15:59:52 -0700 Subject: [PATCH 106/904] Optimize calls to functions that are a single unconditional jump Summary: Many functions (around 600) in the HHVM binary are simply a single unconditional jump instruction to another function. These can be trivially optimized by modifying the call sites to directly call the branch target instead (because it also happens with more than one jump in sequence, we do it iteratively). This diff also adds a very simple analysis/optimization pass system in which this pass is the first one to be implemented. A follow-up to this could be to move the current optimizations to other passes. 
(cherry picked from commit a91d2e2d0fca3874a9baa5021d7e393be734f22b) --- bolt/BinaryPassManager.cpp | 30 ++++++++++ bolt/BinaryPassManager.h | 92 ++++++++++++++++++++++++++++ bolt/BinaryPasses.cpp | 120 +++++++++++++++++++++++++++++++++++++ bolt/CMakeLists.txt | 2 + bolt/RewriteInstance.cpp | 2 + 5 files changed, 246 insertions(+) create mode 100644 bolt/BinaryPassManager.cpp create mode 100644 bolt/BinaryPassManager.h create mode 100644 bolt/BinaryPasses.cpp diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp new file mode 100644 index 000000000000..3d4278925dbf --- /dev/null +++ b/bolt/BinaryPassManager.cpp @@ -0,0 +1,30 @@ +//===--- BinaryPassManager.cpp - Binary-level analysis/optimization passes ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "BinaryPassManager.h" + +namespace llvm { +namespace bolt { + +std::unique_ptr +BinaryFunctionPassManager::GlobalPassManager; + +void BinaryFunctionPassManager::runAllPasses( + BinaryContext &BC, + std::map &Functions) { + auto &Manager = getGlobalPassManager(); + Manager.BC = &BC; + Manager.BFs = &Functions; + Manager.runPasses(); +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/BinaryPassManager.h b/bolt/BinaryPassManager.h new file mode 100644 index 000000000000..697772a602a2 --- /dev/null +++ b/bolt/BinaryPassManager.h @@ -0,0 +1,92 @@ +//===--- BinaryPassManager.h - Binary-level analysis/optimization passes --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// A very simple binary-level analysis/optimization passes system. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_PASS_MANAGER_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_PASS_MANAGER_H + +#include "BinaryFunction.h" +#include "llvm/Support/Options.h" +#include "llvm/Support/CommandLine.h" +#include +#include +#include + +namespace llvm { +namespace bolt { + +/// An optimization/analysis pass that runs on functions. +class BinaryFunctionPass { +public: + virtual ~BinaryFunctionPass() = default; + virtual void runOnFunctions(BinaryContext &BC, + std::map &BFs) = 0; +}; + +/// Simple class for managing analyses and optimizations on BinaryFunctions. +class BinaryFunctionPassManager { +private: + BinaryContext *BC; + std::map *BFs; + std::vector *, + std::unique_ptr>> Passes; + + /// Manager that contains all implemented passes. + static std::unique_ptr GlobalPassManager; + +public: + BinaryFunctionPassManager(BinaryContext *BC = nullptr, + std::map *BFs = nullptr) + : BC(BC), BFs(BFs) {} + + static BinaryFunctionPassManager &getGlobalPassManager() { + if (!GlobalPassManager) { + GlobalPassManager = llvm::make_unique(); + } + return *GlobalPassManager.get(); + } + + /// Adds a pass to this manager based on the value of its corresponding + /// command-line option. + void registerPass(std::unique_ptr Pass, + const cl::opt *Opt) { + Passes.emplace_back(Opt, std::move(Pass)); + } + + /// Run all registered passes in the order they were added. + void runPasses() { + for (const auto &OptPassPair : Passes) { + if (*OptPassPair.first) { + OptPassPair.second->runOnFunctions(*BC, *BFs); + } + } + } + + /// Runs all enabled implemented passes on all functions. 
+ static void runAllPasses(BinaryContext &BC, + std::map &Functions); + +}; + +template *Opt> +class RegisterBinaryPass { +public: + RegisterBinaryPass() { + BinaryFunctionPassManager::getGlobalPassManager().registerPass( + std::move(llvm::make_unique()), Opt); + } +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp new file mode 100644 index 000000000000..5299d2d6d604 --- /dev/null +++ b/bolt/BinaryPasses.cpp @@ -0,0 +1,120 @@ +//===--- BinaryPasses.cpp - Binary-level analysis/optimization passes -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "BinaryPassManager.h" + +#define DEBUG_TYPE "bolt" + +namespace llvm { +namespace bolt { + +/// Detects functions that simply do a tail call when they are called and +/// optimizes calls to these functions. +class OptimizeBodylessFunctions : public BinaryFunctionPass { +private: + /// EquivalentCallTarget[F] = G ==> function F is simply a tail call to G, + /// thus calls to F can be optimized to calls to G. 
+ std::map EquivalentCallTarget; + + void analyze(BinaryFunction &BF, + BinaryContext &BC, + std::map &BFs) { + if (BF.size() != 1 || BF.begin()->size() == 0) + return; + + auto &BB = *BF.begin(); + const auto &FirstInst = *BB.begin(); + + if (!BC.MIA->isTailCall(FirstInst)) + return; + + auto &Op1 = FirstInst.getOperand(0); + if (!Op1.isExpr()) + return; + + if (auto Expr = dyn_cast(Op1.getExpr())) { + auto AddressIt = BC.GlobalSymbols.find(Expr->getSymbol().getName()); + if (AddressIt != BC.GlobalSymbols.end()) { + auto CalleeIt = BFs.find(AddressIt->second); + if (CalleeIt != BFs.end()) { + assert(Expr->getSymbol().getName() == CalleeIt->second.getName()); + EquivalentCallTarget[BF.getName()] = &CalleeIt->second; + } + } + } + } + + void optimizeCalls(BinaryFunction &BF, + BinaryContext &BC) { + for (auto BBIt = BF.begin(), BBEnd = BF.end(); BBIt != BBEnd; ++BBIt) { + for (auto InstIt = BBIt->begin(), InstEnd = BBIt->end(); + InstIt != InstEnd; ++InstIt) { + auto &Inst = *InstIt; + if (BC.MIA->isCall(Inst)) { + auto &Op1 = Inst.getOperand(0); + if (Op1.isExpr()) { + if (auto Expr = dyn_cast(Op1.getExpr())) { + auto OriginalTarget = Expr->getSymbol().getName(); + auto Target = OriginalTarget; + // Iteratively update target since we could have f1() calling f2() + // calling f3() calling f4() and we want to output f1() directly + // calling f4(). 
+ while (EquivalentCallTarget.count(Target)) { + Target = EquivalentCallTarget.find(Target)->second->getName(); + } + if (Target != OriginalTarget) { + DEBUG(errs() << "BOLT-DEBUG: Optimizing " << BF.getName() + << ": replacing call to " + << OriginalTarget + << " by call to " << Target << "\n"); + Inst.clear(); + Inst.addOperand(MCOperand::createExpr( + MCSymbolRefExpr::create( + BC.Ctx->getOrCreateSymbol(Target), *BC.Ctx))); + } + } + } + } + } + } + } + +public: + void runOnFunctions(BinaryContext &BC, + std::map &BFs) override { + for (auto &It : BFs) { + analyze(It.second, BC, BFs); + } + for (auto &It : BFs) { + optimizeCalls(It.second, BC); + } + } +}; + +namespace opts { + +static llvm::cl::opt +OptimizeBodylessFunctions( + "optimize-bodyless-functions", + llvm::cl::desc("optimize functions that just do a tail call"), + llvm::cl::Optional); + +} // namespace opts + +namespace { + +RegisterBinaryPass +RegisterOptimizeBodylessFunctions; + +} // namespace + +} // namespace bolt +} // namespace llvm diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index f506fb154e71..1adf2aaf1e59 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -18,6 +18,8 @@ add_llvm_tool(llvm-bolt BinaryBasicBlock.cpp BinaryContext.cpp BinaryFunction.cpp + BinaryPasses.cpp + BinaryPassManager.cpp DataReader.cpp DebugData.cpp Exceptions.cpp diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 88ec95ca9cb8..1923a0650c26 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -13,6 +13,7 @@ #include "BinaryBasicBlock.h" #include "BinaryContext.h" #include "BinaryFunction.h" +#include "BinaryPassManager.h" #include "DataReader.h" #include "Exceptions.h" #include "RewriteInstance.h" @@ -1053,6 +1054,7 @@ void RewriteInstance::runOptimizationPasses() { } // Post-processing passes. + BinaryFunctionPassManager::runAllPasses(*BC, BinaryFunctions); // Fix the CFI state. 
if (!Function.fixCFIState()) { From 025e72f8f4fab32a54622483b8d256d3952c8e03 Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Fri, 15 Apr 2016 15:59:52 -0700 Subject: [PATCH 107/904] Optimize calls to functions that are a single unconditional jump Summary: Many functions (around 600) in the HHVM binary are simply a single unconditional jump instruction to another function. These can be trivially optimized by modifying the call sites to directly call the branch target instead (because it also happens with more than one jump in sequence, we do it iteratively). This diff also adds a very simple analysis/optimization pass system in which this pass is the first one to be implemented. A follow-up to this could be to move the current optimizations to other passes. (cherry picked from commit 974cc3684358bbb6430f26de1526329e57f1d8d7) --- bolt/BinaryPass.cpp | 124 +++++++++++++++++++++++++++++++++++++++ bolt/BinaryPass.h | 71 ++++++++++++++++++++++ bolt/RewriteInstance.cpp | 2 + 3 files changed, 197 insertions(+) create mode 100644 bolt/BinaryPass.cpp create mode 100644 bolt/BinaryPass.h diff --git a/bolt/BinaryPass.cpp b/bolt/BinaryPass.cpp new file mode 100644 index 000000000000..fca85a4a6d8b --- /dev/null +++ b/bolt/BinaryPass.cpp @@ -0,0 +1,124 @@ +//===--- BinaryPass.h - Binary-level analysis/optimization passes ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "BinaryPass.h" + +#define DEBUG_TYPE "bolt" + +namespace opts { + +static llvm::cl::opt +OptimizeBodylessFunctions( + "optimize-bodyless-functions", + llvm::cl::desc("optimize functions that just do a tail call"), + llvm::cl::Optional); + +} // namespace opts + +namespace llvm { +namespace bolt { + +/// Detects functions that simply do a tail call when they are called and +/// optimizes calls to these functions. +class OptimizeBodylessFunctions : public BinaryFunctionPass { +private: + /// Maps function name to BinaryFunction. + std::map FunctionByName; + + /// EquivalentCallTarget[F] = G ==> function F is simply a tail call to G, + /// thus calls to F can be optimized to calls to G. + std::map EquivalentCallTarget; + + void analyze(BinaryFunction &BF, + BinaryContext &BC) { + if (BF.size() != 1 || BF.begin()->size() == 0) + return; + + auto &BB = *BF.begin(); + const auto &FirstInst = *BB.begin(); + + if (!BC.MIA->isTailCall(FirstInst)) + return; + + auto &Op1 = FirstInst.getOperand(0); + if (!Op1.isExpr()) + return; + + if (auto Expr = dyn_cast(Op1.getExpr())) { + auto CalleeIt = FunctionByName.find(Expr->getSymbol().getName()); + if (CalleeIt != FunctionByName.end()) { + assert(Expr->getSymbol().getName() == CalleeIt->second->getName()); + EquivalentCallTarget[BF.getName()] = CalleeIt->second; + } + } + } + + void optimizeCalls(BinaryFunction &BF, + BinaryContext &BC) { + for (auto BBIt = BF.begin(), BBEnd = BF.end(); BBIt != BBEnd; ++BBIt) { + for (auto InstIt = BBIt->begin(), InstEnd = BBIt->end(); + InstIt != InstEnd; ++InstIt) { + auto &Inst = *InstIt; + if (BC.MIA->isCall(Inst)) { + auto &Op1 = Inst.getOperand(0); + if (Op1.isExpr()) { + if (auto Expr = dyn_cast(Op1.getExpr())) { + auto OriginalTarget = Expr->getSymbol().getName(); + errs() << "BOLT-OPT: " << BF.getName() << " 
calls " << OriginalTarget << "\n"; + auto Target = OriginalTarget; + while (EquivalentCallTarget.count(Target)) { + Target = EquivalentCallTarget.find(Target)->second->getName(); + } + if (Target != OriginalTarget) { + DEBUG(errs() << "BOLT-DEBUG: Optimizing " << BF.getName() + << ": replacing call to " + << OriginalTarget + << " by call to " << Target << "\n"); + Inst.clear(); + Inst.addOperand(MCOperand::createExpr( + MCSymbolRefExpr::create( + BC.Ctx->getOrCreateSymbol(Target), *BC.Ctx))); + } + } + } + } + } + } + } + +public: + void runOnFunctions(BinaryContext &BC, + std::map &BFs) override { + for (auto &It : BFs) { + FunctionByName[It.second.getName()] = &It.second; + } + for (auto &It : BFs) { + analyze(It.second, BC); + } + for (auto &It : BFs) { + optimizeCalls(It.second, BC); + } + } +}; + +void BinaryFunctionPassManager::runAllPasses( + BinaryContext &BC, + std::map &Functions) { + BinaryFunctionPassManager PassManager(BC, Functions); + + PassManager.registerPass(make_unique(), + opts::OptimizeBodylessFunctions); + + PassManager.runPasses(); +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/BinaryPass.h b/bolt/BinaryPass.h new file mode 100644 index 000000000000..9e36def518db --- /dev/null +++ b/bolt/BinaryPass.h @@ -0,0 +1,71 @@ +//===--- BinaryPass.h - Binary-level analysis/optimization passes ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// A very simple binary-level analysis/optimization passes system. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_PASS_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_PASS_H + +#include "BinaryFunction.h" +#include "llvm/Support/Options.h" +#include "llvm/Support/CommandLine.h" +#include +#include +#include + +namespace llvm { +namespace bolt { + +/// An optimization/analysis pass that runs on functions. +class BinaryFunctionPass { +public: + virtual ~BinaryFunctionPass() = default; + virtual void runOnFunctions(BinaryContext &BC, + std::map &BFs) = 0; +}; + +/// Simple class for managing analyses and optimizations on BinaryFunctions. +class BinaryFunctionPassManager { +protected: + BinaryContext &BC; + std::map &BFs; + std::vector> Passes; + +public: + BinaryFunctionPassManager(BinaryContext &BC, + std::map &BFs) + : BC(BC), BFs(BFs) {} + + /// Adds a pass to this manager based on the value of its corresponding + /// command-line option. + void registerPass(std::unique_ptr Pass, + const cl::opt &Opt) { + if (Opt) { + Passes.emplace_back(std::move(Pass)); + } + } + + /// Run all registered passes in the order they were added. + void runPasses() { + for (const auto &Pass : Passes) { + Pass->runOnFunctions(BC, BFs); + } + } + + /// Runs all enabled implemented passes on all functions. 
+ static void runAllPasses(BinaryContext &BC, + std::map &Functions); +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 1923a0650c26..486b84ed958b 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1069,6 +1069,8 @@ void RewriteInstance::runOptimizationPasses() { if (opts::PrintAll || opts::PrintEHRanges) Function.print(errs(), "after updating EH ranges", true); } + + BinaryFunctionPassManager::runAllPasses(*BC, BinaryFunctions); } namespace { From 428d2d1656eb73053fdf735ea3779c4f53f22c05 Mon Sep 17 00:00:00 2001 From: Gabriel Poesia Date: Mon, 25 Apr 2016 14:25:58 -0700 Subject: [PATCH 108/904] Inlining of small functions. Summary: Added an optimization pass of inlining calls to small functions (with only one basic block). Inlining is done in a very simple way, inserting instructions to simulate the changes to the stack pointer that call/ret would make before/after the inlined function executes. Also, the heuristic prefers to inline calls that happen in the hottest blocks (by looking at their execution count). Calls in cold blocks are ignored. 
(cherry picked from commit 42752c63f9ff80dd1fa8d60ad82ad8a1136864bd) --- bolt/BinaryBasicBlock.h | 12 +- bolt/BinaryFunction.h | 14 ++ bolt/BinaryPass.cpp | 124 ---------------- bolt/BinaryPass.h | 71 --------- bolt/BinaryPassManager.cpp | 33 ++++- bolt/BinaryPassManager.h | 44 ++---- bolt/BinaryPasses.cpp | 294 +++++++++++++++++++++++++++---------- bolt/BinaryPasses.h | 95 ++++++++++++ bolt/RewriteInstance.cpp | 1 - 9 files changed, 371 insertions(+), 317 deletions(-) delete mode 100644 bolt/BinaryPass.cpp delete mode 100644 bolt/BinaryPass.h create mode 100644 bolt/BinaryPasses.h diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 4eb742cb6a2e..0ba3a8630d08 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -260,12 +260,22 @@ class BinaryBasicBlock { } bool eraseInstruction(MCInst *Inst) { + return replaceInstruction(Inst, std::vector()); + } + + /// Replace an instruction with a sequence of instructions. Returns true + /// if the instruction to be replaced was found and replaced. + bool replaceInstruction(MCInst *Inst, + const std::vector &Replacement) { auto I = Instructions.end(); auto B = Instructions.begin(); while (I > B) { --I; if (&*I == Inst) { - Instructions.erase(I); + Instructions.insert( + Instructions.erase(I), + Replacement.begin(), + Replacement.end()); return true; } } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 8188ecbe829c..2f57c86a12e0 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -793,6 +793,20 @@ class BinaryFunction : public AddressRangesOwner { /// Returns the size of the basic block in the original binary. size_t getBasicBlockOriginalSize(const BinaryBasicBlock *BB) const; + /// Returns an estimate of the function's hot part after splitting. + /// This is a very rough estimate, as with C++ exceptions there are + /// blocks we don't move, and it makes no attempt at estimating the size + /// of the added/removed branch instructions. 
+ size_t estimateHotSize() const { + size_t Estimate = 0; + for (const auto *BB : BasicBlocksLayout) { + if (BB->ExecutionCount != 0) { + Estimate += getBasicBlockOriginalSize(BB); + } + } + return Estimate; + } + virtual ~BinaryFunction() {} /// Info for fragmented functions. diff --git a/bolt/BinaryPass.cpp b/bolt/BinaryPass.cpp deleted file mode 100644 index fca85a4a6d8b..000000000000 --- a/bolt/BinaryPass.cpp +++ /dev/null @@ -1,124 +0,0 @@ -//===--- BinaryPass.h - Binary-level analysis/optimization passes ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#include "BinaryPass.h" - -#define DEBUG_TYPE "bolt" - -namespace opts { - -static llvm::cl::opt -OptimizeBodylessFunctions( - "optimize-bodyless-functions", - llvm::cl::desc("optimize functions that just do a tail call"), - llvm::cl::Optional); - -} // namespace opts - -namespace llvm { -namespace bolt { - -/// Detects functions that simply do a tail call when they are called and -/// optimizes calls to these functions. -class OptimizeBodylessFunctions : public BinaryFunctionPass { -private: - /// Maps function name to BinaryFunction. - std::map FunctionByName; - - /// EquivalentCallTarget[F] = G ==> function F is simply a tail call to G, - /// thus calls to F can be optimized to calls to G. 
- std::map EquivalentCallTarget; - - void analyze(BinaryFunction &BF, - BinaryContext &BC) { - if (BF.size() != 1 || BF.begin()->size() == 0) - return; - - auto &BB = *BF.begin(); - const auto &FirstInst = *BB.begin(); - - if (!BC.MIA->isTailCall(FirstInst)) - return; - - auto &Op1 = FirstInst.getOperand(0); - if (!Op1.isExpr()) - return; - - if (auto Expr = dyn_cast(Op1.getExpr())) { - auto CalleeIt = FunctionByName.find(Expr->getSymbol().getName()); - if (CalleeIt != FunctionByName.end()) { - assert(Expr->getSymbol().getName() == CalleeIt->second->getName()); - EquivalentCallTarget[BF.getName()] = CalleeIt->second; - } - } - } - - void optimizeCalls(BinaryFunction &BF, - BinaryContext &BC) { - for (auto BBIt = BF.begin(), BBEnd = BF.end(); BBIt != BBEnd; ++BBIt) { - for (auto InstIt = BBIt->begin(), InstEnd = BBIt->end(); - InstIt != InstEnd; ++InstIt) { - auto &Inst = *InstIt; - if (BC.MIA->isCall(Inst)) { - auto &Op1 = Inst.getOperand(0); - if (Op1.isExpr()) { - if (auto Expr = dyn_cast(Op1.getExpr())) { - auto OriginalTarget = Expr->getSymbol().getName(); - errs() << "BOLT-OPT: " << BF.getName() << " calls " << OriginalTarget << "\n"; - auto Target = OriginalTarget; - while (EquivalentCallTarget.count(Target)) { - Target = EquivalentCallTarget.find(Target)->second->getName(); - } - if (Target != OriginalTarget) { - DEBUG(errs() << "BOLT-DEBUG: Optimizing " << BF.getName() - << ": replacing call to " - << OriginalTarget - << " by call to " << Target << "\n"); - Inst.clear(); - Inst.addOperand(MCOperand::createExpr( - MCSymbolRefExpr::create( - BC.Ctx->getOrCreateSymbol(Target), *BC.Ctx))); - } - } - } - } - } - } - } - -public: - void runOnFunctions(BinaryContext &BC, - std::map &BFs) override { - for (auto &It : BFs) { - FunctionByName[It.second.getName()] = &It.second; - } - for (auto &It : BFs) { - analyze(It.second, BC); - } - for (auto &It : BFs) { - optimizeCalls(It.second, BC); - } - } -}; - -void BinaryFunctionPassManager::runAllPasses( - BinaryContext 
&BC, - std::map &Functions) { - BinaryFunctionPassManager PassManager(BC, Functions); - - PassManager.registerPass(make_unique(), - opts::OptimizeBodylessFunctions); - - PassManager.runPasses(); -} - -} // namespace bolt -} // namespace llvm diff --git a/bolt/BinaryPass.h b/bolt/BinaryPass.h deleted file mode 100644 index 9e36def518db..000000000000 --- a/bolt/BinaryPass.h +++ /dev/null @@ -1,71 +0,0 @@ -//===--- BinaryPass.h - Binary-level analysis/optimization passes ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// A very simple binary-level analysis/optimization passes system. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_PASS_H -#define LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_PASS_H - -#include "BinaryFunction.h" -#include "llvm/Support/Options.h" -#include "llvm/Support/CommandLine.h" -#include -#include -#include - -namespace llvm { -namespace bolt { - -/// An optimization/analysis pass that runs on functions. -class BinaryFunctionPass { -public: - virtual ~BinaryFunctionPass() = default; - virtual void runOnFunctions(BinaryContext &BC, - std::map &BFs) = 0; -}; - -/// Simple class for managing analyses and optimizations on BinaryFunctions. -class BinaryFunctionPassManager { -protected: - BinaryContext &BC; - std::map &BFs; - std::vector> Passes; - -public: - BinaryFunctionPassManager(BinaryContext &BC, - std::map &BFs) - : BC(BC), BFs(BFs) {} - - /// Adds a pass to this manager based on the value of its corresponding - /// command-line option. - void registerPass(std::unique_ptr Pass, - const cl::opt &Opt) { - if (Opt) { - Passes.emplace_back(std::move(Pass)); - } - } - - /// Run all registered passes in the order they were added. 
- void runPasses() { - for (const auto &Pass : Passes) { - Pass->runOnFunctions(BC, BFs); - } - } - - /// Runs all enabled implemented passes on all functions. - static void runAllPasses(BinaryContext &BC, - std::map &Functions); -}; - -} // namespace bolt -} // namespace llvm - -#endif diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 3d4278925dbf..fdc5856d2af4 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -11,18 +11,39 @@ #include "BinaryPassManager.h" +namespace opts { + +static llvm::cl::opt +OptimizeBodylessFunctions( + "optimize-bodyless-functions", + llvm::cl::desc("optimize functions that just do a tail call"), + llvm::cl::Optional); + +static llvm::cl::opt +InlineSmallFunctions( + "inline-small-functions", + llvm::cl::desc("inline functions with a single basic block"), + llvm::cl::Optional); + +} // namespace opts + namespace llvm { namespace bolt { -std::unique_ptr -BinaryFunctionPassManager::GlobalPassManager; - void BinaryFunctionPassManager::runAllPasses( BinaryContext &BC, std::map &Functions) { - auto &Manager = getGlobalPassManager(); - Manager.BC = &BC; - Manager.BFs = &Functions; + BinaryFunctionPassManager Manager(BC, Functions); + + // Here we manage dependencies/order manually, since passes are ran in the + // order they're registered. 
+ + Manager.registerPass(llvm::make_unique(), + opts::OptimizeBodylessFunctions); + + Manager.registerPass(llvm::make_unique(), + opts::InlineSmallFunctions); + Manager.runPasses(); } diff --git a/bolt/BinaryPassManager.h b/bolt/BinaryPassManager.h index 697772a602a2..0e9b5915377d 100644 --- a/bolt/BinaryPassManager.h +++ b/bolt/BinaryPassManager.h @@ -15,6 +15,7 @@ #define LLVM_TOOLS_LLVM_BOLT_BINARY_FUNCTION_PASS_MANAGER_H #include "BinaryFunction.h" +#include "BinaryPasses.h" #include "llvm/Support/Options.h" #include "llvm/Support/CommandLine.h" #include @@ -24,49 +25,31 @@ namespace llvm { namespace bolt { -/// An optimization/analysis pass that runs on functions. -class BinaryFunctionPass { -public: - virtual ~BinaryFunctionPass() = default; - virtual void runOnFunctions(BinaryContext &BC, - std::map &BFs) = 0; -}; - /// Simple class for managing analyses and optimizations on BinaryFunctions. class BinaryFunctionPassManager { private: - BinaryContext *BC; - std::map *BFs; - std::vector *, + BinaryContext &BC; + std::map &BFs; + std::vector &, std::unique_ptr>> Passes; - /// Manager that contains all implemented passes. - static std::unique_ptr GlobalPassManager; - public: - BinaryFunctionPassManager(BinaryContext *BC = nullptr, - std::map *BFs = nullptr) + BinaryFunctionPassManager(BinaryContext &BC, + std::map &BFs) : BC(BC), BFs(BFs) {} - static BinaryFunctionPassManager &getGlobalPassManager() { - if (!GlobalPassManager) { - GlobalPassManager = llvm::make_unique(); - } - return *GlobalPassManager.get(); - } - /// Adds a pass to this manager based on the value of its corresponding /// command-line option. void registerPass(std::unique_ptr Pass, - const cl::opt *Opt) { + const cl::opt &Opt) { Passes.emplace_back(Opt, std::move(Pass)); } /// Run all registered passes in the order they were added. 
void runPasses() { for (const auto &OptPassPair : Passes) { - if (*OptPassPair.first) { - OptPassPair.second->runOnFunctions(*BC, *BFs); + if (OptPassPair.first) { + OptPassPair.second->runOnFunctions(BC, BFs); } } } @@ -77,15 +60,6 @@ class BinaryFunctionPassManager { }; -template *Opt> -class RegisterBinaryPass { -public: - RegisterBinaryPass() { - BinaryFunctionPassManager::getGlobalPassManager().registerPass( - std::move(llvm::make_unique()), Opt); - } -}; - } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index 5299d2d6d604..259582d15a1f 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -9,112 +9,248 @@ // //===----------------------------------------------------------------------===// -#include "BinaryPassManager.h" +#include "BinaryPasses.h" #define DEBUG_TYPE "bolt" namespace llvm { namespace bolt { -/// Detects functions that simply do a tail call when they are called and -/// optimizes calls to these functions. -class OptimizeBodylessFunctions : public BinaryFunctionPass { -private: - /// EquivalentCallTarget[F] = G ==> function F is simply a tail call to G, - /// thus calls to F can be optimized to calls to G. 
- std::map EquivalentCallTarget; - - void analyze(BinaryFunction &BF, - BinaryContext &BC, - std::map &BFs) { - if (BF.size() != 1 || BF.begin()->size() == 0) - return; - - auto &BB = *BF.begin(); - const auto &FirstInst = *BB.begin(); - - if (!BC.MIA->isTailCall(FirstInst)) - return; - - auto &Op1 = FirstInst.getOperand(0); - if (!Op1.isExpr()) - return; - - if (auto Expr = dyn_cast(Op1.getExpr())) { - auto AddressIt = BC.GlobalSymbols.find(Expr->getSymbol().getName()); - if (AddressIt != BC.GlobalSymbols.end()) { - auto CalleeIt = BFs.find(AddressIt->second); - if (CalleeIt != BFs.end()) { - assert(Expr->getSymbol().getName() == CalleeIt->second.getName()); - EquivalentCallTarget[BF.getName()] = &CalleeIt->second; - } +void OptimizeBodylessFunctions::analyze( + BinaryFunction &BF, + BinaryContext &BC, + std::map &BFs) { + if (BF.size() != 1 || BF.begin()->size() == 0) + return; + + auto &BB = *BF.begin(); + const auto &FirstInst = *BB.begin(); + + if (!BC.MIA->isTailCall(FirstInst)) + return; + + auto &Op1 = FirstInst.getOperand(0); + if (!Op1.isExpr()) + return; + + if (auto Expr = dyn_cast(Op1.getExpr())) { + auto AddressIt = BC.GlobalSymbols.find(Expr->getSymbol().getName()); + if (AddressIt != BC.GlobalSymbols.end()) { + auto CalleeIt = BFs.find(AddressIt->second); + if (CalleeIt != BFs.end()) { + assert(Expr->getSymbol().getName() == CalleeIt->second.getName()); + EquivalentCallTarget[BF.getName()] = &CalleeIt->second; } } } +} - void optimizeCalls(BinaryFunction &BF, - BinaryContext &BC) { - for (auto BBIt = BF.begin(), BBEnd = BF.end(); BBIt != BBEnd; ++BBIt) { - for (auto InstIt = BBIt->begin(), InstEnd = BBIt->end(); - InstIt != InstEnd; ++InstIt) { - auto &Inst = *InstIt; - if (BC.MIA->isCall(Inst)) { - auto &Op1 = Inst.getOperand(0); - if (Op1.isExpr()) { - if (auto Expr = dyn_cast(Op1.getExpr())) { - auto OriginalTarget = Expr->getSymbol().getName(); - auto Target = OriginalTarget; - // Iteratively update target since we could have f1() calling f2() - 
// calling f3() calling f4() and we want to output f1() directly - // calling f4(). - while (EquivalentCallTarget.count(Target)) { - Target = EquivalentCallTarget.find(Target)->second->getName(); - } - if (Target != OriginalTarget) { - DEBUG(errs() << "BOLT-DEBUG: Optimizing " << BF.getName() - << ": replacing call to " - << OriginalTarget - << " by call to " << Target << "\n"); - Inst.clear(); - Inst.addOperand(MCOperand::createExpr( - MCSymbolRefExpr::create( +void OptimizeBodylessFunctions::optimizeCalls(BinaryFunction &BF, + BinaryContext &BC) { + for (auto BBIt = BF.begin(), BBEnd = BF.end(); BBIt != BBEnd; ++BBIt) { + for (auto InstIt = BBIt->begin(), InstEnd = BBIt->end(); + InstIt != InstEnd; ++InstIt) { + auto &Inst = *InstIt; + if (BC.MIA->isCall(Inst)) { + auto &Op1 = Inst.getOperand(0); + if (Op1.isExpr()) { + if (auto Expr = dyn_cast(Op1.getExpr())) { + auto OriginalTarget = Expr->getSymbol().getName(); + auto Target = OriginalTarget; + // Iteratively update target since we could have f1() calling f2() + // calling f3() calling f4() and we want to output f1() directly + // calling f4(). 
+ while (EquivalentCallTarget.count(Target)) { + Target = EquivalentCallTarget.find(Target)->second->getName(); + } + if (Target != OriginalTarget) { + DEBUG(errs() << "BOLT-DEBUG: Optimizing " << BF.getName() + << ": replacing call to " + << OriginalTarget + << " by call to " << Target << "\n"); + Inst.clear(); + Inst.addOperand(MCOperand::createExpr( + MCSymbolRefExpr::create( BC.Ctx->getOrCreateSymbol(Target), *BC.Ctx))); - } } } } } } } +} + +void OptimizeBodylessFunctions::runOnFunctions( + BinaryContext &BC, + std::map &BFs) { + for (auto &It : BFs) { + analyze(It.second, BC, BFs); + } + for (auto &It : BFs) { + optimizeCalls(It.second, BC); + } +} + +void InlineSmallFunctions::findInliningCandidates( + BinaryContext &BC, + const std::map &BFs) { + for (const auto &BFIt : BFs) { + const auto &Function = BFIt.second; + if (Function.size() != 1) + continue; + auto &BB = *Function.begin(); + const auto &LastInstruction = *BB.rbegin(); + // Check if the function is small enough and doesn't do a tail call. + // The size we use includes pseudo-instructions but here they shouldn't + // matter. So some opportunities may be missed because of this. + if (BB.size() > 0 && + BB.size() <= kMaxInstructions && + BC.MIA->isReturn(LastInstruction) && + !BC.MIA->isTailCall(LastInstruction)) { + InliningCandidates.insert(Function.getName()); + } + } + + DEBUG(errs() << "BOLT-DEBUG: " << InliningCandidates.size() + << " inlineable functions.\n"); +} + +namespace { + +/// Returns whether a function creates a stack frame for itself or not. +/// If so, we need to manipulate the stack pointer when calling this function. +/// Since we're only inlining very small functions, we return false for now, but +/// we could for instance check if the function starts with 'push ebp'. +/// TODO generalize this. 
+bool createsStackFrame(const BinaryBasicBlock &) { + return false; +} + +} // namespace + +void InlineSmallFunctions::inlineCall( + BinaryContext &BC, + BinaryBasicBlock &BB, + MCInst *CallInst, + const BinaryBasicBlock &InlinedFunctionBB) { + assert(BC.MIA->isCall(*CallInst) && "Can only inline a call."); + assert(BC.MIA->isReturn(*InlinedFunctionBB.rbegin()) && + "Inlined function should end with a return."); + + std::vector InlinedInstance; + + bool ShouldAdjustStack = createsStackFrame(InlinedFunctionBB); + + // Move stack like 'call' would if needed. + if (ShouldAdjustStack) { + MCInst StackInc; + BC.MIA->createStackPointerIncrement(StackInc); + InlinedInstance.push_back(StackInc); + } -public: - void runOnFunctions(BinaryContext &BC, - std::map &BFs) override { - for (auto &It : BFs) { - analyze(It.second, BC, BFs); + for (auto Instruction : InlinedFunctionBB) { + if (BC.MIA->isReturn(Instruction)) { + break; } - for (auto &It : BFs) { - optimizeCalls(It.second, BC); + if (!BC.MIA->isEHLabel(Instruction) && + !BC.MIA->isCFI(Instruction)) { + InlinedInstance.push_back(Instruction); } } -}; -namespace opts { + // Move stack pointer like 'ret' would. 
+ if (ShouldAdjustStack) { + MCInst StackDec; + BC.MIA->createStackPointerDecrement(StackDec); + InlinedInstance.push_back(StackDec); + } -static llvm::cl::opt -OptimizeBodylessFunctions( - "optimize-bodyless-functions", - llvm::cl::desc("optimize functions that just do a tail call"), - llvm::cl::Optional); + BB.replaceInstruction(CallInst, InlinedInstance); +} -} // namespace opts +void InlineSmallFunctions::inlineCallsInFunction( + BinaryContext &BC, + BinaryFunction &Function) { + std::vector Blocks(Function.layout().begin(), + Function.layout().end()); + std::sort(Blocks.begin(), Blocks.end(), + [](const BinaryBasicBlock *BB1, const BinaryBasicBlock *BB2) { + return BB1->getExecutionCount() > BB2->getExecutionCount(); + }); + uint32_t ExtraSize = 0; -namespace { + for (auto BB : Blocks) { + for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ++InstIt) { + auto &Inst = *InstIt; + if (BC.MIA->isCall(Inst)) { + totalDynamicCalls += BB->getExecutionCount(); + } + } + } -RegisterBinaryPass -RegisterOptimizeBodylessFunctions; + for (auto BB : Blocks) { + if (BB->isCold()) + continue; -} // namespace + for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ) { + auto &Inst = *InstIt; + if (BC.MIA->isCall(Inst) && + !BC.MIA->isTailCall(Inst) && + Inst.size() == 1 && + Inst.getOperand(0).isExpr()) { + auto Target = dyn_cast( + Inst.getOperand(0).getExpr()); + assert(Target && "Not MCSymbolRefExpr"); + auto FunctionIt = FunctionByName.find(Target->getSymbol().getName()); + if (FunctionIt != FunctionByName.end()) { + auto &TargetFunction = *FunctionIt->second; + bool CallToInlineableFunction = + InliningCandidates.count(TargetFunction.getName()); + + totalInlineableCalls += + CallToInlineableFunction * BB->getExecutionCount(); + + if (CallToInlineableFunction && + TargetFunction.getSize() + ExtraSize + + Function.estimateHotSize() < Function.getMaxSize()) { + auto NextInstIt = std::next(InstIt); + inlineCall(BC, *BB, &Inst, *TargetFunction.begin()); + 
DEBUG(errs() << "BOLT-DEBUG: Inlining call to " + << TargetFunction.getName() << " in " + << Function.getName() << "\n"); + InstIt = NextInstIt; + ExtraSize += TargetFunction.getSize(); + inlinedDynamicCalls += BB->getExecutionCount(); + continue; + } + } + } + + ++InstIt; + } + } +} + +void InlineSmallFunctions::runOnFunctions( + BinaryContext &BC, + std::map &BFs) { + for (auto &It : BFs) { + FunctionByName[It.second.getName()] = &It.second; + } + findInliningCandidates(BC, BFs); + uint32_t ConsideredFunctions = 0; + for (auto &It : BFs) { + if (ConsideredFunctions == kMaxFunctions) + break; + inlineCallsInFunction(BC, It.second); + ++ConsideredFunctions; + } + DEBUG(errs() << "BOLT-DEBUG: Inlined " << inlinedDynamicCalls << " of " + << totalDynamicCalls << " function calls in the profile.\n"); + DEBUG(errs() << "BOLT-DEBUG: Inlined calls represent " + << (100.0 * inlinedDynamicCalls / totalInlineableCalls) + << "% of all inlineable calls in the profile.\n"); +} } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryPasses.h b/bolt/BinaryPasses.h new file mode 100644 index 000000000000..36c8c9e899bf --- /dev/null +++ b/bolt/BinaryPasses.h @@ -0,0 +1,95 @@ +//===--- BinaryPasses.h - Binary-level analysis/optimization passes -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The set of optimization/analysis passes that run on BinaryFunctions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_PASSES_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_PASSES_H + +#include "BinaryContext.h" +#include "BinaryFunction.h" +#include +#include +#include + +namespace llvm { +namespace bolt { + +/// An optimization/analysis pass that runs on functions. 
+class BinaryFunctionPass { +public: + virtual ~BinaryFunctionPass() = default; + virtual void runOnFunctions(BinaryContext &BC, + std::map &BFs) = 0; +}; + +/// Detects functions that simply do a tail call when they are called and +/// optimizes calls to these functions. +class OptimizeBodylessFunctions : public BinaryFunctionPass { +private: + /// EquivalentCallTarget[F] = G ==> function F is simply a tail call to G, + /// thus calls to F can be optimized to calls to G. + std::map EquivalentCallTarget; + + void analyze(BinaryFunction &BF, + BinaryContext &BC, + std::map &BFs); + + void optimizeCalls(BinaryFunction &BF, + BinaryContext &BC); + +public: + void runOnFunctions(BinaryContext &BC, + std::map &BFs) override; +}; + +/// Inlining of single basic block functions. +/// The pass currently does not handle CFI instructions. This is needed for +/// correctness and we may break exception handling because of this. +class InlineSmallFunctions : public BinaryFunctionPass { +private: + std::set InliningCandidates; + /// Maps function name to BinaryFunction. + std::map FunctionByName; + + /// Maximum number of instructions in an inlined function. + static const unsigned kMaxInstructions = 8; + /// Maximum number of functions that will be considered for inlining (in + /// ascending address order). + static const unsigned kMaxFunctions = 30000; + + /// Statistics collected for debugging. + uint64_t totalDynamicCalls = 0; + uint64_t inlinedDynamicCalls = 0; + uint64_t totalInlineableCalls = 0; + + void findInliningCandidates(BinaryContext &BC, + const std::map &BFs); + + /// Inline the call in CallInst to InlinedFunctionBB (the only BB of the + /// called function). 
+ void inlineCall(BinaryContext &BC, + BinaryBasicBlock &BB, + MCInst *CallInst, + const BinaryBasicBlock &InlinedFunctionBB); + + void inlineCallsInFunction(BinaryContext &BC, + BinaryFunction &Function); + +public: + void runOnFunctions(BinaryContext &BC, + std::map &BFs) override; +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 486b84ed958b..52b7c51d9bfe 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1054,7 +1054,6 @@ void RewriteInstance::runOptimizationPasses() { } // Post-processing passes. - BinaryFunctionPassManager::runAllPasses(*BC, BinaryFunctions); // Fix the CFI state. if (!Function.fixCFIState()) { From 15e6cddec68730a2835464e4875c546aabda1c70 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 2 May 2016 12:47:18 -0700 Subject: [PATCH 109/904] Put all optimization passes under the pass manager. Summary: Move eliminate unreachable code, block reordering, and CFI/exception fixup into official optimization passes. 
(cherry picked from commit 6f0a3da156d01a9216f2dfbfa0d36f4f1f51105e) --- bolt/BinaryPassManager.cpp | 29 ++++++- bolt/BinaryPassManager.h | 20 +++-- bolt/BinaryPasses.cpp | 172 +++++++++++++++++++++++++++++++++++-- bolt/BinaryPasses.h | 39 ++++++++- bolt/RewriteInstance.cpp | 124 ++------------------------ 5 files changed, 250 insertions(+), 134 deletions(-) diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index fdc5856d2af4..e7bd9119368d 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -13,6 +13,11 @@ namespace opts { +static llvm::cl::opt +EliminateUnreachable("eliminate-unreachable", + llvm::cl::desc("eliminate unreachable code"), + llvm::cl::Optional); + static llvm::cl::opt OptimizeBodylessFunctions( "optimize-bodyless-functions", @@ -30,20 +35,38 @@ InlineSmallFunctions( namespace llvm { namespace bolt { +cl::opt BinaryFunctionPassManager::AlwaysOn( + "always-run-pass", + llvm::cl::desc("Used for passes that are always enabled"), + cl::init(true), + cl::ReallyHidden); + +bool BinaryFunctionPassManager::NagUser = false; + void BinaryFunctionPassManager::runAllPasses( - BinaryContext &BC, - std::map &Functions) { - BinaryFunctionPassManager Manager(BC, Functions); + BinaryContext &BC, + std::map &Functions, + std::set &LargeFunctions +) { + BinaryFunctionPassManager Manager(BC, Functions, LargeFunctions); // Here we manage dependencies/order manually, since passes are ran in the // order they're registered. 
+ Manager.registerPass( + std::move(llvm::make_unique(Manager.NagUser)), + opts::EliminateUnreachable); + + Manager.registerPass(std::move(llvm::make_unique())); + Manager.registerPass(llvm::make_unique(), opts::OptimizeBodylessFunctions); Manager.registerPass(llvm::make_unique(), opts::InlineSmallFunctions); + Manager.registerPass(std::move(llvm::make_unique())); + Manager.runPasses(); } diff --git a/bolt/BinaryPassManager.h b/bolt/BinaryPassManager.h index 0e9b5915377d..5875a99b5a7e 100644 --- a/bolt/BinaryPassManager.h +++ b/bolt/BinaryPassManager.h @@ -28,15 +28,19 @@ namespace bolt { /// Simple class for managing analyses and optimizations on BinaryFunctions. class BinaryFunctionPassManager { private: + static cl::opt AlwaysOn; + static bool NagUser; BinaryContext &BC; std::map &BFs; + std::set &LargeFunctions; std::vector &, std::unique_ptr>> Passes; -public: + public: BinaryFunctionPassManager(BinaryContext &BC, - std::map &BFs) - : BC(BC), BFs(BFs) {} + std::map &BFs, + std::set &LargeFunctions) + : BC(BC), BFs(BFs), LargeFunctions(LargeFunctions) {} /// Adds a pass to this manager based on the value of its corresponding /// command-line option. @@ -45,18 +49,24 @@ class BinaryFunctionPassManager { Passes.emplace_back(Opt, std::move(Pass)); } + /// Adds an unconditionally run pass to this manager. + void registerPass(std::unique_ptr Pass) { + Passes.emplace_back(AlwaysOn, std::move(Pass)); + } + /// Run all registered passes in the order they were added. void runPasses() { for (const auto &OptPassPair : Passes) { if (OptPassPair.first) { - OptPassPair.second->runOnFunctions(BC, BFs); + OptPassPair.second->runOnFunctions(BC, BFs, LargeFunctions); } } } /// Runs all enabled implemented passes on all functions. 
static void runAllPasses(BinaryContext &BC, - std::map &Functions); + std::map &Functions, + std::set &largeFunctions); }; diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index 259582d15a1f..dde0811df818 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -10,9 +10,45 @@ //===----------------------------------------------------------------------===// #include "BinaryPasses.h" +#include "llvm/Support/Options.h" #define DEBUG_TYPE "bolt" +namespace opts { + +extern llvm::cl::opt PrintAll; +extern llvm::cl::opt PrintReordered; +extern llvm::cl::opt PrintEHRanges; +extern llvm::cl::opt PrintUCE; +extern llvm::cl::opt SplitFunctions; +extern bool shouldProcess(const llvm::bolt::BinaryFunction &Function); + +static llvm::cl::opt +ReorderBlocks( + "reorder-blocks", + llvm::cl::desc("change layout of basic blocks in a function"), + llvm::cl::init(llvm::bolt::BinaryFunction::LT_NONE), + llvm::cl::values(clEnumValN(llvm::bolt::BinaryFunction::LT_NONE, + "none", + "do not reorder basic blocks"), + clEnumValN(llvm::bolt::BinaryFunction::LT_REVERSE, + "reverse", + "layout blocks in reverse order"), + clEnumValN(llvm::bolt::BinaryFunction::LT_OPTIMIZE, + "normal", + "perform optimal layout based on profile"), + clEnumValN(llvm::bolt::BinaryFunction::LT_OPTIMIZE_BRANCH, + "branch-predictor", + "perform optimal layout prioritizing branch " + "predictions"), + clEnumValN(llvm::bolt::BinaryFunction::LT_OPTIMIZE_CACHE, + "cache", + "perform optimal layout prioritizing I-cache " + "behavior"), + clEnumValEnd)); + +} // namespace opts + namespace llvm { namespace bolt { @@ -82,12 +118,19 @@ void OptimizeBodylessFunctions::optimizeCalls(BinaryFunction &BF, void OptimizeBodylessFunctions::runOnFunctions( BinaryContext &BC, - std::map &BFs) { + std::map &BFs, + std::set &) { for (auto &It : BFs) { - analyze(It.second, BC, BFs); + auto &Function = It.second; + if (Function.isSimple() && opts::shouldProcess(Function)) { + analyze(Function, BC, BFs); + } } for (auto 
&It : BFs) { - optimizeCalls(It.second, BC); + auto &Function = It.second; + if (Function.isSimple() && opts::shouldProcess(Function)) { + optimizeCalls(Function, BC); + } } } @@ -96,7 +139,9 @@ void InlineSmallFunctions::findInliningCandidates( const std::map &BFs) { for (const auto &BFIt : BFs) { const auto &Function = BFIt.second; - if (Function.size() != 1) + if (!Function.isSimple() || + !opts::shouldProcess(Function) || + Function.size() != 1) continue; auto &BB = *Function.begin(); const auto &LastInstruction = *BB.rbegin(); @@ -233,16 +278,20 @@ void InlineSmallFunctions::inlineCallsInFunction( void InlineSmallFunctions::runOnFunctions( BinaryContext &BC, - std::map &BFs) { + std::map &BFs, + std::set &) { for (auto &It : BFs) { FunctionByName[It.second.getName()] = &It.second; } findInliningCandidates(BC, BFs); uint32_t ConsideredFunctions = 0; for (auto &It : BFs) { + auto &Function = It.second; + if (!Function.isSimple() || !opts::shouldProcess(Function)) + continue; if (ConsideredFunctions == kMaxFunctions) break; - inlineCallsInFunction(BC, It.second); + inlineCallsInFunction(BC, Function); ++ConsideredFunctions; } DEBUG(errs() << "BOLT-DEBUG: Inlined " << inlinedDynamicCalls << " of " @@ -252,5 +301,116 @@ void InlineSmallFunctions::runOnFunctions( << "% of all inlineable calls in the profile.\n"); } +void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) { + if (!Function.isSimple() || !opts::shouldProcess(Function)) return; + + // FIXME: this wouldn't work with C++ exceptions until we implement + // support for those as there will be "invisible" edges + // in the graph. 
+ if (Function.layout_size() > 0) { + if (NagUser) { + outs() + << "BOLT-WARNING: Using -eliminate-unreachable is experimental and " + "unsafe for exceptions\n"; + NagUser = false; + } + + if (Function.hasEHRanges()) return; + + std::stack Stack; + std::map Reachable; + BinaryBasicBlock *Entry = *Function.layout_begin(); + Stack.push(Entry); + Reachable[Entry] = true; + // Determine reachable BBs from the entry point + while (!Stack.empty()) { + auto BB = Stack.top(); + Stack.pop(); + for (auto Succ : BB->successors()) { + if (Reachable[Succ]) + continue; + Reachable[Succ] = true; + Stack.push(Succ); + } + } + + auto Count = Function.eraseDeadBBs(Reachable); + if (Count) { + DEBUG(dbgs() << "BOLT: Removed " << Count + << " dead basic block(s) in function " + << Function.getName() << '\n'); + } + + if (opts::PrintAll || opts::PrintUCE) + Function.print(errs(), "after unreachable code elimination", true); + } +} + +void EliminateUnreachableBlocks::runOnFunctions( + BinaryContext&, + std::map &BFs, + std::set & +) { + for (auto &It : BFs) { + runOnFunction(It.second); + } +} + +void ReorderBasicBlocks::runOnFunctions( + BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions +) { + for (auto &It : BFs) { + auto &Function = It.second; + + if (!Function.isSimple()) + continue; + + if (!opts::shouldProcess(Function)) + continue; + + if (opts::ReorderBlocks != BinaryFunction::LT_NONE) { + bool ShouldSplit = + (opts::SplitFunctions == BinaryFunction::ST_ALL) || + (opts::SplitFunctions == BinaryFunction::ST_EH && + Function.hasEHRanges()) || + (LargeFunctions.find(It.first) != LargeFunctions.end()); + Function.modifyLayout(opts::ReorderBlocks, ShouldSplit); + if (opts::PrintAll || opts::PrintReordered) + Function.print(errs(), "after reordering blocks", true); + } + } +} + +void FixupFunctions::runOnFunctions( + BinaryContext &BC, + std::map &BFs, + std::set & +) { + for (auto &It : BFs) { + auto &Function = It.second; + + if (!Function.isSimple()) + continue; + + if 
(!opts::shouldProcess(Function)) + continue; + + // Fix the CFI state. + if (!Function.fixCFIState()) { + errs() << "BOLT-WARNING: unable to fix CFI state for function " + << Function.getName() << ". Skipping.\n"; + Function.setSimple(false); + continue; + } + + // Update exception handling information. + Function.updateEHRanges(); + if (opts::PrintAll || opts::PrintEHRanges) + Function.print(errs(), "after updating EH ranges", true); + } +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryPasses.h b/bolt/BinaryPasses.h index 36c8c9e899bf..4cc9a6508c9c 100644 --- a/bolt/BinaryPasses.h +++ b/bolt/BinaryPasses.h @@ -28,7 +28,8 @@ class BinaryFunctionPass { public: virtual ~BinaryFunctionPass() = default; virtual void runOnFunctions(BinaryContext &BC, - std::map &BFs) = 0; + std::map &BFs, + std::set &LargeFunctions) = 0; }; /// Detects functions that simply do a tail call when they are called and @@ -48,7 +49,8 @@ class OptimizeBodylessFunctions : public BinaryFunctionPass { public: void runOnFunctions(BinaryContext &BC, - std::map &BFs) override; + std::map &BFs, + std::set &LargeFunctions) override; }; /// Inlining of single basic block functions. @@ -86,7 +88,38 @@ class InlineSmallFunctions : public BinaryFunctionPass { public: void runOnFunctions(BinaryContext &BC, - std::map &BFs) override; + std::map &BFs, + std::set &LargeFunctions) override; +}; + +/// Detect and eliminate unreachable basic blocks. We could have those +/// filled with nops and they are used for alignment. +class EliminateUnreachableBlocks : public BinaryFunctionPass { + bool& NagUser; + void runOnFunction(BinaryFunction& Function); + public: + explicit EliminateUnreachableBlocks(bool &nagUser) : NagUser(nagUser) { } + + void runOnFunctions(BinaryContext&, + std::map &BFs, + std::set &LargeFunctions) override; +}; + +// Reorder the basic blocks for each function based on hotness. 
+class ReorderBasicBlocks : public BinaryFunctionPass { + public: + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + +/// Fix the CFI state and exception handling information after all other +/// passes have completed. +class FixupFunctions : public BinaryFunctionPass { + public: + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; }; } // namespace bolt diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 52b7c51d9bfe..de5193bb383c 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -101,12 +101,7 @@ MaxFunctions("max-funcs", cl::desc("maximum # of functions to overwrite"), cl::Optional); -static cl::opt -EliminateUnreachable("eliminate-unreachable", - cl::desc("eliminate unreachable code"), - cl::Optional); - -static cl::opt +cl::opt SplitFunctions("split-functions", cl::desc("split functions into hot and cold regions"), cl::init(BinaryFunction::ST_NONE), @@ -132,31 +127,6 @@ FixDebugInfoLargeFunctions("fix-debuginfo-large-functions", "functions, to correct their debug info."), cl::Optional); -static cl::opt -ReorderBlocks( - "reorder-blocks", - cl::desc("change layout of basic blocks in a function"), - cl::init(BinaryFunction::LT_NONE), - cl::values(clEnumValN(BinaryFunction::LT_NONE, - "none", - "do not reorder basic blocks"), - clEnumValN(BinaryFunction::LT_REVERSE, - "reverse", - "layout blocks in reverse order"), - clEnumValN(BinaryFunction::LT_OPTIMIZE, - "normal", - "perform optimal layout based on profile"), - clEnumValN(BinaryFunction::LT_OPTIMIZE_BRANCH, - "branch-predictor", - "perform optimal layout prioritizing branch " - "predictions"), - clEnumValN(BinaryFunction::LT_OPTIMIZE_CACHE, - "cache", - "perform optimal layout prioritizing I-cache " - "behavior"), - clEnumValEnd)); - - static cl::opt AlignBlocks("align-blocks", cl::desc("try to align BBs inserting nops"), @@ -170,7 +140,7 @@ static cl::opt 
DumpEHFrame("dump-eh-frame", cl::desc("dump parsed .eh_frame (debugging)"), cl::Hidden); -static cl::opt +cl::opt PrintAll("print-all", cl::desc("print functions after each stage"), cl::Hidden); @@ -178,7 +148,7 @@ static cl::opt PrintCFG("print-cfg", cl::desc("print functions after CFG construction"), cl::Hidden); -static cl::opt +cl::opt PrintUCE("print-uce", cl::desc("print functions after unreachable code elimination"), cl::Hidden); @@ -187,12 +157,12 @@ static cl::opt PrintDisasm("print-disasm", cl::desc("print function after disassembly"), cl::Hidden); -static cl::opt +cl::opt PrintEHRanges("print-eh-ranges", cl::desc("print function with updated exception ranges"), cl::Hidden); -static cl::opt +cl::opt PrintReordered("print-reordered", cl::desc("print functions after layout optimization"), cl::Hidden); @@ -606,7 +576,7 @@ void RewriteInstance::run() { assert(FunctionIt != BinaryFunctions.end() && "Invalid large function address."); errs() << "BOLT-WARNING: Function " << FunctionIt->second.getName() - << " is larger than it's orginal size: emitting again marking it " + << " is larger than its orginal size: emitting again marking it " << "as not simple.\n"; FunctionIt->second.setSimple(false); } @@ -989,87 +959,7 @@ void RewriteInstance::disassembleFunctions() { void RewriteInstance::runOptimizationPasses() { // Run optimization passes. // - // FIXME: use real optimization passes. - bool NagUser = true; - for (auto &BFI : BinaryFunctions) { - auto &Function = BFI.second; - - if (!opts::shouldProcess(Function)) - continue; - - if (!Function.isSimple()) - continue; - - // Detect and eliminate unreachable basic blocks. We could have those - // filled with nops and they are used for alignment. - // - // FIXME: this wouldn't work with C++ exceptions until we implement - // support for those as there will be "invisible" edges - // in the graph. 
- if (opts::EliminateUnreachable && Function.layout_size() > 0) { - if (NagUser) { - outs() - << "BOLT-WARNING: Using -eliminate-unreachable is experimental and " - "unsafe for exceptions\n"; - NagUser = false; - } - - std::stack Stack; - std::map Reachable; - BinaryBasicBlock *Entry = *Function.layout_begin(); - Stack.push(Entry); - Reachable[Entry] = true; - // Determine reachable BBs from the entry point - while (!Stack.empty()) { - auto BB = Stack.top(); - Stack.pop(); - for (auto Succ : BB->successors()) { - if (Reachable[Succ]) - continue; - Reachable[Succ] = true; - Stack.push(Succ); - } - } - - auto Count = Function.eraseDeadBBs(Reachable); - if (Count) { - DEBUG(dbgs() << "BOLT: Removed " << Count - << " dead basic block(s) in function " - << Function.getName() << '\n'); - } - - if (opts::PrintAll || opts::PrintUCE) - Function.print(errs(), "after unreachable code elimination", true); - } - - if (opts::ReorderBlocks != BinaryFunction::LT_NONE) { - bool ShouldSplit = - (opts::SplitFunctions == BinaryFunction::ST_ALL) || - (opts::SplitFunctions == BinaryFunction::ST_EH && - Function.hasEHRanges()) || - (LargeFunctions.find(BFI.first) != LargeFunctions.end()); - BFI.second.modifyLayout(opts::ReorderBlocks, ShouldSplit); - if (opts::PrintAll || opts::PrintReordered) - Function.print(errs(), "after reordering blocks", true); - } - - // Post-processing passes. - - // Fix the CFI state. - if (!Function.fixCFIState()) { - errs() << "BOLT-WARNING: unable to fix CFI state for function " - << Function.getName() << ". Skipping.\n"; - Function.setSimple(false); - continue; - } - - // Update exception handling information. 
- Function.updateEHRanges(); - if (opts::PrintAll || opts::PrintEHRanges) - Function.print(errs(), "after updating EH ranges", true); - } - - BinaryFunctionPassManager::runAllPasses(*BC, BinaryFunctions); + BinaryFunctionPassManager::runAllPasses(*BC, BinaryFunctions, LargeFunctions); } namespace { From a149448301641429810380e54f72424520755f57 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 11 May 2016 19:13:38 -0700 Subject: [PATCH 110/904] Fix issue with garbage address in .debug_line. Summary: While emitting debug lines for a function we don't overwrite, we don't have a code section context that is needed by default writing routine. Hence we have to emit end_sequence after the last address, not at the end of section. (cherry picked from commit 12cf0eb4fab44ee0d803220ef57b11ad8f35ca0e) --- bolt/RewriteInstance.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index de5193bb383c..86b3ea3b7fcd 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2296,7 +2296,8 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { ELF::SHT_PROGBITS, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); - if (LineTable->lookupAddressRange(Address, Function.getSize(), Results)) { + if (LineTable->lookupAddressRange(Address, Function.getMaxSize() + 1, + Results)) { for (auto RowIndex : Results) { const auto &Row = LineTable->Rows[RowIndex]; BC->Ctx->setCurrentDwarfLoc( From 08ce2a63b5f2929aac8d2c045db41616b4c6ef8f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 2 May 2016 12:47:18 -0700 Subject: [PATCH 111/904] Patch forward jumping tail calls to prevent branch mispredictions. Summary: A simple optimization to prevent branch misprediction for tail calls. Convert the sequence: j L1 ... L1: jmp foo # tail call into: j foo but only if 'j foo' turns out to be a forward branch. 
(cherry picked from commit 78e2a4878ad28b502dc66f65afb9e3bfaac0c7e5) --- bolt/BinaryBasicBlock.cpp | 8 +++ bolt/BinaryBasicBlock.h | 9 +++ bolt/BinaryPassManager.cpp | 15 +++++ bolt/BinaryPasses.cpp | 115 +++++++++++++++++++++++++++++++++++++ bolt/BinaryPasses.h | 26 +++++++++ 5 files changed, 173 insertions(+) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 0aafc67d053d..39e51fc353c4 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -62,5 +62,13 @@ void BinaryBasicBlock::removePredecessor(BinaryBasicBlock *Pred) { Predecessors.erase(I); } +bool BinaryBasicBlock::analyzeBranch(const MCInstrAnalysis &MIA, + const MCSymbol *&TBB, + const MCSymbol *&FBB, + MCInst *&CondBranch, + MCInst *&UncondBranch) { + return MIA.analyzeBranch(Instructions, TBB, FBB, CondBranch, UncondBranch); +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 0ba3a8630d08..31f6595bb4e6 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -19,6 +19,7 @@ #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstPrinter.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" @@ -306,6 +307,14 @@ class BinaryBasicBlock { return Function; } + /// Analyze and interpret the terminators of this basic block. TBB must be + /// initialized with the original fall-through for this BB. + bool analyzeBranch(const MCInstrAnalysis &MIA, + const MCSymbol *&TBB, + const MCSymbol *&FBB, + MCInst *&CondBranch, + MCInst *&UncondBranch); + private: /// Adds predecessor to the BB. Most likely you don't need to call this. 
diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index e7bd9119368d..da78727fab39 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -30,6 +30,12 @@ InlineSmallFunctions( llvm::cl::desc("inline functions with a single basic block"), llvm::cl::Optional); +static llvm::cl::opt +SimplifyConditionalTailCalls("simplify-conditional-tail-calls", + llvm::cl::desc("simplify conditional tail calls " + "by removing unnecessary jumps"), + llvm::cl::Optional); + } // namespace opts namespace llvm { @@ -59,6 +65,15 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(std::move(llvm::make_unique())); + Manager.registerPass(llvm::make_unique(), + opts::SimplifyConditionalTailCalls); + + // The tail call fixup pass may introduce unreachable code. Add another + // instance of EliminateUnreachableBlocks here to catch it. + Manager.registerPass( + std::move(llvm::make_unique(Manager.NagUser)), + opts::EliminateUnreachable); + Manager.registerPass(llvm::make_unique(), opts::OptimizeBodylessFunctions); diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index dde0811df818..b2290ccc73ec 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -412,5 +412,120 @@ void FixupFunctions::runOnFunctions( } } +bool SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, + BinaryFunction &BF) { + if (BF.layout_size() == 0) + return false; + + auto &MIA = BC.MIA; + uint64_t NumLocalTailCalls = 0; + uint64_t NumLocalPatchedTailCalls = 0; + + for (auto* BB : BF.layout()) { + const MCSymbol *TBB = nullptr; + const MCSymbol *FBB = nullptr; + MCInst *CondBranch = nullptr; + MCInst *UncondBranch = nullptr; + + // Determine the control flow at the end of each basic block + if (!BB->analyzeBranch(*MIA, TBB, FBB, CondBranch, UncondBranch)) { + continue; + } + + // TODO: do we need to test for other branch patterns? 
+ + // For this particular case, the first basic block ends with + // a conditional branch and has two successors, one fall-through + // and one for when the condition is true. + // The target of the conditional is a basic block with a single + // unconditional branch (i.e. tail call) to another function. + // We don't care about the contents of the fall-through block. + // Note: this code makes the assumption that the fall-through + // block is the last successor. + if (CondBranch && !UncondBranch && BB->succ_size() == 2) { + // Find conditional branch target assuming the fall-through is + // always the last successor. + auto *CondTargetBB = *BB->succ_begin(); + + // Does the BB contain a single instruction? + if (CondTargetBB->size() - CondTargetBB->getNumPseudos() == 1) { + // Check to see if the sole instruction is a tail call. + auto const &Instr = *CondTargetBB->begin(); + + if (MIA->isTailCall(Instr)) { + ++NumTailCallCandidates; + ++NumLocalTailCalls; + + auto const &TailTargetSymExpr = + cast(Instr.getOperand(0).getExpr()); + auto const &TailTarget = TailTargetSymExpr->getSymbol(); + + // Lookup the address for the current function and + // the tail call target. + auto const FnAddress = BC.GlobalSymbols.find(BF.getName()); + auto const TailAddress = BC.GlobalSymbols.find(TailTarget.getName()); + if (FnAddress == BC.GlobalSymbols.end() || + TailAddress == BC.GlobalSymbols.end()) { + continue; + } + + // Check to make sure we would be doing a forward jump. + // This assumes the address range of the current BB and the + // tail call target address don't overlap. + if (FnAddress->second < TailAddress->second) { + ++NumTailCallsPatched; + ++NumLocalPatchedTailCalls; + + // Is the original jump forward or backward? + const bool isForward = + TailAddress->second > FnAddress->second + BB->getOffset(); + + if (isForward) ++NumOrigForwardBranches; + + // Patch the new target address into the conditional branch. 
+ CondBranch->getOperand(0).setExpr(TailTargetSymExpr); + // Remove the unused successor which may be eliminated later + // if there are no other users. + BB->removeSuccessor(CondTargetBB); + DEBUG(dbgs() << "patched " << (isForward ? "(fwd)" : "(back)") + << " tail call in " << BF.getName() << ".\n";); + } + } + } + } + } + + DEBUG(dbgs() << "BOLT: patched " << NumLocalPatchedTailCalls + << " tail calls (" << NumOrigForwardBranches << " forward)" + << " from a total of " << NumLocalTailCalls + << " in function " << BF.getName() << "\n";); + + return NumLocalPatchedTailCalls > 0; +} + +void SimplifyConditionalTailCalls::runOnFunctions( + BinaryContext &BC, + std::map &BFs, + std::set & +) { + for (auto &It : BFs) { + auto &Function = It.second; + + if (!Function.isSimple()) + continue; + + // Fix tail calls to reduce branch mispredictions. + if (fixTailCalls(BC, Function)) { + if (opts::PrintAll || opts::PrintReordered) { + Function.print(errs(), "after tail call patching", true); + } + } + } + + outs() << "BOLT: patched " << NumTailCallsPatched + << " tail calls (" << NumOrigForwardBranches << " forward)" + << " from a total of " << NumTailCallCandidates << "\n"; +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryPasses.h b/bolt/BinaryPasses.h index 4cc9a6508c9c..fd224f2bf200 100644 --- a/bolt/BinaryPasses.h +++ b/bolt/BinaryPasses.h @@ -122,6 +122,32 @@ class FixupFunctions : public BinaryFunctionPass { std::set &LargeFunctions) override; }; +/// An optimization to simplify conditional tail calls by removing +/// unnecessary branches. +/// +/// Convert the sequence: +/// +/// j L1 +/// ... +/// L1: jmp foo # tail call +/// +/// into: +/// j foo +/// +/// but only if 'j foo' turns out to be a forward branch. 
+/// +class SimplifyConditionalTailCalls : public BinaryFunctionPass { + uint64_t NumTailCallCandidates{0}; + uint64_t NumTailCallsPatched{0}; + uint64_t NumOrigForwardBranches{0}; + + bool fixTailCalls(BinaryContext &BC, BinaryFunction &BF); + public: + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + } // namespace bolt } // namespace llvm From b5f7828330b7a5d2c99c74b93af869981183ec6e Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 16 May 2016 17:02:17 -0700 Subject: [PATCH 112/904] Overwrite contents of .debug_line section. Summary: Overwrite contents of .debug_line section since we don't reference the original contents anymore. This saves ~100MB of HHVM binary. (cherry picked from commit 5f45f8c2f02a985499b00f8554c3acb271987642) --- bolt/RewriteInstance.cpp | 27 ++++++++++++++++++--------- bolt/RewriteInstance.h | 19 +++++++++++++------ 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 86b3ea3b7fcd..32b02dcf5374 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -268,6 +268,7 @@ uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size, return ret; } +/// Notifier for non-allocatable (note) section. 
uint8_t *ExecutableFileMemoryManager::recordNoteSection( const uint8_t *Data, uintptr_t Size, @@ -758,8 +759,6 @@ void RewriteInstance::readSpecialSections() { FrameHdrAddress = Section.getAddress(); FrameHdrContents = SectionContents; FrameHdrAlign = Section.getAlignment(); - } else if (SectionName == ".debug_line") { - DebugLineSize = Section.getSize(); } else if (SectionName == ".debug_ranges") { DebugRangesSize = Section.getSize(); } else if (SectionName == ".debug_loc") { @@ -1697,11 +1696,11 @@ void RewriteInstance::rewriteNoteSections() { ErrorOr SectionName = Obj->getSectionName(&Section); check_error(SectionName.getError(), "cannot get section name"); - // Copy over section contents unless it's .debug_aranges, which shall be - // overwritten if -update-debug-sections is passed. + // New section size. uint64_t Size = 0; - if (*SectionName != ".debug_aranges" || !opts::UpdateDebugSections) { + // Copy over section contents unless it's one of the sections we ovewrite. + if (!shouldOverwriteSection(*SectionName)) { Size = Section.sh_size; std::string Data = InputFile->getData().substr(Section.sh_offset, Size); auto SectionPatchersIt = SectionPatchers.find(*SectionName); @@ -1725,7 +1724,8 @@ void RewriteInstance::rewriteNoteSections() { // Write section extension. Address = SI.AllocAddress; if (Address) { - DEBUG(dbgs() << "BOLT: appending contents to section " + DEBUG(dbgs() << "BOLT: " << (Size ? 
"appending" : "writing") + << " contents to section " << *SectionName << '\n'); OS.write(reinterpret_cast(Address), SI.Size); Size += SI.Size; @@ -2081,7 +2081,6 @@ void RewriteInstance::computeLineTableOffsets() { continue; auto Fragment = Label->getFragment(); - while (&*CurrentFragment != Fragment) { switch (CurrentFragment->getKind()) { case MCFragment::FT_Dwarf: @@ -2096,7 +2095,6 @@ void RewriteInstance::computeLineTableOffsets() { llvm_unreachable(".debug_line section shouldn't contain other types " "of fragments."); } - ++CurrentFragment; CurrentOffset = 0; } @@ -2113,7 +2111,7 @@ void RewriteInstance::computeLineTableOffsets() { << "in .debug_info\n"); auto &SI = SectionMM->NoteSectionInfo[".debug_info"]; SI.PendingRelocs.emplace_back( - SectionInfo::Reloc{LTOI->second, 4, 0, Offset + DebugLineSize}); + SectionInfo::Reloc{LTOI->second, 4, 0, Offset}); } DEBUG(dbgs() << "BOLT-DEBUG: CU " << CUIDLineTablePair.first << " has line table at " << Offset << "\n"); @@ -2326,3 +2324,14 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { } } } + +bool RewriteInstance::shouldOverwriteSection(StringRef SectionName) { + if (opts::UpdateDebugSections) { + for (auto &OverwriteName : DebugSectionsToOverwrite) { + if (SectionName == OverwriteName) + return true; + } + } + + return false; +} diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 85cc84ced20c..9f2ee57480e9 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -179,9 +179,6 @@ class RewriteInstance { private: - /// Huge page size used for alignment. - static constexpr unsigned PageAlign = 0x200000; - /// Detect addresses and offsets available in the binary for allocating /// new sections. void discoverStorage(); @@ -247,7 +244,20 @@ class RewriteInstance { return Address - NewTextSegmentAddress + NewTextSegmentOffset; } + /// Return true if we should overwrite contents of the section instead + /// of appending contents to it. 
+ bool shouldOverwriteSection(StringRef SectionName); + private: + + /// If we are updating debug info, these are the section we need to overwrite. + static constexpr const char *DebugSectionsToOverwrite[] = { + ".debug_aranges", + ".debug_line"}; + + /// Huge page size used for alignment. + static constexpr unsigned PageAlign = 0x200000; + /// An instance of the input binary we are processing, externally owned. llvm::object::ELFObjectFileBase *InputFile; @@ -306,9 +316,6 @@ class RewriteInstance { /// rewriting CFI info for these functions. std::vector FailedAddresses; - /// Size of the .debug_line section on input. - uint32_t DebugLineSize{0}; - /// Size of the .debug_loc section in input. uint32_t DebugLocSize{0}; From 83a6d1974cbe56bc026dec46475257dff7779fe4 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 17 May 2016 18:10:14 -0700 Subject: [PATCH 113/904] Create DW_AT_ranges for compile units. Summary: Some compile unit DIEs might be missing DW_AT_ranges because they were compiled without "-ffunction-sections" option. This diff adds the attribute to all compile units. If the section is not present, we need to create it. Will do it in a separate diff. (cherry picked from commit 061af43bf5d137b89dbf1317be5081191b18f561) --- bolt/RewriteInstance.cpp | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 32b02dcf5374..557aa4dbbf4e 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2133,33 +2133,22 @@ void RewriteInstance::updateDebugInfo() { updateLocationLists(); - auto &DebugInfoSI = SectionMM->NoteSectionInfo[".debug_info"]; + updateDWARFAddressRanges(); +} + +void RewriteInstance::updateDWARFAddressRanges() { + // Update DW_AT_ranges for all compilation units. 
for (const auto &CU : BC->DwCtx->compile_units()) { const auto CUID = CU->getOffset(); - - // Update DW_AT_ranges - auto RangesFieldOffset = - BC->DwCtx->getAttrFieldOffsetForUnit(CU.get(), dwarf::DW_AT_ranges); - if (RangesFieldOffset) { - DEBUG(dbgs() << "BOLT-DEBUG: adding relocation for DW_AT_ranges for " - << "compile unit in .debug_info\n"); - const auto RSOI = RangesSectionsWriter.getRangesOffsetCUMap().find(CUID); - if (RSOI != RangesSectionsWriter.getRangesOffsetCUMap().end()) { - auto Offset = RSOI->second; - DebugInfoSI.PendingRelocs.emplace_back( - SectionInfo::Reloc{RangesFieldOffset, 4, 0, - Offset + DebugRangesSize}); - } else { - DEBUG(dbgs() << "BOLT-DEBUG: no .debug_ranges entry found for CU " - << CUID << '\n'); - } + const auto RSOI = RangesSectionsWriter.getRangesOffsetCUMap().find(CUID); + if (RSOI != RangesSectionsWriter.getRangesOffsetCUMap().end()) { + auto Offset = RSOI->second; + updateDWARFObjectAddressRanges(Offset + DebugRangesSize, + CU.get(), + CU->getUnitDIE()); } } - updateDWARFAddressRanges(); -} - -void RewriteInstance::updateDWARFAddressRanges() { // Update address ranges of functions. for (const auto &BFI : BinaryFunctions) { const auto &Function = BFI.second; From 59acb16c2b14fe5d81cdadf6fe98c46caa87ce52 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 23 May 2016 19:36:38 -0700 Subject: [PATCH 114/904] Miscellaneous fixes for debug info. Summary: * Fix several cases for handling debug info: - properly update CU DW_AT_ranges for function with folded body due to ICF optimization - convert ranges to DW_AT_ranges from hi/low PC for all DIEs - add support for [a, a) range - update CU ranges even when there are no functions registered * Overwrite .debug_ranges section instead of appending. * Convert assertions in debug info handling part into warnings. 
(cherry picked from commit a4ef8f57655c9d46f59caf69cbb0f0b49f6f2b6d) --- bolt/BinaryContext.cpp | 31 ++++++++++----- bolt/BinaryFunction.h | 2 +- bolt/DebugData.cpp | 11 +++++- bolt/DebugData.h | 46 +++++++++++++++------- bolt/RewriteInstance.cpp | 85 ++++++++++++++++++++++++++++------------ bolt/RewriteInstance.h | 11 ++++-- 6 files changed, 131 insertions(+), 55 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 601a0e6ff247..50c6584631a1 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -83,13 +83,25 @@ void findAddressRangesObjects( Tag == dwarf::DW_TAG_inlined_subroutine || Tag == dwarf::DW_TAG_try_block || Tag == dwarf::DW_TAG_catch_block) { - AddressRangesObjects.emplace_back(Unit, DIE); - auto &Object = AddressRangesObjects.back(); - for (const auto &Range : DIE->getAddressRanges(Unit)) { - if (auto *Function = getBinaryFunctionContainingAddress(Range.first, - Functions)) { - if (Function->isSimple()) { + auto const &Ranges = DIE->getAddressRanges(Unit); + if (!Ranges.empty()) { + // We have to process all ranges, even for functions that we are not + // updating. The primary reason is that abbrev entries are shared + // and if we convert one DIE, it may affect the rest. Thus + // the conservative approach that does not involve expanding + // .debug_abbrev, is to switch all DIEs to use .debug_ranges, even if + // they use a single [a,b) range. The secondary reason is that it allows + // us to get rid of the original portion of .debug_ranges to save + // space in the binary. 
+ auto Function = getBinaryFunctionContainingAddress(Ranges.front().first, + Functions); + AddressRangesObjects.emplace_back(Unit, DIE); + auto &Object = AddressRangesObjects.back(); + for (const auto &Range : Ranges) { + if (Function && Function->isSimple()) { Object.addAddressRange(*Function, Range.first, Range.second); + } else { + Object.addAbsoluteRange(Range.first, Range.second); } } } @@ -101,14 +113,15 @@ void findAddressRangesObjects( } } -// Recursively finds DWARF DW_TAG_subprogram DIEs and match them with -// BinaryFunctions. Record DIEs for unknown subprograms (mostly functions that -// are never called and removed from the binary) in Unknown. +/// Recursively finds DWARF DW_TAG_subprogram DIEs and match them with +/// BinaryFunctions. Record DIEs for unknown subprograms (mostly functions that +/// are never called and removed from the binary) in Unknown. void findSubprograms(const DWARFCompileUnit *Unit, const DWARFDebugInfoEntryMinimal *DIE, std::map &BinaryFunctions, BinaryContext::DIECompileUnitVector &Unknown) { if (DIE->isSubprogramDIE()) { + // TODO: handle DW_AT_ranges. uint64_t LowPC, HighPC; if (DIE->getLowAndHighPC(Unit, LowPC, HighPC)) { auto It = BinaryFunctions.find(LowPC); diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 2f57c86a12e0..b1abf9050799 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -173,7 +173,7 @@ class BinaryFunction : public AddressRangesOwner { /// Offset of this function's address ranges in the .debug_ranges section of /// the output binary. - uint32_t AddressRangesOffset; + uint32_t AddressRangesOffset{-1U}; /// Release storage used by instructions. 
BinaryFunction &clearInstructions() { diff --git a/bolt/DebugData.cpp b/bolt/DebugData.cpp index 79ea84dd816c..8a6e329dafc1 100644 --- a/bolt/DebugData.cpp +++ b/bolt/DebugData.cpp @@ -28,12 +28,19 @@ void BasicBlockOffsetRanges::addAddressRange(BinaryFunction &Function, const BinaryData *Data) { auto FirstBB = Function.getBasicBlockContainingOffset( BeginAddress - Function.getAddress()); - assert(FirstBB && "No basic blocks in the function intersect given range."); + if (!FirstBB) { + errs() << "BOLT-WARNING: no basic blocks in function " + << Function.getName() << " intersect with debug range [0x" + << Twine::utohexstr(BeginAddress) << ", 0x" + << Twine::utohexstr(EndAddress) << ")\n"; + return; + } for (auto I = Function.getIndex(FirstBB), S = Function.size(); I != S; ++I) { auto BB = Function.getBasicBlockAtIndex(I); uint64_t BBAddress = Function.getAddress() + BB->getOffset(); - if (BBAddress >= EndAddress) + // Note the special handling for [a, a) address range. + if (BBAddress >= EndAddress && BeginAddress != EndAddress) break; uint64_t InternalAddressRangeBegin = std::max(BBAddress, BeginAddress); diff --git a/bolt/DebugData.h b/bolt/DebugData.h index 7152a4515cd0..53f90da124e2 100644 --- a/bolt/DebugData.h +++ b/bolt/DebugData.h @@ -136,14 +136,23 @@ class AddressRangesDWARFObject : public AddressRangesOwner { BBOffsetRanges.addAddressRange(Function, BeginAddress, EndAddress); } + /// Add range that is guaranteed to not change. 
+ void addAbsoluteRange(uint64_t BeginAddress, + uint64_t EndAddress) { + AbsoluteRanges.emplace_back(std::make_pair(BeginAddress, EndAddress)); + } + std::vector> getAbsoluteAddressRanges() const { auto AddressRangesWithData = BBOffsetRanges.getAbsoluteAddressRanges(); - std::vector> AddressRanges( - AddressRangesWithData.size()); + std::vector> + AddressRanges(AddressRangesWithData.size()); for (unsigned I = 0, S = AddressRanges.size(); I != S; ++I) { AddressRanges[I] = std::make_pair(AddressRangesWithData[I].Begin, AddressRangesWithData[I].End); } + std::move(AbsoluteRanges.begin(), + AbsoluteRanges.end(), + std::back_inserter(AddressRanges)); return AddressRanges; } @@ -160,8 +169,10 @@ class AddressRangesDWARFObject : public AddressRangesOwner { BasicBlockOffsetRanges BBOffsetRanges; - // Offset of the address ranges of this object in the output .debug_ranges. - uint32_t AddressRangesOffset; + std::vector> AbsoluteRanges; + + /// Offset of the address ranges of this object in the output .debug_ranges. + uint32_t AddressRangesOffset{-1U}; }; @@ -193,7 +204,7 @@ class LocationList { private: BasicBlockOffsetRanges BBOffsetRanges; - // Offset of this location list in the input .debug_loc section. + /// Offset of this location list in the input .debug_loc section. uint32_t DebugLocOffset; }; @@ -234,19 +245,26 @@ class DebugRangesSectionsWriter { /// to .debug_ranges uint32_t getEmptyRangesListOffset() const { return EmptyRangesListOffset; } + using CUAddressRangesType = + std::map>>; + + /// Return ranges for a given CU. + const CUAddressRangesType &getCUAddressRanges() const { + return CUAddressRanges; + } + private: - // Map from compile unit offset to the list of address intervals that belong - // to that compile unit. Each interval is a pair - // (first address, interval size). - std::map>> - CUAddressRanges; - - // Map from BinaryFunction to the list of address intervals that belong - // to that function, represented like CUAddressRanges. 
+ /// Map from compile unit offset to the list of address intervals that belong + /// to that compile unit. Each interval is a pair + /// (first address, interval size). + CUAddressRangesType CUAddressRanges; + + /// Map from BinaryFunction to the list of address intervals that belong + /// to that function, represented like CUAddressRanges. std::map>> ObjectAddressRanges; - // Offset of an empty address ranges list. + /// Offset of an empty address ranges list. uint32_t EmptyRangesListOffset; /// When writing data to .debug_ranges remember offset per CU. diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 557aa4dbbf4e..e3cdc4d3d7e9 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -759,8 +759,6 @@ void RewriteInstance::readSpecialSections() { FrameHdrAddress = Section.getAddress(); FrameHdrContents = SectionContents; FrameHdrAlign = Section.getAlignment(); - } else if (SectionName == ".debug_ranges") { - DebugRangesSize = Section.getSize(); } else if (SectionName == ".debug_loc") { DebugLocSize = Section.getSize(); } @@ -1436,11 +1434,14 @@ bool RewriteInstance::checkLargeFunctions() { } void RewriteInstance::updateFunctionRanges() { - auto addDebugArangesEntry = [&](uint64_t OriginalFunctionAddress, + auto addDebugArangesEntry = [&](const BinaryFunction &Function, uint64_t RangeBegin, uint64_t RangeSize) { - if (auto DebugAranges = BC->DwCtx->getDebugAranges()) { - uint32_t CUOffset = DebugAranges->findAddress(OriginalFunctionAddress); + // The function potentially has multiple associated CUs because of + // the identical code folding optimization. Update all of them with + // the range. 
+ for (const auto DIECompileUnitPair : Function.getSubprocedureDIEs()) { + auto CUOffset = DIECompileUnitPair.second->getOffset(); if (CUOffset != -1U) RangesSectionsWriter.AddRange(CUOffset, RangeBegin, RangeSize); } @@ -1448,15 +1449,18 @@ void RewriteInstance::updateFunctionRanges() { for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; + // If function doesn't have registered DIEs - there's nothting to update. + if (Function.getSubprocedureDIEs().empty()) + continue; // Use either new (image) or original size for the function range. auto Size = Function.isSimple() ? Function.getImageSize() : Function.getSize(); - addDebugArangesEntry(Function.getAddress(), + addDebugArangesEntry(Function, Function.getAddress(), Size); RangesSectionsWriter.AddRange(&Function, Function.getAddress(), Size); if (Function.isSimple() && Function.cold().getImageSize()) { - addDebugArangesEntry(Function.getAddress(), + addDebugArangesEntry(Function, Function.cold().getAddress(), Function.cold().getImageSize()); RangesSectionsWriter.AddRange(&Function, @@ -2129,6 +2133,8 @@ void RewriteInstance::updateDebugInfo() { updateAddressRangesObjects(); + updateEmptyModuleRanges(); + generateDebugRanges(); updateLocationLists(); @@ -2136,17 +2142,28 @@ void RewriteInstance::updateDebugInfo() { updateDWARFAddressRanges(); } +void RewriteInstance::updateEmptyModuleRanges() { + const auto &CUAddressRanges = RangesSectionsWriter.getCUAddressRanges(); + for (const auto &CU : BC->DwCtx->compile_units()) { + if (CUAddressRanges.find(CU->getOffset()) != CUAddressRanges.end()) + continue; + auto const &Ranges = CU->getUnitDIE(true)->getAddressRanges(CU.get()); + for (auto const &Range : Ranges) { + RangesSectionsWriter.AddRange(CU->getOffset(), + Range.first, + Range.second - Range.first); + } + } +} + void RewriteInstance::updateDWARFAddressRanges() { // Update DW_AT_ranges for all compilation units. 
for (const auto &CU : BC->DwCtx->compile_units()) { const auto CUID = CU->getOffset(); const auto RSOI = RangesSectionsWriter.getRangesOffsetCUMap().find(CUID); - if (RSOI != RangesSectionsWriter.getRangesOffsetCUMap().end()) { - auto Offset = RSOI->second; - updateDWARFObjectAddressRanges(Offset + DebugRangesSize, - CU.get(), - CU->getUnitDIE()); - } + if (RSOI == RangesSectionsWriter.getRangesOffsetCUMap().end()) + continue; + updateDWARFObjectAddressRanges(RSOI->second, CU.get(), CU->getUnitDIE()); } // Update address ranges of functions. @@ -2154,7 +2171,7 @@ void RewriteInstance::updateDWARFAddressRanges() { const auto &Function = BFI.second; for (const auto DIECompileUnitPair : Function.getSubprocedureDIEs()) { updateDWARFObjectAddressRanges( - Function.getAddressRangesOffset() + DebugRangesSize, + Function.getAddressRangesOffset(), DIECompileUnitPair.second, DIECompileUnitPair.first); } @@ -2172,7 +2189,7 @@ void RewriteInstance::updateDWARFAddressRanges() { // inlined subroutine instances, etc). 
for (const auto &Obj : BC->AddressRangesObjects) { updateDWARFObjectAddressRanges( - Obj.getAddressRangesOffset() + DebugRangesSize, + Obj.getAddressRangesOffset(), Obj.getCompileUnit(), Obj.getDIE()); } @@ -2189,6 +2206,11 @@ void RewriteInstance::updateDWARFObjectAddressRanges( return; } + if (DebugRangesOffset == -1U) { + errs() << "BOLT-WARNING: using invalid DW_AT_range for DIE at offset 0x" + << Twine::utohexstr(DIE->getOffset()) << '\n'; + } + auto DebugInfoPatcher = static_cast(SectionPatchers[".debug_info"].get()); auto AbbrevPatcher = @@ -2197,8 +2219,13 @@ void RewriteInstance::updateDWARFObjectAddressRanges( assert(DebugInfoPatcher && AbbrevPatcher && "Patchers not initialized."); const auto *AbbreviationDecl = DIE->getAbbreviationDeclarationPtr(); - assert(AbbreviationDecl && - "Object's DIE doesn't have an abbreviation: not supported yet."); + if (!AbbreviationDecl) { + errs() << "BOLT-WARNING: object's DIE doesn't have an abbreviation: " + << "skipping update. DIE at offset 0x" + << Twine::utohexstr(DIE->getOffset()) << '\n'; + return; + } + auto AbbrevCode = AbbreviationDecl->getCode(); if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_ranges) != -1U) { @@ -2238,17 +2265,20 @@ void RewriteInstance::updateDWARFObjectAddressRanges( dwarf::DW_AT_high_pc, dwarf::DW_AT_producer, dwarf::DW_FORM_string); - assert(LowPCOffset != -1U && LowPCOffset + 8 == HighPCOffset && - "We depend on the compiler putting high_pc right after low_pc."); + if (LowPCOffset == -1U || (LowPCOffset + 8 != HighPCOffset)) { + errs() << "BOLT-WARNING: we depend on the compiler putting high_pc " + << "right after low_pc. 
Not updating DIE at offset 0x" + << Twine::utohexstr(DIE->getOffset()) << '\n'; + return; + } DebugInfoPatcher->addLE32Patch(LowPCOffset, DebugRangesOffset); std::string ProducerString{"LLVM-BOLT"}; ProducerString.resize(12, ' '); ProducerString.back() = '\0'; - DebugInfoPatcher->addBinaryPatch(LowPCOffset + 4, ProducerString); } else { - DEBUG(errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x" - << Twine::utohexstr(DIE->getOffset()) << "\n"); + errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x" + << Twine::utohexstr(DIE->getOffset()) << '\n'; } } } @@ -2258,7 +2288,11 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { return; auto DebugAranges = BC->DwCtx->getDebugAranges(); - assert(DebugAranges && "Need .debug_aranges in the input file."); + if (!DebugAranges) { + errs() << "BOLT-WARNING: need .debug_aranges in the input file to update " + << "debug info for non-simple functions.\n"; + return; + } for (auto It : BinaryFunctions) { const auto &Function = It.second; @@ -2269,8 +2303,9 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { uint64_t Address = It.first; uint32_t CUOffset = DebugAranges->findAddress(Address); if (CUOffset == -1U) { - DEBUG(errs() << "BOLT-DEBUG: Function does not belong to any compile unit" - << "in .debug_aranges: " << Function.getName() << "\n"); + errs() << "BOLT-WARNING: function " << Function.getName() + << " does not belong to any compile unit in .debug_aranges. " + << "Cannot update line number information.\n"; continue; } auto Unit = BC->OffsetToDwarfCU[CUOffset]; diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 9f2ee57480e9..930f7be30fec 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -210,6 +210,11 @@ class RewriteInstance { /// Update objects with address ranges after optimization. void updateAddressRangesObjects(); + /// If we've never mapped the unit, e.g. 
because there were no functions + /// marked in DWARF, update with the original ranges so that we can free up + /// the old part of .debug_ranges. + void updateEmptyModuleRanges(); + /// Generate new contents for .debug_loc. void updateLocationLists(); @@ -253,7 +258,8 @@ class RewriteInstance { /// If we are updating debug info, these are the section we need to overwrite. static constexpr const char *DebugSectionsToOverwrite[] = { ".debug_aranges", - ".debug_line"}; + ".debug_line", + ".debug_ranges"}; /// Huge page size used for alignment. static constexpr unsigned PageAlign = 0x200000; @@ -319,9 +325,6 @@ class RewriteInstance { /// Size of the .debug_loc section in input. uint32_t DebugLocSize{0}; - /// Size of the .debug_ranges section on input. - uint32_t DebugRangesSize{0}; - /// Keep track of which functions didn't fit in their original space in the /// last emission, so that we may either decide to split or not optimize them. std::set LargeFunctions; From 53e794fe86cc95705f11b9e9b8d9c077d6069cde Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 24 May 2016 14:54:23 -0700 Subject: [PATCH 115/904] Fix for clang .debug_info. Summary: Clang uses different attribute for high_pc which was incompatible with the way we were updating ranges. This diff fixes it. 
(cherry picked from commit f44929987060e55b996ce0e6b5ca5addc43a00d6) --- bolt/RewriteInstance.cpp | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index e3cdc4d3d7e9..bea925596e68 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2249,11 +2249,26 @@ void RewriteInstance::updateDWARFObjectAddressRanges( AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_high_pc) != -1U) { uint32_t LowPCOffset = -1U; uint32_t HighPCOffset = -1U; - DWARFFormValue FormValue; - DIE->getAttributeValue(Unit, dwarf::DW_AT_low_pc, FormValue, + DWARFFormValue LowPCFormValue; + DWARFFormValue HighPCFormValue; + DIE->getAttributeValue(Unit, dwarf::DW_AT_low_pc, LowPCFormValue, &LowPCOffset); - DIE->getAttributeValue(Unit, dwarf::DW_AT_high_pc, FormValue, + DIE->getAttributeValue(Unit, dwarf::DW_AT_high_pc, HighPCFormValue, &HighPCOffset); + if (LowPCFormValue.getForm() != dwarf::DW_FORM_addr || + (HighPCFormValue.getForm() != dwarf::DW_FORM_addr && + HighPCFormValue.getForm() != dwarf::DW_FORM_data8 && + HighPCFormValue.getForm() != dwarf::DW_FORM_data4)) { + errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE " + "at offset 0x" << Twine::utohexstr(DIE->getOffset()) << '\n'; + return; + } + if (LowPCOffset == -1U || (LowPCOffset + 8 != HighPCOffset)) { + errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. " + "Cannot update DIE at offset 0x" + << Twine::utohexstr(DIE->getOffset()) << '\n'; + return; + } AbbrevPatcher->addAttributePatch(Unit, AbbrevCode, @@ -2265,15 +2280,19 @@ void RewriteInstance::updateDWARFObjectAddressRanges( dwarf::DW_AT_high_pc, dwarf::DW_AT_producer, dwarf::DW_FORM_string); - if (LowPCOffset == -1U || (LowPCOffset + 8 != HighPCOffset)) { - errs() << "BOLT-WARNING: we depend on the compiler putting high_pc " - << "right after low_pc. 
Not updating DIE at offset 0x" - << Twine::utohexstr(DIE->getOffset()) << '\n'; - return; + unsigned StringSize = 0; + if (HighPCFormValue.getForm() == dwarf::DW_FORM_addr || + HighPCFormValue.getForm() == dwarf::DW_FORM_data8) { + StringSize = 12; + } else if (HighPCFormValue.getForm() == dwarf::DW_FORM_data4) { + StringSize = 8; + } else { + assert(0 && "unexpected form"); } + DebugInfoPatcher->addLE32Patch(LowPCOffset, DebugRangesOffset); std::string ProducerString{"LLVM-BOLT"}; - ProducerString.resize(12, ' '); + ProducerString.resize(StringSize, ' '); ProducerString.back() = '\0'; DebugInfoPatcher->addBinaryPatch(LowPCOffset + 4, ProducerString); } else { From 32396570598116d95b98ea47cc8e97c76eb2d204 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 24 May 2016 20:50:36 -0700 Subject: [PATCH 116/904] Better .debug_line for non-simple functions. Summary: Generate .debug_line info for non-simple functions in a way that if preferrable by 'objdump -S'. (cherry picked from commit f04f72e3c36b7dffd945b39d8b07f7bf5db3cb91) --- bolt/RewriteInstance.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index bea925596e68..fbea6141c2ff 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2337,8 +2337,10 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { ELF::SHT_PROGBITS, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); - if (LineTable->lookupAddressRange(Address, Function.getMaxSize() + 1, + if (LineTable->lookupAddressRange(Address, Function.getMaxSize(), Results)) { + auto &OutputLineTable = + BC->Ctx->getMCDwarfLineTable(CUOffset).getMCLineSections(); for (auto RowIndex : Results) { const auto &Row = LineTable->Rows[RowIndex]; BC->Ctx->setCurrentDwarfLoc( @@ -2352,15 +2354,19 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { Row.Isa, Row.Discriminator, Row.Address); - auto Loc = BC->Ctx->getCurrentDwarfLoc(); 
BC->Ctx->clearDwarfLocSeen(); - - auto &OutputLineTable = - BC->Ctx->getMCDwarfLineTable(CUOffset).getMCLineSections(); OutputLineTable.addLineEntry(MCLineEntry{nullptr, Loc}, FunctionSection); } + // Add an empty entry past the end of the function + // for end_sequence mark. + BC->Ctx->setCurrentDwarfLoc(0, 0, 0, 0, 0, 0, + Address + Function.getMaxSize()); + auto Loc = BC->Ctx->getCurrentDwarfLoc(); + BC->Ctx->clearDwarfLocSeen(); + OutputLineTable.addLineEntry(MCLineEntry{nullptr, Loc}, + FunctionSection); } else { DEBUG(errs() << "BOLT-DEBUG: Function " << Function.getName() << " has no associated line number information.\n"); From 9c95f683bace7a47ab03e2e3cbfe1e2deaf754b9 Mon Sep 17 00:00:00 2001 From: Theodoros Kasampalis Date: Mon, 23 May 2016 16:16:25 -0700 Subject: [PATCH 117/904] Correctly updating landing pad exec counts. (cherry picked from commit 0a0c786f2b2ac3882b5345e52334fdbe20abfc1b) --- bolt/BinaryFunction.cpp | 21 +++++++++++++++++++-- bolt/DataReader.cpp | 13 +++++++++++++ bolt/DataReader.h | 4 ++++ 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 843f3ced3c73..52ad882eedb5 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -450,6 +450,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData, // or a recursive call. bool IsCall = MIA->isCall(Instruction); bool IsCondBranch = MIA->isConditionalBranch(Instruction); + bool IsInvoke = MIA->isInvoke(Instruction); MCSymbol *TargetSymbol{nullptr}; uint64_t TargetOffset{0}; @@ -518,7 +519,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData, // Add local branch info. LocalBranches.push_back({Offset, TargetOffset}); } - if (IsCondBranch) { + if (IsCondBranch /*|| IsInvoke*/) { // Add fallthrough branch info. FTBranches.push_back({Offset, Offset + Size}); } @@ -715,7 +716,8 @@ bool BinaryFunction::buildCFG() { addCFIPlaceholders(CFIOffset, InsertBB); // How well do we detect tail calls here? 
- if (MIA->isTerminator(InstrInfo.second)) { + if (MIA->isTerminator(InstrInfo.second) /*|| + MIA->isInvoke(InstrInfo.second)*/) { PrevBB = InsertBB; InsertBB = nullptr; } @@ -800,6 +802,8 @@ bool BinaryFunction::buildCFG() { if (BB.succ_size() == 0) { IsPrevFT = MIA->isTerminator(*LastInstIter) ? false : true; } else if (BB.succ_size() == 1) { + /*assert(!MIA->isInvoke(*LastInstIter) && + "found throw with assocoated local branch");*/ IsPrevFT = MIA->isConditionalBranch(*LastInstIter) ? true : false; } else { // Ends with 2 branches, with an indirect jump or it is a conditional @@ -842,6 +846,8 @@ bool BinaryFunction::buildCFG() { void BinaryFunction::inferFallThroughCounts() { assert(!BasicBlocks.empty() && "basic block list should not be empty"); + auto BranchDataOrErr = BC.DR.getFuncBranchData(getName()); + // Compute preliminary execution time for each basic block for (auto &CurBB : BasicBlocks) { if (&CurBB == &*BasicBlocks.begin()) { @@ -864,6 +870,17 @@ void BinaryFunction::inferFallThroughCounts() { } } + // Udate execution counts of landing pad blocks. 
+ if (!BranchDataOrErr.getError()) { + const FuncBranchData &BranchData = BranchDataOrErr.get(); + for (const auto &I : BranchData.EntryData) { + BinaryBasicBlock *BB = getBasicBlockAtOffset(I.To.Offset); + if (BB && LandingPads.find(BB->getLabel()) != LandingPads.end()) { + BB->ExecutionCount += I.Branches; + } + } + } + // Work on a basic block at a time, propagating frequency information forwards // It is important to walk in the layour order for (auto &CurBB : BasicBlocks) { diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index 48565385a99c..967b4ef2e085 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -157,6 +157,7 @@ std::error_code DataReader::parse() { bool success; std::tie(I, success) = FuncsMap.insert( std::make_pair(Name, FuncBranchData(Name, + FuncBranchData::ContainerTy(), FuncBranchData::ContainerTy()))); assert(success && "unexpected result of insert"); } @@ -181,6 +182,12 @@ std::error_code DataReader::parse() { auto I = GetOrCreateFuncEntry(BI.From.Name); I->getValue().Data.emplace_back(std::move(BI)); + // Add entry data for branches from another function. + if (BI.To.IsSymbol && !BI.From.Name.equals(BI.To.Name)) { + I = GetOrCreateFuncEntry(BI.To.Name); + I->getValue().EntryData.emplace_back(std::move(BI)); + } + // If destination is the function start - update execution count. // NB: the data is skewed since we cannot tell tail recursion from // branches to the function start. 
@@ -204,10 +211,16 @@ DataReader::getFuncBranchData(StringRef FuncName) const { void DataReader::dump() const { for (const auto &Func : FuncsMap) { + Diag << Func.getKey() << " branches:\n"; for (const auto &BI : Func.getValue().Data) { Diag << BI.From.Name << " " << BI.From.Offset << " " << BI.To.Name << " " << BI.To.Offset << " " << BI.Mispreds << " " << BI.Branches << "\n"; } + Diag << Func.getKey() << " entry points:\n"; + for (const auto &BI : Func.getValue().EntryData) { + Diag << BI.From.Name << " " << BI.From.Offset << " " << BI.To.Name << " " + << BI.To.Offset << " " << BI.Mispreds << " " << BI.Branches << "\n"; + } } } } diff --git a/bolt/DataReader.h b/bolt/DataReader.h index b5e570779ea7..88f087306a7d 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -85,6 +85,7 @@ struct FuncBranchData { StringRef Name; ContainerTy Data; + ContainerTy EntryData; /// Total execution count for the function. int64_t ExecutionCount{0}; @@ -92,6 +93,9 @@ struct FuncBranchData { FuncBranchData(StringRef Name, ContainerTy Data) : Name(Name), Data(std::move(Data)) {} + FuncBranchData(StringRef Name, ContainerTy Data, ContainerTy EntryData) + : Name(Name), Data(std::move(Data)), EntryData(std::move(EntryData)) {} + ErrorOr getBranch(uint64_t From, uint64_t To) const; }; From 6f26af98e4149269e3aa805497afed93ac19494d Mon Sep 17 00:00:00 2001 From: Theodoros Kasampalis Date: Tue, 24 May 2016 09:26:25 -0700 Subject: [PATCH 118/904] Taking LP counts into account for FT count inference (cherry picked from commit 67ed01a95f96147c966a68ba1b903911e9413e5a) --- bolt/BinaryBasicBlock.h | 8 ++++++- bolt/BinaryFunction.cpp | 48 ++++++++++++++++++++++++++++++++--------- bolt/BinaryFunction.h | 16 ++++++++++++++ 3 files changed, 61 insertions(+), 11 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 31f6595bb4e6..0b5b0d133c00 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -75,6 +75,7 @@ class BinaryBasicBlock { /// CFG information. 
std::vector Predecessors; std::vector Successors; + std::set LandingPads; struct BinaryBranchInfo { uint64_t Count; @@ -236,6 +237,11 @@ class BinaryBasicBlock { uint64_t Count = 0, uint64_t MispredictedCount = 0); + /// Adds block to landing pad list. + void addLandingPad(BinaryBasicBlock *LPBlock) { + LandingPads.insert(LPBlock); + } + /// Remove /p Succ basic block from the list of successors. Update the /// list of predecessors of /p Succ and update branch info. void removeSuccessor(BinaryBasicBlock *Succ); @@ -314,7 +320,7 @@ class BinaryBasicBlock { const MCSymbol *&FBB, MCInst *&CondBranch, MCInst *&UncondBranch); - + private: /// Adds predecessor to the BB. Most likely you don't need to call this. diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 52ad882eedb5..3d8719e78ec3 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -450,7 +450,6 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData, // or a recursive call. bool IsCall = MIA->isCall(Instruction); bool IsCondBranch = MIA->isConditionalBranch(Instruction); - bool IsInvoke = MIA->isInvoke(Instruction); MCSymbol *TargetSymbol{nullptr}; uint64_t TargetOffset{0}; @@ -519,11 +518,10 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData, // Add local branch info. LocalBranches.push_back({Offset, TargetOffset}); } - if (IsCondBranch /*|| IsInvoke*/) { + if (IsCondBranch) { // Add fallthrough branch info. FTBranches.push_back({Offset, Offset + Size}); } - } else { // Should be an indirect call or an indirect branch. Bail out on the // latter case. @@ -715,9 +713,18 @@ bool BinaryFunction::buildCFG() { CFIOffset = getSize(); addCFIPlaceholders(CFIOffset, InsertBB); + // Store info about associated landing pad. + if (MIA->isInvoke(InstrInfo.second)) { + const MCSymbol *LP; + uint64_t Action; + std::tie(LP, Action) = MIA->getEHInfo(InstrInfo.second); + if (LP) { + LPToBBIndex[LP].push_back(getIndex(InsertBB)); + } + } + // How well do we detect tail calls here? 
- if (MIA->isTerminator(InstrInfo.second) /*|| - MIA->isInvoke(InstrInfo.second)*/) { + if (MIA->isTerminator(InstrInfo.second)) { PrevBB = InsertBB; InsertBB = nullptr; } @@ -802,8 +809,6 @@ bool BinaryFunction::buildCFG() { if (BB.succ_size() == 0) { IsPrevFT = MIA->isTerminator(*LastInstIter) ? false : true; } else if (BB.succ_size() == 1) { - /*assert(!MIA->isInvoke(*LastInstIter) && - "found throw with assocoated local branch");*/ IsPrevFT = MIA->isConditionalBranch(*LastInstIter) ? true : false; } else { // Ends with 2 branches, with an indirect jump or it is a conditional @@ -819,6 +824,17 @@ bool BinaryFunction::buildCFG() { DEBUG(dbgs() << "last block was marked as a fall-through\n"); } + // Add associated landing pad blocks to each basic block. + for (auto &BB : BasicBlocks) { + if (LandingPads.find(BB.getLabel()) != LandingPads.end()) { + MCSymbol *LP = BB.getLabel(); + for (unsigned I : LPToBBIndex.at(LP)) { + BinaryBasicBlock *ThrowBB = getBasicBlockAtIndex(I); + ThrowBB->addLandingPad(&BB); + } + } + } + // Infer frequency for non-taken branches if (ExecutionCount != COUNT_NO_PROFILE && !BranchDataOrErr.getError()) { inferFallThroughCounts(); @@ -833,6 +849,7 @@ bool BinaryFunction::buildCFG() { clearLabels(); clearLocalBranches(); clearFTBranches(); + clearLPToBBIndex(); // Update the state. CurrentState = State::CFG; @@ -899,14 +916,25 @@ void BinaryFunction::inferFallThroughCounts() { ReportedBranches += SuccCount.Count; } + // Calculate frequency of throws from this node according to LBR data + // for branching into associated landing pads. Since it is possible + // for a landing pad to be associated with more than one basic blocks, + // we may overestimate the frequency of throws for such blocks. 
+ uint64_t ReportedThrows = 0; + for (BinaryBasicBlock *LP: CurBB.LandingPads) { + ReportedThrows += LP->ExecutionCount; + } + + uint64_t TotalReportedJumps = ReportedBranches + ReportedThrows; + // Infer the frequency of the fall-through edge, representing not taking the // branch uint64_t Inferred = 0; - if (BBExecCount > ReportedBranches) - Inferred = BBExecCount - ReportedBranches; + if (BBExecCount > TotalReportedJumps) + Inferred = BBExecCount - TotalReportedJumps; DEBUG({ - if (BBExecCount < ReportedBranches) + if (BBExecCount < TotalReportedJumps) dbgs() << "BOLT-WARNING: Fall-through inference is slightly inconsistent. " "exec frequency is less than the outgoing edges frequency (" diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index b1abf9050799..94ab3c8883c1 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -209,6 +209,13 @@ class BinaryFunction : public AddressRangesOwner { return *this; } + /// Release memory taken by landing pad info. + BinaryFunction &clearLPToBBIndex() { + LandingPadsMapType TempMap; + LPToBBIndex.swap(TempMap); + return *this; + } + BinaryFunction &updateState(BinaryFunction::State State) { CurrentState = State; return *this; @@ -230,6 +237,11 @@ class BinaryFunction : public AddressRangesOwner { LocalBranchesListType LocalBranches; LocalBranchesListType FTBranches; + /// Storage for all landing pads and their corresponding invokes. + using LandingPadsMapType = std::map >; + LandingPadsMapType LPToBBIndex; + /// Map offset in the function to a local label. using LabelsMapType = std::map; LabelsMapType Labels; @@ -390,6 +402,10 @@ class BinaryFunction : public AddressRangesOwner { return &BasicBlocks.at(Index); } + BinaryBasicBlock * getBasicBlockAtIndex(unsigned Index) { + return &BasicBlocks.at(Index); + } + /// Return the name of the function as extracted from the binary file. 
StringRef getName() const { return Name; From 0f22f53ad811fcb0185e8a3849988af6381c381f Mon Sep 17 00:00:00 2001 From: Theodoros Kasampalis Date: Thu, 26 May 2016 15:10:09 -0700 Subject: [PATCH 119/904] Better edge counts for fall through blocks in presence of C++ exceptions. Summary: The inference algorithm for counts of fall through edges takes possible jumps to landing pad blocks into account. Also, the landing pad block execution counts are updated using profile data. (cherry picked from commit 9d941b5bdd27272ecf7db7169369898d928487c2) --- bolt/BinaryBasicBlock.cpp | 5 +++ bolt/BinaryBasicBlock.h | 64 +++++++++++++++++++++++++++++++++++++-- bolt/BinaryFunction.cpp | 22 ++++++++++++++ bolt/BinaryFunction.h | 3 +- 4 files changed, 89 insertions(+), 5 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 39e51fc353c4..35a0a314a29a 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -62,6 +62,11 @@ void BinaryBasicBlock::removePredecessor(BinaryBasicBlock *Pred) { Predecessors.erase(I); } +void BinaryBasicBlock::addLandingPad(BinaryBasicBlock *LPBlock) { + LandingPads.insert(LPBlock); + LPBlock->Throwers.insert(this); +} + bool BinaryBasicBlock::analyzeBranch(const MCInstrAnalysis &MIA, const MCSymbol *&TBB, const MCSymbol *&FBB, diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 0b5b0d133c00..07962f99d5bb 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -75,6 +75,7 @@ class BinaryBasicBlock { /// CFG information. 
std::vector Predecessors; std::vector Successors; + std::set Throwers; std::set LandingPads; struct BinaryBranchInfo { @@ -137,6 +138,10 @@ class BinaryBasicBlock { typedef std::vector::const_iterator const_pred_iterator; typedef std::vector::iterator succ_iterator; typedef std::vector::const_iterator const_succ_iterator; + typedef std::set::iterator throw_iterator; + typedef std::set::const_iterator const_throw_iterator; + typedef std::set::iterator lp_iterator; + typedef std::set::const_iterator const_lp_iterator; typedef std::vector::reverse_iterator pred_reverse_iterator; typedef std::vector::const_reverse_iterator @@ -145,6 +150,15 @@ class BinaryBasicBlock { succ_reverse_iterator; typedef std::vector::const_reverse_iterator const_succ_reverse_iterator; + typedef std::set::reverse_iterator + throw_reverse_iterator; + typedef std::set::const_reverse_iterator + const_throw_reverse_iterator; + typedef std::set::reverse_iterator + lp_reverse_iterator; + typedef std::set::const_reverse_iterator + const_lp_reverse_iterator; + pred_iterator pred_begin() { return Predecessors.begin(); } const_pred_iterator pred_begin() const { return Predecessors.begin(); } pred_iterator pred_end() { return Predecessors.end(); } @@ -179,6 +193,40 @@ class BinaryBasicBlock { } bool succ_empty() const { return Successors.empty(); } + throw_iterator throw_begin() { return Throwers.begin(); } + const_throw_iterator throw_begin() const { return Throwers.begin(); } + throw_iterator throw_end() { return Throwers.end(); } + const_throw_iterator throw_end() const { return Throwers.end(); } + throw_reverse_iterator throw_rbegin() + { return Throwers.rbegin();} + const_throw_reverse_iterator throw_rbegin() const + { return Throwers.rbegin();} + throw_reverse_iterator throw_rend() + { return Throwers.rend(); } + const_throw_reverse_iterator throw_rend() const + { return Throwers.rend(); } + unsigned throw_size() const { + return (unsigned)Throwers.size(); + } + bool throw_empty() const { return 
Throwers.empty(); } + + lp_iterator lp_begin() { return LandingPads.begin(); } + const_lp_iterator lp_begin() const { return LandingPads.begin(); } + lp_iterator lp_end() { return LandingPads.end(); } + const_lp_iterator lp_end() const { return LandingPads.end(); } + lp_reverse_iterator lp_rbegin() + { return LandingPads.rbegin(); } + const_lp_reverse_iterator lp_rbegin() const + { return LandingPads.rbegin(); } + lp_reverse_iterator lp_rend() + { return LandingPads.rend(); } + const_lp_reverse_iterator lp_rend() const + { return LandingPads.rend(); } + unsigned lp_size() const { + return (unsigned)LandingPads.size(); + } + bool lp_empty() const { return LandingPads.empty(); } + inline iterator_range predecessors() { return iterator_range(pred_begin(), pred_end()); } @@ -191,6 +239,18 @@ class BinaryBasicBlock { inline iterator_range successors() const { return iterator_range(succ_begin(), succ_end()); } + inline iterator_range throwers() { + return iterator_range(throw_begin(), throw_end()); + } + inline iterator_range throwers() const { + return iterator_range(throw_begin(), throw_end()); + } + inline iterator_range landing_pads() { + return iterator_range(lp_begin(), lp_end()); + } + inline iterator_range landing_pads() const { + return iterator_range(lp_begin(), lp_end()); + } /// Return symbol marking the start of this basic block. MCSymbol *getLabel() const { @@ -238,9 +298,7 @@ class BinaryBasicBlock { uint64_t MispredictedCount = 0); /// Adds block to landing pad list. - void addLandingPad(BinaryBasicBlock *LPBlock) { - LandingPads.insert(LPBlock); - } + void addLandingPad(BinaryBasicBlock *LPBlock); /// Remove /p Succ basic block from the list of successors. Update the /// list of predecessors of /p Succ and update branch info. 
diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 3d8719e78ec3..81da4f4b5c61 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -287,6 +287,15 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, } OS << '\n'; } + if (!BB->Throwers.empty()) { + OS << " Throwers: "; + auto Sep = ""; + for (auto Throw : BB->Throwers) { + OS << Sep << Throw->getName(); + Sep = ", "; + } + OS << '\n'; + } Offset = RoundUpToAlignment(Offset, BB->getAlignment()); @@ -323,6 +332,19 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << '\n'; } + if (!BB->LandingPads.empty()) { + OS << " Landing Pads: "; + auto Sep = ""; + for (auto LP : BB->LandingPads) { + OS << Sep << LP->getName(); + if (ExecutionCount != COUNT_NO_PROFILE) { + OS << " (count: " << LP->ExecutionCount << ")"; + } + Sep = ", "; + } + OS << '\n'; + } + OS << '\n'; } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 94ab3c8883c1..7e27b6329777 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -238,8 +238,7 @@ class BinaryFunction : public AddressRangesOwner { LocalBranchesListType FTBranches; /// Storage for all landing pads and their corresponding invokes. - using LandingPadsMapType = std::map >; + using LandingPadsMapType = std::map >; LandingPadsMapType LPToBBIndex; /// Map offset in the function to a local label. From 54060115b877d5256c3305f4d5a9e4579ee0c5af Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 27 May 2016 20:19:19 -0700 Subject: [PATCH 120/904] Improvements for debug info. Summary: Assembly functions could have no corresponding DW_AT_subprogram entries, yet they are represented in module ranges (and .debug_aranges) and will have line number information. Make sure we update those. Eliminated unnecessary data structures and optimized some passes. For .debug_loc unused location entries are no longer processed resulting in smaller output files. 
Overall it's a small processing time improvement and memory improvement. (cherry picked from commit 10634104ebf096fd84a4ec80c6b89ca1fc93a9dc) --- bolt/BinaryContext.cpp | 76 +++++++++++++++++--------- bolt/BinaryContext.h | 16 ++---- bolt/BinaryFunction.cpp | 112 +++++++++++++++------------------------ bolt/BinaryFunction.h | 41 +++++++------- bolt/DebugData.cpp | 8 +-- bolt/RewriteInstance.cpp | 86 ++++++++++++++---------------- bolt/RewriteInstance.h | 6 +-- 7 files changed, 164 insertions(+), 181 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 50c6584631a1..cbfc30695901 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -116,7 +116,7 @@ void findAddressRangesObjects( /// Recursively finds DWARF DW_TAG_subprogram DIEs and match them with /// BinaryFunctions. Record DIEs for unknown subprograms (mostly functions that /// are never called and removed from the binary) in Unknown. -void findSubprograms(const DWARFCompileUnit *Unit, +void findSubprograms(DWARFCompileUnit *Unit, const DWARFDebugInfoEntryMinimal *DIE, std::map &BinaryFunctions, BinaryContext::DIECompileUnitVector &Unknown) { @@ -126,7 +126,7 @@ void findSubprograms(const DWARFCompileUnit *Unit, if (DIE->getLowAndHighPC(Unit, LowPC, HighPC)) { auto It = BinaryFunctions.find(LowPC); if (It != BinaryFunctions.end()) { - It->second.addSubprocedureDIE(Unit, DIE); + It->second.addSubprogramDIE(Unit, DIE); } else { Unknown.emplace_back(DIE, Unit); } @@ -145,13 +145,8 @@ void findSubprograms(const DWARFCompileUnit *Unit, namespace llvm { namespace bolt { -void BinaryContext::preprocessDebugInfo() { - // Iterate over all DWARF compilation units and map their offset in the - // binary to themselves in OffsetDwarfCUMap - for (const auto &CU : DwCtx->compile_units()) { - OffsetToDwarfCU[CU->getOffset()] = CU.get(); - } - +void BinaryContext::preprocessDebugInfo( + std::map &BinaryFunctions) { // Populate MCContext with DWARF files. 
for (const auto &CU : DwCtx->compile_units()) { const auto CUID = CU->getOffset(); @@ -165,23 +160,53 @@ void BinaryContext::preprocessDebugInfo() { ""; Ctx->getDwarfFile(Dir, FileNames[I].Name, I + 1, CUID); } - - auto LineTableOffset = - DwCtx->getAttrFieldOffsetForUnit(CU.get(), dwarf::DW_AT_stmt_list); - if (LineTableOffset) - LineTableOffsetCUMap[CUID] = LineTableOffset; } -} -void BinaryContext::preprocessFunctionDebugInfo( - std::map &BinaryFunctions) { // For each CU, iterate over its children DIEs and match subprogram DIEs to // BinaryFunctions. - for (const auto &CU : DwCtx->compile_units()) { + for (auto &CU : DwCtx->compile_units()) { findSubprograms(CU.get(), CU->getUnitDIE(false), BinaryFunctions, UnknownFunctions); } + // Some functions may not have a corresponding subprogram DIE + // yet they will be included in some CU and will have line number information. + // Hence we need to associate them with the CU and include in CU ranges. + for (auto &AddrFunctionPair : BinaryFunctions) { + auto FunctionAddress = AddrFunctionPair.first; + auto &Function = AddrFunctionPair.second; + if (!Function.getSubprogramDIEs().empty()) + continue; + if (auto DebugAranges = DwCtx->getDebugAranges()) { + auto CUOffset = DebugAranges->findAddress(FunctionAddress); + if (CUOffset != -1U) { + Function.addSubprogramDIE(DwCtx->getCompileUnitForOffset(CUOffset), + nullptr); + continue; + } + } + +#ifdef DWARF_LOOKUP_ALL_RANGES + // Last resort - iterate over all compile units. This should not happen + // very often. If it does, we need to create a separate lookup table + // similar to .debug_aranges internally. This slows down processing + // considerably. 
+ for (const auto &CU : DwCtx->compile_units()) { + const auto *CUDie = CU->getUnitDIE(); + for (const auto &Range : CUDie->getAddressRanges(CU.get())) { + if (FunctionAddress >= Range.first && + FunctionAddress < Range.second) { + Function.addSubprogramDIE(CU.get(), nullptr); + break; + } + } + } +#endif + } +} + +void BinaryContext::preprocessFunctionDebugInfo( + std::map &BinaryFunctions) { // Iterate over DIE trees finding objects that contain address ranges. for (const auto &CU : DwCtx->compile_units()) { findAddressRangesObjects(CU.get(), CU->getUnitDIE(false), BinaryFunctions, @@ -191,15 +216,18 @@ void BinaryContext::preprocessFunctionDebugInfo( // Iterate over location lists and save them in LocationLists. auto DebugLoc = DwCtx->getDebugLoc(); for (const auto &DebugLocEntry : DebugLoc->getLocationLists()) { + if (DebugLocEntry.Entries.empty()) + continue; + auto StartAddress = DebugLocEntry.Entries.front().Begin; + auto *Function = getBinaryFunctionContainingAddress(StartAddress, + BinaryFunctions); + if (!Function || !Function->isSimple()) + continue; LocationLists.emplace_back(DebugLocEntry.Offset); auto &LocationList = LocationLists.back(); for (const auto &Location : DebugLocEntry.Entries) { - auto *Function = getBinaryFunctionContainingAddress(Location.Begin, - BinaryFunctions); - if (Function && Function->isSimple()) { - LocationList.addLocation(&Location.Loc, *Function, Location.Begin, - Location.End); - } + LocationList.addLocation(&Location.Loc, *Function, Location.Begin, + Location.End); } } } diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 70e5e4b7ce94..867181b164e6 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -63,17 +63,6 @@ class BinaryContext { // Set of addresses we cannot relocate because we have a direct branch to it. std::set InterproceduralBranchTargets; - // Map from offset in the .debug_info section of the binary the - // DWARF Compilation Unit that starts at that offset. 
- std::map OffsetToDwarfCU; - - // Maps each compile unit to the offset of its .debug_line line table in the - // output file. - std::map CompileUnitLineTableOffset; - - /// Maps DWARF CUID to offset of stmt_list attribute in .debug_info. - std::map LineTableOffsetCUMap; - /// List of DWARF location lists in .debug_loc. std::vector LocationLists; @@ -86,7 +75,7 @@ class BinaryContext { std::vector> ; - /// List of subprocedure DIEs that have addresses that don't match any + /// List of subprogram DIEs that have addresses that don't match any /// function, along with their CU. DIECompileUnitVector UnknownFunctions; @@ -162,7 +151,8 @@ class BinaryContext { MCSymbol *getOrCreateGlobalSymbol(uint64_t Address, Twine Prefix); /// Populate some internal data structures with debug info. - void preprocessDebugInfo(); + void preprocessDebugInfo( + std::map &BinaryFunctions); /// Populate internal data structures with debug info that depends on /// disassembled functions. diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 81da4f4b5c61..9d55e9e6b500 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -53,22 +53,42 @@ PrintDebugInfo("print-debug-info", namespace { -// Finds which DWARF compile unit owns an address in the executable by -// querying .debug_aranges. -DWARFCompileUnit *FindCompileUnitForAddress(uint64_t Address, - const BinaryContext &BC) { - auto DebugAranges = BC.DwCtx->getDebugAranges(); - if (!DebugAranges) - return nullptr; +/// Gets debug line information for the instruction located at the given +/// address in the original binary. The SMLoc's pointer is used +/// to point to this information, which is represented by a +/// DebugLineTableRowRef. The returned pointer is null if no debug line +/// information for this instruction was found. 
+SMLoc findDebugLineInformationForInstructionAt( + uint64_t Address, + DWARFUnitLineTable &ULT) { + // We use the pointer in SMLoc to store an instance of DebugLineTableRowRef, + // which occupies 64 bits. Thus, we can only proceed if the struct fits into + // the pointer itself. + assert( + sizeof(decltype(SMLoc().getPointer())) >= sizeof(DebugLineTableRowRef) && + "Cannot fit instruction debug line information into SMLoc's pointer"); - uint32_t CompileUnitIndex = DebugAranges->findAddress(Address); + SMLoc NullResult = DebugLineTableRowRef::NULL_ROW.toSMLoc(); - auto It = BC.OffsetToDwarfCU.find(CompileUnitIndex); - if (It == BC.OffsetToDwarfCU.end()) { - return nullptr; - } else { - return It->second; - } + auto &LineTable = ULT.second; + if (!LineTable) + return NullResult; + + uint32_t RowIndex = LineTable->lookupAddress(Address); + if (RowIndex == LineTable->UnknownRowIndex) + return NullResult; + + assert(RowIndex < LineTable->Rows.size() && + "Line Table lookup returned invalid index."); + + decltype(SMLoc().getPointer()) Ptr; + DebugLineTableRowRef *InstructionLocation = + reinterpret_cast(&Ptr); + + InstructionLocation->DwCompileUnitIndex = ULT.first->getOffset(); + InstructionLocation->RowIndex = RowIndex + 1; + + return SMLoc::getFromPointer(Ptr); } } // namespace @@ -179,13 +199,9 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, }; // Used in printInstruction below to print debug line information. - DWARFCompileUnit *Unit = nullptr; - const DWARFDebugLine::LineTable *LineTable = nullptr; - - if (opts::PrintDebugInfo) { - Unit = FindCompileUnitForAddress(getAddress(), BC); - LineTable = Unit ? BC.DwCtx->getLineTableForUnit(Unit) : nullptr; - } + const DWARFDebugLine::LineTable *LineTable = + opts::PrintDebugInfo ? 
getDWARFUnitLineTable().second + : nullptr; auto printInstruction = [&](const MCInst &Instruction) { if (BC.MIA->isEHLabel(Instruction)) { @@ -386,18 +402,14 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << "End of Function \"" << getName() << "\"\n\n"; } -bool BinaryFunction::disassemble(ArrayRef FunctionData, - bool ExtractDebugLineData) { +bool BinaryFunction::disassemble(ArrayRef FunctionData) { assert(FunctionData.size() == getSize() && "function size does not match raw data size"); auto &Ctx = BC.Ctx; auto &MIA = BC.MIA; - DWARFCompileUnit *CompileUnit = nullptr; - if (ExtractDebugLineData) { - CompileUnit = FindCompileUnitForAddress(getAddress(), BC); - } + DWARFUnitLineTable ULT = getDWARFUnitLineTable(); // Insert a label at the beginning of the function. This will be our first // basic block. @@ -409,7 +421,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData, MCSymbol *TargetSymbol{nullptr}; if (!BC.MIA->evaluateRIPOperand(Instruction, Address, Size, TargetAddress)) { - DEBUG(dbgs() << "BOLT: rip-relative operand could not be evaluated:\n"; + DEBUG(dbgs() << "BOLT: rip-relative operand can't be evaluated:\n"; BC.InstPrinter->printInst(&Instruction, dbgs(), "", *BC.STI); dbgs() << '\n'; Instruction.dump_pretty(dbgs(), BC.InstPrinter.get()); @@ -574,10 +586,9 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData, } } - if (CompileUnit) { + if (ULT.first && ULT.second) { Instruction.setLoc( - findDebugLineInformationForInstructionAt(AbsoluteInstrAddr, - CompileUnit)); + findDebugLineInformationForInstructionAt(AbsoluteInstrAddr, ULT)); } addInstruction(Offset, std::move(Instruction)); @@ -595,45 +606,6 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData, return true; } -SMLoc -BinaryFunction::findDebugLineInformationForInstructionAt( - uint64_t Address, - DWARFCompileUnit *Unit) { - // We use the pointer in SMLoc to store an instance of DebugLineTableRowRef, - // which occupies 64 bits. 
Thus, we can only proceed if the struct fits into - // the pointer itself. - assert( - sizeof(decltype(SMLoc().getPointer())) >= sizeof(DebugLineTableRowRef) && - "Cannot fit instruction debug line information into SMLoc's pointer"); - - const DWARFDebugLine::LineTable *LineTable = - BC.DwCtx->getLineTableForUnit(Unit); - - SMLoc NullResult = DebugLineTableRowRef::NULL_ROW.toSMLoc(); - - if (!LineTable) { - return NullResult; - } - - uint32_t RowIndex = LineTable->lookupAddress(Address); - - if (RowIndex == LineTable->UnknownRowIndex) { - return NullResult; - } - - assert(RowIndex < LineTable->Rows.size() && - "Line Table lookup returned invalid index."); - - decltype(SMLoc().getPointer()) Ptr; - DebugLineTableRowRef *InstructionLocation = - reinterpret_cast(&Ptr); - - InstructionLocation->DwCompileUnitIndex = Unit->getOffset(); - InstructionLocation->RowIndex = RowIndex + 1; - - return SMLoc::getFromPointer(Ptr); -} - bool BinaryFunction::buildCFG() { auto &MIA = BC.MIA; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 7e27b6329777..fd97eae09392 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -47,6 +47,9 @@ class DWARFDebugInfoEntryMinimal; namespace bolt { +using DWARFUnitLineTable = std::pair; + /// BinaryFunction is a representation of machine-level function. // /// We use the term "Binary" as "Machine" was already taken. @@ -166,10 +169,9 @@ class BinaryFunction : public AddressRangesOwner { std::set LandingPads; /// Associated DIEs in the .debug_info section with their respective CUs. - /// There can be multiple because of identical code folding performed by - /// the Linker Script. + /// There can be multiple because of identical code folding. std::vector> SubprocedureDIEs; + DWARFCompileUnit *>> SubprogramDIEs; /// Offset of this function's address ranges in the .debug_ranges section of /// the output binary. 
@@ -221,14 +223,6 @@ class BinaryFunction : public AddressRangesOwner { return *this; } - /// Gets debug line information for the instruction located at the given - /// address in the original binary. The SMLoc's pointer is used - /// to point to this information, which is represented by a - /// DebugLineTableRowRef. The returned pointer is null if no debug line - /// information for this instruction was found. - SMLoc findDebugLineInformationForInstructionAt(uint64_t Address, - DWARFCompileUnit *Unit); - const BinaryBasicBlock * getOriginalLayoutSuccessor(const BinaryBasicBlock *BB) const; @@ -730,10 +724,6 @@ class BinaryFunction : public AddressRangesOwner { /// /// \p FunctionData is the set bytes representing the function body. /// - /// \p ExtractDebugLineData is a flag indicating whether DWARF .debug_line - /// information should be looked up and tied to each disassembled - /// instruction. - /// /// The Function should be properly initialized before this function /// is called. I.e. function address and size should be set. /// @@ -741,8 +731,7 @@ class BinaryFunction : public AddressRangesOwner { /// state to State:Disassembled. /// /// Returns false if disassembly failed. - bool disassemble(ArrayRef FunctionData, - bool ExtractDebugLineData = false); + bool disassemble(ArrayRef FunctionData); /// Builds a list of basic blocks with successor and predecessor info. /// @@ -796,13 +785,23 @@ class BinaryFunction : public AddressRangesOwner { void emitLSDA(MCStreamer *Streamer); /// Sets the associated .debug_info entry. 
- void addSubprocedureDIE(const DWARFCompileUnit *Unit, + void addSubprogramDIE(DWARFCompileUnit *Unit, const DWARFDebugInfoEntryMinimal *DIE) { - SubprocedureDIEs.emplace_back(DIE, Unit); + SubprogramDIEs.emplace_back(DIE, Unit); } - const decltype(SubprocedureDIEs) &getSubprocedureDIEs() const { - return SubprocedureDIEs; + const decltype(SubprogramDIEs) &getSubprogramDIEs() const { + return SubprogramDIEs; + } + + /// Return DWARF compile unit with line info. + DWARFUnitLineTable getDWARFUnitLineTable() const { + for (auto &DIEUnitPair : SubprogramDIEs) { + if (auto *LT = BC.DwCtx->getLineTableForUnit(DIEUnitPair.second)) { + return std::make_pair(DIEUnitPair.second, LT); + } + } + return std::make_pair(nullptr, nullptr); } /// Returns the size of the basic block in the original binary. diff --git a/bolt/DebugData.cpp b/bolt/DebugData.cpp index 8a6e329dafc1..b1462fc284c6 100644 --- a/bolt/DebugData.cpp +++ b/bolt/DebugData.cpp @@ -50,7 +50,7 @@ void BasicBlockOffsetRanges::addAddressRange(BinaryFunction &Function, std::min(BBAddress + Function.getBasicBlockOriginalSize(BB), EndAddress); - AddressRanges.push_back( + AddressRanges.emplace_back( BBAddressRange{ BB, static_cast(InternalAddressRangeBegin - BBAddress), @@ -107,13 +107,13 @@ BasicBlockOffsetRanges::getAbsoluteAddressRanges() const { void DebugRangesSectionsWriter::AddRange(uint32_t CompileUnitOffset, uint64_t Address, uint64_t Size) { - CUAddressRanges[CompileUnitOffset].push_back(std::make_pair(Address, Size)); + CUAddressRanges[CompileUnitOffset].emplace_back(std::make_pair(Address, Size)); } void DebugRangesSectionsWriter::AddRange(AddressRangesOwner *BF, uint64_t Address, uint64_t Size) { - ObjectAddressRanges[BF].push_back(std::make_pair(Address, Size)); + ObjectAddressRanges[BF].emplace_back(std::make_pair(Address, Size)); } namespace { @@ -273,7 +273,7 @@ void DebugAbbrevPatcher::addAttributePatch(const DWARFUnit *Unit, uint8_t NewAttrTag, uint8_t NewAttrForm) { assert(Unit && "No compile unit 
specified."); - Patches[Unit].push_back( + Patches[Unit].emplace_back( AbbrevAttrPatch{AbbrevCode, AttrTag, NewAttrTag, NewAttrForm}); } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index fbea6141c2ff..ee035987b956 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -50,6 +50,8 @@ #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TimeValue.h" +#include "llvm/Support/Timer.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Target/TargetMachine.h" #include @@ -563,7 +565,7 @@ void RewriteInstance::run() { if (opts::UpdateDebugSections && opts::FixDebugInfoLargeFunctions && checkLargeFunctions()) { ++PassNumber; - outs() << "BOLT: starting pass (ignoring large functions)" + outs() << "BOLT: starting pass (ignoring large functions) " << PassNumber << "...\n"; reset(); discoverStorage(); @@ -783,7 +785,7 @@ void RewriteInstance::readDebugInfo() { if (!opts::UpdateDebugSections) return; - BC->preprocessDebugInfo(); + BC->preprocessDebugInfo(BinaryFunctions); } void RewriteInstance::readFunctionDebugInfo() { @@ -868,7 +870,7 @@ void RewriteInstance::disassembleFunctions() { (SectionContents.data()) + FunctionOffset, Function.getSize()); - if (!Function.disassemble(FunctionData, opts::UpdateDebugSections)) + if (!Function.disassemble(FunctionData)) continue; if (opts::PrintAll || opts::PrintDisasm) @@ -1127,17 +1129,17 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, auto RowReference = DebugLineTableRowRef::fromSMLoc(Instr.getLoc()); if (RowReference != DebugLineTableRowRef::NULL_ROW && Instr.getLoc().getPointer() != LastLocSeen.getPointer()) { - auto CompileUnit = - BC.OffsetToDwarfCU[RowReference.DwCompileUnitIndex]; - assert(CompileUnit && + auto ULT = Function.getDWARFUnitLineTable(); + auto Unit = ULT.first; + auto OriginalLineTable = ULT.second; + + assert(Unit && OriginalLineTable && "Invalid CU offset set 
in instruction debug info."); + assert(RowReference.DwCompileUnitIndex == Unit->getOffset() && + "DWARF compile unit mismatch"); - auto OriginalLineTable = - BC.DwCtx->getLineTableForUnit( - CompileUnit); const auto &OriginalRow = OriginalLineTable->Rows[RowReference.RowIndex - 1]; - BC.Ctx->setCurrentDwarfLoc( OriginalRow.File, OriginalRow.Line, @@ -1148,7 +1150,7 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, (DWARF2_FLAG_EPILOGUE_BEGIN * OriginalRow.EpilogueBegin), OriginalRow.Isa, OriginalRow.Discriminator); - BC.Ctx->setDwarfCompileUnitID(CompileUnit->getOffset()); + BC.Ctx->setDwarfCompileUnitID(Unit->getOffset()); LastLocSeen = Instr.getLoc(); } } @@ -1261,7 +1263,7 @@ void RewriteInstance::emitFunctions() { if (opts::UpdateDebugSections) { // Compute offsets of tables in .debug_line for each compile unit. - computeLineTableOffsets(); + updateLineTableOffsets(); } // Get output object as ObjectFile. @@ -1440,7 +1442,7 @@ void RewriteInstance::updateFunctionRanges() { // The function potentially has multiple associated CUs because of // the identical code folding optimization. Update all of them with // the range. - for (const auto DIECompileUnitPair : Function.getSubprocedureDIEs()) { + for (const auto DIECompileUnitPair : Function.getSubprogramDIEs()) { auto CUOffset = DIECompileUnitPair.second->getOffset(); if (CUOffset != -1U) RangesSectionsWriter.AddRange(CUOffset, RangeBegin, RangeSize); @@ -1450,7 +1452,7 @@ void RewriteInstance::updateFunctionRanges() { for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; // If function doesn't have registered DIEs - there's nothting to update. - if (Function.getSubprocedureDIEs().empty()) + if (Function.getSubprogramDIEs().empty()) continue; // Use either new (image) or original size for the function range. auto Size = Function.isSimple() ? 
Function.getImageSize() @@ -2067,7 +2069,7 @@ void RewriteInstance::updateAddressRangesObjects() { } } -void RewriteInstance::computeLineTableOffsets() { +void RewriteInstance::updateLineTableOffsets() { const auto LineSection = BC->Ctx->getObjectFileInfo()->getDwarfLineSection(); auto CurrentFragment = LineSection->begin(); @@ -2080,10 +2082,20 @@ void RewriteInstance::computeLineTableOffsets() { // instead of from the first fragment. for (const auto &CUIDLineTablePair : BC->Ctx->getMCDwarfLineTables()) { auto Label = CUIDLineTablePair.second.getLabel(); - if (!Label) continue; + auto CUOffset = CUIDLineTablePair.first; + if (CUOffset == -1U) + continue; + + auto *CU = BC->DwCtx->getCompileUnitForOffset(CUOffset); + assert(CU && "expected non-null CU"); + auto LTOffset = + BC->DwCtx->getAttrFieldOffsetForUnit(CU, dwarf::DW_AT_stmt_list); + if (!LTOffset) + continue; + auto Fragment = Label->getFragment(); while (&*CurrentFragment != Fragment) { switch (CurrentFragment->getKind()) { @@ -2106,17 +2118,10 @@ void RewriteInstance::computeLineTableOffsets() { Offset += Label->getOffset() - CurrentOffset; CurrentOffset = Label->getOffset(); - auto CompileUnit = BC->OffsetToDwarfCU[CUIDLineTablePair.first]; - BC->CompileUnitLineTableOffset[CompileUnit] = Offset; + auto &SI = SectionMM->NoteSectionInfo[".debug_info"]; + SI.PendingRelocs.emplace_back( + SectionInfo::Reloc{LTOffset, 4, 0, Offset}); - auto LTOI = BC->LineTableOffsetCUMap.find(CUIDLineTablePair.first); - if (LTOI != BC->LineTableOffsetCUMap.end()) { - DEBUG(dbgs() << "BOLT-DEBUG: adding relocation for stmt_list " - << "in .debug_info\n"); - auto &SI = SectionMM->NoteSectionInfo[".debug_info"]; - SI.PendingRelocs.emplace_back( - SectionInfo::Reloc{LTOI->second, 4, 0, Offset}); - } DEBUG(dbgs() << "BOLT-DEBUG: CU " << CUIDLineTablePair.first << " has line table at " << Offset << "\n"); } @@ -2169,7 +2174,7 @@ void RewriteInstance::updateDWARFAddressRanges() { // Update address ranges of functions. 
for (const auto &BFI : BinaryFunctions) { const auto &Function = BFI.second; - for (const auto DIECompileUnitPair : Function.getSubprocedureDIEs()) { + for (const auto DIECompileUnitPair : Function.getSubprogramDIEs()) { updateDWARFObjectAddressRanges( Function.getAddressRangesOffset(), DIECompileUnitPair.second, @@ -2306,30 +2311,18 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { if (!opts::UpdateDebugSections) return; - auto DebugAranges = BC->DwCtx->getDebugAranges(); - if (!DebugAranges) { - errs() << "BOLT-WARNING: need .debug_aranges in the input file to update " - << "debug info for non-simple functions.\n"; - return; - } - for (auto It : BinaryFunctions) { const auto &Function = It.second; if (Function.isSimple()) continue; - uint64_t Address = It.first; - uint32_t CUOffset = DebugAranges->findAddress(Address); - if (CUOffset == -1U) { - errs() << "BOLT-WARNING: function " << Function.getName() - << " does not belong to any compile unit in .debug_aranges. " - << "Cannot update line number information.\n"; - continue; - } - auto Unit = BC->OffsetToDwarfCU[CUOffset]; - auto LineTable = BC->DwCtx->getLineTableForUnit(Unit); - assert(LineTable && "CU without .debug_line info."); + auto ULT = Function.getDWARFUnitLineTable(); + auto Unit = ULT.first; + auto LineTable = ULT.second; + + if (!LineTable) + continue; // nothing to update for this function std::vector Results; MCSectionELF *FunctionSection = @@ -2337,10 +2330,11 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { ELF::SHT_PROGBITS, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); + uint64_t Address = It.first; if (LineTable->lookupAddressRange(Address, Function.getMaxSize(), Results)) { auto &OutputLineTable = - BC->Ctx->getMCDwarfLineTable(CUOffset).getMCLineSections(); + BC->Ctx->getMCDwarfLineTable(Unit->getOffset()).getMCLineSections(); for (auto RowIndex : Results) { const auto &Row = LineTable->Rows[RowIndex]; BC->Ctx->setCurrentDwarfLoc( diff --git 
a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 930f7be30fec..b99bcf7aa78d 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -191,9 +191,9 @@ class RewriteInstance { void patchELFPHDRTable(); void patchELFSectionHeaderTable(); - /// Computes output .debug_line line table offsets for each compile unit, and - /// stores them into BinaryContext::CompileUnitLineTableOffset. - void computeLineTableOffsets(); + /// Computes output .debug_line line table offsets for each compile unit, + /// and updates stmt_list for a corresponding compile unit. + void updateLineTableOffsets(); /// Adds an entry to be saved in the .debug_aranges/.debug_ranges section. /// \p OriginalFunctionAddress function's address in the original binary, From 8e17e27b8f2575ce977b0692dc32ee14b951159c Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 31 May 2016 19:29:34 -0700 Subject: [PATCH 121/904] Fix large functions debug info by default. Summary: Turn on -fix-debuginfo-large-functions by default. In the process of testing I've discovered that we output cold code for functions that were too large to be emitted. Fixed that. 
(cherry picked from commit 2fc1f793a277a99d84977a6db9dea01e433bb138) --- bolt/RewriteInstance.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index ee035987b956..dd2d79267ca4 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -125,9 +125,11 @@ UpdateDebugSections("update-debug-sections", static cl::opt FixDebugInfoLargeFunctions("fix-debuginfo-large-functions", + cl::init(true), cl::desc("do another pass if we encounter large " "functions, to correct their debug info."), - cl::Optional); + cl::Optional, + cl::ReallyHidden); static cl::opt AlignBlocks("align-blocks", @@ -1306,6 +1308,7 @@ void RewriteInstance::emitFunctions() { if (!Function.isSimple()) continue; + auto TooLarge = false; auto SMII = EFMM->SectionMapInfo.find(Function.getCodeSectionName()); if (SMII != EFMM->SectionMapInfo.end()) { DEBUG(dbgs() << "BOLT: mapping 0x" @@ -1317,6 +1320,10 @@ void RewriteInstance::emitFunctions() { Function.getAddress()); Function.setImageAddress(SMII->second.AllocAddress); Function.setImageSize(SMII->second.Size); + if (Function.getImageSize() > Function.getMaxSize()) { + TooLarge = true; + FailedAddresses.emplace_back(Function.getAddress()); + } } else { errs() << "BOLT: cannot remap function " << Function.getName() << "\n"; FailedAddresses.emplace_back(Function.getAddress()); @@ -1340,10 +1347,10 @@ void RewriteInstance::emitFunctions() { NextAvailableAddress); Function.cold().setAddress(NextAvailableAddress); Function.cold().setImageAddress(SMII->second.AllocAddress); - Function.cold().setImageSize(SMII->second.Size); + Function.cold().setImageSize(TooLarge ? 
0 : SMII->second.Size); Function.cold().setFileOffset(getFileOffsetFor(NextAvailableAddress)); - NextAvailableAddress += SMII->second.Size; + NextAvailableAddress += Function.cold().getImageSize(); } else { errs() << "BOLT: cannot remap function " << Function.getName() << "\n"; FailedAddresses.emplace_back(Function.getAddress()); From 2524d335ec0e94666d774c2082661382cdaaf195 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 7 Jun 2016 16:27:52 -0700 Subject: [PATCH 122/904] Allocate BinaryBasicBlocks with new rather than storing them in the BasicBlocks vector. Summary: This will help optimization passes that need to modify the CFG after it is constructed. Otherwise, the BinaryBasicBlock pointers stored in the layout, successors and predecessors would need to be modified every time a new basic block is created. (cherry picked from commit a064db57416d328498d527247e6a10cc05243908) --- bolt/BinaryFunction.cpp | 114 ++++++++++++++++++++------------------- bolt/BinaryFunction.h | 63 +++++++++++++++------- bolt/BinaryPasses.cpp | 4 +- bolt/RewriteInstance.cpp | 4 +- 4 files changed, 109 insertions(+), 76 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 9d55e9e6b500..b454de3fd526 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -103,12 +103,12 @@ BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) { if (BasicBlocks.empty()) return nullptr; - auto I = std::upper_bound(BasicBlocks.begin(), - BasicBlocks.end(), + auto I = std::upper_bound(begin(), + end(), BinaryBasicBlock(Offset)); - assert(I != BasicBlocks.begin() && "first basic block not at offset 0"); + assert(I != begin() && "first basic block not at offset 0"); - return &(*--I); + return &*--I; } size_t @@ -117,7 +117,7 @@ BinaryFunction::getBasicBlockOriginalSize(const BinaryBasicBlock *BB) const { if (Index + 1 == BasicBlocks.size()) { return Size - BB->getOffset(); } else { - return BasicBlocks[Index + 1].getOffset() - BB->getOffset(); + return 
BasicBlocks[Index + 1]->getOffset() - BB->getOffset(); } } @@ -725,8 +725,8 @@ bool BinaryFunction::buildCFG() { } // Set the basic block layout to the original order. - for (auto &BB : BasicBlocks) { - BasicBlocksLayout.emplace_back(&BB); + for (auto BB : BasicBlocks) { + BasicBlocksLayout.emplace_back(BB); } // Intermediate dump. @@ -786,23 +786,23 @@ bool BinaryFunction::buildCFG() { // profile data, which were already accounted for in LocalBranches). PrevBB = nullptr; bool IsPrevFT = false; // Is previous block a fall-through. - for (auto &BB : BasicBlocks) { + for (auto BB : BasicBlocks) { if (IsPrevFT) { - PrevBB->addSuccessor(&BB, BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE, + PrevBB->addSuccessor(BB, BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE, BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE); } - if (BB.empty()) { + if (BB->empty()) { IsPrevFT = true; - PrevBB = &BB; + PrevBB = BB; continue; } - auto LastInstIter = --BB.end(); - while (MIA->isCFI(*LastInstIter) && LastInstIter != BB.begin()) + auto LastInstIter = --BB->end(); + while (MIA->isCFI(*LastInstIter) && LastInstIter != BB->begin()) --LastInstIter; - if (BB.succ_size() == 0) { + if (BB->succ_size() == 0) { IsPrevFT = MIA->isTerminator(*LastInstIter) ? false : true; - } else if (BB.succ_size() == 1) { + } else if (BB->succ_size() == 1) { IsPrevFT = MIA->isConditionalBranch(*LastInstIter) ? true : false; } else { // Ends with 2 branches, with an indirect jump or it is a conditional @@ -810,7 +810,7 @@ bool BinaryFunction::buildCFG() { IsPrevFT = false; } - PrevBB = &BB; + PrevBB = BB; } if (!IsPrevFT) { @@ -819,12 +819,12 @@ bool BinaryFunction::buildCFG() { } // Add associated landing pad blocks to each basic block. 
- for (auto &BB : BasicBlocks) { - if (LandingPads.find(BB.getLabel()) != LandingPads.end()) { - MCSymbol *LP = BB.getLabel(); + for (auto BB : BasicBlocks) { + if (LandingPads.find(BB->getLabel()) != LandingPads.end()) { + MCSymbol *LP = BB->getLabel(); for (unsigned I : LPToBBIndex.at(LP)) { BinaryBasicBlock *ThrowBB = getBasicBlockAtIndex(I); - ThrowBB->addLandingPad(&BB); + ThrowBB->addLandingPad(BB); } } } @@ -860,20 +860,20 @@ void BinaryFunction::inferFallThroughCounts() { auto BranchDataOrErr = BC.DR.getFuncBranchData(getName()); // Compute preliminary execution time for each basic block - for (auto &CurBB : BasicBlocks) { - if (&CurBB == &*BasicBlocks.begin()) { - CurBB.ExecutionCount = ExecutionCount; + for (auto CurBB : BasicBlocks) { + if (CurBB == *BasicBlocks.begin()) { + CurBB->ExecutionCount = ExecutionCount; continue; } - CurBB.ExecutionCount = 0; + CurBB->ExecutionCount = 0; } - for (auto &CurBB : BasicBlocks) { - auto SuccCount = CurBB.BranchInfo.begin(); - for (auto Succ : CurBB.successors()) { + for (auto CurBB : BasicBlocks) { + auto SuccCount = CurBB->BranchInfo.begin(); + for (auto Succ : CurBB->successors()) { // Do not update execution count of the entry block (when we have tail // calls). We already accounted for those when computing the func count. 
- if (Succ == &*BasicBlocks.begin()) + if (Succ == *BasicBlocks.begin()) continue; if (SuccCount->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) Succ->ExecutionCount += SuccCount->Count; @@ -894,18 +894,18 @@ void BinaryFunction::inferFallThroughCounts() { // Work on a basic block at a time, propagating frequency information forwards // It is important to walk in the layour order - for (auto &CurBB : BasicBlocks) { - uint64_t BBExecCount = CurBB.getExecutionCount(); + for (auto CurBB : BasicBlocks) { + uint64_t BBExecCount = CurBB->getExecutionCount(); // Propagate this information to successors, filling in fall-through edges // with frequency information - if (CurBB.succ_size() == 0) + if (CurBB->succ_size() == 0) continue; // Calculate frequency of outgoing branches from this node according to // LBR data uint64_t ReportedBranches = 0; - for (auto &SuccCount : CurBB.BranchInfo) { + for (auto &SuccCount : CurBB->BranchInfo) { if (SuccCount.Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) ReportedBranches += SuccCount.Count; } @@ -915,7 +915,7 @@ void BinaryFunction::inferFallThroughCounts() { // for a landing pad to be associated with more than one basic blocks, // we may overestimate the frequency of throws for such blocks. uint64_t ReportedThrows = 0; - for (BinaryBasicBlock *LP: CurBB.LandingPads) { + for (BinaryBasicBlock *LP: CurBB->LandingPads) { ReportedThrows += LP->ExecutionCount; } @@ -934,15 +934,15 @@ void BinaryFunction::inferFallThroughCounts() { "exec frequency is less than the outgoing edges frequency (" << BBExecCount << " < " << ReportedBranches << ") for BB at offset 0x" - << Twine::utohexstr(getAddress() + CurBB.getOffset()) << '\n'; + << Twine::utohexstr(getAddress() + CurBB->getOffset()) << '\n'; }); // Put this information into the fall-through edge - if (CurBB.succ_size() == 0) + if (CurBB->succ_size() == 0) continue; // If there is a FT, the last successor will be it. 
- auto &SuccCount = CurBB.BranchInfo.back(); - auto &Succ = CurBB.Successors.back(); + auto &SuccCount = CurBB->BranchInfo.back(); + auto &Succ = CurBB->Successors.back(); if (SuccCount.Count == BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) { SuccCount.Count = Inferred; Succ->ExecutionCount += Inferred; @@ -977,12 +977,12 @@ void BinaryFunction::annotateCFIState() { std::stack StateStack; for (auto CI = BasicBlocks.begin(), CE = BasicBlocks.end(); CI != CE; ++CI) { - BinaryBasicBlock &CurBB = *CI; + BinaryBasicBlock *CurBB = *CI; // Annotate this BB entry BBCFIState.emplace_back(State); // Advance state - for (const auto &Instr : CurBB) { + for (const auto &Instr : *CurBB) { MCCFIInstruction *CFI = getCFIFor(Instr); if (CFI == nullptr) continue; @@ -1520,10 +1520,10 @@ void BinaryFunction::solveOptimalLayout(bool Split) { const BinaryBasicBlock * BinaryFunction::getOriginalLayoutSuccessor(const BinaryBasicBlock *BB) const { - auto I = std::upper_bound(BasicBlocks.begin(), BasicBlocks.end(), *BB); - assert(I != BasicBlocks.begin() && "first basic block not at offset 0"); + auto I = std::upper_bound(begin(), end(), *BB); + assert(I != begin() && "first basic block not at offset 0"); - if (I == BasicBlocks.end()) + if (I == end()) return nullptr; return &*I; } @@ -1656,28 +1656,28 @@ void BinaryFunction::splitFunction() { assert(BasicBlocksLayout.size() > 0); // Never outline the first basic block. - BasicBlocks.front().CanOutline = false; - for (auto &BB : BasicBlocks) { - if (!BB.CanOutline) + BasicBlocks.front()->CanOutline = false; + for (auto BB : BasicBlocks) { + if (!BB->CanOutline) continue; - if (BB.getExecutionCount() != 0) { - BB.CanOutline = false; + if (BB->getExecutionCount() != 0) { + BB->CanOutline = false; continue; } if (hasEHRanges()) { // We cannot move landing pads (or rather entry points for landing // pads). 
- if (LandingPads.find(BB.getLabel()) != LandingPads.end()) { - BB.CanOutline = false; + if (LandingPads.find(BB->getLabel()) != LandingPads.end()) { + BB->CanOutline = false; continue; } // We cannot move a block that can throw since exception-handling // runtime cannot deal with split functions. However, if we can guarantee // that the block never throws, it is safe to move the block to // decrease the size of the function. - for (auto &Instr : BB) { + for (auto &Instr : *BB) { if (BC.MIA->isInvoke(Instr)) { - BB.CanOutline = false; + BB->CanOutline = false; break; } } @@ -1726,8 +1726,8 @@ void BinaryFunction::propagateGnuArgsSizeInfo() { // It is important to iterate basic blocks in the original order when // assigning the value. uint64_t CurrentGnuArgsSize = 0; - for (auto &BB : BasicBlocks) { - for (auto II = BB.begin(); II != BB.end(); ) { + for (auto BB : BasicBlocks) { + for (auto II = BB->begin(); II != BB->end(); ) { auto &Instr = *II; if (BC.MIA->isCFI(Instr)) { auto CFI = getCFIFor(Instr); @@ -1736,7 +1736,7 @@ void BinaryFunction::propagateGnuArgsSizeInfo() { // Delete DW_CFA_GNU_args_size instructions and only regenerate // during the final code emission. The information is embedded // inside call instructions. - II = BB.Instructions.erase(II); + II = BB->Instructions.erase(II); } else { ++II; } @@ -1755,5 +1755,11 @@ void BinaryFunction::propagateGnuArgsSizeInfo() { } } +BinaryFunction::~BinaryFunction() { + for (auto BB : BasicBlocks) { + delete BB; + } +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index fd97eae09392..2f196f04a4b4 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -281,11 +281,15 @@ class BinaryFunction : public AddressRangesOwner { // Blocks are kept sorted in the layout order. If we need to change the // layout (if BasicBlocksLayout stores a different order than BasicBlocks), // the terminating instructions need to be modified. 
- using BasicBlockListType = std::vector; + using BasicBlockListType = std::vector; using BasicBlockOrderType = std::vector; BasicBlockListType BasicBlocks; BasicBlockOrderType BasicBlocksLayout; + // Map that keeps track of the index of each basic block in the BasicBlocks + // vector. Used to make getIndex fast. + std::map BasicBlockIndices; + // At each basic block entry we attach a CFI state to detect if reordering // corrupts the CFI state for a block. The CFI state is simply the index in // FrameInstructions for the CFI responsible for creating this state. @@ -304,12 +308,31 @@ class BinaryFunction : public AddressRangesOwner { /// Count the number of functions created. static uint64_t Count; + template + class Iterator : public std::iterator { + public: + Iterator &operator++() { ++itr; return *this; } + Iterator &operator--() { --itr; return *this; } + Iterator operator++(int) { auto tmp(itr); itr++; return tmp; } + Iterator operator--(int) { auto tmp(itr); itr--; return tmp; } + bool operator==(const Iterator& other) const { return itr == other.itr; } + bool operator!=(const Iterator& other) const { return itr != other.itr; } + T& operator*() { return **itr; } + Iterator(Itr itr) : itr(itr) { } + private: + Itr itr; + }; + public: - typedef BasicBlockListType::iterator iterator; - typedef BasicBlockListType::const_iterator const_iterator; - typedef std::reverse_iterator const_reverse_iterator; - typedef std::reverse_iterator reverse_iterator; + typedef Iterator iterator; + typedef Iterator const_iterator; + typedef Iterator reverse_iterator; + typedef Iterator const_reverse_iterator; + typedef BasicBlockOrderType::iterator order_iterator; typedef BasicBlockOrderType::const_iterator const_order_iterator; @@ -326,10 +349,10 @@ class BinaryFunction : public AddressRangesOwner { unsigned size() const { return (unsigned)BasicBlocks.size();} bool empty() const { return BasicBlocks.empty(); } - const BinaryBasicBlock &front() const { return BasicBlocks.front(); } 
- BinaryBasicBlock &front() { return BasicBlocks.front(); } - const BinaryBasicBlock & back() const { return BasicBlocks.back(); } - BinaryBasicBlock & back() { return BasicBlocks.back(); } + const BinaryBasicBlock &front() const { return *BasicBlocks.front(); } + BinaryBasicBlock &front() { return *BasicBlocks.front(); } + const BinaryBasicBlock & back() const { return *BasicBlocks.back(); } + BinaryBasicBlock & back() { return *BasicBlocks.back(); } unsigned layout_size() const { return (unsigned)BasicBlocksLayout.size(); @@ -357,6 +380,10 @@ class BinaryFunction : public AddressRangesOwner { return iterator_range(cie_begin(), cie_end()); } + BinaryFunction& operator=(const BinaryFunction &) = delete; + BinaryFunction(const BinaryFunction &) = delete; + + BinaryFunction(BinaryFunction &&) = default; BinaryFunction(std::string Name, SymbolRef Symbol, SectionRef Section, uint64_t Address, uint64_t Size, BinaryContext &BC, @@ -383,20 +410,18 @@ class BinaryFunction : public AddressRangesOwner { /// Get basic block index assuming it belongs to this function. unsigned getIndex(const BinaryBasicBlock *BB) const { - assert(BB >= &BasicBlocks.front() && "wrong basic block"); - unsigned I = BB - &BasicBlocks.front(); - assert(I < BasicBlocks.size() && "wrong basic block"); - return I; + assert(BasicBlockIndices.find(BB) != BasicBlockIndices.end()); + return BasicBlockIndices.find(BB)->second; } /// Returns the n-th basic block in this function in its original layout, or /// nullptr if n >= size(). const BinaryBasicBlock * getBasicBlockAtIndex(unsigned Index) const { - return &BasicBlocks.at(Index); + return BasicBlocks.at(Index); } BinaryBasicBlock * getBasicBlockAtIndex(unsigned Index) { - return &BasicBlocks.at(Index); + return BasicBlocks.at(Index); } /// Return the name of the function as extracted from the binary file. 
@@ -504,15 +529,17 @@ class BinaryFunction : public AddressRangesOwner { assert(BC.Ctx && "cannot be called with empty context"); if (!Label) Label = BC.Ctx->createTempSymbol("BB", true); - BasicBlocks.emplace_back(BinaryBasicBlock(Label, this, Offset)); + BasicBlocks.emplace_back(new BinaryBasicBlock(Label, this, Offset)); - auto BB = &BasicBlocks.back(); + auto BB = BasicBlocks.back(); if (DeriveAlignment) { uint64_t DerivedAlignment = Offset & (1 + ~Offset); BB->setAlignment(std::min(DerivedAlignment, uint64_t(32))); } + BasicBlockIndices[BB] = BasicBlocks.size() - 1; + return BB; } @@ -821,7 +848,7 @@ class BinaryFunction : public AddressRangesOwner { return Estimate; } - virtual ~BinaryFunction() {} + virtual ~BinaryFunction(); /// Info for fragmented functions. class FragmentInfo { diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index b2290ccc73ec..5cc2bc6691e8 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -56,7 +56,7 @@ void OptimizeBodylessFunctions::analyze( BinaryFunction &BF, BinaryContext &BC, std::map &BFs) { - if (BF.size() != 1 || BF.begin()->size() == 0) + if (BF.size() != 1 || (*BF.begin()).size() == 0) return; auto &BB = *BF.begin(); @@ -84,7 +84,7 @@ void OptimizeBodylessFunctions::analyze( void OptimizeBodylessFunctions::optimizeCalls(BinaryFunction &BF, BinaryContext &BC) { for (auto BBIt = BF.begin(), BBEnd = BF.end(); BBIt != BBEnd; ++BBIt) { - for (auto InstIt = BBIt->begin(), InstEnd = BBIt->end(); + for (auto InstIt = (*BBIt).begin(), InstEnd = (*BBIt).end(); InstIt != InstEnd; ++InstIt) { auto &Inst = *InstIt; if (BC.MIA->isCall(Inst)) { diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index dd2d79267ca4..e6fb8bbcd170 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1087,7 +1087,7 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, } } - assert(!Function.begin()->isCold() && + assert(!(*Function.begin()).isCold() && "first basic block should never be 
cold"); // Emit UD2 at the beginning if requested by user. @@ -2318,7 +2318,7 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { if (!opts::UpdateDebugSections) return; - for (auto It : BinaryFunctions) { + for (auto &It : BinaryFunctions) { const auto &Function = It.second; if (Function.isSimple()) From 16f6712abdfecb93f9277805078269f87ee42a76 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 7 Jun 2016 16:27:52 -0700 Subject: [PATCH 123/904] Indirect call optimization. (cherry picked from commit bdc7b95d49eadfec17538edf471c21e42c3d0232) --- bolt/BinaryBasicBlock.h | 5 + bolt/BinaryFunction.cpp | 53 ++++++- bolt/BinaryFunction.h | 40 ++++- bolt/BinaryPassManager.cpp | 10 ++ bolt/BinaryPassManager.h | 2 +- bolt/BinaryPasses.cpp | 306 +++++++++++++++++++++++++++++++++++++ bolt/BinaryPasses.h | 8 + bolt/DataReader.cpp | 19 +++ bolt/DataReader.h | 1 + 9 files changed, 433 insertions(+), 11 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 07962f99d5bb..cb2009be8deb 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -290,6 +290,11 @@ class BinaryBasicBlock { return Offset; } + /// Set offset of the basic block from the function start. + void setOffset(uint64_t newOffset) { + Offset = newOffset; + } + /// Adds block to successor list, and also updates predecessor list for /// successor block. /// Set branch info for this path. 
diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index b454de3fd526..e7ceddc8ebfa 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -220,7 +220,11 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << "\n"; return; } - BC.InstPrinter->printInst(&Instruction, OS, "", *BC.STI); + if (!BC.MIA->isUnsupported(Instruction)) { + BC.InstPrinter->printInst(&Instruction, OS, "", *BC.STI); + } else { + OS << "unsupported (probably jmpr)"; + } if (BC.MIA->isCall(Instruction)) { if (BC.MIA->isTailCall(Instruction)) OS << " # TAILCALL "; @@ -542,12 +546,6 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { } } - Instruction.clear(); - Instruction.addOperand( - MCOperand::createExpr( - MCSymbolRefExpr::create(TargetSymbol, - MCSymbolRefExpr::VK_None, - *Ctx))); if (!IsCall) { // Add local branch info. LocalBranches.push_back({Offset, TargetOffset}); @@ -556,15 +554,54 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // Add fallthrough branch info. FTBranches.push_back({Offset, Offset + Size}); } + + if (IsCall || !IsCondBranch) { + if (MIA->isIndirectBranch(Instruction)) { +#if 0 + dbgs() << "Indirect call/branch @ " + << Twine::utohexstr(Offset) << "\n"; +#endif + NonLocalIndirectBranches.push_back(Offset); + } + } + + Instruction.clear(); + Instruction.addOperand( + MCOperand::createExpr( + MCSymbolRefExpr::create(TargetSymbol, + MCSymbolRefExpr::VK_None, + *Ctx))); } else { + if (MIA->isCall(Instruction)) { +#if 0 + dbgs() << getName() << ": indirect call/branch @ " + << Twine::utohexstr(Offset) << "\n"; +#endif + NonLocalIndirectBranches.push_back(Offset); + } + // Should be an indirect call or an indirect branch. Bail out on the // latter case. if (MIA->isIndirectBranch(Instruction)) { DEBUG(dbgs() << "BOLT-WARNING: indirect branch detected at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ". 
Skipping function " << getName() << ".\n"); - IsSimple = false; + if (!MIA->isConditionalBranch(Instruction)) { +#if 0 + dbgs() << getName() << ": indirect call/branch @ " + << Twine::utohexstr(Offset) << "\n"; +#endif + NonLocalIndirectBranches.push_back(Offset); + + MCInst tmp(Instruction); + if (1 || !MIA->isTerminator(tmp) || !MIA->convertJmpToTailCall(tmp)) { + IsSimple = false; + } + } else { + IsSimple = false; + } } + // Indirect call. We only need to fix it if the operand is RIP-relative if (MIA->hasRIPOperand(Instruction)) { if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 2f196f04a4b4..ed8e1bc6b1dd 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -19,6 +19,7 @@ #include "BinaryBasicBlock.h" #include "BinaryContext.h" +#include "DataReader.h" #include "DebugData.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/ilist.h" @@ -36,6 +37,7 @@ #include "llvm/Support/raw_ostream.h" #include #include +#include #include using namespace llvm::object; @@ -235,6 +237,10 @@ class BinaryFunction : public AddressRangesOwner { using LandingPadsMapType = std::map >; LandingPadsMapType LPToBBIndex; + /// Storage for non-local branches + using NonLocalIndirectBranchesListType = std::vector; + NonLocalIndirectBranchesListType NonLocalIndirectBranches; + /// Map offset in the function to a local label. using LabelsMapType = std::map; LabelsMapType Labels; @@ -288,7 +294,7 @@ class BinaryFunction : public AddressRangesOwner { // Map that keeps track of the index of each basic block in the BasicBlocks // vector. Used to make getIndex fast. - std::map BasicBlockIndices; + std::unordered_map BasicBlockIndices; // At each basic block entry we attach a CFI state to detect if reordering // corrupts the CFI state for a block. 
The CFI state is simply the index in @@ -336,6 +342,28 @@ class BinaryFunction : public AddressRangesOwner { typedef BasicBlockOrderType::iterator order_iterator; typedef BasicBlockOrderType::const_iterator const_order_iterator; + typedef NonLocalIndirectBranchesListType::iterator nlib_iterator; + typedef NonLocalIndirectBranchesListType::const_iterator const_nlib_iterator; + + nlib_iterator begin_nlibs() { + return NonLocalIndirectBranches.begin(); + } + const_nlib_iterator begin_nlibs() const { + return NonLocalIndirectBranches.begin(); + } + nlib_iterator end_nlibs() { + return NonLocalIndirectBranches.end(); + } + const_nlib_iterator end_nlibs() const { + return NonLocalIndirectBranches.end(); + } + inline iterator_range nlibs() { + return iterator_range(begin_nlibs(), end_nlibs()); + } + inline iterator_range nlibs() const { + return iterator_range(begin_nlibs(), end_nlibs()); + } + // CFG iterators. iterator begin() { return BasicBlocks.begin(); } const_iterator begin() const { return BasicBlocks.begin(); } @@ -525,7 +553,8 @@ class BinaryFunction : public AddressRangesOwner { /// Returns NULL if basic block already exists at the \p Offset. BinaryBasicBlock *addBasicBlock(uint64_t Offset, MCSymbol *Label, bool DeriveAlignment = false) { - assert(!getBasicBlockAtOffset(Offset) && "basic block already exists"); + assert(CurrentState == State::CFG || + (!getBasicBlockAtOffset(Offset) && "basic block already exists")); assert(BC.Ctx && "cannot be called with empty context"); if (!Label) Label = BC.Ctx->createTempSymbol("BB", true); @@ -556,6 +585,13 @@ class BinaryFunction : public AddressRangesOwner { return nullptr; } + void updateLayout(BinaryBasicBlock* start, + const std::vector& newBBs) { + BasicBlocksLayout.insert(BasicBlocksLayout.begin() + getIndex(start) + 1, + newBBs.begin(), + newBBs.end()); + } + /// Return basic block that originally contained offset \p Offset /// from the function start. 
BinaryBasicBlock *getBasicBlockContainingOffset(uint64_t Offset); diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index da78727fab39..ed444af4f52c 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -24,6 +24,13 @@ OptimizeBodylessFunctions( llvm::cl::desc("optimize functions that just do a tail call"), llvm::cl::Optional); +static llvm::cl::opt +OptimizeIndirectBranches( + "optimize-indirect-branches", + llvm::cl::desc("optimize indirect branches"), + llvm::cl::init(true), + llvm::cl::Optional); + static llvm::cl::opt InlineSmallFunctions( "inline-small-functions", @@ -65,6 +72,9 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(std::move(llvm::make_unique())); + Manager.registerPass(llvm::make_unique(), + opts::OptimizeIndirectBranches); + Manager.registerPass(llvm::make_unique(), opts::SimplifyConditionalTailCalls); diff --git a/bolt/BinaryPassManager.h b/bolt/BinaryPassManager.h index 5875a99b5a7e..b5b54fc5d482 100644 --- a/bolt/BinaryPassManager.h +++ b/bolt/BinaryPassManager.h @@ -66,7 +66,7 @@ class BinaryFunctionPassManager { /// Runs all enabled implemented passes on all functions. 
static void runAllPasses(BinaryContext &BC, std::map &Functions, - std::set &largeFunctions); + std::set &LargeFunctions); }; diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index 5cc2bc6691e8..8ec7623c0fff 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -23,6 +23,21 @@ extern llvm::cl::opt PrintUCE; extern llvm::cl::opt SplitFunctions; extern bool shouldProcess(const llvm::bolt::BinaryFunction &Function); +static llvm::cl::opt +OptimizeIndirectBranchesThreshold( + "optimize-indirect-branches-threshold", + llvm::cl::desc("threshold for optimizing a frequently taken indirect call"), + llvm::cl::init(90), + llvm::cl::Optional); + +static llvm::cl::opt +OptimizeIndirectBranchesTopN( + "optimize-indirect-branches-topn", + llvm::cl::desc("number of targets to consider when doing indirect " + "branch optimization"), + llvm::cl::init(2), + llvm::cl::Optional); + static llvm::cl::opt ReorderBlocks( "reorder-blocks", @@ -527,5 +542,296 @@ void SimplifyConditionalTailCalls::runOnFunctions( << " from a total of " << NumTailCallCandidates << "\n"; } +namespace { + +template +void printInstruction(S& OS, BinaryContext& BC, const MCInst &Instruction, bool printMCInst = false) { + if (!BC.MIA->isUnsupported(Instruction)) { + BC.InstPrinter->printInst(&Instruction, OS, "", *BC.STI); + } else { + OS << "unsupported (probably jmpr)"; + } + OS << "\n"; + if (printMCInst) { + Instruction.dump_pretty(OS, BC.InstPrinter.get()); + OS << "\n"; + } +} + +template +uint64_t computeCodeSize(BinaryContext& BC, Itr beg, Itr end) { + uint64_t size = 0; + while (beg != end) { + // Calculate the size of the instruction. + // Note: this is imprecise since happening prior to relaxation. 
+ SmallString<256> Code; + SmallVector Fixups; + raw_svector_ostream VecOS(Code); + printInstruction(dbgs(), BC, *beg, false); + BC.MCE->encodeInstruction(*beg++, VecOS, Fixups, *BC.STI); + size += Code.size(); + } + return size; +} + +} + +void OptimizeIndirectBranches::runOnFunctions( + BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions +) { + uint64_t TotalBranches = 0; + uint64_t TotalIndirectCalls = 0; + uint64_t TotalIndirectCallsites = 0; + uint64_t TotalIndirectCandidateCalls = 0; + for (auto &BFIt : BFs) { + auto &Function = BFIt.second; + + if (!Function.isSimple() || !opts::shouldProcess(Function)) + continue; + + auto BranchDataOrErr = BC.DR.getFuncBranchData(Function.getName()); + if (std::error_code EC = BranchDataOrErr.getError()) { + DEBUG(dbgs() << "no branch data found for \"" + << Function.getName() << "\"\n"); + continue; + } + const FuncBranchData &BranchData = BranchDataOrErr.get(); + + // Note: this is not just counting calls. + TotalBranches += BranchData.ExecutionCount; + + uint64_t Total = 0; + for (auto &nlib : Function.nlibs()) { + auto Branches = BranchData.getBranchRange(nlib); + for (auto &BInfo : Branches) { + Total += BInfo.Branches; + } + std::vector targets; + for (auto &BInfo : Branches) { + targets.push_back(BInfo); + } + + std::sort(targets.begin(), targets.end(), + [](const BranchInfo& a, const BranchInfo& b) { + return a.Branches > b.Branches; + }); + + if (!targets.empty()) { + uint64_t TopNBranches = 0; + + const int NumTargets = std::distance(targets.begin(), targets.end()); + const int N = std::min(int(opts::OptimizeIndirectBranchesTopN), + NumTargets); + + for (int i = 0; i < N; ++i) { + TopNBranches += targets[i].Branches; + } + + const double TopNFrequency = 100.0 * TopNBranches / Total; + + if (TopNFrequency >= opts::OptimizeIndirectBranchesThreshold) { + double Threshold = double(opts::OptimizeIndirectBranchesThreshold); + bool Separator = false; + + dbgs() << "BOLT: candidate branch info: " + << 
Function.getName() << " @ " << nlib + << " -> "; + + for (int i = 0; i < N && Threshold > 0; i++) { + const auto Frequency = 100.0 * targets[i].Branches / Total; + if (Separator) { + dbgs() << " | "; + } + Separator = true; + dbgs() << targets[i].To.Name + << ", count = " << targets[i].Branches + << ", mispreds = " << targets[i].Mispreds + << ", freq = " << (int)Frequency << "%"; + TotalIndirectCandidateCalls += targets[i].Branches; + Threshold -= Frequency; + } + dbgs() << "\n"; + + //assert(!targets[0].From.IsSymbol); + auto IndCallBlock = + Function.getBasicBlockContainingOffset(targets[0].From.Offset); + +#if 0 + // scan insts for jump (use analyze?) + const MCSymbol *TBB = nullptr; + const MCSymbol *FBB = nullptr; + MCInst *CondBranch = nullptr; + MCInst *UncondBranch = nullptr; + bool Found = MIA->analyzeBranch(IndCallBlock->Instructions, + TBB, + FBB, + CondBranch, + UncondBranch); + assert(Found); + // how to assert that UncondBranch is the one we want? + assert(UncondBranch != nullptr); +#else + MCInst* CallInst = nullptr; + uint64_t InstOffset{RoundUpToAlignment(IndCallBlock->getOffset(), + IndCallBlock->getAlignment())}; + + size_t CallInstIdx = 0; + for (auto &Instr : *IndCallBlock) { + // Calculate the size of the instruction. + // Note: this is imprecise since happening prior to relaxation. + SmallString<256> Code; + SmallVector Fixups; + raw_svector_ostream VecOS(Code); + BC.MCE->encodeInstruction(Instr, VecOS, Fixups, *BC.STI); + if (InstOffset == targets[0].From.Offset) { + CallInst = &Instr; + } + ++CallInstIdx; + InstOffset += Code.size(); + } + assert(CallInst); +#endif + + std::vector Targets; + for (int i = 0; i < N; ++i) { + assert(targets[i].To.IsSymbol); + // Is this right? 
lookupSym doesn't always return a result + auto Symbol = BC.Ctx->getOrCreateSymbol(targets[i].To.Name); + assert(Symbol); + Targets.push_back(Symbol); + } + + MCInst* SourceInst = CallInst; // for now +#if 0 + for (auto &Instr : *IndCallBlock) { + if (&Instr == CallInst) break; + if (Instr.getNumOperands() > 0) { + printInstruction(dbgs(), BC, Instr, true); + for (unsigned int i = 0; i < Instr.getNumOperands(); ++i) { + auto &Operand = Instr.getOperand(i); + dbgs() << "isreg("<< i << ") = " << Operand.isReg() << "\n"; + dbgs() << "isexpr(" << i << ") = " << Operand.isExpr() << "\n"; + SourceInst = &Instr; // WRONG + } + } + if (&Instr == CallInst) break; + } + dbgs() << "-----------\n"; + assert(SourceInst); +#endif + + auto ICPcode = BC.MIA->indirectCallPromotion( + *SourceInst, // == CallInst for now + *CallInst, + Targets, + BC.Ctx.get()); + + if (!ICPcode.empty()) { + for (auto &entry : ICPcode) { + auto &Sym = entry.first; + auto &Insts = entry.second; + if (Sym) dbgs() << Sym->getName() << ":\n"; + for (auto &Instr : Insts) { + printInstruction(dbgs(), BC, Instr, false); + } + } + + // create new bbs with correct code in each one + // first + auto oldSuccRange = IndCallBlock->successors(); + std::vector oldSucc(oldSuccRange.begin(), oldSuccRange.end()); + BinaryBasicBlock* LastBlock = IndCallBlock; + BinaryBasicBlock* MergeBlock = nullptr; + std::vector newBBs; + + assert(!BC.MIA->isTailCall(*CallInst) || oldSucc.empty()); + + // Remove all successors from block doing the indirect call. 
+ for (auto succ : oldSucc) { + IndCallBlock->removeSuccessor(succ); + } + assert(IndCallBlock->succ_empty()); + + dbgs() << "IndCallBlock = " << IndCallBlock << "\n"; + + if (ICPcode.back().second.empty()) { // merge block + // Create BB for merge block following old call + + uint64_t total = 0; + for (auto &entry : ICPcode) { + total += computeCodeSize(BC, entry.second.begin(), entry.second.end()); + } + + // adjust all other blocks by total + for (auto &BB : Function) { + if (BB.getOffset() > IndCallBlock->getOffset()) { + BB.setOffset(BB.getOffset() + total); + } + } + + //dbgs() << "total = " << total << "\n"; + //dbgs() << "InstOffset = " << InstOffset << "\n"; + MergeBlock = Function.addBasicBlock(total + InstOffset, ICPcode.back().first); + newBBs.push_back(MergeBlock); + for (auto succ : oldSucc) { + MergeBlock->addSuccessor(succ); + } + dbgs() << "MergeBlock = " << MergeBlock << "\n"; + + // Move instructions from the tail of the original call block + // to the merge block. + std::vector MovedInst; + + while(&IndCallBlock->back() != CallInst) { + auto &lastInst = IndCallBlock->back(); + MovedInst.push_back(lastInst); + IndCallBlock->eraseInstruction(&lastInst); + } + IndCallBlock->eraseInstruction(CallInst); + + for (auto itr = MovedInst.rbegin(); itr != MovedInst.rend(); ++itr) { + MergeBlock->addInstruction(*itr); + } + + ICPcode.pop_back(); // remove merge block + } + + for (auto &entry : ICPcode) { + auto &Sym = entry.first; + auto &Insts = entry.second; + if (Sym) { + auto TBB = Function.addBasicBlock(InstOffset, Sym); + newBBs.push_back(TBB); + LastBlock->addSuccessor(TBB); + LastBlock = TBB; + InstOffset += computeCodeSize(BC, Insts.begin(), Insts.end()); + dbgs() << "TBB = " << TBB << "\n"; + } + for (auto &Inst : Insts) { + LastBlock->addInstruction(Inst); + } + if (MergeBlock) LastBlock->addSuccessor(MergeBlock); + } + + // update BBlayout in Function, XXX is this right? 
+ Function.updateLayout(IndCallBlock, newBBs); + } + } + } + + ++TotalIndirectCallsites; + } + TotalIndirectCalls += Total; + } + + dbgs() << "BOLT: total indirect callsites/candidate calls/calls/branches = " + << TotalIndirectCallsites << "/" + << TotalIndirectCandidateCalls << "/" + << TotalIndirectCalls << "/" + << TotalBranches << "\n"; +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryPasses.h b/bolt/BinaryPasses.h index fd224f2bf200..141cafb64822 100644 --- a/bolt/BinaryPasses.h +++ b/bolt/BinaryPasses.h @@ -148,6 +148,14 @@ class SimplifyConditionalTailCalls : public BinaryFunctionPass { std::set &LargeFunctions) override; }; +/// Optimize indirect calls. +class OptimizeIndirectBranches : public BinaryFunctionPass { + public: + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + } // namespace bolt } // namespace llvm diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index 967b4ef2e085..7b14f62c2b87 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -18,6 +18,21 @@ namespace llvm { namespace bolt { +iterator_range +FuncBranchData::getBranchRange(uint64_t From) const { + assert(std::is_sorted(Data.begin(), Data.end())); + struct Compare { + bool operator()(const BranchInfo &BI, const uint64_t val) const { + return BI.From.Offset < val; + } + bool operator()(const uint64_t val, const BranchInfo &BI) const { + return val < BI.From.Offset; + } + }; + auto Range = std::equal_range(Data.begin(), Data.end(), From, Compare()); + return iterator_range(Range.first, Range.second); +} + ErrorOr FuncBranchData::getBranch(uint64_t From, uint64_t To) const { for (const auto &I : Data) { @@ -195,8 +210,12 @@ std::error_code DataReader::parse() { I = GetOrCreateFuncEntry(BI.To.Name); I->getValue().ExecutionCount += BI.Branches; } + } + for (auto &FuncBranches : FuncsMap) { + std::sort(FuncBranches.second.Data.begin(), FuncBranches.second.Data.end()); } + return std::error_code(); } diff --git 
a/bolt/DataReader.h b/bolt/DataReader.h index 88f087306a7d..a921a32f3066 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -96,6 +96,7 @@ struct FuncBranchData { FuncBranchData(StringRef Name, ContainerTy Data, ContainerTy EntryData) : Name(Name), Data(std::move(Data)), EntryData(std::move(EntryData)) {} + iterator_range getBranchRange(uint64_t From) const; ErrorOr getBranch(uint64_t From, uint64_t To) const; }; From 74a11ad37e71fc6c85ef369aa4a658e5f67b8970 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 8 Jun 2016 17:38:13 -0700 Subject: [PATCH 124/904] Revert "Indirect call optimization." This reverts commit 33966090e18545b64013614e7929ff1bdcdf10d5. (cherry picked from commit d5f2193fe9ed95414363f8a07554dcf58cc066b5) --- bolt/BinaryBasicBlock.h | 5 - bolt/BinaryFunction.cpp | 53 +------ bolt/BinaryFunction.h | 40 +---- bolt/BinaryPassManager.cpp | 10 -- bolt/BinaryPassManager.h | 2 +- bolt/BinaryPasses.cpp | 306 ------------------------------------- bolt/BinaryPasses.h | 8 - bolt/DataReader.cpp | 19 --- bolt/DataReader.h | 1 - 9 files changed, 11 insertions(+), 433 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index cb2009be8deb..07962f99d5bb 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -290,11 +290,6 @@ class BinaryBasicBlock { return Offset; } - /// Set offset of the basic block from the function start. - void setOffset(uint64_t newOffset) { - Offset = newOffset; - } - /// Adds block to successor list, and also updates predecessor list for /// successor block. /// Set branch info for this path. 
diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index e7ceddc8ebfa..b454de3fd526 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -220,11 +220,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << "\n"; return; } - if (!BC.MIA->isUnsupported(Instruction)) { - BC.InstPrinter->printInst(&Instruction, OS, "", *BC.STI); - } else { - OS << "unsupported (probably jmpr)"; - } + BC.InstPrinter->printInst(&Instruction, OS, "", *BC.STI); if (BC.MIA->isCall(Instruction)) { if (BC.MIA->isTailCall(Instruction)) OS << " # TAILCALL "; @@ -546,6 +542,12 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { } } + Instruction.clear(); + Instruction.addOperand( + MCOperand::createExpr( + MCSymbolRefExpr::create(TargetSymbol, + MCSymbolRefExpr::VK_None, + *Ctx))); if (!IsCall) { // Add local branch info. LocalBranches.push_back({Offset, TargetOffset}); @@ -554,54 +556,15 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // Add fallthrough branch info. FTBranches.push_back({Offset, Offset + Size}); } - - if (IsCall || !IsCondBranch) { - if (MIA->isIndirectBranch(Instruction)) { -#if 0 - dbgs() << "Indirect call/branch @ " - << Twine::utohexstr(Offset) << "\n"; -#endif - NonLocalIndirectBranches.push_back(Offset); - } - } - - Instruction.clear(); - Instruction.addOperand( - MCOperand::createExpr( - MCSymbolRefExpr::create(TargetSymbol, - MCSymbolRefExpr::VK_None, - *Ctx))); } else { - if (MIA->isCall(Instruction)) { -#if 0 - dbgs() << getName() << ": indirect call/branch @ " - << Twine::utohexstr(Offset) << "\n"; -#endif - NonLocalIndirectBranches.push_back(Offset); - } - // Should be an indirect call or an indirect branch. Bail out on the // latter case. if (MIA->isIndirectBranch(Instruction)) { DEBUG(dbgs() << "BOLT-WARNING: indirect branch detected at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ". 
Skipping function " << getName() << ".\n"); - if (!MIA->isConditionalBranch(Instruction)) { -#if 0 - dbgs() << getName() << ": indirect call/branch @ " - << Twine::utohexstr(Offset) << "\n"; -#endif - NonLocalIndirectBranches.push_back(Offset); - - MCInst tmp(Instruction); - if (1 || !MIA->isTerminator(tmp) || !MIA->convertJmpToTailCall(tmp)) { - IsSimple = false; - } - } else { - IsSimple = false; - } + IsSimple = false; } - // Indirect call. We only need to fix it if the operand is RIP-relative if (MIA->hasRIPOperand(Instruction)) { if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index ed8e1bc6b1dd..2f196f04a4b4 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -19,7 +19,6 @@ #include "BinaryBasicBlock.h" #include "BinaryContext.h" -#include "DataReader.h" #include "DebugData.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/ilist.h" @@ -37,7 +36,6 @@ #include "llvm/Support/raw_ostream.h" #include #include -#include #include using namespace llvm::object; @@ -237,10 +235,6 @@ class BinaryFunction : public AddressRangesOwner { using LandingPadsMapType = std::map >; LandingPadsMapType LPToBBIndex; - /// Storage for non-local branches - using NonLocalIndirectBranchesListType = std::vector; - NonLocalIndirectBranchesListType NonLocalIndirectBranches; - /// Map offset in the function to a local label. using LabelsMapType = std::map; LabelsMapType Labels; @@ -294,7 +288,7 @@ class BinaryFunction : public AddressRangesOwner { // Map that keeps track of the index of each basic block in the BasicBlocks // vector. Used to make getIndex fast. - std::unordered_map BasicBlockIndices; + std::map BasicBlockIndices; // At each basic block entry we attach a CFI state to detect if reordering // corrupts the CFI state for a block. 
The CFI state is simply the index in @@ -342,28 +336,6 @@ class BinaryFunction : public AddressRangesOwner { typedef BasicBlockOrderType::iterator order_iterator; typedef BasicBlockOrderType::const_iterator const_order_iterator; - typedef NonLocalIndirectBranchesListType::iterator nlib_iterator; - typedef NonLocalIndirectBranchesListType::const_iterator const_nlib_iterator; - - nlib_iterator begin_nlibs() { - return NonLocalIndirectBranches.begin(); - } - const_nlib_iterator begin_nlibs() const { - return NonLocalIndirectBranches.begin(); - } - nlib_iterator end_nlibs() { - return NonLocalIndirectBranches.end(); - } - const_nlib_iterator end_nlibs() const { - return NonLocalIndirectBranches.end(); - } - inline iterator_range nlibs() { - return iterator_range(begin_nlibs(), end_nlibs()); - } - inline iterator_range nlibs() const { - return iterator_range(begin_nlibs(), end_nlibs()); - } - // CFG iterators. iterator begin() { return BasicBlocks.begin(); } const_iterator begin() const { return BasicBlocks.begin(); } @@ -553,8 +525,7 @@ class BinaryFunction : public AddressRangesOwner { /// Returns NULL if basic block already exists at the \p Offset. BinaryBasicBlock *addBasicBlock(uint64_t Offset, MCSymbol *Label, bool DeriveAlignment = false) { - assert(CurrentState == State::CFG || - (!getBasicBlockAtOffset(Offset) && "basic block already exists")); + assert(!getBasicBlockAtOffset(Offset) && "basic block already exists"); assert(BC.Ctx && "cannot be called with empty context"); if (!Label) Label = BC.Ctx->createTempSymbol("BB", true); @@ -585,13 +556,6 @@ class BinaryFunction : public AddressRangesOwner { return nullptr; } - void updateLayout(BinaryBasicBlock* start, - const std::vector& newBBs) { - BasicBlocksLayout.insert(BasicBlocksLayout.begin() + getIndex(start) + 1, - newBBs.begin(), - newBBs.end()); - } - /// Return basic block that originally contained offset \p Offset /// from the function start. 
BinaryBasicBlock *getBasicBlockContainingOffset(uint64_t Offset); diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index ed444af4f52c..da78727fab39 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -24,13 +24,6 @@ OptimizeBodylessFunctions( llvm::cl::desc("optimize functions that just do a tail call"), llvm::cl::Optional); -static llvm::cl::opt -OptimizeIndirectBranches( - "optimize-indirect-branches", - llvm::cl::desc("optimize indirect branches"), - llvm::cl::init(true), - llvm::cl::Optional); - static llvm::cl::opt InlineSmallFunctions( "inline-small-functions", @@ -72,9 +65,6 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(std::move(llvm::make_unique())); - Manager.registerPass(llvm::make_unique(), - opts::OptimizeIndirectBranches); - Manager.registerPass(llvm::make_unique(), opts::SimplifyConditionalTailCalls); diff --git a/bolt/BinaryPassManager.h b/bolt/BinaryPassManager.h index b5b54fc5d482..5875a99b5a7e 100644 --- a/bolt/BinaryPassManager.h +++ b/bolt/BinaryPassManager.h @@ -66,7 +66,7 @@ class BinaryFunctionPassManager { /// Runs all enabled implemented passes on all functions. 
static void runAllPasses(BinaryContext &BC, std::map &Functions, - std::set &LargeFunctions); + std::set &largeFunctions); }; diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index 8ec7623c0fff..5cc2bc6691e8 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -23,21 +23,6 @@ extern llvm::cl::opt PrintUCE; extern llvm::cl::opt SplitFunctions; extern bool shouldProcess(const llvm::bolt::BinaryFunction &Function); -static llvm::cl::opt -OptimizeIndirectBranchesThreshold( - "optimize-indirect-branches-threshold", - llvm::cl::desc("threshold for optimizing a frequently taken indirect call"), - llvm::cl::init(90), - llvm::cl::Optional); - -static llvm::cl::opt -OptimizeIndirectBranchesTopN( - "optimize-indirect-branches-topn", - llvm::cl::desc("number of targets to consider when doing indirect " - "branch optimization"), - llvm::cl::init(2), - llvm::cl::Optional); - static llvm::cl::opt ReorderBlocks( "reorder-blocks", @@ -542,296 +527,5 @@ void SimplifyConditionalTailCalls::runOnFunctions( << " from a total of " << NumTailCallCandidates << "\n"; } -namespace { - -template -void printInstruction(S& OS, BinaryContext& BC, const MCInst &Instruction, bool printMCInst = false) { - if (!BC.MIA->isUnsupported(Instruction)) { - BC.InstPrinter->printInst(&Instruction, OS, "", *BC.STI); - } else { - OS << "unsupported (probably jmpr)"; - } - OS << "\n"; - if (printMCInst) { - Instruction.dump_pretty(OS, BC.InstPrinter.get()); - OS << "\n"; - } -} - -template -uint64_t computeCodeSize(BinaryContext& BC, Itr beg, Itr end) { - uint64_t size = 0; - while (beg != end) { - // Calculate the size of the instruction. - // Note: this is imprecise since happening prior to relaxation. 
- SmallString<256> Code; - SmallVector Fixups; - raw_svector_ostream VecOS(Code); - printInstruction(dbgs(), BC, *beg, false); - BC.MCE->encodeInstruction(*beg++, VecOS, Fixups, *BC.STI); - size += Code.size(); - } - return size; -} - -} - -void OptimizeIndirectBranches::runOnFunctions( - BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions -) { - uint64_t TotalBranches = 0; - uint64_t TotalIndirectCalls = 0; - uint64_t TotalIndirectCallsites = 0; - uint64_t TotalIndirectCandidateCalls = 0; - for (auto &BFIt : BFs) { - auto &Function = BFIt.second; - - if (!Function.isSimple() || !opts::shouldProcess(Function)) - continue; - - auto BranchDataOrErr = BC.DR.getFuncBranchData(Function.getName()); - if (std::error_code EC = BranchDataOrErr.getError()) { - DEBUG(dbgs() << "no branch data found for \"" - << Function.getName() << "\"\n"); - continue; - } - const FuncBranchData &BranchData = BranchDataOrErr.get(); - - // Note: this is not just counting calls. - TotalBranches += BranchData.ExecutionCount; - - uint64_t Total = 0; - for (auto &nlib : Function.nlibs()) { - auto Branches = BranchData.getBranchRange(nlib); - for (auto &BInfo : Branches) { - Total += BInfo.Branches; - } - std::vector targets; - for (auto &BInfo : Branches) { - targets.push_back(BInfo); - } - - std::sort(targets.begin(), targets.end(), - [](const BranchInfo& a, const BranchInfo& b) { - return a.Branches > b.Branches; - }); - - if (!targets.empty()) { - uint64_t TopNBranches = 0; - - const int NumTargets = std::distance(targets.begin(), targets.end()); - const int N = std::min(int(opts::OptimizeIndirectBranchesTopN), - NumTargets); - - for (int i = 0; i < N; ++i) { - TopNBranches += targets[i].Branches; - } - - const double TopNFrequency = 100.0 * TopNBranches / Total; - - if (TopNFrequency >= opts::OptimizeIndirectBranchesThreshold) { - double Threshold = double(opts::OptimizeIndirectBranchesThreshold); - bool Separator = false; - - dbgs() << "BOLT: candidate branch info: " - << 
Function.getName() << " @ " << nlib - << " -> "; - - for (int i = 0; i < N && Threshold > 0; i++) { - const auto Frequency = 100.0 * targets[i].Branches / Total; - if (Separator) { - dbgs() << " | "; - } - Separator = true; - dbgs() << targets[i].To.Name - << ", count = " << targets[i].Branches - << ", mispreds = " << targets[i].Mispreds - << ", freq = " << (int)Frequency << "%"; - TotalIndirectCandidateCalls += targets[i].Branches; - Threshold -= Frequency; - } - dbgs() << "\n"; - - //assert(!targets[0].From.IsSymbol); - auto IndCallBlock = - Function.getBasicBlockContainingOffset(targets[0].From.Offset); - -#if 0 - // scan insts for jump (use analyze?) - const MCSymbol *TBB = nullptr; - const MCSymbol *FBB = nullptr; - MCInst *CondBranch = nullptr; - MCInst *UncondBranch = nullptr; - bool Found = MIA->analyzeBranch(IndCallBlock->Instructions, - TBB, - FBB, - CondBranch, - UncondBranch); - assert(Found); - // how to assert that UncondBranch is the one we want? - assert(UncondBranch != nullptr); -#else - MCInst* CallInst = nullptr; - uint64_t InstOffset{RoundUpToAlignment(IndCallBlock->getOffset(), - IndCallBlock->getAlignment())}; - - size_t CallInstIdx = 0; - for (auto &Instr : *IndCallBlock) { - // Calculate the size of the instruction. - // Note: this is imprecise since happening prior to relaxation. - SmallString<256> Code; - SmallVector Fixups; - raw_svector_ostream VecOS(Code); - BC.MCE->encodeInstruction(Instr, VecOS, Fixups, *BC.STI); - if (InstOffset == targets[0].From.Offset) { - CallInst = &Instr; - } - ++CallInstIdx; - InstOffset += Code.size(); - } - assert(CallInst); -#endif - - std::vector Targets; - for (int i = 0; i < N; ++i) { - assert(targets[i].To.IsSymbol); - // Is this right? 
lookupSym doesn't always return a result - auto Symbol = BC.Ctx->getOrCreateSymbol(targets[i].To.Name); - assert(Symbol); - Targets.push_back(Symbol); - } - - MCInst* SourceInst = CallInst; // for now -#if 0 - for (auto &Instr : *IndCallBlock) { - if (&Instr == CallInst) break; - if (Instr.getNumOperands() > 0) { - printInstruction(dbgs(), BC, Instr, true); - for (unsigned int i = 0; i < Instr.getNumOperands(); ++i) { - auto &Operand = Instr.getOperand(i); - dbgs() << "isreg("<< i << ") = " << Operand.isReg() << "\n"; - dbgs() << "isexpr(" << i << ") = " << Operand.isExpr() << "\n"; - SourceInst = &Instr; // WRONG - } - } - if (&Instr == CallInst) break; - } - dbgs() << "-----------\n"; - assert(SourceInst); -#endif - - auto ICPcode = BC.MIA->indirectCallPromotion( - *SourceInst, // == CallInst for now - *CallInst, - Targets, - BC.Ctx.get()); - - if (!ICPcode.empty()) { - for (auto &entry : ICPcode) { - auto &Sym = entry.first; - auto &Insts = entry.second; - if (Sym) dbgs() << Sym->getName() << ":\n"; - for (auto &Instr : Insts) { - printInstruction(dbgs(), BC, Instr, false); - } - } - - // create new bbs with correct code in each one - // first - auto oldSuccRange = IndCallBlock->successors(); - std::vector oldSucc(oldSuccRange.begin(), oldSuccRange.end()); - BinaryBasicBlock* LastBlock = IndCallBlock; - BinaryBasicBlock* MergeBlock = nullptr; - std::vector newBBs; - - assert(!BC.MIA->isTailCall(*CallInst) || oldSucc.empty()); - - // Remove all successors from block doing the indirect call. 
- for (auto succ : oldSucc) { - IndCallBlock->removeSuccessor(succ); - } - assert(IndCallBlock->succ_empty()); - - dbgs() << "IndCallBlock = " << IndCallBlock << "\n"; - - if (ICPcode.back().second.empty()) { // merge block - // Create BB for merge block following old call - - uint64_t total = 0; - for (auto &entry : ICPcode) { - total += computeCodeSize(BC, entry.second.begin(), entry.second.end()); - } - - // adjust all other blocks by total - for (auto &BB : Function) { - if (BB.getOffset() > IndCallBlock->getOffset()) { - BB.setOffset(BB.getOffset() + total); - } - } - - //dbgs() << "total = " << total << "\n"; - //dbgs() << "InstOffset = " << InstOffset << "\n"; - MergeBlock = Function.addBasicBlock(total + InstOffset, ICPcode.back().first); - newBBs.push_back(MergeBlock); - for (auto succ : oldSucc) { - MergeBlock->addSuccessor(succ); - } - dbgs() << "MergeBlock = " << MergeBlock << "\n"; - - // Move instructions from the tail of the original call block - // to the merge block. - std::vector MovedInst; - - while(&IndCallBlock->back() != CallInst) { - auto &lastInst = IndCallBlock->back(); - MovedInst.push_back(lastInst); - IndCallBlock->eraseInstruction(&lastInst); - } - IndCallBlock->eraseInstruction(CallInst); - - for (auto itr = MovedInst.rbegin(); itr != MovedInst.rend(); ++itr) { - MergeBlock->addInstruction(*itr); - } - - ICPcode.pop_back(); // remove merge block - } - - for (auto &entry : ICPcode) { - auto &Sym = entry.first; - auto &Insts = entry.second; - if (Sym) { - auto TBB = Function.addBasicBlock(InstOffset, Sym); - newBBs.push_back(TBB); - LastBlock->addSuccessor(TBB); - LastBlock = TBB; - InstOffset += computeCodeSize(BC, Insts.begin(), Insts.end()); - dbgs() << "TBB = " << TBB << "\n"; - } - for (auto &Inst : Insts) { - LastBlock->addInstruction(Inst); - } - if (MergeBlock) LastBlock->addSuccessor(MergeBlock); - } - - // update BBlayout in Function, XXX is this right? 
- Function.updateLayout(IndCallBlock, newBBs); - } - } - } - - ++TotalIndirectCallsites; - } - TotalIndirectCalls += Total; - } - - dbgs() << "BOLT: total indirect callsites/candidate calls/calls/branches = " - << TotalIndirectCallsites << "/" - << TotalIndirectCandidateCalls << "/" - << TotalIndirectCalls << "/" - << TotalBranches << "\n"; -} - } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryPasses.h b/bolt/BinaryPasses.h index 141cafb64822..fd224f2bf200 100644 --- a/bolt/BinaryPasses.h +++ b/bolt/BinaryPasses.h @@ -148,14 +148,6 @@ class SimplifyConditionalTailCalls : public BinaryFunctionPass { std::set &LargeFunctions) override; }; -/// Optimize indirect calls. -class OptimizeIndirectBranches : public BinaryFunctionPass { - public: - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; -}; - } // namespace bolt } // namespace llvm diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index 7b14f62c2b87..967b4ef2e085 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -18,21 +18,6 @@ namespace llvm { namespace bolt { -iterator_range -FuncBranchData::getBranchRange(uint64_t From) const { - assert(std::is_sorted(Data.begin(), Data.end())); - struct Compare { - bool operator()(const BranchInfo &BI, const uint64_t val) const { - return BI.From.Offset < val; - } - bool operator()(const uint64_t val, const BranchInfo &BI) const { - return val < BI.From.Offset; - } - }; - auto Range = std::equal_range(Data.begin(), Data.end(), From, Compare()); - return iterator_range(Range.first, Range.second); -} - ErrorOr FuncBranchData::getBranch(uint64_t From, uint64_t To) const { for (const auto &I : Data) { @@ -210,12 +195,8 @@ std::error_code DataReader::parse() { I = GetOrCreateFuncEntry(BI.To.Name); I->getValue().ExecutionCount += BI.Branches; } - } - for (auto &FuncBranches : FuncsMap) { - std::sort(FuncBranches.second.Data.begin(), FuncBranches.second.Data.end()); } - return std::error_code(); } diff --git 
a/bolt/DataReader.h b/bolt/DataReader.h index a921a32f3066..88f087306a7d 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -96,7 +96,6 @@ struct FuncBranchData { FuncBranchData(StringRef Name, ContainerTy Data, ContainerTy EntryData) : Name(Name), Data(std::move(Data)), EntryData(std::move(EntryData)) {} - iterator_range getBranchRange(uint64_t From) const; ErrorOr getBranch(uint64_t From, uint64_t To) const; }; From a667d934605c638ebc63f07a27477fc483faebc4 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 9 Jun 2016 17:45:15 -0700 Subject: [PATCH 125/904] [merge-fdata] Add option to print function list. Summary: Print total number of functions/objects that have profile and add new options: -print - print the list of objects with count to stderr =none - do not print objects/functions =exec - print functions sorted by execution count =branches - print functions sorted by total branch count -q - do not print merged data to stdout (cherry picked from commit e987f38aac488dd76a48c934ea74f6cf3c92ef5f) --- bolt/merge-fdata/merge-fdata.cpp | 89 ++++++++++++++++++++++++++++---- 1 file changed, 80 insertions(+), 9 deletions(-) diff --git a/bolt/merge-fdata/merge-fdata.cpp b/bolt/merge-fdata/merge-fdata.cpp index cab715648faf..4f9e292ef55e 100644 --- a/bolt/merge-fdata/merge-fdata.cpp +++ b/bolt/merge-fdata/merge-fdata.cpp @@ -27,12 +27,40 @@ using namespace bolt; namespace opts { +enum SortType : char { + ST_NONE, + ST_EXEC_COUNT, /// Sort based on function execution count. + ST_TOTAL_BRANCHES, /// Sort based on all branches in the function. 
+}; + static cl::list InputDataFilenames(cl::Positional, cl::CommaSeparated, cl::desc(" []..."), cl::OneOrMore); +static cl::opt +SuppressMergedDataOutput("q", + cl::desc("do not print merged data to stdout"), + cl::init(false), + cl::Optional); + +static cl::opt +PrintFunctionList( + "print", + cl::desc("print the list of objects with count to stderr"), + cl::init(ST_NONE), + cl::values(clEnumValN(ST_NONE, + "none", + "do not print objects/functions"), + clEnumValN(ST_EXEC_COUNT, + "exec", + "print functions sorted by execution count"), + clEnumValN(ST_TOTAL_BRANCHES, + "branches", + "print functions sorted by total branch count"), + clEnumValEnd)); + } // namespace opts static StringRef ToolName; @@ -98,6 +126,7 @@ int main(int argc, char **argv) { for (auto &FI : ReaderOrErr.get()->getAllFuncsData()) { auto MI = MergedFunctionsData.find(FI.second.Name); if (MI != MergedFunctionsData.end()) { + MI->second.ExecutionCount += FI.second.ExecutionCount; std::vector TmpBI; for (auto &BI : FI.second.Data) { // Find and merge a corresponding entry or copy data. @@ -132,6 +161,7 @@ int main(int argc, char **argv) { std::make_pair(*NamePtr, FuncBranchData(*NamePtr, FuncBranchData::ContainerTy()))); + MI->second.ExecutionCount = FI.second.ExecutionCount; // Copy with string conversion while eliminating duplicates. 
std::sort(FI.second.Data.begin(), FI.second.Data.end()); BranchInfo *PrevBI = nullptr; @@ -148,18 +178,59 @@ int main(int argc, char **argv) { } } - // Print all the data in the original format - for (auto &FDI : MergedFunctionsData) { - for (auto &BD : FDI.second.Data) { - outs() << BD.From.IsSymbol << " " << FDI.first() << " " - << Twine::utohexstr(BD.From.Offset) << " " - << BD.To.IsSymbol << " " << BD.To.Name << " " - << Twine::utohexstr(BD.To.Offset) << " " - << BD.Mispreds << " " << BD.Branches << '\n'; + if (!opts::SuppressMergedDataOutput) { + // Print all the data in the original format + for (auto &FDI : MergedFunctionsData) { + for (auto &BD : FDI.second.Data) { + outs() << BD.From.IsSymbol << " " << FDI.first() << " " + << Twine::utohexstr(BD.From.Offset) << " " + << BD.To.IsSymbol << " " << BD.To.Name << " " + << Twine::utohexstr(BD.To.Offset) << " " + << BD.Mispreds << " " << BD.Branches << '\n'; + } } } - errs() << "All data merged successfully.\n"; + errs() << "Data for " << MergedFunctionsData.size() + << " unique objects successfully merged.\n"; + + if (opts::PrintFunctionList != opts::ST_NONE) { + // List of function names with execution count. + std::vector> + FunctionList(MergedFunctionsData.size()); + using CountFuncType = + std::function( + const StringMapEntry&)>; + CountFuncType ExecCountFunc = [](const StringMapEntry &v) { + return std::make_pair(v.second.ExecutionCount, + v.second.Name); + }; + CountFuncType BranchCountFunc = [](const StringMapEntry &v){ + // Return total branch count. + uint64_t BranchCount = 0; + for (const auto &BI : v.second.Data) + BranchCount += BI.Branches; + return std::make_pair(BranchCount, + v.second.Name); + }; + + CountFuncType CountFunc = (opts::PrintFunctionList == opts::ST_EXEC_COUNT) + ? 
ExecCountFunc + : BranchCountFunc; + std::transform(MergedFunctionsData.begin(), + MergedFunctionsData.end(), + FunctionList.begin(), + CountFunc); + std::stable_sort(FunctionList.rbegin(), FunctionList.rend()); + errs() << "Functions sorted by " + << (opts::PrintFunctionList == opts::ST_EXEC_COUNT + ? "execution" + : "total branch") + << " count:\n"; + for (auto &FI : FunctionList) { + errs() << FI.second << " : " << FI.first << '\n'; + } + } AllStrings.clear(); From c1cef599752202abe741d609201b7d8938df1088 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 15 Jun 2016 18:36:16 -0700 Subject: [PATCH 126/904] Reject profile data for functions that do not match. Summary: Verify profile data for a function and reject if there are branches that don't correspond to any branches in the function CFG. Note that we have to ignore branches resulting from recursive calls. Fix printing instruction offsets in disassembled state. Allow function to have non-zero execution count even if we don't have branch information. 
(cherry picked from commit 193b57a36941c71d8ab5646e52e2ee7dfb5ac123) --- bolt/BinaryFunction.cpp | 148 +++++++++++++++++++++++++++++++++------ bolt/BinaryFunction.h | 76 +++++++++----------- bolt/RewriteInstance.cpp | 29 +++++--- 3 files changed, 181 insertions(+), 72 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index b454de3fd526..954c4aa2a249 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -166,8 +166,10 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, } if (ImageAddress) OS << "\n Image : 0x" << Twine::utohexstr(ImageAddress); - if (ExecutionCount != COUNT_NO_PROFILE) + if (ExecutionCount != COUNT_NO_PROFILE) { OS << "\n Exec Count : " << ExecutionCount; + OS << "\n Profile Acc : " << format("%.1f%%", ProfileMatchRatio * 100.0f); + } OS << "\n}\n"; @@ -262,7 +264,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, if (BasicBlocks.empty() && !Instructions.empty()) { // Print before CFG was built. for (const auto &II : Instructions) { - auto Offset = II.first; + Offset = II.first; // Print label if exists at this offset. auto LI = Labels.find(Offset); @@ -549,8 +551,8 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { MCSymbolRefExpr::VK_None, *Ctx))); if (!IsCall) { - // Add local branch info. - LocalBranches.push_back({Offset, TargetOffset}); + // Add taken branch info. + TakenBranches.push_back({Offset, TargetOffset}); } if (IsCondBranch) { // Add fallthrough branch info. 
@@ -611,11 +613,10 @@ bool BinaryFunction::buildCFG() { auto &MIA = BC.MIA; auto BranchDataOrErr = BC.DR.getFuncBranchData(getName()); - if (std::error_code EC = BranchDataOrErr.getError()) { + if (!BranchDataOrErr) { DEBUG(dbgs() << "no branch data found for \"" << getName() << "\"\n"); } else { - if (!BranchDataOrErr.get().Data.empty()) - ExecutionCount = BranchDataOrErr.get().ExecutionCount; + ExecutionCount = BranchDataOrErr->ExecutionCount; } if (!isSimple()) @@ -736,7 +737,11 @@ bool BinaryFunction::buildCFG() { // e.g. exit(3), etc. Otherwise we'll see a false fall-through // blocks. - for (auto &Branch : LocalBranches) { + // Make sure we can use profile data for this function. + if (BranchDataOrErr) + evaluateProfileData(BranchDataOrErr.get()); + + for (auto &Branch : TakenBranches) { DEBUG(dbgs() << "registering branch [0x" << Twine::utohexstr(Branch.first) << "] -> [0x" << Twine::utohexstr(Branch.second) << "]\n"); BinaryBasicBlock *FromBB = getBasicBlockContainingOffset(Branch.first); @@ -783,7 +788,7 @@ bool BinaryFunction::buildCFG() { } // Add fall-through branches (except for non-taken conditional branches with - // profile data, which were already accounted for in LocalBranches). + // profile data, which were already accounted for in TakenBranches). PrevBB = nullptr; bool IsPrevFT = false; // Is previous block a fall-through. for (auto BB : BasicBlocks) { @@ -830,20 +835,19 @@ bool BinaryFunction::buildCFG() { } // Infer frequency for non-taken branches - if (ExecutionCount != COUNT_NO_PROFILE && !BranchDataOrErr.getError()) { + if (hasValidProfile()) inferFallThroughCounts(); - } // Update CFI information for each BB annotateCFIState(); // Clean-up memory taken by instructions and labels. 
- clearInstructions(); - clearCFIOffsets(); - clearLabels(); - clearLocalBranches(); - clearFTBranches(); - clearLPToBBIndex(); + clearList(Instructions); + clearList(OffsetToCFI); + clearList(Labels); + clearList(TakenBranches); + clearList(FTBranches); + clearList(LPToBBIndex); // Update the state. CurrentState = State::CFG; @@ -854,6 +858,108 @@ bool BinaryFunction::buildCFG() { return true; } +void BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { + BranchListType ProfileBranches(BranchData.Data.size()); + std::transform(BranchData.Data.begin(), + BranchData.Data.end(), + ProfileBranches.begin(), + [](const BranchInfo &BI) { + return std::make_pair(BI.From.Offset, + BI.To.Name == BI.From.Name ? + BI.To.Offset : -1U); + }); + BranchListType LocalProfileBranches; + std::copy_if(ProfileBranches.begin(), + ProfileBranches.end(), + std::back_inserter(LocalProfileBranches), + [](const std::pair &Branch) { + return Branch.second != -1U; + }); + + // Until we define a minimal profile, we consider no branch data to be a valid + // profile. It could happen to a function without branches. + if (LocalProfileBranches.empty()) { + ProfileMatchRatio = 1.0f; + return; + } + + std::sort(LocalProfileBranches.begin(), LocalProfileBranches.end()); + + BranchListType FunctionBranches = TakenBranches; + FunctionBranches.insert(FunctionBranches.end(), + FTBranches.begin(), + FTBranches.end()); + std::sort(FunctionBranches.begin(), FunctionBranches.end()); + + BranchListType DiffBranches; // Branches in profile without a match. + std::set_difference(LocalProfileBranches.begin(), + LocalProfileBranches.end(), + FunctionBranches.begin(), + FunctionBranches.end(), + std::back_inserter(DiffBranches)); + + // Branches without a match in CFG. + BranchListType OrphanBranches; + + // Eliminate recursive calls and returns from recursive calls from the list + // of branches that have no match. They are not considered local branches. 
+ auto isRecursiveBranch = [&](std::pair &Branch) { + auto SrcInstrI = Instructions.find(Branch.first); + if (SrcInstrI == Instructions.end()) + return false; + + // Check if it is a recursive call. + if (BC.MIA->isCall(SrcInstrI->second) && Branch.second == 0) + return true; + + auto DstInstrI = Instructions.find(Branch.second); + if (DstInstrI == Instructions.end()) + return false; + + // Check if it is a return from a recursive call. + bool IsSrcReturn = BC.MIA->isReturn(SrcInstrI->second); + // "rep ret" is considered to be 2 different instructions. + if (!IsSrcReturn && BC.MIA->isPrefix(SrcInstrI->second)) { + auto SrcInstrSuccessorI = SrcInstrI; + ++SrcInstrSuccessorI; + assert(SrcInstrSuccessorI != Instructions.end() && + "unexpected prefix instruction at the end of function"); + IsSrcReturn = BC.MIA->isReturn(SrcInstrSuccessorI->second); + } + if (IsSrcReturn && Branch.second != 0) { + // Make sure the destination follows the call instruction. + auto DstInstrPredecessorI = DstInstrI; + --DstInstrPredecessorI; + assert(DstInstrPredecessorI != Instructions.end() && "invalid iterator"); + if (BC.MIA->isCall(DstInstrPredecessorI->second)) + return true; + } + return false; + }; + std::remove_copy_if(DiffBranches.begin(), + DiffBranches.end(), + std::back_inserter(OrphanBranches), + isRecursiveBranch); + + ProfileMatchRatio = + (float) (LocalProfileBranches.size() - OrphanBranches.size()) / + (float) LocalProfileBranches.size(); + + if (!OrphanBranches.empty()) { + errs() << "BOLT-WARNING: profile branches match only " + << format("%.1f%%", ProfileMatchRatio * 100.0f) << " (" + << (LocalProfileBranches.size() - OrphanBranches.size()) << '/' + << LocalProfileBranches.size() << ") for function " + << getName() << '\n'; + DEBUG( + for (auto &OBranch : OrphanBranches) + errs() << "\t0x" << Twine::utohexstr(OBranch.first) << " -> 0x" + << Twine::utohexstr(OBranch.second) << " (0x" + << Twine::utohexstr(OBranch.first + getAddress()) << " -> 0x" + ); + } +} + void 
BinaryFunction::inferFallThroughCounts() { assert(!BasicBlocks.empty() && "basic block list should not be empty"); @@ -881,7 +987,7 @@ void BinaryFunction::inferFallThroughCounts() { } } - // Udate execution counts of landing pad blocks. + // Update execution counts of landing pad blocks. if (!BranchDataOrErr.getError()) { const FuncBranchData &BranchData = BranchDataOrErr.get(); for (const auto &I : BranchData.EntryData) { @@ -893,7 +999,7 @@ void BinaryFunction::inferFallThroughCounts() { } // Work on a basic block at a time, propagating frequency information forwards - // It is important to walk in the layour order + // It is important to walk in the layout order for (auto CurBB : BasicBlocks) { uint64_t BBExecCount = CurBB->getExecutionCount(); @@ -1036,7 +1142,7 @@ bool BinaryFunction::fixCFIState() { // without using the state stack. Not sure if it is worth the effort // because this happens rarely. if (NestedLevel != 0) { - errs() << "BOLT-WARNING: CFI rewriter detected nested CFI state while " + errs() << "BOLT-WARNING: CFI rewriter detected nested CFI state while" << " replaying CFI instructions for BB " << InBB->getName() << " in function " << getName() << '\n'; return false; @@ -1157,7 +1263,7 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool Split) { } // Cannot do optimal layout without profile. - if (getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE) + if (!hasValidProfile()) return; // Work on optimal solution if problem is small enough diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 2f196f04a4b4..8359549bc51b 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -19,6 +19,7 @@ #include "BinaryBasicBlock.h" #include "BinaryContext.h" +#include "DataReader.h" #include "DebugData.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/ilist.h" @@ -158,6 +159,9 @@ class BinaryFunction : public AddressRangesOwner { /// The profile data for the number of times the function was executed. 
uint64_t ExecutionCount{COUNT_NO_PROFILE}; + /// Profile match ration. + float ProfileMatchRatio{0.0}; + /// Score of the function (estimated number of instructions executed, /// according to profile data). -1 if the score has not been calculated yet. int64_t FunctionScore{-1}; @@ -177,44 +181,10 @@ class BinaryFunction : public AddressRangesOwner { /// the output binary. uint32_t AddressRangesOffset{-1U}; - /// Release storage used by instructions. - BinaryFunction &clearInstructions() { - InstrMapType TempMap; - Instructions.swap(TempMap); - return *this; - } - - /// Release storage used by CFI offsets map. - BinaryFunction &clearCFIOffsets() { - std::multimap TempMap; - OffsetToCFI.swap(TempMap); - return *this; - } - - /// Release storage used by instructions. - BinaryFunction &clearLabels() { - LabelsMapType TempMap; - Labels.swap(TempMap); - return *this; - } - - /// Release memory taken by local branch info. - BinaryFunction &clearLocalBranches() { - LocalBranchesListType TempList; - LocalBranches.swap(TempList); - return *this; - } - - BinaryFunction &clearFTBranches() { - LocalBranchesListType TempList; - FTBranches.swap(TempList); - return *this; - } - - /// Release memory taken by landing pad info. - BinaryFunction &clearLPToBBIndex() { - LandingPadsMapType TempMap; - LPToBBIndex.swap(TempMap); + /// Release memory taken by the list. + template BinaryFunction &clearList(T& List) { + T TempList; + TempList.swap(List); return *this; } @@ -223,13 +193,14 @@ class BinaryFunction : public AddressRangesOwner { return *this; } + /// Return basic block that originally was laid out immediately following + /// the given /p BB basic block. const BinaryBasicBlock * getOriginalLayoutSuccessor(const BinaryBasicBlock *BB) const; - /// Storage for all local branches in the function (non-fall-throughs). 
- using LocalBranchesListType = std::vector>; - LocalBranchesListType LocalBranches; - LocalBranchesListType FTBranches; + using BranchListType = std::vector>; + BranchListType TakenBranches; /// All local taken branches. + BranchListType FTBranches; /// All fall-through branches. /// Storage for all landing pads and their corresponding invokes. using LandingPadsMapType = std::map >; @@ -574,6 +545,21 @@ class BinaryFunction : public AddressRangesOwner { Instructions.emplace(Offset, std::forward(Instruction)); } + /// Return instruction at a given offset in the function. Valid before + /// CFG is constructed. + MCInst *getInstructionAtOffset(uint64_t Offset) { + assert(CurrentState == State::Disassembled && + "can only call function in Disassembled state"); + auto II = Instructions.find(Offset); + return (II == Instructions.end()) ? nullptr : &II->second; + } + + /// Return true if function profile is present and accurate. + bool hasValidProfile() { + return ExecutionCount != COUNT_NO_PROFILE && + ProfileMatchRatio == 1.0f; + } + void addCFIInstruction(uint64_t Offset, MCCFIInstruction &&Inst) { assert(!Instructions.empty()); @@ -747,7 +733,7 @@ class BinaryFunction : public AddressRangesOwner { /// If successful, this function will populate the list of instructions /// for this function together with offsets from the function start /// in the input. It will also populate Labels with destinations for - /// local branches, and LocalBranches with [from, to] info. + /// local branches, and TakenBranches with [from, to] info. /// /// \p FunctionData is the set bytes representing the function body. /// @@ -768,6 +754,10 @@ class BinaryFunction : public AddressRangesOwner { /// State::CFG. Returns false if CFG cannot be built. bool buildCFG(); + /// Check how closely the profile data matches the function and set + /// ProfileMatchRatio to reflect the accuracy. 
+ void evaluateProfileData(const FuncBranchData &BranchData); + /// Walks the list of basic blocks filling in missing information about /// edge frequency for fall-throughs. /// diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index e6fb8bbcd170..9eb3a9a3399c 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -925,22 +925,35 @@ void RewriteInstance::disassembleFunctions() { } uint64_t NumSimpleFunctions{0}; + uint64_t NumStaleProfileFunctions{0}; std::vector ProfiledFunctions; for (auto &BFI : BinaryFunctions) { - if (!BFI.second.isSimple()) + auto &Function = BFI.second; + if (!Function.isSimple()) continue; ++NumSimpleFunctions; - if (BFI.second.getExecutionCount() != BinaryFunction::COUNT_NO_PROFILE) - ProfiledFunctions.push_back(&BFI.second); + if (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE) + continue; + if (Function.hasValidProfile()) + ProfiledFunctions.push_back(&Function); + else + ++NumStaleProfileFunctions; } - errs() << "BOLT-INFO: " << ProfiledFunctions.size() << " functions out of " - << NumSimpleFunctions - << " simple functions (" + errs() << "BOLT-INFO: " + << ProfiledFunctions.size() + NumStaleProfileFunctions + << " functions out of " << NumSimpleFunctions << " simple functions (" << format("%.1f", - ProfiledFunctions.size() / - (float) NumSimpleFunctions * 100.0) + (ProfiledFunctions.size() + NumStaleProfileFunctions) / + (float) NumSimpleFunctions * 100.0f) << "%) have non-empty execution profile.\n"; + if (NumStaleProfileFunctions) { + errs() << "BOLT-INFO: " << NumStaleProfileFunctions + << format(" (%.1f%) ", NumStaleProfileFunctions / + (float) NumSimpleFunctions * 100.0f) + << " function" << (NumStaleProfileFunctions == 1 ? 
"" : "s") + << " have invalid (possibly stale) profile.\n"; + } if (ProfiledFunctions.size() > 10) { errs() << "BOLT-INFO: top called functions are:\n"; From 31ed5c134cb014bbc211c21c2ccfa8d960781443 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 10 Jun 2016 17:13:05 -0700 Subject: [PATCH 127/904] Support for multiple function names. Summary: With ICF optimization in the linker we were getting mismatches of function names in .fdata and BinaryFunction name. This diff adds support for multiple function names for BinaryFunction and does a match against all possible names for the profile. (cherry picked from commit 1dbd32874b789f1e147c5abaaa699934f321e5f9) --- bolt/BinaryFunction.cpp | 16 +++++++++++---- bolt/BinaryFunction.h | 44 ++++++++++++++++++++++++++++++++-------- bolt/DataReader.cpp | 18 +++++++++------- bolt/DataReader.h | 4 +++- bolt/RewriteInstance.cpp | 30 +++++++++++++++++++-------- 5 files changed, 83 insertions(+), 29 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 954c4aa2a249..44ceafb744bd 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -140,8 +140,16 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, bool PrintInstructions) const { StringRef SectionName; Section.getName(SectionName); - OS << "Binary Function \"" << getName() << "\" " << Annotation << " {" - << "\n Number : " << FunctionNumber + OS << "Binary Function \"" << getName() << "\" " << Annotation << " {"; + if (Names.size() > 1) { + OS << "\n Other names : "; + auto Sep = ""; + for (unsigned i = 0; i < Names.size() - 1; ++i) { + OS << Sep << Names[i]; + Sep = "\n "; + } + } + OS << "\n Number : " << FunctionNumber << "\n State : " << CurrentState << "\n Address : 0x" << Twine::utohexstr(Address) << "\n Size : 0x" << Twine::utohexstr(Size) @@ -612,7 +620,7 @@ bool BinaryFunction::buildCFG() { auto &MIA = BC.MIA; - auto BranchDataOrErr = BC.DR.getFuncBranchData(getName()); + auto BranchDataOrErr = 
BC.DR.getFuncBranchData(getNames()); if (!BranchDataOrErr) { DEBUG(dbgs() << "no branch data found for \"" << getName() << "\"\n"); } else { @@ -963,7 +971,7 @@ void BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { void BinaryFunction::inferFallThroughCounts() { assert(!BasicBlocks.empty() && "basic block list should not be empty"); - auto BranchDataOrErr = BC.DR.getFuncBranchData(getName()); + auto BranchDataOrErr = BC.DR.getFuncBranchData(getNames()); // Compute preliminary execution time for each basic block for (auto CurBB : BasicBlocks) { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 8359549bc51b..a739227781f5 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -103,8 +103,8 @@ class BinaryFunction : public AddressRangesOwner { /// Current state of the function. State CurrentState{State::Empty}; - /// Name of the function as we know it. - std::string Name; + /// A list of function names. + std::vector Names; /// Symbol associated with this function in the input. SymbolRef Symbol; @@ -356,12 +356,12 @@ class BinaryFunction : public AddressRangesOwner { BinaryFunction(BinaryFunction &&) = default; - BinaryFunction(std::string Name, SymbolRef Symbol, SectionRef Section, + BinaryFunction(const std::string &Name, SymbolRef Symbol, SectionRef Section, uint64_t Address, uint64_t Size, BinaryContext &BC, bool IsSimple = true) : - Name(Name), Symbol(Symbol), Section(Section), Address(Address), - Size(Size), BC(BC), IsSimple(IsSimple), CodeSectionName(".text." + Name), - FunctionNumber(++Count) + Names({Name}), Symbol(Symbol), Section(Section), + Address(Address), Size(Size), BC(BC), IsSimple(IsSimple), + CodeSectionName(".text." + Name), FunctionNumber(++Count) {} /// Modify code layout making necessary adjustments to instructions at the @@ -396,8 +396,31 @@ class BinaryFunction : public AddressRangesOwner { } /// Return the name of the function as extracted from the binary file. 
- StringRef getName() const { - return Name; + /// If the function has multiple names - return the last one + /// followed by "(*#)". + /// We should preferably only use getName() for diagnostics and use + /// hasName() to match function name against a given string. + /// + /// We pick the last name from the list to match the name of the function + /// in profile data for easier manual analysis. + std::string getName() const { + return Names.size() == 1 ? + Names.back() : + (Names.back() + "(*" + std::to_string(Names.size()) + ")"); + } + + /// Check if (possibly one out of many) function name matches the given + /// string. Use this member function instead of direct name comparison. + bool hasName(std::string &FunctionName) const { + for (auto &Name : Names) + if (Name == FunctionName) + return true; + return false; + } + + /// Return a vector of all possible names for the function. + const std::vector &getNames() const { + return Names; } /// Return containing file section. @@ -488,6 +511,11 @@ class BinaryFunction : public AddressRangesOwner { return Address <= PC && PC < Address + Size; } + /// Register alternative function name. + void addAlternativeName(std::string NewName) { + Names.emplace_back(NewName); + } + /// Create a basic block at a given \p Offset in the /// function and append it to the end of list of blocks. 
/// If \p DeriveAlignment is true, set the alignment of the block based diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index 967b4ef2e085..5411633d1ef7 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -201,12 +201,15 @@ std::error_code DataReader::parse() { } ErrorOr -DataReader::getFuncBranchData(StringRef FuncName) const { - const auto I = FuncsMap.find(FuncName); - if (I == FuncsMap.end()) { - return make_error_code(llvm::errc::invalid_argument); +DataReader::getFuncBranchData(const std::vector &FuncNames) const { + // Do a reverse order iteration since the name in profile has a higher chance + // of matching a name at the end of the list. + for (auto FI = FuncNames.rbegin(), FE = FuncNames.rend(); FI != FE; ++FI) { + const auto I = FuncsMap.find(*FI); + if (I != FuncsMap.end()) + return I->getValue(); } - return I->getValue(); + return make_error_code(llvm::errc::invalid_argument); } void DataReader::dump() const { @@ -223,5 +226,6 @@ void DataReader::dump() const { } } } -} -} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/DataReader.h b/bolt/DataReader.h index 88f087306a7d..7f74988d160d 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -22,6 +22,7 @@ #include "llvm/Support/ErrorOr.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" +#include namespace llvm { namespace bolt { @@ -132,7 +133,8 @@ class DataReader { /// offset 12, with 4 mispredictions and 221 branches std::error_code parse(); - ErrorOr getFuncBranchData(StringRef FuncName) const; + ErrorOr getFuncBranchData( + const std::vector &FuncNames) const; using FuncsMapType = StringMap; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 9eb3a9a3399c..ef9801f09b6e 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -203,7 +203,7 @@ bool shouldProcess(const BinaryFunction &Function) { if (!FunctionNames.empty()) { IsValid = false; for (auto &Name : FunctionNames) { - if (Function.getName() == 
Name) { + if (Function.hasName(Name)) { IsValid = true; break; } @@ -214,7 +214,7 @@ bool shouldProcess(const BinaryFunction &Function) { if (!SkipFunctionNames.empty()) { for (auto &Name : SkipFunctionNames) { - if (Function.getName() == Name) { + if (Function.hasName(Name)) { IsValid = false; break; } @@ -734,12 +734,24 @@ void RewriteInstance::discoverFileObjects() { } } - // Create the function and add to the map. - BinaryFunctions.emplace( - Address, - BinaryFunction(UniqueName, Symbol, *Section, Address, - SymbolSize, *BC, IsSimple) - ); + auto BFI = BinaryFunctions.find(Address); + if (BFI != BinaryFunctions.end()) { + // Duplicate function name. Make sure everything matches before we add + // an alternative name. + if (SymbolSize != BFI->second.getSize()) { + errs() << "BOLT-WARNING: size mismatch for duplicate entries " + << UniqueName << ':' << SymbolSize << " and " + << BFI->second.getName() << ':' << BFI->second.getSize() << '\n'; + } + BFI->second.addAlternativeName(UniqueName); + } else { + // Create the function and add to the map. + BinaryFunctions.emplace( + Address, + BinaryFunction(UniqueName, Symbol, *Section, Address, + SymbolSize, *BC, IsSimple) + ); + } } } @@ -1106,7 +1118,7 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, // Emit UD2 at the beginning if requested by user. if (!opts::BreakFunctionNames.empty()) { for (auto &Name : opts::BreakFunctionNames) { - if (Function.getName() == Name) { + if (Function.hasName(Name)) { Streamer.EmitIntValue(0x0B0F, 2); // UD2: 0F 0B break; } From 3dc89985499d05ef73e1638a4a34cc8227846fc3 Mon Sep 17 00:00:00 2001 From: Theodoros Kasampalis Date: Thu, 16 Jun 2016 18:47:57 -0700 Subject: [PATCH 128/904] Refactoring of the reordering algorithms Summary: The various reorder and clustering algorithms have been refactored into separate classes, so that it is easier to add new algorithms and/or change the logic of algorithm selection. 
(cherry picked from commit 7e5fd0df112e510b693cfcc8cbfc82bed9b9bc96) --- bolt/BinaryBasicBlock.h | 22 +- bolt/BinaryFunction.cpp | 387 +++------------------------------ bolt/BinaryFunction.h | 45 ++-- bolt/CMakeLists.txt | 1 + bolt/ReorderAlgorithm.cpp | 436 ++++++++++++++++++++++++++++++++++++++ bolt/ReorderAlgorithm.h | 168 +++++++++++++++ 6 files changed, 681 insertions(+), 378 deletions(-) create mode 100644 bolt/ReorderAlgorithm.cpp create mode 100644 bolt/ReorderAlgorithm.h diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 07962f99d5bb..ea292f26edd2 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -85,9 +85,6 @@ class BinaryBasicBlock { /// Each successor has a corresponding BranchInfo entry in the list. std::vector BranchInfo; - typedef std::vector::iterator branch_info_iterator; - typedef std::vector::const_iterator - const_branch_info_iterator; BinaryBasicBlock() {} @@ -252,6 +249,25 @@ class BinaryBasicBlock { return iterator_range(lp_begin(), lp_end()); } + // BranchInfo iterators. + typedef std::vector::const_iterator + const_branch_info_iterator; + + const_branch_info_iterator branch_info_begin() const + { return BranchInfo.begin(); } + const_branch_info_iterator branch_info_end() const + { return BranchInfo.end(); } + unsigned branch_info_size() const { + return (unsigned)BranchInfo.size(); + } + bool branch_info_empty() const + { return BranchInfo.empty(); } + + inline iterator_range branch_info() const { + return iterator_range( + branch_info_begin(), branch_info_end()); + } + /// Return symbol marking the start of this basic block. 
MCSymbol *getLabel() const { return Label; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 44ceafb744bd..5bf0e07cbf3e 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -12,6 +12,7 @@ #include "BinaryBasicBlock.h" #include "BinaryFunction.h" +#include "ReorderAlgorithm.h" #include "DataReader.h" #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" @@ -41,9 +42,6 @@ AgressiveSplitting("split-all-cold", cl::desc("outline as many cold basic blocks as possible"), cl::Optional); -static cl::opt -PrintClusters("print-clusters", cl::desc("print clusters"), cl::Optional); - static cl::opt PrintDebugInfo("print-debug-info", cl::desc("print debug info when printing functions"), @@ -1254,378 +1252,47 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool Split) { if (BasicBlocksLayout.empty() || Type == LT_NONE) return; - if (Type == LT_REVERSE) { - BasicBlockOrderType ReverseOrder; - auto FirstBB = BasicBlocksLayout.front(); - ReverseOrder.push_back(FirstBB); - for (auto RBBI = BasicBlocksLayout.rbegin(); *RBBI != FirstBB; ++RBBI) - ReverseOrder.push_back(*RBBI); - BasicBlocksLayout.swap(ReverseOrder); - - if (Split) - splitFunction(); - - fixBranches(); - - return; - } + BasicBlockOrderType NewLayout; + std::unique_ptr Algo; // Cannot do optimal layout without profile. - if (!hasValidProfile()) + if (Type != LT_REVERSE && !hasValidProfile()) return; - // Work on optimal solution if problem is small enough - if (BasicBlocksLayout.size() <= FUNC_SIZE_THRESHOLD) - return solveOptimalLayout(Split); - - DEBUG(dbgs() << "running block layout heuristics on " << getName() << "\n"); - - // Greedy heuristic implementation for the TSP, applied to BB layout. Try to - // maximize weight during a path traversing all BBs. In this way, we will - // convert the hottest branches into fall-throughs. 
- - // Encode an edge between two basic blocks, source and destination - typedef std::pair EdgeTy; - std::map Weight; - - // Define a comparison function to establish SWO between edges - auto Comp = [&] (EdgeTy A, EdgeTy B) { - // With equal weights, prioritize branches with lower index - // source/destination. This helps to keep original block order for blocks - // when optimal order cannot be deducted from a profile. - if (Weight[A] == Weight[B]) { - uint32_t ASrcBBIndex = getIndex(A.first); - uint32_t BSrcBBIndex = getIndex(B.first); - if (ASrcBBIndex != BSrcBBIndex) - return ASrcBBIndex > BSrcBBIndex; - return getIndex(A.second) > getIndex(B.second); - } - return Weight[A] < Weight[B]; - }; - std::priority_queue, decltype(Comp)> Queue(Comp); - - typedef std::vector ClusterTy; - typedef std::map BBToClusterMapTy; - std::vector Clusters; - BBToClusterMapTy BBToClusterMap; - - // Encode relative weights between two clusters - std::vector> ClusterEdges; - ClusterEdges.resize(BasicBlocksLayout.size()); - - for (auto BB : BasicBlocksLayout) { - // Create a cluster for this BB - uint32_t I = Clusters.size(); - Clusters.emplace_back(); - auto &Cluster = Clusters.back(); - Cluster.push_back(BB); - BBToClusterMap[BB] = I; - // Populate priority queue with edges - auto BI = BB->BranchInfo.begin(); - for (auto &I : BB->successors()) { - if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) - Weight[std::make_pair(BB, I)] = BI->Count; - Queue.push(std::make_pair(BB, I)); - ++BI; - } - } - - // Grow clusters in a greedy fashion - while (!Queue.empty()) { - auto elmt = Queue.top(); - Queue.pop(); - - BinaryBasicBlock *BBSrc = elmt.first; - BinaryBasicBlock *BBDst = elmt.second; - - // Case 1: BBSrc and BBDst are the same. 
Ignore this edge - if (BBSrc == BBDst || BBDst == *BasicBlocksLayout.begin()) - continue; - - int I = BBToClusterMap[BBSrc]; - int J = BBToClusterMap[BBDst]; - - // Case 2: If they are already allocated at the same cluster, just increase - // the weight of this cluster - if (I == J) { - ClusterEdges[I][I] += Weight[elmt]; - continue; - } - - auto &ClusterA = Clusters[I]; - auto &ClusterB = Clusters[J]; - if (ClusterA.back() == BBSrc && ClusterB.front() == BBDst) { - // Case 3: BBSrc is at the end of a cluster and BBDst is at the start, - // allowing us to merge two clusters - for (auto BB : ClusterB) - BBToClusterMap[BB] = I; - ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end()); - ClusterB.clear(); - // Iterate through all inter-cluster edges and transfer edges targeting - // cluster B to cluster A. - // It is bad to have to iterate though all edges when we could have a list - // of predecessors for cluster B. However, it's not clear if it is worth - // the added code complexity to create a data structure for clusters that - // maintains a list of predecessors. Maybe change this if it becomes a - // deal breaker. - for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K) - ClusterEdges[K][I] += ClusterEdges[K][J]; - } else { - // Case 4: Both BBSrc and BBDst are allocated in positions we cannot - // merge them. Annotate the weight of this edge in the weight between - // clusters to help us decide ordering between these clusters. - ClusterEdges[I][J] += Weight[elmt]; - } - } - std::vector Order; // Cluster layout order - - // Here we have 3 conflicting goals as to how to layout clusters. If we want - // to minimize jump offsets, we should put clusters with heavy inter-cluster - // dependence as close as possible. If we want to maximize the probability - // that all inter-cluster edges are predicted as not-taken, we should enforce - // a topological order to make targets appear after sources, creating forward - // branches. 
If we want to separate hot from cold blocks to maximize the - // probability that unfrequently executed code doesn't pollute the cache, we - // should put clusters in descending order of hotness. - std::vector AvgFreq; - AvgFreq.resize(Clusters.size(), 0.0); - for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { - double Freq = 0.0; - for (auto BB : Clusters[I]) { - if (!BB->empty() && BB->size() != BB->getNumPseudos()) - Freq += ((double) BB->getExecutionCount()) / - (BB->size() - BB->getNumPseudos()); - } - AvgFreq[I] = Freq; - } - - if (opts::PrintClusters) { - for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { - errs() << "Cluster number " << I << " (frequency: " << AvgFreq[I] - << ") : "; - auto Sep = ""; - for (auto BB : Clusters[I]) { - errs() << Sep << BB->getName(); - Sep = ", "; - } - errs() << "\n"; - }; - } - - switch(Type) { - case LT_OPTIMIZE: { - for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) - if (!Clusters[I].empty()) - Order.push_back(I); - break; - } - case LT_OPTIMIZE_BRANCH: { - // Do a topological sort for clusters, prioritizing frequently-executed BBs - // during the traversal. 
- std::stack Stack; - std::vector Status; - std::vector Parent; - Status.resize(Clusters.size(), 0); - Parent.resize(Clusters.size(), 0); - constexpr uint32_t STACKED = 1; - constexpr uint32_t VISITED = 2; - Status[0] = STACKED; - Stack.push(0); - while (!Stack.empty()) { - uint32_t I = Stack.top(); - if (!(Status[I] & VISITED)) { - Status[I] |= VISITED; - // Order successors by weight - auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) { - return ClusterEdges[I][A] > ClusterEdges[I][B]; - }; - std::priority_queue, - decltype(ClusterComp)> SuccQueue(ClusterComp); - for (auto &Target: ClusterEdges[I]) { - if (Target.second > 0 && !(Status[Target.first] & STACKED) && - !Clusters[Target.first].empty()) { - Parent[Target.first] = I; - Status[Target.first] = STACKED; - SuccQueue.push(Target.first); - } - } - while (!SuccQueue.empty()) { - Stack.push(SuccQueue.top()); - SuccQueue.pop(); - } - continue; - } - // Already visited this node - Stack.pop(); - Order.push_back(I); - } - std::reverse(Order.begin(), Order.end()); - // Put unreachable clusters at the end - for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) - if (!(Status[I] & VISITED) && !Clusters[I].empty()) - Order.push_back(I); - - // Sort nodes with equal precedence - auto Beg = Order.begin(); - // Don't reorder the first cluster, which contains the function entry point - ++Beg; - std::stable_sort(Beg, Order.end(), - [&AvgFreq, &Parent](uint32_t A, uint32_t B) { - uint32_t P = Parent[A]; - while (Parent[P] != 0) { - if (Parent[P] == B) - return false; - P = Parent[P]; - } - P = Parent[B]; - while (Parent[P] != 0) { - if (Parent[P] == A) - return true; - P = Parent[P]; - } - return AvgFreq[A] > AvgFreq[B]; - }); - break; - } - case LT_OPTIMIZE_CACHE: { - // Order clusters based on average instruction execution frequency - for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) - if (!Clusters[I].empty()) - Order.push_back(I); - auto Beg = Order.begin(); - // Don't reorder the first cluster, which 
contains the function entry point - ++Beg; - std::stable_sort(Beg, Order.end(), [&AvgFreq](uint32_t A, uint32_t B) { - return AvgFreq[A] > AvgFreq[B]; - }); - - break; - } - default: - llvm_unreachable("unexpected layout type"); - } - - if (opts::PrintClusters) { - errs() << "New cluster order: "; - auto Sep = ""; - for (auto O : Order) { - errs() << Sep << O; - Sep = ", "; - } - errs() << '\n'; + if (Type == LT_REVERSE) { + Algo.reset(new ReverseReorderAlgorithm()); } - - BasicBlocksLayout.clear(); - for (auto I : Order) { - auto &Cluster = Clusters[I]; - BasicBlocksLayout.insert(BasicBlocksLayout.end(), Cluster.begin(), - Cluster.end()); + else if (BasicBlocksLayout.size() <= FUNC_SIZE_THRESHOLD) { + // Work on optimal solution if problem is small enough + DEBUG(dbgs() << "finding optimal block layout for " << getName() << "\n"); + Algo.reset(new OptimalReorderAlgorithm()); } + else { + DEBUG(dbgs() << "running block layout heuristics on " << getName() << "\n"); - if (Split) - splitFunction(); - fixBranches(); -} - -void BinaryFunction::solveOptimalLayout(bool Split) { - std::vector> Weight; - std::map BBToIndex; - std::vector IndexToBB; + std::unique_ptr CAlgo(new GreedyClusterAlgorithm()); - DEBUG(dbgs() << "finding optimal block layout for " << getName() << "\n"); - - unsigned N = BasicBlocksLayout.size(); - // Populating weight map and index map - for (auto BB : BasicBlocksLayout) { - BBToIndex[BB] = IndexToBB.size(); - IndexToBB.push_back(BB); - } - Weight.resize(N); - for (auto BB : BasicBlocksLayout) { - auto BI = BB->BranchInfo.begin(); - Weight[BBToIndex[BB]].resize(N); - for (auto I : BB->successors()) { - if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) - Weight[BBToIndex[BB]][BBToIndex[I]] = BI->Count; - ++BI; - } - } + switch(Type) { + case LT_OPTIMIZE: + Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo))); + break; - std::vector> DP; - DP.resize(1 << N); - for (auto &Elmt : DP) { - Elmt.resize(N, -1); - } - // Start with the entry 
basic block being allocated with cost zero - DP[1][0] = 0; - // Walk through TSP solutions using a bitmask to represent state (current set - // of BBs in the layout) - unsigned BestSet = 1; - unsigned BestLast = 0; - int64_t BestWeight = 0; - for (unsigned Set = 1; Set < (1U << N); ++Set) { - // Traverse each possibility of Last BB visited in this layout - for (unsigned Last = 0; Last < N; ++Last) { - // Case 1: There is no possible layout with this BB as Last - if (DP[Set][Last] == -1) - continue; + case LT_OPTIMIZE_BRANCH: + Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo))); + break; - // Case 2: There is a layout with this Set and this Last, and we try - // to expand this set with New - for (unsigned New = 1; New < N; ++New) { - // Case 2a: BB "New" is already in this Set - if ((Set & (1 << New)) != 0) - continue; + case LT_OPTIMIZE_CACHE: + Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo))); + break; - // Case 2b: BB "New" is not in this set and we add it to this Set and - // record total weight of this layout with "New" as the last BB. 
- unsigned NewSet = (Set | (1 << New)); - if (DP[NewSet][New] == -1) - DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New]; - DP[NewSet][New] = std::max(DP[NewSet][New], - DP[Set][Last] + (int64_t)Weight[Last][New]); - - if (DP[NewSet][New] > BestWeight) { - BestWeight = DP[NewSet][New]; - BestSet = NewSet; - BestLast = New; - } - } + default: + llvm_unreachable("unexpected layout type"); } } - std::vector PastLayout = BasicBlocksLayout; - - // Define final function layout based on layout that maximizes weight + Algo->reorderBasicBlocks(*this, NewLayout); BasicBlocksLayout.clear(); - unsigned Last = BestLast; - unsigned Set = BestSet; - std::vector Visited; - Visited.resize(N); - Visited[Last] = true; - BasicBlocksLayout.push_back(IndexToBB[Last]); - Set = Set & ~(1U << Last); - while (Set != 0) { - int64_t Best = -1; - for (unsigned I = 0; I < N; ++I) { - if (DP[Set][I] == -1) - continue; - if (DP[Set][I] > Best) { - Last = I; - Best = DP[Set][I]; - } - } - Visited[Last] = true; - BasicBlocksLayout.push_back(IndexToBB[Last]); - Set = Set & ~(1U << Last); - } - std::reverse(BasicBlocksLayout.begin(), BasicBlocksLayout.end()); - - // Finalize layout with BBs that weren't assigned to the layout - for (auto BB : PastLayout) { - if (Visited[BBToIndex[BB]] == false) - BasicBlocksLayout.push_back(BB); - } + BasicBlocksLayout.swap(NewLayout); if (Split) splitFunction(); diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index a739227781f5..93dcf84897fe 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -306,6 +306,9 @@ class BinaryFunction : public AddressRangesOwner { typedef BasicBlockOrderType::iterator order_iterator; typedef BasicBlockOrderType::const_iterator const_order_iterator; + typedef BasicBlockOrderType::reverse_iterator reverse_order_iterator; + typedef BasicBlockOrderType::const_reverse_iterator + const_reverse_order_iterator; // CFG iterators. 
iterator begin() { return BasicBlocks.begin(); } @@ -325,19 +328,39 @@ class BinaryFunction : public AddressRangesOwner { const BinaryBasicBlock & back() const { return *BasicBlocks.back(); } BinaryBasicBlock & back() { return *BasicBlocks.back(); } - unsigned layout_size() const { - return (unsigned)BasicBlocksLayout.size(); - } - const_order_iterator layout_begin() const { - return BasicBlocksLayout.begin(); - } - order_iterator layout_begin() { return BasicBlocksLayout.begin(); } + order_iterator layout_begin() { return BasicBlocksLayout.begin(); } + const_order_iterator layout_begin() const + { return BasicBlocksLayout.begin(); } + order_iterator layout_end() { return BasicBlocksLayout.end(); } + const_order_iterator layout_end() const + { return BasicBlocksLayout.end(); } + reverse_order_iterator layout_rbegin() + { return BasicBlocksLayout.rbegin(); } + const_reverse_order_iterator layout_rbegin() const + { return BasicBlocksLayout.rbegin(); } + reverse_order_iterator layout_rend() + { return BasicBlocksLayout.rend(); } + const_reverse_order_iterator layout_rend() const + { return BasicBlocksLayout.rend(); } + unsigned layout_size() const { return (unsigned)BasicBlocksLayout.size(); } + bool layout_empty() const { return BasicBlocksLayout.empty(); } + const BinaryBasicBlock *layout_front() const + { return BasicBlocksLayout.front(); } + BinaryBasicBlock *layout_front() { return BasicBlocksLayout.front(); } + const BinaryBasicBlock *layout_back() const + { return BasicBlocksLayout.back(); } + BinaryBasicBlock *layout_back() { return BasicBlocksLayout.back(); } inline iterator_range layout() { return iterator_range(BasicBlocksLayout.begin(), BasicBlocksLayout.end()); } + inline iterator_range layout() const { + return iterator_range(BasicBlocksLayout.begin(), + BasicBlocksLayout.end()); + } + cfi_iterator cie_begin() { return CIEFrameInstructions.begin(); } const_cfi_iterator cie_begin() const { return CIEFrameInstructions.begin(); } cfi_iterator cie_end() { 
return CIEFrameInstructions.end(); } @@ -368,14 +391,6 @@ class BinaryFunction : public AddressRangesOwner { /// end of basic blocks. void modifyLayout(LayoutType Type, bool Split); - /// Dynamic programming implementation for the TSP, applied to BB layout. Find - /// the optimal way to maximize weight during a path traversing all BBs. In - /// this way, we will convert the hottest branches into fall-throughs. - /// - /// Uses exponential amount of memory on the number of basic blocks and should - /// only be used for small functions. - void solveOptimalLayout(bool Split); - /// View CFG in graphviz program void viewGraph(); diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 1adf2aaf1e59..53faad6bd59a 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -24,4 +24,5 @@ add_llvm_tool(llvm-bolt DebugData.cpp Exceptions.cpp RewriteInstance.cpp + ReorderAlgorithm.cpp ) diff --git a/bolt/ReorderAlgorithm.cpp b/bolt/ReorderAlgorithm.cpp new file mode 100644 index 000000000000..8465b9aff4d1 --- /dev/null +++ b/bolt/ReorderAlgorithm.cpp @@ -0,0 +1,436 @@ +//===--- ReorderAlgorithm.cpp - Basic block reorderng algorithms ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implements different basic block reordering algorithms. 
+// +//===----------------------------------------------------------------------===// + +#include "ReorderAlgorithm.h" +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" +#include "llvm/Support/CommandLine.h" +#include + +using namespace llvm; +using namespace bolt; + +namespace opts { + +static cl::opt +PrintClusters("print-clusters", cl::desc("print clusters"), cl::Optional); + +} // namespace opts + +void ClusterAlgorithm::computeClusterAverageFrequency() { + AvgFreq.resize(Clusters.size(), 0.0); + for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { + double Freq = 0.0; + for (auto BB : Clusters[I]) { + if (!BB->empty() && BB->size() != BB->getNumPseudos()) + Freq += ((double) BB->getExecutionCount()) / + (BB->size() - BB->getNumPseudos()); + } + AvgFreq[I] = Freq; + } +} + +void ClusterAlgorithm::printClusters() const { + for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { + errs() << "Cluster number " << I; + if (AvgFreq.size() == Clusters.size()) + errs() << " (frequency: " << AvgFreq[I] << ")"; + errs() << " : "; + auto Sep = ""; + for (auto BB : Clusters[I]) { + errs() << Sep << BB->getName(); + Sep = ", "; + } + errs() << "\n"; + } +} + +void ClusterAlgorithm::reset() { + Clusters.clear(); + ClusterEdges.clear(); + AvgFreq.clear(); +} + +void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF) { + reset(); + + // Greedy heuristic implementation for the TSP, applied to BB layout. Try to + // maximize weight during a path traversing all BBs. In this way, we will + // convert the hottest branches into fall-throughs. + + // Encode an edge between two basic blocks, source and destination + typedef std::pair EdgeTy; + std::map Weight; + + // Define a comparison function to establish SWO between edges + auto Comp = [&] (EdgeTy A, EdgeTy B) { + // With equal weights, prioritize branches with lower index + // source/destination. 
This helps to keep original block order for blocks + // when optimal order cannot be deducted from a profile. + if (Weight[A] == Weight[B]) { + uint32_t ASrcBBIndex = BF.getIndex(A.first); + uint32_t BSrcBBIndex = BF.getIndex(B.first); + if (ASrcBBIndex != BSrcBBIndex) + return ASrcBBIndex > BSrcBBIndex; + return BF.getIndex(A.second) > BF.getIndex(B.second); + } + return Weight[A] < Weight[B]; + }; + std::priority_queue, decltype(Comp)> Queue(Comp); + + typedef std::map BBToClusterMapTy; + BBToClusterMapTy BBToClusterMap; + + ClusterEdges.resize(BF.layout_size()); + + for (auto BB : BF.layout()) { + // Create a cluster for this BB + uint32_t I = Clusters.size(); + Clusters.emplace_back(); + auto &Cluster = Clusters.back(); + Cluster.push_back(BB); + BBToClusterMap[BB] = I; + // Populate priority queue with edges + auto BI = BB->branch_info_begin(); + for (auto &I : BB->successors()) { + if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) + Weight[std::make_pair(BB, I)] = BI->Count; + Queue.push(std::make_pair(BB, I)); + ++BI; + } + } + + // Grow clusters in a greedy fashion + while (!Queue.empty()) { + auto elmt = Queue.top(); + Queue.pop(); + + BinaryBasicBlock *BBSrc = elmt.first; + BinaryBasicBlock *BBDst = elmt.second; + + // Case 1: BBSrc and BBDst are the same. 
Ignore this edge + if (BBSrc == BBDst || BBDst == *BF.layout_begin()) + continue; + + int I = BBToClusterMap[BBSrc]; + int J = BBToClusterMap[BBDst]; + + // Case 2: If they are already allocated at the same cluster, just increase + // the weight of this cluster + if (I == J) { + ClusterEdges[I][I] += Weight[elmt]; + continue; + } + + auto &ClusterA = Clusters[I]; + auto &ClusterB = Clusters[J]; + if (ClusterA.back() == BBSrc && ClusterB.front() == BBDst) { + // Case 3: BBSrc is at the end of a cluster and BBDst is at the start, + // allowing us to merge two clusters + for (auto BB : ClusterB) + BBToClusterMap[BB] = I; + ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end()); + ClusterB.clear(); + // Iterate through all inter-cluster edges and transfer edges targeting + // cluster B to cluster A. + // It is bad to have to iterate though all edges when we could have a list + // of predecessors for cluster B. However, it's not clear if it is worth + // the added code complexity to create a data structure for clusters that + // maintains a list of predecessors. Maybe change this if it becomes a + // deal breaker. + for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K) + ClusterEdges[K][I] += ClusterEdges[K][J]; + } else { + // Case 4: Both BBSrc and BBDst are allocated in positions we cannot + // merge them. Annotate the weight of this edge in the weight between + // clusters to help us decide ordering between these clusters. 
+ ClusterEdges[I][J] += Weight[elmt]; + } + } +} + +void OptimalReorderAlgorithm::reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const { + std::vector> Weight; + std::map BBToIndex; + std::vector IndexToBB; + + unsigned N = BF.layout_size(); + // Populating weight map and index map + for (auto BB : BF.layout()) { + BBToIndex[BB] = IndexToBB.size(); + IndexToBB.push_back(BB); + } + Weight.resize(N); + for (auto BB : BF.layout()) { + auto BI = BB->branch_info_begin(); + Weight[BBToIndex[BB]].resize(N); + for (auto I : BB->successors()) { + if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) + Weight[BBToIndex[BB]][BBToIndex[I]] = BI->Count; + ++BI; + } + } + + std::vector> DP; + DP.resize(1 << N); + for (auto &Elmt : DP) { + Elmt.resize(N, -1); + } + // Start with the entry basic block being allocated with cost zero + DP[1][0] = 0; + // Walk through TSP solutions using a bitmask to represent state (current set + // of BBs in the layout) + unsigned BestSet = 1; + unsigned BestLast = 0; + int64_t BestWeight = 0; + for (unsigned Set = 1; Set < (1U << N); ++Set) { + // Traverse each possibility of Last BB visited in this layout + for (unsigned Last = 0; Last < N; ++Last) { + // Case 1: There is no possible layout with this BB as Last + if (DP[Set][Last] == -1) + continue; + + // Case 2: There is a layout with this Set and this Last, and we try + // to expand this set with New + for (unsigned New = 1; New < N; ++New) { + // Case 2a: BB "New" is already in this Set + if ((Set & (1 << New)) != 0) + continue; + + // Case 2b: BB "New" is not in this set and we add it to this Set and + // record total weight of this layout with "New" as the last BB. 
+ unsigned NewSet = (Set | (1 << New)); + if (DP[NewSet][New] == -1) + DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New]; + DP[NewSet][New] = std::max(DP[NewSet][New], + DP[Set][Last] + (int64_t)Weight[Last][New]); + + if (DP[NewSet][New] > BestWeight) { + BestWeight = DP[NewSet][New]; + BestSet = NewSet; + BestLast = New; + } + } + } + } + + // Define final function layout based on layout that maximizes weight + unsigned Last = BestLast; + unsigned Set = BestSet; + std::vector Visited; + Visited.resize(N); + Visited[Last] = true; + Order.push_back(IndexToBB[Last]); + Set = Set & ~(1U << Last); + while (Set != 0) { + int64_t Best = -1; + for (unsigned I = 0; I < N; ++I) { + if (DP[Set][I] == -1) + continue; + if (DP[Set][I] > Best) { + Last = I; + Best = DP[Set][I]; + } + } + Visited[Last] = true; + Order.push_back(IndexToBB[Last]); + Set = Set & ~(1U << Last); + } + std::reverse(Order.begin(), Order.end()); + + // Finalize layout with BBs that weren't assigned to the layout + for (auto BB : BF.layout()) { + if (Visited[BBToIndex[BB]] == false) + Order.push_back(BB); + } +} + +void OptimizeReorderAlgorithm::reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const { + if (BF.layout_empty()) + return; + + // Cluster basic blocks. + CAlgo->clusterBasicBlocks(BF); + + if (opts::PrintClusters) + CAlgo->printClusters(); + + // Arrange basic blocks according to clusters. + for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters) + Order.insert(Order.end(), Cluster.begin(), Cluster.end()); +} + +void OptimizeBranchReorderAlgorithm::reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const { + if (BF.layout_empty()) + return; + + // Cluster basic blocks. + CAlgo->clusterBasicBlocks(BF); + std::vector &Clusters = CAlgo->Clusters;; + std::vector> &ClusterEdges = CAlgo->ClusterEdges; + + // Compute clusters' average frequencies. 
+ CAlgo->computeClusterAverageFrequency(); + std::vector &AvgFreq = CAlgo->AvgFreq;; + + if (opts::PrintClusters) + CAlgo->printClusters(); + + // Cluster layout order + std::vector ClusterOrder; + + // Do a topological sort for clusters, prioritizing frequently-executed BBs + // during the traversal. + std::stack Stack; + std::vector Status; + std::vector Parent; + Status.resize(Clusters.size(), 0); + Parent.resize(Clusters.size(), 0); + constexpr uint32_t STACKED = 1; + constexpr uint32_t VISITED = 2; + Status[0] = STACKED; + Stack.push(0); + while (!Stack.empty()) { + uint32_t I = Stack.top(); + if (!(Status[I] & VISITED)) { + Status[I] |= VISITED; + // Order successors by weight + auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) { + return ClusterEdges[I][A] > ClusterEdges[I][B]; + }; + std::priority_queue, + decltype(ClusterComp)> SuccQueue(ClusterComp); + for (auto &Target: ClusterEdges[I]) { + if (Target.second > 0 && !(Status[Target.first] & STACKED) && + !Clusters[Target.first].empty()) { + Parent[Target.first] = I; + Status[Target.first] = STACKED; + SuccQueue.push(Target.first); + } + } + while (!SuccQueue.empty()) { + Stack.push(SuccQueue.top()); + SuccQueue.pop(); + } + continue; + } + // Already visited this node + Stack.pop(); + ClusterOrder.push_back(I); + } + std::reverse(ClusterOrder.begin(), ClusterOrder.end()); + // Put unreachable clusters at the end + for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) + if (!(Status[I] & VISITED) && !Clusters[I].empty()) + ClusterOrder.push_back(I); + + // Sort nodes with equal precedence + auto Beg = ClusterOrder.begin(); + // Don't reorder the first cluster, which contains the function entry point + ++Beg; + std::stable_sort(Beg, ClusterOrder.end(), + [&AvgFreq, &Parent](uint32_t A, uint32_t B) { + uint32_t P = Parent[A]; + while (Parent[P] != 0) { + if (Parent[P] == B) + return false; + P = Parent[P]; + } + P = Parent[B]; + while (Parent[P] != 0) { + if (Parent[P] == A) + return true; + P = 
Parent[P]; + } + return AvgFreq[A] > AvgFreq[B]; + }); + + if (opts::PrintClusters) { + errs() << "New cluster order: "; + auto Sep = ""; + for (auto O : ClusterOrder) { + errs() << Sep << O; + Sep = ", "; + } + errs() << '\n'; + } + + // Arrange basic blocks according to cluster order. + for (uint32_t ClusterIndex : ClusterOrder) { + ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; + Order.insert(Order.end(), Cluster.begin(), Cluster.end()); + } +} + +void OptimizeCacheReorderAlgorithm::reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const { + if (BF.layout_empty()) + return; + + // Cluster basic blocks. + CAlgo->clusterBasicBlocks(BF); + std::vector &Clusters = CAlgo->Clusters;; + + // Compute clusters' average frequencies. + CAlgo->computeClusterAverageFrequency(); + std::vector &AvgFreq = CAlgo->AvgFreq;; + + if (opts::PrintClusters) + CAlgo->printClusters(); + + // Cluster layout order + std::vector ClusterOrder; + + // Order clusters based on average instruction execution frequency + for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) + if (!Clusters[I].empty()) + ClusterOrder.push_back(I); + auto Beg = ClusterOrder.begin(); + // Don't reorder the first cluster, which contains the function entry point + ++Beg; + std::stable_sort(Beg, ClusterOrder.end(), [&AvgFreq](uint32_t A, uint32_t B) { + return AvgFreq[A] > AvgFreq[B]; + }); + + if (opts::PrintClusters) { + errs() << "New cluster order: "; + auto Sep = ""; + for (auto O : ClusterOrder) { + errs() << Sep << O; + Sep = ", "; + } + errs() << '\n'; + } + + // Arrange basic blocks according to cluster order. 
+ for (uint32_t ClusterIndex : ClusterOrder) { + ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; + Order.insert(Order.end(), Cluster.begin(), Cluster.end()); + } +} + +void ReverseReorderAlgorithm::reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const { + if (BF.layout_empty()) + return; + + auto FirstBB = *BF.layout_begin(); + Order.push_back(FirstBB); + for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI) + Order.push_back(*RLI); +} + + diff --git a/bolt/ReorderAlgorithm.h b/bolt/ReorderAlgorithm.h new file mode 100644 index 000000000000..9ea30ed19f81 --- /dev/null +++ b/bolt/ReorderAlgorithm.h @@ -0,0 +1,168 @@ +//===- ReorderAlgorithm.h - Interface for basic block reorderng algorithms ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Interface to different basic block reordering algorithms. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_REORDER_ALGORITHM_H +#define LLVM_TOOLS_LLVM_BOLT_REORDER_ALGORITHM_H + +#include "llvm/Support/ErrorHandling.h" +#include +#include +#include + + +namespace llvm { +namespace bolt { + + +class BinaryBasicBlock; +class BinaryFunction; + +/// Objects of this class implement various basic block clustering algorithms. +/// Basic block clusters are chains of basic blocks that should be laid out +/// in this order to maximize performace. These algorithms group basic blocks +/// into clusters using execution profile data and various heuristics. +class ClusterAlgorithm { +public: + typedef std::vector ClusterTy; + std::vector Clusters; + std::vector> ClusterEdges; + std::vector AvgFreq; + + /// Group the basic blocks the given function into clusters stored in the + /// Clusters vector. 
Also encode relative weights between two clusters in + /// the ClusterEdges vector. This vector is indexed by the clusters indices + /// in the Clusters vector. + virtual void clusterBasicBlocks(const BinaryFunction &BF) =0; + + /// Compute for each cluster its averagae execution frequency, that is + /// the sum of average frequencies of its blocks (execution count / # instrs). + /// The average frequencies are stored in the AvgFreq vector, index by the + /// cluster indices in the Clusters vector. + void computeClusterAverageFrequency(); + + /// Clear clusters and related info. + void reset(); + + void printClusters() const; + + virtual ~ClusterAlgorithm() { } +}; + + +/// This clustering algorithm is based on a greedy heuristic suggested by +/// Pettis (PLDI '90). +class GreedyClusterAlgorithm : public ClusterAlgorithm { +public: + void clusterBasicBlocks(const BinaryFunction &BF) override; +}; + +/// Objects of this class implement various basic block reordering alogrithms. +/// Most of these algorithms depend on a clustering alogrithm. +/// Here we have 3 conflicting goals as to how to layout clusters. If we want +/// to minimize jump offsets, we should put clusters with heavy inter-cluster +/// dependence as close as possible. If we want to maximize the probability +/// that all inter-cluster edges are predicted as not-taken, we should enforce +/// a topological order to make targets appear after sources, creating forward +/// branches. If we want to separate hot from cold blocks to maximize the +/// probability that unfrequently executed code doesn't pollute the cache, we +/// should put clusters in descending order of hotness. +class ReorderAlgorithm { +protected: + std::unique_ptr CAlgo; + +public: + ReorderAlgorithm() { } + explicit ReorderAlgorithm(std::unique_ptr CAlgo) : + CAlgo(std::move(CAlgo)) { } + + typedef std::vector BasicBlockOrder; + + /// Reorder the basic blocks of the given function and store the new order in + /// the new Clusters vector. 
+ virtual void reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const =0; + + void setClusterAlgorithm(ClusterAlgorithm *CAlgo) { + this->CAlgo.reset(CAlgo); + } + + virtual ~ReorderAlgorithm() { } +}; + + +/// Dynamic programming implementation for the TSP, applied to BB layout. Find +/// the optimal way to maximize weight during a path traversing all BBs. In +/// this way, we will convert the hottest branches into fall-throughs. +/// +/// Uses exponential amount of memory on the number of basic blocks and should +/// only be used for small functions. +class OptimalReorderAlgorithm : public ReorderAlgorithm { +public: + void reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const override; +}; + + +/// Simple algorithm that groups basic blocks into clusters and then +/// lays them out cluster after cluster. +class OptimizeReorderAlgorithm : public ReorderAlgorithm { +public: + explicit OptimizeReorderAlgorithm(std::unique_ptr CAlgo) : + ReorderAlgorithm(std::move(CAlgo)) { } + + void reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const override; +}; + + +/// This reorder algorithm tries to ensure that all inter-cluster edges are +/// predicted as not-taken, by enforcing a topological order to make +/// targets appear after sources, creating forward branches. +class OptimizeBranchReorderAlgorithm : public ReorderAlgorithm { +public: + explicit OptimizeBranchReorderAlgorithm( + std::unique_ptr CAlgo) : + ReorderAlgorithm(std::move(CAlgo)) { } + + void reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const override; +}; + + +/// This reorder tries to separate hot from cold blocks to maximize the +/// probability that unfrequently executed code doesn't pollute the cache, by +/// putting clusters in descending order of hotness. 
+class OptimizeCacheReorderAlgorithm : public ReorderAlgorithm { +public: + explicit OptimizeCacheReorderAlgorithm( + std::unique_ptr CAlgo) : + ReorderAlgorithm(std::move(CAlgo)) { } + + void reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const override; +}; + + +/// Toy example that simply reverses the original basic block order. +class ReverseReorderAlgorithm : public ReorderAlgorithm { +public: + void reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const override; +}; + + +} // namespace bolt +} // namespace llvm + +#endif + From d55170a5cb3dc1e43b4caa2d6605a36fb7569455 Mon Sep 17 00:00:00 2001 From: Theodoros Kasampalis Date: Mon, 27 Jun 2016 14:51:38 -0700 Subject: [PATCH 129/904] Fix for ignoring fall-through profile data when jump is followed by no-op Summary: When a conditional jump is followed by one or more no-ops, the destination of fall-through branch was recorded as the first no-op in FuncBranchInfo. However the fall-through basic block after the jump starts after the no-ops, so the profile data could not match the CFG and was ignored. (cherry picked from commit 620273f0f7e1dbd57f82aa28b3c5803ad3a16078) --- bolt/BinaryFunction.cpp | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 5bf0e07cbf3e..a0fdf052ed02 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -775,11 +775,28 @@ bool BinaryFunction::buildCFG() { << Twine::utohexstr(Branch.second) << "]\n"); BinaryBasicBlock *FromBB = getBasicBlockContainingOffset(Branch.first); assert(FromBB && "cannot find BB containing FROM branch"); + // Try to find the destination basic block. If the jump instruction was + // followed by a no-op then the destination offset recorded in FTBranches + // will point to that no-op but the destination basic block will start + // after the no-op due to ingoring no-ops when creating basic blocks. 
+ // So we have to skip any no-ops when trying to find the destination + // basic block. BinaryBasicBlock *ToBB = getBasicBlockAtOffset(Branch.second); - // We have a fall-through that does not point to another BB, ignore it as - // it may happen in cases where we have a BB finished by two branches. - if (ToBB == nullptr) - continue; + if (ToBB == nullptr) { + auto I = Instructions.find(Branch.second), E = Instructions.end(); + while (ToBB == nullptr && I != E && MIA->isNoop(I->second)) { + ++I; + if (I == E) + break; + ToBB = getBasicBlockAtOffset(I->first); + } + if (ToBB == nullptr) { + // We have a fall-through that does not point to another BB, ignore it + // as it may happen in cases where we have a BB finished by two + // branches. + continue; + } + } // Does not add a successor if we can't find profile data, leave it to the // inference pass to guess its frequency From 291371bddc81860c78f1fdcef1d584e696e560b1 Mon Sep 17 00:00:00 2001 From: Theodoros Kasampalis Date: Tue, 21 Jun 2016 18:44:42 -0700 Subject: [PATCH 130/904] perf2bolt can extract branch records with histories Summary: Added perf2bolt functionality for extracting branch records with histories of previous branches. The length of the histories is user defined, and the default is 0 (previous functionality). Also, DataReader can parse perf2bolt output with histories. Note: creating profile data with long histories can increase their size significantly (2x for history of length 1, 3x for length 2 etc). 
(cherry picked from commit f618f4530a3a1ffd5077bb65a66555a3fa6d429e) --- bolt/DataReader.cpp | 224 +++++++++++++++++++++++++++++-- bolt/DataReader.h | 65 +++++++-- bolt/merge-fdata/merge-fdata.cpp | 70 +++++++--- 3 files changed, 322 insertions(+), 37 deletions(-) diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index 5411633d1ef7..cfc3a41ae006 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -14,10 +14,112 @@ #include "DataReader.h" +#include namespace llvm { namespace bolt { +void BranchInfo::mergeWith(const BranchInfo &BI) { + + // Merge branch and misprediction counts. + Branches += BI.Branches; + Mispreds += BI.Mispreds; + + // Trivial cases + if (BI.Histories.size() == 0) + return; + + if (Histories.size() == 0) { + Histories = BI.Histories; + return; + } + + // map BranchContext -> (mispreds, count), used to merge histories + std::map> HistMap; + + // Add histories of this BranchInfo into HistMap. + for (const auto &H : Histories) { + BranchContext C; + for (const auto &LocPair : H.Context) { + C.emplace_back(LocPair); + const auto I = HistMap.find(C); + if (I == HistMap.end()) { + HistMap.insert( + std::make_pair(C, std::make_pair(H.Mispreds, H.Branches))); + } + else { + I->second.first += H.Mispreds; + I->second.second += H.Branches; + } + } + } + + // Add histories of BI into HistMap. + for (const auto &H : BI.Histories) { + BranchContext C; + for (const auto &LocPair : H.Context) { + C.emplace_back(LocPair); + const auto I = HistMap.find(C); + if (I == HistMap.end()) { + HistMap.insert( + std::make_pair(C, std::make_pair(H.Mispreds, H.Branches))); + } + else { + I->second.first += H.Mispreds; + I->second.second += H.Branches; + } + } + } + + // Helper function that checks whether context A is a prefix of context B. 
+ auto isPrefix = [] (const BranchContext &A, const BranchContext &B) -> bool { + for (unsigned i = 0; i < A.size(); ++i) { + if (i >= B.size() || A[i] != B[i]) + return false; + } + return true; + }; + + // Extract merged histories from HistMap. Keep only the longest history + // between histories that share a common prefix. + Histories.clear(); + auto I = HistMap.begin(), E = HistMap.end(); + auto NextI = I; + ++NextI; + for ( ; I != E; ++I, ++NextI) { + if (NextI != E && isPrefix(I->first, NextI->first)) + continue; + + Histories.emplace_back(BranchHistory(I->second.first, + I->second.second, + I->first)); + } +} + +void BranchInfo::print(raw_ostream &OS) const { + OS << From.IsSymbol << " " << From.Name << " " + << Twine::utohexstr(From.Offset) << " " + << To.IsSymbol << " " << To.Name << " " + << Twine::utohexstr(To.Offset) << " " + << Mispreds << " " << Branches; + + if (Histories.size() == 0) { + OS << "\n"; + return; + } + + OS << " " << Histories.size() << "\n"; + for (const auto &H : Histories) { + OS << H.Mispreds << " " << H.Branches << " " << H.Context.size() << "\n"; + for (const auto &C : H.Context) { + OS << C.first.IsSymbol << " " << C.first.Name << " " + << Twine::utohexstr(C.first.Offset) << " " + << C.second.IsSymbol << " " << C.second.Name << " " + << Twine::utohexstr(C.second.Offset) << "\n"; + } + } +} + ErrorOr FuncBranchData::getBranch(uint64_t From, uint64_t To) const { for (const auto &I : Data) { @@ -54,20 +156,46 @@ bool DataReader::expectAndConsumeFS() { return true; } -ErrorOr DataReader::parseString(char EndChar) { - auto StringEnd = ParsingBuf.find(EndChar); +bool DataReader::checkAndConsumeNewLine() { + if (ParsingBuf[0] != '\n') + return false; + + ParsingBuf = ParsingBuf.drop_front(1); + Col = 0; + Line += 1; + return true; +} + +ErrorOr DataReader::parseString(char EndChar, bool EndNl) { + std::string EndChars(1, EndChar); + if (EndNl) + EndChars.push_back('\n'); + auto StringEnd = ParsingBuf.find_first_of(EndChars); if 
(StringEnd == StringRef::npos || StringEnd == 0) { reportError("malformed field"); return make_error_code(llvm::errc::io_error); } + StringRef Str = ParsingBuf.substr(0, StringEnd); - ParsingBuf = ParsingBuf.drop_front(StringEnd + 1); - Col += StringEnd + 1; + + // If EndNl was set and nl was found instead of EndChar, do not consume the + // new line. + bool EndNlInstreadOfEndChar = + ParsingBuf[StringEnd] == '\n' && EndChar != '\n'; + unsigned End = EndNlInstreadOfEndChar ? StringEnd : StringEnd + 1; + + ParsingBuf = ParsingBuf.drop_front(End); + if (EndChar == '\n') { + Col = 0; + Line += 1; + } else { + Col += End; + } return Str; } -ErrorOr DataReader::parseNumberField(char EndChar) { - auto NumStrRes = parseString(EndChar); +ErrorOr DataReader::parseNumberField(char EndChar, bool EndNl) { + auto NumStrRes = parseString(EndChar, EndNl); if (std::error_code EC = NumStrRes.getError()) return EC; StringRef NumStr = NumStrRes.get(); @@ -80,7 +208,7 @@ ErrorOr DataReader::parseNumberField(char EndChar) { return Num; } -ErrorOr DataReader::parseLocation() { +ErrorOr DataReader::parseLocation(char EndChar, bool EndNl) { // Read whether the location of the branch should be DSO or a symbol // 0 means it is a DSO. 1 means it is a global symbol. 2 means it is a local // symbol. 
@@ -103,7 +231,7 @@ ErrorOr DataReader::parseLocation() { StringRef Name = NameRes.get(); // Read the offset - auto OffsetStrRes = parseString(FieldSeparator); + auto OffsetStrRes = parseString(EndChar, EndNl); if (std::error_code EC = OffsetStrRes.getError()) return EC; StringRef OffsetStr = OffsetStrRes.get(); @@ -117,13 +245,49 @@ ErrorOr DataReader::parseLocation() { return Location(IsSymbol, Name, Offset); } +ErrorOr DataReader::parseBranchHistory() { + auto MRes = parseNumberField(FieldSeparator); + if (std::error_code EC = MRes.getError()) + return EC; + int64_t NumMispreds = MRes.get(); + + auto BRes = parseNumberField(FieldSeparator); + if (std::error_code EC = BRes.getError()) + return EC; + int64_t NumBranches = BRes.get(); + + auto LRes = parseNumberField('\n'); + if (std::error_code EC = LRes.getError()) + return EC; + int64_t ContextLength = LRes.get(); + assert(ContextLength > 0 && "found branch context with length 0"); + + BranchContext Context; + for (unsigned i = 0; i < ContextLength; ++i) { + auto Res = parseLocation(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + Location CtxFrom = Res.get(); + + Res = parseLocation('\n'); + if (std::error_code EC = Res.getError()) + return EC; + Location CtxTo = Res.get(); + + Context.emplace_back(std::make_pair(std::move(CtxFrom), + std::move(CtxTo))); + } + + return BranchHistory(NumMispreds, NumBranches, std::move(Context)); +} + ErrorOr DataReader::parseBranchInfo() { - auto Res = parseLocation(); + auto Res = parseLocation(FieldSeparator); if (std::error_code EC = Res.getError()) return EC; Location From = Res.get(); - Res = parseLocation(); + Res = parseLocation(FieldSeparator); if (std::error_code EC = Res.getError()) return EC; Location To = Res.get(); @@ -133,12 +297,32 @@ ErrorOr DataReader::parseBranchInfo() { return EC; int64_t NumMispreds = MRes.get(); - auto BRes = parseNumberField('\n'); + auto BRes = parseNumberField(FieldSeparator, /* EndNl = */ true); if 
(std::error_code EC = BRes.getError()) return EC; int64_t NumBranches = BRes.get(); - return BranchInfo(std::move(From), std::move(To), NumMispreds, NumBranches); + BranchHistories Histories; + + if (!checkAndConsumeNewLine()) { + auto HRes = parseNumberField('\n'); + if (std::error_code EC = HRes.getError()) + return EC; + int64_t NumHistories = HRes.get(); + assert(NumHistories > 0 && "found branch history list with length 0"); + + for (unsigned i = 0; i < NumHistories; ++i) { + auto Res = parseBranchHistory(); + if (std::error_code EC = Res.getError()) + return EC; + BranchHistory Hist = Res.get(); + + Histories.emplace_back(std::move(Hist)); + } + } + + return BranchInfo(std::move(From), std::move(To), NumMispreds, NumBranches, + std::move(Histories)); } bool DataReader::hasData() { @@ -170,8 +354,6 @@ std::error_code DataReader::parse() { auto Res = parseBranchInfo(); if (std::error_code EC = Res.getError()) return EC; - Col = 0; - Line += 1; BranchInfo BI = Res.get(); @@ -218,11 +400,25 @@ void DataReader::dump() const { for (const auto &BI : Func.getValue().Data) { Diag << BI.From.Name << " " << BI.From.Offset << " " << BI.To.Name << " " << BI.To.Offset << " " << BI.Mispreds << " " << BI.Branches << "\n"; + for (const auto &HI : BI.Histories) { + Diag << "\thistory " << HI.Mispreds << " " << HI.Branches << "\n"; + for (const auto &CI : HI.Context) { + Diag << "\t" << CI.first.Name << " " << CI.first.Offset << " " + << CI.second.Name << " " << CI.second.Offset << "\n"; + } + } } Diag << Func.getKey() << " entry points:\n"; for (const auto &BI : Func.getValue().EntryData) { Diag << BI.From.Name << " " << BI.From.Offset << " " << BI.To.Name << " " << BI.To.Offset << " " << BI.Mispreds << " " << BI.Branches << "\n"; + for (const auto &HI : BI.Histories) { + Diag << "\thistory " << HI.Mispreds << " " << HI.Branches << "\n"; + for (const auto &CI : HI.Context) { + Diag << "\t" << CI.first.Name << " " << CI.first.Offset << " " + << CI.second.Name << " " << 
CI.second.Offset << "\n"; + } + } } } } diff --git a/bolt/DataReader.h b/bolt/DataReader.h index 7f74988d160d..39d02e9799b0 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -55,15 +55,30 @@ struct Location { } }; +typedef std::vector> BranchContext; + +struct BranchHistory { + int64_t Mispreds; + int64_t Branches; + BranchContext Context; + + BranchHistory(int64_t Mispreds, int64_t Branches, BranchContext Context) + : Mispreds(Mispreds), Branches(Branches), Context(std::move(Context)) {} +}; + +typedef std::vector BranchHistories; + struct BranchInfo { Location From; Location To; int64_t Mispreds; int64_t Branches; + BranchHistories Histories; - BranchInfo(Location From, Location To, int64_t Mispreds, int64_t Branches) + BranchInfo(Location From, Location To, int64_t Mispreds, int64_t Branches, + BranchHistories Histories) : From(std::move(From)), To(std::move(To)), Mispreds(Mispreds), - Branches(Branches) {} + Branches(Branches), Histories(std::move(Histories)) {} bool operator==(const BranchInfo &RHS) const { return From == RHS.From && @@ -79,6 +94,12 @@ struct BranchInfo { return false; } + + /// Merges the branch and misprediction counts as well as the histories of BI + /// with those of this object. + void mergeWith(const BranchInfo &BI); + + void print(raw_ostream &OS) const; }; struct FuncBranchData { @@ -120,17 +141,43 @@ class DataReader { /// /// /// - /// + /// [ + /// + /// + /// ...] + /// + /// Each history entry follows the syntax below. + /// + /// + /// + /// + /// ... /// /// In field we record 0 if our closest address is a DSO load /// address or 1 if our closest address is an ELF symbol. /// - /// Example: + /// Examples: /// /// 1 main 3fb 0 /lib/ld-2.21.so 12 4 221 /// /// The example records branches from symbol main, offset 3fb, to DSO ld-2.21, - /// offset 12, with 4 mispredictions and 221 branches + /// offset 12, with 4 mispredictions and 221 branches. No history is provided. 
+ /// + /// 2 t2.c/func 11 1 globalfunc 1d 0 1775 2 + /// 0 1002 2 + /// 2 t2.c/func 31 2 t2.c/func d + /// 2 t2.c/func 18 2 t2.c/func 20 + /// 0 773 2 + /// 2 t2.c/func 71 2 t2.c/func d + /// 2 t2.c/func 18 2 t2.c/func 60 + /// + /// The example records branches from local symbol func (from t2.c), offset + /// 11, to global symbol globalfunc, offset 1d, with 1775 branches, no + /// mispreds. Of these branches, 1002 were preceded by a sequence of + /// branches from func, offset 18 to offset 20 and then from offset 31 to + /// offset d. The remaining 773 branches were preceded by a different sequence + /// of branches, from func, offset 18 to offset 60 and then from offset 71 to + /// offset d. std::error_code parse(); ErrorOr getFuncBranchData( @@ -147,9 +194,11 @@ class DataReader { void reportError(StringRef ErrorMsg); bool expectAndConsumeFS(); - ErrorOr parseString(char EndChar); - ErrorOr parseNumberField(char EndChar); - ErrorOr parseLocation(); + bool checkAndConsumeNewLine(); + ErrorOr parseString(char EndChar, bool EndNl=false); + ErrorOr parseNumberField(char EndChar, bool EndNl=false); + ErrorOr parseLocation(char EndChar, bool EndNl=false); + ErrorOr parseBranchHistory(); ErrorOr parseBranchInfo(); bool hasData(); diff --git a/bolt/merge-fdata/merge-fdata.cpp b/bolt/merge-fdata/merge-fdata.cpp index 4f9e292ef55e..83e3fd338fa9 100644 --- a/bolt/merge-fdata/merge-fdata.cpp +++ b/bolt/merge-fdata/merge-fdata.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // merge-fdata 1.fdata 2.fdata 3.fdata > merged.fdata -// +// //===----------------------------------------------------------------------===// #include "../DataReader.h" @@ -99,6 +99,27 @@ int main(int argc, char **argv) { std::vector &BIData) { auto FromNamePtr = MergedStringPool.intern(BI.From.Name); auto ToNamePtr = MergedStringPool.intern(BI.To.Name); + BranchHistories Histories; + for (const auto &HI : BI.Histories) { + BranchContext Context; + 
for (const auto &CI : HI.Context) { + const auto &CtxFrom = CI.first; + const auto CtxFromNamePtr = MergedStringPool.intern(CtxFrom.Name); + const auto &CtxTo = CI.second; + const auto CtxToNamePtr = MergedStringPool.intern(CtxTo.Name); + Context.emplace_back(std::make_pair(Location(CtxFrom.IsSymbol, + *CtxFromNamePtr, + CtxFrom.Offset), + Location(CtxTo.IsSymbol, + *CtxToNamePtr, + CtxTo.Offset))); + AllStrings.emplace_back(CtxFromNamePtr); // keep the reference + AllStrings.emplace_back(CtxToNamePtr); // keep the reference + } + Histories.emplace_back(BranchHistory(HI.Mispreds, + HI.Branches, + std::move(Context))); + } BIData.emplace_back(BranchInfo(Location(BI.From.IsSymbol, *FromNamePtr, BI.From.Offset), @@ -106,9 +127,33 @@ int main(int argc, char **argv) { *ToNamePtr, BI.To.Offset), BI.Mispreds, - BI.Branches)); + BI.Branches, + std::move(Histories))); + AllStrings.emplace_back(FromNamePtr); // keep the reference + AllStrings.emplace_back(ToNamePtr); // keep the reference + }; + + // Simply replace string references in BranchInfo with internal storage + // references. 
+ auto replaceStringRefs = [&] (BranchInfo &BI) { + auto FromNamePtr = MergedStringPool.intern(BI.From.Name); + BI.From.Name = *FromNamePtr; AllStrings.emplace_back(FromNamePtr); // keep the reference + + auto ToNamePtr = MergedStringPool.intern(BI.To.Name); + BI.To.Name = *ToNamePtr; AllStrings.emplace_back(ToNamePtr); // keep the reference + + for (auto &HI : BI.Histories) { + for (auto &CI : HI.Context) { + auto CtxFromNamePtr = MergedStringPool.intern(CI.first.Name); + CI.first.Name = *CtxFromNamePtr; + AllStrings.emplace_back(CtxFromNamePtr); // keep the reference + auto CtxToNamePtr = MergedStringPool.intern(CI.second.Name); + CI.second.Name = *CtxToNamePtr; + AllStrings.emplace_back(CtxToNamePtr); // keep the reference + } + } }; for (auto &InputDataFilename : opts::InputDataFilenames) { @@ -134,8 +179,8 @@ int main(int argc, char **argv) { MI->second.Data.end(), BI); if (TI != MI->second.Data.end() && *TI == BI) { - TI->Branches += BI.Branches; - TI->Mispreds += BI.Mispreds; + replaceStringRefs(BI); + TI->mergeWith(BI); } else { CopyBranchInfo(BI, TmpBI); } @@ -145,8 +190,7 @@ int main(int argc, char **argv) { BranchInfo *PrevBI = nullptr; for (auto &BI : TmpBI) { if (PrevBI && *PrevBI == BI) { - PrevBI->Branches += BI.Branches; - PrevBI->Mispreds += BI.Mispreds; + PrevBI->mergeWith(BI); } else { MI->second.Data.emplace_back(BI); PrevBI = &MI->second.Data.back(); @@ -167,8 +211,8 @@ int main(int argc, char **argv) { BranchInfo *PrevBI = nullptr; for (auto &BI : FI.second.Data) { if (PrevBI && *PrevBI == BI) { - PrevBI->Branches += BI.Branches; - PrevBI->Mispreds += BI.Mispreds; + replaceStringRefs(BI); + PrevBI->mergeWith(BI); } else { CopyBranchInfo(BI, MI->second.Data); PrevBI = &MI->second.Data.back(); @@ -180,13 +224,9 @@ int main(int argc, char **argv) { if (!opts::SuppressMergedDataOutput) { // Print all the data in the original format - for (auto &FDI : MergedFunctionsData) { - for (auto &BD : FDI.second.Data) { - outs() << BD.From.IsSymbol << " " << 
FDI.first() << " " - << Twine::utohexstr(BD.From.Offset) << " " - << BD.To.IsSymbol << " " << BD.To.Name << " " - << Twine::utohexstr(BD.To.Offset) << " " - << BD.Mispreds << " " << BD.Branches << '\n'; + for (const auto &FDI : MergedFunctionsData) { + for (const auto &BD : FDI.second.Data) { + BD.print(outs()); } } } From bd7bc669ef340904e60f1611a619a1475778cfb0 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 1 Jul 2016 08:40:56 -0700 Subject: [PATCH 131/904] Add option to dump CFGs in (simple) graphviz format during all passes. Summary: I noticed the BinaryFunction::viewGraph() method that hadn't been implemented and decided I could use a simple DOT dumper for CFGs while working on the indirect call optimization. I've implemented the bare minimum for the dumper. It's just nodes+BB labels with dges. We can add more detailed information as needed/desired. (cherry picked from commit 1974e68116c25d2616cccd9539608cfabaa18dc0) --- bolt/BinaryFunction.cpp | 71 ++++++++++++++++++++++++++++++++++++++++ bolt/BinaryFunction.h | 15 +++++++-- bolt/BinaryPasses.cpp | 11 +++++++ bolt/RewriteInstance.cpp | 8 +++++ 4 files changed, 103 insertions(+), 2 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index a0fdf052ed02..fff33e4d8850 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -24,6 +24,7 @@ #include "llvm/Object/ObjectFile.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -1316,6 +1317,76 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool Split) { fixBranches(); } +namespace { + +#ifndef MAX_PATH +#define MAX_PATH 255 +#endif + +std::string constructFilename(std::string Filename, + std::string Annotation, + std::string Suffix) { + std::replace(Filename.begin(), Filename.end(), '/', '-'); + if (!Annotation.empty()) { + Annotation.insert(0, "-"); + } + if (Filename.size() + Annotation.size() + 
Suffix.size() > MAX_PATH) { + assert(Suffix.size() + Annotation.size() <= MAX_PATH); + dbgs() << "BOLT-WARNING: Filename \"" << Filename << Annotation << Suffix + << "\" exceeds the " << MAX_PATH << " size limit, truncating.\n"; + Filename.resize(MAX_PATH - (Suffix.size() + Annotation.size())); + } + Filename += Annotation; + Filename += Suffix; + return Filename; +} + +} + +void BinaryFunction::dumpGraph(raw_ostream& OS) const { + OS << "strict digraph \"" << getName() << "\" {\n"; + for (auto *BB : BasicBlocks) { + for (auto *Succ : BB->successors()) { + OS << "\"" << BB->getName() << "\" -> " + << "\"" << Succ->getName() << "\"\n"; + } + } + OS << "}\n"; +} + +void BinaryFunction::viewGraph() const { + SmallString Filename; + if (auto EC = sys::fs::createTemporaryFile("bolt-cfg", "dot", Filename)) { + dbgs() << "BOLT-WARNING: " << EC.message() << ", unable to create " + << " bolt-cfg-XXXXX.dot temporary file.\n"; + return; + } + dumpGraphToFile(Filename.str()); + if (DisplayGraph(Filename)) { + dbgs() << "BOLT-WARNING: Can't display " << Filename + << " with graphviz.\n"; + } + if (auto EC = sys::fs::remove(Filename)) { + dbgs() << "BOLT-WARNING: " << EC.message() << ", failed to remove " + << Filename.str() << "\n"; + } +} + +void BinaryFunction::dumpGraphForPass(std::string Annotation) const { + dumpGraphToFile(constructFilename(getName(), Annotation, ".dot")); +} + +void BinaryFunction::dumpGraphToFile(std::string Filename) const { + std::error_code EC; + raw_fd_ostream of(Filename, EC, sys::fs::F_None); + if (EC) { + dbgs() << "BOLT-WARNING: " << EC.message() << ", unable to open " + << Filename << " for output.\n"; + return; + } + dumpGraph(of); +} + const BinaryBasicBlock * BinaryFunction::getOriginalLayoutSuccessor(const BinaryBasicBlock *BB) const { auto I = std::upper_bound(begin(), end(), *BB); diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 93dcf84897fe..9c2a54e62c43 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ 
-392,9 +392,20 @@ class BinaryFunction : public AddressRangesOwner { void modifyLayout(LayoutType Type, bool Split); /// View CFG in graphviz program - void viewGraph(); + void viewGraph() const; - /// Get basic block index assuming it belongs to this function. + /// Dump CFG in graphviz format + void dumpGraph(raw_ostream& OS) const; + + /// Dump CFG in graphviz format to file. + void dumpGraphToFile(std::string Filename) const; + + /// Dump CFG in graphviz format to a file with a filename that is derived + /// from the function name and Annotation strings. Useful for dumping the + /// CFG after an optimization pass. + void dumpGraphForPass(std::string Annotation = "") const; + + /// Get basic block index assuming it belongs to this function. unsigned getIndex(const BinaryBasicBlock *BB) const { assert(BasicBlockIndices.find(BB) != BasicBlockIndices.end()); return BasicBlockIndices.find(BB)->second; diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index 5cc2bc6691e8..b525972d89e9 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -17,6 +17,7 @@ namespace opts { extern llvm::cl::opt PrintAll; +extern llvm::cl::opt DumpDotAll; extern llvm::cl::opt PrintReordered; extern llvm::cl::opt PrintEHRanges; extern llvm::cl::opt PrintUCE; @@ -343,6 +344,9 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) { if (opts::PrintAll || opts::PrintUCE) Function.print(errs(), "after unreachable code elimination", true); + + if (opts::DumpDotAll) + Function.dumpGraphForPass("unreachable-code"); } } @@ -379,6 +383,8 @@ void ReorderBasicBlocks::runOnFunctions( Function.modifyLayout(opts::ReorderBlocks, ShouldSplit); if (opts::PrintAll || opts::PrintReordered) Function.print(errs(), "after reordering blocks", true); + if (opts::DumpDotAll) + Function.dumpGraphForPass("reordering"); } } } @@ -409,6 +415,8 @@ void FixupFunctions::runOnFunctions( Function.updateEHRanges(); if (opts::PrintAll || opts::PrintEHRanges) Function.print(errs(), 
"after updating EH ranges", true); + if (opts::DumpDotAll) + Function.dumpGraphForPass("update-EH-ranges"); } } @@ -519,6 +527,9 @@ void SimplifyConditionalTailCalls::runOnFunctions( if (opts::PrintAll || opts::PrintReordered) { Function.print(errs(), "after tail call patching", true); } + if (opts::DumpDotAll) { + Function.dumpGraphForPass("tail-call-patching"); + } } } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index ef9801f09b6e..3d80d5ef3e7f 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -148,6 +148,11 @@ cl::opt PrintAll("print-all", cl::desc("print functions after each stage"), cl::Hidden); +cl::opt +DumpDotAll("dump-dot-all", + cl::desc("dump function CFGs to graphviz format after each stage"), + cl::Hidden); + static cl::opt PrintCFG("print-cfg", cl::desc("print functions after CFG construction"), cl::Hidden); @@ -913,6 +918,9 @@ void RewriteInstance::disassembleFunctions() { if (opts::PrintAll || opts::PrintCFG) Function.print(errs(), "after building cfg", true); + if (opts::DumpDotAll) + Function.dumpGraphForPass("build-cfg"); + TotalScore += Function.getFunctionScore(); } // Iterate over all functions From 9390913ca35f4a5c1df0549647c3e5d082aa539d Mon Sep 17 00:00:00 2001 From: Theodoros Kasampalis Date: Sun, 3 Jul 2016 21:30:35 -0700 Subject: [PATCH 132/904] Fix in inferFallthroughCounts Summary: This fixes the initialization of basic block execution counts, where we should skip edges to the first basic block but we were not skipping the corresponding profile info. Also, I removed a check that was done twice. 
(cherry picked from commit 67e26151b4d34852ea9315c29a1388e2bbac7a14) --- bolt/BinaryFunction.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index fff33e4d8850..0784c2351714 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1003,8 +1003,10 @@ void BinaryFunction::inferFallThroughCounts() { for (auto Succ : CurBB->successors()) { // Do not update execution count of the entry block (when we have tail // calls). We already accounted for those when computing the func count. - if (Succ == *BasicBlocks.begin()) + if (Succ == *BasicBlocks.begin()) { + ++SuccCount; continue; + } if (SuccCount->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) Succ->ExecutionCount += SuccCount->Count; ++SuccCount; @@ -1067,9 +1069,6 @@ void BinaryFunction::inferFallThroughCounts() { << Twine::utohexstr(getAddress() + CurBB->getOffset()) << '\n'; }); - // Put this information into the fall-through edge - if (CurBB->succ_size() == 0) - continue; // If there is a FT, the last successor will be it. auto &SuccCount = CurBB->BranchInfo.back(); auto &Succ = CurBB->Successors.back(); From 0c5ae8db2076154e6e5b862f463fa9294d270f88 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 7 Jul 2016 11:48:50 -0700 Subject: [PATCH 133/904] Use unordered_map instead of map in ReorderAlgorithm and BinaryFunction::BasicBlockIndices. Summary: Use unordered_map instead of map in ReorderAlgorithm and BinaryFunction::BasicBlockIndices. Cuts about 30sec off the processing time for the hhvm binary. 
(~8.5 min to ~8min) (cherry picked from commit 031c8491032640a45d5910ecf3a5a7a63be70d57) --- bolt/BinaryFunction.h | 4 ++-- bolt/ReorderAlgorithm.cpp | 30 ++++++++++++++++++++++++++---- bolt/ReorderAlgorithm.h | 4 ++-- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 9c2a54e62c43..35ab0ba8ed6a 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -36,7 +36,7 @@ #include "llvm/Support/Dwarf.h" #include "llvm/Support/raw_ostream.h" #include -#include +#include #include using namespace llvm::object; @@ -259,7 +259,7 @@ class BinaryFunction : public AddressRangesOwner { // Map that keeps track of the index of each basic block in the BasicBlocks // vector. Used to make getIndex fast. - std::map BasicBlockIndices; + std::unordered_map BasicBlockIndices; // At each basic block entry we attach a CFI state to detect if reordering // corrupts the CFI state for a block. The CFI state is simply the index in diff --git a/bolt/ReorderAlgorithm.cpp b/bolt/ReorderAlgorithm.cpp index 8465b9aff4d1..23d1c31a5fe3 100644 --- a/bolt/ReorderAlgorithm.cpp +++ b/bolt/ReorderAlgorithm.cpp @@ -16,6 +16,7 @@ #include "BinaryFunction.h" #include "llvm/Support/CommandLine.h" #include +#include using namespace llvm; using namespace bolt; @@ -27,6 +28,26 @@ PrintClusters("print-clusters", cl::desc("print clusters"), cl::Optional); } // namespace opts +namespace { + +template +inline void hashCombine(size_t &Seed, const T &Val) { + std::hash Hasher; + Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2); +} + +template +struct HashPair { + size_t operator()(const std::pair& Val) const { + std::hash Hasher; + size_t Seed = Hasher(Val.first); + hashCombine(Seed, Val.second); + return Seed; + } +}; + +} + void ClusterAlgorithm::computeClusterAverageFrequency() { AvgFreq.resize(Clusters.size(), 0.0); for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { @@ -70,7 +91,8 @@ void 
GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF) { // Encode an edge between two basic blocks, source and destination typedef std::pair EdgeTy; - std::map Weight; + typedef HashPair Hasher; + std::unordered_map Weight; // Define a comparison function to establish SWO between edges auto Comp = [&] (EdgeTy A, EdgeTy B) { @@ -88,7 +110,7 @@ void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF) { }; std::priority_queue, decltype(Comp)> Queue(Comp); - typedef std::map BBToClusterMapTy; + typedef std::unordered_map BBToClusterMapTy; BBToClusterMapTy BBToClusterMap; ClusterEdges.resize(BF.layout_size()); @@ -162,7 +184,7 @@ void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF) { void OptimalReorderAlgorithm::reorderBasicBlocks( const BinaryFunction &BF, BasicBlockOrder &Order) const { std::vector> Weight; - std::map BBToIndex; + std::unordered_map BBToIndex; std::vector IndexToBB; unsigned N = BF.layout_size(); @@ -280,7 +302,7 @@ void OptimizeBranchReorderAlgorithm::reorderBasicBlocks( // Cluster basic blocks. CAlgo->clusterBasicBlocks(BF); std::vector &Clusters = CAlgo->Clusters;; - std::vector> &ClusterEdges = CAlgo->ClusterEdges; + auto &ClusterEdges = CAlgo->ClusterEdges; // Compute clusters' average frequencies. 
CAlgo->computeClusterAverageFrequency(); diff --git a/bolt/ReorderAlgorithm.h b/bolt/ReorderAlgorithm.h index 9ea30ed19f81..4a4947c662ef 100644 --- a/bolt/ReorderAlgorithm.h +++ b/bolt/ReorderAlgorithm.h @@ -15,7 +15,7 @@ #define LLVM_TOOLS_LLVM_BOLT_REORDER_ALGORITHM_H #include "llvm/Support/ErrorHandling.h" -#include +#include #include #include @@ -35,7 +35,7 @@ class ClusterAlgorithm { public: typedef std::vector ClusterTy; std::vector Clusters; - std::vector> ClusterEdges; + std::vector> ClusterEdges; std::vector AvgFreq; /// Group the basic blocks the given function into clusters stored in the From 788a287bab408de37deb40c81b9a9b5a2861cebd Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 7 Jul 2016 21:43:43 -0700 Subject: [PATCH 134/904] Store index inside BinaryBasicBlock instead of in map on BinaryFunction. Summary: Store the basic block index inside the BinaryBasicBlock instead of a map in BinaryFunction. This cut another 15-20 sec. from the processing time for hhvm. (cherry picked from commit 2ba0e71791e7a7f100c1548fe183c2c58f0756d7) --- bolt/BinaryBasicBlock.h | 3 +++ bolt/BinaryFunction.h | 12 ++++-------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index ea292f26edd2..b564078faca3 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -56,6 +56,9 @@ class BinaryBasicBlock { /// Alignment requirements for the block. uint64_t Alignment{1}; + /// Index to BasicBlocks vector in BinaryFunction. + unsigned Index{~0u}; + /// Number of pseudo instructions in this block. uint32_t NumPseudos{0}; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 35ab0ba8ed6a..446f791b4ccf 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -257,10 +257,6 @@ class BinaryFunction : public AddressRangesOwner { BasicBlockListType BasicBlocks; BasicBlockOrderType BasicBlocksLayout; - // Map that keeps track of the index of each basic block in the BasicBlocks - // vector. 
Used to make getIndex fast. - std::unordered_map BasicBlockIndices; - // At each basic block entry we attach a CFI state to detect if reordering // corrupts the CFI state for a block. The CFI state is simply the index in // FrameInstructions for the CFI responsible for creating this state. @@ -405,10 +401,10 @@ class BinaryFunction : public AddressRangesOwner { /// CFG after an optimization pass. void dumpGraphForPass(std::string Annotation = "") const; - /// Get basic block index assuming it belongs to this function. + /// Get basic block index assuming it belongs to this function. unsigned getIndex(const BinaryBasicBlock *BB) const { - assert(BasicBlockIndices.find(BB) != BasicBlockIndices.end()); - return BasicBlockIndices.find(BB)->second; + assert(BB->Index < BasicBlocks.size()); + return BB->Index; } /// Returns the n-th basic block in this function in its original layout, or @@ -563,7 +559,7 @@ class BinaryFunction : public AddressRangesOwner { BB->setAlignment(std::min(DerivedAlignment, uint64_t(32))); } - BasicBlockIndices[BB] = BasicBlocks.size() - 1; + BB->Index = BasicBlocks.size() - 1; return BB; } From 266ca89e52efbc14d95c1441bc697748779e12f3 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 11 Jul 2016 18:51:13 -0700 Subject: [PATCH 135/904] Create alternative name for local symbols. Summary: If a profile data was collected on a stripped binary but an input to BOLT is unstripped, we would use a different mangling scheme for local functions and ignore their profiles. To solve the issue this diff adds alternative name for all local functions such that one of the names would match the name in the profile. If the input binary was stripped, we reject it, unless "-allow-stripped" option was passed. It's more complicated to do a matching in this case since we have less information than at the time of profile collection. 
It's also not that simple to tell if the profile was gathered on a stripped binary (in which case we would have no issue matching data). (cherry picked from commit 69e797fc8e0a13f86bdc2c8802c579798a9eccb7) --- bolt/BinaryContext.h | 9 ++++ bolt/DataReader.cpp | 9 ++++ bolt/DataReader.h | 7 +++ bolt/RewriteInstance.cpp | 106 ++++++++++++++++++++++++++------------- 4 files changed, 96 insertions(+), 35 deletions(-) diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 867181b164e6..c7f756727453 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -150,6 +150,15 @@ class BinaryContext { /// return the first one. MCSymbol *getOrCreateGlobalSymbol(uint64_t Address, Twine Prefix); + /// Register a symbol with \p Name at a given \p Address. + void registerNameAtAddress(const std::string &Name, uint64_t Address) { + // Add the name to global symbols map. + GlobalSymbols[Name] = Address; + + // Add to the reverse map. There could be multiple names at the same address. + GlobalAddresses.emplace(std::make_pair(Address, Name)); + } + /// Populate some internal data structures with debug info. 
void preprocessDebugInfo( std::map &BinaryFunctions); diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index cfc3a41ae006..4cdeefa08eab 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -394,6 +394,15 @@ DataReader::getFuncBranchData(const std::vector &FuncNames) const { return make_error_code(llvm::errc::invalid_argument); } +bool DataReader::hasLocalsWithFileName() const { + for (const auto &Func : FuncsMap) { + const auto &FuncName = Func.getKey(); + if (FuncName.count('/') == 2 && FuncName[0] != '/') + return true; + } + return false; +} + void DataReader::dump() const { for (const auto &Func : FuncsMap) { Diag << Func.getKey() << " branches:\n"; diff --git a/bolt/DataReader.h b/bolt/DataReader.h index 39d02e9799b0..692bc4e41fb9 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -187,6 +187,13 @@ class DataReader { FuncsMapType &getAllFuncsData() { return FuncsMap; } + const FuncsMapType &getAllFuncsData() const { return FuncsMap; } + + /// Return true if profile contains an entry for a local function + /// that has a non-empty associated file name. + bool hasLocalsWithFileName() const; + + /// Dumps the entire data structures parsed. Used for debugging. void dump() const; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 3d80d5ef3e7f..2221e2dddca0 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -181,6 +181,11 @@ KeepTmp("keep-tmp", cl::desc("preserve intermediate .o file"), cl::Hidden); +cl::opt +AllowStripped("allow-stripped", + cl::desc("allow processing of stripped binaries"), + cl::Hidden); + // Check against lists of functions from options if we should // optimize the function with a given name. 
bool shouldProcess(const BinaryFunction &Function) { @@ -611,6 +616,7 @@ void RewriteInstance::run() { void RewriteInstance::discoverFileObjects() { std::string FileSymbolName; + bool SeenFileName = false; FileSymRefs.clear(); BinaryFunctions.clear(); @@ -623,12 +629,13 @@ void RewriteInstance::discoverFileObjects() { if (Symbol.getFlags() & SymbolRef::SF_Undefined) continue; - ErrorOr Name = Symbol.getName(); - check_error(Name.getError(), "cannot get symbol name"); + ErrorOr NameOrError = Symbol.getName(); + check_error(NameOrError.getError(), "cannot get symbol name"); if (Symbol.getType() == SymbolRef::ST_File) { // Could be used for local symbol disambiguation. - FileSymbolName = *Name; + FileSymbolName = *NameOrError; + SeenFileName = true; continue; } @@ -645,43 +652,61 @@ void RewriteInstance::discoverFileObjects() { // There's nothing horribly wrong with anonymous symbols, but let's // ignore them for now. - if (Name->empty()) + if (NameOrError->empty()) continue; + /// It is possible we are seeing a globalized local. LLVM might treat it as + /// a local if it has a "private global" prefix, e.g. ".L". Thus we have to + /// change the prefix to enforce global scope of the symbol. + std::string Name = + NameOrError->startswith(BC->AsmInfo->getPrivateGlobalPrefix()) + ? "PG." + std::string(*NameOrError) + : std::string(*NameOrError); + // Disambiguate all local symbols before adding to symbol table. - // Since we don't know if we'll see a global with the same name, + // Since we don't know if we will see a global with the same name, // always modify the local name. + // + // NOTE: the naming convention for local symbols should match + // the one we use for profile data. 
std::string UniqueName; + std::string AlternativeName; if (Symbol.getFlags() & SymbolRef::SF_Global) { - assert(BC->GlobalSymbols.find(*Name) == BC->GlobalSymbols.end() && + assert(BC->GlobalSymbols.find(Name) == BC->GlobalSymbols.end() && "global name not unique"); - UniqueName = *Name; - /// It's possible we are seeing a globalized local. LLVM might treat it as - /// local if it has a "private global" prefix, e.g. ".L". Thus we have to - /// change the prefix to enforce global scope of the symbol. - if (StringRef(UniqueName) - .startswith(BC->AsmInfo->getPrivateGlobalPrefix())) - UniqueName = "PG." + UniqueName; + UniqueName = Name; } else { - unsigned LocalCount = 1; - std::string LocalName = (*Name).str() + "/" + FileSymbolName + "/"; - - if ((*Name).startswith(BC->AsmInfo->getPrivateGlobalPrefix())) { - LocalName = "PG." + LocalName; - } - - while (BC->GlobalSymbols.find(LocalName + std::to_string(LocalCount)) != - BC->GlobalSymbols.end()) { - ++LocalCount; - } - UniqueName = LocalName + std::to_string(LocalCount); + // If we have a local file name, we should create 2 variants for the + // function name. The reason is that perf profile might have been + // collected on a binary that did not have the local file name (e.g. as + // a side effect of stripping debug info from the binary): + // + // primary: / + // alternative: // + // + // The field is used for disambiguation of local symbols since there + // could be identical function names coming from identical file names + // (e.g. from different directories). 
+ std::string Prefix = Name + "/"; + std::string AltPrefix; + if (!FileSymbolName.empty()) + AltPrefix = Prefix + FileSymbolName + "/"; + + auto uniquifyName = [&] (std::string NamePrefix) { + unsigned LocalID = 1; + while (BC->GlobalSymbols.find(NamePrefix + std::to_string(LocalID)) + != BC->GlobalSymbols.end()) + ++LocalID; + return NamePrefix + std::to_string(LocalID); + }; + UniqueName = uniquifyName(Prefix); + if (!AltPrefix.empty()) + AlternativeName = uniquifyName(AltPrefix); } - // Add the name to global symbols map. - BC->GlobalSymbols[UniqueName] = Address; - - // Add to the reverse map. There could multiple names at the same address. - BC->GlobalAddresses.emplace(std::make_pair(Address, UniqueName)); + BC->registerNameAtAddress(UniqueName, Address); + if (!AlternativeName.empty()) + BC->registerNameAtAddress(AlternativeName, Address); // Only consider ST_Function symbols for functions. Although this // assumption could be broken by assembly functions for which the type @@ -750,13 +775,24 @@ void RewriteInstance::discoverFileObjects() { } BFI->second.addAlternativeName(UniqueName); } else { - // Create the function and add to the map. - BinaryFunctions.emplace( + // Create the function and add it to the map. + auto Result = BinaryFunctions.emplace( Address, - BinaryFunction(UniqueName, Symbol, *Section, Address, - SymbolSize, *BC, IsSimple) - ); + BinaryFunction(UniqueName, Symbol, *Section, Address, SymbolSize, + *BC, IsSimple)); + BFI = Result.first; } + if (!AlternativeName.empty()) + BFI->second.addAlternativeName(AlternativeName); + } + + if (!SeenFileName && BC->DR.hasLocalsWithFileName() && !opts::AllowStripped) { + errs() << "BOLT-ERROR: input binary does not have local file symbols " + "but profile data includes function names with embedded file " + "names. It appears that the input binary was stripped while a " + "profiled binary was not. 
If you know what you are doing and " + "wish to proceed, use -allow-stripped option.\n"; + exit(1); } } From 967fad6ca16c62a392c6f5cf7555bc01e2b0f074 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 12 Jul 2016 16:43:53 -0700 Subject: [PATCH 136/904] Fix crash in patchELFPHDRTable when no functions are modified. Summary: patchELFPHDRTable was asserting that it could not find an entry for .eh_frame_hdr in SectionMapInfo when no functions were modified by BOLT. This just changes code to skip modifying GNU_EH_FRAME program headers when SectionMapInfo is empty. The existing header is copied and written instead. (cherry picked from commit e3feb4161dc5653acbd8b823699cc440410d426a) --- bolt/RewriteInstance.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 2221e2dddca0..dfd068a087dd 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1697,14 +1697,14 @@ void RewriteInstance::patchELFPHDRTable() { NewPhdr.p_memsz = sizeof(NewPhdr) * Phnum; } else if (Phdr.p_type == ELF::PT_GNU_EH_FRAME) { auto SMII = SectionMM->SectionMapInfo.find(".eh_frame_hdr"); - assert(SMII != SectionMM->SectionMapInfo.end() && - ".eh_frame_hdr could not be found for PT_GNU_EH_FRAME"); - auto &EHFrameHdrSecInfo = SMII->second; - NewPhdr.p_offset = EHFrameHdrSecInfo.FileOffset; - NewPhdr.p_vaddr = EHFrameHdrSecInfo.FileAddress; - NewPhdr.p_paddr = EHFrameHdrSecInfo.FileAddress; - NewPhdr.p_filesz = EHFrameHdrSecInfo.Size; - NewPhdr.p_memsz = EHFrameHdrSecInfo.Size; + if (SMII != SectionMM->SectionMapInfo.end()) { + auto &EHFrameHdrSecInfo = SMII->second; + NewPhdr.p_offset = EHFrameHdrSecInfo.FileOffset; + NewPhdr.p_vaddr = EHFrameHdrSecInfo.FileAddress; + NewPhdr.p_paddr = EHFrameHdrSecInfo.FileAddress; + NewPhdr.p_filesz = EHFrameHdrSecInfo.Size; + NewPhdr.p_memsz = EHFrameHdrSecInfo.Size; + } } else if (opts::UseGnuStack && Phdr.p_type == ELF::PT_GNU_STACK) { NewPhdr.p_type =
ELF::PT_LOAD; NewPhdr.p_offset = NewTextSegmentOffset; From 382609dc996a5f5aab3f399e77911e4236f8a9ea Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 19 Jul 2016 11:19:18 -0700 Subject: [PATCH 137/904] Shorten instructions if possible. Summary: Generate short versions of branch instructions by default and rely on relaxation to produce longer versions when needed. Also produce short versions of arithmetic instructions if immediate fits into one byte. This was only triggered once on HHVM binary. (cherry picked from commit ed9c4a7dc40d367c4099ddeabbe29d02191fbf6a) --- bolt/BinaryFunction.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 0784c2351714..c224bbcd2eeb 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -480,6 +480,9 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { break; } + // Convert instruction to a shorter version that could be relaxed if needed. + MIA->shortenInstruction(Instruction); + if (MIA->isBranch(Instruction) || MIA->isCall(Instruction)) { uint64_t InstructionTarget = 0; if (MIA->evaluateBranch(Instruction, From d7b674529424eea6609045c84bc15ca2a799f54a Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 31 May 2016 19:12:26 -0700 Subject: [PATCH 138/904] Move debug-handling code into DWARFRewriter (NFC). Summary: RewriteInstance.cpp is getting too big. Split the code. 
(cherry picked from commit 048338fc97d3c533757e33f64feb58f14b2de59c) --- bolt/CMakeLists.txt | 1 + bolt/DWARFRewriter.cpp | 493 +++++++++++++++++++++++++++++++++++++++ bolt/RewriteInstance.cpp | 467 +----------------------------------- 3 files changed, 498 insertions(+), 463 deletions(-) create mode 100644 bolt/DWARFRewriter.cpp diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 53faad6bd59a..c82c8de44e4d 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -25,4 +25,5 @@ add_llvm_tool(llvm-bolt Exceptions.cpp RewriteInstance.cpp ReorderAlgorithm.cpp + DWARFRewriter.cpp ) diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp new file mode 100644 index 000000000000..0a58e789d7de --- /dev/null +++ b/bolt/DWARFRewriter.cpp @@ -0,0 +1,493 @@ +//===--- DWARFRewriter.cpp ------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + + +#include "BinaryBasicBlock.h" +#include "BinaryContext.h" +#include "BinaryFunction.h" +#include "RewriteInstance.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h" +#include "llvm/DebugInfo/DWARF/DWARFFormValue.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDwarf.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Dwarf.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/TimeValue.h" +#include "llvm/Support/Timer.h" +#include + +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt" + +using namespace llvm; +using namespace object; +using namespace bolt; + +void RewriteInstance::updateDebugInfo() { + SectionPatchers[".debug_abbrev"] = llvm::make_unique(); + SectionPatchers[".debug_info"] = llvm::make_unique(); + + updateFunctionRanges(); + + updateAddressRangesObjects(); + + updateEmptyModuleRanges(); + + generateDebugRanges(); + + updateLocationLists(); + + updateDWARFAddressRanges(); +} + +void RewriteInstance::updateEmptyModuleRanges() { + const auto &CUAddressRanges = RangesSectionsWriter.getCUAddressRanges(); + for (const auto &CU : BC->DwCtx->compile_units()) { + if (CUAddressRanges.find(CU->getOffset()) != CUAddressRanges.end()) + continue; + auto const &Ranges = CU->getUnitDIE(true)->getAddressRanges(CU.get()); + for (auto const &Range : Ranges) { + RangesSectionsWriter.AddRange(CU->getOffset(), + Range.first, + Range.second - Range.first); + } + } +} + +void RewriteInstance::updateDWARFAddressRanges() { + // 
Update DW_AT_ranges for all compilation units. + for (const auto &CU : BC->DwCtx->compile_units()) { + const auto CUID = CU->getOffset(); + const auto RSOI = RangesSectionsWriter.getRangesOffsetCUMap().find(CUID); + if (RSOI == RangesSectionsWriter.getRangesOffsetCUMap().end()) + continue; + updateDWARFObjectAddressRanges(RSOI->second, CU.get(), CU->getUnitDIE()); + } + + // Update address ranges of functions. + for (const auto &BFI : BinaryFunctions) { + const auto &Function = BFI.second; + for (const auto DIECompileUnitPair : Function.getSubprogramDIEs()) { + updateDWARFObjectAddressRanges( + Function.getAddressRangesOffset(), + DIECompileUnitPair.second, + DIECompileUnitPair.first); + } + } + + // Update address ranges of DIEs with addresses that don't match functions. + for (auto &DIECompileUnitPair : BC->UnknownFunctions) { + updateDWARFObjectAddressRanges( + RangesSectionsWriter.getEmptyRangesListOffset(), + DIECompileUnitPair.second, + DIECompileUnitPair.first); + } + + // Update address ranges of DWARF block objects (lexical/try/catch blocks, + // inlined subroutine instances, etc). + for (const auto &Obj : BC->AddressRangesObjects) { + updateDWARFObjectAddressRanges( + Obj.getAddressRangesOffset(), + Obj.getCompileUnit(), + Obj.getDIE()); + } +} + +void RewriteInstance::updateDWARFObjectAddressRanges( + uint32_t DebugRangesOffset, + const DWARFUnit *Unit, + const DWARFDebugInfoEntryMinimal *DIE) { + + // Some objects don't have an associated DIE and cannot be updated (such as + // compiler-generated functions). 
+ if (!DIE) { + return; + } + + if (DebugRangesOffset == -1U) { + errs() << "BOLT-WARNING: using invalid DW_AT_range for DIE at offset 0x" + << Twine::utohexstr(DIE->getOffset()) << '\n'; + } + + auto DebugInfoPatcher = + static_cast(SectionPatchers[".debug_info"].get()); + auto AbbrevPatcher = + static_cast(SectionPatchers[".debug_abbrev"].get()); + + assert(DebugInfoPatcher && AbbrevPatcher && "Patchers not initialized."); + + const auto *AbbreviationDecl = DIE->getAbbreviationDeclarationPtr(); + if (!AbbreviationDecl) { + errs() << "BOLT-WARNING: object's DIE doesn't have an abbreviation: " + << "skipping update. DIE at offset 0x" + << Twine::utohexstr(DIE->getOffset()) << '\n'; + return; + } + + auto AbbrevCode = AbbreviationDecl->getCode(); + + if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_ranges) != -1U) { + // Case 1: The object was already non-contiguous and had DW_AT_ranges. + // In this case we simply need to update the value of DW_AT_ranges. + DWARFFormValue FormValue; + uint32_t AttrOffset = -1U; + DIE->getAttributeValue(Unit, dwarf::DW_AT_ranges, FormValue, &AttrOffset); + DebugInfoPatcher->addLE32Patch(AttrOffset, DebugRangesOffset); + } else { + // Case 2: The object has both DW_AT_low_pc and DW_AT_high_pc. + // We require the compiler to put both attributes one after the other + // for our approach to work. low_pc and high_pc both occupy 8 bytes + // as we're dealing with a 64-bit ELF. We basically change low_pc to + // DW_AT_ranges and high_pc to DW_AT_producer. ranges spans only 4 bytes + // in 32-bit DWARF, which we assume to be used, which leaves us with 12 + // more bytes. We then set the value of DW_AT_producer as an arbitrary + // 12-byte string that fills the remaining space and leaves the rest of + // the abbreviation layout unchanged. 
+ if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_low_pc) != -1U && + AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_high_pc) != -1U) { + uint32_t LowPCOffset = -1U; + uint32_t HighPCOffset = -1U; + DWARFFormValue LowPCFormValue; + DWARFFormValue HighPCFormValue; + DIE->getAttributeValue(Unit, dwarf::DW_AT_low_pc, LowPCFormValue, + &LowPCOffset); + DIE->getAttributeValue(Unit, dwarf::DW_AT_high_pc, HighPCFormValue, + &HighPCOffset); + if (LowPCFormValue.getForm() != dwarf::DW_FORM_addr || + (HighPCFormValue.getForm() != dwarf::DW_FORM_addr && + HighPCFormValue.getForm() != dwarf::DW_FORM_data8 && + HighPCFormValue.getForm() != dwarf::DW_FORM_data4)) { + errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE " + "at offset 0x" << Twine::utohexstr(DIE->getOffset()) << '\n'; + return; + } + if (LowPCOffset == -1U || (LowPCOffset + 8 != HighPCOffset)) { + errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. " + "Cannot update DIE at offset 0x" + << Twine::utohexstr(DIE->getOffset()) << '\n'; + return; + } + + AbbrevPatcher->addAttributePatch(Unit, + AbbrevCode, + dwarf::DW_AT_low_pc, + dwarf::DW_AT_ranges, + dwarf::DW_FORM_sec_offset); + AbbrevPatcher->addAttributePatch(Unit, + AbbrevCode, + dwarf::DW_AT_high_pc, + dwarf::DW_AT_producer, + dwarf::DW_FORM_string); + unsigned StringSize = 0; + if (HighPCFormValue.getForm() == dwarf::DW_FORM_addr || + HighPCFormValue.getForm() == dwarf::DW_FORM_data8) { + StringSize = 12; + } else if (HighPCFormValue.getForm() == dwarf::DW_FORM_data4) { + StringSize = 8; + } else { + assert(0 && "unexpected form"); + } + + DebugInfoPatcher->addLE32Patch(LowPCOffset, DebugRangesOffset); + std::string ProducerString{"LLVM-BOLT"}; + ProducerString.resize(StringSize, ' '); + ProducerString.back() = '\0'; + DebugInfoPatcher->addBinaryPatch(LowPCOffset + 4, ProducerString); + } else { + errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x" + << Twine::utohexstr(DIE->getOffset()) << '\n'; + } + } 
+} + +void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { + for (auto &It : BinaryFunctions) { + const auto &Function = It.second; + + if (Function.isSimple()) + continue; + + auto ULT = Function.getDWARFUnitLineTable(); + auto Unit = ULT.first; + auto LineTable = ULT.second; + + if (!LineTable) + continue; // nothing to update for this function + + std::vector Results; + MCSectionELF *FunctionSection = + BC->Ctx->getELFSection(Function.getCodeSectionName(), + ELF::SHT_PROGBITS, + ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); + + uint64_t Address = It.first; + if (LineTable->lookupAddressRange(Address, Function.getMaxSize(), + Results)) { + auto &OutputLineTable = + BC->Ctx->getMCDwarfLineTable(Unit->getOffset()).getMCLineSections(); + for (auto RowIndex : Results) { + const auto &Row = LineTable->Rows[RowIndex]; + BC->Ctx->setCurrentDwarfLoc( + Row.File, + Row.Line, + Row.Column, + (DWARF2_FLAG_IS_STMT * Row.IsStmt) | + (DWARF2_FLAG_BASIC_BLOCK * Row.BasicBlock) | + (DWARF2_FLAG_PROLOGUE_END * Row.PrologueEnd) | + (DWARF2_FLAG_EPILOGUE_BEGIN * Row.EpilogueBegin), + Row.Isa, + Row.Discriminator, + Row.Address); + auto Loc = BC->Ctx->getCurrentDwarfLoc(); + BC->Ctx->clearDwarfLocSeen(); + OutputLineTable.addLineEntry(MCLineEntry{nullptr, Loc}, + FunctionSection); + } + // Add an empty entry past the end of the function + // for end_sequence mark. 
+ BC->Ctx->setCurrentDwarfLoc(0, 0, 0, 0, 0, 0, + Address + Function.getMaxSize()); + auto Loc = BC->Ctx->getCurrentDwarfLoc(); + BC->Ctx->clearDwarfLocSeen(); + OutputLineTable.addLineEntry(MCLineEntry{nullptr, Loc}, + FunctionSection); + } else { + DEBUG(errs() << "BOLT-DEBUG: Function " << Function.getName() + << " has no associated line number information.\n"); + } + } +} + +void RewriteInstance::updateAddressRangesObjects() { + for (auto &Obj : BC->AddressRangesObjects) { + for (const auto &Range : Obj.getAbsoluteAddressRanges()) { + RangesSectionsWriter.AddRange(&Obj, Range.first, + Range.second - Range.first); + } + } +} + +void RewriteInstance::updateLineTableOffsets() { + const auto LineSection = + BC->Ctx->getObjectFileInfo()->getDwarfLineSection(); + auto CurrentFragment = LineSection->begin(); + uint32_t CurrentOffset = 0; + uint32_t Offset = 0; + + // Line tables are stored in MCContext in ascending order of offset in the + // output file, thus we can compute all table's offset by passing through + // each fragment at most once, continuing from the last CU's beginning + // instead of from the first fragment. 
+ for (const auto &CUIDLineTablePair : BC->Ctx->getMCDwarfLineTables()) { + auto Label = CUIDLineTablePair.second.getLabel(); + if (!Label) + continue; + + auto CUOffset = CUIDLineTablePair.first; + if (CUOffset == -1U) + continue; + + auto *CU = BC->DwCtx->getCompileUnitForOffset(CUOffset); + assert(CU && "expected non-null CU"); + auto LTOffset = + BC->DwCtx->getAttrFieldOffsetForUnit(CU, dwarf::DW_AT_stmt_list); + if (!LTOffset) + continue; + + auto Fragment = Label->getFragment(); + while (&*CurrentFragment != Fragment) { + switch (CurrentFragment->getKind()) { + case MCFragment::FT_Dwarf: + Offset += cast(*CurrentFragment) + .getContents().size() - CurrentOffset; + break; + case MCFragment::FT_Data: + Offset += cast(*CurrentFragment) + .getContents().size() - CurrentOffset; + break; + default: + llvm_unreachable(".debug_line section shouldn't contain other types " + "of fragments."); + } + ++CurrentFragment; + CurrentOffset = 0; + } + + Offset += Label->getOffset() - CurrentOffset; + CurrentOffset = Label->getOffset(); + + auto &SI = SectionMM->NoteSectionInfo[".debug_info"]; + SI.PendingRelocs.emplace_back( + SectionInfo::Reloc{LTOffset, 4, 0, Offset}); + + DEBUG(dbgs() << "BOLT-DEBUG: CU " << CUIDLineTablePair.first + << " has line table at " << Offset << "\n"); + } +} + +void RewriteInstance::updateFunctionRanges() { + auto addDebugArangesEntry = [&](const BinaryFunction &Function, + uint64_t RangeBegin, + uint64_t RangeSize) { + // The function potentially has multiple associated CUs because of + // the identical code folding optimization. Update all of them with + // the range. + for (const auto DIECompileUnitPair : Function.getSubprogramDIEs()) { + auto CUOffset = DIECompileUnitPair.second->getOffset(); + if (CUOffset != -1U) + RangesSectionsWriter.AddRange(CUOffset, RangeBegin, RangeSize); + } + }; + + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + // If function doesn't have registered DIEs - there's nothting to update. 
+ if (Function.getSubprogramDIEs().empty()) + continue; + // Use either new (image) or original size for the function range. + auto Size = Function.isSimple() ? Function.getImageSize() + : Function.getSize(); + addDebugArangesEntry(Function, + Function.getAddress(), + Size); + RangesSectionsWriter.AddRange(&Function, Function.getAddress(), Size); + if (Function.isSimple() && Function.cold().getImageSize()) { + addDebugArangesEntry(Function, + Function.cold().getAddress(), + Function.cold().getImageSize()); + RangesSectionsWriter.AddRange(&Function, + Function.cold().getAddress(), + Function.cold().getImageSize()); + } + } +} + +void RewriteInstance::generateDebugRanges() { + using RangeType = enum { RANGES, ARANGES }; + for (int IntRT = RANGES; IntRT <= ARANGES; ++IntRT) { + RangeType RT = static_cast(IntRT); + const char *SectionName = (RT == RANGES) ? ".debug_ranges" + : ".debug_aranges"; + SmallVector RangesBuffer; + raw_svector_ostream OS(RangesBuffer); + + auto MAB = BC->TheTarget->createMCAsmBackend(*BC->MRI, BC->TripleName, ""); + auto Writer = MAB->createObjectWriter(OS); + + if (RT == RANGES) { + RangesSectionsWriter.WriteRangesSection(Writer); + } else { + RangesSectionsWriter.WriteArangesSection(Writer); + } + const auto &DebugRangesContents = OS.str(); + + // Free'd by SectionMM. + uint8_t *SectionData = new uint8_t[DebugRangesContents.size()]; + memcpy(SectionData, DebugRangesContents.data(), DebugRangesContents.size()); + + SectionMM->NoteSectionInfo[SectionName] = SectionInfo( + reinterpret_cast(SectionData), + DebugRangesContents.size(), + /*Alignment=*/0, + /*IsCode=*/false, + /*IsReadOnly=*/true); + } +} + +void RewriteInstance::updateLocationLists() { + // Write new contents to .debug_loc. 
+ SmallVector DebugLocBuffer; + raw_svector_ostream OS(DebugLocBuffer); + + auto MAB = BC->TheTarget->createMCAsmBackend(*BC->MRI, BC->TripleName, ""); + auto Writer = MAB->createObjectWriter(OS); + + DebugLocWriter LocationListsWriter; + + for (const auto &Loc : BC->LocationLists) { + LocationListsWriter.write(Loc, Writer); + } + + const auto &DebugLocContents = OS.str(); + + // Free'd by SectionMM. + uint8_t *SectionData = new uint8_t[DebugLocContents.size()]; + memcpy(SectionData, DebugLocContents.data(), DebugLocContents.size()); + + SectionMM->NoteSectionInfo[".debug_loc"] = SectionInfo( + reinterpret_cast(SectionData), + DebugLocContents.size(), + /*Alignment=*/0, + /*IsCode=*/false, + /*IsReadOnly=*/true); + + // For each CU, update pointers into .debug_loc. + for (const auto &CU : BC->DwCtx->compile_units()) { + updateLocationListPointers( + CU.get(), + CU->getUnitDIE(false), + LocationListsWriter.getUpdatedLocationListOffsets()); + } +} + +void RewriteInstance::updateLocationListPointers( + const DWARFUnit *Unit, + const DWARFDebugInfoEntryMinimal *DIE, + const std::map &UpdatedOffsets) { + // Stop if we're in a non-simple function, which will not be rewritten. + auto Tag = DIE->getTag(); + if (Tag == dwarf::DW_TAG_subprogram) { + uint64_t LowPC = -1ULL, HighPC = -1ULL; + DIE->getLowAndHighPC(Unit, LowPC, HighPC); + if (LowPC != -1ULL) { + auto It = BinaryFunctions.find(LowPC); + if (It != BinaryFunctions.end() && !It->second.isSimple()) + return; + } + } + // If the DIE has a DW_AT_location attribute with a section offset, update it. 
+ DWARFFormValue Value; + uint32_t AttrOffset; + if (DIE->getAttributeValue(Unit, dwarf::DW_AT_location, Value, &AttrOffset) && + (Value.isFormClass(DWARFFormValue::FC_Constant) || + Value.isFormClass(DWARFFormValue::FC_SectionOffset))) { + uint64_t DebugLocOffset = -1ULL; + if (Value.isFormClass(DWARFFormValue::FC_SectionOffset)) { + DebugLocOffset = Value.getAsSectionOffset().getValue(); + } else if (Value.isFormClass(DWARFFormValue::FC_Constant)) { // DWARF 3 + DebugLocOffset = Value.getAsUnsignedConstant().getValue(); + } + + auto It = UpdatedOffsets.find(DebugLocOffset); + if (It != UpdatedOffsets.end()) { + auto DebugInfoPatcher = + static_cast( + SectionPatchers[".debug_info"].get()); + DebugInfoPatcher->addLE32Patch(AttrOffset, It->second + DebugLocSize); + } + } + + // Recursively visit children. + for (auto Child = DIE->getFirstChild(); Child; Child = Child->getSibling()) { + updateLocationListPointers(Unit, Child, UpdatedOffsets); + } +} diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index dfd068a087dd..6bdf2aaaea4d 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -20,7 +20,6 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/DWARF/DWARFDebugLine.h" -#include "llvm/DebugInfo/DWARF/DWARFFormValue.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" #include "llvm/ExecutionEngine/RTDyldMemoryManager.h" @@ -44,14 +43,10 @@ #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/Dwarf.h" #include "llvm/Support/Errc.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/TimeValue.h" -#include "llvm/Support/Timer.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Target/TargetMachine.h" #include @@ 
-601,7 +596,8 @@ void RewriteInstance::run() { emitFunctions(); } - updateDebugInfo(); + if (opts::UpdateDebugSections) + updateDebugInfo(); // Copy allocatable part of the input. std::error_code EC; @@ -1321,7 +1317,8 @@ void RewriteInstance::emitFunctions() { emitFunction(*Streamer, Function, *BC.get(), /*EmitColdPart=*/true); } - updateDebugLineInfoForNonSimpleFunctions(); + if (opts::UpdateDebugSections) + updateDebugLineInfoForNonSimpleFunctions(); Streamer->Finish(); @@ -1511,154 +1508,6 @@ bool RewriteInstance::checkLargeFunctions() { return !LargeFunctions.empty(); } -void RewriteInstance::updateFunctionRanges() { - auto addDebugArangesEntry = [&](const BinaryFunction &Function, - uint64_t RangeBegin, - uint64_t RangeSize) { - // The function potentially has multiple associated CUs because of - // the identical code folding optimization. Update all of them with - // the range. - for (const auto DIECompileUnitPair : Function.getSubprogramDIEs()) { - auto CUOffset = DIECompileUnitPair.second->getOffset(); - if (CUOffset != -1U) - RangesSectionsWriter.AddRange(CUOffset, RangeBegin, RangeSize); - } - }; - - for (auto &BFI : BinaryFunctions) { - auto &Function = BFI.second; - // If function doesn't have registered DIEs - there's nothting to update. - if (Function.getSubprogramDIEs().empty()) - continue; - // Use either new (image) or original size for the function range. - auto Size = Function.isSimple() ? 
Function.getImageSize() - : Function.getSize(); - addDebugArangesEntry(Function, - Function.getAddress(), - Size); - RangesSectionsWriter.AddRange(&Function, Function.getAddress(), Size); - if (Function.isSimple() && Function.cold().getImageSize()) { - addDebugArangesEntry(Function, - Function.cold().getAddress(), - Function.cold().getImageSize()); - RangesSectionsWriter.AddRange(&Function, - Function.cold().getAddress(), - Function.cold().getImageSize()); - } - } -} - -void RewriteInstance::generateDebugRanges() { - using RangeType = enum { RANGES, ARANGES }; - for (int IntRT = RANGES; IntRT <= ARANGES; ++IntRT) { - RangeType RT = static_cast(IntRT); - const char *SectionName = (RT == RANGES) ? ".debug_ranges" - : ".debug_aranges"; - SmallVector RangesBuffer; - raw_svector_ostream OS(RangesBuffer); - - auto MAB = BC->TheTarget->createMCAsmBackend(*BC->MRI, BC->TripleName, ""); - auto Writer = MAB->createObjectWriter(OS); - - if (RT == RANGES) { - RangesSectionsWriter.WriteRangesSection(Writer); - } else { - RangesSectionsWriter.WriteArangesSection(Writer); - } - const auto &DebugRangesContents = OS.str(); - - // Free'd by SectionMM. - uint8_t *SectionData = new uint8_t[DebugRangesContents.size()]; - memcpy(SectionData, DebugRangesContents.data(), DebugRangesContents.size()); - - SectionMM->NoteSectionInfo[SectionName] = SectionInfo( - reinterpret_cast(SectionData), - DebugRangesContents.size(), - /*Alignment=*/0, - /*IsCode=*/false, - /*IsReadOnly=*/true); - } -} - -void RewriteInstance::updateLocationLists() { - // Write new contents to .debug_loc. - SmallVector DebugLocBuffer; - raw_svector_ostream OS(DebugLocBuffer); - - auto MAB = BC->TheTarget->createMCAsmBackend(*BC->MRI, BC->TripleName, ""); - auto Writer = MAB->createObjectWriter(OS); - - DebugLocWriter LocationListsWriter; - - for (const auto &Loc : BC->LocationLists) { - LocationListsWriter.write(Loc, Writer); - } - - const auto &DebugLocContents = OS.str(); - - // Free'd by SectionMM. 
- uint8_t *SectionData = new uint8_t[DebugLocContents.size()]; - memcpy(SectionData, DebugLocContents.data(), DebugLocContents.size()); - - SectionMM->NoteSectionInfo[".debug_loc"] = SectionInfo( - reinterpret_cast(SectionData), - DebugLocContents.size(), - /*Alignment=*/0, - /*IsCode=*/false, - /*IsReadOnly=*/true); - - // For each CU, update pointers into .debug_loc. - for (const auto &CU : BC->DwCtx->compile_units()) { - updateLocationListPointers( - CU.get(), - CU->getUnitDIE(false), - LocationListsWriter.getUpdatedLocationListOffsets()); - } -} - -void RewriteInstance::updateLocationListPointers( - const DWARFUnit *Unit, - const DWARFDebugInfoEntryMinimal *DIE, - const std::map &UpdatedOffsets) { - // Stop if we're in a non-simple function, which will not be rewritten. - auto Tag = DIE->getTag(); - if (Tag == dwarf::DW_TAG_subprogram) { - uint64_t LowPC = -1ULL, HighPC = -1ULL; - DIE->getLowAndHighPC(Unit, LowPC, HighPC); - if (LowPC != -1ULL) { - auto It = BinaryFunctions.find(LowPC); - if (It != BinaryFunctions.end() && !It->second.isSimple()) - return; - } - } - // If the DIE has a DW_AT_location attribute with a section offset, update it. - DWARFFormValue Value; - uint32_t AttrOffset; - if (DIE->getAttributeValue(Unit, dwarf::DW_AT_location, Value, &AttrOffset) && - (Value.isFormClass(DWARFFormValue::FC_Constant) || - Value.isFormClass(DWARFFormValue::FC_SectionOffset))) { - uint64_t DebugLocOffset = -1ULL; - if (Value.isFormClass(DWARFFormValue::FC_SectionOffset)) { - DebugLocOffset = Value.getAsSectionOffset().getValue(); - } else if (Value.isFormClass(DWARFFormValue::FC_Constant)) { // DWARF 3 - DebugLocOffset = Value.getAsUnsignedConstant().getValue(); - } - - auto It = UpdatedOffsets.find(DebugLocOffset); - if (It != UpdatedOffsets.end()) { - auto DebugInfoPatcher = - static_cast( - SectionPatchers[".debug_info"].get()); - DebugInfoPatcher->addLE32Patch(AttrOffset, It->second + DebugLocSize); - } - } - - // Recursively visit children. 
- for (auto Child = DIE->getFirstChild(); Child; Child = Child->getSibling()) { - updateLocationListPointers(Unit, Child, UpdatedOffsets); - } -} - void RewriteInstance::patchELFPHDRTable() { auto ELF64LEFile = dyn_cast(InputFile); if (!ELF64LEFile) { @@ -2136,314 +1985,6 @@ void RewriteInstance::rewriteFile() { Out->keep(); } -void RewriteInstance::updateAddressRangesObjects() { - for (auto &Obj : BC->AddressRangesObjects) { - for (const auto &Range : Obj.getAbsoluteAddressRanges()) { - RangesSectionsWriter.AddRange(&Obj, Range.first, - Range.second - Range.first); - } - } -} - -void RewriteInstance::updateLineTableOffsets() { - const auto LineSection = - BC->Ctx->getObjectFileInfo()->getDwarfLineSection(); - auto CurrentFragment = LineSection->begin(); - uint32_t CurrentOffset = 0; - uint32_t Offset = 0; - - // Line tables are stored in MCContext in ascending order of offset in the - // output file, thus we can compute all table's offset by passing through - // each fragment at most once, continuing from the last CU's beginning - // instead of from the first fragment. 
- for (const auto &CUIDLineTablePair : BC->Ctx->getMCDwarfLineTables()) { - auto Label = CUIDLineTablePair.second.getLabel(); - if (!Label) - continue; - - auto CUOffset = CUIDLineTablePair.first; - if (CUOffset == -1U) - continue; - - auto *CU = BC->DwCtx->getCompileUnitForOffset(CUOffset); - assert(CU && "expected non-null CU"); - auto LTOffset = - BC->DwCtx->getAttrFieldOffsetForUnit(CU, dwarf::DW_AT_stmt_list); - if (!LTOffset) - continue; - - auto Fragment = Label->getFragment(); - while (&*CurrentFragment != Fragment) { - switch (CurrentFragment->getKind()) { - case MCFragment::FT_Dwarf: - Offset += cast(*CurrentFragment) - .getContents().size() - CurrentOffset; - break; - case MCFragment::FT_Data: - Offset += cast(*CurrentFragment) - .getContents().size() - CurrentOffset; - break; - default: - llvm_unreachable(".debug_line section shouldn't contain other types " - "of fragments."); - } - ++CurrentFragment; - CurrentOffset = 0; - } - - Offset += Label->getOffset() - CurrentOffset; - CurrentOffset = Label->getOffset(); - - auto &SI = SectionMM->NoteSectionInfo[".debug_info"]; - SI.PendingRelocs.emplace_back( - SectionInfo::Reloc{LTOffset, 4, 0, Offset}); - - DEBUG(dbgs() << "BOLT-DEBUG: CU " << CUIDLineTablePair.first - << " has line table at " << Offset << "\n"); - } -} - -void RewriteInstance::updateDebugInfo() { - if (!opts::UpdateDebugSections) - return; - - SectionPatchers[".debug_abbrev"] = llvm::make_unique(); - SectionPatchers[".debug_info"] = llvm::make_unique(); - - updateFunctionRanges(); - - updateAddressRangesObjects(); - - updateEmptyModuleRanges(); - - generateDebugRanges(); - - updateLocationLists(); - - updateDWARFAddressRanges(); -} - -void RewriteInstance::updateEmptyModuleRanges() { - const auto &CUAddressRanges = RangesSectionsWriter.getCUAddressRanges(); - for (const auto &CU : BC->DwCtx->compile_units()) { - if (CUAddressRanges.find(CU->getOffset()) != CUAddressRanges.end()) - continue; - auto const &Ranges = 
CU->getUnitDIE(true)->getAddressRanges(CU.get()); - for (auto const &Range : Ranges) { - RangesSectionsWriter.AddRange(CU->getOffset(), - Range.first, - Range.second - Range.first); - } - } -} - -void RewriteInstance::updateDWARFAddressRanges() { - // Update DW_AT_ranges for all compilation units. - for (const auto &CU : BC->DwCtx->compile_units()) { - const auto CUID = CU->getOffset(); - const auto RSOI = RangesSectionsWriter.getRangesOffsetCUMap().find(CUID); - if (RSOI == RangesSectionsWriter.getRangesOffsetCUMap().end()) - continue; - updateDWARFObjectAddressRanges(RSOI->second, CU.get(), CU->getUnitDIE()); - } - - // Update address ranges of functions. - for (const auto &BFI : BinaryFunctions) { - const auto &Function = BFI.second; - for (const auto DIECompileUnitPair : Function.getSubprogramDIEs()) { - updateDWARFObjectAddressRanges( - Function.getAddressRangesOffset(), - DIECompileUnitPair.second, - DIECompileUnitPair.first); - } - } - - // Update address ranges of DIEs with addresses that don't match functions. - for (auto &DIECompileUnitPair : BC->UnknownFunctions) { - updateDWARFObjectAddressRanges( - RangesSectionsWriter.getEmptyRangesListOffset(), - DIECompileUnitPair.second, - DIECompileUnitPair.first); - } - - // Update address ranges of DWARF block objects (lexical/try/catch blocks, - // inlined subroutine instances, etc). - for (const auto &Obj : BC->AddressRangesObjects) { - updateDWARFObjectAddressRanges( - Obj.getAddressRangesOffset(), - Obj.getCompileUnit(), - Obj.getDIE()); - } -} - -void RewriteInstance::updateDWARFObjectAddressRanges( - uint32_t DebugRangesOffset, - const DWARFUnit *Unit, - const DWARFDebugInfoEntryMinimal *DIE) { - - // Some objects don't have an associated DIE and cannot be updated (such as - // compiler-generated functions). 
- if (!DIE) { - return; - } - - if (DebugRangesOffset == -1U) { - errs() << "BOLT-WARNING: using invalid DW_AT_range for DIE at offset 0x" - << Twine::utohexstr(DIE->getOffset()) << '\n'; - } - - auto DebugInfoPatcher = - static_cast(SectionPatchers[".debug_info"].get()); - auto AbbrevPatcher = - static_cast(SectionPatchers[".debug_abbrev"].get()); - - assert(DebugInfoPatcher && AbbrevPatcher && "Patchers not initialized."); - - const auto *AbbreviationDecl = DIE->getAbbreviationDeclarationPtr(); - if (!AbbreviationDecl) { - errs() << "BOLT-WARNING: object's DIE doesn't have an abbreviation: " - << "skipping update. DIE at offset 0x" - << Twine::utohexstr(DIE->getOffset()) << '\n'; - return; - } - - auto AbbrevCode = AbbreviationDecl->getCode(); - - if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_ranges) != -1U) { - // Case 1: The object was already non-contiguous and had DW_AT_ranges. - // In this case we simply need to update the value of DW_AT_ranges. - DWARFFormValue FormValue; - uint32_t AttrOffset = -1U; - DIE->getAttributeValue(Unit, dwarf::DW_AT_ranges, FormValue, &AttrOffset); - DebugInfoPatcher->addLE32Patch(AttrOffset, DebugRangesOffset); - } else { - // Case 2: The object has both DW_AT_low_pc and DW_AT_high_pc. - // We require the compiler to put both attributes one after the other - // for our approach to work. low_pc and high_pc both occupy 8 bytes - // as we're dealing with a 64-bit ELF. We basically change low_pc to - // DW_AT_ranges and high_pc to DW_AT_producer. ranges spans only 4 bytes - // in 32-bit DWARF, which we assume to be used, which leaves us with 12 - // more bytes. We then set the value of DW_AT_producer as an arbitrary - // 12-byte string that fills the remaining space and leaves the rest of - // the abbreviation layout unchanged. 
- if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_low_pc) != -1U && - AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_high_pc) != -1U) { - uint32_t LowPCOffset = -1U; - uint32_t HighPCOffset = -1U; - DWARFFormValue LowPCFormValue; - DWARFFormValue HighPCFormValue; - DIE->getAttributeValue(Unit, dwarf::DW_AT_low_pc, LowPCFormValue, - &LowPCOffset); - DIE->getAttributeValue(Unit, dwarf::DW_AT_high_pc, HighPCFormValue, - &HighPCOffset); - if (LowPCFormValue.getForm() != dwarf::DW_FORM_addr || - (HighPCFormValue.getForm() != dwarf::DW_FORM_addr && - HighPCFormValue.getForm() != dwarf::DW_FORM_data8 && - HighPCFormValue.getForm() != dwarf::DW_FORM_data4)) { - errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE " - "at offset 0x" << Twine::utohexstr(DIE->getOffset()) << '\n'; - return; - } - if (LowPCOffset == -1U || (LowPCOffset + 8 != HighPCOffset)) { - errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. " - "Cannot update DIE at offset 0x" - << Twine::utohexstr(DIE->getOffset()) << '\n'; - return; - } - - AbbrevPatcher->addAttributePatch(Unit, - AbbrevCode, - dwarf::DW_AT_low_pc, - dwarf::DW_AT_ranges, - dwarf::DW_FORM_sec_offset); - AbbrevPatcher->addAttributePatch(Unit, - AbbrevCode, - dwarf::DW_AT_high_pc, - dwarf::DW_AT_producer, - dwarf::DW_FORM_string); - unsigned StringSize = 0; - if (HighPCFormValue.getForm() == dwarf::DW_FORM_addr || - HighPCFormValue.getForm() == dwarf::DW_FORM_data8) { - StringSize = 12; - } else if (HighPCFormValue.getForm() == dwarf::DW_FORM_data4) { - StringSize = 8; - } else { - assert(0 && "unexpected form"); - } - - DebugInfoPatcher->addLE32Patch(LowPCOffset, DebugRangesOffset); - std::string ProducerString{"LLVM-BOLT"}; - ProducerString.resize(StringSize, ' '); - ProducerString.back() = '\0'; - DebugInfoPatcher->addBinaryPatch(LowPCOffset + 4, ProducerString); - } else { - errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x" - << Twine::utohexstr(DIE->getOffset()) << '\n'; - } - } 
-} - -void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { - if (!opts::UpdateDebugSections) - return; - - for (auto &It : BinaryFunctions) { - const auto &Function = It.second; - - if (Function.isSimple()) - continue; - - auto ULT = Function.getDWARFUnitLineTable(); - auto Unit = ULT.first; - auto LineTable = ULT.second; - - if (!LineTable) - continue; // nothing to update for this function - - std::vector Results; - MCSectionELF *FunctionSection = - BC->Ctx->getELFSection(Function.getCodeSectionName(), - ELF::SHT_PROGBITS, - ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); - - uint64_t Address = It.first; - if (LineTable->lookupAddressRange(Address, Function.getMaxSize(), - Results)) { - auto &OutputLineTable = - BC->Ctx->getMCDwarfLineTable(Unit->getOffset()).getMCLineSections(); - for (auto RowIndex : Results) { - const auto &Row = LineTable->Rows[RowIndex]; - BC->Ctx->setCurrentDwarfLoc( - Row.File, - Row.Line, - Row.Column, - (DWARF2_FLAG_IS_STMT * Row.IsStmt) | - (DWARF2_FLAG_BASIC_BLOCK * Row.BasicBlock) | - (DWARF2_FLAG_PROLOGUE_END * Row.PrologueEnd) | - (DWARF2_FLAG_EPILOGUE_BEGIN * Row.EpilogueBegin), - Row.Isa, - Row.Discriminator, - Row.Address); - auto Loc = BC->Ctx->getCurrentDwarfLoc(); - BC->Ctx->clearDwarfLocSeen(); - OutputLineTable.addLineEntry(MCLineEntry{nullptr, Loc}, - FunctionSection); - } - // Add an empty entry past the end of the function - // for end_sequence mark. 
- BC->Ctx->setCurrentDwarfLoc(0, 0, 0, 0, 0, 0, - Address + Function.getMaxSize()); - auto Loc = BC->Ctx->getCurrentDwarfLoc(); - BC->Ctx->clearDwarfLocSeen(); - OutputLineTable.addLineEntry(MCLineEntry{nullptr, Loc}, - FunctionSection); - } else { - DEBUG(errs() << "BOLT-DEBUG: Function " << Function.getName() - << " has no associated line number information.\n"); - } - } -} - bool RewriteInstance::shouldOverwriteSection(StringRef SectionName) { if (opts::UpdateDebugSections) { for (auto &OverwriteName : DebugSectionsToOverwrite) { From 3f52d26c6844e8f3bf75849d8f3de36b6d4c1e63 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 21 Jul 2016 12:45:35 -0700 Subject: [PATCH 139/904] Add BinaryContext::getSectionForAddress() Summary: Interface for accessing section from BinaryContext. (cherry picked from commit 688f9bb635a40d6f85ab89f729ac71623ba3e70b) --- bolt/BinaryContext.cpp | 10 ++++++++++ bolt/BinaryContext.h | 16 +++++++++++++--- bolt/RewriteInstance.cpp | 5 +++++ 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index cbfc30695901..c83fc3abc1e9 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -232,5 +232,15 @@ void BinaryContext::preprocessFunctionDebugInfo( } } +ErrorOr BinaryContext::getSectionForAddress(uint64_t Address) const{ + auto SI = AllocatableSections.upper_bound(Address); + if (SI != AllocatableSections.begin()) { + --SI; + if (SI->first + SI->second.getSize() > Address) + return SI->second; + } + return std::make_error_code(std::errc::bad_address); +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index c7f756727453..aaf22eb41ab7 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -30,6 +30,8 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/ErrorOr.h" #include 
"llvm/Support/TargetRegistry.h" #include #include @@ -42,6 +44,8 @@ namespace llvm { class DWARFDebugInfoEntryMinimal; +using namespace object; + namespace bolt { class BinaryFunction; @@ -53,14 +57,17 @@ class BinaryContext { public: - // [name] -> [address] map used for global symbol resolution. + /// [name] -> [address] map used for global symbol resolution. typedef std::map SymbolMapType; SymbolMapType GlobalSymbols; - // [address] -> [name1], [name2], ... + /// [address] -> [name1], [name2], ... std::multimap GlobalAddresses; - // Set of addresses we cannot relocate because we have a direct branch to it. + /// Map virtual address to a section. + std::map AllocatableSections; + + /// Set of addresses we cannot relocate because we have a direct branch to it. std::set InterproceduralBranchTargets; /// List of DWARF location lists in .debug_loc. @@ -150,6 +157,9 @@ class BinaryContext { /// return the first one. MCSymbol *getOrCreateGlobalSymbol(uint64_t Address, Twine Prefix); + /// Return (allocatable) section containing the given \p Address. + ErrorOr getSectionForAddress(uint64_t Address) const; + /// Register a symbol with \p Name at a given \p Address. void registerNameAtAddress(const std::string &Name, uint64_t Address) { // Add the name to global symbols map. diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 6bdf2aaaea4d..072e50d77bf3 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -815,6 +815,11 @@ void RewriteInstance::readSpecialSections() { } else if (SectionName == ".debug_loc") { DebugLocSize = Section.getSize(); } + + if (Section.isText() || Section.isData() || Section.isBSS()) { + BC->AllocatableSections.emplace(std::make_pair(Section.getAddress(), + Section)); + } } FrameHdrCopy = From ecd1adff80f04e2696799b161f1ffb54e96466d7 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 21 Jul 2016 16:40:06 -0700 Subject: [PATCH 140/904] Add movabs -> mov shortening optimization. 
Add peephole optimization pass that does instruction shortening. Summary: Shorten when a mov instruction has a 64-bit immediate that can be repesented as a sign extended 32-bit number, use the smaller mov instruction (MOV64ri -> MOV64ri32). Add peephole optimization pass that does instruction shortening. (cherry picked from commit d8753336883bb5c8a5a3c3943183ee6a8f564781) --- bolt/BinaryPassManager.cpp | 8 ++++++++ bolt/BinaryPasses.cpp | 29 +++++++++++++++++++++++++++++ bolt/BinaryPasses.h | 9 +++++++++ bolt/RewriteInstance.cpp | 5 +++++ 4 files changed, 51 insertions(+) diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index da78727fab39..cd54382703e4 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -36,6 +36,12 @@ SimplifyConditionalTailCalls("simplify-conditional-tail-calls", "by removing unnecessary jumps"), llvm::cl::Optional); +static llvm::cl::opt +Peepholes("peepholes", + llvm::cl::desc("run peephole optimizations"), + llvm::cl::init(true), + llvm::cl::Optional); + } // namespace opts namespace llvm { @@ -82,6 +88,8 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(std::move(llvm::make_unique())); + Manager.registerPass(llvm::make_unique(), opts::Peepholes); + Manager.runPasses(); } diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index b525972d89e9..c525ca9c4358 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -21,6 +21,7 @@ extern llvm::cl::opt DumpDotAll; extern llvm::cl::opt PrintReordered; extern llvm::cl::opt PrintEHRanges; extern llvm::cl::opt PrintUCE; +extern llvm::cl::opt PrintPeepholes; extern llvm::cl::opt SplitFunctions; extern bool shouldProcess(const llvm::bolt::BinaryFunction &Function); @@ -538,5 +539,33 @@ void SimplifyConditionalTailCalls::runOnFunctions( << " from a total of " << NumTailCallCandidates << "\n"; } +void Peepholes::shortenInstructions(BinaryContext &BC, + BinaryFunction &Function) { + for (auto &BB : Function) { + for (auto &Inst : 
BB) { + BC.MIA->shortenInstruction(Inst); + } + } +} + +void Peepholes::runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) { + for (auto &It : BFs) { + auto &Function = It.second; + if (Function.isSimple() && opts::shouldProcess(Function)) { + shortenInstructions(BC, Function); + + if (opts::PrintAll || opts::PrintPeepholes) { + Function.print(errs(), "after peepholes", true); + } + + if (opts::DumpDotAll) { + Function.dumpGraphForPass("peepholes"); + } + } + } +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryPasses.h b/bolt/BinaryPasses.h index fd224f2bf200..6d1390442b53 100644 --- a/bolt/BinaryPasses.h +++ b/bolt/BinaryPasses.h @@ -148,6 +148,15 @@ class SimplifyConditionalTailCalls : public BinaryFunctionPass { std::set &LargeFunctions) override; }; +/// Perform simple peephole optimizations. +class Peepholes : public BinaryFunctionPass { + void shortenInstructions(BinaryContext &BC, BinaryFunction &Function); + public: + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + } // namespace bolt } // namespace llvm diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 072e50d77bf3..219e5dcd5dab 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -157,6 +157,11 @@ PrintUCE("print-uce", cl::desc("print functions after unreachable code elimination"), cl::Hidden); +cl::opt +PrintPeepholes("print-peepholes", + cl::desc("print functions after peephole optimization"), + cl::Hidden); + static cl::opt PrintDisasm("print-disasm", cl::desc("print function after disassembly"), cl::Hidden); From a7ccf72f331594bf92ba63100c52f8b9c7f75d57 Mon Sep 17 00:00:00 2001 From: Theodoros Kasampalis Date: Thu, 26 May 2016 10:58:01 -0700 Subject: [PATCH 141/904] Loop detection for BOLT's CFG. Summary: Loop detection for the CFG data structure. Added a GraphTraits specialization for BOLT's CFG that allows us to use LLVM's loop detection interface. 
(cherry picked from commit 68ef94430f59c101cd0db4213609cf33acd77006) --- bolt/BinaryBasicBlock.h | 69 ++++++++++++++++++++++ bolt/BinaryFunction.cpp | 122 +++++++++++++++++++++++++++++++++++++++ bolt/BinaryFunction.h | 71 ++++++++++++++++++++++- bolt/BinaryLoop.h | 94 ++++++++++++++++++++++++++++++ bolt/RewriteInstance.cpp | 9 +++ 5 files changed, 364 insertions(+), 1 deletion(-) create mode 100644 bolt/BinaryLoop.h diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index b564078faca3..6fa5ed2b40a5 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -16,6 +16,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/ilist.h" +#include "llvm/ADT/GraphTraits.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" @@ -398,6 +399,14 @@ class BinaryBasicBlock { MCInst *&CondBranch, MCInst *&UncondBranch); + /// Printer required for printing dominator trees. + void printAsOperand(raw_ostream &OS, bool PrintType = true) { + if (PrintType) { + OS << "basic block "; + } + OS << getName(); + } + private: /// Adds predecessor to the BB. Most likely you don't need to call this. 
@@ -412,6 +421,66 @@ bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS); } // namespace bolt + + +// GraphTraits specializations for basic block graphs (CFGs) +template <> struct GraphTraits { + typedef bolt::BinaryBasicBlock NodeType; + typedef bolt::BinaryBasicBlock::succ_iterator ChildIteratorType; + + static NodeType *getEntryNode(bolt::BinaryBasicBlock *BB) { return BB; } + static inline ChildIteratorType child_begin(NodeType *N) { + return N->succ_begin(); + } + static inline ChildIteratorType child_end(NodeType *N) { + return N->succ_end(); + } +}; + +template <> struct GraphTraits { + typedef const bolt::BinaryBasicBlock NodeType; + typedef bolt::BinaryBasicBlock::const_succ_iterator ChildIteratorType; + + static NodeType *getEntryNode(const bolt::BinaryBasicBlock *BB) { + return BB; + } + static inline ChildIteratorType child_begin(NodeType *N) { + return N->succ_begin(); + } + static inline ChildIteratorType child_end(NodeType *N) { + return N->succ_end(); + } +}; + +template <> struct GraphTraits> { + typedef bolt::BinaryBasicBlock NodeType; + typedef bolt::BinaryBasicBlock::pred_iterator ChildIteratorType; + static NodeType *getEntryNode(Inverse G) { + return G.Graph; + } + static inline ChildIteratorType child_begin(NodeType *N) { + return N->pred_begin(); + } + static inline ChildIteratorType child_end(NodeType *N) { + return N->pred_end(); + } +}; + +template <> struct GraphTraits> { + typedef const bolt::BinaryBasicBlock NodeType; + typedef bolt::BinaryBasicBlock::const_pred_iterator ChildIteratorType; + static NodeType *getEntryNode(Inverse G) { + return G.Graph; + } + static inline ChildIteratorType child_begin(NodeType *N) { + return N->pred_begin(); + } + static inline ChildIteratorType child_end(NodeType *N) { + return N->pred_end(); + } +}; + + } // namespace llvm #endif diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index c224bbcd2eeb..98d06d816716 100644 --- a/bolt/BinaryFunction.cpp +++ 
b/bolt/BinaryFunction.cpp @@ -33,6 +33,7 @@ #undef DEBUG_TYPE #define DEBUG_TYPE "bolt" + namespace llvm { namespace bolt { @@ -1632,5 +1633,126 @@ BinaryFunction::~BinaryFunction() { } } +void BinaryFunction::calculateLoopInfo() { + // Discover loops. + BinaryDominatorTree DomTree(false); + DomTree.recalculate(*this); + BLI.reset(new BinaryLoopInfo()); + BLI->analyze(DomTree); + + // Traverse discovered loops and add depth and profile information. + std::stack St; + for (auto I = BLI->begin(), E = BLI->end(); I != E; ++I) { + St.push(*I); + ++BLI->OuterLoops; + } + + while (!St.empty()) { + BinaryLoop *L = St.top(); + St.pop(); + ++BLI->TotalLoops; + BLI->MaximumDepth = std::max(L->getLoopDepth(), BLI->MaximumDepth); + + // Add nested loops in the stack. + for (BinaryLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) { + St.push(*I); + } + + // Skip if no valid profile is found. + if (!hasValidProfile()) { + L->EntryCount = COUNT_NO_PROFILE; + L->ExitCount = COUNT_NO_PROFILE; + L->TotalBackEdgeCount = COUNT_NO_PROFILE; + continue; + } + + // Compute back edge count. + SmallVector Latches; + L->getLoopLatches(Latches); + + for (BinaryBasicBlock *Latch : Latches) { + auto BI = Latch->BranchInfo.begin(); + for (BinaryBasicBlock *Succ : Latch->successors()) { + if (Succ == L->getHeader()) { + assert(BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE && + "profile data not found"); + L->TotalBackEdgeCount += BI->Count; + } + ++BI; + } + } + + // Compute entry count. + L->EntryCount = L->getHeader()->getExecutionCount() - L->TotalBackEdgeCount; + + // Compute exit count. 
+ SmallVector ExitEdges; + L->getExitEdges(ExitEdges); + for (BinaryLoop::Edge &Exit : ExitEdges) { + const BinaryBasicBlock *Exiting = Exit.first; + const BinaryBasicBlock *ExitTarget = Exit.second; + auto BI = Exiting->BranchInfo.begin(); + for (BinaryBasicBlock *Succ : Exiting->successors()) { + if (Succ == ExitTarget) { + assert(BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE && + "profile data not found"); + L->ExitCount += BI->Count; + } + ++BI; + } + } + } +} + +void BinaryFunction::printLoopInfo(raw_ostream &OS) const { + OS << "Loop Info for Function \"" << getName() << "\""; + if (hasValidProfile()) { + OS << " (count: " << getExecutionCount() << ")"; + } + OS << "\n"; + + std::stack St; + for (auto I = BLI->begin(), E = BLI->end(); I != E; ++I) { + St.push(*I); + } + while (!St.empty()) { + BinaryLoop *L = St.top(); + St.pop(); + + for (BinaryLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) { + St.push(*I); + } + + if (!hasValidProfile()) + continue; + + OS << (L->getLoopDepth() > 1 ? 
"Nested" : "Outer") << " loop header: " + << L->getHeader()->getName(); + OS << "\n"; + OS << "Loop basic blocks: "; + auto Sep = ""; + for (auto BI = L->block_begin(), BE = L->block_end(); BI != BE; ++BI) { + OS << Sep << (*BI)->getName(); + Sep = ", "; + } + OS << "\n"; + if (hasValidProfile()) { + OS << "Total back edge count: " << L->TotalBackEdgeCount << "\n"; + OS << "Loop entry count: " << L->EntryCount << "\n"; + OS << "Loop exit count: " << L->ExitCount << "\n"; + if (L->EntryCount > 0) { + OS << "Average iters per entry: " + << format("%.4lf", (double)L->TotalBackEdgeCount / L->EntryCount) + << "\n"; + } + } + OS << "----\n"; + } + + OS << "Total number of loops: "<< BLI->TotalLoops << "\n"; + OS << "Number of outer loops: " << BLI->OuterLoops << "\n"; + OS << "Maximum nested loop depth: " << BLI->MaximumDepth << "\n\n"; +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 446f791b4ccf..6bea0a9b8f9a 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -19,6 +19,7 @@ #include "BinaryBasicBlock.h" #include "BinaryContext.h" +#include "BinaryLoop.h" #include "DataReader.h" #include "DebugData.h" #include "llvm/ADT/StringRef.h" @@ -133,6 +134,8 @@ class BinaryFunction : public AddressRangesOwner { BinaryContext &BC; + std::unique_ptr BLI; + /// False if the function is too complex to reconstruct its control /// flow graph and re-assemble. bool IsSimple{true}; @@ -387,6 +390,18 @@ class BinaryFunction : public AddressRangesOwner { /// end of basic blocks. void modifyLayout(LayoutType Type, bool Split); + /// Find the loops in the CFG of the function and store infromation about + /// them. + void calculateLoopInfo(); + + /// Returns if loop detection has been run for this function. + bool hasLoopInfo() const { + return BLI != nullptr; + } + + /// Print loop inforamtion about the function. 
+ void printLoopInfo(raw_ostream &OS) const; + /// View CFG in graphviz program void viewGraph() const; @@ -605,7 +620,7 @@ class BinaryFunction : public AddressRangesOwner { } /// Return true if function profile is present and accurate. - bool hasValidProfile() { + bool hasValidProfile() const { return ExecutionCount != COUNT_NO_PROFILE && ProfileMatchRatio == 1.0f; } @@ -934,6 +949,60 @@ inline raw_ostream &operator<<(raw_ostream &OS, } } // namespace bolt + + +// GraphTraits specializations for function basic block graphs (CFGs) +template <> struct GraphTraits : + public GraphTraits { + static NodeType *getEntryNode(bolt::BinaryFunction *F) { + return *F->layout_begin(); + } + + typedef bolt::BinaryBasicBlock * nodes_iterator; + static nodes_iterator nodes_begin(bolt::BinaryFunction *F) { + return &(*F->begin()); + } + static nodes_iterator nodes_end(bolt::BinaryFunction *F) { + return &(*F->end()); + } + static size_t size(bolt::BinaryFunction *F) { + return F->size(); + } +}; + +template <> struct GraphTraits : + public GraphTraits { + static NodeType *getEntryNode(const bolt::BinaryFunction *F) { + return *F->layout_begin(); + } + + typedef const bolt::BinaryBasicBlock * nodes_iterator; + static nodes_iterator nodes_begin(const bolt::BinaryFunction *F) { + return &(*F->begin()); + } + static nodes_iterator nodes_end(const bolt::BinaryFunction *F) { + return &(*F->end()); + } + static size_t size(const bolt::BinaryFunction *F) { + return F->size(); + } +}; + +template <> struct GraphTraits> : + public GraphTraits> { + static NodeType *getEntryNode(Inverse G) { + return *G.Graph->layout_begin(); + } +}; + +template <> struct GraphTraits> : + public GraphTraits> { + static NodeType *getEntryNode(Inverse G) { + return *G.Graph->layout_begin(); + } +}; + + } // namespace llvm #endif diff --git a/bolt/BinaryLoop.h b/bolt/BinaryLoop.h new file mode 100644 index 000000000000..dc396d54b0b0 --- /dev/null +++ b/bolt/BinaryLoop.h @@ -0,0 +1,94 @@ +//===--- BinaryLoop.h - 
Interface for machine-level loop ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the BinaryLoop class, which represents a loop in the +// CFG of a binary function, and the BinaryLoopInfo class, which stores +// information about all the loops of a binary function. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_LOOP_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_LOOP_H + +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Analysis/LoopInfoImpl.h" +#include "llvm/Support/GenericDomTreeConstruction.h" + +namespace llvm { +namespace bolt { + +class BinaryBasicBlock; + +typedef DomTreeNodeBase BinaryDomTreeNode; +typedef DominatorTreeBase BinaryDominatorTree; + +class BinaryLoop : public LoopBase { +public: + BinaryLoop() : LoopBase() { } + + // The total count of all the back edges of this loop. + uint64_t TotalBackEdgeCount{0}; + + // The times the loop is entered from outside. + uint64_t EntryCount{0}; + + // The times the loop is exited. + uint64_t ExitCount{0}; + + // Most of the public interface is provided by LoopBase. + +protected: + friend class LoopInfoBase; + explicit BinaryLoop(BinaryBasicBlock *BB) : + LoopBase(BB) { } +}; + +class BinaryLoopInfo : public LoopInfoBase { +public: + BinaryLoopInfo() { } + + unsigned OuterLoops{0}; + unsigned TotalLoops{0}; + unsigned MaximumDepth{0}; + + // Most of the public interface is provided by LoopInfoBase. +}; + +} // namespace bolt +} // namespace llvm + +namespace llvm { + +// BinaryDominatorTree GraphTraits specializations. 
+template <> struct GraphTraits + : public DomTreeGraphTraitsBase {}; + +template <> struct GraphTraits + : public DomTreeGraphTraitsBase {}; + +template <> struct GraphTraits + : public GraphTraits { + static NodeType *getEntryNode(bolt::BinaryDominatorTree *DT) { + return DT->getRootNode(); + } + + static nodes_iterator nodes_begin(bolt::BinaryDominatorTree *N) { + return df_begin(getEntryNode(N)); + } + + static nodes_iterator nodes_end(bolt::BinaryDominatorTree *N) { + return df_end(getEntryNode(N)); + } +}; + +} // namescpae llvm + +#endif diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 219e5dcd5dab..e4840466eaa9 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -152,6 +152,10 @@ static cl::opt PrintCFG("print-cfg", cl::desc("print functions after CFG construction"), cl::Hidden); +static cl::opt +PrintLoopInfo("print-loops", cl::desc("print loop related information"), + cl::Hidden); + cl::opt PrintUCE("print-uce", cl::desc("print functions after unreachable code elimination"), @@ -963,6 +967,11 @@ void RewriteInstance::disassembleFunctions() { if (opts::DumpDotAll) Function.dumpGraphForPass("build-cfg"); + if (opts::PrintLoopInfo) { + Function.calculateLoopInfo(); + Function.printLoopInfo(errs()); + } + TotalScore += Function.getFunctionScore(); } // Iterate over all functions From 5652c49b3f244ed8418eb35f4c5733aa22f0591f Mon Sep 17 00:00:00 2001 From: Theodoros Kasampalis Date: Fri, 3 Jun 2016 00:58:11 -0700 Subject: [PATCH 142/904] Simplification of loads from read-only data sections. Summary: Instructions that load data from the a read-only data section and their target address can be computed statically (e.g. RIP-relative addressing) are modified to corresponding instructions that use immediate operands. We apply the transformation only when the resulting instruction will have smaller or equal size. 
(cherry picked from commit 31f57ed6f495343c01705622df73de4f23670a49) --- bolt/BinaryContext.cpp | 8 +++ bolt/BinaryContext.h | 6 ++ bolt/BinaryFunction.cpp | 9 +-- bolt/BinaryPassManager.cpp | 11 ++++ bolt/BinaryPasses.cpp | 115 +++++++++++++++++++++++++++++++++++++ bolt/BinaryPasses.h | 25 ++++++++ bolt/RewriteInstance.cpp | 6 ++ 7 files changed, 172 insertions(+), 8 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index c83fc3abc1e9..e8d033422396 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -242,5 +242,13 @@ ErrorOr BinaryContext::getSectionForAddress(uint64_t Address) const{ return std::make_error_code(std::errc::bad_address); } +uint64_t BinaryContext::getInstructionSize(const MCInst &Instr) const { + SmallString<256> Code; + SmallVector Fixups; + raw_svector_ostream VecOS(Code); + MCE->encodeInstruction(Instr, VecOS, Fixups, *STI); + return Code.size(); +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index aaf22eb41ab7..28ea0b1be021 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -177,6 +177,12 @@ class BinaryContext { /// disassembled functions. void preprocessFunctionDebugInfo( std::map &BinaryFunctions); + + /// Calculate the size of the given instruction. + /// Note: this can be imprecise wrt the final binary since happening prior to + /// relaxation, as well as wrt the original binary because of opcode + /// shortening. + uint64_t getInstructionSize(const MCInst &Instr) const; }; } // namespace bolt diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 98d06d816716..a53500bc9a28 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -327,14 +327,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, for (auto &Instr : *BB) { printInstruction(Instr); - - // Calculate the size of the instruction. - // Note: this is imprecise since happening prior to relaxation. 
- SmallString<256> Code; - SmallVector Fixups; - raw_svector_ostream VecOS(Code); - BC.MCE->encodeInstruction(Instr, VecOS, Fixups, *BC.STI); - Offset += Code.size(); + Offset += BC.getInstructionSize(Instr); } if (!BB->Successors.empty()) { diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index cd54382703e4..080f485aa1cf 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -42,6 +42,14 @@ Peepholes("peepholes", llvm::cl::init(true), llvm::cl::Optional); +static llvm::cl::opt +SimplifyRODataLoads("simplify-rodata-loads", + llvm::cl::desc("simplify loads from read-only sections by " + "replacing the memory operand with the " + "constant found in the corresponding " + "section"), + llvm::cl::Optional); + } // namespace opts namespace llvm { @@ -69,6 +77,9 @@ void BinaryFunctionPassManager::runAllPasses( std::move(llvm::make_unique(Manager.NagUser)), opts::EliminateUnreachable); + Manager.registerPass(llvm::make_unique(), + opts::SimplifyRODataLoads); + Manager.registerPass(std::move(llvm::make_unique())); Manager.registerPass(llvm::make_unique(), diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index c525ca9c4358..aa4c5241c9e1 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -22,6 +22,7 @@ extern llvm::cl::opt PrintReordered; extern llvm::cl::opt PrintEHRanges; extern llvm::cl::opt PrintUCE; extern llvm::cl::opt PrintPeepholes; +extern llvm::cl::opt PrintSimplifyROLoads; extern llvm::cl::opt SplitFunctions; extern bool shouldProcess(const llvm::bolt::BinaryFunction &Function); @@ -567,5 +568,119 @@ void Peepholes::runOnFunctions(BinaryContext &BC, } } +bool SimplifyRODataLoads::simplifyRODataLoads( + BinaryContext &BC, BinaryFunction &BF) { + auto &MIA = BC.MIA; + + uint64_t NumLocalLoadsSimplified = 0; + uint64_t NumDynamicLocalLoadsSimplified = 0; + uint64_t NumLocalLoadsFound = 0; + uint64_t NumDynamicLocalLoadsFound = 0; + + for (auto *BB : BF.layout()) { + for (auto &Inst : *BB) { + unsigned 
Opcode = Inst.getOpcode(); + const MCInstrDesc &Desc = BC.MII->get(Opcode); + + // Skip instructions that do not load from memory. + if (!Desc.mayLoad()) + continue; + + // Try to statically evaluate the target memory address; + uint64_t TargetAddress; + + if (MIA->hasRIPOperand(Inst)) { + // Try to find the symbol that corresponds to the rip-relative operand. + MCOperand DisplOp; + if (!MIA->getRIPOperandDisp(Inst, DisplOp)) + continue; + + assert(DisplOp.isExpr() && + "found rip-relative with non-symbolic displacement"); + + // Get displacement symbol. + const MCSymbolRefExpr *DisplExpr; + if (!(DisplExpr = dyn_cast(DisplOp.getExpr()))) + continue; + const MCSymbol &DisplSymbol = DisplExpr->getSymbol(); + + // Look up the symbol address in the global symbols map of the binary + // context object. + auto GI = BC.GlobalSymbols.find(DisplSymbol.getName().str()); + if (GI == BC.GlobalSymbols.end()) + continue; + TargetAddress = GI->second; + } else if (!MIA->evaluateMemOperand(Inst, TargetAddress)) { + continue; + } + + // Get the contents of the section containing the target addresss of the + // memory operand. We are only interested in read-only sections. 
+ ErrorOr DataSectionOrErr = + BC.getSectionForAddress(TargetAddress); + if (!DataSectionOrErr) + continue; + SectionRef DataSection = DataSectionOrErr.get(); + if (!DataSection.isReadOnly()) + continue; + uint32_t Offset = TargetAddress - DataSection.getAddress(); + StringRef ConstantData; + if (std::error_code EC = DataSection.getContents(ConstantData)) { + errs() << "BOLT-ERROR: 'cannot get section contents': " + << EC.message() << ".\n"; + exit(1); + } + + ++NumLocalLoadsFound; + if (BB->getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE) + NumDynamicLocalLoadsFound += BB->getExecutionCount(); + + if (MIA->replaceMemOperandWithImm(Inst, ConstantData, Offset)) { + ++NumLocalLoadsSimplified; + if (BB->getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE) + NumDynamicLocalLoadsSimplified += BB->getExecutionCount(); + } + } + } + + NumLoadsFound += NumLocalLoadsFound; + NumDynamicLoadsFound += NumDynamicLocalLoadsFound; + NumLoadsSimplified += NumLocalLoadsSimplified; + NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified; + + return NumLocalLoadsSimplified > 0; +} + +void SimplifyRODataLoads::runOnFunctions( + BinaryContext &BC, + std::map &BFs, + std::set & +) { + + for (auto &It : BFs) { + auto &Function = It.second; + + if (!Function.isSimple()) + continue; + + if (simplifyRODataLoads(BC, Function)) { + if (opts::PrintAll || opts::PrintSimplifyROLoads) { + Function.print(errs(), + "after simplifying read-only section loads", + true); + } + if (opts::DumpDotAll) { + Function.dumpGraphForPass("simplify-rodata-loads"); + } + } + } + + outs() << "BOLT: simplified " << NumLoadsSimplified << " out of "; + outs() << NumLoadsFound << " loads from a statically computed address.\n"; + outs() << "BOLT: dynamic loads simplified: " << NumDynamicLoadsSimplified; + outs() << "\n"; + outs() << "BOLT: dynamic loads found: " << NumDynamicLoadsFound << "\n"; +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryPasses.h b/bolt/BinaryPasses.h 
index 6d1390442b53..915ff285cdec 100644
--- a/bolt/BinaryPasses.h
+++ b/bolt/BinaryPasses.h
@@ -157,6 +157,31 @@ class Peepholes : public BinaryFunctionPass {
                       std::set &LargeFunctions) override;
 };
 
+/// An optimization to simplify loads from read-only sections. The pass converts
+/// load instructions with statically computed target address such as:
+///
+///      mov 0x12f(%rip), %eax
+///
+/// to their counterparts that use immediate operands instead of memory loads:
+///
+///      mov $0x4007dc, %eax
+///
+/// when the target address points somewhere inside a read-only section.
+///
+class SimplifyRODataLoads : public BinaryFunctionPass {
+  uint64_t NumLoadsSimplified{0};
+  uint64_t NumDynamicLoadsSimplified{0};
+  uint64_t NumLoadsFound{0};
+  uint64_t NumDynamicLoadsFound{0};
+
+  bool simplifyRODataLoads(BinaryContext &BC, BinaryFunction &BF);
+
+public:
+  void runOnFunctions(BinaryContext &BC,
+                      std::map &BFs,
+                      std::set &LargeFunctions) override;
+};
+
 } // namespace bolt
 } // namespace llvm
diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp
index e4840466eaa9..f645f68923b8 100644
--- a/bolt/RewriteInstance.cpp
+++ b/bolt/RewriteInstance.cpp
@@ -175,6 +175,12 @@ PrintEHRanges("print-eh-ranges",
               cl::desc("print function with updated exception ranges"),
               cl::Hidden);
 
+cl::opt
+PrintSimplifyROLoads("print-simplify-rodata-loads",
+                     cl::desc("print functions after simplification of RO data"
+                              " loads"),
+                     cl::Hidden);
+
 cl::opt
 PrintReordered("print-reordered",
                cl::desc("print functions after layout optimization"),
From 60af5417668f5fe087b18cca2f04737b295c7cd5 Mon Sep 17 00:00:00 2001
From: Bill Nell
Date: Sat, 23 Jul 2016 08:01:53 -0700
Subject: [PATCH 143/904] Factor out instruction printing and size computation.

Summary:
I've factored out the instruction printing and size computation routines
to methods on BinaryContext.  I've also added some more debug print
functions.  This was split off the ICP diff to simplify it a bit.
(cherry picked from commit fd2ca5604c1482a0c894f123660487161c686de3) --- bolt/BinaryBasicBlock.cpp | 15 +++++ bolt/BinaryBasicBlock.h | 4 ++ bolt/BinaryContext.cpp | 116 +++++++++++++++++++++++++++++++++++--- bolt/BinaryContext.h | 50 +++++++++++++++- bolt/BinaryFunction.cpp | 114 ++++++------------------------------- bolt/BinaryFunction.h | 10 ++-- bolt/RewriteInstance.cpp | 2 +- 7 files changed, 196 insertions(+), 115 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 35a0a314a29a..7f46b354f620 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -10,6 +10,7 @@ //===----------------------------------------------------------------------===// #include "BinaryBasicBlock.h" +#include "BinaryContext.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -75,5 +76,19 @@ bool BinaryBasicBlock::analyzeBranch(const MCInstrAnalysis &MIA, return MIA.analyzeBranch(Instructions, TBB, FBB, CondBranch, UncondBranch); } +void BinaryBasicBlock::dump(BinaryContext& BC) const { + if (Label) dbgs() << Label->getName() << ":\n"; + BC.printInstructions(dbgs(), Instructions.begin(), Instructions.end(), Offset); + dbgs() << "preds:"; + for (auto itr = pred_begin(); itr != pred_end(); ++itr) { + dbgs() << " " << (*itr)->getName(); + } + dbgs() << "\nsuccs:"; + for (auto itr = succ_begin(); itr != succ_end(); ++itr) { + dbgs() << " " << (*itr)->getName(); + } + dbgs() << "\n"; +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 6fa5ed2b40a5..3be0ab541717 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -34,6 +34,7 @@ namespace llvm { namespace bolt { class BinaryFunction; +class BinaryContext; /// The intention is to keep the structure similar to MachineBasicBlock as /// we might switch to it at some point. 
@@ -407,6 +408,9 @@ class BinaryBasicBlock { OS << getName(); } + /// A simple dump function for debugging. + void dump(BinaryContext &BC) const; + private: /// Adds predecessor to the BB. Most likely you don't need to call this. diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index e8d033422396..be42e8bd573b 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -15,10 +15,20 @@ #include "llvm/DebugInfo/DWARF/DWARFUnit.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" namespace llvm { namespace bolt { +namespace opts { + +static cl::opt +PrintDebugInfo("print-debug-info", + cl::desc("print debug info when printing functions"), + cl::Hidden); + +} // namespace opts + BinaryContext::~BinaryContext() { } MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, @@ -47,6 +57,12 @@ MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, return Symbol; } +void BinaryContext::printGlobalSymbols(raw_ostream& OS) const { + for (auto &entry : GlobalSymbols) { + OS << "(" << entry.first << " -> " << entry.second << ")\n"; + } +} + } // namespace bolt } // namespace llvm @@ -232,6 +248,98 @@ void BinaryContext::preprocessFunctionDebugInfo( } } +void BinaryContext::printCFI(raw_ostream &OS, uint32_t Operation) { + switch(Operation) { + case MCCFIInstruction::OpSameValue: OS << "OpSameValue"; break; + case MCCFIInstruction::OpRememberState: OS << "OpRememberState"; break; + case MCCFIInstruction::OpRestoreState: OS << "OpRestoreState"; break; + case MCCFIInstruction::OpOffset: OS << "OpOffset"; break; + case MCCFIInstruction::OpDefCfaRegister: OS << "OpDefCfaRegister"; break; + case MCCFIInstruction::OpDefCfaOffset: OS << "OpDefCfaOffset"; break; + case MCCFIInstruction::OpDefCfa: OS << "OpDefCfa"; break; + case MCCFIInstruction::OpRelOffset: OS << "OpRelOffset"; break; + case MCCFIInstruction::OpAdjustCfaOffset: OS << "OfAdjustCfaOffset"; break; + case MCCFIInstruction::OpEscape: 
OS << "OpEscape"; break; + case MCCFIInstruction::OpRestore: OS << "OpRestore"; break; + case MCCFIInstruction::OpUndefined: OS << "OpUndefined"; break; + case MCCFIInstruction::OpRegister: OS << "OpRegister"; break; + case MCCFIInstruction::OpWindowSave: OS << "OpWindowSave"; break; + case MCCFIInstruction::OpGnuArgsSize: OS << "OpGnuArgsSize"; break; + default: OS << "Op#" << Operation; break; + } +} + +void BinaryContext::printInstruction(raw_ostream &OS, + const MCInst &Instruction, + uint64_t Offset, + const BinaryFunction* Function, + bool printMCInst) const { + if (MIA->isEHLabel(Instruction)) { + OS << " EH_LABEL: " + << cast(Instruction.getOperand(0).getExpr())-> + getSymbol() + << '\n'; + return; + } + OS << format(" %08" PRIx64 ": ", Offset); + if (Function && MIA->isCFI(Instruction)) { + uint32_t Offset = Instruction.getOperand(0).getImm(); + OS << "\t!CFI\t$" << Offset << "\t; "; + printCFI(OS, Function->getCFIFor(Instruction)->getOperation()); + OS << "\n"; + return; + } + if (!MIA->isUnsupported(Instruction)) { + InstPrinter->printInst(&Instruction, OS, "", *STI); + } else { + OS << "unsupported (probably jmpr)"; + } + if (MIA->isCall(Instruction)) { + if (MIA->isTailCall(Instruction)) + OS << " # TAILCALL "; + if (MIA->isInvoke(Instruction)) { + const MCSymbol *LP; + uint64_t Action; + std::tie(LP, Action) = MIA->getEHInfo(Instruction); + OS << " # handler: "; + if (LP) + OS << *LP; + else + OS << '0'; + OS << "; action: " << Action; + auto GnuArgsSize = MIA->getGnuArgsSize(Instruction); + if (GnuArgsSize >= 0) + OS << "; GNU_args_size = " << GnuArgsSize; + } + } + + const DWARFDebugLine::LineTable *LineTable = + Function && opts::PrintDebugInfo ? 
Function->getDWARFUnitLineTable().second + : nullptr; + + if (LineTable) { + auto RowRef = DebugLineTableRowRef::fromSMLoc(Instruction.getLoc()); + + if (RowRef != DebugLineTableRowRef::NULL_ROW) { + const auto &Row = LineTable->Rows[RowRef.RowIndex - 1]; + OS << " # debug line " + << LineTable->Prologue.FileNames[Row.File - 1].Name + << ":" << Row.Line; + + if (Row.Column) { + OS << ":" << Row.Column; + } + } + } + + OS << "\n"; + + if (printMCInst) { + Instruction.dump_pretty(OS, InstPrinter.get()); + OS << "\n"; + } +} + ErrorOr BinaryContext::getSectionForAddress(uint64_t Address) const{ auto SI = AllocatableSections.upper_bound(Address); if (SI != AllocatableSections.begin()) { @@ -242,13 +350,5 @@ ErrorOr BinaryContext::getSectionForAddress(uint64_t Address) const{ return std::make_error_code(std::errc::bad_address); } -uint64_t BinaryContext::getInstructionSize(const MCInst &Instr) const { - SmallString<256> Code; - SmallVector Fixups; - raw_svector_ostream VecOS(Code); - MCE->encodeInstruction(Instr, VecOS, Fixups, *STI); - return Code.size(); -} - } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 28ea0b1be021..211736122407 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -32,6 +32,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/ErrorOr.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" #include #include @@ -157,6 +158,9 @@ class BinaryContext { /// return the first one. MCSymbol *getOrCreateGlobalSymbol(uint64_t Address, Twine Prefix); + /// Print the global symbol table. + void printGlobalSymbols(raw_ostream& OS) const; + /// Return (allocatable) section containing the given \p Address. ErrorOr getSectionForAddress(uint64_t Address) const; @@ -178,11 +182,53 @@ class BinaryContext { void preprocessFunctionDebugInfo( std::map &BinaryFunctions); - /// Calculate the size of the given instruction. 
+ /// Compute the native code size for a range of instructions. /// Note: this can be imprecise wrt the final binary since happening prior to /// relaxation, as well as wrt the original binary because of opcode /// shortening. - uint64_t getInstructionSize(const MCInst &Instr) const; + template + uint64_t computeCodeSize(Itr Beg, Itr End) const { + uint64_t Size = 0; + while (Beg != End) { + // Calculate the size of the instruction. + SmallString<256> Code; + SmallVector Fixups; + raw_svector_ostream VecOS(Code); + MCE->encodeInstruction(*Beg++, VecOS, Fixups, *STI); + Size += Code.size(); + } + return Size; + } + + /// Print the string name for a CFI operation. + static void printCFI(raw_ostream &OS, uint32_t Operation); + + /// Print a single MCInst in native format. If Function is non-null, + /// the instruction will be annotated with CFI and possibly DWARF line table + /// info. + /// If printMCInst is true, the instruction is also printed in the + /// architecture independent format. + void printInstruction(raw_ostream &OS, + const MCInst &Instruction, + uint64_t Offset = 0, + const BinaryFunction *Function = nullptr, + bool printMCInst = false) const; + + /// Print a range of instructions. 
+ template + uint64_t printInstructions(raw_ostream &OS, + Itr Begin, + Itr End, + uint64_t Offset = 0, + const BinaryFunction *Function = nullptr, + bool printMCInst = false) const { + while (Begin != End) { + printInstruction(OS, *Begin, Offset, Function, printMCInst); + Offset += computeCodeSize(Begin, Begin + 1); + ++Begin; + } + return Offset; + } }; } // namespace bolt diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index a53500bc9a28..8c25cd4c53f6 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -44,11 +44,6 @@ AgressiveSplitting("split-all-cold", cl::desc("outline as many cold basic blocks as possible"), cl::Optional); -static cl::opt -PrintDebugInfo("print-debug-info", - cl::desc("print debug info when printing functions"), - cl::Hidden); - } // namespace opts namespace { @@ -136,6 +131,11 @@ unsigned BinaryFunction::eraseDeadBBs( return Count; } +void BinaryFunction::dump(std::string Annotation, + bool PrintInstructions) const { + print(dbgs(), Annotation, PrintInstructions); +} + void BinaryFunction::print(raw_ostream &OS, std::string Annotation, bool PrintInstructions) const { StringRef SectionName; @@ -187,88 +187,6 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, // Offset of the instruction in function. 
uint64_t Offset{0}; - auto printCFI = [&OS] (uint32_t Operation) { - switch(Operation) { - case MCCFIInstruction::OpSameValue: OS << "OpSameValue"; break; - case MCCFIInstruction::OpRememberState: OS << "OpRememberState"; break; - case MCCFIInstruction::OpRestoreState: OS << "OpRestoreState"; break; - case MCCFIInstruction::OpOffset: OS << "OpOffset"; break; - case MCCFIInstruction::OpDefCfaRegister: OS << "OpDefCfaRegister"; break; - case MCCFIInstruction::OpDefCfaOffset: OS << "OpDefCfaOffset"; break; - case MCCFIInstruction::OpDefCfa: OS << "OpDefCfa"; break; - case MCCFIInstruction::OpRelOffset: OS << "OpRelOffset"; break; - case MCCFIInstruction::OpAdjustCfaOffset: OS << "OfAdjustCfaOffset"; break; - case MCCFIInstruction::OpEscape: OS << "OpEscape"; break; - case MCCFIInstruction::OpRestore: OS << "OpRestore"; break; - case MCCFIInstruction::OpUndefined: OS << "OpUndefined"; break; - case MCCFIInstruction::OpRegister: OS << "OpRegister"; break; - case MCCFIInstruction::OpWindowSave: OS << "OpWindowSave"; break; - case MCCFIInstruction::OpGnuArgsSize: OS << "OpGnuArgsSize"; break; - default: OS << "Op#" << Operation; break; - } - }; - - // Used in printInstruction below to print debug line information. - const DWARFDebugLine::LineTable *LineTable = - opts::PrintDebugInfo ? 
getDWARFUnitLineTable().second - : nullptr; - - auto printInstruction = [&](const MCInst &Instruction) { - if (BC.MIA->isEHLabel(Instruction)) { - OS << " EH_LABEL: " - << cast(Instruction.getOperand(0).getExpr())-> - getSymbol() - << '\n'; - return; - } - OS << format(" %08" PRIx64 ": ", Offset); - if (BC.MIA->isCFI(Instruction)) { - uint32_t Offset = Instruction.getOperand(0).getImm(); - OS << "\t!CFI\t$" << Offset << "\t; "; - assert(Offset < FrameInstructions.size() && "Invalid CFI offset"); - printCFI(FrameInstructions[Offset].getOperation()); - OS << "\n"; - return; - } - BC.InstPrinter->printInst(&Instruction, OS, "", *BC.STI); - if (BC.MIA->isCall(Instruction)) { - if (BC.MIA->isTailCall(Instruction)) - OS << " # TAILCALL "; - if (BC.MIA->isInvoke(Instruction)) { - const MCSymbol *LP; - uint64_t Action; - std::tie(LP, Action) = BC.MIA->getEHInfo(Instruction); - OS << " # handler: "; - if (LP) - OS << *LP; - else - OS << '0'; - OS << "; action: " << Action; - auto GnuArgsSize = BC.MIA->getGnuArgsSize(Instruction); - if (GnuArgsSize >= 0) - OS << "; GNU_args_size = " << GnuArgsSize; - } - } - if (opts::PrintDebugInfo && LineTable) { - auto RowRef = DebugLineTableRowRef::fromSMLoc(Instruction.getLoc()); - - if (RowRef != DebugLineTableRowRef::NULL_ROW) { - const auto &Row = LineTable->Rows[RowRef.RowIndex - 1]; - OS << " # debug line " - << LineTable->Prologue.FileNames[Row.File - 1].Name - << ":" << Row.Line; - - if (Row.Column) { - OS << ":" << Row.Column; - } - } - } - - OS << "\n"; - // In case we need MCInst printer: - // Instr.dump_pretty(OS, InstructionPrinter.get()); - }; - if (BasicBlocks.empty() && !Instructions.empty()) { // Print before CFG was built. 
for (const auto &II : Instructions) { @@ -279,7 +197,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, if (LI != Labels.end()) OS << LI->second->getName() << ":\n"; - printInstruction(II.second); + BC.printInstruction(OS, II.second, Offset, this); } } @@ -325,10 +243,8 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, Offset = RoundUpToAlignment(Offset, BB->getAlignment()); - for (auto &Instr : *BB) { - printInstruction(Instr); - Offset += BC.getInstructionSize(Instr); - } + // Note: offsets are imprecise since this is happening prior to relaxation. + Offset = BC.printInstructions(OS, BB->begin(), BB->end(), Offset, this); if (!BB->Successors.empty()) { OS << " Successors: "; @@ -387,7 +303,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, for (auto &Elmt : OffsetToCFI) { OS << format(" %08x:\t", Elmt.first); assert(Elmt.second < FrameInstructions.size() && "Incorrect CFI offset"); - printCFI(FrameInstructions[Elmt.second].getOperation()); + BinaryContext::printCFI(OS, FrameInstructions[Elmt.second].getOperation()); OS << "\n"; } } else { @@ -395,7 +311,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, for (uint32_t I = 0, E = FrameInstructions.size(); I != E; ++I) { const MCCFIInstruction &CFI = FrameInstructions[I]; OS << format(" %d:\t", I); - printCFI(CFI.getOperation()); + BinaryContext::printCFI(OS, CFI.getOperation()); OS << "\n"; } } @@ -1109,7 +1025,7 @@ void BinaryFunction::annotateCFIState() { // Advance state for (const auto &Instr : *CurBB) { - MCCFIInstruction *CFI = getCFIFor(Instr); + auto *CFI = getCFIFor(Instr); if (CFI == nullptr) continue; ++HighestState; @@ -1221,7 +1137,7 @@ bool BinaryFunction::fixCFIState() { if (CurBB == BB) break; for (auto &Instr : *CurBB) { - if (MCCFIInstruction *CFI = getCFIFor(Instr)) { + if (auto *CFI = getCFIFor(Instr)) { if (CFI->getOperation() == MCCFIInstruction::OpRememberState) ++StackOffset; if (CFI->getOperation() == 
MCCFIInstruction::OpRestoreState) @@ -1369,7 +1285,9 @@ void BinaryFunction::viewGraph() const { } void BinaryFunction::dumpGraphForPass(std::string Annotation) const { - dumpGraphToFile(constructFilename(getName(), Annotation, ".dot")); + auto Filename = constructFilename(getName(), Annotation, ".dot"); + dbgs() << "BOLT-DEBUG: Dumping CFG to " << Filename << "\n"; + dumpGraphToFile(Filename); } void BinaryFunction::dumpGraphToFile(std::string Filename) const { @@ -1463,7 +1381,7 @@ void BinaryFunction::fixBranches() { // invert this conditional branch logic so we can make this a fallthrough. if (TBB == FT && !HotColdBorder) { if (OldFT == nullptr) { - errs() << "BOLT-ERROR: malfromed CFG for function " << getName() + errs() << "BOLT-ERROR: malformed CFG for function " << getName() << " in basic block " << BB->getName() << '\n'; } assert(OldFT != nullptr && "malformed CFG"); diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 6bea0a9b8f9a..aa812ec9bb54 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -424,11 +424,11 @@ class BinaryFunction : public AddressRangesOwner { /// Returns the n-th basic block in this function in its original layout, or /// nullptr if n >= size(). - const BinaryBasicBlock * getBasicBlockAtIndex(unsigned Index) const { + const BinaryBasicBlock *getBasicBlockAtIndex(unsigned Index) const { return BasicBlocks.at(Index); } - BinaryBasicBlock * getBasicBlockAtIndex(unsigned Index) { + BinaryBasicBlock *getBasicBlockAtIndex(unsigned Index) { return BasicBlocks.at(Index); } @@ -598,9 +598,7 @@ class BinaryFunction : public AddressRangesOwner { /// Dump function information to debug output. If \p PrintInstructions /// is true - include instruction disassembly. - void dump(std::string Annotation = "", bool PrintInstructions = true) const { - print(dbgs(), Annotation, PrintInstructions); - } + void dump(std::string Annotation = "", bool PrintInstructions = true) const; /// Print function information to the \p OS stream. 
void print(raw_ostream &OS, std::string Annotation = "", @@ -669,7 +667,7 @@ class BinaryFunction : public AddressRangesOwner { } /// Retrieve the MCCFIInstruction object associated with a CFI pseudo. - MCCFIInstruction* getCFIFor(const MCInst &Instr) { + const MCCFIInstruction* getCFIFor(const MCInst &Instr) const { if (!BC.MIA->isCFI(Instr)) return nullptr; uint32_t Offset = Instr.getOperand(0).getImm(); diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index f645f68923b8..a193c0131209 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1060,7 +1060,7 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, BinaryContext &BC, bool EmitColdPart) { // Define a helper to decode and emit CFI instructions at a given point in a // BB - auto emitCFIInstr = [&Streamer](MCCFIInstruction &CFIInstr) { + auto emitCFIInstr = [&Streamer](const MCCFIInstruction &CFIInstr) { switch (CFIInstr.getOperation()) { default: llvm_unreachable("Unexpected instruction"); From 58eae6d963aabf7a11eb76788e2bc48ec8ea35fb Mon Sep 17 00:00:00 2001 From: Theodoros Kasampalis Date: Thu, 9 Jun 2016 11:36:55 -0700 Subject: [PATCH 144/904] Identical Code Folding (ICF) pass Summary: Added an ICF pass to BOLT, that can recognize identical functions and replace references to these functions with references to just one representative. 
(cherry picked from commit bc6d021ba58ce99bae5e96cf71e5afcf55c065d3) --- bolt/BinaryBasicBlock.cpp | 18 ++ bolt/BinaryBasicBlock.h | 17 ++ bolt/BinaryFunction.cpp | 373 +++++++++++++++++++++++++++++++++++++ bolt/BinaryFunction.h | 67 ++++++- bolt/BinaryPassManager.cpp | 9 + bolt/BinaryPasses.cpp | 228 +++++++++++++++++++++++ bolt/BinaryPasses.h | 41 ++++ bolt/RewriteInstance.cpp | 15 ++ 8 files changed, 766 insertions(+), 2 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 7f46b354f620..3be44979feeb 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -29,6 +29,24 @@ bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) { return LHS.Offset < RHS.Offset; } +BinaryBasicBlock *BinaryBasicBlock::getSuccessor(const MCSymbol *Label) const { + for (BinaryBasicBlock *BB : successors()) { + if (BB->getLabel() == Label) + return BB; + } + + return nullptr; +} + +BinaryBasicBlock *BinaryBasicBlock::getLandingPad(const MCSymbol *Label) const { + for (BinaryBasicBlock *BB : landing_pads()) { + if (BB->getLabel() == Label) + return BB; + } + + return nullptr; +} + void BinaryBasicBlock::addSuccessor(BinaryBasicBlock *Succ, uint64_t Count, uint64_t MispredictedCount) { diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 3be0ab541717..344d8e28f49a 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -273,11 +273,28 @@ class BinaryBasicBlock { branch_info_begin(), branch_info_end()); } + /// Get instruction at given index. + MCInst &getInstructionAtIndex(unsigned Index) { + return Instructions.at(Index); + } + + const MCInst &getInstructionAtIndex(unsigned Index) const { + return Instructions.at(Index); + } + /// Return symbol marking the start of this basic block. MCSymbol *getLabel() const { return Label; } + /// Get successor with given label. Returns nullptr if no such + /// successor is found. 
+ BinaryBasicBlock *getSuccessor(const MCSymbol *Label) const; + + /// Get landing pad with given label. Returns nullptr if no such + /// landing pad is found. + BinaryBasicBlock *getLandingPad(const MCSymbol *Label) const; + /// Return local name for the block. StringRef getName() const { return Label->getName(); diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 8c25cd4c53f6..43faae7a83bb 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #undef DEBUG_TYPE #define DEBUG_TYPE "bolt" @@ -178,6 +179,8 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << "\n Exec Count : " << ExecutionCount; OS << "\n Profile Acc : " << format("%.1f%%", ProfileMatchRatio * 100.0f); } + if (IdenticalFunctionAddress != Address) + OS << "\n Id Fun Addr : 0x" << Twine::utohexstr(IdenticalFunctionAddress); OS << "\n}\n"; @@ -1538,6 +1541,376 @@ void BinaryFunction::propagateGnuArgsSizeInfo() { } } +void BinaryFunction::mergeProfileDataInto(BinaryFunction &BF) const { + if (!hasValidProfile() || !BF.hasValidProfile()) + return; + + // Update BF's execution count. + uint64_t MyExecutionCount = getExecutionCount(); + if (MyExecutionCount != BinaryFunction::COUNT_NO_PROFILE) { + uint64_t OldExecCount = BF.getExecutionCount(); + uint64_t NewExecCount = + OldExecCount == BinaryFunction::COUNT_NO_PROFILE ? + MyExecutionCount : + MyExecutionCount + OldExecCount; + BF.setExecutionCount(NewExecCount); + } + + // Update BF's basic block and edge counts. + auto BBMergeI = BF.begin(); + for (BinaryBasicBlock *BB : BasicBlocks) { + BinaryBasicBlock *BBMerge = &*BBMergeI; + assert(getIndex(BB) == BF.getIndex(BBMerge)); + + // Update BF's basic block count. 
+ uint64_t MyBBExecutionCount = BB->getExecutionCount(); + if (MyBBExecutionCount != BinaryBasicBlock::COUNT_NO_PROFILE) { + uint64_t OldExecCount = BBMerge->getExecutionCount(); + uint64_t NewExecCount = + OldExecCount == BinaryBasicBlock::COUNT_NO_PROFILE ? + MyBBExecutionCount : + MyBBExecutionCount + OldExecCount; + BBMerge->ExecutionCount = NewExecCount; + } + + // Update BF's edge count for successors of this basic block. + auto BBMergeSI = BBMerge->succ_begin(); + auto BII = BB->BranchInfo.begin(); + auto BIMergeI = BBMerge->BranchInfo.begin(); + for (BinaryBasicBlock *BBSucc : BB->successors()) { + BinaryBasicBlock *BBMergeSucc = *BBMergeSI; + assert(getIndex(BBSucc) == BF.getIndex(BBMergeSucc)); + + if (BII->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) { + uint64_t OldBranchCount = BIMergeI->Count; + uint64_t NewBranchCount = + OldBranchCount == BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE ? + BII->Count : + BII->Count + OldBranchCount; + BIMergeI->Count = NewBranchCount; + } + + if (BII->MispredictedCount != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) { + uint64_t OldMispredictedCount = BIMergeI->MispredictedCount; + uint64_t NewMispredictedCount = + OldMispredictedCount == BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE ? + BII->MispredictedCount : + BII->MispredictedCount + OldMispredictedCount; + BIMergeI->MispredictedCount = NewMispredictedCount; + } + + ++BBMergeSI; + ++BII; + ++BIMergeI; + } + assert(BBMergeSI == BBMerge->succ_end()); + + ++BBMergeI; + } + assert(BBMergeI == BF.end()); +} + +std::pair BinaryFunction::isCalleeEquivalentWith( + const MCInst &Inst, const BinaryBasicBlock &BB, const MCInst &InstOther, + const BinaryBasicBlock &BBOther, const BinaryFunction &BF) const { + // The callee operand in a direct call is the first operand. This + // operand should be a symbol corresponding to the callee function. + constexpr unsigned CalleeOpIndex = 0; + + // Helper function. 
+ auto getGlobalAddress = [this] (const MCSymbol &Symbol) -> uint64_t { + auto AI = BC.GlobalSymbols.find(Symbol.getName()); + assert(AI != BC.GlobalSymbols.end()); + return AI->second; + }; + + const MCOperand &CalleeOp = Inst.getOperand(CalleeOpIndex); + const MCOperand &CalleeOpOther = InstOther.getOperand(CalleeOpIndex); + if (!CalleeOp.isExpr() || !CalleeOpOther.isExpr()) { + // At least one of these is actually an indirect call. + return std::make_pair(false, 0); + } + + const MCSymbol &CalleeSymbol = CalleeOp.getExpr()->getSymbol(); + uint64_t CalleeAddress = getGlobalAddress(CalleeSymbol); + + const MCSymbol &CalleeSymbolOther = CalleeOpOther.getExpr()->getSymbol(); + uint64_t CalleeAddressOther = getGlobalAddress(CalleeSymbolOther); + + bool BothRecursiveCalls = + CalleeAddress == getAddress() && + CalleeAddressOther == BF.getAddress(); + + bool SameCallee = CalleeAddress == CalleeAddressOther; + + return std::make_pair(BothRecursiveCalls || SameCallee, CalleeOpIndex); +} + +std::pair BinaryFunction::isTargetEquivalentWith( + const MCInst &Inst, const BinaryBasicBlock &BB, const MCInst &InstOther, + const BinaryBasicBlock &BBOther, const BinaryFunction &BF, + bool AreInvokes) const { + // The target operand in a (non-indirect) jump instruction is the + // first operand. + unsigned TargetOpIndex = 0; + if (AreInvokes) { + // The landing pad operand in an invoke is either the second or the + // sixth operand, depending on the number of operands of the invoke. + TargetOpIndex = 1; + if (Inst.getNumOperands() == 7 || Inst.getNumOperands() == 8) + TargetOpIndex = 5; + } + + const MCOperand &TargetOp = Inst.getOperand(TargetOpIndex); + const MCOperand &TargetOpOther = InstOther.getOperand(TargetOpIndex); + if (!TargetOp.isExpr() || !TargetOpOther.isExpr()) { + assert(AreInvokes); + // An invoke without a landing pad operand has no catch handler. As long + // as both invokes have no catch target, we can consider they have the + // same catch target. 
+ return std::make_pair(!TargetOp.isExpr() && !TargetOpOther.isExpr(), + TargetOpIndex); + } + + const MCSymbol &TargetSymbol = TargetOp.getExpr()->getSymbol(); + BinaryBasicBlock *TargetBB = + AreInvokes ? + BB.getLandingPad(&TargetSymbol) : + BB.getSuccessor(&TargetSymbol); + + const MCSymbol &TargetSymbolOther = TargetOpOther.getExpr()->getSymbol(); + BinaryBasicBlock *TargetBBOther = + AreInvokes ? + BBOther.getLandingPad(&TargetSymbolOther) : + BBOther.getSuccessor(&TargetSymbolOther); + + if (TargetBB == nullptr || TargetBBOther == nullptr) { + assert(!AreInvokes); + // This is a tail call implemented with a jump that was not + // converted to a call (e.g. conditional jump). Since the + // instructions were not identical, the functions cannot be + // proven identical either. + return std::make_pair(false, 0); + } + + return std::make_pair(getIndex(TargetBB) == BF.getIndex(TargetBBOther), + TargetOpIndex); +} + +bool BinaryFunction::isInstrEquivalentWith( + const MCInst &Inst, const BinaryBasicBlock &BB, const MCInst &InstOther, + const BinaryBasicBlock &BBOther, const BinaryFunction &BF) const { + // First check their opcodes. + if (Inst.getOpcode() != InstOther.getOpcode()) { + return false; + } + + // Then check if they have the same number of operands. + unsigned NumOperands = Inst.getNumOperands(); + unsigned NumOperandsOther = InstOther.getNumOperands(); + if (NumOperands != NumOperandsOther) { + return false; + } + + // We are interested in 3 special cases: + // + // a) both instructions are recursive calls. + // b) both instructions are local jumps to basic blocks with same indices. + // c) both instructions are invokes with landing pad blocks with same indices. + // + // In any of these cases the instructions will differ in some operands, but + // given identical CFG of the functions, they can still be considered + // equivalent. 
+ bool BothCalls = + BC.MIA->isCall(Inst) && + BC.MIA->isCall(InstOther); + bool BothInvokes = + BC.MIA->isInvoke(Inst) && + BC.MIA->isInvoke(InstOther); + bool BothBranches = + BC.MIA->isBranch(Inst) && + !BC.MIA->isIndirectBranch(Inst) && + BC.MIA->isBranch(InstOther) && + !BC.MIA->isIndirectBranch(InstOther); + + if (!BothCalls && !BothInvokes && !BothBranches) { + return Inst.equals(InstOther); + } + + // We figure out if both instructions are recursive calls (case a) or else + // if they are calls to the same function. + bool EquivCallees = false; + unsigned CalleeOpIndex = 0; + if (BothCalls) { + std::tie(EquivCallees, CalleeOpIndex) = + isCalleeEquivalentWith(Inst, BB, InstOther, BBOther, BF); + } + + // We figure out if both instructions are jumps (case b) or invokes (case c) + // with equivalent jump targets or landing pads respectively. + assert(!(BothInvokes && BothBranches)); + bool SameTarget = false; + unsigned TargetOpIndex = 0; + if (BothInvokes || BothBranches) { + std::tie(SameTarget, TargetOpIndex) = + isTargetEquivalentWith(Inst, BB, InstOther, BBOther, BF, BothInvokes); + } + + // Compare all operands. + for (unsigned i = 0; i < NumOperands; ++i) { + if (i == CalleeOpIndex && BothCalls && EquivCallees) + continue; + + if (i == TargetOpIndex && (BothInvokes || BothBranches) && SameTarget) + continue; + + if (!Inst.getOperand(i).equals(InstOther.getOperand(i))) + return false; + } + + // The instructions are equal although (some of) their operands + // may differ. + return true; +} + +bool BinaryFunction::isIdenticalWith(const BinaryFunction &BF) const { + + assert(CurrentState == State::CFG && BF.CurrentState == State::CFG); + + // Compare the two functions, one basic block at a time. + // Currently we require two identical basic blocks to have identical + // instruction sequences and the same index in their corresponding + // functions. The latter is important for CFG equality. 
+ + // We do not consider functions with just different pseudo instruction + // sequences non-identical by default. However we print a warning + // in case two instructions that are identical have different pseudo + // instruction sequences. + bool PseudosDiffer = false; + + if (size() != BF.size()) + return false; + + auto BBI = BF.begin(); + for (const BinaryBasicBlock *BB : BasicBlocks) { + const BinaryBasicBlock *BBOther = &*BBI; + if (getIndex(BB) != BF.getIndex(BBOther)) + return false; + + // Compare successor basic blocks. + if (BB->succ_size() != BBOther->succ_size()) + return false; + + auto SuccBBI = BBOther->succ_begin(); + for (const BinaryBasicBlock *SuccBB : BB->successors()) { + const BinaryBasicBlock *SuccBBOther = *SuccBBI; + if (getIndex(SuccBB) != BF.getIndex(SuccBBOther)) + return false; + ++SuccBBI; + } + + // Compare landing pads. + if (BB->lp_size() != BBOther->lp_size()) + return false; + + auto LPI = BBOther->lp_begin(); + for (const BinaryBasicBlock *LP : BB->landing_pads()) { + const BinaryBasicBlock *LPOther = *LPI; + if (getIndex(LP) != BF.getIndex(LPOther)) + return false; + ++LPI; + } + + // Compare instructions. + auto I = BB->begin(), E = BB->end(); + auto OtherI = BBOther->begin(), OtherE = BBOther->end(); + while (I != E && OtherI != OtherE) { + const MCInst &Inst = *I; + const MCInst &InstOther = *OtherI; + + bool IsInstPseudo = BC.MII->get(Inst.getOpcode()).isPseudo(); + bool IsInstOtherPseudo = BC.MII->get(InstOther.getOpcode()).isPseudo(); + + if (IsInstPseudo == IsInstOtherPseudo) { + // Either both are pseudos or none is. + bool areEqual = + isInstrEquivalentWith(Inst, *BB, InstOther, *BBOther, BF); + + if (!areEqual && IsInstPseudo) { + // Different pseudo instructions. + PseudosDiffer = true; + } + else if (!areEqual) { + // Different non-pseudo instructions. + return false; + } + + ++I; ++OtherI; + } + else { + // One instruction is a pseudo while the other is not. + PseudosDiffer = true; + IsInstPseudo ? 
++I : ++OtherI; + } + } + + // Check for trailing instructions or pseudos in one of the basic blocks. + auto TrailI = I == E ? OtherI : I; + auto TrailE = I == E ? OtherE : E; + while (TrailI != TrailE) { + const MCInst &InstTrail = *TrailI; + if (!BC.MII->get(InstTrail.getOpcode()).isPseudo()) { + // One of the functions has more instructions in this basic block + // than the other, hence not identical. + return false; + } + + // There are trailing pseudos only in one of the basic blocks. + PseudosDiffer = true; + ++TrailI; + } + + ++BBI; + } + + if (PseudosDiffer) { + errs() << "BOLT-WARNING: functions " << getName() << " and "; + errs() << BF.getName() << " are identical, but have different"; + errs() << " pseudo instruction sequences.\n"; + } + + return true; +} + +std::size_t BinaryFunction::hash() const { + assert(CurrentState == State::CFG); + + // The hash is computed by creating a string of all the opcodes + // in the function and hashing that string with std::hash. + std::string Opcodes; + for (const BinaryBasicBlock *BB : BasicBlocks) { + for (const MCInst &Inst : *BB) { + unsigned Opcode = Inst.getOpcode(); + + if (BC.MII->get(Opcode).isPseudo()) + continue; + + if (Opcode == 0) { + Opcodes.push_back(0); + continue; + } + + while (Opcode) { + uint8_t LSB = Opcode & 0xff; + Opcodes.push_back(LSB); + Opcode = Opcode >> 8; + } + } + } + + return std::hash{}(Opcodes); +} + BinaryFunction::~BinaryFunction() { for (auto BB : BasicBlocks) { delete BB; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index aa812ec9bb54..527e5e5fd2eb 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -117,6 +117,16 @@ class BinaryFunction : public AddressRangesOwner { /// base address for position independent binaries. uint64_t Address; + /// Address of an identical function that can replace this one. By default + /// this is the same as the address of this functions, and the icf pass can + /// potentially set it to some other function's address. 
+ /// + /// In case multiple functions are identical to each other, one of the + /// functions (the representative) will point to its own address, while the + /// rest of the functions will point to the representative through one or + /// more steps. + uint64_t IdenticalFunctionAddress; + /// Original size of the function. uint64_t Size; @@ -196,6 +206,32 @@ class BinaryFunction : public AddressRangesOwner { return *this; } + /// Helper function that compares an instruction of this function to the + /// given instruction of the given function. The functions should have + /// identical CFG. + bool isInstrEquivalentWith( + const MCInst &Inst, const BinaryBasicBlock &BB, const MCInst &InstOther, + const BinaryBasicBlock &BBOther, const BinaryFunction &BF) const; + + /// Helper function that compares the callees of two call instructions. + /// Callees are considered equivalent if both refer to the same function + /// or if both calls are recursive. Instructions should have same opcodes + /// and same number of operands. Returns true and the callee operand index + /// when callees are equivalent, and false, 0 otherwise. + std::pair isCalleeEquivalentWith( + const MCInst &Inst, const BinaryBasicBlock &BB, const MCInst &InstOther, + const BinaryBasicBlock &BBOther, const BinaryFunction &BF) const; + + /// Helper function that compares the targets of two jump or invoke instructions. + /// A target of an invoke we consider its landing pad basic block. The + /// corresponding functions should have identical CFG. Instructions should + /// have same opcodes and same number of operands. Returns true and the target + /// operand index when targets are equivalent, and false, 0 otherwise. 
+ std::pair isTargetEquivalentWith( + const MCInst &Inst, const BinaryBasicBlock &BB, const MCInst &InstOther, + const BinaryBasicBlock &BBOther, const BinaryFunction &BF, + bool AreInvokes) const; + /// Return basic block that originally was laid out immediately following /// the given /p BB basic block. const BinaryBasicBlock * @@ -381,8 +417,8 @@ class BinaryFunction : public AddressRangesOwner { BinaryFunction(const std::string &Name, SymbolRef Symbol, SectionRef Section, uint64_t Address, uint64_t Size, BinaryContext &BC, bool IsSimple = true) : - Names({Name}), Symbol(Symbol), Section(Section), - Address(Address), Size(Size), BC(BC), IsSimple(IsSimple), + Names({Name}), Symbol(Symbol), Section(Section), Address(Address), + IdenticalFunctionAddress(Address), Size(Size), BC(BC), IsSimple(IsSimple), CodeSectionName(".text." + Name), FunctionNumber(++Count) {} @@ -460,6 +496,10 @@ class BinaryFunction : public AddressRangesOwner { return Names; } + State getCurrentState() const { + return CurrentState; + } + /// Return containing file section. SectionRef getSection() const { return Section; @@ -778,6 +818,17 @@ class BinaryFunction : public AddressRangesOwner { return LSDAAddress; } + /// Return the address of an identical function. If none is found this will + /// return this function's address. + uint64_t getIdenticalFunctionAddress() const { + return IdenticalFunctionAddress; + } + + /// Set the address of an identical function. + void setIdenticalFunctionAddress(uint64_t Address) { + IdenticalFunctionAddress = Address; + } + /// Return symbol pointing to function's LSDA. MCSymbol *getLSDASymbol() { if (LSDASymbol) @@ -864,6 +915,18 @@ class BinaryFunction : public AddressRangesOwner { /// Emit exception handling ranges for the function. void emitLSDA(MCStreamer *Streamer); + /// Merge profile data of this function into those of the given + /// function. The functions should have been proven identical with + /// isIdenticalWith. 
+ void mergeProfileDataInto(BinaryFunction &BF) const; + + /// Returns true if this function has identical code and + /// CFG with the given function. + bool isIdenticalWith(const BinaryFunction &BF) const; + + /// Returns a hash value for the function. To be used for ICF. + std::size_t hash() const; + /// Sets the associated .debug_info entry. void addSubprogramDIE(DWARFCompileUnit *Unit, const DWARFDebugInfoEntryMinimal *DIE) { diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 080f485aa1cf..10c981a78382 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -50,6 +50,12 @@ SimplifyRODataLoads("simplify-rodata-loads", "section"), llvm::cl::Optional); +static llvm::cl::opt +IdenticalCodeFolding( + "icf", + llvm::cl::desc("fold functions with identical code"), + llvm::cl::Optional); + } // namespace opts namespace llvm { @@ -73,6 +79,9 @@ void BinaryFunctionPassManager::runAllPasses( // Here we manage dependencies/order manually, since passes are ran in the // order they're registered. 
+ Manager.registerPass(llvm::make_unique(), + opts::IdenticalCodeFolding); + Manager.registerPass( std::move(llvm::make_unique(Manager.NagUser)), opts::EliminateUnreachable); diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index aa4c5241c9e1..c4394c65d67b 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -11,6 +11,7 @@ #include "BinaryPasses.h" #include "llvm/Support/Options.h" +#include #define DEBUG_TYPE "bolt" @@ -23,6 +24,7 @@ extern llvm::cl::opt PrintEHRanges; extern llvm::cl::opt PrintUCE; extern llvm::cl::opt PrintPeepholes; extern llvm::cl::opt PrintSimplifyROLoads; +extern llvm::cl::opt PrintICF; extern llvm::cl::opt SplitFunctions; extern bool shouldProcess(const llvm::bolt::BinaryFunction &Function); @@ -682,5 +684,231 @@ void SimplifyRODataLoads::runOnFunctions( outs() << "BOLT: dynamic loads found: " << NumDynamicLoadsFound << "\n"; } +void IdenticalCodeFolding::discoverCallers( + BinaryContext &BC, std::map &BFs) { + for (auto &I : BFs) { + BinaryFunction &Caller = I.second; + + if (!Caller.isSimple()) + continue; + + for (BinaryBasicBlock &BB : Caller) { + unsigned BlockIndex = Caller.getIndex(&BB); + unsigned InstrIndex = 0; + + for (MCInst &Inst : BB) { + if (!BC.MIA->isCall(Inst)) { + ++InstrIndex; + continue; + } + + const MCOperand &TargetOp = Inst.getOperand(0); + if (!TargetOp.isExpr()) { + // This is an inderect call, we cannot record + // a target. + ++InstrIndex; + continue; + } + + // Find the target function for this call. + const MCExpr *TargetExpr = TargetOp.getExpr(); + assert(TargetExpr->getKind() == MCExpr::SymbolRef); + const MCSymbol &TargetSymbol = + dyn_cast(TargetExpr)->getSymbol(); + auto AI = BC.GlobalSymbols.find(TargetSymbol.getName()); + assert(AI != BC.GlobalSymbols.end()); + uint64_t TargetAddress = AI->second; + auto FI = BFs.find(TargetAddress); + if (FI == BFs.end()) { + // Call to a function without a BinaryFunction object. 
+ ++InstrIndex; + continue; + } + BinaryFunction *Callee = &FI->second; + + // Insert a tuple in the Callers map. + Callers[Callee].emplace_back( + CallSite(&Caller, BlockIndex, InstrIndex)); + + ++InstrIndex; + } + } + } +} + +void IdenticalCodeFolding::foldFunction( + BinaryContext &BC, + std::map &BFs, + BinaryFunction *BFToFold, + BinaryFunction *BFToReplaceWith, + std::set &Modified) { + + // Mark BFToFold as identical with BFToReplaceWith. + BFToFold->setIdenticalFunctionAddress(BFToReplaceWith->getAddress()); + + // Add the size of BFToFold to the total size savings estimate. + BytesSavedEstimate += BFToFold->getSize(); + + // Get callers of BFToFold. + auto CI = Callers.find(BFToFold); + if (CI == Callers.end()) + return; + std::vector &BFToFoldCallers = CI->second; + + // Get callers of BFToReplaceWith. + std::vector &BFToReplaceWithCallers = Callers[BFToReplaceWith]; + + // Get MCSymbol for BFToReplaceWith. + MCSymbol *SymbolToReplaceWith = + BC.getOrCreateGlobalSymbol(BFToReplaceWith->getAddress(), ""); + + // Traverse callers of BFToFold and replace the calls with calls + // to BFToReplaceWith. + for (const CallSite &CS : BFToFoldCallers) { + // Get call instruction. + BinaryFunction *Caller = CS.Caller; + BinaryBasicBlock *CallBB = Caller->getBasicBlockAtIndex(CS.BlockIndex); + MCInst &CallInst = CallBB->getInstructionAtIndex(CS.InstrIndex); + + // Replace call target with BFToReplaceWith. + MCOperand CallTargetOp = + MCOperand::createExpr( + MCSymbolRefExpr::create( + SymbolToReplaceWith, MCSymbolRefExpr::VK_None, *BC.Ctx)); + assert(BC.MIA->replaceCallTargetOperand(CallInst, CallTargetOp) && + "unexpected call target prevented the replacement"); + + // Add this call site to the callers of BFToReplaceWith. + BFToReplaceWithCallers.emplace_back(CS); + + // Add caller to the set of modified functions. + Modified.insert(Caller); + + // Update dynamic calls folded stat. 
+ if (Caller->hasValidProfile() && + CallBB->getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE) + NumDynamicCallsFolded += CallBB->getExecutionCount(); + } + + // Remove all callers of BFToFold. + BFToFoldCallers.clear(); + + ++NumFunctionsFolded; + + // Merge execution counts of BFToFold into those of BFToReplaceWith. + BFToFold->mergeProfileDataInto(*BFToReplaceWith); +} + +void IdenticalCodeFolding::runOnFunctions( + BinaryContext &BC, + std::map &BFs, + std::set & +) { + + discoverCallers(BC, BFs); + + // This hash table is used to identify identical functions. It maps + // a function to a bucket of functions identical to it. + struct KeyHash { + std::size_t operator()(const BinaryFunction *F) const { return F->hash(); } + }; + struct KeyEqual { + bool operator()(const BinaryFunction *A, const BinaryFunction *B) const { + return A->isIdenticalWith(*B); + } + }; + std::unordered_map, + KeyHash, KeyEqual> Buckets; + + // Set that holds the functions that were modified by the last pass. + std::set Mod; + + // Vector of all the candidate functions to be tested for being identical + // to each other. Initialized with all simple functions. + std::vector Cands; + for (auto &I : BFs) { + BinaryFunction *BF = &I.second; + if (BF->isSimple()) + Cands.emplace_back(BF); + } + + // We repeat the icf pass until no new modifications happen. + unsigned Iter = 1; + do { + Buckets.clear(); + Mod.clear(); + + errs() << "BOLT-INFO: icf pass " << Iter << "...\n"; + + uint64_t NumIdenticalFunctions = 0; + + // Compare candidate functions using the Buckets hash table. Identical + // functions are effiently discovered and added to the same bucket. + for (BinaryFunction *BF : Cands) { + Buckets[BF].emplace_back(BF); + } + + Cands.clear(); + + // Go through the functions of each bucket and fold any references to them + // with the references to the hottest function among them. 
+ for (auto &I : Buckets) { + std::vector &IFs = I.second; + std::sort(IFs.begin(), IFs.end(), + [](const BinaryFunction *A, const BinaryFunction *B) { + if (!A->hasValidProfile() && !B->hasValidProfile()) + return false; + + if (!A->hasValidProfile()) + return false; + + if (!B->hasValidProfile()) + return true; + + return B->getExecutionCount() < A->getExecutionCount(); + } + ); + BinaryFunction *Hottest = IFs[0]; + + // For the next pass, we consider only one function from each set of + // identical functions. + Cands.emplace_back(Hottest); + + if (IFs.size() <= 1) + continue; + + NumIdenticalFunctions += IFs.size() - 1; + for (unsigned i = 1; i < IFs.size(); ++i) { + BinaryFunction *BF = IFs[i]; + foldFunction(BC, BFs, BF, Hottest, Mod); + } + } + + errs() << "BOLT-INFO: found " << NumIdenticalFunctions; + errs() << " identical functions.\n"; + errs() << "BOLT-INFO: modified " << Mod.size() << " functions.\n"; + + NumIdenticalFunctionsFound += NumIdenticalFunctions; + + ++Iter; + } while (!Mod.empty()); + + outs() << "BOLT: ICF pass found " << NumIdenticalFunctionsFound; + outs() << " functions identical to some other function.\n"; + outs() << "BOLT: ICF pass folded references to " << NumFunctionsFolded; + outs() << " functions.\n"; + outs() << "BOLT: ICF pass folded " << NumDynamicCallsFolded << " dynamic"; + outs() << " function calls.\n"; + outs() << "BOLT: Removing all identical functions could save "; + outs() << format("%.2lf", (double) BytesSavedEstimate / 1024); + outs() << " KB of code space.\n"; + + if (opts::PrintAll || opts::PrintICF) { + for (auto &I : BFs) { + I.second.print(errs(), "after identical code folding", true); + } + } +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryPasses.h b/bolt/BinaryPasses.h index 915ff285cdec..b227041649b5 100644 --- a/bolt/BinaryPasses.h +++ b/bolt/BinaryPasses.h @@ -182,6 +182,47 @@ class SimplifyRODataLoads : public BinaryFunctionPass { std::set &LargeFunctions) override; }; +/// An 
optimization that replaces references to identical functions with +/// references to a single one of them. +/// +class IdenticalCodeFolding : public BinaryFunctionPass { + uint64_t NumIdenticalFunctionsFound{0}; + uint64_t NumFunctionsFolded{0}; + uint64_t NumDynamicCallsFolded{0}; + uint64_t BytesSavedEstimate{0}; + + /// Map from a binary function to its callers. + struct CallSite { + BinaryFunction *Caller; + unsigned BlockIndex; + unsigned InstrIndex; + + CallSite(BinaryFunction *Caller, unsigned BlockIndex, unsigned InstrIndex) : + Caller(Caller), BlockIndex(BlockIndex), InstrIndex(InstrIndex) { } + }; + using CallerMap = std::map>; + CallerMap Callers; + + /// Replaces all calls to BFToFold with calls to BFToReplaceWith and merges + /// the profile data of BFToFold with those of BFToReplaceWith. All modified + /// functions are added to the Modified set. + void foldFunction(BinaryContext &BC, + std::map &BFs, + BinaryFunction *BFToFold, + BinaryFunction *BFToReplaceWith, + std::set &Modified); + + /// Finds callers for each binary function and populates the Callers + /// map. 
+ void discoverCallers(BinaryContext &BC, + std::map &BFs); + + public: + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + } // namespace bolt } // namespace llvm diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index a193c0131209..255514449e23 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -186,6 +186,11 @@ PrintReordered("print-reordered", cl::desc("print functions after layout optimization"), cl::Hidden); +cl::opt +PrintICF("print-icf", + cl::desc("print functions after ICF optimization"), + cl::Hidden); + static cl::opt KeepTmp("keep-tmp", cl::desc("preserve intermediate .o file"), @@ -805,6 +810,16 @@ void RewriteInstance::discoverFileObjects() { "wish to proceed, use -allow-stripped option.\n"; exit(1); } + + // Register the final names of functions with multiple names with BinaryContext + // data structures. + for (auto &BFI : BinaryFunctions) { + uint64_t Address = BFI.first; + const BinaryFunction &BF = BFI.second; + auto AI = BC->GlobalSymbols.find(BF.getName()); + if (AI == BC->GlobalSymbols.end()) + BC->registerNameAtAddress(BF.getName(), Address); + } } void RewriteInstance::readSpecialSections() { From 1fadad0d4a1c0077aed8a842c4577b8b3266fe6a Mon Sep 17 00:00:00 2001 From: Theodoros Kasampalis Date: Fri, 15 Jul 2016 16:11:30 -0700 Subject: [PATCH 145/904] Basic block clustering algorithm for minimizing branches. Summary: This algorithm is similar to our main clustering algorithm but uses a different heuristic for selecting edges to become fall-throughs. The weight of an edge is calculated as the win in branches if we choose to layout this edge as a fall-through. For example, the edges A -> B with execution count 100 and A -> C with execution count 500 (where B and C are the only successors of A) have weights -400 and +400 respectively. 
(cherry picked from commit a04e6b48247d2c9e1536e9a0860bb0b2a6c94a12) --- bolt/BinaryFunction.cpp | 9 +- bolt/BinaryFunction.h | 2 +- bolt/BinaryPasses.cpp | 10 +- bolt/ReorderAlgorithm.cpp | 289 +++++++++++++++++++++++++++++++------- bolt/ReorderAlgorithm.h | 95 ++++++++++++- 5 files changed, 348 insertions(+), 57 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 43faae7a83bb..82a5d18512a7 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1181,7 +1181,8 @@ bool BinaryFunction::fixCFIState() { return true; } -void BinaryFunction::modifyLayout(LayoutType Type, bool Split) { +void BinaryFunction::modifyLayout(LayoutType Type, bool MinBranchClusters, + bool Split) { if (BasicBlocksLayout.empty() || Type == LT_NONE) return; @@ -1203,7 +1204,11 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool Split) { else { DEBUG(dbgs() << "running block layout heuristics on " << getName() << "\n"); - std::unique_ptr CAlgo(new GreedyClusterAlgorithm()); + std::unique_ptr CAlgo; + if (MinBranchClusters) + CAlgo.reset(new MinBranchGreedyClusterAlgorithm()); + else + CAlgo.reset(new PHGreedyClusterAlgorithm()); switch(Type) { case LT_OPTIMIZE: diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 527e5e5fd2eb..a95d4600dc92 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -424,7 +424,7 @@ class BinaryFunction : public AddressRangesOwner { /// Modify code layout making necessary adjustments to instructions at the /// end of basic blocks. - void modifyLayout(LayoutType Type, bool Split); + void modifyLayout(LayoutType Type, bool MinBranchClusters, bool Split); /// Find the loops in the CFG of the function and store infromation about /// them. 
diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index c4394c65d67b..679c06657b22 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -52,6 +52,13 @@ ReorderBlocks( "behavior"), clEnumValEnd)); +static llvm::cl::opt +MinBranchClusters( + "min-branch-clusters", + llvm::cl::desc("use a modified clustering algorithm geared towards " + "minimizing branches"), + llvm::cl::Hidden); + } // namespace opts namespace llvm { @@ -384,7 +391,8 @@ void ReorderBasicBlocks::runOnFunctions( (opts::SplitFunctions == BinaryFunction::ST_EH && Function.hasEHRanges()) || (LargeFunctions.find(It.first) != LargeFunctions.end()); - Function.modifyLayout(opts::ReorderBlocks, ShouldSplit); + Function.modifyLayout(opts::ReorderBlocks, opts::MinBranchClusters, + ShouldSplit); if (opts::PrintAll || opts::PrintReordered) Function.print(errs(), "after reordering blocks", true); if (opts::DumpDotAll) diff --git a/bolt/ReorderAlgorithm.cpp b/bolt/ReorderAlgorithm.cpp index 23d1c31a5fe3..451e20715665 100644 --- a/bolt/ReorderAlgorithm.cpp +++ b/bolt/ReorderAlgorithm.cpp @@ -18,6 +18,9 @@ #include #include +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt" + using namespace llvm; using namespace bolt; @@ -82,6 +85,20 @@ void ClusterAlgorithm::reset() { AvgFreq.clear(); } +void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const { + OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count; +} + +size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const { + HashPair Hasher; + return Hasher(std::make_pair(E.Src, E.Dst)); +} + +bool GreedyClusterAlgorithm::EdgeEqual::operator()( + const EdgeTy &A, const EdgeTy &B) const { + return A.Src == B.Src && A.Dst == B.Dst; +} + void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF) { reset(); @@ -89,96 +106,270 @@ void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF) { // maximize weight during a path traversing all BBs. 
In this way, we will // convert the hottest branches into fall-throughs. - // Encode an edge between two basic blocks, source and destination - typedef std::pair EdgeTy; - typedef HashPair Hasher; - std::unordered_map Weight; - - // Define a comparison function to establish SWO between edges - auto Comp = [&] (EdgeTy A, EdgeTy B) { - // With equal weights, prioritize branches with lower index - // source/destination. This helps to keep original block order for blocks - // when optimal order cannot be deducted from a profile. - if (Weight[A] == Weight[B]) { - uint32_t ASrcBBIndex = BF.getIndex(A.first); - uint32_t BSrcBBIndex = BF.getIndex(B.first); - if (ASrcBBIndex != BSrcBBIndex) - return ASrcBBIndex > BSrcBBIndex; - return BF.getIndex(A.second) > BF.getIndex(B.second); - } - return Weight[A] < Weight[B]; - }; - std::priority_queue, decltype(Comp)> Queue(Comp); - - typedef std::unordered_map BBToClusterMapTy; - BBToClusterMapTy BBToClusterMap; + // This is the queue of edges from which we will pop edges and use them to + // cluster basic blocks in a greedy fashion. + std::vector Queue; + // Initialize inter-cluster weights. ClusterEdges.resize(BF.layout_size()); + // Initialize clusters and edge queue. for (auto BB : BF.layout()) { - // Create a cluster for this BB + // Create a cluster for this BB. uint32_t I = Clusters.size(); Clusters.emplace_back(); auto &Cluster = Clusters.back(); Cluster.push_back(BB); BBToClusterMap[BB] = I; - // Populate priority queue with edges + // Populate priority queue with edges. auto BI = BB->branch_info_begin(); for (auto &I : BB->successors()) { - if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) - Weight[std::make_pair(BB, I)] = BI->Count; - Queue.push(std::make_pair(BB, I)); + assert(BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE && + "attempted reordering blocks of function with no profile data"); + Queue.emplace_back(EdgeTy(BB, I, BI->Count)); ++BI; } } + // Sort and adjust the edge queue. 
+ initQueue(Queue, BF); - // Grow clusters in a greedy fashion + // Grow clusters in a greedy fashion. while (!Queue.empty()) { - auto elmt = Queue.top(); - Queue.pop(); + auto E = Queue.back(); + Queue.pop_back(); + + BinaryBasicBlock *SrcBB = E.Src; + BinaryBasicBlock *DstBB = E.Dst; - BinaryBasicBlock *BBSrc = elmt.first; - BinaryBasicBlock *BBDst = elmt.second; + DEBUG(dbgs() << "Popped edge "; + E.print(dbgs()); + dbgs() << "\n"); // Case 1: BBSrc and BBDst are the same. Ignore this edge - if (BBSrc == BBDst || BBDst == *BF.layout_begin()) + if (SrcBB == DstBB || DstBB == *BF.layout_begin()) { + DEBUG(dbgs() << "\tIgnored (same src, dst)\n"); continue; + } - int I = BBToClusterMap[BBSrc]; - int J = BBToClusterMap[BBDst]; + int I = BBToClusterMap[SrcBB]; + int J = BBToClusterMap[DstBB]; // Case 2: If they are already allocated at the same cluster, just increase // the weight of this cluster if (I == J) { - ClusterEdges[I][I] += Weight[elmt]; + ClusterEdges[I][I] += E.Count; + DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n"); continue; } auto &ClusterA = Clusters[I]; auto &ClusterB = Clusters[J]; - if (ClusterA.back() == BBSrc && ClusterB.front() == BBDst) { - // Case 3: BBSrc is at the end of a cluster and BBDst is at the start, - // allowing us to merge two clusters + if (areClustersCompatible(ClusterA, ClusterB, E)) { + // Case 3: SrcBB is at the end of a cluster and DstBB is at the start, + // allowing us to merge two clusters. for (auto BB : ClusterB) BBToClusterMap[BB] = I; ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end()); ClusterB.clear(); + // Increase the intra-cluster edge count of cluster A with the count of + // this edge as well as with the total count of previously visited edges + // from cluster B cluster A. + ClusterEdges[I][I] += E.Count; + ClusterEdges[I][I] += ClusterEdges[J][I]; // Iterate through all inter-cluster edges and transfer edges targeting // cluster B to cluster A. 
- // It is bad to have to iterate though all edges when we could have a list - // of predecessors for cluster B. However, it's not clear if it is worth - // the added code complexity to create a data structure for clusters that - // maintains a list of predecessors. Maybe change this if it becomes a - // deal breaker. for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K) ClusterEdges[K][I] += ClusterEdges[K][J]; + DEBUG(dbgs() << "\tMerged clusters of src, dst\n"); + // Adjust the weights of the remaining edges and re-sort the queue. + adjustQueue(Queue, BF); } else { - // Case 4: Both BBSrc and BBDst are allocated in positions we cannot - // merge them. Annotate the weight of this edge in the weight between - // clusters to help us decide ordering between these clusters. - ClusterEdges[I][J] += Weight[elmt]; + // Case 4: Both SrcBB and DstBB are allocated in positions we cannot + // merge them. Add the count of this edge to the inter-cluster edge count + // between clusters A and B to help us decide ordering between these + // clusters. + ClusterEdges[I][J] += E.Count; + DEBUG(dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n"); + } + } +} + +void GreedyClusterAlgorithm::reset() { + ClusterAlgorithm::reset(); + BBToClusterMap.clear(); +} + +void PHGreedyClusterAlgorithm::initQueue( + std::vector &Queue, const BinaryFunction &BF) { + // Define a comparison function to establish SWO between edges. + auto Comp = [&BF] (const EdgeTy &A, const EdgeTy &B) { + // With equal weights, prioritize branches with lower index + // source/destination. This helps to keep original block order for blocks + // when optimal order cannot be deducted from a profile. 
+ if (A.Count == B.Count) { + uint32_t ASrcBBIndex = BF.getIndex(A.Src); + uint32_t BSrcBBIndex = BF.getIndex(B.Src); + if (ASrcBBIndex != BSrcBBIndex) + return ASrcBBIndex > BSrcBBIndex; + return BF.getIndex(A.Dst) > BF.getIndex(B.Dst); + } + return A.Count < B.Count; + }; + + // Sort edges in increasing profile count order. + std::sort(Queue.begin(), Queue.end(), Comp); +} + +void PHGreedyClusterAlgorithm::adjustQueue( + std::vector &Queue, const BinaryFunction &BF) { + // Nothing to do. + return; +} + +bool PHGreedyClusterAlgorithm::areClustersCompatible( + const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const { + return Front.back() == E.Src && Back.front() == E.Dst; +} + +int64_t MinBranchGreedyClusterAlgorithm::calculateWeight( + const EdgeTy &E, const BinaryFunction &BF) const { + const BinaryBasicBlock *SrcBB = E.Src; + const BinaryBasicBlock *DstBB = E.Dst; + + // Initial weight value. + int64_t W = (int64_t)E.Count; + + // Adjust the weight by taking into account other edges with the same source. + auto BI = SrcBB->branch_info_begin(); + for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) { + assert(BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE && + "attempted reordering blocks of function with no profile data"); + assert(BI->Count <= std::numeric_limits::max() && + "overflow detected"); + // Ignore edges with same source and destination, edges that target the + // entry block as well as the edge E itself. + if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB) + W -= (int64_t)BI->Count; + ++BI; + } + + // Adjust the weight by taking into account other edges with the same + // destination. + for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) { + // Ignore edges with same source and destination as well as the edge E + // itself. 
+ if (PredBB == DstBB || PredBB == SrcBB) + continue; + auto BI = PredBB->branch_info_begin(); + for (const BinaryBasicBlock *SuccBB : PredBB->successors()) { + if (SuccBB == DstBB) + break; + ++BI; + } + assert(BI != PredBB->branch_info_end() && "invalid control flow graph"); + assert(BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE && + "attempted reordering blocks of function with no profile data"); + assert(BI->Count <= std::numeric_limits::max() && + "overflow detected"); + W -= (int64_t)BI->Count; + } + + return W; +} + +void MinBranchGreedyClusterAlgorithm::initQueue( + std::vector &Queue, const BinaryFunction &BF) { + // Initialize edge weights. + for (const EdgeTy &E : Queue) + Weight.emplace(std::make_pair(E, calculateWeight(E, BF))); + + // Sort edges in increasing weight order. + adjustQueue(Queue, BF); +} + +void MinBranchGreedyClusterAlgorithm::adjustQueue( + std::vector &Queue, const BinaryFunction &BF) { + // Define a comparison function to establish SWO between edges. + auto Comp = [&] (const EdgeTy &A, const EdgeTy &B) { + // With equal weights, prioritize branches with lower index + // source/destination. This helps to keep original block order for blocks + // when optimal order cannot be deduced from a profile. + if (Weight[A] == Weight[B]) { + uint32_t ASrcBBIndex = BF.getIndex(A.Src); + uint32_t BSrcBBIndex = BF.getIndex(B.Src); + if (ASrcBBIndex != BSrcBBIndex) + return ASrcBBIndex > BSrcBBIndex; + return BF.getIndex(A.Dst) > BF.getIndex(B.Dst); + } + return Weight[A] < Weight[B]; + }; + + // Iterate through all remaining edges to find edges that have their + // source and destination in the same cluster. + std::vector NewQueue; + for (const EdgeTy &E : Queue) { + BinaryBasicBlock *SrcBB = E.Src; + BinaryBasicBlock *DstBB = E.Dst; + + // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore + // this edge. 
+ if (SrcBB == DstBB || DstBB == *BF.layout_begin()) { + DEBUG(dbgs() << "\tAdjustment: Ignored edge "; + E.print(dbgs()); + dbgs() << " (same src, dst)\n"); + continue; + } + + int I = BBToClusterMap[SrcBB]; + int J = BBToClusterMap[DstBB]; + auto &ClusterA = Clusters[I]; + auto &ClusterB = Clusters[J]; + + // Case 2: They are already allocated at the same cluster or incompatible + // clusters. Adjust the weights of edges with the same source or + // destination, so that this edge has no effect on them any more, and ignore + // this edge. Also increase the intra- (or inter-) cluster edge count. + if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) { + ClusterEdges[I][J] += E.Count; + DEBUG(dbgs() << "\tAdjustment: Ignored edge "; + E.print(dbgs()); + dbgs() << " (src, dst belong to same cluster or incompatible " + "clusters)\n"); + for (BinaryBasicBlock *SuccBB : SrcBB->successors()) { + if (SuccBB == DstBB) + continue; + auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0)); + assert(WI != Weight.end() && "CFG edge not found in Weight map"); + WI->second += (int64_t)E.Count; + } + for (BinaryBasicBlock *PredBB : DstBB->predecessors()) { + if (PredBB == SrcBB) + continue; + auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0)); + assert(WI != Weight.end() && "CFG edge not found in Weight map"); + WI->second += (int64_t)E.Count; + } + continue; } + + // Case 3: None of the previous cases is true, so just keep this edge in + // the queue. + NewQueue.emplace_back(E); } + + // Sort remaining edges in increasing weight order. 
+ Queue.swap(NewQueue); + std::sort(Queue.begin(), Queue.end(), Comp); +} + +bool MinBranchGreedyClusterAlgorithm::areClustersCompatible( + const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const { + return Front.back() == E.Src && Back.front() == E.Dst; +} + +void MinBranchGreedyClusterAlgorithm::reset() { + GreedyClusterAlgorithm::reset(); + Weight.clear(); } void OptimalReorderAlgorithm::reorderBasicBlocks( diff --git a/bolt/ReorderAlgorithm.h b/bolt/ReorderAlgorithm.h index 4a4947c662ef..1269a0d5d2e8 100644 --- a/bolt/ReorderAlgorithm.h +++ b/bolt/ReorderAlgorithm.h @@ -21,9 +21,12 @@ namespace llvm { -namespace bolt { + +class raw_ostream; +namespace bolt { + class BinaryBasicBlock; class BinaryFunction; @@ -51,7 +54,7 @@ class ClusterAlgorithm { void computeClusterAverageFrequency(); /// Clear clusters and related info. - void reset(); + virtual void reset(); void printClusters() const; @@ -59,13 +62,97 @@ class ClusterAlgorithm { }; -/// This clustering algorithm is based on a greedy heuristic suggested by -/// Pettis (PLDI '90). +/// Base class for a greedy clustering algorithm that selects edges in order +/// based on some heuristic and uses them to join basic blocks into clusters. class GreedyClusterAlgorithm : public ClusterAlgorithm { +protected: + // Represents an edge between two basic blocks, with source, destination, and + // profile count. + struct EdgeTy { + BinaryBasicBlock *Src; + BinaryBasicBlock *Dst; + uint64_t Count; + + EdgeTy(BinaryBasicBlock *Src, BinaryBasicBlock *Dst, uint64_t Count) : + Src(Src), Dst(Dst), Count(Count) { } + + void print(raw_ostream &OS) const; + }; + + struct EdgeHash { + size_t operator() (const EdgeTy &E) const; + }; + + struct EdgeEqual { + bool operator() (const EdgeTy &A, const EdgeTy &B) const; + }; + + // Virtual methods that allow custom specialization of the heuristic used by + // the algorithm to select edges. 
+ virtual void initQueue( + std::vector &Queue, const BinaryFunction &BF) =0; + virtual void adjustQueue( + std::vector &Queue, const BinaryFunction &BF) =0; + virtual bool areClustersCompatible( + const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const =0; + + // Map from basic block to owning cluster index. + using BBToClusterMapTy = std::unordered_map; + BBToClusterMapTy BBToClusterMap; + public: void clusterBasicBlocks(const BinaryFunction &BF) override; + void reset() override; +}; + + +/// This clustering algorithm is based on a greedy heuristic suggested by +/// Pettis and Hansen (PLDI '90). +class PHGreedyClusterAlgorithm : public GreedyClusterAlgorithm { +protected: + void initQueue( + std::vector &Queue, const BinaryFunction &BF) override; + void adjustQueue( + std::vector &Queue, const BinaryFunction &BF) override; + bool areClustersCompatible( + const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const + override; +}; + + +/// This clustering algorithm is based on a greedy heuristic that is a +/// modification of the heuristic suggested by Pettis (PLDI '90). It is +/// geared towards minimizing branches. +class MinBranchGreedyClusterAlgorithm : public GreedyClusterAlgorithm { +private: + // Map from an edge to its weight which is used by the algorithm to sort the + // edges. + std::unordered_map Weight; + + // The weight of an edge is calculated as the win in branches if we choose + // to layout this edge as a fall-through. For example, consider the edges + // A -> B with execution count 500, + // A -> C with execution count 100, and + // D -> B with execution count 150 + // where B, C are the only successors of A and A, D are the only predecessors + // of B. Then if we choose to layout edge A -> B as a fallthrough, the win in + // branches would be 500 - 100 - 150 = 250. That is the weight of edge A->B. 
+ int64_t calculateWeight(const EdgeTy &E, const BinaryFunction &BF) const; + +protected: + void initQueue( + std::vector &Queue, const BinaryFunction &BF) override; + void adjustQueue( + std::vector &Queue, const BinaryFunction &BF) override; + bool areClustersCompatible( + const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const + override; + +public: + void reset() override; }; + /// Objects of this class implement various basic block reordering alogrithms. /// Most of these algorithms depend on a clustering alogrithm. /// Here we have 3 conflicting goals as to how to layout clusters. If we want From 059da8effd5f959a096f21ae10abfde4a1a6c168 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 23 Jul 2016 12:50:34 -0700 Subject: [PATCH 146/904] CFG editing functions Summary: This diff adds a number of methods to BinaryFunction that can be used to edit the CFG after it is created. The basic public functions are: - createBasicBlock - create a new block that is not inserted into the CFG. - insertBasicBlocks - insert a range of blocks (made with createBasicBlock) into the CFG. - updateLayout - update the CFG layout (either by inserting new blocks at a certain point or recomputing the entire layout). - fixFallthroughBranch - add a direct jump to the fallthrough successor for a given block. There are a number of private helper functions used to implement the above. This was split off the ICP diff to simplify it a bit. 
(cherry picked from commit 4cc475c4c2fb39d88ac0d227e55fe884586a2af2) --- bolt/BinaryBasicBlock.h | 77 ++++++++++++++++-- bolt/BinaryFunction.cpp | 171 +++++++++++++++++++++++++++++++++++++--- bolt/BinaryFunction.h | 76 +++++++++++++++--- 3 files changed, 296 insertions(+), 28 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 344d8e28f49a..3bf3b65b1b7e 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -39,7 +39,13 @@ class BinaryContext; /// The intention is to keep the structure similar to MachineBasicBlock as /// we might switch to it at some point. class BinaryBasicBlock { + public: + struct BinaryBranchInfo { + uint64_t Count; + uint64_t MispredictedCount; /// number of branches mispredicted + }; + private: /// Label associated with the block. MCSymbol *Label{nullptr}; @@ -83,11 +89,6 @@ class BinaryBasicBlock { std::set Throwers; std::set LandingPads; - struct BinaryBranchInfo { - uint64_t Count; - uint64_t MispredictedCount; /// number of branches mispredicted - }; - /// Each successor has a corresponding BranchInfo entry in the list. std::vector BranchInfo; @@ -301,10 +302,23 @@ class BinaryBasicBlock { } /// Add instruction at the end of this basic block. - void addInstruction(MCInst &Inst) { + void addInstruction(MCInst &&Inst) { Instructions.emplace_back(Inst); } + /// Add instruction at the end of this basic block. + void addInstruction(const MCInst &Inst) { + Instructions.push_back(Inst); + } + + /// Add a range of instructions to the end of this basic block. + template + void addInstructions(Itr Begin, Itr End) { + while (Begin != End) { + addInstruction(*Begin++); + } + } + /// Add instruction before Pos in this basic block. const_iterator insertPseudoInstr(const_iterator Pos, MCInst &Instr) { ++NumPseudos; @@ -335,6 +349,24 @@ class BinaryBasicBlock { uint64_t Count = 0, uint64_t MispredictedCount = 0); + /// Add a range of successors. 
+ template + void addSuccessors(Itr Begin, Itr End) { + while (Begin != End) { + addSuccessor(*Begin++); + } + } + + /// Add a range of successors with branch info. + template + void addSuccessors(Itr Begin, Itr End, BrItr BrBegin, BrItr BrEnd) { + assert(std::distance(Begin, End) == std::distance(BrBegin, BrEnd)); + while (Begin != End) { + const auto BrInfo = *BrBegin++; + addSuccessor(*Begin++, BrInfo.Count, BrInfo.MispredictedCount); + } + } + /// Adds block to landing pad list. void addLandingPad(BinaryBasicBlock *LPBlock); @@ -342,6 +374,14 @@ class BinaryBasicBlock { /// list of predecessors of /p Succ and update branch info. void removeSuccessor(BinaryBasicBlock *Succ); + /// Remove a range of successor blocks. + template + void removeSuccessors(Itr Begin, Itr End) { + while (Begin != End) { + removeSuccessor(*Begin++); + } + } + /// Return the information about the number of times this basic block was /// executed. /// @@ -350,6 +390,11 @@ class BinaryBasicBlock { return ExecutionCount; } + /// Set the execution count for this block. + void setExecutionCount(uint64_t Count) { + ExecutionCount = Count; + } + bool isCold() const { return IsCold; } @@ -385,6 +430,21 @@ class BinaryBasicBlock { return false; } + /// Split apart the instructions in this basic block starting at Inst. + /// The instructions following Inst are removed and returned in a vector. + std::vector splitInstructions(const MCInst *Inst) { + std::vector SplitInst; + + assert(!Instructions.empty()); + while(&Instructions.back() != Inst) { + SplitInst.push_back(Instructions.back()); + Instructions.pop_back(); + } + std::reverse(SplitInst.begin(), SplitInst.end()); + + return SplitInst; + } + /// Sets the symbol pointing to the end of the BB in the output binary. void setEndLabel(MCSymbol *Symbol) { EndLabel = Symbol; @@ -436,6 +496,11 @@ class BinaryBasicBlock { /// Remove predecessor of the basic block. Don't use directly, instead /// use removeSuccessor() funciton. 
void removePredecessor(BinaryBasicBlock *Pred); + + /// Set offset of the basic block from the function start. + void setOffset(uint64_t NewOffset) { + Offset = NewOffset; + } }; bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS); diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 82a5d18512a7..1ad70c4fe147 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -99,6 +99,8 @@ BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) { if (BasicBlocks.empty()) return nullptr; + // This is commented out because it makes BOLT too slow. + // assert(std::is_sorted(begin(), end())); auto I = std::upper_bound(begin(), end(), BinaryBasicBlock(Offset)); @@ -531,8 +533,61 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { return true; } -bool BinaryFunction::buildCFG() { +void BinaryFunction::clearLandingPads(const unsigned StartIndex, + const unsigned NumBlocks) { + // remove all landing pads/throws for the given collection of blocks + for (auto I = StartIndex; I < StartIndex + NumBlocks; ++I) { + auto *BB = BasicBlocks[I]; + for (auto *LPBlock : BB->LandingPads) { + auto count = LPBlock->Throwers.erase(BB); + assert(count == 1); + } + BB->LandingPads.clear(); + } +} + +void BinaryFunction::addLandingPads(const unsigned StartIndex, + const unsigned NumBlocks) { + for (auto I = StartIndex; I < StartIndex + NumBlocks; ++I) { + auto *BB = BasicBlocks[I]; + if (LandingPads.find(BB->getLabel()) != LandingPads.end()) { + MCSymbol *LP = BB->getLabel(); + for (unsigned I : LPToBBIndex.at(LP)) { + assert(I < BasicBlocks.size()); + BinaryBasicBlock *ThrowBB = BasicBlocks[I]; + ThrowBB->addLandingPad(BB); + } + } + } +} + +void BinaryFunction::recomputeLandingPads(const unsigned StartIndex, + const unsigned NumBlocks) { + assert(LPToBBIndex.empty()); + + clearLandingPads(StartIndex, NumBlocks); + + for (auto I = StartIndex; I < StartIndex + NumBlocks; ++I) { + auto *BB = BasicBlocks[I]; + for (auto &Instr : 
BB->Instructions) { + // Store info about associated landing pad. + if (BC.MIA->isInvoke(Instr)) { + const MCSymbol *LP; + uint64_t Action; + std::tie(LP, Action) = BC.MIA->getEHInfo(Instr); + if (LP) { + LPToBBIndex[LP].push_back(BB->Index); + } + } + } + } + + addLandingPads(StartIndex, NumBlocks); + + clearList(LPToBBIndex); +} +bool BinaryFunction::buildCFG() { auto &MIA = BC.MIA; auto BranchDataOrErr = BC.DR.getFuncBranchData(getNames()); @@ -764,15 +819,7 @@ bool BinaryFunction::buildCFG() { } // Add associated landing pad blocks to each basic block. - for (auto BB : BasicBlocks) { - if (LandingPads.find(BB->getLabel()) != LandingPads.end()) { - MCSymbol *LP = BB->getLabel(); - for (unsigned I : LPToBBIndex.at(LP)) { - BinaryBasicBlock *ThrowBB = getBasicBlockAtIndex(I); - ThrowBB->addLandingPad(BB); - } - } - } + addLandingPads(0, BasicBlocks.size()); // Infer frequency for non-taken branches if (hasValidProfile()) @@ -1066,6 +1113,7 @@ bool BinaryFunction::fixCFIState() { std::vector NewCFIs; uint32_t NestedLevel = 0; for (uint32_t CurState = FromState; CurState < ToState; ++CurState) { + assert(CurState < FrameInstructions.size()); MCCFIInstruction *Instr = &FrameInstructions[CurState]; if (Instr->getOperation() == MCCFIInstruction::OpRememberState) ++NestedLevel; @@ -1311,6 +1359,8 @@ void BinaryFunction::dumpGraphToFile(std::string Filename) const { const BinaryBasicBlock * BinaryFunction::getOriginalLayoutSuccessor(const BinaryBasicBlock *BB) const { + // This is commented out because it makes BOLT run too slowly. + //assert(std::is_sorted(begin(), end())); auto I = std::upper_bound(begin(), end(), *BB); assert(I != begin() && "first basic block not at offset 0"); @@ -1343,9 +1393,7 @@ void BinaryFunction::fixBranches() { HotColdBorder = true; } const BinaryBasicBlock *OldFTBB = getOriginalLayoutSuccessor(BB); - const MCSymbol *OldFT = nullptr; - if (OldFTBB != nullptr) - OldFT = OldFTBB->getLabel(); + const MCSymbol *OldFT = OldFTBB ? 
OldFTBB->getLabel() : nullptr; // Case 1: There are no branches in this basic block and it just falls // through @@ -1431,6 +1479,49 @@ void BinaryFunction::fixBranches() { } } +void BinaryFunction::fixFallthroughBranch(BinaryBasicBlock *Block) { + // No successors, must be a return or similar. + if (Block->succ_size() == 0) return; + + const MCSymbol *TBB = nullptr; + const MCSymbol *FBB = nullptr; + MCInst *CondBranch = nullptr; + MCInst *UncondBranch = nullptr; + + if (!BC.MIA->analyzeBranch(Block->Instructions, TBB, FBB, CondBranch, + UncondBranch)) { + assert(0); + return; + } + + if (!UncondBranch) { + const BinaryBasicBlock* FallThroughBB = nullptr; + if (CondBranch) { + assert(TBB); + // Find the first successor that is not a target of the conditional + // branch. + for (auto *Succ : Block->successors()) { + if (Succ->getLabel() != TBB) { + FallThroughBB = Succ; + break; + } + } + } else { + // pick first successor as fallthrough. + FallThroughBB = *Block->succ_begin(); + } + + assert(FallThroughBB); + + const auto FallThroughLabel = FallThroughBB->getLabel(); + MCInst NewInst; + if (!BC.MIA->createUncondBranch(NewInst, FallThroughLabel, BC.Ctx.get())) { + llvm_unreachable("Target does not support creating new branches"); + } + Block->addInstruction(NewInst); + } +} + void BinaryFunction::splitFunction() { bool AllCold = true; for (BinaryBasicBlock *BB : BasicBlocksLayout) { @@ -1916,6 +2007,60 @@ std::size_t BinaryFunction::hash() const { return std::hash{}(Opcodes); } +void BinaryFunction::insertBasicBlocks( + BinaryBasicBlock *Start, + std::vector> &&NewBBs) { + const auto StartIndex = getIndex(Start); + const auto NumNewBlocks = NewBBs.size(); + + BasicBlocks.insert(BasicBlocks.begin() + StartIndex + 1, + NumNewBlocks, + nullptr); + + auto I = StartIndex + 1; + for (auto &BB : NewBBs) { + assert(!BasicBlocks[I]); + BasicBlocks[I++] = BB.release(); + } + + // Recompute indices and offsets for all basic blocks after Start. 
+ uint64_t Offset = Start->getOffset(); + for (auto I = StartIndex; I < BasicBlocks.size(); ++I) { + auto *BB = BasicBlocks[I]; + BB->setOffset(Offset); + Offset += BC.computeCodeSize(BB->begin(), BB->end()); + BB->Index = I; + } + + // Recompute CFI state for all BBs. + BBCFIState.clear(); + annotateCFIState(); + + recomputeLandingPads(StartIndex, NumNewBlocks + 1); + + // Make sure the basic blocks are sorted properly. + assert(std::is_sorted(begin(), end())); +} + +// TODO: Which of these methods is better? +void BinaryFunction::updateLayout(BinaryBasicBlock* Start, + const unsigned NumNewBlocks) { + // Insert new blocks in the layout immediately after Start. + auto Pos = std::find(layout_begin(), layout_end(), Start); + assert(Pos != layout_end()); + auto Begin = &BasicBlocks[Start->Index + 1]; + auto End = &BasicBlocks[Start->Index + NumNewBlocks + 1]; + BasicBlocksLayout.insert(Pos + 1, Begin, End); +} + +void BinaryFunction::updateLayout(LayoutType Type, + bool MinBranchClusters, + bool Split) { + // Recompute layout with original parameters. + BasicBlocksLayout = BasicBlocks; + modifyLayout(Type, MinBranchClusters, Split); +} + BinaryFunction::~BinaryFunction() { for (auto BB : BasicBlocks) { delete BB; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index a95d4600dc92..d130d364e78f 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -232,6 +232,20 @@ class BinaryFunction : public AddressRangesOwner { const BinaryBasicBlock &BBOther, const BinaryFunction &BF, bool AreInvokes) const; + /// Clear the landing pads for all blocks contained in the range of + /// [StartIndex, StartIndex + NumBlocks). This also has the effect of + /// removing throws that point to any of these blocks. + void clearLandingPads(const unsigned StartIndex, const unsigned NumBlocks); + + /// Add landing pads for all blocks in the range + /// [StartIndex, StartIndex + NumBlocks) using LPToBBIndex. 
+ void addLandingPads(const unsigned StartIndex, const unsigned NumBlocks); + + /// Recompute the landing pad information for all the basic blocks in the + /// range of [StartIndex to StartIndex + NumBlocks). + void recomputeLandingPads(const unsigned StartIndex, + const unsigned NumBlocks); + /// Return basic block that originally was laid out immediately following /// the given /p BB basic block. const BinaryBasicBlock * @@ -594,28 +608,49 @@ class BinaryFunction : public AddressRangesOwner { } /// Create a basic block at a given \p Offset in the - /// function and append it to the end of list of blocks. + /// function. /// If \p DeriveAlignment is true, set the alignment of the block based /// on the alignment of the existing offset. + /// The new block is not inserted into the CFG. The client must + /// use insertBasicBlocks to add any new blocks to the CFG. /// - /// Returns NULL if basic block already exists at the \p Offset. - BinaryBasicBlock *addBasicBlock(uint64_t Offset, MCSymbol *Label, - bool DeriveAlignment = false) { - assert(!getBasicBlockAtOffset(Offset) && "basic block already exists"); + std::unique_ptr + createBasicBlock(uint64_t Offset, + MCSymbol *Label = nullptr, + bool DeriveAlignment = false) { assert(BC.Ctx && "cannot be called with empty context"); - if (!Label) + if (!Label) { Label = BC.Ctx->createTempSymbol("BB", true); - BasicBlocks.emplace_back(new BinaryBasicBlock(Label, this, Offset)); - - auto BB = BasicBlocks.back(); + } + auto BB = std::unique_ptr( + new BinaryBasicBlock(Label, this, Offset)); if (DeriveAlignment) { uint64_t DerivedAlignment = Offset & (1 + ~Offset); BB->setAlignment(std::min(DerivedAlignment, uint64_t(32))); } + return BB; + } + + /// Create a basic block at a given \p Offset in the + /// function and append it to the end of list of blocks. + /// If \p DeriveAlignment is true, set the alignment of the block based + /// on the alignment of the existing offset. 
+ /// + /// Returns NULL if basic block already exists at the \p Offset. + BinaryBasicBlock *addBasicBlock(uint64_t Offset, MCSymbol *Label, + bool DeriveAlignment = false) { + assert(CurrentState == State::CFG || + (!getBasicBlockAtOffset(Offset) && "basic block already exists")); + auto BBPtr = createBasicBlock(Offset, Label, DeriveAlignment); + BasicBlocks.emplace_back(BBPtr.release()); + + auto BB = BasicBlocks.back(); BB->Index = BasicBlocks.size() - 1; + assert(CurrentState == State::CFG || std::is_sorted(begin(), end())); + return BB; } @@ -636,6 +671,24 @@ class BinaryFunction : public AddressRangesOwner { /// from the function start. BinaryBasicBlock *getBasicBlockContainingOffset(uint64_t Offset); + /// Insert the BBs contained in NewBBs into the basic blocks for this + /// function. Update the associated state of all blocks as needed, i.e. + /// CFI state, BB offsets, BB indices. The new BBs are inserted after + /// Start. This operation could affect fallthrough branches for Start. + /// + void insertBasicBlocks( + BinaryBasicBlock *Start, + std::vector> &&NewBBs); + + /// Update the basic block layout for this function. The BBs from + /// [Start->Index, Start->Index + NumNewBlocks) are inserted into the + /// layout after the BB indicated by Start. + void updateLayout(BinaryBasicBlock* Start, const unsigned NumNewBlocks); + + /// Update the basic block layout for this function. The layout is + /// computed from scratch using modifyLayout. + void updateLayout(LayoutType Type, bool MinBranchClusters, bool Split); + /// Dump function information to debug output. If \p PrintInstructions /// is true - include instruction disassembly. void dump(std::string Annotation = "", bool PrintInstructions = true) const; @@ -902,6 +955,11 @@ class BinaryFunction : public AddressRangesOwner { /// adding jumps based on a new layout order. void fixBranches(); + /// If needed, add an unconditional jmp to the original fallthrough of + /// Block. 
This is used by the indirect call promotion optimization + /// since it inserts new BBs after the merge block. + void fixFallthroughBranch(BinaryBasicBlock *Block); + /// Split function in two: a part with warm or hot BBs and a part with never /// executed BBs. The cold part is moved to a new BinaryFunction. void splitFunction(); From 8448e8fc2f0c24c055d1a9772f7ccce94aee2cf3 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 28 Jul 2016 18:49:48 -0700 Subject: [PATCH 147/904] Add printing support for indirect tail calls. Summary: LLVM was missing assembler print string for indirect tail calls which are synthetic instructions created by us. (cherry picked from commit 174db0c7074b8f2c7466312ce341a9dafb8f6593) --- bolt/BinaryContext.cpp | 6 +----- bolt/BinaryFunction.cpp | 9 --------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index be42e8bd573b..935a1b34e9a3 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -289,11 +289,7 @@ void BinaryContext::printInstruction(raw_ostream &OS, OS << "\n"; return; } - if (!MIA->isUnsupported(Instruction)) { - InstPrinter->printInst(&Instruction, OS, "", *STI); - } else { - OS << "unsupported (probably jmpr)"; - } + InstPrinter->printInst(&Instruction, OS, "", *STI); if (MIA->isCall(Instruction)) { if (MIA->isTailCall(Instruction)) OS << " # TAILCALL "; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 1ad70c4fe147..33c460fc7e88 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -386,15 +386,6 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { break; } - if (MIA->isUnsupported(Instruction)) { - errs() << "BOLT-WARNING: unsupported instruction seen at offset 0x" - << Twine::utohexstr(Offset) << " (address 0x" - << Twine::utohexstr(AbsoluteInstrAddr) << ") in function " - << getName() << '\n'; - IsSimple = false; - break; - } - // Convert instruction to a shorter version that could be relaxed if 
needed. MIA->shortenInstruction(Instruction); From ff1bba3dd81190c5846fa78e3a124ddb79ab5dcc Mon Sep 17 00:00:00 2001 From: Theodoros Kasampalis Date: Wed, 13 Jul 2016 18:57:40 -0700 Subject: [PATCH 148/904] Fix for correct disassembling of conditional tail calls. Summary: BOLT attempts to convert jumps that serve as tail calls to dedicated tail call instructions, but this is impossible when the jump is conditional because there is no corresponding tail call instruction. This was causing the creation of a duplicate fall-through edge for basic blocks terminated with a conditional jump serving as a tail call when there is profile data available for the non-taken branch. In this case, the first fall-through edge had a count taken from the profile data, while the second has a count computed (incorrectly) by BinaryFunction::inferFallThroughCounts. (cherry picked from commit d72fcb3edd26916bb1add7aa10d4d7e3fc9f6470) --- bolt/BinaryBasicBlock.h | 8 +- bolt/BinaryFunction.cpp | 245 ++++++++++++++++++++++++++++++++++------ bolt/BinaryFunction.h | 41 ++++++- bolt/DataReader.cpp | 25 ++++ bolt/DataReader.h | 19 ++-- 5 files changed, 291 insertions(+), 47 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 3bf3b65b1b7e..1c85e6c25468 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -302,13 +302,17 @@ class BinaryBasicBlock { } /// Add instruction at the end of this basic block. - void addInstruction(MCInst &&Inst) { + /// Returns the index of the instruction in the Instructions vector of the BB. + uint32_t addInstruction(MCInst &&Inst) { Instructions.emplace_back(Inst); + return Instructions.size() - 1; } /// Add instruction at the end of this basic block. - void addInstruction(const MCInst &Inst) { + /// Returns the index of the instruction in the Instructions vector of the BB. 
+ uint32_t addInstruction(const MCInst &Inst) { Instructions.push_back(Inst); + return Instructions.size() - 1; } /// Add a range of instructions to the end of this basic block. diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 33c460fc7e88..655852fece6e 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -438,11 +438,22 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { << ". Code size will be increased.\n"; } + assert(!MIA->isTailCall(Instruction) && + "synthetic tail call instruction found"); + // This is a call regardless of the opcode. // Assign proper opcode for tail calls, so that they could be // treated as calls. if (!IsCall) { - MIA->convertJmpToTailCall(Instruction); + if (!MIA->convertJmpToTailCall(Instruction)) { + assert(IsCondBranch && "unknown tail call instruction"); + errs() << "BOLT-WARNING: conditional tail call detected in " + << "function " << getName() << " at 0x" + << Twine::utohexstr(AbsoluteInstrAddr) << ".\n"; + } + // TODO: A better way to do this would be using annotations for + // MCInst objects. + TailCallOffsets.emplace(std::make_pair(Offset, InstructionTarget)); IsCall = true; } @@ -557,7 +568,7 @@ void BinaryFunction::recomputeLandingPads(const unsigned StartIndex, assert(LPToBBIndex.empty()); clearLandingPads(StartIndex, NumBlocks); - + for (auto I = StartIndex; I < StartIndex + NumBlocks; ++I) { auto *BB = BasicBlocks[I]; for (auto &Instr : BB->Instructions) { @@ -614,8 +625,9 @@ bool BinaryFunction::buildCFG() { // sorted by offsets. 
BinaryBasicBlock *InsertBB{nullptr}; BinaryBasicBlock *PrevBB{nullptr}; - bool IsLastInstrNop = false; - MCInst *PrevInstr{nullptr}; + bool IsLastInstrNop{false}; + bool IsPreviousInstrTailCall{false}; + const MCInst *PrevInstr{nullptr}; auto addCFIPlaceholders = [this](uint64_t CFIOffset, BinaryBasicBlock *InsertBB) { @@ -627,8 +639,10 @@ bool BinaryFunction::buildCFG() { }; for (auto I = Instructions.begin(), E = Instructions.end(); I != E; ++I) { - auto &InstrInfo = *I; - auto LI = Labels.find(InstrInfo.first); + const uint32_t Offset = I->first; + const auto &Instr = I->second; + + auto LI = Labels.find(Offset); if (LI != Labels.end()) { // Always create new BB at branch destination. PrevBB = InsertBB; @@ -638,7 +652,7 @@ bool BinaryFunction::buildCFG() { // Ignore nops. We use nops to derive alignment of the next basic block. // It will not always work, as some blocks are naturally aligned, but // it's just part of heuristic for block alignment. - if (MIA->isNoop(InstrInfo.second)) { + if (MIA->isNoop(Instr)) { IsLastInstrNop = true; continue; } @@ -647,24 +661,37 @@ bool BinaryFunction::buildCFG() { // we see an unconditional branch following a conditional one. assert(PrevBB && "no previous basic block for a fall through"); assert(PrevInstr && "no previous instruction for a fall through"); - if (MIA->isUnconditionalBranch(InstrInfo.second) && - !MIA->isUnconditionalBranch(*PrevInstr)) { + if (MIA->isUnconditionalBranch(Instr) && + !MIA->isUnconditionalBranch(*PrevInstr) && !IsPreviousInstrTailCall) { // Temporarily restore inserter basic block. 
InsertBB = PrevBB; } else { - InsertBB = addBasicBlock(InstrInfo.first, + InsertBB = addBasicBlock(Offset, BC.Ctx->createTempSymbol("FT", true), /* DeriveAlignment = */ IsLastInstrNop); } } - if (InstrInfo.first == 0) { + if (Offset == 0) { // Add associated CFI pseudos in the first offset (0) addCFIPlaceholders(0, InsertBB); } IsLastInstrNop = false; - InsertBB->addInstruction(InstrInfo.second); - PrevInstr = &InstrInfo.second; + uint32_t InsertIndex = InsertBB->addInstruction(Instr); + PrevInstr = &Instr; + + // Record whether this basic block is terminated with a tail call. + auto TCI = TailCallOffsets.find(Offset); + if (TCI != TailCallOffsets.end()) { + uint64_t TargetAddr = TCI->second; + TailCallTerminatedBlocks.emplace( + std::make_pair(InsertBB, + TailCallInfo(Offset, InsertIndex, TargetAddr))); + IsPreviousInstrTailCall = true; + } else { + IsPreviousInstrTailCall = false; + } + // Add associated CFI instrs. We always add the CFI instruction that is // located immediately after this instruction, since the next CFI // instruction reflects the change in state caused by this instruction. @@ -678,27 +705,22 @@ bool BinaryFunction::buildCFG() { addCFIPlaceholders(CFIOffset, InsertBB); // Store info about associated landing pad. - if (MIA->isInvoke(InstrInfo.second)) { + if (MIA->isInvoke(Instr)) { const MCSymbol *LP; uint64_t Action; - std::tie(LP, Action) = MIA->getEHInfo(InstrInfo.second); + std::tie(LP, Action) = MIA->getEHInfo(Instr); if (LP) { LPToBBIndex[LP].push_back(getIndex(InsertBB)); } } // How well do we detect tail calls here? - if (MIA->isTerminator(InstrInfo.second)) { + if (MIA->isTerminator(Instr)) { PrevBB = InsertBB; InsertBB = nullptr; } } - // Set the basic block layout to the original order. - for (auto BB : BasicBlocks) { - BasicBlocksLayout.emplace_back(BB); - } - // Intermediate dump. 
DEBUG(print(dbgs(), "after creating basic blocks")); @@ -763,16 +785,29 @@ bool BinaryFunction::buildCFG() { // Does not add a successor if we can't find profile data, leave it to the // inference pass to guess its frequency - if (!BranchDataOrErr.getError()) { + if (BranchDataOrErr) { const FuncBranchData &BranchData = BranchDataOrErr.get(); auto BranchInfoOrErr = BranchData.getBranch(Branch.first, Branch.second); - if (!BranchInfoOrErr.getError()) { + if (BranchInfoOrErr) { const BranchInfo &BInfo = BranchInfoOrErr.get(); FromBB->addSuccessor(ToBB, BInfo.Branches, BInfo.Mispreds); } } } + for (auto &I : TailCallTerminatedBlocks) { + TailCallInfo &TCInfo = I.second; + if (BranchDataOrErr) { + const FuncBranchData &BranchData = BranchDataOrErr.get(); + auto BranchInfoOrErr = BranchData.getDirectCallBranch(TCInfo.Offset); + if (BranchInfoOrErr) { + const BranchInfo &BInfo = BranchInfoOrErr.get(); + TCInfo.Count = BInfo.Branches; + TCInfo.Mispreds = BInfo.Mispreds; + } + } + } + // Add fall-through branches (except for non-taken conditional branches with // profile data, which were already accounted for in TakenBranches). PrevBB = nullptr; @@ -791,13 +826,34 @@ bool BinaryFunction::buildCFG() { auto LastInstIter = --BB->end(); while (MIA->isCFI(*LastInstIter) && LastInstIter != BB->begin()) --LastInstIter; + + // Check if the last instruction is a conditional jump that serves as a tail + // call. + bool IsCondTailCall = MIA->isConditionalBranch(*LastInstIter) && + TailCallTerminatedBlocks.count(BB); + if (BB->succ_size() == 0) { - IsPrevFT = MIA->isTerminator(*LastInstIter) ? false : true; + if (IsCondTailCall) { + // Conditional tail call without profile data for non-taken branch. + IsPrevFT = true; + } else { + // Unless the last instruction is a terminator, control will fall + // through to the next basic block. + IsPrevFT = MIA->isTerminator(*LastInstIter) ? 
false : true; + } } else if (BB->succ_size() == 1) { - IsPrevFT = MIA->isConditionalBranch(*LastInstIter) ? true : false; + if (IsCondTailCall) { + // Conditional tail call with data for non-taken branch. A fall-through + // edge has already been added in the CFG. + IsPrevFT = false; + } else { + // Fall-through should be added if the last instruction is a conditional + // jump, since there was no profile data for the non-taken branch. + IsPrevFT = MIA->isConditionalBranch(*LastInstIter) ? true : false; + } } else { // Ends with 2 branches, with an indirect jump or it is a conditional - // branch whose frequency has been inferred from LBR + // branch whose frequency has been inferred from LBR. IsPrevFT = false; } @@ -819,8 +875,23 @@ bool BinaryFunction::buildCFG() { // Update CFI information for each BB annotateCFIState(); + // Convert conditional tail call branches to conditional branches that jump + // to a tail call. + removeConditionalTailCalls(); + + // Set the basic block layout to the original order. + for (auto BB : BasicBlocks) { + BasicBlocksLayout.emplace_back(BB); + } + + // Fix the possibly corrupted CFI state. CFI state may have been corrupted + // because of the CFG modifications while removing conditional tail calls. + fixCFIState(); + // Clean-up memory taken by instructions and labels. clearList(Instructions); + clearList(TailCallOffsets); + clearList(TailCallTerminatedBlocks); clearList(OffsetToCFI); clearList(Labels); clearList(TakenBranches); @@ -996,6 +1067,14 @@ void BinaryFunction::inferFallThroughCounts() { ReportedBranches += SuccCount.Count; } + // Calculate frequency of outgoing tail calls from this node according to + // LBR data + uint64_t ReportedTailCalls = 0; + auto TCI = TailCallTerminatedBlocks.find(CurBB); + if (TCI != TailCallTerminatedBlocks.end()) { + ReportedTailCalls = TCI->second.Count; + } + + // Calculate frequency of throws from this node according to LBR data 
Since it is possible // for a landing pad to be associated with more than one basic blocks, @@ -1005,7 +1084,8 @@ void BinaryFunction::inferFallThroughCounts() { ReportedThrows += LP->ExecutionCount; } - uint64_t TotalReportedJumps = ReportedBranches + ReportedThrows; + uint64_t TotalReportedJumps = + ReportedBranches + ReportedTailCalls + ReportedThrows; // Infer the frequency of the fall-through edge, representing not taking the // branch @@ -1036,6 +1116,93 @@ void BinaryFunction::inferFallThroughCounts() { return; } +void BinaryFunction::removeConditionalTailCalls() { + for (auto &I : TailCallTerminatedBlocks) { + BinaryBasicBlock *BB = I.first; + TailCallInfo &TCInfo = I.second; + + // Get the conditional tail call instruction. + MCInst &CondTailCallInst = BB->getInstructionAtIndex(TCInfo.Index); + if (!BC.MIA->isConditionalBranch(CondTailCallInst)) { + // The block is not terminated with a conditional tail call. + continue; + } + + // Assert that the tail call does not throw. + const MCSymbol *LP; + uint64_t Action; + std::tie(LP, Action) = BC.MIA->getEHInfo(CondTailCallInst); + assert(!LP && "found tail call with associated landing pad"); + + // Create the unconditional tail call instruction. + const MCSymbol &TailCallTargetLabel = + cast( + CondTailCallInst.getOperand(0).getExpr())->getSymbol(); + MCInst TailCallInst; + BC.MIA->createTailCall(TailCallInst, &TailCallTargetLabel, BC.Ctx.get()); + + // The way we will remove this conditional tail call depends on the + // direction of the jump when it is taken. We want to preserve this + // direction. + BinaryBasicBlock *TailCallBB = nullptr; + if (getAddress() > TCInfo.TargetAddress) { + // Backward jump: We will reverse the condition of the tail call, change + // its target to the following (currently fall-through) block, and insert + // a new block between them that will contain the unconditional tail call. + + // Reverse the condition of the tail call and update its target. 
+ unsigned InsertIdx = getIndex(BB) + 1; + assert(InsertIdx < size() && "no fall-through for conditional tail call"); + BinaryBasicBlock *NextBB = getBasicBlockAtIndex(InsertIdx); + BC.MIA->reverseBranchCondition( + CondTailCallInst, NextBB->getLabel(), BC.Ctx.get()); + + // Create a basic block containing the unconditional tail call instruction + // and place it between BB and NextBB. + MCSymbol *TCLabel = BC.Ctx->createTempSymbol("TC", true); + std::vector> TailCallBBs; + TailCallBBs.emplace_back(createBasicBlock(NextBB->getOffset(), TCLabel)); + TailCallBBs[0]->addInstruction(TailCallInst); + insertBasicBlocks(BB, std::move(TailCallBBs), /* UpdateCFIState */ false); + TailCallBB = getBasicBlockAtIndex(InsertIdx); + + // Add the correct CFI state for the new block. + BBCFIState.insert(BBCFIState.begin() + InsertIdx, TCInfo.CFIStateBefore); + } else { + // Forward jump: we will create a new basic block at the end of the + // function containing the unconditional tail call and change the target of + // the conditional tail call to this basic block. + + // Create a basic block containing the unconditional tail call + // instruction and place it at the end of the function. + const BinaryBasicBlock *LastBB = BasicBlocks.back(); + uint64_t NewBlockOffset = + LastBB->Offset + BC.computeCodeSize(LastBB->begin(), LastBB->end()); + MCSymbol *TCLabel = BC.Ctx->createTempSymbol("TC", true); + TailCallBB = addBasicBlock(NewBlockOffset, TCLabel); + TailCallBB->addInstruction(TailCallInst); + + // Add the correct CFI state for the new block. It has to be inserted in + // the one before last position (the last position holds the CFI state + // after the last block). + BBCFIState.insert(BBCFIState.begin() + BBCFIState.size() - 1, + TCInfo.CFIStateBefore); + + // Replace the target of the conditional tail call with the label of the + // new basic block. 
+ BC.MIA->replaceBranchTarget(CondTailCallInst, TCLabel, BC.Ctx.get()); + } + + // Add the CFG edge from BB to TailCallBB and the corresponding profile + // info. + BB->addSuccessor(TailCallBB, TCInfo.Count, TCInfo.Mispreds); + + // Add execution count for the block. + if (hasValidProfile()) + TailCallBB->ExecutionCount = TCInfo.Count; + } +} + uint64_t BinaryFunction::getFunctionScore() { if (FunctionScore != -1) return FunctionScore; @@ -1064,11 +1231,21 @@ void BinaryFunction::annotateCFIState() { // Annotate this BB entry BBCFIState.emplace_back(State); + // While building the CFG, we want to save the CFI state before a tail call + // instruction, so that we can correctly remove conditional tail calls + auto TCI = TailCallTerminatedBlocks.find(CurBB); + bool SaveState = TCI != TailCallTerminatedBlocks.end(); + // Advance state + uint32_t Idx = 0; for (const auto &Instr : *CurBB) { auto *CFI = getCFIFor(Instr); - if (CFI == nullptr) + if (CFI == nullptr) { + if (SaveState && Idx == TCI->second.Index) + TCI->second.CFIStateBefore = State; + ++Idx; continue; + } ++HighestState; if (CFI->getOperation() == MCCFIInstruction::OpRememberState) { StateStack.push(State); @@ -1079,6 +1256,7 @@ } else if (CFI->getOperation() != MCCFIInstruction::OpGnuArgsSize) { State = HighestState; } + ++Idx; } } @@ -2000,7 +2178,8 @@ std::size_t BinaryFunction::hash() const { void BinaryFunction::insertBasicBlocks( BinaryBasicBlock *Start, - std::vector> &&NewBBs) { + std::vector> &&NewBBs, + bool UpdateCFIState) { const auto StartIndex = getIndex(Start); const auto NumNewBlocks = NewBBs.size(); @@ -2023,9 +2202,11 @@ void BinaryFunction::insertBasicBlocks( BB->Index = I; } - // Recompute CFI state for all BBs. - BBCFIState.clear(); - annotateCFIState(); + if (UpdateCFIState) { + // Recompute CFI state for all BBs. 
+ BBCFIState.clear(); + annotateCFIState(); + } recomputeLandingPads(StartIndex, NumNewBlocks + 1); diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index d130d364e78f..9a99e35c30c3 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -268,6 +268,29 @@ class BinaryFunction : public AddressRangesOwner { using InstrMapType = std::map; InstrMapType Instructions; + /// Temporary holder of offsets of tail call instructions before CFG is + /// constructed. Map from offset to the corresponding target address of the + /// tail call. + using TailCallOffsetMapType = std::map; + TailCallOffsetMapType TailCallOffsets; + + /// Temporary holder of tail call terminated basic blocks used during CFG + /// construction. Map from tail call terminated basic block to a struct with + /// information about the tail call. + struct TailCallInfo { + uint32_t Offset; // offset of the tail call from the function start + uint32_t Index; // index of the tail call in the basic block + uint64_t TargetAddress; // address of the callee + uint64_t Count{0}; // taken count from profile data + uint64_t Mispreds{0}; // mispredicted count from profile data + uint32_t CFIStateBefore{0}; // CFI state before the tail call instruction + + TailCallInfo(uint32_t Offset, uint32_t Index, uint64_t TargetAddress) : + Offset(Offset), Index(Index), TargetAddress(TargetAddress) { } + }; + using TailCallBasicBlockMapType = std::map; + TailCallBasicBlockMapType TailCallTerminatedBlocks; + /// List of DWARF CFI instructions. Original CFI from the binary must be /// sorted w.r.t. offset that it appears. We rely on this to replay CFIs /// if needed (to fix state after reordering BBs). @@ -672,13 +695,15 @@ class BinaryFunction : public AddressRangesOwner { BinaryBasicBlock *getBasicBlockContainingOffset(uint64_t Offset); /// Insert the BBs contained in NewBBs into the basic blocks for this 
- /// CFI state, BB offsets, BB indices. The new BBs are inserted after - /// Start. This operation could affect fallthrough branches for Start. + /// function. Update the associated state of all blocks as needed, i.e. + /// BB offsets, BB indices, and optionally CFI state. The new BBs are + /// inserted after Start. This operation could affect fallthrough branches + /// for Start. /// void insertBasicBlocks( BinaryBasicBlock *Start, - std::vector> &&NewBBs); + std::vector> &&NewBBs, + bool UpdateCFIState = true); /// Update the basic block layout for this function. The BBs from /// [Start->Index, Start->Index + NumNewBlocks) are inserted into the @@ -688,7 +713,7 @@ class BinaryFunction : public AddressRangesOwner { /// Update the basic block layout for this function. The layout is /// computed from scratch using modifyLayout. void updateLayout(LayoutType Type, bool MinBranchClusters, bool Split); - + /// Dump function information to debug output. If \p PrintInstructions /// is true - include instruction disassembly. void dump(std::string Annotation = "", bool PrintInstructions = true) const; @@ -932,6 +957,12 @@ class BinaryFunction : public AddressRangesOwner { /// has been filled with LBR data. void inferFallThroughCounts(); + /// Converts conditional tail calls to unconditional tail calls. We do this to + /// handle conditional tail calls correctly and to give a chance to the + /// simplify conditional tail call pass to decide whether to re-optimize them + /// using profile information. + void removeConditionalTailCalls(); + /// Computes a function hotness score: the sum of the products of BB frequency /// and size. 
uint64_t getFunctionScore(); diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index 4cdeefa08eab..cee08aec39c5 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -129,6 +129,26 @@ ErrorOr FuncBranchData::getBranch(uint64_t From, return make_error_code(llvm::errc::invalid_argument); } +ErrorOr +FuncBranchData::getDirectCallBranch(uint64_t From) const { + // Commented out because it can be expensive. + // assert(std::is_sorted(Data.begin(), Data.end())); + struct Compare { + bool operator()(const BranchInfo &BI, const uint64_t Val) const { + return BI.From.Offset < Val; + } + bool operator()(const uint64_t Val, const BranchInfo &BI) const { + return Val < BI.From.Offset; + } + }; + auto Range = std::equal_range(Data.begin(), Data.end(), From, Compare()); + for (auto I = Range.first; I != Range.second; ++I) { + if (I->From.Name != I->To.Name) + return *I; + } + return make_error_code(llvm::errc::invalid_argument); +} + ErrorOr> DataReader::readPerfData(StringRef Path, raw_ostream &Diag) { ErrorOr> MB = @@ -377,8 +397,13 @@ std::error_code DataReader::parse() { I = GetOrCreateFuncEntry(BI.To.Name); I->getValue().ExecutionCount += BI.Branches; } + } + for (auto &FuncBranches : FuncsMap) { + std::stable_sort(FuncBranches.second.Data.begin(), + FuncBranches.second.Data.end()); } + return std::error_code(); } diff --git a/bolt/DataReader.h b/bolt/DataReader.h index 692bc4e41fb9..aea058f557a6 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -42,16 +42,13 @@ struct Location { } bool operator<(const Location &RHS) const { - if (IsSymbol < RHS.IsSymbol) - return true; + if (IsSymbol != RHS.IsSymbol) + return IsSymbol < RHS.IsSymbol; - if (Name < RHS.Name) - return true; + if (Name != RHS.Name) + return Name < RHS.Name; - return IsSymbol == RHS.IsSymbol && - Name == RHS.Name && - Name != "[heap]" && - Offset < RHS.Offset; + return Name != "[heap]" && Offset < RHS.Offset; } }; @@ -119,6 +116,12 @@ struct FuncBranchData { : Name(Name), 
Data(std::move(Data)), EntryData(std::move(EntryData)) {} ErrorOr getBranch(uint64_t From, uint64_t To) const; + + /// Returns the branch info object associated with a direct call originating + /// from the given offset. If no branch info object is found, an error is + /// returned. If the offset corresponds to an indirect call the behavior is + /// undefined. + ErrorOr getDirectCallBranch(uint64_t From) const; }; //===----------------------------------------------------------------------===// From ca345fc69a2d4f8899ac90bf70ca10a3f434c424 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 28 Jul 2016 10:34:50 -0700 Subject: [PATCH 149/904] Add MCInst annotation mechanism to MCInstrAnalysis class. Summary: Add three new MCOperand types: Annotation, LandingPad and GnuArgsSize. Annotation is used for associating random data with MCInsts. Clients can construct their own annotation types (subclassed from MCAnnotation) and associate them with instructions. Annotations are looked up by string keys. Annotations can be added, removed and queried using an instance of the MCInstrAnalysis class. The LandingPad operand is a MCSymbol, uint64_t pair used to encode exception handling information for call instructions. GnuArgsSize is used to annotate calls with the DW_CFA_GNU_args_size attribute. 
(cherry picked from commit b581dc1b65477d03560dbfc05876148e442e1d91) --- bolt/BinaryBasicBlock.h | 9 ++------- bolt/BinaryFunction.cpp | 10 +++------- bolt/Exceptions.cpp | 12 +++--------- 3 files changed, 8 insertions(+), 23 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 1c85e6c25468..a8a60cce1e25 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -15,22 +15,17 @@ #define LLVM_TOOLS_LLVM_BOLT_BINARY_BASIC_BLOCK_H #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/ilist.h" #include "llvm/ADT/GraphTraits.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrAnalysis.h" -#include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Object/ObjectFile.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include #include +#include namespace llvm { +class MCInstrAnalysis; namespace bolt { class BinaryFunction; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 655852fece6e..df7b0fd45ca7 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1794,13 +1794,9 @@ void BinaryFunction::propagateGnuArgsSizeInfo() { continue; } - if (BC.MIA->isInvoke(Instr)) { - // Add the value of GNU_args_size as an extra operand if landing pad - // is non-emptry. - if (BC.MIA->getEHInfo(Instr).first) { - Instr.addOperand(MCOperand::createImm(CurrentGnuArgsSize)); - } - } + // Add the value of GNU_args_size as an extra operand if landing pad + // is non-empty. 
+ BC.MIA->addGnuArgsSize(Instr, CurrentGnuArgsSize); ++II; } } diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 2734d4e2c389..77a9d011dd92 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -197,15 +197,9 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, "overlapping exception ranges detected"); // Add extra operands to a call instruction making it an invoke from // now on. - if (LPSymbol) { - Instruction.addOperand(MCOperand::createExpr( - MCSymbolRefExpr::create(LPSymbol, - MCSymbolRefExpr::VK_None, - *BC.Ctx))); - } else { - Instruction.addOperand(MCOperand::createImm(0)); - } - Instruction.addOperand(MCOperand::createImm(ActionEntry)); + BC.MIA->addEHInfo(Instruction, + MCLandingPad(LPSymbol, ActionEntry), + BC.Ctx.get()); } ++II; } while (II != IE && II->first < Start + Length); From 0ec6587b593dcef297de598911594033ec8cef12 Mon Sep 17 00:00:00 2001 From: Theodoros Kasampalis Date: Fri, 29 Jul 2016 14:17:06 -0700 Subject: [PATCH 150/904] More aggressive inlining pass Summary: This adds functionality for a more aggressive inlining pass, that can inline tail calls and functions with more than one basic block. (cherry picked from commit 7315f52bfc1a49a9b29f81711e7c11a3e1ad7a45) --- bolt/BinaryBasicBlock.h | 14 +- bolt/BinaryFunction.cpp | 9 +- bolt/BinaryFunction.h | 18 ++ bolt/BinaryPassManager.cpp | 6 +- bolt/BinaryPasses.cpp | 412 ++++++++++++++++++++++++++++++++++++- bolt/BinaryPasses.h | 30 ++- 6 files changed, 468 insertions(+), 21 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index a8a60cce1e25..8b358c8f59dc 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -412,23 +412,25 @@ class BinaryBasicBlock { /// Replace an instruction with a sequence of instructions. Returns true /// if the instruction to be replaced was found and replaced. 
- bool replaceInstruction(MCInst *Inst, - const std::vector &Replacement) { + template + bool replaceInstruction(MCInst *Inst, Itr Begin, Itr End) { auto I = Instructions.end(); auto B = Instructions.begin(); while (I > B) { --I; if (&*I == Inst) { - Instructions.insert( - Instructions.erase(I), - Replacement.begin(), - Replacement.end()); + Instructions.insert(Instructions.erase(I), Begin, End); return true; } } return false; } + bool replaceInstruction(MCInst *Inst, + const std::vector &Replacement) { + return replaceInstruction(Inst, Replacement.begin(), Replacement.end()); + } + /// Split apart the instructions in this basic block starting at Inst. /// The instructions following Inst are removed and returned in a vector. std::vector splitInstructions(const MCInst *Inst) { diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index df7b0fd45ca7..e62f2c70b02b 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -550,14 +550,15 @@ void BinaryFunction::clearLandingPads(const unsigned StartIndex, void BinaryFunction::addLandingPads(const unsigned StartIndex, const unsigned NumBlocks) { - for (auto I = StartIndex; I < StartIndex + NumBlocks; ++I) { - auto *BB = BasicBlocks[I]; + for (auto *BB : BasicBlocks) { if (LandingPads.find(BB->getLabel()) != LandingPads.end()) { MCSymbol *LP = BB->getLabel(); - for (unsigned I : LPToBBIndex.at(LP)) { + for (unsigned I : LPToBBIndex[LP]) { assert(I < BasicBlocks.size()); BinaryBasicBlock *ThrowBB = BasicBlocks[I]; - ThrowBB->addLandingPad(BB); + const unsigned ThrowBBIndex = getIndex(ThrowBB); + if (ThrowBBIndex >= StartIndex && ThrowBBIndex < StartIndex + NumBlocks) + ThrowBB->addLandingPad(BB); } } } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 9a99e35c30c3..e6c8bbbe3a7a 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -505,6 +505,24 @@ class BinaryFunction : public AddressRangesOwner { return BasicBlocks.at(Index); } + /// Returns the basic block after the given 
basic block in the layout or + /// nullptr the last basic block is given. + const BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *BB) const { + for (auto I = layout_begin(), E = layout_end(); I != E; ++I) { + if (*I == BB && std::next(I) != E) + return *std::next(I); + } + return nullptr; + } + + BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *BB) { + for (auto I = layout_begin(), E = layout_end(); I != E; ++I) { + if (*I == BB && std::next(I) != E) + return *std::next(I); + } + return nullptr; + } + /// Return the name of the function as extracted from the binary file. /// If the function has multiple names - return the last one /// followed by "(*#)". diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 10c981a78382..8531cf750e45 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -82,6 +82,9 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(llvm::make_unique(), opts::IdenticalCodeFolding); + Manager.registerPass(llvm::make_unique(), + opts::InlineSmallFunctions); + Manager.registerPass( std::move(llvm::make_unique(Manager.NagUser)), opts::EliminateUnreachable); @@ -103,9 +106,6 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(llvm::make_unique(), opts::OptimizeBodylessFunctions); - Manager.registerPass(llvm::make_unique(), - opts::InlineSmallFunctions); - Manager.registerPass(std::move(llvm::make_unique())); Manager.registerPass(llvm::make_unique(), opts::Peepholes); diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index 679c06657b22..53e8d8b6ca20 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -172,6 +172,47 @@ void InlineSmallFunctions::findInliningCandidates( << " inlineable functions.\n"); } +void InlineSmallFunctions::findInliningCandidatesAggressive( + BinaryContext &BC, + const std::map &BFs) { + std::set OverwrittenFunctions = { + "_ZN4HPHP13hash_string_iEPKcj", + "_ZN4HPHP21hash_string_cs_unsafeEPKcj", + 
"_ZN4HPHP14hash_string_csEPKcj", + "_ZN4HPHP20hash_string_i_unsafeEPKcj", + "_ZNK4HPHP10StringData10hashHelperEv" + }; + for (const auto &BFIt : BFs) { + const auto &Function = BFIt.second; + if (!Function.isSimple() || + !opts::shouldProcess(Function) || + OverwrittenFunctions.count(Function.getName()) || + Function.hasEHRanges()) + continue; + uint64_t FunctionSize = 0; + for (const auto *BB : Function.layout()) { + FunctionSize += BC.computeCodeSize(BB->begin(), BB->end()); + } + assert(FunctionSize > 0 && "found empty function"); + if (FunctionSize > kMaxSize) + continue; + bool FoundCFI = false; + for (const auto BB : Function.layout()) { + for (const auto &Inst : *BB) { + if (BC.MIA->isEHLabel(Inst) || BC.MIA->isCFI(Inst)) { + FoundCFI = true; + break; + } + } + } + if (!FoundCFI) + InliningCandidates.insert(Function.getName()); + } + + DEBUG(errs() << "BOLT-DEBUG: " << InliningCandidates.size() + << " inlineable functions.\n"); +} + namespace { /// Returns whether a function creates a stack frame for itself or not. @@ -225,7 +266,275 @@ void InlineSmallFunctions::inlineCall( BB.replaceInstruction(CallInst, InlinedInstance); } -void InlineSmallFunctions::inlineCallsInFunction( +std::pair +InlineSmallFunctions::inlineCall( + BinaryContext &BC, + BinaryFunction &CallerFunction, + BinaryBasicBlock *CallerBB, + const unsigned CallInstIndex, + const BinaryFunction &InlinedFunction) { + // Get the instruction to be replaced with inlined code. + MCInst &CallInst = CallerBB->getInstructionAtIndex(CallInstIndex); + assert(BC.MIA->isCall(CallInst) && "Can only inline a call."); + + // Point in the function after the inlined code. + BinaryBasicBlock *AfterInlinedBB = nullptr; + unsigned AfterInlinedIstrIndex = 0; + + // In case of a tail call we should not remove any ret instructions from the + // inlined instance. + bool IsTailCall = BC.MIA->isTailCall(CallInst); + + // The first block of the function to be inlined can be merged with the caller + // basic block. 
This cannot happen if there are jumps to the first block. + bool CanMergeFirstInlinedBlock = (*InlinedFunction.begin()).pred_size() == 0; + + // If the call to be inlined is not at the end of its basic block and we have + // to inline more than one basic blocks (or even just one basic block that + // cannot be merged into the caller block), then the caller's basic block + // should be split. + bool ShouldSplitCallerBB = + CallInstIndex < CallerBB->size() - 1 && + (InlinedFunction.size() > 1 || !CanMergeFirstInlinedBlock); + + // Copy inlined function's basic blocks into a vector of basic blocks that + // will be inserted in the caller function (the inlined instance). Also, we + // keep a mapping from basic block index to the corresponding block in the + // inlined instance. + std::vector> InlinedInstance; + std::vector + BBIndexToInlinedInstanceBB(InlinedFunction.size(), nullptr); + for (const auto InlinedFunctionBB : InlinedFunction.layout()) { + InlinedInstance.emplace_back(CallerFunction.createBasicBlock(0)); + BBIndexToInlinedInstanceBB[InlinedFunction.getIndex(InlinedFunctionBB)] = + InlinedInstance.back().get(); + if (InlinedFunction.hasValidProfile()) + InlinedInstance.back()->setExecutionCount( + InlinedFunctionBB->getExecutionCount()); + } + if (ShouldSplitCallerBB) { + // Add one extra block at the inlined instance for the removed part of the + // caller block. + InlinedInstance.emplace_back(CallerFunction.createBasicBlock(0)); + BBIndexToInlinedInstanceBB.push_back(InlinedInstance.back().get()); + if (CallerFunction.hasValidProfile()) + InlinedInstance.back()->setExecutionCount(CallerBB->getExecutionCount()); + } + + // Copy instructions to the basic blocks of the inlined instance. + unsigned InlinedInstanceBBIndex = 0; + for (const auto InlinedFunctionBB : InlinedFunction.layout()) { + // Get the corresponding block of the inlined instance. 
+ auto *InlinedInstanceBB = InlinedInstance[InlinedInstanceBBIndex].get(); + assert(InlinedInstanceBB == + BBIndexToInlinedInstanceBB[InlinedFunction.getIndex(InlinedFunctionBB)]); + + bool IsExitingBlock = false; + + // Copy instructions into the inlined instance. + for (auto Instruction : *InlinedFunctionBB) { + if (!IsTailCall && + BC.MIA->isReturn(Instruction) && + !BC.MIA->isTailCall(Instruction)) { + // Skip returns when the caller does a normal call as opposed to a tail + // call. + IsExitingBlock = true; + continue; + } + if (!IsTailCall && + BC.MIA->isTailCall(Instruction)) { + // Convert tail calls to normal calls when the caller does a normal + // call. + if (!BC.MIA->convertTailCallToCall(Instruction)) + assert(false && "unexpected tail call opcode found"); + IsExitingBlock = true; + } + if (BC.MIA->isBranch(Instruction) && + !BC.MIA->isIndirectBranch(Instruction)) { + // Convert the branch targets in the branch instructions that will be + // added to the inlined instance. + const MCSymbol *OldTargetLabel = nullptr; + const MCSymbol *OldFTLabel = nullptr; + MCInst *CondBranch = nullptr; + MCInst *UncondBranch = nullptr; + assert(BC.MIA->analyzeBranch(Instruction, OldTargetLabel, OldFTLabel, + CondBranch, UncondBranch)); + assert(OldTargetLabel); + const MCSymbol *NewTargetLabel = nullptr; + for (const auto SuccBB : InlinedFunctionBB->successors()) { + if (SuccBB->getLabel() == OldTargetLabel) { + const auto InlinedInstanceSuccBB = + BBIndexToInlinedInstanceBB[InlinedFunction.getIndex(SuccBB)]; + NewTargetLabel = InlinedInstanceSuccBB->getLabel(); + break; + } + } + assert(NewTargetLabel); + BC.MIA->replaceBranchTarget(Instruction, NewTargetLabel, BC.Ctx.get()); + } + // TODO; Currently we simply ignore CFI instructions but we need to + // address them for correctness. 
+ if (!BC.MIA->isEHLabel(Instruction) && + !BC.MIA->isCFI(Instruction)) { + InlinedInstanceBB->addInstruction(std::move(Instruction)); + } + } + + // Add CFG edges to the basic blocks of the inlined instance. + std::vector + Successors(InlinedFunctionBB->succ_size(), nullptr); + std::transform( + InlinedFunctionBB->succ_begin(), + InlinedFunctionBB->succ_end(), + Successors.begin(), + [&InlinedFunction, &BBIndexToInlinedInstanceBB] + (const BinaryBasicBlock *BB) { + return BBIndexToInlinedInstanceBB[InlinedFunction.getIndex(BB)]; + }); + if (InlinedFunction.hasValidProfile()) { + InlinedInstanceBB->addSuccessors( + Successors.begin(), + Successors.end(), + InlinedFunctionBB->branch_info_begin(), + InlinedFunctionBB->branch_info_end()); + } else { + InlinedInstanceBB->addSuccessors( + Successors.begin(), + Successors.end()); + } + + if (IsExitingBlock) { + assert(Successors.size() == 0); + if (ShouldSplitCallerBB) { + if (InlinedFunction.hasValidProfile()) { + InlinedInstanceBB->addSuccessor( + InlinedInstance.back().get(), + InlinedInstanceBB->getExecutionCount()); + } else { + InlinedInstanceBB->addSuccessor(InlinedInstance.back().get()); + } + MCInst ExitBranchInst; + const MCSymbol *ExitLabel = InlinedInstance.back().get()->getLabel(); + BC.MIA->createUncondBranch(ExitBranchInst, ExitLabel, BC.Ctx.get()); + InlinedInstanceBB->addInstruction(std::move(ExitBranchInst)); + } else if (InlinedInstanceBBIndex > 0 || !CanMergeFirstInlinedBlock) { + assert(CallInstIndex == CallerBB->size() - 1); + assert(CallerBB->succ_size() <= 1); + if (CallerBB->succ_size() == 1) { + if (InlinedFunction.hasValidProfile()) { + InlinedInstanceBB->addSuccessor( + *CallerBB->succ_begin(), + InlinedInstanceBB->getExecutionCount()); + } else { + InlinedInstanceBB->addSuccessor(*CallerBB->succ_begin()); + } + MCInst ExitBranchInst; + const MCSymbol *ExitLabel = (*CallerBB->succ_begin())->getLabel(); + BC.MIA->createUncondBranch(ExitBranchInst, ExitLabel, BC.Ctx.get()); + 
InlinedInstanceBB->addInstruction(std::move(ExitBranchInst)); + } + } + } + + ++InlinedInstanceBBIndex; + } + + if (ShouldSplitCallerBB) { + // Split the basic block that contains the call and add the removed + // instructions in the last block of the inlined instance. + // (Is it OK to have a basic block with just CFI instructions?) + std::vector TrailInstructions = + std::move(CallerBB->splitInstructions(&CallInst)); + assert(TrailInstructions.size() > 0); + InlinedInstance.back()->addInstructions( + TrailInstructions.begin(), + TrailInstructions.end()); + // Add CFG edges for the block with the removed instructions. + if (CallerFunction.hasValidProfile()) { + InlinedInstance.back()->addSuccessors( + CallerBB->succ_begin(), + CallerBB->succ_end(), + CallerBB->branch_info_begin(), + CallerBB->branch_info_end()); + } else { + InlinedInstance.back()->addSuccessors( + CallerBB->succ_begin(), + CallerBB->succ_end()); + } + // Update the after-inlined point. + AfterInlinedBB = InlinedInstance.back().get(); + AfterInlinedIstrIndex = 0; + } + + assert(InlinedInstance.size() > 0 && "found function with no basic blocks"); + assert(InlinedInstance.front()->size() > 0 && + "found function with empty basic block"); + + // If the inlining cannot happen as a simple instruction insertion into + // CallerBB, we remove the outgoing CFG edges of the caller block. + if (InlinedInstance.size() > 1 || !CanMergeFirstInlinedBlock) { + CallerBB->removeSuccessors(CallerBB->succ_begin(), CallerBB->succ_end()); + if (!ShouldSplitCallerBB) { + // Update the after-inlined point. + AfterInlinedBB = CallerFunction.getBasicBlockAfter(CallerBB); + AfterInlinedIstrIndex = 0; + } + } else { + assert(!ShouldSplitCallerBB); + // Update the after-inlined point. 
+ if (CallInstIndex < CallerBB->size() - 1) { + AfterInlinedBB = CallerBB; + AfterInlinedIstrIndex = + CallInstIndex + InlinedInstance.front()->size(); + } else { + AfterInlinedBB = CallerFunction.getBasicBlockAfter(CallerBB); + AfterInlinedIstrIndex = 0; + } + } + + // Do the inlining by merging the first block of the inlined instance into + // the caller basic block if possible and adding the rest of the inlined + // instance basic blocks in the caller function. + if (CanMergeFirstInlinedBlock) { + CallerBB->replaceInstruction( + &CallInst, + InlinedInstance.front()->begin(), + InlinedInstance.front()->end()); + if (InlinedInstance.size() > 1) { + auto FirstBB = InlinedInstance.begin()->get(); + if (InlinedFunction.hasValidProfile()) { + CallerBB->addSuccessors( + FirstBB->succ_begin(), + FirstBB->succ_end(), + FirstBB->branch_info_begin(), + FirstBB->branch_info_end()); + } else { + CallerBB->addSuccessors( + FirstBB->succ_begin(), + FirstBB->succ_end()); + } + FirstBB->removeSuccessors(FirstBB->succ_begin(), FirstBB->succ_end()); + } + InlinedInstance.erase(InlinedInstance.begin()); + } else { + CallerBB->eraseInstruction(&CallInst); + if (CallerFunction.hasValidProfile()) { + CallerBB->addSuccessor(InlinedInstance.front().get(), + CallerBB->getExecutionCount()); + } else { + CallerBB->addSuccessor(InlinedInstance.front().get(), + CallerBB->getExecutionCount()); + } + } + unsigned NumBlocksToAdd = InlinedInstance.size(); + CallerFunction.insertBasicBlocks(CallerBB, std::move(InlinedInstance)); + CallerFunction.updateLayout(CallerBB, NumBlocksToAdd); + CallerFunction.fixBranches(); + + return std::make_pair(AfterInlinedBB, AfterInlinedIstrIndex); +} + +bool InlineSmallFunctions::inlineCallsInFunction( BinaryContext &BC, BinaryFunction &Function) { std::vector Blocks(Function.layout().begin(), @@ -245,6 +554,8 @@ void InlineSmallFunctions::inlineCallsInFunction( } } + bool DidInlining = false; + for (auto BB : Blocks) { if (BB->isCold()) continue; @@ -272,6 
+583,7 @@ void InlineSmallFunctions::inlineCallsInFunction( + Function.estimateHotSize() < Function.getMaxSize()) { auto NextInstIt = std::next(InstIt); inlineCall(BC, *BB, &Inst, *TargetFunction.begin()); + DidInlining = true; DEBUG(errs() << "BOLT-DEBUG: Inlining call to " << TargetFunction.getName() << " in " << Function.getName() << "\n"); @@ -286,6 +598,81 @@ void InlineSmallFunctions::inlineCallsInFunction( ++InstIt; } } + + return DidInlining; +} + +bool InlineSmallFunctions::inlineCallsInFunctionAggressive( + BinaryContext &BC, + BinaryFunction &Function) { + std::vector Blocks(Function.layout().begin(), + Function.layout().end()); + std::sort(Blocks.begin(), Blocks.end(), + [](const BinaryBasicBlock *BB1, const BinaryBasicBlock *BB2) { + return BB1->getExecutionCount() > BB2->getExecutionCount(); + }); + uint32_t ExtraSize = 0; + + for (auto BB : Blocks) { + for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ++InstIt) { + auto &Inst = *InstIt; + if (BC.MIA->isCall(Inst)) { + totalDynamicCalls += BB->getExecutionCount(); + } + } + } + + bool DidInlining = false; + + for (auto BB : Blocks) { + if (BB->isCold()) + continue; + + unsigned InstIndex = 0; + for (auto InstIt = BB->begin(); InstIt != BB->end(); ) { + auto &Inst = *InstIt; + if (BC.MIA->isCall(Inst) && + Inst.size() == 1 && + Inst.getOperand(0).isExpr()) { + assert(!BC.MIA->isInvoke(Inst)); + auto Target = dyn_cast( + Inst.getOperand(0).getExpr()); + assert(Target && "Not MCSymbolRefExpr"); + auto FunctionIt = FunctionByName.find(Target->getSymbol().getName()); + if (FunctionIt != FunctionByName.end()) { + auto &TargetFunction = *FunctionIt->second; + bool CallToInlineableFunction = + InliningCandidates.count(TargetFunction.getName()); + + totalInlineableCalls += + CallToInlineableFunction * BB->getExecutionCount(); + + if (CallToInlineableFunction && + TargetFunction.getSize() + ExtraSize + + Function.estimateHotSize() < Function.getMaxSize()) { + unsigned NextInstIndex = 0; + 
BinaryBasicBlock *NextBB = nullptr; + std::tie(NextBB, NextInstIndex) = + inlineCall(BC, Function, BB, InstIndex, TargetFunction); + DidInlining = true; + DEBUG(errs() << "BOLT-DEBUG: Inlining call to " + << TargetFunction.getName() << " in " + << Function.getName() << "\n"); + InstIndex = NextBB == BB ? NextInstIndex : BB->size(); + InstIt = NextBB == BB ? BB->begin() + NextInstIndex : BB->end(); + ExtraSize += TargetFunction.getSize(); + inlinedDynamicCalls += BB->getExecutionCount(); + continue; + } + } + } + + ++InstIndex; + ++InstIt; + } + } + + return DidInlining; } void InlineSmallFunctions::runOnFunctions( @@ -295,17 +682,30 @@ void InlineSmallFunctions::runOnFunctions( for (auto &It : BFs) { FunctionByName[It.second.getName()] = &It.second; } + findInliningCandidates(BC, BFs); - uint32_t ConsideredFunctions = 0; + + std::vector ConsideredFunctions; for (auto &It : BFs) { auto &Function = It.second; if (!Function.isSimple() || !opts::shouldProcess(Function)) continue; - if (ConsideredFunctions == kMaxFunctions) - break; - inlineCallsInFunction(BC, Function); - ++ConsideredFunctions; + if (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE) + continue; + ConsideredFunctions.push_back(&Function); + } + std::sort(ConsideredFunctions.begin(), ConsideredFunctions.end(), + [](BinaryFunction *A, BinaryFunction *B) { + return B->getExecutionCount() < A->getExecutionCount(); + }); + unsigned ModifiedFunctions = 0; + for (unsigned i = 0; i < ConsideredFunctions.size() && + ModifiedFunctions <= kMaxFunctions; ++i) { + auto &Function = *ConsideredFunctions[i]; + if (inlineCallsInFunction(BC, Function)) + ++ModifiedFunctions; } + DEBUG(errs() << "BOLT-DEBUG: Inlined " << inlinedDynamicCalls << " of " << totalDynamicCalls << " function calls in the profile.\n"); DEBUG(errs() << "BOLT-DEBUG: Inlined calls represent " diff --git a/bolt/BinaryPasses.h b/bolt/BinaryPasses.h index b227041649b5..29e9c7e1e3a8 100644 --- a/bolt/BinaryPasses.h +++ 
b/bolt/BinaryPasses.h @@ -64,8 +64,11 @@ class InlineSmallFunctions : public BinaryFunctionPass { /// Maximum number of instructions in an inlined function. static const unsigned kMaxInstructions = 8; + /// Maximum code size (in bytes) of inlined function (used by aggressive + /// inlining). + static const uint64_t kMaxSize = 60; /// Maximum number of functions that will be considered for inlining (in - /// ascending address order). + /// descending hottness order). static const unsigned kMaxFunctions = 30000; /// Statistics collected for debugging. @@ -83,9 +86,32 @@ class InlineSmallFunctions : public BinaryFunctionPass { MCInst *CallInst, const BinaryBasicBlock &InlinedFunctionBB); - void inlineCallsInFunction(BinaryContext &BC, + bool inlineCallsInFunction(BinaryContext &BC, BinaryFunction &Function); + /// The following methods do a more aggressive inlining pass, where we + /// inline calls as well as tail calls and we are not limited to inlining + /// functions with only one basic block. + /// FIXME: Currently these are broken since they do not work with the split + /// function option. + void findInliningCandidatesAggressive( + BinaryContext &BC, const std::map &BFs); + + bool inlineCallsInFunctionAggressive( + BinaryContext &BC, BinaryFunction &Function); + + /// Inline the call in CallInst to InlinedFunction. Inlined function should not + /// contain any landing pad or thrower edges but can have more than one blocks. + /// + /// Return the location (basic block and instruction index) where the code of + /// the caller function continues after the the inlined code. 
+ std::pair + inlineCall(BinaryContext &BC, + BinaryFunction &CallerFunction, + BinaryBasicBlock *CallerBB, + const unsigned CallInstIdex, + const BinaryFunction &InlinedFunction); + public: void runOnFunctions(BinaryContext &BC, std::map &BFs, From 926573fd27ecf7f7d4ab83ea3a8335bc4ec2a022 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Sun, 7 Aug 2016 12:35:23 -0700 Subject: [PATCH 151/904] Refactoring. Mainly NFC. Summary: Eliminated BinaryFunction::getName(). The function was confusing since the name is ambigous. Instead we have BinaryFunction::getPrintName() used for printing and whenever unique string identifier is needed one can use getSymbol()->getName(). In the next diff I'll have a map from MCSymbol to BinaryFunction in BinaryContext to facilitate function lookup from instruction operand expressions. There's one bug fixed where the function was called only under assert() in ICF::foldFunction(). For output we update all symbols associated with the function. At the moment it has no effect on the generated binary but in the future we would like to have all symbols in the symbol table updated. (cherry picked from commit 128f91727f367a1a89b26e3a030737336c9e47c1) --- bolt/BinaryBasicBlock.h | 13 +++-- bolt/BinaryFunction.cpp | 99 ++++++++++++++++----------------- bolt/BinaryFunction.h | 39 +++++++------ bolt/BinaryPasses.cpp | 115 +++++++++++++++++++-------------------- bolt/DWARFRewriter.cpp | 2 +- bolt/DebugData.cpp | 2 +- bolt/Exceptions.cpp | 12 ++-- bolt/RewriteInstance.cpp | 58 +++++++------------- 8 files changed, 167 insertions(+), 173 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 8b358c8f59dc..5d99f39f7ea0 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -48,7 +48,7 @@ class BinaryBasicBlock { BinaryFunction *Function; /// Label associated with the end of the block in the output binary. 
- MCSymbol *EndLabel{nullptr}; + const MCSymbol *EndLabel{nullptr}; /// [Begin, End) address range for this block in the output binary. std::pair OutputAddressRange{0, 0}; @@ -279,7 +279,12 @@ class BinaryBasicBlock { } /// Return symbol marking the start of this basic block. - MCSymbol *getLabel() const { + MCSymbol *getLabel() { + return Label; + } + + /// Return symbol marking the start of this basic block (const version). + const MCSymbol *getLabel() const { return Label; } @@ -447,12 +452,12 @@ class BinaryBasicBlock { } /// Sets the symbol pointing to the end of the BB in the output binary. - void setEndLabel(MCSymbol *Symbol) { + void setEndLabel(const MCSymbol *Symbol) { EndLabel = Symbol; } /// Gets the symbol pointing to the end of the BB in the output binary. - MCSymbol *getEndLabel() const { + const MCSymbol *getEndLabel() const { return EndLabel; } diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index e62f2c70b02b..4dc2ea4c7357 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -143,7 +143,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, bool PrintInstructions) const { StringRef SectionName; Section.getName(SectionName); - OS << "Binary Function \"" << getName() << "\" " << Annotation << " {"; + OS << "Binary Function \"" << *this << "\" " << Annotation << " {"; if (Names.size() > 1) { OS << "\n Other names : "; auto Sep = ""; @@ -323,7 +323,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, if (FrameInstructions.empty()) OS << " \n"; - OS << "End of Function \"" << getName() << "\"\n\n"; + OS << "End of Function \"" << *this << "\"\n\n"; } bool BinaryFunction::disassemble(ArrayRef FunctionData) { @@ -340,30 +340,30 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { Labels[0] = Ctx->createTempSymbol("BB0", false); auto handleRIPOperand = - [&](MCInst &Instruction, uint64_t Address, uint64_t Size) -> bool { - uint64_t TargetAddress{0}; - MCSymbol 
*TargetSymbol{nullptr}; - if (!BC.MIA->evaluateRIPOperand(Instruction, Address, Size, - TargetAddress)) { - DEBUG(dbgs() << "BOLT: rip-relative operand can't be evaluated:\n"; - BC.InstPrinter->printInst(&Instruction, dbgs(), "", *BC.STI); - dbgs() << '\n'; - Instruction.dump_pretty(dbgs(), BC.InstPrinter.get()); - dbgs() << '\n';); - return false; - } - // FIXME: check that the address is in data, not in code. - if (TargetAddress == 0) { - errs() << "BOLT-WARNING: rip-relative operand is zero in function " - << getName() << ". Ignoring function.\n"; - return false; - } - TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, "DATAat"); - BC.MIA->replaceRIPOperandDisp( - Instruction, MCOperand::createExpr(MCSymbolRefExpr::create( - TargetSymbol, MCSymbolRefExpr::VK_None, *BC.Ctx))); - return true; - }; + [&](MCInst &Instruction, uint64_t Address, uint64_t Size) { + uint64_t TargetAddress{0}; + MCSymbol *TargetSymbol{nullptr}; + if (!BC.MIA->evaluateRIPOperand(Instruction, Address, Size, + TargetAddress)) { + DEBUG(dbgs() << "BOLT: rip-relative operand can't be evaluated:\n"; + BC.InstPrinter->printInst(&Instruction, dbgs(), "", *BC.STI); + dbgs() << '\n'; + Instruction.dump_pretty(dbgs(), BC.InstPrinter.get()); + dbgs() << '\n';); + return false; + } + // FIXME: check that the address is in data, not in code. + if (TargetAddress == 0) { + errs() << "BOLT-WARNING: rip-relative operand is zero in function " + << *this << ". 
Ignoring function.\n"; + return false; + } + TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, "DATAat"); + BC.MIA->replaceRIPOperandDisp( + Instruction, MCOperand::createExpr(MCSymbolRefExpr::create( + TargetSymbol, MCSymbolRefExpr::VK_None, *BC.Ctx))); + return true; + }; bool IsSimple = true; for (uint64_t Offset = 0; IsSimple && (Offset < getSize()); ) { @@ -381,7 +381,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { errs() << "BOLT-WARNING: unable to disassemble instruction at offset 0x" << Twine::utohexstr(Offset) << " (address 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ") in function " - << getName() << '\n'; + << *this << '\n'; IsSimple = false; break; } @@ -408,12 +408,12 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { if (IsCall && containsAddress(InstructionTarget)) { if (InstructionTarget == getAddress()) { // Recursive call. - TargetSymbol = Ctx->getOrCreateSymbol(getName()); + TargetSymbol = getSymbol(); } else { // Possibly an old-style PIC code errs() << "BOLT: internal call detected at 0x" << Twine::utohexstr(AbsoluteInstrAddr) - << " in function " << getName() << ". Skipping.\n"; + << " in function " << *this << ". Skipping.\n"; IsSimple = false; } } @@ -448,12 +448,13 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { if (!MIA->convertJmpToTailCall(Instruction)) { assert(IsCondBranch && "unknown tail call instruction"); errs() << "BOLT-WARNING: conditional tail call detected in " - << "function " << getName() << " at 0x" + << "function " << *this << " at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ".\n"; } // TODO: A better way to do this would be using annotations for // MCInst objects. - TailCallOffsets.emplace(std::make_pair(Offset, InstructionTarget)); + TailCallOffsets.emplace(std::make_pair(Offset, + InstructionTarget)); IsCall = true; } @@ -464,7 +465,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // from the libraries. 
In reality more often than not it is // unreachable code, but we don't know it and have to emit calls // to 0 which make LLVM JIT unhappy. - errs() << "BOLT-WARNING: Function " << getName() + errs() << "BOLT-WARNING: Function " << *this << " has a call to address zero. Ignoring function.\n"; IsSimple = false; } @@ -491,7 +492,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { if (MIA->isIndirectBranch(Instruction)) { DEBUG(dbgs() << "BOLT-WARNING: indirect branch detected at 0x" << Twine::utohexstr(AbsoluteInstrAddr) - << ". Skipping function " << getName() << ".\n"); + << ". Skipping function " << *this << ".\n"); IsSimple = false; } // Indirect call. We only need to fix it if the operand is RIP-relative @@ -499,7 +500,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { errs() << "BOLT-WARNING: cannot handle RIP operand at 0x" << Twine::utohexstr(AbsoluteInstrAddr) - << ". Skipping function " << getName() << ".\n"; + << ". Skipping function " << *this << ".\n"; IsSimple = false; } } @@ -509,7 +510,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { errs() << "BOLT-WARNING: cannot handle RIP operand at 0x" << Twine::utohexstr(AbsoluteInstrAddr) - << ". Skipping function " << getName() << ".\n"; + << ". 
Skipping function " << *this << ".\n"; IsSimple = false; } } @@ -552,7 +553,7 @@ void BinaryFunction::addLandingPads(const unsigned StartIndex, const unsigned NumBlocks) { for (auto *BB : BasicBlocks) { if (LandingPads.find(BB->getLabel()) != LandingPads.end()) { - MCSymbol *LP = BB->getLabel(); + const MCSymbol *LP = BB->getLabel(); for (unsigned I : LPToBBIndex[LP]) { assert(I < BasicBlocks.size()); BinaryBasicBlock *ThrowBB = BasicBlocks[I]; @@ -595,7 +596,7 @@ bool BinaryFunction::buildCFG() { auto BranchDataOrErr = BC.DR.getFuncBranchData(getNames()); if (!BranchDataOrErr) { - DEBUG(dbgs() << "no branch data found for \"" << getName() << "\"\n"); + DEBUG(dbgs() << "no branch data found for \"" << *this << "\"\n"); } else { ExecutionCount = BranchDataOrErr->ExecutionCount; } @@ -1000,7 +1001,7 @@ void BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { << format("%.1f%%", ProfileMatchRatio * 100.0f) << " (" << (LocalProfileBranches.size() - OrphanBranches.size()) << '/' << LocalProfileBranches.size() << ") for function " - << getName() << '\n'; + << *this << '\n'; DEBUG( for (auto &OBranch : OrphanBranches) errs() << "\t0x" << Twine::utohexstr(OBranch.first) << " -> 0x" @@ -1270,7 +1271,7 @@ void BinaryFunction::annotateCFIState() { bool BinaryFunction::fixCFIState() { auto Sep = ""; DEBUG(dbgs() << "Trying to fix CFI states for each BB after reordering.\n"); - DEBUG(dbgs() << "This is the list of CFI states for each BB of " << getName() + DEBUG(dbgs() << "This is the list of CFI states for each BB of " << *this << ": "); auto replayCFIInstrs = @@ -1301,7 +1302,7 @@ bool BinaryFunction::fixCFIState() { if (NestedLevel != 0) { errs() << "BOLT-WARNING: CFI rewriter detected nested CFI state while" << " replaying CFI instructions for BB " << InBB->getName() - << " in function " << getName() << '\n'; + << " in function " << *this << '\n'; return false; } @@ -1379,7 +1380,7 @@ bool BinaryFunction::fixCFIState() { if (StackOffset != 0) { errs() << 
" BOLT-WARNING: not possible to remember/recover state" << " without corrupting CFI state stack in function " - << getName() << "\n"; + << *this << "\n"; return false; } } else if (BBCFIState[BBIndex] > State) { @@ -1416,11 +1417,11 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool MinBranchClusters, } else if (BasicBlocksLayout.size() <= FUNC_SIZE_THRESHOLD) { // Work on optimal solution if problem is small enough - DEBUG(dbgs() << "finding optimal block layout for " << getName() << "\n"); + DEBUG(dbgs() << "finding optimal block layout for " << *this << "\n"); Algo.reset(new OptimalReorderAlgorithm()); } else { - DEBUG(dbgs() << "running block layout heuristics on " << getName() << "\n"); + DEBUG(dbgs() << "running block layout heuristics on " << *this << "\n"); std::unique_ptr CAlgo; if (MinBranchClusters) @@ -1482,7 +1483,7 @@ std::string constructFilename(std::string Filename, } void BinaryFunction::dumpGraph(raw_ostream& OS) const { - OS << "strict digraph \"" << getName() << "\" {\n"; + OS << "strict digraph \"" << *this << "\" {\n"; for (auto *BB : BasicBlocks) { for (auto *Succ : BB->successors()) { OS << "\"" << BB->getName() << "\" -> " @@ -1511,7 +1512,7 @@ void BinaryFunction::viewGraph() const { } void BinaryFunction::dumpGraphForPass(std::string Annotation) const { - auto Filename = constructFilename(getName(), Annotation, ".dot"); + auto Filename = constructFilename(getPrintName(), Annotation, ".dot"); dbgs() << "BOLT-DEBUG: Dumping CFG to " << Filename << "\n"; dumpGraphToFile(Filename); } @@ -1607,7 +1608,7 @@ void BinaryFunction::fixBranches() { // invert this conditional branch logic so we can make this a fallthrough. 
if (TBB == FT && !HotColdBorder) { if (OldFT == nullptr) { - errs() << "BOLT-ERROR: malformed CFG for function " << getName() + errs() << "BOLT-ERROR: malformed CFG for function " << *this << " in basic block " << BB->getName() << '\n'; } assert(OldFT != nullptr && "malformed CFG"); @@ -2136,8 +2137,8 @@ bool BinaryFunction::isIdenticalWith(const BinaryFunction &BF) const { } if (PseudosDiffer) { - errs() << "BOLT-WARNING: functions " << getName() << " and "; - errs() << BF.getName() << " are identical, but have different"; + errs() << "BOLT-WARNING: functions " << *this << " and "; + errs() << BF << " are identical, but have different"; errs() << " pseudo instruction sequences.\n"; } @@ -2308,7 +2309,7 @@ void BinaryFunction::calculateLoopInfo() { } void BinaryFunction::printLoopInfo(raw_ostream &OS) const { - OS << "Loop Info for Function \"" << getName() << "\""; + OS << "Loop Info for Function \"" << *this << "\""; if (hasValidProfile()) { OS << " (count: " << getExecutionCount() << ")"; } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index e6c8bbbe3a7a..e0213265fc85 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -139,7 +139,7 @@ class BinaryFunction : public AddressRangesOwner { /// Alignment requirements for the function. uint64_t Alignment{1}; - MCSymbol *PersonalityFunction{nullptr}; + const MCSymbol *PersonalityFunction{nullptr}; uint8_t PersonalityEncoding{dwarf::DW_EH_PE_sdata4 | dwarf::DW_EH_PE_pcrel}; BinaryContext &BC; @@ -183,7 +183,7 @@ class BinaryFunction : public AddressRangesOwner { uint64_t LSDAAddress{0}; /// Landing pads for the function. - std::set LandingPads; + std::set LandingPads; /// Associated DIEs in the .debug_info section with their respective CUs. /// There can be multiple because of identical code folding. @@ -340,7 +340,7 @@ class BinaryFunction : public AddressRangesOwner { std::vector BBCFIState; /// Symbol in the output. 
- const MCSymbol *OutputSymbol; + MCSymbol *OutputSymbol; /// Symbol at the end of the function. MCSymbol *FunctionEndLabel{nullptr}; @@ -457,7 +457,9 @@ class BinaryFunction : public AddressRangesOwner { Names({Name}), Symbol(Symbol), Section(Section), Address(Address), IdenticalFunctionAddress(Address), Size(Size), BC(BC), IsSimple(IsSimple), CodeSectionName(".text." + Name), FunctionNumber(++Count) - {} + { + OutputSymbol = BC.Ctx->getOrCreateSymbol(Name); + } /// Modify code layout making necessary adjustments to instructions at the /// end of basic blocks. @@ -531,7 +533,7 @@ class BinaryFunction : public AddressRangesOwner { /// /// We pick the last name from the list to match the name of the function /// in profile data for easier manual analysis. - std::string getName() const { + std::string getPrintName() const { return Names.size() == 1 ? Names.back() : (Names.back() + "(*" + std::to_string(Names.size()) + ")"); @@ -580,8 +582,15 @@ class BinaryFunction : public AddressRangesOwner { return MaxSize; } - /// Return MC symbol associtated with the function in the output object. - const MCSymbol *getOutputSymbol() const { + /// Return MC symbol associtated with the function. + /// All references to the function should use this symbol. + MCSymbol *getSymbol() { + return OutputSymbol; + } + + /// Return MC symbol associtated with the function (const version). + /// All references to the function should use this symbol. 
+ const MCSymbol *getSymbol() const { return OutputSymbol; } @@ -620,7 +629,7 @@ class BinaryFunction : public AddressRangesOwner { return UsesGnuArgsSize; } - MCSymbol *getPersonalityFunction() const { + const MCSymbol *getPersonalityFunction() const { return PersonalityFunction; } @@ -821,11 +830,6 @@ class BinaryFunction : public AddressRangesOwner { return *this; } - BinaryFunction &setOutputSymbol(const MCSymbol *Symbol) { - OutputSymbol = Symbol; - return *this; - } - BinaryFunction &setSimple(bool Simple) { IsSimple = Simple; return *this; @@ -1080,19 +1084,16 @@ class BinaryFunction : public AddressRangesOwner { uint64_t ImageAddress{0}; uint64_t ImageSize{0}; uint64_t FileOffset{0}; - const MCSymbol *OutputSymbol{nullptr}; public: uint64_t getAddress() const { return Address; } uint64_t getImageAddress() const { return ImageAddress; } uint64_t getImageSize() const { return ImageSize; } uint64_t getFileOffset() const { return FileOffset; } - const MCSymbol *getOutputSymbol() const { return OutputSymbol; } void setAddress(uint64_t VAddress) { Address = VAddress; } void setImageAddress(uint64_t Address) { ImageAddress = Address; } void setImageSize(uint64_t Size) { ImageSize = Size; } void setFileOffset(uint64_t Offset) { FileOffset = Offset; } - void setOutputSymbol(const MCSymbol *Symbol) { OutputSymbol = Symbol; } }; /// Cold fragment of the function. 
@@ -1103,6 +1104,12 @@ class BinaryFunction : public AddressRangesOwner { const FragmentInfo &cold() const { return ColdFragment; } }; +inline raw_ostream &operator<<(raw_ostream &OS, + const BinaryFunction &Function) { + OS << Function.getPrintName(); + return OS; +} + inline raw_ostream &operator<<(raw_ostream &OS, const BinaryFunction::State State) { switch (State) { diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index 53e8d8b6ca20..1d2b4e597991 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -73,24 +73,24 @@ void OptimizeBodylessFunctions::analyze( auto &BB = *BF.begin(); const auto &FirstInst = *BB.begin(); - if (!BC.MIA->isTailCall(FirstInst)) return; - auto &Op1 = FirstInst.getOperand(0); if (!Op1.isExpr()) return; + auto Expr = dyn_cast(Op1.getExpr()); + if (!Expr) + return; + auto AddressIt = BC.GlobalSymbols.find(Expr->getSymbol().getName()); + if (AddressIt == BC.GlobalSymbols.end()) + return; + auto CalleeIt = BFs.find(AddressIt->second); + if (CalleeIt == BFs.end()) + return; - if (auto Expr = dyn_cast(Op1.getExpr())) { - auto AddressIt = BC.GlobalSymbols.find(Expr->getSymbol().getName()); - if (AddressIt != BC.GlobalSymbols.end()) { - auto CalleeIt = BFs.find(AddressIt->second); - if (CalleeIt != BFs.end()) { - assert(Expr->getSymbol().getName() == CalleeIt->second.getName()); - EquivalentCallTarget[BF.getName()] = &CalleeIt->second; - } - } - } + assert(&Expr->getSymbol() == CalleeIt->second.getSymbol()); + + EquivalentCallTarget[BF.getSymbol()->getName()] = &CalleeIt->second; } void OptimizeBodylessFunctions::optimizeCalls(BinaryFunction &BF, @@ -99,31 +99,30 @@ void OptimizeBodylessFunctions::optimizeCalls(BinaryFunction &BF, for (auto InstIt = (*BBIt).begin(), InstEnd = (*BBIt).end(); InstIt != InstEnd; ++InstIt) { auto &Inst = *InstIt; - if (BC.MIA->isCall(Inst)) { - auto &Op1 = Inst.getOperand(0); - if (Op1.isExpr()) { - if (auto Expr = dyn_cast(Op1.getExpr())) { - auto OriginalTarget = Expr->getSymbol().getName(); 
- auto Target = OriginalTarget; - // Iteratively update target since we could have f1() calling f2() - // calling f3() calling f4() and we want to output f1() directly - // calling f4(). - while (EquivalentCallTarget.count(Target)) { - Target = EquivalentCallTarget.find(Target)->second->getName(); - } - if (Target != OriginalTarget) { - DEBUG(errs() << "BOLT-DEBUG: Optimizing " << BF.getName() - << ": replacing call to " - << OriginalTarget - << " by call to " << Target << "\n"); - Inst.clear(); - Inst.addOperand(MCOperand::createExpr( - MCSymbolRefExpr::create( - BC.Ctx->getOrCreateSymbol(Target), *BC.Ctx))); - } - } - } + if (!BC.MIA->isCall(Inst)) + continue; + auto &Op1 = Inst.getOperand(0); + if (!Op1.isExpr()) + continue; + auto Expr = dyn_cast(Op1.getExpr()); + if (!Expr) + continue; + auto *OriginalTarget = &Expr->getSymbol(); + auto *Target = OriginalTarget; + // Iteratively update target since we could have f1() calling f2() + // calling f3() calling f4() and we want to output f1() directly + // calling f4(). 
+ while (EquivalentCallTarget.count(Target->getName())) { + Target = + EquivalentCallTarget.find(Target->getName())->second->getSymbol(); } + if (Target == OriginalTarget) + continue; + DEBUG(errs() << "BOLT-DEBUG: Optimizing " << (*BBIt).getName() + << " in " << BF + << ": replacing call to " << OriginalTarget->getName() + << " by call to " << Target->getName() << "\n"); + BC.MIA->replaceCallTargetOperand(Inst, Target, BC.Ctx.get()); } } } @@ -164,7 +163,7 @@ void InlineSmallFunctions::findInliningCandidates( BB.size() <= kMaxInstructions && BC.MIA->isReturn(LastInstruction) && !BC.MIA->isTailCall(LastInstruction)) { - InliningCandidates.insert(Function.getName()); + InliningCandidates.insert(Function.getSymbol()->getName()); } } @@ -186,7 +185,7 @@ void InlineSmallFunctions::findInliningCandidatesAggressive( const auto &Function = BFIt.second; if (!Function.isSimple() || !opts::shouldProcess(Function) || - OverwrittenFunctions.count(Function.getName()) || + OverwrittenFunctions.count(Function.getSymbol()->getName()) || Function.hasEHRanges()) continue; uint64_t FunctionSize = 0; @@ -206,7 +205,7 @@ void InlineSmallFunctions::findInliningCandidatesAggressive( } } if (!FoundCFI) - InliningCandidates.insert(Function.getName()); + InliningCandidates.insert(Function.getSymbol()->getName()); } DEBUG(errs() << "BOLT-DEBUG: " << InliningCandidates.size() @@ -573,7 +572,7 @@ bool InlineSmallFunctions::inlineCallsInFunction( if (FunctionIt != FunctionByName.end()) { auto &TargetFunction = *FunctionIt->second; bool CallToInlineableFunction = - InliningCandidates.count(TargetFunction.getName()); + InliningCandidates.count(TargetFunction.getSymbol()->getName()); totalInlineableCalls += CallToInlineableFunction * BB->getExecutionCount(); @@ -585,8 +584,8 @@ bool InlineSmallFunctions::inlineCallsInFunction( inlineCall(BC, *BB, &Inst, *TargetFunction.begin()); DidInlining = true; DEBUG(errs() << "BOLT-DEBUG: Inlining call to " - << TargetFunction.getName() << " in " - << 
Function.getName() << "\n"); + << TargetFunction << " in " + << Function << "\n"); InstIt = NextInstIt; ExtraSize += TargetFunction.getSize(); inlinedDynamicCalls += BB->getExecutionCount(); @@ -642,7 +641,7 @@ bool InlineSmallFunctions::inlineCallsInFunctionAggressive( if (FunctionIt != FunctionByName.end()) { auto &TargetFunction = *FunctionIt->second; bool CallToInlineableFunction = - InliningCandidates.count(TargetFunction.getName()); + InliningCandidates.count(TargetFunction.getSymbol()->getName()); totalInlineableCalls += CallToInlineableFunction * BB->getExecutionCount(); @@ -656,8 +655,8 @@ bool InlineSmallFunctions::inlineCallsInFunctionAggressive( inlineCall(BC, Function, BB, InstIndex, TargetFunction); DidInlining = true; DEBUG(errs() << "BOLT-DEBUG: Inlining call to " - << TargetFunction.getName() << " in " - << Function.getName() << "\n"); + << TargetFunction << " in " + << Function << "\n"); InstIndex = NextBB == BB ? NextInstIndex : BB->size(); InstIt = NextBB == BB ? BB->begin() + NextInstIndex : BB->end(); ExtraSize += TargetFunction.getSize(); @@ -680,7 +679,7 @@ void InlineSmallFunctions::runOnFunctions( std::map &BFs, std::set &) { for (auto &It : BFs) { - FunctionByName[It.second.getName()] = &It.second; + FunctionByName[It.second.getSymbol()->getName()] = &It.second; } findInliningCandidates(BC, BFs); @@ -749,8 +748,7 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) { auto Count = Function.eraseDeadBBs(Reachable); if (Count) { DEBUG(dbgs() << "BOLT: Removed " << Count - << " dead basic block(s) in function " - << Function.getName() << '\n'); + << " dead basic block(s) in function " << Function << '\n'); } if (opts::PrintAll || opts::PrintUCE) @@ -818,7 +816,7 @@ void FixupFunctions::runOnFunctions( // Fix the CFI state. if (!Function.fixCFIState()) { errs() << "BOLT-WARNING: unable to fix CFI state for function " - << Function.getName() << ". Skipping.\n"; + << Function << ". 
Skipping.\n"; Function.setSimple(false); continue; } @@ -882,7 +880,8 @@ bool SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, // Lookup the address for the current function and // the tail call target. - auto const FnAddress = BC.GlobalSymbols.find(BF.getName()); + auto const FnAddress = + BC.GlobalSymbols.find(BF.getSymbol()->getName()); auto const TailAddress = BC.GlobalSymbols.find(TailTarget.getName()); if (FnAddress == BC.GlobalSymbols.end() || TailAddress == BC.GlobalSymbols.end()) { @@ -908,7 +907,7 @@ bool SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, // if there are no other users. BB->removeSuccessor(CondTargetBB); DEBUG(dbgs() << "patched " << (isForward ? "(fwd)" : "(back)") - << " tail call in " << BF.getName() << ".\n";); + << " tail call in " << BF << ".\n";); } } } @@ -918,7 +917,7 @@ bool SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, DEBUG(dbgs() << "BOLT: patched " << NumLocalPatchedTailCalls << " tail calls (" << NumOrigForwardBranches << " forward)" << " from a total of " << NumLocalTailCalls - << " in function " << BF.getName() << "\n";); + << " in function " << BF << "\n";); return NumLocalPatchedTailCalls > 0; } @@ -1016,7 +1015,7 @@ bool SimplifyRODataLoads::simplifyRODataLoads( // Look up the symbol address in the global symbols map of the binary // context object. - auto GI = BC.GlobalSymbols.find(DisplSymbol.getName().str()); + auto GI = BC.GlobalSymbols.find(DisplSymbol.getName()); if (GI == BC.GlobalSymbols.end()) continue; TargetAddress = GI->second; @@ -1179,12 +1178,10 @@ void IdenticalCodeFolding::foldFunction( MCInst &CallInst = CallBB->getInstructionAtIndex(CS.InstrIndex); // Replace call target with BFToReplaceWith. 
- MCOperand CallTargetOp = - MCOperand::createExpr( - MCSymbolRefExpr::create( - SymbolToReplaceWith, MCSymbolRefExpr::VK_None, *BC.Ctx)); - assert(BC.MIA->replaceCallTargetOperand(CallInst, CallTargetOp) && - "unexpected call target prevented the replacement"); + auto Success = BC.MIA->replaceCallTargetOperand(CallInst, + SymbolToReplaceWith, + BC.Ctx.get()); + assert(Success && "unexpected call target prevented the replacement"); // Add this call site to the callers of BFToReplaceWith. BFToReplaceWithCallers.emplace_back(CS); diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp index 0a58e789d7de..67e6f2a3d3f0 100644 --- a/bolt/DWARFRewriter.cpp +++ b/bolt/DWARFRewriter.cpp @@ -271,7 +271,7 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { OutputLineTable.addLineEntry(MCLineEntry{nullptr, Loc}, FunctionSection); } else { - DEBUG(errs() << "BOLT-DEBUG: Function " << Function.getName() + DEBUG(errs() << "BOLT-DEBUG: Function " << Function << " has no associated line number information.\n"); } } diff --git a/bolt/DebugData.cpp b/bolt/DebugData.cpp index b1462fc284c6..42e13b8d1f84 100644 --- a/bolt/DebugData.cpp +++ b/bolt/DebugData.cpp @@ -30,7 +30,7 @@ void BasicBlockOffsetRanges::addAddressRange(BinaryFunction &Function, BeginAddress - Function.getAddress()); if (!FirstBB) { errs() << "BOLT-WARNING: no basic blocks in function " - << Function.getName() << " intersect with debug range [0x" + << Function << " intersect with debug range [0x" << Twine::utohexstr(BeginAddress) << ", 0x" << Twine::utohexstr(EndAddress) << ")\n"; return; diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 77a9d011dd92..64154296500f 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -122,7 +122,7 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, if (opts::PrintExceptions) { errs() << "[LSDA at 0x" << Twine::utohexstr(getLSDAAddress()) - << " for function " << getName() << "]:\n"; + << " for function " << *this << "]:\n"; errs() << 
"LPStart Encoding = " << (unsigned)LPStartEncoding << '\n'; errs() << "LPStart = 0x" << Twine::utohexstr(LPStart) << '\n'; errs() << "TType Encoding = " << (unsigned)TTypeEncoding << '\n'; @@ -173,7 +173,7 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, if (Instructions.find(LandingPad) == Instructions.end()) { errs() << "BOLT-WARNING: landing pad " << Twine::utohexstr(LandingPad) << " not pointing to an instruction in function " - << getName() << " - ignoring.\n"; + << *this << " - ignoring.\n"; } else { auto Label = Labels.find(LandingPad); if (Label != Labels.end()) { @@ -331,7 +331,7 @@ void BinaryFunction::updateEHRanges() { continue; // Same symbol is used for the beginning and the end of the range. - MCSymbol *EHSymbol{nullptr}; + const MCSymbol *EHSymbol{nullptr}; if (BB->isCold()) { // If we see a label in the cold block, it means we have to close // the range using function end symbol. @@ -458,13 +458,13 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer) { assert(BeginLabel && "start EH label expected"); assert(EndLabel && "end EH label expected"); - Streamer->emitAbsoluteSymbolDiff(BeginLabel, getOutputSymbol(), 4); + Streamer->emitAbsoluteSymbolDiff(BeginLabel, getSymbol(), 4); Streamer->emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 4); if (!CallSite.LP) { Streamer->EmitIntValue(0, 4); } else { - Streamer->emitAbsoluteSymbolDiff(CallSite.LP, getOutputSymbol(), 4); + Streamer->emitAbsoluteSymbolDiff(CallSite.LP, getSymbol(), 4); } Streamer->EmitULEB128IntValue(CallSite.Action); @@ -495,7 +495,7 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { const FDE &CurFDE = *I->second; if (Function.getSize() != CurFDE.getAddressRange()) { errs() << "BOLT-WARNING: CFI information size mismatch for function \"" - << Function.getName() << "\"" + << Function << "\"" << format(": Function size is %dB, CFI covers " "%dB\n", Function.getSize(), CurFDE.getAddressRange()); diff --git a/bolt/RewriteInstance.cpp 
b/bolt/RewriteInstance.cpp index 255514449e23..4141a488628d 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -605,7 +605,7 @@ void RewriteInstance::run() { auto FunctionIt = BinaryFunctions.find(Address); assert(FunctionIt != BinaryFunctions.end() && "Invalid large function address."); - errs() << "BOLT-WARNING: Function " << FunctionIt->second.getName() + errs() << "BOLT-WARNING: Function " << FunctionIt->second << " is larger than its orginal size: emitting again marking it " << "as not simple.\n"; FunctionIt->second.setSimple(false); @@ -787,7 +787,7 @@ void RewriteInstance::discoverFileObjects() { if (SymbolSize != BFI->second.getSize()) { errs() << "BOLT-WARNING: size mismatch for duplicate entries " << UniqueName << ':' << SymbolSize << " and " - << BFI->second.getName() << ':' << BFI->second.getSize() << '\n'; + << BFI->second << ':' << BFI->second.getSize() << '\n'; } BFI->second.addAlternativeName(UniqueName); } else { @@ -810,16 +810,6 @@ void RewriteInstance::discoverFileObjects() { "wish to proceed, use -allow-stripped option.\n"; exit(1); } - - // Register the final names of functions with multiple names with BinaryContext - // data structures. - for (auto &BFI : BinaryFunctions) { - uint64_t Address = BFI.first; - const BinaryFunction &BF = BFI.second; - auto AI = BC->GlobalSymbols.find(BF.getName()); - if (AI == BC->GlobalSymbols.end()) - BC->registerNameAtAddress(BF.getName(), Address); - } } void RewriteInstance::readSpecialSections() { @@ -889,7 +879,7 @@ void RewriteInstance::disassembleFunctions() { if (!opts::shouldProcess(Function)) { DEBUG(dbgs() << "BOLT: skipping processing function " - << Function.getName() << " per user request.\n"); + << Function << " per user request.\n"); continue; } @@ -901,7 +891,7 @@ void RewriteInstance::disassembleFunctions() { if (!Section.isText() || Section.isVirtual() || !Section.getSize()) { // When could it happen? 
errs() << "BOLT: corresponding section is non-executable or empty " - << "for function " << Function.getName(); + << "for function " << Function; continue; } @@ -920,7 +910,7 @@ void RewriteInstance::disassembleFunctions() { uint64_t SectionEnd = Function.getSection().getAddress() + Function.getSection().getSize(); if (SectionEnd > SymRefI->first) { - errs() << "BOLT-WARNING: symbol after " << Function.getName() + errs() << "BOLT-WARNING: symbol after " << Function << " should not be in the same section.\n"; MaxSize = 0; } else { @@ -930,7 +920,7 @@ void RewriteInstance::disassembleFunctions() { if (MaxSize < Function.getSize()) { errs() << "BOLT-WARNING: symbol seen in the middle of the function " - << Function.getName() << ". Skipping.\n"; + << Function << ". Skipping.\n"; Function.setSimple(false); continue; } @@ -969,7 +959,7 @@ void RewriteInstance::disassembleFunctions() { if (EHFrame->ParseError.empty()) { if (!CFIRdWrt->fillCFIInfoFor(Function)) { errs() << "BOLT-WARNING: unable to fill CFI for function " - << Function.getName() << '\n'; + << Function << '\n'; Function.setSimple(false); continue; } @@ -1010,7 +1000,7 @@ void RewriteInstance::disassembleFunctions() { uint64_t Offset = Addr - I->first; if (Offset == 0 || Offset >= Func.getSize()) continue; - errs() << "BOLT-WARNING: Function " << Func.getName() + errs() << "BOLT-WARNING: Function " << Func << " has internal BBs that are target of a branch located in " "another function. 
We will not process this function.\n"; Func.setSimple(false); @@ -1056,7 +1046,7 @@ void RewriteInstance::disassembleFunctions() { ); auto SFI = ProfiledFunctions.begin(); for (int i = 0; i < 100 && SFI != ProfiledFunctions.end(); ++SFI, ++i) { - errs() << " " << (*SFI)->getName() << " : " + errs() << " " << *SFI << " : " << (*SFI)->getExecutionCount() << '\n'; } } @@ -1147,17 +1137,12 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, Streamer.EmitCodeAlignment(Function.getAlignment()); - if (!EmitColdPart) { - MCSymbol *FunctionSymbol = BC.Ctx->getOrCreateSymbol(Function.getName()); - Streamer.EmitSymbolAttribute(FunctionSymbol, MCSA_ELF_TypeFunction); - Streamer.EmitLabel(FunctionSymbol); - Function.setOutputSymbol(FunctionSymbol); - } else { - MCSymbol *FunctionSymbol = - BC.Ctx->getOrCreateSymbol(Twine(Function.getName()).concat(".cold")); - Streamer.EmitSymbolAttribute(FunctionSymbol, MCSA_ELF_TypeFunction); - Streamer.EmitLabel(FunctionSymbol); - Function.cold().setOutputSymbol(FunctionSymbol); + // Emit all names the function is known under. + for (const auto &Name : Function.getNames()) { + Twine EmitName = EmitColdPart ? 
Twine(Name).concat(".cold") : Name; + auto *EmitSymbol = BC.Ctx->getOrCreateSymbol(EmitName); + Streamer.EmitSymbolAttribute(EmitSymbol, MCSA_ELF_TypeFunction); + Streamer.EmitLabel(EmitSymbol); } // Emit CFI start @@ -1348,7 +1333,7 @@ void RewriteInstance::emitFunctions() { continue; DEBUG(dbgs() << "BOLT: generating code for function \"" - << Function.getName() << "\" : " + << Function << "\" : " << Function.getFunctionNumber() << '\n'); emitFunction(*Streamer, Function, *BC.get(), /*EmitColdPart=*/false); @@ -1431,7 +1416,7 @@ void RewriteInstance::emitFunctions() { FailedAddresses.emplace_back(Function.getAddress()); } } else { - errs() << "BOLT: cannot remap function " << Function.getName() << "\n"; + errs() << "BOLT: cannot remap function " << Function << "\n"; FailedAddresses.emplace_back(Function.getAddress()); } @@ -1458,7 +1443,7 @@ void RewriteInstance::emitFunctions() { NextAvailableAddress += Function.cold().getImageSize(); } else { - errs() << "BOLT: cannot remap function " << Function.getName() << "\n"; + errs() << "BOLT: cannot remap function " << Function << "\n"; FailedAddresses.emplace_back(Function.getAddress()); } } @@ -1914,14 +1899,14 @@ void RewriteInstance::rewriteFile() { << Twine::utohexstr(Function.getImageSize()) << ") is larger than maximum allowed size (0x" << Twine::utohexstr(Function.getMaxSize()) - << ") for function " << Function.getName() << '\n'; + << ") for function " << Function << '\n'; FailedAddresses.emplace_back(Function.getAddress()); continue; } OverwrittenScore += Function.getFunctionScore(); // Overwrite function in the output file. 
- outs() << "BOLT: rewriting function \"" << Function.getName() << "\"\n"; + outs() << "BOLT: rewriting function \"" << Function << "\"\n"; Out->os().pwrite(reinterpret_cast(Function.getImageAddress()), Function.getImageSize(), Function.getFileOffset()); @@ -1943,8 +1928,7 @@ void RewriteInstance::rewriteFile() { } // Write cold part - outs() << "BOLT: rewriting function \"" << Function.getName() - << "\" (cold part)\n"; + outs() << "BOLT: rewriting function \"" << Function << "\" (cold part)\n"; Out->os().pwrite(reinterpret_cast(Function.cold().getImageAddress()), Function.cold().getImageSize(), Function.cold().getFileOffset()); From ada47d1af6a594d844fe2e1ec26b3a696d920a6d Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 11 Aug 2016 14:23:54 -0700 Subject: [PATCH 152/904] More refactoring work. Summary: Avoid referring to BinaryFunction's by name. Functions could be found by MCSymbol using BinaryContext::getFunctionForSymbol(). (cherry picked from commit 13199b033e6640f6c773fd7758d4265359bb7812) --- bolt/BinaryContext.h | 9 +++++ bolt/BinaryFunction.h | 37 +++++++++-------- bolt/BinaryPasses.cpp | 85 +++++++++++++++------------------------- bolt/BinaryPasses.h | 10 ++--- bolt/DebugData.h | 2 + bolt/RewriteInstance.cpp | 39 ++++++++++++------ bolt/RewriteInstance.h | 6 +++ 7 files changed, 98 insertions(+), 90 deletions(-) diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 211736122407..30a98116a1b6 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -39,6 +39,7 @@ #include #include #include +#include #include namespace llvm { @@ -65,6 +66,9 @@ class BinaryContext { /// [address] -> [name1], [name2], ... std::multimap GlobalAddresses; + /// [MCSymbol] -> [BinaryFunction] + std::unordered_map SymbolToFunctionMap; + /// Map virtual address to a section. 
std::map AllocatableSections; @@ -173,6 +177,11 @@ class BinaryContext { GlobalAddresses.emplace(std::make_pair(Address, Name)); } + const BinaryFunction *getFunctionForSymbol(const MCSymbol *Symbol) const { + auto BFI = SymbolToFunctionMap.find(Symbol); + return BFI == SymbolToFunctionMap.end() ? nullptr : BFI->second; + } + /// Populate some internal data structures with debug info. void preprocessDebugInfo( std::map &BinaryFunctions); diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index e0213265fc85..d76e0f3502d1 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -107,9 +107,6 @@ class BinaryFunction : public AddressRangesOwner { /// A list of function names. std::vector Names; - /// Symbol associated with this function in the input. - SymbolRef Symbol; - /// Containing section SectionRef Section; @@ -278,7 +275,8 @@ class BinaryFunction : public AddressRangesOwner { /// construction. Map from tail call terminated basic block to a struct with /// information about the tail call. struct TailCallInfo { - uint32_t Offset; // offset of the tail call from the function start + uint32_t Offset; // offset of the tail call from the function + // start uint32_t Index; // index of the tail call in the basic block uint64_t TargetAddress; // address of the callee uint64_t Count{0}; // taken count from profile data @@ -366,8 +364,24 @@ class BinaryFunction : public AddressRangesOwner { Itr itr; }; + BinaryFunction& operator=(const BinaryFunction &) = delete; + BinaryFunction(const BinaryFunction &) = delete; + + friend class RewriteInstance; + + /// Creation should be handled by RewriteInstance::createBinaryFunction(). + BinaryFunction(const std::string &Name, SectionRef Section, uint64_t Address, + uint64_t Size, BinaryContext &BC, bool IsSimple) : + Names({Name}), Section(Section), Address(Address), + IdenticalFunctionAddress(Address), Size(Size), BC(BC), IsSimple(IsSimple), + CodeSectionName(".text." 
+ Name), FunctionNumber(++Count) { + OutputSymbol = BC.Ctx->getOrCreateSymbol(Name); + } + public: + BinaryFunction(BinaryFunction &&) = default; + typedef Iterator iterator; typedef Iterator const_iterator; @@ -446,21 +460,6 @@ class BinaryFunction : public AddressRangesOwner { return iterator_range(cie_begin(), cie_end()); } - BinaryFunction& operator=(const BinaryFunction &) = delete; - BinaryFunction(const BinaryFunction &) = delete; - - BinaryFunction(BinaryFunction &&) = default; - - BinaryFunction(const std::string &Name, SymbolRef Symbol, SectionRef Section, - uint64_t Address, uint64_t Size, BinaryContext &BC, - bool IsSimple = true) : - Names({Name}), Symbol(Symbol), Section(Section), Address(Address), - IdenticalFunctionAddress(Address), Size(Size), BC(BC), IsSimple(IsSimple), - CodeSectionName(".text." + Name), FunctionNumber(++Count) - { - OutputSymbol = BC.Ctx->getOrCreateSymbol(Name); - } - /// Modify code layout making necessary adjustments to instructions at the /// end of basic blocks. 
void modifyLayout(LayoutType Type, bool MinBranchClusters, bool Split); diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index 1d2b4e597991..15ef38c51cf4 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -81,16 +81,11 @@ void OptimizeBodylessFunctions::analyze( auto Expr = dyn_cast(Op1.getExpr()); if (!Expr) return; - auto AddressIt = BC.GlobalSymbols.find(Expr->getSymbol().getName()); - if (AddressIt == BC.GlobalSymbols.end()) + const auto *Function = BC.getFunctionForSymbol(&Expr->getSymbol()); + if (!Function) return; - auto CalleeIt = BFs.find(AddressIt->second); - if (CalleeIt == BFs.end()) - return; - - assert(&Expr->getSymbol() == CalleeIt->second.getSymbol()); - EquivalentCallTarget[BF.getSymbol()->getName()] = &CalleeIt->second; + EquivalentCallTarget[BF.getSymbol()] = Function; } void OptimizeBodylessFunctions::optimizeCalls(BinaryFunction &BF, @@ -112,9 +107,8 @@ void OptimizeBodylessFunctions::optimizeCalls(BinaryFunction &BF, // Iteratively update target since we could have f1() calling f2() // calling f3() calling f4() and we want to output f1() directly // calling f4(). 
- while (EquivalentCallTarget.count(Target->getName())) { - Target = - EquivalentCallTarget.find(Target->getName())->second->getSymbol(); + while (EquivalentCallTarget.count(Target)) { + Target = EquivalentCallTarget.find(Target)->second->getSymbol(); } if (Target == OriginalTarget) continue; @@ -163,7 +157,7 @@ void InlineSmallFunctions::findInliningCandidates( BB.size() <= kMaxInstructions && BC.MIA->isReturn(LastInstruction) && !BC.MIA->isTailCall(LastInstruction)) { - InliningCandidates.insert(Function.getSymbol()->getName()); + InliningCandidates.insert(&Function); } } @@ -205,7 +199,7 @@ void InlineSmallFunctions::findInliningCandidatesAggressive( } } if (!FoundCFI) - InliningCandidates.insert(Function.getSymbol()->getName()); + InliningCandidates.insert(&Function); } DEBUG(errs() << "BOLT-DEBUG: " << InliningCandidates.size() @@ -568,26 +562,26 @@ bool InlineSmallFunctions::inlineCallsInFunction( auto Target = dyn_cast( Inst.getOperand(0).getExpr()); assert(Target && "Not MCSymbolRefExpr"); - auto FunctionIt = FunctionByName.find(Target->getSymbol().getName()); - if (FunctionIt != FunctionByName.end()) { - auto &TargetFunction = *FunctionIt->second; + const auto *TargetFunction = + BC.getFunctionForSymbol(&Target->getSymbol()); + if (TargetFunction) { bool CallToInlineableFunction = - InliningCandidates.count(TargetFunction.getSymbol()->getName()); + InliningCandidates.count(TargetFunction); totalInlineableCalls += CallToInlineableFunction * BB->getExecutionCount(); if (CallToInlineableFunction && - TargetFunction.getSize() + ExtraSize + TargetFunction->getSize() + ExtraSize + Function.estimateHotSize() < Function.getMaxSize()) { auto NextInstIt = std::next(InstIt); - inlineCall(BC, *BB, &Inst, *TargetFunction.begin()); + inlineCall(BC, *BB, &Inst, *TargetFunction->begin()); DidInlining = true; DEBUG(errs() << "BOLT-DEBUG: Inlining call to " - << TargetFunction << " in " + << *TargetFunction << " in " << Function << "\n"); InstIt = NextInstIt; - ExtraSize += 
TargetFunction.getSize(); + ExtraSize += TargetFunction->getSize(); inlinedDynamicCalls += BB->getExecutionCount(); continue; } @@ -637,29 +631,29 @@ bool InlineSmallFunctions::inlineCallsInFunctionAggressive( auto Target = dyn_cast( Inst.getOperand(0).getExpr()); assert(Target && "Not MCSymbolRefExpr"); - auto FunctionIt = FunctionByName.find(Target->getSymbol().getName()); - if (FunctionIt != FunctionByName.end()) { - auto &TargetFunction = *FunctionIt->second; + const auto *TargetFunction = + BC.getFunctionForSymbol(&Target->getSymbol()); + if (TargetFunction) { bool CallToInlineableFunction = - InliningCandidates.count(TargetFunction.getSymbol()->getName()); + InliningCandidates.count(TargetFunction); totalInlineableCalls += CallToInlineableFunction * BB->getExecutionCount(); if (CallToInlineableFunction && - TargetFunction.getSize() + ExtraSize + TargetFunction->getSize() + ExtraSize + Function.estimateHotSize() < Function.getMaxSize()) { unsigned NextInstIndex = 0; BinaryBasicBlock *NextBB = nullptr; std::tie(NextBB, NextInstIndex) = - inlineCall(BC, Function, BB, InstIndex, TargetFunction); + inlineCall(BC, Function, BB, InstIndex, *TargetFunction); DidInlining = true; DEBUG(errs() << "BOLT-DEBUG: Inlining call to " - << TargetFunction << " in " + << *TargetFunction << " in " << Function << "\n"); InstIndex = NextBB == BB ? NextInstIndex : BB->size(); InstIt = NextBB == BB ? 
BB->begin() + NextInstIndex : BB->end(); - ExtraSize += TargetFunction.getSize(); + ExtraSize += TargetFunction->getSize(); inlinedDynamicCalls += BB->getExecutionCount(); continue; } @@ -678,10 +672,6 @@ void InlineSmallFunctions::runOnFunctions( BinaryContext &BC, std::map &BFs, std::set &) { - for (auto &It : BFs) { - FunctionByName[It.second.getSymbol()->getName()] = &It.second; - } - findInliningCandidates(BC, BFs); std::vector ConsideredFunctions; @@ -878,26 +868,21 @@ bool SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, cast(Instr.getOperand(0).getExpr()); auto const &TailTarget = TailTargetSymExpr->getSymbol(); - // Lookup the address for the current function and - // the tail call target. - auto const FnAddress = - BC.GlobalSymbols.find(BF.getSymbol()->getName()); + // Lookup the address for the tail call target. auto const TailAddress = BC.GlobalSymbols.find(TailTarget.getName()); - if (FnAddress == BC.GlobalSymbols.end() || - TailAddress == BC.GlobalSymbols.end()) { + if (TailAddress == BC.GlobalSymbols.end()) continue; - } // Check to make sure we would be doing a forward jump. // This assumes the address range of the current BB and the // tail call target address don't overlap. - if (FnAddress->second < TailAddress->second) { + if (BF.getAddress() < TailAddress->second) { ++NumTailCallsPatched; ++NumLocalPatchedTailCalls; // Is the original jump forward or backward? const bool isForward = - TailAddress->second > FnAddress->second + BB->getOffset(); + TailAddress->second > BF.getAddress() + BB->getOffset(); if (isForward) ++NumOrigForwardBranches; @@ -1118,25 +1103,19 @@ void IdenticalCodeFolding::discoverCallers( } // Find the target function for this call. 
- const MCExpr *TargetExpr = TargetOp.getExpr(); + const auto *TargetExpr = TargetOp.getExpr(); assert(TargetExpr->getKind() == MCExpr::SymbolRef); - const MCSymbol &TargetSymbol = + const auto &TargetSymbol = dyn_cast(TargetExpr)->getSymbol(); - auto AI = BC.GlobalSymbols.find(TargetSymbol.getName()); - assert(AI != BC.GlobalSymbols.end()); - uint64_t TargetAddress = AI->second; - auto FI = BFs.find(TargetAddress); - if (FI == BFs.end()) { + const auto *Function = BC.getFunctionForSymbol(&TargetSymbol); + if (!Function) { // Call to a function without a BinaryFunction object. ++InstrIndex; continue; } - BinaryFunction *Callee = &FI->second; - // Insert a tuple in the Callers map. - Callers[Callee].emplace_back( + Callers[Function].emplace_back( CallSite(&Caller, BlockIndex, InstrIndex)); - ++InstrIndex; } } diff --git a/bolt/BinaryPasses.h b/bolt/BinaryPasses.h index 29e9c7e1e3a8..bd34fa7aa24b 100644 --- a/bolt/BinaryPasses.h +++ b/bolt/BinaryPasses.h @@ -19,6 +19,7 @@ #include #include #include +#include namespace llvm { namespace bolt { @@ -38,7 +39,8 @@ class OptimizeBodylessFunctions : public BinaryFunctionPass { private: /// EquivalentCallTarget[F] = G ==> function F is simply a tail call to G, /// thus calls to F can be optimized to calls to G. - std::map EquivalentCallTarget; + std::unordered_map + EquivalentCallTarget; void analyze(BinaryFunction &BF, BinaryContext &BC, @@ -58,9 +60,7 @@ class OptimizeBodylessFunctions : public BinaryFunctionPass { /// correctness and we may break exception handling because of this. class InlineSmallFunctions : public BinaryFunctionPass { private: - std::set InliningCandidates; - /// Maps function name to BinaryFunction. - std::map FunctionByName; + std::set InliningCandidates; /// Maximum number of instructions in an inlined function. 
static const unsigned kMaxInstructions = 8; @@ -226,7 +226,7 @@ class IdenticalCodeFolding : public BinaryFunctionPass { CallSite(BinaryFunction *Caller, unsigned BlockIndex, unsigned InstrIndex) : Caller(Caller), BlockIndex(BlockIndex), InstrIndex(InstrIndex) { } }; - using CallerMap = std::map>; + using CallerMap = std::map>; CallerMap Callers; /// Replaces all calls to BFTOFold with calls to BFToReplaceWith and merges diff --git a/bolt/DebugData.h b/bolt/DebugData.h index 53f90da124e2..21e7274a4d00 100644 --- a/bolt/DebugData.h +++ b/bolt/DebugData.h @@ -119,6 +119,8 @@ class BasicBlockOffsetRanges { class AddressRangesOwner { public: virtual void setAddressRangesOffset(uint32_t Offset) = 0; + + virtual ~AddressRangesOwner() {} }; /// Represents DWARF entities that have generic address ranges, maintaining diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 4141a488628d..60ba7a492d72 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -342,9 +342,11 @@ ExecutableFileMemoryManager::~ExecutableFileMemoryManager() { } } +namespace { + /// Create BinaryContext for a given architecture \p ArchName and /// triple \p TripleName. 
-static std::unique_ptr CreateBinaryContext( +std::unique_ptr createBinaryContext( std::string ArchName, std::string TripleName, const DataReader &DR, @@ -451,10 +453,12 @@ static std::unique_ptr CreateBinaryContext( return BC; } +} // namespace + RewriteInstance::RewriteInstance(ELFObjectFileBase *File, const DataReader &DR) : InputFile(File), - BC(CreateBinaryContext("x86-64", "x86_64-unknown-linux", DR, + BC(createBinaryContext("x86-64", "x86_64-unknown-linux", DR, std::unique_ptr(new DWARFContextInMemory(*InputFile)))) { } @@ -464,7 +468,7 @@ void RewriteInstance::reset() { BinaryFunctions.clear(); FileSymRefs.clear(); auto &DR = BC->DR; - BC = CreateBinaryContext("x86-64", "x86_64-unknown-linux", DR, + BC = createBinaryContext("x86-64", "x86_64-unknown-linux", DR, std::unique_ptr(new DWARFContextInMemory(*InputFile))); CFIRdWrt.reset(nullptr); SectionMM.reset(nullptr); @@ -780,26 +784,24 @@ void RewriteInstance::discoverFileObjects() { } } + BinaryFunction *BF{nullptr}; auto BFI = BinaryFunctions.find(Address); if (BFI != BinaryFunctions.end()) { + BF = &BFI->second; // Duplicate function name. Make sure everything matches before we add // an alternative name. - if (SymbolSize != BFI->second.getSize()) { + if (SymbolSize != BF->getSize()) { errs() << "BOLT-WARNING: size mismatch for duplicate entries " << UniqueName << ':' << SymbolSize << " and " - << BFI->second << ':' << BFI->second.getSize() << '\n'; + << *BF << ':' << BF->getSize() << '\n'; } - BFI->second.addAlternativeName(UniqueName); + BF->addAlternativeName(UniqueName); } else { - // Create the function and add it to the map. 
- auto Result = BinaryFunctions.emplace( - Address, - BinaryFunction(UniqueName, Symbol, *Section, Address, SymbolSize, - *BC, IsSimple)); - BFI = Result.first; + BF = createBinaryFunction(UniqueName, *Section, Address, SymbolSize, + IsSimple); } if (!AlternativeName.empty()) - BFI->second.addAlternativeName(AlternativeName); + BF->addAlternativeName(AlternativeName); } if (!SeenFileName && BC->DR.hasLocalsWithFileName() && !opts::AllowStripped) { @@ -812,6 +814,17 @@ void RewriteInstance::discoverFileObjects() { } } +BinaryFunction *RewriteInstance::createBinaryFunction( + const std::string &Name, SectionRef Section, uint64_t Address, + uint64_t Size, bool IsSimple) { + auto Result = BinaryFunctions.emplace( + Address, BinaryFunction(Name, Section, Address, Size, *BC, IsSimple)); + assert(Result.second == true && "unexpected duplicate function"); + auto *BF = &Result.first->second; + BC->SymbolToFunctionMap[BF->getSymbol()] = BF; + return BF; +} + void RewriteInstance::readSpecialSections() { // Process special sections. StringRef FrameHdrContents; diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index b99bcf7aa78d..aa1235d29370 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -332,6 +332,12 @@ class RewriteInstance { /// Total hotness score according to profiling data for this binary. uint64_t TotalScore{0}; + /// Construct BinaryFunction object and add it to internal maps. + BinaryFunction *createBinaryFunction(const std::string &Name, + object::SectionRef Section, + uint64_t Address, + uint64_t Size, + bool IsSimple); }; } // namespace bolt From ec75b355747dd0e84572bd093dbc8d2528e5cf8d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 29 Jul 2016 19:18:37 -0700 Subject: [PATCH 153/904] Add additional info to BOLT graphviz CFG dumps. Summary: Add the following info the graphviz CFG dump: - Edges are labeled with the jmp instruction that leads to that edge. - Edges include the count and misprediction count. 
- Nodes have (offset, BB index, BB layout index) - Nodes optionally have tooltips which contain the code of the basic block. (enabled with -dot-tooltip-code) - Added dashed edges to landing pads. (cherry picked from commit 891753dc1cca4ee957eb554776ce42be1e3695f9) --- bolt/BinaryFunction.cpp | 92 ++++++++++++++++++++++++++++++++++++++-- bolt/RewriteInstance.cpp | 3 +- 2 files changed, 91 insertions(+), 4 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 4dc2ea4c7357..6e8109bc24ba 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -45,6 +45,12 @@ AgressiveSplitting("split-all-cold", cl::desc("outline as many cold basic blocks as possible"), cl::Optional); +static cl::opt +DotToolTipCode("dot-tooltip-code", + cl::desc("add basic block instructions as tool tips on nodes"), + cl::Optional, + cl::Hidden); + } // namespace opts namespace { @@ -1480,14 +1486,94 @@ std::string constructFilename(std::string Filename, return Filename; } +std::string formatEscapes(const std::string& Str) { + std::string Result; + for (unsigned I = 0; I < Str.size(); ++I) { + auto C = Str[I]; + switch (C) { + case '\n': + Result += " "; + break; + case '"': + break; + default: + Result += C; + break; + } + } + return Result; +} + } void BinaryFunction::dumpGraph(raw_ostream& OS) const { - OS << "strict digraph \"" << *this << "\" {\n"; + OS << "strict digraph \"" << getPrintName() << "\" {\n"; + uint64_t Offset = Address; for (auto *BB : BasicBlocks) { + auto LayoutPos = std::find(BasicBlocksLayout.begin(), + BasicBlocksLayout.end(), + BB); + unsigned Layout = LayoutPos - BasicBlocksLayout.begin(); + OS << format("\"%s\" [label=\"%s\\n(O:%lu,I:%u,L%u)\"]\n", + BB->getName().data(), + BB->getName().data(), + BB->getOffset(), + BB->Index, + Layout); + OS << format("\"%s\" [shape=box]\n", BB->getName().data()); + if (opts::DotToolTipCode) { + std::string Str; + raw_string_ostream CS(Str); + Offset = BC.printInstructions(CS, BB->begin(), 
BB->end(), Offset, this); + const auto Code = formatEscapes(CS.str()); + OS << format("\"%s\" [tooltip=\"%s\"]\n", + BB->getName().data(), + Code.c_str()); + } + + const MCSymbol *TBB = nullptr; + const MCSymbol *FBB = nullptr; + MCInst *CondBranch = nullptr; + MCInst *UncondBranch = nullptr; + const bool Success = BC.MIA->analyzeBranch(BB->Instructions, + TBB, + FBB, + CondBranch, + UncondBranch); + + unsigned Idx = 0; for (auto *Succ : BB->successors()) { - OS << "\"" << BB->getName() << "\" -> " - << "\"" << Succ->getName() << "\"\n"; + std::string Branch; + if (Success) { + if (CondBranch && Succ->getLabel() == TBB) { + Branch = BC.InstPrinter->getOpcodeName(CondBranch->getOpcode()); + } else if(UncondBranch && Succ->getLabel() == TBB) { + Branch = BC.InstPrinter->getOpcodeName(UncondBranch->getOpcode()); + } else { + Branch = "FT"; + } + } + OS << format("\"%s\" -> \"%s\" [label=\"%s", + BB->getName().data(), + Succ->getName().data(), + Branch.c_str()); + + const auto &BI = BB->BranchInfo[Idx]; + if (BB->ExecutionCount != COUNT_NO_PROFILE && + BI.MispredictedCount != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) { + OS << "\\n(M:" << BI.MispredictedCount << ",C:" << BI.Count << ")"; + } else if (ExecutionCount != COUNT_NO_PROFILE && + BI.Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) { + OS << "\\n(IC:" << BI.Count << ")"; + } + OS << "\"]\n"; + + ++Idx; + } + for (auto *LP : BB->LandingPads) { + OS << format("\"%s\" -> \"%s\" [constraint=false style=dashed]\n", + BB->getName().data(), + LP->getName().data()); } } OS << "}\n"; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 60ba7a492d72..9740be8d0124 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -251,7 +251,8 @@ bool shouldProcess(const BinaryFunction &Function) { } // namespace opts - +constexpr const char *RewriteInstance::DebugSectionsToOverwrite[]; + static void report_error(StringRef Message, std::error_code EC) { assert(EC); errs() << "BOLT-ERROR: '" << 
Message << "': " << EC.message() << ".\n"; From 2a0d6f00712695c07a2c51fcdedb6358a1edd8e0 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 22 Jul 2016 20:52:57 -0700 Subject: [PATCH 154/904] Check if operands are immediates before trying shortening. Summary: Operands in the initial instruction stream should all have immediate operands for instructions that can be shortened. But if a BOLT optimization pass adds one of these instructions with a symbolic operand, the shortening operation will assert. This diff adds checks to make sure that the operands are immediate. I've also disabled shortening pass by default since it won't really be needed until ICP is submitted. It will still run at CFG creation time. (cherry picked from commit 52b023662ec55d089ba738182b29f39a073ac005) --- bolt/BinaryPassManager.cpp | 1 - bolt/RewriteInstance.cpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 8531cf750e45..af690956ffd4 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -39,7 +39,6 @@ SimplifyConditionalTailCalls("simplify-conditional-tail-calls", static llvm::cl::opt Peepholes("peepholes", llvm::cl::desc("run peephole optimizations"), - llvm::cl::init(true), llvm::cl::Optional); static llvm::cl::opt diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 9740be8d0124..41691d91d057 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -252,7 +252,7 @@ bool shouldProcess(const BinaryFunction &Function) { } // namespace opts constexpr const char *RewriteInstance::DebugSectionsToOverwrite[]; - + static void report_error(StringRef Message, std::error_code EC) { assert(EC); errs() << "BOLT-ERROR: '" << Message << "': " << EC.message() << ".\n"; From c1d07f514a759944ad70b247bce06fab9b369f6b Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 15 Aug 2016 15:37:00 -0700 Subject: [PATCH 155/904] Compute ClusterEdges only when necessary. 
Summary: We only need ClusterEdges in reordering algorithm optimized for branches and the computation is quite resource-hungry, thus it makes sense to only do it when needed. Some refactoring too. (cherry picked from commit e576d80a679b46af68f7e6ba04b83a772bebb1db) --- bolt/BinaryFunction.cpp | 2 +- bolt/BinaryFunction.h | 3 +- bolt/BinaryPasses.cpp | 7 ++--- bolt/ReorderAlgorithm.cpp | 63 ++++++++++++++++++++++----------------- bolt/ReorderAlgorithm.h | 39 +++++++++++++----------- 5 files changed, 63 insertions(+), 51 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 6e8109bc24ba..b35d6ea4e6ce 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1407,7 +1407,7 @@ bool BinaryFunction::fixCFIState() { } void BinaryFunction::modifyLayout(LayoutType Type, bool MinBranchClusters, - bool Split) { + bool Split) { if (BasicBlocksLayout.empty() || Type == LT_NONE) return; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index d76e0f3502d1..bcedb64a0fbc 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -99,6 +99,8 @@ class BinaryFunction : public AddressRangesOwner { // solution to the layout problem instead of seeking the optimal one. static constexpr uint64_t FUNC_SIZE_THRESHOLD = 10; + using BasicBlockOrderType = std::vector; + private: /// Current state of the function. @@ -327,7 +329,6 @@ class BinaryFunction : public AddressRangesOwner { // layout (if BasicBlocksLayout stores a different order than BasicBlocks), // the terminating instructions need to be modified. 
using BasicBlockListType = std::vector; - using BasicBlockOrderType = std::vector; BasicBlockListType BasicBlocks; BasicBlockOrderType BasicBlocksLayout; diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index 15ef38c51cf4..2566f4c527fa 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -760,10 +760,9 @@ void EliminateUnreachableBlocks::runOnFunctions( } void ReorderBasicBlocks::runOnFunctions( - BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions -) { + BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) { for (auto &It : BFs) { auto &Function = It.second; diff --git a/bolt/ReorderAlgorithm.cpp b/bolt/ReorderAlgorithm.cpp index 451e20715665..fb0f043e0235 100644 --- a/bolt/ReorderAlgorithm.cpp +++ b/bolt/ReorderAlgorithm.cpp @@ -90,7 +90,7 @@ void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const { } size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const { - HashPair Hasher; + HashPair Hasher; return Hasher(std::make_pair(E.Src, E.Dst)); } @@ -99,7 +99,8 @@ bool GreedyClusterAlgorithm::EdgeEqual::operator()( return A.Src == B.Src && A.Dst == B.Dst; } -void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF) { +void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF, + bool ComputeEdges) { reset(); // Greedy heuristic implementation for the TSP, applied to BB layout. Try to @@ -111,7 +112,8 @@ void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF) { std::vector Queue; // Initialize inter-cluster weights. - ClusterEdges.resize(BF.layout_size()); + if (ComputeEdges) + ClusterEdges.resize(BF.layout_size()); // Initialize clusters and edge queue. 
for (auto BB : BF.layout()) { @@ -138,8 +140,8 @@ void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF) { auto E = Queue.back(); Queue.pop_back(); - BinaryBasicBlock *SrcBB = E.Src; - BinaryBasicBlock *DstBB = E.Dst; + const auto *SrcBB = E.Src; + const auto *DstBB = E.Dst; DEBUG(dbgs() << "Popped edge "; E.print(dbgs()); @@ -157,7 +159,8 @@ void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF) { // Case 2: If they are already allocated at the same cluster, just increase // the weight of this cluster if (I == J) { - ClusterEdges[I][I] += E.Count; + if (ComputeEdges) + ClusterEdges[I][I] += E.Count; DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n"); continue; } @@ -171,24 +174,27 @@ void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF) { BBToClusterMap[BB] = I; ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end()); ClusterB.clear(); - // Increase the intra-cluster edge count of cluster A with the count of - // this edge as well as with the total count of previously visited edges - // from cluster B cluster A. - ClusterEdges[I][I] += E.Count; - ClusterEdges[I][I] += ClusterEdges[J][I]; - // Iterate through all inter-cluster edges and transfer edges targeting - // cluster B to cluster A. - for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K) - ClusterEdges[K][I] += ClusterEdges[K][J]; - DEBUG(dbgs() << "\tMerged clusters of src, dst\n"); + if (ComputeEdges) { + // Increase the intra-cluster edge count of cluster A with the count of + // this edge as well as with the total count of previously visited edges + // from cluster B cluster A. + ClusterEdges[I][I] += E.Count; + ClusterEdges[I][I] += ClusterEdges[J][I]; + // Iterate through all inter-cluster edges and transfer edges targeting + // cluster B to cluster A. 
+ for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K) + ClusterEdges[K][I] += ClusterEdges[K][J]; + } // Adjust the weights of the remaining edges and re-sort the queue. adjustQueue(Queue, BF); + DEBUG(dbgs() << "\tMerged clusters of src, dst\n"); } else { // Case 4: Both SrcBB and DstBB are allocated in positions we cannot // merge them. Add the count of this edge to the inter-cluster edge count // between clusters A and B to help us decide ordering between these // clusters. - ClusterEdges[I][J] += E.Count; + if (ComputeEdges) + ClusterEdges[I][J] += E.Count; DEBUG(dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n"); } } @@ -308,8 +314,8 @@ void MinBranchGreedyClusterAlgorithm::adjustQueue( // source and destination in the same cluster. std::vector NewQueue; for (const EdgeTy &E : Queue) { - BinaryBasicBlock *SrcBB = E.Src; - BinaryBasicBlock *DstBB = E.Dst; + const auto *SrcBB = E.Src; + const auto *DstBB = E.Dst; // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore // this edge. @@ -330,19 +336,20 @@ void MinBranchGreedyClusterAlgorithm::adjustQueue( // destination, so that this edge has no effect on them any more, and ignore // this edge. Also increase the intra- (or inter-) cluster edge count. 
if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) { - ClusterEdges[I][J] += E.Count; + if (!ClusterEdges.empty()) + ClusterEdges[I][J] += E.Count; DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs()); dbgs() << " (src, dst belong to same cluster or incompatible " "clusters)\n"); - for (BinaryBasicBlock *SuccBB : SrcBB->successors()) { + for (const auto *SuccBB : SrcBB->successors()) { if (SuccBB == DstBB) continue; auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0)); assert(WI != Weight.end() && "CFG edge not found in Weight map"); WI->second += (int64_t)E.Count; } - for (BinaryBasicBlock *PredBB : DstBB->predecessors()) { + for (const auto *PredBB : DstBB->predecessors()) { if (PredBB == SrcBB) continue; auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0)); @@ -375,7 +382,7 @@ void MinBranchGreedyClusterAlgorithm::reset() { void OptimalReorderAlgorithm::reorderBasicBlocks( const BinaryFunction &BF, BasicBlockOrder &Order) const { std::vector> Weight; - std::unordered_map BBToIndex; + std::unordered_map BBToIndex; std::vector IndexToBB; unsigned N = BF.layout_size(); @@ -491,13 +498,13 @@ void OptimizeBranchReorderAlgorithm::reorderBasicBlocks( return; // Cluster basic blocks. - CAlgo->clusterBasicBlocks(BF); - std::vector &Clusters = CAlgo->Clusters;; + CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */true); + std::vector &Clusters = CAlgo->Clusters; auto &ClusterEdges = CAlgo->ClusterEdges; // Compute clusters' average frequencies. CAlgo->computeClusterAverageFrequency(); - std::vector &AvgFreq = CAlgo->AvgFreq;; + std::vector &AvgFreq = CAlgo->AvgFreq; if (opts::PrintClusters) CAlgo->printClusters(); @@ -595,11 +602,11 @@ void OptimizeCacheReorderAlgorithm::reorderBasicBlocks( // Cluster basic blocks. CAlgo->clusterBasicBlocks(BF); - std::vector &Clusters = CAlgo->Clusters;; + std::vector &Clusters = CAlgo->Clusters; // Compute clusters' average frequencies. 
CAlgo->computeClusterAverageFrequency(); - std::vector &AvgFreq = CAlgo->AvgFreq;; + std::vector &AvgFreq = CAlgo->AvgFreq; if (opts::PrintClusters) CAlgo->printClusters(); diff --git a/bolt/ReorderAlgorithm.h b/bolt/ReorderAlgorithm.h index 1269a0d5d2e8..1aef053d6cf3 100644 --- a/bolt/ReorderAlgorithm.h +++ b/bolt/ReorderAlgorithm.h @@ -14,6 +14,7 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_REORDER_ALGORITHM_H #define LLVM_TOOLS_LLVM_BOLT_REORDER_ALGORITHM_H +#include "BinaryFunction.h" #include "llvm/Support/ErrorHandling.h" #include #include @@ -36,16 +37,17 @@ class BinaryFunction; /// into clusters using execution profile data and various heuristics. class ClusterAlgorithm { public: - typedef std::vector ClusterTy; + using ClusterTy = std::vector; std::vector Clusters; std::vector> ClusterEdges; std::vector AvgFreq; - /// Group the basic blocks the given function into clusters stored in the + /// Group the basic blocks in the given function into clusters stored in the /// Clusters vector. Also encode relative weights between two clusters in - /// the ClusterEdges vector. This vector is indexed by the clusters indices - /// in the Clusters vector. - virtual void clusterBasicBlocks(const BinaryFunction &BF) =0; + /// the ClusterEdges vector if requested. This vector is indexed by + /// the clusters indices in the Clusters vector. + virtual void clusterBasicBlocks(const BinaryFunction &BF, + bool ComputeEdges = false) = 0; /// Compute for each cluster its averagae execution frequency, that is /// the sum of average frequencies of its blocks (execution count / # instrs). @@ -58,7 +60,7 @@ class ClusterAlgorithm { void printClusters() const; - virtual ~ClusterAlgorithm() { } + virtual ~ClusterAlgorithm() {} }; @@ -69,12 +71,13 @@ class GreedyClusterAlgorithm : public ClusterAlgorithm { // Represents an edge between two basic blocks, with source, destination, and // profile count. 
struct EdgeTy { - BinaryBasicBlock *Src; - BinaryBasicBlock *Dst; + const BinaryBasicBlock *Src; + const BinaryBasicBlock *Dst; uint64_t Count; - EdgeTy(BinaryBasicBlock *Src, BinaryBasicBlock *Dst, uint64_t Count) : - Src(Src), Dst(Dst), Count(Count) { } + EdgeTy(const BinaryBasicBlock *Src, const BinaryBasicBlock *Dst, + uint64_t Count) : + Src(Src), Dst(Dst), Count(Count) {} void print(raw_ostream &OS) const; }; @@ -90,18 +93,20 @@ class GreedyClusterAlgorithm : public ClusterAlgorithm { // Virtual methods that allow custom specialization of the heuristic used by // the algorithm to select edges. virtual void initQueue( - std::vector &Queue, const BinaryFunction &BF) =0; + std::vector &Queue, const BinaryFunction &BF) = 0; virtual void adjustQueue( - std::vector &Queue, const BinaryFunction &BF) =0; + std::vector &Queue, const BinaryFunction &BF) = 0; virtual bool areClustersCompatible( - const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const =0; + const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const = 0; // Map from basic block to owning cluster index. - using BBToClusterMapTy = std::unordered_map; + using BBToClusterMapTy = std::unordered_map; BBToClusterMapTy BBToClusterMap; public: - void clusterBasicBlocks(const BinaryFunction &BF) override; + void clusterBasicBlocks(const BinaryFunction &BF, + bool ComputeEdges = false) override; void reset() override; }; @@ -172,12 +177,12 @@ class ReorderAlgorithm { explicit ReorderAlgorithm(std::unique_ptr CAlgo) : CAlgo(std::move(CAlgo)) { } - typedef std::vector BasicBlockOrder; + using BasicBlockOrder = BinaryFunction::BasicBlockOrderType; /// Reorder the basic blocks of the given function and store the new order in /// the new Clusters vector. 
virtual void reorderBasicBlocks( - const BinaryFunction &BF, BasicBlockOrder &Order) const =0; + const BinaryFunction &BF, BasicBlockOrder &Order) const = 0; void setClusterAlgorithm(ClusterAlgorithm *CAlgo) { this->CAlgo.reset(CAlgo); From 93568eddf87793c5612a890f396c2b335c895b37 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 19 Aug 2016 13:54:35 -0700 Subject: [PATCH 156/904] Write padding for .eh_frame_hdr to a file. Summary: We were applying padding to the calculated address but were never writing it to a file triggering an assertion for cases when .gcc_except_table size wasn't multiple of 4. (cherry picked from commit 768ff62da0f86b30c1e389a113edee65e6e01cd0) --- bolt/RewriteInstance.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 41691d91d057..25b8b7b8ac18 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1983,8 +1983,10 @@ void RewriteInstance::rewriteFile() { auto &EHFrameSecInfo = SMII->second; outs() << "BOLT: writing a new .eh_frame_hdr\n"; if (FrameHdrAlign > 1) { - NextAvailableAddress = - RoundUpToAlignment(NextAvailableAddress, FrameHdrAlign); + auto PaddingSize = OffsetToAlignment(NextAvailableAddress, FrameHdrAlign); + for (unsigned I = 0; I < PaddingSize; ++I) + Out->os().write((unsigned char)0); + NextAvailableAddress += PaddingSize; } SectionInfo EHFrameHdrSecInfo; From 03de8e127205ffb2ad92bfbc7b6768c6ee7aaf47 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 22 Aug 2016 14:24:09 -0700 Subject: [PATCH 157/904] Handling for indirect tail calls. Summary: Analyze indirect branches and convert them into indirect tail calls when possible. We analyze the memory contents when the address could be calculated statically and also detect epilogue code. 
(cherry picked from commit 0b350ea67722b599cb175ef20b2e7edd9646b9b5) --- bolt/BinaryBasicBlock.h | 4 +- bolt/BinaryContext.cpp | 16 +-- bolt/BinaryContext.h | 6 +- bolt/BinaryFunction.cpp | 277 ++++++++++++++++++++++++++++++++++----- bolt/BinaryFunction.h | 15 +++ bolt/BinaryPasses.cpp | 2 +- bolt/RewriteInstance.cpp | 39 +++--- bolt/RewriteInstance.h | 4 + 8 files changed, 299 insertions(+), 64 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 5d99f39f7ea0..180f51d04e87 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -34,13 +34,13 @@ class BinaryContext; /// The intention is to keep the structure similar to MachineBasicBlock as /// we might switch to it at some point. class BinaryBasicBlock { - public: +public: struct BinaryBranchInfo { uint64_t Count; uint64_t MispredictedCount; /// number of branches mispredicted }; - private: +private: /// Label associated with the block. MCSymbol *Label{nullptr}; diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 935a1b34e9a3..f9befb191671 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -63,15 +63,9 @@ void BinaryContext::printGlobalSymbols(raw_ostream& OS) const { } } -} // namespace bolt -} // namespace llvm - namespace { -using namespace llvm; -using namespace bolt; - -/// Returns the binary function that contains a given address in the input +/// Returns a binary function that contains a given address in the input /// binary, or nullptr if none does. BinaryFunction *getBinaryFunctionContainingAddress( uint64_t Address, @@ -158,9 +152,6 @@ void findSubprograms(DWARFCompileUnit *Unit, } // namespace -namespace llvm { -namespace bolt { - void BinaryContext::preprocessDebugInfo( std::map &BinaryFunctions) { // Populate MCContext with DWARF files. 
@@ -282,10 +273,11 @@ void BinaryContext::printInstruction(raw_ostream &OS, return; } OS << format(" %08" PRIx64 ": ", Offset); - if (Function && MIA->isCFI(Instruction)) { + if (MIA->isCFI(Instruction)) { uint32_t Offset = Instruction.getOperand(0).getImm(); OS << "\t!CFI\t$" << Offset << "\t; "; - printCFI(OS, Function->getCFIFor(Instruction)->getOperation()); + if (Function) + printCFI(OS, Function->getCFIFor(Instruction)->getOperation()); OS << "\n"; return; } diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 30a98116a1b6..007fe2d6825d 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -72,8 +72,10 @@ class BinaryContext { /// Map virtual address to a section. std::map AllocatableSections; - /// Set of addresses we cannot relocate because we have a direct branch to it. - std::set InterproceduralBranchTargets; + /// Set of addresses in the code that are not a function start, and are + /// referenced from outside of containing function. E.g. this could happen + /// when a function has more than a single entry point. + std::set InterproceduralReferences; /// List of DWARF location lists in .debug_loc. std::vector LocationLists; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index b35d6ea4e6ce..942e5cc23901 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -53,6 +53,12 @@ DotToolTipCode("dot-tooltip-code", } // namespace opts +// Temporary constant. +// +// TODO: move to architecture-specific file together with the code that is +// using it. +constexpr unsigned NoRegister = 0; + namespace { /// Gets debug line information for the instruction located at the given @@ -345,12 +351,27 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // basic block. Labels[0] = Ctx->createTempSymbol("BB0", false); + auto getOrCreateLocalLabel = [&](uint64_t Address) { + MCSymbol *Result; + // Check if there's already a registered label. 
+ auto Offset = Address - getAddress(); + assert(Offset < getSize() && "address outside of function bounds"); + auto LI = Labels.find(Offset); + if (LI == Labels.end()) { + Result = Ctx->createTempSymbol(); + Labels[Offset] = Result; + } else { + Result = LI->second; + } + return Result; + }; + auto handleRIPOperand = [&](MCInst &Instruction, uint64_t Address, uint64_t Size) { uint64_t TargetAddress{0}; MCSymbol *TargetSymbol{nullptr}; - if (!BC.MIA->evaluateRIPOperand(Instruction, Address, Size, - TargetAddress)) { + if (!BC.MIA->evaluateRIPOperandTarget(Instruction, Address, Size, + TargetAddress)) { DEBUG(dbgs() << "BOLT: rip-relative operand can't be evaluated:\n"; BC.InstPrinter->printInst(&Instruction, dbgs(), "", *BC.STI); dbgs() << '\n'; @@ -358,19 +379,129 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { dbgs() << '\n';); return false; } - // FIXME: check that the address is in data, not in code. if (TargetAddress == 0) { errs() << "BOLT-WARNING: rip-relative operand is zero in function " << *this << ". Ignoring function.\n"; return false; } - TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, "DATAat"); + + // Note that the address does not necessarily have to reside inside + // a section, it could be an absolute address too. + auto Section = BC.getSectionForAddress(TargetAddress); + if (Section && Section->isText()) { + if (containsAddress(TargetAddress)) { + TargetSymbol = getOrCreateLocalLabel(TargetAddress); + } else { + BC.InterproceduralReferences.insert(TargetAddress); + } + } + if (!TargetSymbol) + TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, "DATAat"); BC.MIA->replaceRIPOperandDisp( Instruction, MCOperand::createExpr(MCSymbolRefExpr::create( TargetSymbol, MCSymbolRefExpr::VK_None, *BC.Ctx))); return true; }; + enum class IndirectBranchType : char { + UNKNOWN = 0, /// Unable to determine type. + POSSIBLE_TAIL_CALL, /// Possibly a tail call. 
+ POSSIBLE_SWITCH_TABLE, /// Possibly a switch/jump table + POSSIBLE_GOTO /// Possibly a gcc's computed goto. + }; + + auto analyzeIndirectBranch = + [&](MCInst &Instruction, unsigned Size, uint64_t Offset) { + // Try to find a (base) memory location from where the address for + // the indirect branch is loaded. For X86-64 the memory will be specified + // in the following format: + // + // {%rip}/{%basereg} + Imm + IndexReg * Scale + // + // We are interested in the cases where Scale == sizeof(uintptr_t) and + // the contents of the memory are presumably a function array. + const auto *MemLocInstr = &Instruction; + if (Instruction.getNumOperands() == 1) { + // If the indirect jump is on register - try to detect if the + // register value is loaded from a memory location. + assert(Instruction.getOperand(0).isReg() && "register operand expected"); + const auto JmpRegNum = Instruction.getOperand(0).getReg(); + // Check if one of the previous instructions defines the jump-on register. + // We will check that this instruction belongs to the same basic block + // in postProcessIndirectBranches(). + for (auto PrevII = Instructions.rbegin(); PrevII != Instructions.rend(); + ++PrevII) { + const auto &PrevInstr = PrevII->second; + const auto &PrevInstrDesc = BC.MII->get(PrevInstr.getOpcode()); + if (!PrevInstrDesc.hasDefOfPhysReg(PrevInstr, JmpRegNum, *BC.MRI)) + continue; + if (!MIA->isMoveMem2Reg(PrevInstr)) + return IndirectBranchType::UNKNOWN; + MemLocInstr = &PrevInstr; + break; + } + if (MemLocInstr == &Instruction) { + // No definition seen for the register in this function so far. Could be + // an input parameter - which means it is an external code reference. + // It also could be that the definition happens to be in the code that + // we haven't processed yet. Since we have to be conservative, return + // as UNKNOWN case. 
+ return IndirectBranchType::UNKNOWN; + } + } + + const auto RIPRegister = BC.MRI->getProgramCounter(); + + // Analyze contents of the memory if possible. + unsigned BaseRegNum; + int64_t ScaleValue; + unsigned IndexRegNum; + int64_t DispValue; + unsigned SegRegNum; + if (!MIA->evaluateX86MemoryOperand(*MemLocInstr, BaseRegNum, + ScaleValue, IndexRegNum, + DispValue, SegRegNum)) + return IndirectBranchType::UNKNOWN; + + if ((BaseRegNum != bolt::NoRegister && BaseRegNum != RIPRegister) || + SegRegNum != bolt::NoRegister || + ScaleValue != BC.AsmInfo->getPointerSize()) + return IndirectBranchType::UNKNOWN; + + auto ArrayStart = DispValue; + if (BaseRegNum == RIPRegister) + ArrayStart += getAddress() + Offset + Size; + + auto SectionOrError = BC.getSectionForAddress(ArrayStart); + if (!SectionOrError) { + // No section - possibly an absolute address. Since we don't allow + // internal function addresses to escape the function scope - we + // consider it a tail call. + errs() << "BOLT-WARNING: no section for address 0x" + << Twine::utohexstr(ArrayStart) << " referenced from function " + << *this << '\n'; + return IndirectBranchType::POSSIBLE_TAIL_CALL; + } + auto &Section = *SectionOrError; + if (Section.isVirtual()) { + // The contents are filled at runtime. + return IndirectBranchType::POSSIBLE_TAIL_CALL; + } + // Extract the value at the start of the array. 
+ StringRef SectionContents; + Section.getContents(SectionContents); + DataExtractor DE(SectionContents, + BC.AsmInfo->isLittleEndian(), + BC.AsmInfo->getPointerSize()); + auto ValueOffset = static_cast(ArrayStart - Section.getAddress()); + auto Value = DE.getAddress(&ValueOffset); + if (containsAddress(Value) && Value != getAddress()) + return IndirectBranchType::POSSIBLE_SWITCH_TABLE; + + BC.InterproceduralReferences.insert(Value); + return IndirectBranchType::POSSIBLE_TAIL_CALL; + }; + bool IsSimple = true; for (uint64_t Offset = 0; IsSimple && (Offset < getSize()); ) { MCInst Instruction; @@ -396,11 +527,11 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { MIA->shortenInstruction(Instruction); if (MIA->isBranch(Instruction) || MIA->isCall(Instruction)) { - uint64_t InstructionTarget = 0; + uint64_t TargetAddress = 0; if (MIA->evaluateBranch(Instruction, AbsoluteInstrAddr, Size, - InstructionTarget)) { + TargetAddress)) { // Check if the target is within the same function. Otherwise it's // a call, possibly a tail call. // @@ -409,10 +540,9 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { bool IsCall = MIA->isCall(Instruction); bool IsCondBranch = MIA->isConditionalBranch(Instruction); MCSymbol *TargetSymbol{nullptr}; - uint64_t TargetOffset{0}; - if (IsCall && containsAddress(InstructionTarget)) { - if (InstructionTarget == getAddress()) { + if (IsCall && containsAddress(TargetAddress)) { + if (TargetAddress == getAddress()) { // Recursive call. TargetSymbol = getSymbol(); } else { @@ -426,21 +556,14 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { if (!TargetSymbol) { // Create either local label or external symbol. - if (containsAddress(InstructionTarget)) { - // Check if there's already a registered label. 
- TargetOffset = InstructionTarget - getAddress(); - auto LI = Labels.find(TargetOffset); - if (LI == Labels.end()) { - TargetSymbol = Ctx->createTempSymbol(); - Labels[TargetOffset] = TargetSymbol; - } else { - TargetSymbol = LI->second; - } + if (containsAddress(TargetAddress)) { + TargetSymbol = getOrCreateLocalLabel(TargetAddress); } else { - BC.InterproceduralBranchTargets.insert(InstructionTarget); + BC.InterproceduralReferences.insert(TargetAddress); if (!IsCall && Size == 2) { errs() << "BOLT-WARNING: relaxed tail call detected at 0x" << Twine::utohexstr(AbsoluteInstrAddr) + << " in function " << *this << ". Code size will be increased.\n"; } @@ -460,13 +583,13 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // TODO: A better way to do this would be using annotations for // MCInst objects. TailCallOffsets.emplace(std::make_pair(Offset, - InstructionTarget)); + TargetAddress)); IsCall = true; } - TargetSymbol = BC.getOrCreateGlobalSymbol(InstructionTarget, + TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, "FUNCat"); - if (InstructionTarget == 0) { + if (TargetAddress == 0) { // We actually see calls to address 0 because of the weak symbols // from the libraries. In reality more often than not it is // unreachable code, but we don't know it and have to emit calls @@ -486,23 +609,34 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { *Ctx))); if (!IsCall) { // Add taken branch info. - TakenBranches.push_back({Offset, TargetOffset}); + TakenBranches.push_back({Offset, TargetAddress - getAddress()}); } if (IsCondBranch) { // Add fallthrough branch info. FTBranches.push_back({Offset, Offset + Size}); } } else { - // Should be an indirect call or an indirect branch. Bail out on the - // latter case. + // Could not evaluate branch. Should be an indirect call or an + // indirect branch. Bail out on the latter case. 
if (MIA->isIndirectBranch(Instruction)) { - DEBUG(dbgs() << "BOLT-WARNING: indirect branch detected at 0x" - << Twine::utohexstr(AbsoluteInstrAddr) - << ". Skipping function " << *this << ".\n"); - IsSimple = false; + auto Result = analyzeIndirectBranch(Instruction, Size, Offset); + switch (Result) { + default: + llvm_unreachable("unexpected result"); + case IndirectBranchType::POSSIBLE_TAIL_CALL: + MIA->convertJmpToTailCall(Instruction); + break; + case IndirectBranchType::POSSIBLE_SWITCH_TABLE: + IsSimple = false; + break; + case IndirectBranchType::UNKNOWN: + // Keep processing. We'll do more checks and fixes in + // postProcessIndirectBranches(). + break; + }; } // Indirect call. We only need to fix it if the operand is RIP-relative - if (MIA->hasRIPOperand(Instruction)) { + if (IsSimple && MIA->hasRIPOperand(Instruction)) { if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { errs() << "BOLT-WARNING: cannot handle RIP operand at 0x" << Twine::utohexstr(AbsoluteInstrAddr) @@ -542,6 +676,83 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { return true; } +bool BinaryFunction::postProcessIndirectBranches() { + for (auto *BB : layout()) { + for (auto &Instr : *BB) { + if (!BC.MIA->isIndirectBranch(Instr)) + continue; + + // If there's an indirect branch in a single-block function - + // it must be a tail call. + if (layout_size() == 1) { + BC.MIA->convertJmpToTailCall(Instr); + return true; + } + + // Validate the tail call assumptions. + if (BC.MIA->isTailCall(Instr)) { + unsigned BaseRegNum; + int64_t ScaleValue; + unsigned IndexRegNum; + int64_t DispValue; + unsigned SegRegNum; + if (BC.MIA->evaluateX86MemoryOperand(Instr, BaseRegNum, + ScaleValue, IndexRegNum, + DispValue, SegRegNum)) { + // We have validated the memory contents addressed by the + // jump instruction already. + continue; + } + // This is jump on register. Just make sure the register is defined + // in the containing basic block. Other assumptions were checked + // earlier. 
+ assert(Instr.getOperand(0).isReg() && "register operand expected"); + const auto JmpRegNum = Instr.getOperand(0).getReg(); + bool IsJmpRegSetInBB = false; + for (const auto &OtherInstr : *BB) { + const auto &OtherInstrDesc = BC.MII->get(OtherInstr.getOpcode()); + if (OtherInstrDesc.hasDefOfPhysReg(OtherInstr, JmpRegNum, *BC.MRI)) { + IsJmpRegSetInBB = true; + break; + } + } + if (IsJmpRegSetInBB) + continue; + DEBUG(dbgs() << "BOLT-INFO: rejected potential indirect tail call in " + << "function " << *this << " because the jump-on register " + << "was not defined in basic block " + << BB->getName() << ":\n"; + BC.printInstructions(dbgs(), BB->begin(), BB->end(), + BB->getOffset(), this); + ); + return false; + } + + // If this block contains an epilogue code and has an indirect branch, + // then most likely it's a tail call. Otherwise, we cannot tell for sure + // what it is and conservatively reject the function's CFG. + bool IsEpilogue = false; + for (const auto &Instr : *BB) { + if (BC.MIA->isLeave(Instr) || BC.MIA->isPop(Instr)) { + IsEpilogue = true; + break; + } + } + if (!IsEpilogue) { + DEBUG(dbgs() << "BOLT-INFO: rejected potential indirect tail call in " + << "function " << *this << " in basic block " + << BB->getName() << ":\n"; + BC.printInstructions(dbgs(), BB->begin(), BB->end(), + BB->getOffset(), this); + ); + return false; + } + BC.MIA->convertJmpToTailCall(Instr); + } + } + return true; +} + void BinaryFunction::clearLandingPads(const unsigned StartIndex, const unsigned NumBlocks) { // remove all landing pads/throws for the given collection of blocks @@ -892,6 +1103,10 @@ bool BinaryFunction::buildCFG() { BasicBlocksLayout.emplace_back(BB); } + // Make any necessary adjustments for indirect branches. + if (!postProcessIndirectBranches()) + setSimple(false); + // Fix the possibly corrupted CFI state. CFI state may have been corrupted // because of the CFG modifications while removing conditional tail calls. 
fixCFIState(); diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index bcedb64a0fbc..c2b91916bd54 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -968,6 +968,21 @@ class BinaryFunction : public AddressRangesOwner { /// State::CFG. Returns false if CFG cannot be built. bool buildCFG(); + /// Verify that any assumptions we've made about indirect branches were + /// correct and also make any necessary changes to unknown indirect branches. + /// + /// Catch-22: we need to know indirect branch targets to build CFG, and + /// in order to determine the value for indirect branches we need to know CFG. + /// + /// As such, the process of decoding indirect branches is broken into 2 steps: + /// first we make our best guess about a branch without knowing the CFG, + /// and later after we have the CFG for the function, we verify our earlier + /// assumptions and also do our best at processing unknown indirect branches. + /// + /// Return true upon successful processing, or false if the control flow + /// cannot be statically evaluated for any given indirect branch. + bool postProcessIndirectBranches(); + /// Check how closely the profile data matches the function and set /// ProfileMatchRatio to reflect the accuracy. 
void evaluateProfileData(const FuncBranchData &BranchData); diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index 2566f4c527fa..b99add3ecf63 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -1003,7 +1003,7 @@ bool SimplifyRODataLoads::simplifyRODataLoads( if (GI == BC.GlobalSymbols.end()) continue; TargetAddress = GI->second; - } else if (!MIA->evaluateMemOperand(Inst, TargetAddress)) { + } else if (!MIA->evaluateMemOperandTarget(Inst, TargetAddress)) { continue; } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 25b8b7b8ac18..42c5eed87532 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -850,7 +850,9 @@ void RewriteInstance::readSpecialSections() { DebugLocSize = Section.getSize(); } - if (Section.isText() || Section.isData() || Section.isBSS()) { + // Ignore zero-size allocatable sections as they present no interest to us. + if ((Section.isText() || Section.isData() || Section.isBSS()) && + Section.getSize() > 0) { BC->AllocatableSections.emplace(std::make_pair(Section.getAddress(), Section)); } @@ -1002,22 +1004,16 @@ void RewriteInstance::disassembleFunctions() { } // Iterate over all functions // Mark all functions with internal addresses serving as interprocedural - // branch targets as not simple -- pretty rare but can happen in code - // written in assembly. + // reference as not simple. // TODO: #9301815 - for (auto Addr : BC->InterproceduralBranchTargets) { - // Check if this address is internal to some function we are reordering - auto I = BinaryFunctions.upper_bound(Addr); - if (I == BinaryFunctions.begin()) - continue; - BinaryFunction &Func = (--I)->second; - uint64_t Offset = Addr - I->first; - if (Offset == 0 || Offset >= Func.getSize()) - continue; - errs() << "BOLT-WARNING: Function " << Func - << " has internal BBs that are target of a branch located in " - "another function. 
We will not process this function.\n"; - Func.setSimple(false); + for (auto Addr : BC->InterproceduralReferences) { + auto *ContainingFunction = getBinaryFunctionContainingAddress(Addr); + if (ContainingFunction && ContainingFunction->getAddress() != Addr) { + errs() << "BOLT-WARNING: Function " << ContainingFunction + << " has internal BBs that are target of a reference located in " + "another function. Skipping the function.\n"; + ContainingFunction->setSimple(false); + } } uint64_t NumSimpleFunctions{0}; @@ -2035,3 +2031,14 @@ bool RewriteInstance::shouldOverwriteSection(StringRef SectionName) { return false; } + +BinaryFunction * +RewriteInstance::getBinaryFunctionContainingAddress(uint64_t Address) { + auto FI = BinaryFunctions.upper_bound(Address); + if (FI == BinaryFunctions.begin()) + return nullptr; + --FI; + if (FI->first + FI->second.getSize() <= Address) + return nullptr; + return &FI->second; +} diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index aa1235d29370..444f98449dfd 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -249,6 +249,10 @@ class RewriteInstance { return Address - NewTextSegmentAddress + NewTextSegmentOffset; } + /// Return BinaryFunction containing the given \p Address or nullptr if + /// no registered function has it. + BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address) ; + /// Return true if we should overwrite contents of the section instead /// of appending contents to it. bool shouldOverwriteSection(StringRef SectionName); From f5767d95474710ad165fbac51a015dbdc2d132a6 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 24 Aug 2016 14:25:33 -0700 Subject: [PATCH 158/904] Emit remember_state CFI in the same code region as restore_state. Summary: While creating remember_state/restore_state CFI sequences, we were always placing remember_state instruction into the first basic block. 
However, when we have hot-cold splitting, the cold part has an independent FDE entry in .eh_frame, and thus the restore_state instruction was missing its counterpart. The fix is to adjust the basic block that is used for placing remember_state instruction whenever we see the hot-cold split boundary. (cherry picked from commit 146284c00861277f252eeb562c2f102a7ea6dbdf) --- bolt/BinaryFunction.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 942e5cc23901..c4767fcd8445 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1540,16 +1540,18 @@ bool BinaryFunction::fixCFIState() { }; uint32_t State = 0; - BinaryBasicBlock *EntryBB = *BasicBlocksLayout.begin(); + auto *FDEStartBB = BasicBlocksLayout[0]; for (uint32_t I = 0, E = BasicBlocksLayout.size(); I != E; ++I) { - BinaryBasicBlock *BB = BasicBlocksLayout[I]; + auto *BB = BasicBlocksLayout[I]; uint32_t BBIndex = getIndex(BB); // Hot-cold border: check if this is the first BB to be allocated in a cold - // region (a different function). If yes, we need to reset the CFI state. - if (I != 0 && - BB->IsCold != BasicBlocksLayout[I - 1]->IsCold) + // region (a different FDE). If yes, we need to reset the CFI state and + // the FDEStartBB that is used to insert remember_state CFIs (t12863876). + if (I != 0 && BB->IsCold != BasicBlocksLayout[I - 1]->IsCold) { State = 0; + FDEStartBB = BB; + } // We need to recover the correct state if it doesn't match expected // state at BB entry point. @@ -1561,10 +1563,10 @@ bool BinaryFunction::fixCFIState() { // reach the desired state. uint32_t OldState = BBCFIState[BBIndex]; // Remember state at function entry point (our reference state). 
- BinaryBasicBlock::const_iterator InsertIt = EntryBB->begin(); - while (InsertIt != EntryBB->end() && BC.MIA->isCFI(*InsertIt)) + BinaryBasicBlock::const_iterator InsertIt = FDEStartBB->begin(); + while (InsertIt != FDEStartBB->end() && BC.MIA->isCFI(*InsertIt)) ++InsertIt; - addCFIPseudo(EntryBB, InsertIt, FrameInstructions.size()); + addCFIPseudo(FDEStartBB, InsertIt, FrameInstructions.size()); FrameInstructions.emplace_back( MCCFIInstruction::createRememberState(nullptr)); // Restore state From d192e7b6202f450d984bc3894e71aeb5734d501c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 2 Sep 2016 14:15:29 -0700 Subject: [PATCH 159/904] Add verbosity level and clean up stream usage. Summary: I've added a verbosity level to help keep the BOLT spewage to a minimum. The default level is pretty terse now, level 1 is closer to the original, I've saved level 2 for the noisiest of messages. Error messages should never be suppressed by the verbosity level, only warnings and info messages. The rationale behind stream usage is as follows: outs() for info and debugging controlled by command line flags. errs() for errors and warnings. dbgs() for output within DEBUG(). With the exception of a few of the level 2 messages I don't have any strong feelings about the others. 
(cherry picked from commit 174ddbb9007a527263795212a1b74b0ef011f7f4) --- bolt/BinaryBasicBlock.cpp | 14 ++-- bolt/BinaryFunction.cpp | 121 ++++++++++++++++----------- bolt/BinaryPasses.cpp | 85 ++++++++++--------- bolt/DWARFRewriter.cpp | 38 ++++++--- bolt/DebugData.cpp | 15 +++- bolt/Exceptions.cpp | 94 ++++++++++++--------- bolt/RewriteInstance.cpp | 170 ++++++++++++++++++++++++-------------- 7 files changed, 329 insertions(+), 208 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 3be44979feeb..1215980e63e7 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -95,17 +95,17 @@ bool BinaryBasicBlock::analyzeBranch(const MCInstrAnalysis &MIA, } void BinaryBasicBlock::dump(BinaryContext& BC) const { - if (Label) dbgs() << Label->getName() << ":\n"; - BC.printInstructions(dbgs(), Instructions.begin(), Instructions.end(), Offset); - dbgs() << "preds:"; + if (Label) outs() << Label->getName() << ":\n"; + BC.printInstructions(outs(), Instructions.begin(), Instructions.end(), Offset); + outs() << "preds:"; for (auto itr = pred_begin(); itr != pred_end(); ++itr) { - dbgs() << " " << (*itr)->getName(); + outs() << " " << (*itr)->getName(); } - dbgs() << "\nsuccs:"; + outs() << "\nsuccs:"; for (auto itr = succ_begin(); itr != succ_end(); ++itr) { - dbgs() << " " << (*itr)->getName(); + outs() << " " << (*itr)->getName(); } - dbgs() << "\n"; + outs() << "\n"; } } // namespace bolt diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index c4767fcd8445..99e7454a8f4a 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -34,12 +34,12 @@ #undef DEBUG_TYPE #define DEBUG_TYPE "bolt" - -namespace llvm { -namespace bolt { +using namespace llvm; namespace opts { +extern cl::opt Verbosity; + static cl::opt AgressiveSplitting("split-all-cold", cl::desc("outline as many cold basic blocks as possible"), @@ -53,6 +53,9 @@ DotToolTipCode("dot-tooltip-code", } // namespace opts +namespace llvm { +namespace 
bolt { + // Temporary constant. // // TODO: move to architecture-specific file together with the code that is @@ -380,8 +383,10 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { return false; } if (TargetAddress == 0) { - errs() << "BOLT-WARNING: rip-relative operand is zero in function " - << *this << ". Ignoring function.\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: rip-relative operand is zero in function " + << *this << ". Ignoring function.\n"; + } return false; } @@ -477,9 +482,11 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // No section - possibly an absolute address. Since we don't allow // internal function addresses to escape the function scope - we // consider it a tail call. - errs() << "BOLT-WARNING: no section for address 0x" - << Twine::utohexstr(ArrayStart) << " referenced from function " - << *this << '\n'; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: no section for address 0x" + << Twine::utohexstr(ArrayStart) << " referenced from function " + << *this << '\n'; + } return IndirectBranchType::POSSIBLE_TAIL_CALL; } auto &Section = *SectionOrError; @@ -515,10 +522,12 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { nulls(), nulls())) { // Ignore this function. Skip to the next one. 
- errs() << "BOLT-WARNING: unable to disassemble instruction at offset 0x" - << Twine::utohexstr(Offset) << " (address 0x" - << Twine::utohexstr(AbsoluteInstrAddr) << ") in function " - << *this << '\n'; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: unable to disassemble instruction at offset 0x" + << Twine::utohexstr(Offset) << " (address 0x" + << Twine::utohexstr(AbsoluteInstrAddr) << ") in function " + << *this << '\n'; + } IsSimple = false; break; } @@ -547,9 +556,11 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { TargetSymbol = getSymbol(); } else { // Possibly an old-style PIC code - errs() << "BOLT: internal call detected at 0x" - << Twine::utohexstr(AbsoluteInstrAddr) - << " in function " << *this << ". Skipping.\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: internal call detected at 0x" + << Twine::utohexstr(AbsoluteInstrAddr) + << " in function " << *this << ". Skipping.\n"; + } IsSimple = false; } } @@ -560,7 +571,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { TargetSymbol = getOrCreateLocalLabel(TargetAddress); } else { BC.InterproceduralReferences.insert(TargetAddress); - if (!IsCall && Size == 2) { + if (opts::Verbosity >= 2 && !IsCall && Size == 2) { errs() << "BOLT-WARNING: relaxed tail call detected at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << " in function " << *this @@ -574,7 +585,8 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // Assign proper opcode for tail calls, so that they could be // treated as calls. if (!IsCall) { - if (!MIA->convertJmpToTailCall(Instruction)) { + if (opts::Verbosity >= 2 && + !MIA->convertJmpToTailCall(Instruction)) { assert(IsCondBranch && "unknown tail call instruction"); errs() << "BOLT-WARNING: conditional tail call detected in " << "function " << *this << " at 0x" @@ -594,8 +606,10 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // from the libraries. 
In reality more often than not it is // unreachable code, but we don't know it and have to emit calls // to 0 which make LLVM JIT unhappy. - errs() << "BOLT-WARNING: Function " << *this - << " has a call to address zero. Ignoring function.\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: Function " << *this + << " has a call to address zero. Ignoring function.\n"; + } IsSimple = false; } } @@ -638,9 +652,11 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // Indirect call. We only need to fix it if the operand is RIP-relative if (IsSimple && MIA->hasRIPOperand(Instruction)) { if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { - errs() << "BOLT-WARNING: cannot handle RIP operand at 0x" - << Twine::utohexstr(AbsoluteInstrAddr) - << ". Skipping function " << *this << ".\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: cannot handle RIP operand at 0x" + << Twine::utohexstr(AbsoluteInstrAddr) + << ". Skipping function " << *this << ".\n"; + } IsSimple = false; } } @@ -648,9 +664,11 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { } else { if (MIA->hasRIPOperand(Instruction)) { if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { - errs() << "BOLT-WARNING: cannot handle RIP operand at 0x" - << Twine::utohexstr(AbsoluteInstrAddr) - << ". Skipping function " << *this << ".\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: cannot handle RIP operand at 0x" + << Twine::utohexstr(AbsoluteInstrAddr) + << ". 
Skipping function " << *this << ".\n"; + } IsSimple = false; } } @@ -1217,7 +1235,7 @@ void BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { (float) (LocalProfileBranches.size() - OrphanBranches.size()) / (float) LocalProfileBranches.size(); - if (!OrphanBranches.empty()) { + if (opts::Verbosity >= 2 && !OrphanBranches.empty()) { errs() << "BOLT-WARNING: profile branches match only " << format("%.1f%%", ProfileMatchRatio * 100.0f) << " (" << (LocalProfileBranches.size() - OrphanBranches.size()) << '/' @@ -1317,8 +1335,8 @@ void BinaryFunction::inferFallThroughCounts() { Inferred = BBExecCount - TotalReportedJumps; DEBUG({ - if (BBExecCount < TotalReportedJumps) - dbgs() + if (opts::Verbosity >= 1 && BBExecCount < TotalReportedJumps) + errs() << "BOLT-WARNING: Fall-through inference is slightly inconsistent. " "exec frequency is less than the outgoing edges frequency (" << BBExecCount << " < " << ReportedBranches @@ -1521,9 +1539,11 @@ bool BinaryFunction::fixCFIState() { // without using the state stack. Not sure if it is worth the effort // because this happens rarely. 
if (NestedLevel != 0) { - errs() << "BOLT-WARNING: CFI rewriter detected nested CFI state while" - << " replaying CFI instructions for BB " << InBB->getName() - << " in function " << *this << '\n'; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: CFI rewriter detected nested CFI state while" + << " replaying CFI instructions for BB " << InBB->getName() + << " in function " << *this << '\n'; + } return false; } @@ -1601,9 +1621,11 @@ bool BinaryFunction::fixCFIState() { } if (StackOffset != 0) { - errs() << " BOLT-WARNING: not possible to remember/recover state" - << " without corrupting CFI state stack in function " - << *this << "\n"; + if (opts::Verbosity >= 1) { + errs() << " BOLT-WARNING: not possible to remember/recover state" + << " without corrupting CFI state stack in function " + << *this << "\n"; + } return false; } } else if (BBCFIState[BBIndex] > State) { @@ -1694,8 +1716,10 @@ std::string constructFilename(std::string Filename, } if (Filename.size() + Annotation.size() + Suffix.size() > MAX_PATH) { assert(Suffix.size() + Annotation.size() <= MAX_PATH); - dbgs() << "BOLT-WARNING: Filename \"" << Filename << Annotation << Suffix - << "\" exceeds the " << MAX_PATH << " size limit, truncating.\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: Filename \"" << Filename << Annotation << Suffix + << "\" exceeds the " << MAX_PATH << " size limit, truncating.\n"; + } Filename.resize(MAX_PATH - (Suffix.size() + Annotation.size())); } Filename += Annotation; @@ -1799,24 +1823,23 @@ void BinaryFunction::dumpGraph(raw_ostream& OS) const { void BinaryFunction::viewGraph() const { SmallString Filename; if (auto EC = sys::fs::createTemporaryFile("bolt-cfg", "dot", Filename)) { - dbgs() << "BOLT-WARNING: " << EC.message() << ", unable to create " + errs() << "BOLT-ERROR: " << EC.message() << ", unable to create " << " bolt-cfg-XXXXX.dot temporary file.\n"; return; } dumpGraphToFile(Filename.str()); if (DisplayGraph(Filename)) { - dbgs() << 
"BOLT-WARNING: Can't display " << Filename - << " with graphviz.\n"; + errs() << "BOLT-ERROR: Can't display " << Filename << " with graphviz.\n"; } if (auto EC = sys::fs::remove(Filename)) { - dbgs() << "BOLT-WARNING: " << EC.message() << ", failed to remove " - << Filename.str() << "\n"; + errs() << "BOLT-WARNING: " << EC.message() << ", failed to remove " + << Filename << "\n"; } } void BinaryFunction::dumpGraphForPass(std::string Annotation) const { auto Filename = constructFilename(getPrintName(), Annotation, ".dot"); - dbgs() << "BOLT-DEBUG: Dumping CFG to " << Filename << "\n"; + outs() << "BOLT-DEBUG: Dumping CFG to " << Filename << "\n"; dumpGraphToFile(Filename); } @@ -1824,8 +1847,10 @@ void BinaryFunction::dumpGraphToFile(std::string Filename) const { std::error_code EC; raw_fd_ostream of(Filename, EC, sys::fs::F_None); if (EC) { - dbgs() << "BOLT-WARNING: " << EC.message() << ", unable to open " - << Filename << " for output.\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: " << EC.message() << ", unable to open " + << Filename << " for output.\n"; + } return; } dumpGraph(of); @@ -2439,10 +2464,10 @@ bool BinaryFunction::isIdenticalWith(const BinaryFunction &BF) const { ++BBI; } - if (PseudosDiffer) { - errs() << "BOLT-WARNING: functions " << *this << " and "; - errs() << BF << " are identical, but have different"; - errs() << " pseudo instruction sequences.\n"; + if (opts::Verbosity >= 1 && PseudosDiffer) { + errs() << "BOLT-WARNING: functions " << *this << " and " + << BF << " are identical, but have different" + << " pseudo instruction sequences.\n"; } return true; diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index b99add3ecf63..d4a29545b3c1 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -17,6 +17,7 @@ namespace opts { +extern llvm::cl::opt Verbosity; extern llvm::cl::opt PrintAll; extern llvm::cl::opt DumpDotAll; extern llvm::cl::opt PrintReordered; @@ -112,7 +113,7 @@ void 
OptimizeBodylessFunctions::optimizeCalls(BinaryFunction &BF, } if (Target == OriginalTarget) continue; - DEBUG(errs() << "BOLT-DEBUG: Optimizing " << (*BBIt).getName() + DEBUG(dbgs() << "BOLT-DEBUG: Optimizing " << (*BBIt).getName() << " in " << BF << ": replacing call to " << OriginalTarget->getName() << " by call to " << Target->getName() << "\n"); @@ -161,7 +162,7 @@ void InlineSmallFunctions::findInliningCandidates( } } - DEBUG(errs() << "BOLT-DEBUG: " << InliningCandidates.size() + DEBUG(dbgs() << "BOLT-DEBUG: " << InliningCandidates.size() << " inlineable functions.\n"); } @@ -202,7 +203,7 @@ void InlineSmallFunctions::findInliningCandidatesAggressive( InliningCandidates.insert(&Function); } - DEBUG(errs() << "BOLT-DEBUG: " << InliningCandidates.size() + DEBUG(dbgs() << "BOLT-DEBUG: " << InliningCandidates.size() << " inlineable functions.\n"); } @@ -577,7 +578,7 @@ bool InlineSmallFunctions::inlineCallsInFunction( auto NextInstIt = std::next(InstIt); inlineCall(BC, *BB, &Inst, *TargetFunction->begin()); DidInlining = true; - DEBUG(errs() << "BOLT-DEBUG: Inlining call to " + DEBUG(dbgs() << "BOLT-DEBUG: Inlining call to " << *TargetFunction << " in " << Function << "\n"); InstIt = NextInstIt; @@ -648,7 +649,7 @@ bool InlineSmallFunctions::inlineCallsInFunctionAggressive( std::tie(NextBB, NextInstIndex) = inlineCall(BC, Function, BB, InstIndex, *TargetFunction); DidInlining = true; - DEBUG(errs() << "BOLT-DEBUG: Inlining call to " + DEBUG(dbgs() << "BOLT-DEBUG: Inlining call to " << *TargetFunction << " in " << Function << "\n"); InstIndex = NextBB == BB ? 
NextInstIndex : BB->size(); @@ -695,9 +696,9 @@ void InlineSmallFunctions::runOnFunctions( ++ModifiedFunctions; } - DEBUG(errs() << "BOLT-DEBUG: Inlined " << inlinedDynamicCalls << " of " + DEBUG(dbgs() << "BOLT-DEBUG: Inlined " << inlinedDynamicCalls << " of " << totalDynamicCalls << " function calls in the profile.\n"); - DEBUG(errs() << "BOLT-DEBUG: Inlined calls represent " + DEBUG(dbgs() << "BOLT-DEBUG: Inlined calls represent " << (100.0 * inlinedDynamicCalls / totalInlineableCalls) << "% of all inlineable calls in the profile.\n"); } @@ -710,9 +711,11 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) { // in the graph. if (Function.layout_size() > 0) { if (NagUser) { - outs() - << "BOLT-WARNING: Using -eliminate-unreachable is experimental and " - "unsafe for exceptions\n"; + if (opts::Verbosity >= 1) { + errs() + << "BOLT-WARNING: Using -eliminate-unreachable is experimental and " + "unsafe for exceptions\n"; + } NagUser = false; } @@ -742,7 +745,7 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) { } if (opts::PrintAll || opts::PrintUCE) - Function.print(errs(), "after unreachable code elimination", true); + Function.print(outs(), "after unreachable code elimination", true); if (opts::DumpDotAll) Function.dumpGraphForPass("unreachable-code"); @@ -781,7 +784,7 @@ void ReorderBasicBlocks::runOnFunctions( Function.modifyLayout(opts::ReorderBlocks, opts::MinBranchClusters, ShouldSplit); if (opts::PrintAll || opts::PrintReordered) - Function.print(errs(), "after reordering blocks", true); + Function.print(outs(), "after reordering blocks", true); if (opts::DumpDotAll) Function.dumpGraphForPass("reordering"); } @@ -804,8 +807,10 @@ void FixupFunctions::runOnFunctions( // Fix the CFI state. if (!Function.fixCFIState()) { - errs() << "BOLT-WARNING: unable to fix CFI state for function " - << Function << ". 
Skipping.\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: unable to fix CFI state for function " + << Function << ". Skipping.\n"; + } Function.setSimple(false); continue; } @@ -813,7 +818,7 @@ void FixupFunctions::runOnFunctions( // Update exception handling information. Function.updateEHRanges(); if (opts::PrintAll || opts::PrintEHRanges) - Function.print(errs(), "after updating EH ranges", true); + Function.print(outs(), "after updating EH ranges", true); if (opts::DumpDotAll) Function.dumpGraphForPass("update-EH-ranges"); } @@ -920,7 +925,7 @@ void SimplifyConditionalTailCalls::runOnFunctions( // Fix tail calls to reduce branch mispredictions. if (fixTailCalls(BC, Function)) { if (opts::PrintAll || opts::PrintReordered) { - Function.print(errs(), "after tail call patching", true); + Function.print(outs(), "after tail call patching", true); } if (opts::DumpDotAll) { Function.dumpGraphForPass("tail-call-patching"); @@ -928,7 +933,7 @@ void SimplifyConditionalTailCalls::runOnFunctions( } } - outs() << "BOLT: patched " << NumTailCallsPatched + outs() << "BOLT-INFO: patched " << NumTailCallsPatched << " tail calls (" << NumOrigForwardBranches << " forward)" << " from a total of " << NumTailCallCandidates << "\n"; } @@ -951,7 +956,7 @@ void Peepholes::runOnFunctions(BinaryContext &BC, shortenInstructions(BC, Function); if (opts::PrintAll || opts::PrintPeepholes) { - Function.print(errs(), "after peepholes", true); + Function.print(outs(), "after peepholes", true); } if (opts::DumpDotAll) { @@ -1058,7 +1063,7 @@ void SimplifyRODataLoads::runOnFunctions( if (simplifyRODataLoads(BC, Function)) { if (opts::PrintAll || opts::PrintSimplifyROLoads) { - Function.print(errs(), + Function.print(outs(), "after simplifying read-only section loads", true); } @@ -1068,11 +1073,11 @@ void SimplifyRODataLoads::runOnFunctions( } } - outs() << "BOLT: simplified " << NumLoadsSimplified << " out of "; - outs() << NumLoadsFound << " loads from a statically computed 
address.\n"; - outs() << "BOLT: dynamic loads simplified: " << NumDynamicLoadsSimplified; - outs() << "\n"; - outs() << "BOLT: dynamic loads found: " << NumDynamicLoadsFound << "\n"; + outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of " + << NumLoadsFound << " loads from a statically computed address.\n" + << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified + << "\n" + << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n"; } void IdenticalCodeFolding::discoverCallers( @@ -1221,7 +1226,9 @@ void IdenticalCodeFolding::runOnFunctions( Buckets.clear(); Mod.clear(); - errs() << "BOLT-INFO: icf pass " << Iter << "...\n"; + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: icf pass " << Iter << "...\n"; + } uint64_t NumIdenticalFunctions = 0; @@ -1267,28 +1274,30 @@ void IdenticalCodeFolding::runOnFunctions( } } - errs() << "BOLT-INFO: found " << NumIdenticalFunctions; - errs() << " identical functions.\n"; - errs() << "BOLT-INFO: modified " << Mod.size() << " functions.\n"; + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: found " << NumIdenticalFunctions + << " identical functions.\n" + << "BOLT-INFO: modified " << Mod.size() << " functions.\n"; + } NumIdenticalFunctionsFound += NumIdenticalFunctions; ++Iter; } while (!Mod.empty()); - outs() << "BOLT: ICF pass found " << NumIdenticalFunctionsFound; - outs() << " functions identical to some other function.\n"; - outs() << "BOLT: ICF pass folded references to " << NumFunctionsFolded; - outs() << " functions.\n"; - outs() << "BOLT: ICF pass folded " << NumDynamicCallsFolded << " dynamic"; - outs() << " function calls.\n"; - outs() << "BOLT: Removing all identical functions could save "; - outs() << format("%.2lf", (double) BytesSavedEstimate / 1024); - outs() << " KB of code space.\n"; + outs() << "BOLT-INFO: ICF pass found " << NumIdenticalFunctionsFound + << " functions identical to some other function.\n" + << "BOLT-INFO: ICF pass folded references to " << 
NumFunctionsFolded + << " functions.\n" + << "BOLT-INFO: ICF pass folded " << NumDynamicCallsFolded << " dynamic" + << " function calls.\n" + << "BOLT-INFO: Removing all identical functions could save " + << format("%.2lf", (double) BytesSavedEstimate / 1024) + << " KB of code space.\n"; if (opts::PrintAll || opts::PrintICF) { for (auto &I : BFs) { - I.second.print(errs(), "after identical code folding", true); + I.second.print(outs(), "after identical code folding", true); } } } diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp index 67e6f2a3d3f0..179720df586a 100644 --- a/bolt/DWARFRewriter.cpp +++ b/bolt/DWARFRewriter.cpp @@ -27,6 +27,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/Errc.h" @@ -42,6 +43,10 @@ using namespace llvm; using namespace object; using namespace bolt; +namespace opts { +extern cl::opt Verbosity; +} + void RewriteInstance::updateDebugInfo() { SectionPatchers[".debug_abbrev"] = llvm::make_unique(); SectionPatchers[".debug_info"] = llvm::make_unique(); @@ -123,7 +128,7 @@ void RewriteInstance::updateDWARFObjectAddressRanges( return; } - if (DebugRangesOffset == -1U) { + if (opts::Verbosity >= 2 && DebugRangesOffset == -1U) { errs() << "BOLT-WARNING: using invalid DW_AT_range for DIE at offset 0x" << Twine::utohexstr(DIE->getOffset()) << '\n'; } @@ -137,9 +142,11 @@ void RewriteInstance::updateDWARFObjectAddressRanges( const auto *AbbreviationDecl = DIE->getAbbreviationDeclarationPtr(); if (!AbbreviationDecl) { - errs() << "BOLT-WARNING: object's DIE doesn't have an abbreviation: " - << "skipping update. DIE at offset 0x" - << Twine::utohexstr(DIE->getOffset()) << '\n'; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: object's DIE doesn't have an abbreviation: " + << "skipping update. 
DIE at offset 0x" + << Twine::utohexstr(DIE->getOffset()) << '\n'; + } return; } @@ -176,14 +183,19 @@ void RewriteInstance::updateDWARFObjectAddressRanges( (HighPCFormValue.getForm() != dwarf::DW_FORM_addr && HighPCFormValue.getForm() != dwarf::DW_FORM_data8 && HighPCFormValue.getForm() != dwarf::DW_FORM_data4)) { - errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE " - "at offset 0x" << Twine::utohexstr(DIE->getOffset()) << '\n'; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE " + << "at offset 0x" << Twine::utohexstr(DIE->getOffset()) + << "\n"; + } return; } if (LowPCOffset == -1U || (LowPCOffset + 8 != HighPCOffset)) { - errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. " - "Cannot update DIE at offset 0x" - << Twine::utohexstr(DIE->getOffset()) << '\n'; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. " + << "Cannot update DIE at offset 0x" + << Twine::utohexstr(DIE->getOffset()) << '\n'; + } return; } @@ -213,8 +225,10 @@ void RewriteInstance::updateDWARFObjectAddressRanges( ProducerString.back() = '\0'; DebugInfoPatcher->addBinaryPatch(LowPCOffset + 4, ProducerString); } else { - errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x" - << Twine::utohexstr(DIE->getOffset()) << '\n'; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x" + << Twine::utohexstr(DIE->getOffset()) << '\n'; + } } } } @@ -271,7 +285,7 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { OutputLineTable.addLineEntry(MCLineEntry{nullptr, Loc}, FunctionSection); } else { - DEBUG(errs() << "BOLT-DEBUG: Function " << Function + DEBUG(dbgs() << "BOLT-DEBUG: Function " << Function << " has no associated line number information.\n"); } } diff --git a/bolt/DebugData.cpp b/bolt/DebugData.cpp index 42e13b8d1f84..577473a778a3 100644 --- a/bolt/DebugData.cpp +++ b/bolt/DebugData.cpp @@ 
-14,9 +14,14 @@ #include "BinaryFunction.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/Support/CommandLine.h" #include #include +namespace opts { +extern llvm::cl::opt Verbosity; +} + namespace llvm { namespace bolt { @@ -29,10 +34,12 @@ void BasicBlockOffsetRanges::addAddressRange(BinaryFunction &Function, auto FirstBB = Function.getBasicBlockContainingOffset( BeginAddress - Function.getAddress()); if (!FirstBB) { - errs() << "BOLT-WARNING: no basic blocks in function " - << Function << " intersect with debug range [0x" - << Twine::utohexstr(BeginAddress) << ", 0x" - << Twine::utohexstr(EndAddress) << ")\n"; + if (opts::Verbosity >= 2) { + errs() << "BOLT-WARNING: no basic blocks in function " + << Function << " intersect with debug range [0x" + << Twine::utohexstr(BeginAddress) << ", 0x" + << Twine::utohexstr(EndAddress) << ")\n"; + } return; } diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 64154296500f..f81fd12ec361 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -32,18 +32,20 @@ using namespace llvm::dwarf; -namespace llvm { -namespace bolt { - namespace opts { -static cl::opt +extern llvm::cl::opt Verbosity; + +static llvm::cl::opt PrintExceptions("print-exceptions", - cl::desc("print exception handling data"), - cl::Hidden); + llvm::cl::desc("print exception handling data"), + llvm::cl::Hidden); } // namespace opts +namespace llvm { +namespace bolt { + // Read and dump the .gcc_exception_table section entry. 
// // .gcc_except_table section contains a set of Language-Specific Data Areas - @@ -121,12 +123,12 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, } if (opts::PrintExceptions) { - errs() << "[LSDA at 0x" << Twine::utohexstr(getLSDAAddress()) + outs() << "[LSDA at 0x" << Twine::utohexstr(getLSDAAddress()) << " for function " << *this << "]:\n"; - errs() << "LPStart Encoding = " << (unsigned)LPStartEncoding << '\n'; - errs() << "LPStart = 0x" << Twine::utohexstr(LPStart) << '\n'; - errs() << "TType Encoding = " << (unsigned)TTypeEncoding << '\n'; - errs() << "TType End = " << TTypeEnd << '\n'; + outs() << "LPStart Encoding = " << (unsigned)LPStartEncoding << '\n'; + outs() << "LPStart = 0x" << Twine::utohexstr(LPStart) << '\n'; + outs() << "TType Encoding = " << (unsigned)TTypeEncoding << '\n'; + outs() << "TType End = " << TTypeEnd << '\n'; } // Table to store list of indices in type table. Entries are uleb128 values. @@ -147,9 +149,9 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, const uint8_t *ActionTableStart = CallSiteTableEnd; if (opts::PrintExceptions) { - errs() << "CallSite Encoding = " << (unsigned)CallSiteEncoding << '\n'; - errs() << "CallSite table length = " << CallSiteTableLength << '\n'; - errs() << '\n'; + outs() << "CallSite Encoding = " << (unsigned)CallSiteEncoding << '\n'; + outs() << "CallSite table length = " << CallSiteTableLength << '\n'; + outs() << '\n'; } HasEHRanges = CallSitePtr < CallSiteTableEnd; @@ -161,7 +163,7 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, uintptr_t ActionEntry = readULEB128(CallSitePtr); if (opts::PrintExceptions) { - errs() << "Call Site: [0x" << Twine::utohexstr(RangeBase + Start) + outs() << "Call Site: [0x" << Twine::utohexstr(RangeBase + Start) << ", 0x" << Twine::utohexstr(RangeBase + Start + Length) << "); landing pad: 0x" << Twine::utohexstr(LPStart + LandingPad) << "; action entry: 0x" << Twine::utohexstr(ActionEntry) << "\n"; @@ -171,9 +173,11 @@ void 
BinaryFunction::parseLSDA(ArrayRef LSDASectionData, MCSymbol *LPSymbol{nullptr}; if (LandingPad) { if (Instructions.find(LandingPad) == Instructions.end()) { - errs() << "BOLT-WARNING: landing pad " << Twine::utohexstr(LandingPad) - << " not pointing to an instruction in function " - << *this << " - ignoring.\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: landing pad " << Twine::utohexstr(LandingPad) + << " not pointing to an instruction in function " + << *this << " - ignoring.\n"; + } } else { auto Label = Labels.find(LandingPad); if (Label != Labels.end()) { @@ -222,7 +226,7 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, } }; if (opts::PrintExceptions) - errs() << " actions: "; + outs() << " actions: "; const uint8_t *ActionPtr = ActionTableStart + ActionEntry - 1; long long ActionType; long long ActionNext; @@ -232,19 +236,19 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, auto Self = ActionPtr; ActionNext = readSLEB128(ActionPtr); if (opts::PrintExceptions) - errs() << Sep << "(" << ActionType << ", " << ActionNext << ") "; + outs() << Sep << "(" << ActionType << ", " << ActionNext << ") "; if (ActionType == 0) { if (opts::PrintExceptions) - errs() << "cleanup"; + outs() << "cleanup"; } else if (ActionType > 0) { // It's an index into a type table. if (opts::PrintExceptions) { - errs() << "catch type "; - printType(ActionType, errs()); + outs() << "catch type "; + printType(ActionType, outs()); } } else { // ActionType < 0 if (opts::PrintExceptions) - errs() << "filter exception types "; + outs() << "filter exception types "; auto TSep = ""; // ActionType is a negative *byte* offset into *uleb128-encoded* table // of indices with base 1. 
@@ -253,8 +257,8 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, auto TypeIndexTablePtr = TypeIndexTableStart - ActionType - 1; while (auto Index = readULEB128(TypeIndexTablePtr)) { if (opts::PrintExceptions) { - errs() << TSep; - printType(Index, errs()); + outs() << TSep; + printType(Index, outs()); TSep = ", "; } } @@ -268,11 +272,11 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, ActionPtr = Self + ActionNext; } while (ActionNext); if (opts::PrintExceptions) - errs() << '\n'; + outs() << '\n'; } } if (opts::PrintExceptions) - errs() << '\n'; + outs() << '\n'; assert(TypeIndexTableStart + MaxTypeIndexTableOffset <= LSDASectionData.data() + LSDASectionData.size() && @@ -494,11 +498,13 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { const FDE &CurFDE = *I->second; if (Function.getSize() != CurFDE.getAddressRange()) { - errs() << "BOLT-WARNING: CFI information size mismatch for function \"" - << Function << "\"" - << format(": Function size is %dB, CFI covers " - "%dB\n", - Function.getSize(), CurFDE.getAddressRange()); + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: CFI information size mismatch for function \"" + << Function << "\"" + << format(": Function size is %dB, CFI covers " + "%dB\n", + Function.getSize(), CurFDE.getAddressRange()); + } return false; } @@ -609,24 +615,34 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { break; case DW_CFA_val_offset_sf: case DW_CFA_val_offset: - errs() << "BOLT-WARNING: DWARF val_offset() unimplemented\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: DWARF val_offset() unimplemented\n"; + } return false; case DW_CFA_expression: case DW_CFA_def_cfa_expression: case DW_CFA_val_expression: - errs() << "BOLT-WARNING: DWARF CFA expressions unimplemented\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: DWARF CFA expressions unimplemented\n"; + } return false; case DW_CFA_MIPS_advance_loc8: - errs() << "BOLT-WARNING: 
DW_CFA_MIPS_advance_loc unimplemented\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: DW_CFA_MIPS_advance_loc unimplemented\n"; + } return false; case DW_CFA_GNU_window_save: case DW_CFA_lo_user: case DW_CFA_hi_user: - errs() << - "BOLT-WARNING: DW_CFA_GNU_* and DW_CFA_*_user unimplemented\n"; + if (opts::Verbosity >= 1) { + errs() << + "BOLT-WARNING: DW_CFA_GNU_* and DW_CFA_*_user unimplemented\n"; + } return false; default: - errs() << "BOLT-WARNING: Unrecognized CFI instruction\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: Unrecognized CFI instruction\n"; + } return false; } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 42c5eed87532..ec7db888330e 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -66,6 +66,22 @@ namespace opts { static cl::opt OutputFilename("o", cl::desc(""), cl::Required); +// The default verbosity level (0) is pretty terse, level 1 is fairly +// verbose and usually prints some informational message for every +// function processed. Level 2 is for the noisiest of messages and +// often prints a message per basic block. +// Error messages should never be suppressed by the verbosity level. +// Only warnings and info messages should be affected. +// +// The rational behind stream usage is as follows: +// outs() for info and debugging controlled by command line flags. +// errs() for errors and warnings. +// dbgs() for output within DEBUG(). 
+cl::opt +Verbosity("v", + cl::desc("set verbosity level for diagnostic output"), + cl::init(0)); + static cl::list BreakFunctionNames("break-funcs", cl::CommaSeparated, @@ -360,14 +376,14 @@ std::unique_ptr createBinaryContext( *TheTriple, Error); if (!TheTarget) { - errs() << "BOLT: " << Error; + errs() << "BOLT-ERROR: " << Error; return nullptr; } std::unique_ptr MRI( TheTarget->createMCRegInfo(TripleName)); if (!MRI) { - errs() << "error: no register info for target " << TripleName << "\n"; + errs() << "BOLT-ERROR: no register info for target " << TripleName << "\n"; return nullptr; } @@ -375,20 +391,20 @@ std::unique_ptr createBinaryContext( std::unique_ptr AsmInfo( TheTarget->createMCAsmInfo(*MRI, TripleName)); if (!AsmInfo) { - errs() << "error: no assembly info for target " << TripleName << "\n"; + errs() << "BOLT-ERROR: no assembly info for target " << TripleName << "\n"; return nullptr; } std::unique_ptr STI( TheTarget->createMCSubtargetInfo(TripleName, "", "")); if (!STI) { - errs() << "error: no subtarget info for target " << TripleName << "\n"; + errs() << "BOLT-ERROR: no subtarget info for target " << TripleName << "\n"; return nullptr; } std::unique_ptr MII(TheTarget->createMCInstrInfo()); if (!MII) { - errs() << "error: no instruction info for target " << TripleName << "\n"; + errs() << "BOLT-ERROR: no instruction info for target " << TripleName << "\n"; return nullptr; } @@ -403,14 +419,14 @@ std::unique_ptr createBinaryContext( TheTarget->createMCDisassembler(*STI, *Ctx)); if (!DisAsm) { - errs() << "error: no disassembler for target " << TripleName << "\n"; + errs() << "BOLT-ERROR: no disassembler for target " << TripleName << "\n"; return nullptr; } std::unique_ptr MIA( TheTarget->createMCInstrAnalysis(MII.get())); if (!MIA) { - errs() << "error: failed to create instruction analysis for target" + errs() << "BOLT-ERROR: failed to create instruction analysis for target" << TripleName << "\n"; return nullptr; } @@ -420,7 +436,7 @@ std::unique_ptr 
createBinaryContext( TheTarget->createMCInstPrinter(Triple(TripleName), AsmPrinterVariant, *AsmInfo, *MII, *MRI)); if (!InstructionPrinter) { - errs() << "error: no instruction printer for target " << TripleName + errs() << "BOLT-ERROR: no instruction printer for target " << TripleName << '\n'; return nullptr; } @@ -507,7 +523,7 @@ void RewriteInstance::discoverStorage() { assert(NextAvailableAddress && NextAvailableOffset && "no PT_LOAD pheader seen"); - errs() << "BOLT-INFO: first alloc address is 0x" + outs() << "BOLT-INFO: first alloc address is 0x" << Twine::utohexstr(FirstAllocAddress) << '\n'; FirstNonAllocatableOffset = NextAvailableOffset; @@ -534,7 +550,7 @@ void RewriteInstance::discoverStorage() { assert(NextAvailableOffset == NextAvailableAddress - FirstAllocAddress && "PHDR table address calculation error"); - errs() << "BOLT-INFO: creating new program header table at address 0x" + outs() << "BOLT-INFO: creating new program header table at address 0x" << Twine::utohexstr(NextAvailableAddress) << ", offset 0x" << Twine::utohexstr(NextAvailableOffset) << '\n'; @@ -559,7 +575,7 @@ void RewriteInstance::discoverStorage() { void RewriteInstance::run() { if (!BC) { - errs() << "failed to create a binary context\n"; + errs() << "BOLT-ERROR: failed to create a binary context\n"; return; } @@ -610,9 +626,11 @@ void RewriteInstance::run() { auto FunctionIt = BinaryFunctions.find(Address); assert(FunctionIt != BinaryFunctions.end() && "Invalid large function address."); - errs() << "BOLT-WARNING: Function " << FunctionIt->second - << " is larger than its orginal size: emitting again marking it " - << "as not simple.\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: Function " << FunctionIt->second + << " is larger than its orginal size: emitting again marking it " + << "as not simple.\n"; + } FunctionIt->second.setSimple(false); } @@ -664,7 +682,7 @@ void RewriteInstance::discoverFileObjects() { check_error(AddressOrErr.getError(), "cannot get symbol 
address"); uint64_t Address = *AddressOrErr; if (Address == 0) { - if (Symbol.getType() == SymbolRef::ST_Function) + if (opts::Verbosity >= 1 && Symbol.getType() == SymbolRef::ST_Function) errs() << "BOLT-WARNING: function with 0 address seen\n"; continue; } @@ -764,7 +782,8 @@ void RewriteInstance::discoverFileObjects() { auto &PrevFDE = *FDEI->second; auto PrevStart = PrevFDE.getInitialLocation(); auto PrevLength = PrevFDE.getAddressRange(); - if (Address > PrevStart && Address < PrevStart + PrevLength) { + if (opts::Verbosity >= 1 && + Address > PrevStart && Address < PrevStart + PrevLength) { errs() << "BOLT-WARNING: function " << UniqueName << " is in conflict with FDE [" << Twine::utohexstr(PrevStart) << ", " @@ -775,9 +794,11 @@ void RewriteInstance::discoverFileObjects() { } } else if (FDE.getAddressRange() != SymbolSize) { // Function addresses match but sizes differ. - errs() << "BOLT-WARNING: sizes differ for function " << UniqueName - << ". FDE : " << FDE.getAddressRange() - << "; symbol table : " << SymbolSize << ". Skipping.\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: sizes differ for function " << UniqueName + << ". FDE : " << FDE.getAddressRange() + << "; symbol table : " << SymbolSize << ". Skipping.\n"; + } // Create maximum size non-simple function. IsSimple = false; @@ -791,7 +812,7 @@ void RewriteInstance::discoverFileObjects() { BF = &BFI->second; // Duplicate function name. Make sure everything matches before we add // an alternative name. - if (SymbolSize != BF->getSize()) { + if (opts::Verbosity >= 1 && SymbolSize != BF->getSize()) { errs() << "BOLT-WARNING: size mismatch for duplicate entries " << UniqueName << ':' << SymbolSize << " and " << *BF << ':' << BF->getSize() << '\n'; @@ -906,8 +927,10 @@ void RewriteInstance::disassembleFunctions() { "wrong section for function"); if (!Section.isText() || Section.isVirtual() || !Section.getSize()) { // When could it happen? 
- errs() << "BOLT: corresponding section is non-executable or empty " - << "for function " << Function; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: corresponding section is non-executable or empty " + << "for function " << Function; + } continue; } @@ -926,8 +949,10 @@ void RewriteInstance::disassembleFunctions() { uint64_t SectionEnd = Function.getSection().getAddress() + Function.getSection().getSize(); if (SectionEnd > SymRefI->first) { - errs() << "BOLT-WARNING: symbol after " << Function - << " should not be in the same section.\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: symbol after " << Function + << " should not be in the same section.\n"; + } MaxSize = 0; } else { MaxSize = SectionEnd - Function.getAddress(); @@ -935,8 +960,10 @@ void RewriteInstance::disassembleFunctions() { } if (MaxSize < Function.getSize()) { - errs() << "BOLT-WARNING: symbol seen in the middle of the function " - << Function << ". Skipping.\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: symbol seen in the middle of the function " + << Function << ". 
Skipping.\n"; + } Function.setSimple(false); continue; } @@ -966,7 +993,7 @@ void RewriteInstance::disassembleFunctions() { continue; if (opts::PrintAll || opts::PrintDisasm) - Function.print(errs(), "after disassembly", true); + Function.print(outs(), "after disassembly", true); if (!Function.isSimple()) continue; @@ -974,8 +1001,10 @@ void RewriteInstance::disassembleFunctions() { // Fill in CFI information for this function if (EHFrame->ParseError.empty()) { if (!CFIRdWrt->fillCFIInfoFor(Function)) { - errs() << "BOLT-WARNING: unable to fill CFI for function " - << Function << '\n'; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: unable to fill CFI for function " + << Function << '\n'; + } Function.setSimple(false); continue; } @@ -989,14 +1018,14 @@ void RewriteInstance::disassembleFunctions() { continue; if (opts::PrintAll || opts::PrintCFG) - Function.print(errs(), "after building cfg", true); + Function.print(outs(), "after building cfg", true); if (opts::DumpDotAll) Function.dumpGraphForPass("build-cfg"); if (opts::PrintLoopInfo) { Function.calculateLoopInfo(); - Function.printLoopInfo(errs()); + Function.printLoopInfo(outs()); } TotalScore += Function.getFunctionScore(); @@ -1009,9 +1038,11 @@ void RewriteInstance::disassembleFunctions() { for (auto Addr : BC->InterproceduralReferences) { auto *ContainingFunction = getBinaryFunctionContainingAddress(Addr); if (ContainingFunction && ContainingFunction->getAddress() != Addr) { - errs() << "BOLT-WARNING: Function " << ContainingFunction - << " has internal BBs that are target of a reference located in " - "another function. Skipping the function.\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: Function " << ContainingFunction + << " has internal BBs that are target of a reference located in " + << "another function. 
Skipping the function.\n"; + } ContainingFunction->setSimple(false); } } @@ -1032,7 +1063,7 @@ void RewriteInstance::disassembleFunctions() { ++NumStaleProfileFunctions; } - errs() << "BOLT-INFO: " + outs() << "BOLT-INFO: " << ProfiledFunctions.size() + NumStaleProfileFunctions << " functions out of " << NumSimpleFunctions << " simple functions (" << format("%.1f", @@ -1040,7 +1071,7 @@ void RewriteInstance::disassembleFunctions() { (float) NumSimpleFunctions * 100.0f) << "%) have non-empty execution profile.\n"; if (NumStaleProfileFunctions) { - errs() << "BOLT-INFO: " << NumStaleProfileFunctions + outs() << "BOLT-INFO: " << NumStaleProfileFunctions << format(" (%.1f%) ", NumStaleProfileFunctions / (float) NumSimpleFunctions * 100.0f) << " function" << (NumStaleProfileFunctions == 1 ? "" : "s") @@ -1048,16 +1079,18 @@ void RewriteInstance::disassembleFunctions() { } if (ProfiledFunctions.size() > 10) { - errs() << "BOLT-INFO: top called functions are:\n"; - std::sort(ProfiledFunctions.begin(), ProfiledFunctions.end(), - [](BinaryFunction *A, BinaryFunction *B) { - return B->getExecutionCount() < A->getExecutionCount(); - } - ); - auto SFI = ProfiledFunctions.begin(); - for (int i = 0; i < 100 && SFI != ProfiledFunctions.end(); ++SFI, ++i) { - errs() << " " << *SFI << " : " - << (*SFI)->getExecutionCount() << '\n'; + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: top called functions are:\n"; + std::sort(ProfiledFunctions.begin(), ProfiledFunctions.end(), + [](BinaryFunction *A, BinaryFunction *B) { + return B->getExecutionCount() < A->getExecutionCount(); + } + ); + auto SFI = ProfiledFunctions.begin(); + for (int i = 0; i < 100 && SFI != ProfiledFunctions.end(); ++SFI, ++i) { + outs() << " " << **SFI << " : " + << (*SFI)->getExecutionCount() << '\n'; + } } } } @@ -1426,7 +1459,9 @@ void RewriteInstance::emitFunctions() { FailedAddresses.emplace_back(Function.getAddress()); } } else { - errs() << "BOLT: cannot remap function " << Function << "\n"; + if 
(opts::Verbosity >= 2) { + errs() << "BOLT-WARNING: cannot remap function " << Function << "\n"; + } FailedAddresses.emplace_back(Function.getAddress()); } @@ -1453,7 +1488,9 @@ void RewriteInstance::emitFunctions() { NextAvailableAddress += Function.cold().getImageSize(); } else { - errs() << "BOLT: cannot remap function " << Function << "\n"; + if (opts::Verbosity >= 2) { + errs() << "BOLT-WARNING: cannot remap function " << Function << "\n"; + } FailedAddresses.emplace_back(Function.getAddress()); } } @@ -1494,7 +1531,9 @@ void RewriteInstance::emitFunctions() { NextAvailableAddress += SI.Size; } else { - errs() << "BOLT: cannot remap " << SectionName << '\n'; + if (opts::Verbosity >= 2) { + errs() << "BOLT-WARNING: cannot remap " << SectionName << '\n'; + } } } @@ -1794,8 +1833,9 @@ void RewriteInstance::patchELFSectionHeaderTable() { // Ignore function sections. if (SI.IsCode && SMII.first != ".bolt.text") continue; - errs() << "BOLT-INFO: writing section header for " - << SMII.first << '\n'; + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: writing section header for " << SMII.first << '\n'; + } Elf_Shdr NewSection; NewSection.sh_name = SI.ShName; NewSection.sh_type = ELF::SHT_PROGBITS; @@ -1905,18 +1945,22 @@ void RewriteInstance::rewriteFile() { continue; if (Function.getImageSize() > Function.getMaxSize()) { - errs() << "BOLT-WARNING: new function size (0x" - << Twine::utohexstr(Function.getImageSize()) - << ") is larger than maximum allowed size (0x" - << Twine::utohexstr(Function.getMaxSize()) - << ") for function " << Function << '\n'; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: new function size (0x" + << Twine::utohexstr(Function.getImageSize()) + << ") is larger than maximum allowed size (0x" + << Twine::utohexstr(Function.getMaxSize()) + << ") for function " << Function << '\n'; + } FailedAddresses.emplace_back(Function.getAddress()); continue; } OverwrittenScore += Function.getFunctionScore(); // Overwrite function in the output 
file. - outs() << "BOLT: rewriting function \"" << Function << "\"\n"; + if (opts::Verbosity >= 2) { + outs() << "BOLT: rewriting function \"" << Function << "\"\n"; + } Out->os().pwrite(reinterpret_cast(Function.getImageAddress()), Function.getImageSize(), Function.getFileOffset()); @@ -1938,7 +1982,9 @@ void RewriteInstance::rewriteFile() { } // Write cold part - outs() << "BOLT: rewriting function \"" << Function << "\" (cold part)\n"; + if (opts::Verbosity >= 2) { + outs() << "BOLT: rewriting function \"" << Function << "\" (cold part)\n"; + } Out->os().pwrite(reinterpret_cast(Function.cold().getImageAddress()), Function.cold().getImageSize(), Function.cold().getFileOffset()); @@ -1967,7 +2013,9 @@ void RewriteInstance::rewriteFile() { SectionInfo &SI = SMII.second; if (SI.IsCode) continue; - outs() << "BOLT: writing new section " << SMII.first << '\n'; + if (opts::Verbosity >= 1) { + outs() << "BOLT: writing new section " << SMII.first << '\n'; + } Out->os().pwrite(reinterpret_cast(SI.AllocAddress), SI.Size, SI.FileOffset); @@ -1977,7 +2025,9 @@ void RewriteInstance::rewriteFile() { auto SMII = SectionMM->SectionMapInfo.find(".eh_frame"); if (SMII != SectionMM->SectionMapInfo.end()) { auto &EHFrameSecInfo = SMII->second; - outs() << "BOLT: writing a new .eh_frame_hdr\n"; + if (opts::Verbosity >= 1) { + outs() << "BOLT: writing a new .eh_frame_hdr\n"; + } if (FrameHdrAlign > 1) { auto PaddingSize = OffsetToAlignment(NextAvailableAddress, FrameHdrAlign); for (unsigned I = 0; I < PaddingSize; ++I) From ff0e01c83720d46e63bc00746ef3a28f84f9deb8 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 6 Sep 2016 13:19:26 -0700 Subject: [PATCH 160/904] Fix tail call conversion and test cases. Summary: A previous diff accidentally disabled tail call conversion. Additionally some test cases relied on output of "-v=2". Fix those. 
(cherry picked from commit f815870afb0edc6b3f0851b872b48760e86a73fe) --- bolt/BinaryFunction.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 99e7454a8f4a..507f76b1ad8d 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -585,8 +585,8 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // Assign proper opcode for tail calls, so that they could be // treated as calls. if (!IsCall) { - if (opts::Verbosity >= 2 && - !MIA->convertJmpToTailCall(Instruction)) { + if (!MIA->convertJmpToTailCall(Instruction) && + opts::Verbosity >= 2) { assert(IsCondBranch && "unknown tail call instruction"); errs() << "BOLT-WARNING: conditional tail call detected in " << "function " << *this << " at 0x" From 9b8c62f2ce5c8616dbe9d5234ca24f47ce54cf5e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 2 Sep 2016 11:58:53 -0700 Subject: [PATCH 161/904] Inlining fixes/enhancements Summary: A number of fixes/enhancements to inline-small-functions - Fixed size estimateHotSize to use computeCodeSize instead of the original layout offsets. - Added -print-inline option to dump CFGs for functions that have been modified by inlining. - Added flag to force consideration of functions without any profiling info (mostly for testing) - Updated debug line info for inlined functions. - Ignore the number of pseudo instructions when checking for candidates of suitable size. 
Misc changes - Moved most print flags to BinaryPasses.cpp (cherry picked from commit 46d162d378b8661453c771dc2e4602af8931b52a) --- bolt/BinaryContext.cpp | 18 +++- bolt/BinaryContext.h | 5 + bolt/BinaryFunction.h | 4 +- bolt/BinaryPasses.cpp | 194 +++++++++++++++++++++++++++------------ bolt/BinaryPasses.h | 8 +- bolt/RewriteInstance.cpp | 78 +++++++--------- 6 files changed, 200 insertions(+), 107 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index f9befb191671..5b9fe8e1caa6 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -152,6 +152,22 @@ void findSubprograms(DWARFCompileUnit *Unit, } // namespace +unsigned BinaryContext::addDebugFilenameToUnit(const uint32_t DestCUID, + const uint32_t SrcCUID, + unsigned FileIndex) { + auto SrcUnit = DwCtx->getCompileUnitForOffset(SrcCUID); + auto LineTable = DwCtx->getLineTableForUnit(SrcUnit); + const auto &FileNames = LineTable->Prologue.FileNames; + // Dir indexes start at 1, as DWARF file numbers, and a dir index 0 + // means empty dir. + assert(FileIndex > 0 && FileIndex <= FileNames.size() && + "FileIndex out of range for the compilation unit."); + const char *Dir = FileNames[FileIndex - 1].DirIdx ? + LineTable->Prologue.IncludeDirectories[FileNames[FileIndex - 1].DirIdx - 1] : + ""; + return Ctx->getDwarfFile(Dir, FileNames[FileIndex - 1].Name, 0, DestCUID); +} + void BinaryContext::preprocessDebugInfo( std::map &BinaryFunctions) { // Populate MCContext with DWARF files. @@ -165,7 +181,7 @@ void BinaryContext::preprocessDebugInfo( const char *Dir = FileNames[I].DirIdx ? 
LineTable->Prologue.IncludeDirectories[FileNames[I].DirIdx - 1] : ""; - Ctx->getDwarfFile(Dir, FileNames[I].Name, I + 1, CUID); + Ctx->getDwarfFile(Dir, FileNames[I].Name, 0, CUID); } } diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 007fe2d6825d..523a995fd3e4 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -193,6 +193,11 @@ class BinaryContext { void preprocessFunctionDebugInfo( std::map &BinaryFunctions); + /// Add a filename entry from SrcCUID to DestCUID. + unsigned addDebugFilenameToUnit(const uint32_t DestCUID, + const uint32_t SrcCUID, + unsigned FileIndex); + /// Compute the native code size for a range of instructions. /// Note: this can be imprecise wrt the final binary since happening prior to /// relaxation, as well as wrt the original binary because of opcode diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index c2b91916bd54..ba1e8290a5ab 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -1080,11 +1080,13 @@ class BinaryFunction : public AddressRangesOwner { /// This is a very rough estimate, as with C++ exceptions there are /// blocks we don't move, and it makes no attempt at estimating the size /// of the added/removed branch instructions. + /// Note that this size is optimistic and the actual size may increase + /// after relaxation. 
size_t estimateHotSize() const { size_t Estimate = 0; for (const auto *BB : BasicBlocksLayout) { if (BB->ExecutionCount != 0) { - Estimate += getBasicBlockOriginalSize(BB); + Estimate += BC.computeCodeSize(BB->begin(), BB->end()); } } return Estimate; diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index d4a29545b3c1..4bdb5c9202eb 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -15,50 +15,94 @@ #define DEBUG_TYPE "bolt" +using namespace llvm; + namespace opts { -extern llvm::cl::opt Verbosity; -extern llvm::cl::opt PrintAll; -extern llvm::cl::opt DumpDotAll; -extern llvm::cl::opt PrintReordered; -extern llvm::cl::opt PrintEHRanges; -extern llvm::cl::opt PrintUCE; -extern llvm::cl::opt PrintPeepholes; -extern llvm::cl::opt PrintSimplifyROLoads; -extern llvm::cl::opt PrintICF; -extern llvm::cl::opt SplitFunctions; -extern bool shouldProcess(const llvm::bolt::BinaryFunction &Function); - -static llvm::cl::opt +extern cl::opt Verbosity; +extern cl::opt PrintAll; +extern cl::opt DumpDotAll; +extern cl::opt SplitFunctions; +extern bool shouldProcess(const bolt::BinaryFunction &Function); + +static cl::opt +PrintReordered("print-reordered", + cl::desc("print functions after layout optimization"), + cl::Hidden); + +static cl::opt +PrintEHRanges("print-eh-ranges", + cl::desc("print function with updated exception ranges"), + cl::Hidden); + +static cl::opt +PrintUCE("print-uce", + cl::desc("print functions after unreachable code elimination"), + cl::Hidden); + +static cl::opt +PrintPeepholes("print-peepholes", + cl::desc("print functions after peephole optimization"), + cl::Hidden); + +static cl::opt +PrintSimplifyROLoads("print-simplify-rodata-loads", + cl::desc("print functions after simplification of RO data" + " loads"), + cl::Hidden); + +static cl::opt +PrintICF("print-icf", + cl::desc("print functions after ICF optimization"), + cl::Hidden); + +static cl::opt +PrintInline("print-inline", + cl::desc("print functions after inlining 
optimization"), + cl::Hidden); + +static cl::list +ForceInlineFunctions("force-inline", + cl::CommaSeparated, + cl::desc("list of functions to always consider " + "for inlining"), + cl::value_desc("func1,func2,func3,...")); + +static cl::opt +AggressiveInlining("aggressive-inlining", + cl::desc("perform aggressive inlining"), + cl::Hidden); + +static cl::opt ReorderBlocks( "reorder-blocks", - llvm::cl::desc("change layout of basic blocks in a function"), - llvm::cl::init(llvm::bolt::BinaryFunction::LT_NONE), - llvm::cl::values(clEnumValN(llvm::bolt::BinaryFunction::LT_NONE, - "none", - "do not reorder basic blocks"), - clEnumValN(llvm::bolt::BinaryFunction::LT_REVERSE, - "reverse", - "layout blocks in reverse order"), - clEnumValN(llvm::bolt::BinaryFunction::LT_OPTIMIZE, - "normal", - "perform optimal layout based on profile"), - clEnumValN(llvm::bolt::BinaryFunction::LT_OPTIMIZE_BRANCH, - "branch-predictor", - "perform optimal layout prioritizing branch " - "predictions"), - clEnumValN(llvm::bolt::BinaryFunction::LT_OPTIMIZE_CACHE, - "cache", - "perform optimal layout prioritizing I-cache " - "behavior"), - clEnumValEnd)); - -static llvm::cl::opt + cl::desc("change layout of basic blocks in a function"), + cl::init(bolt::BinaryFunction::LT_NONE), + cl::values(clEnumValN(bolt::BinaryFunction::LT_NONE, + "none", + "do not reorder basic blocks"), + clEnumValN(bolt::BinaryFunction::LT_REVERSE, + "reverse", + "layout blocks in reverse order"), + clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE, + "normal", + "perform optimal layout based on profile"), + clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE_BRANCH, + "branch-predictor", + "perform optimal layout prioritizing branch " + "predictions"), + clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE_CACHE, + "cache", + "perform optimal layout prioritizing I-cache " + "behavior"), + clEnumValEnd)); + +static cl::opt MinBranchClusters( "min-branch-clusters", - llvm::cl::desc("use a modified clustering algorithm geared towards " - 
"minimizing branches"), - llvm::cl::Hidden); + cl::desc("use a modified clustering algorithm geared towards " + "minimizing branches"), + cl::Hidden); } // namespace opts @@ -152,10 +196,8 @@ void InlineSmallFunctions::findInliningCandidates( auto &BB = *Function.begin(); const auto &LastInstruction = *BB.rbegin(); // Check if the function is small enough and doesn't do a tail call. - // The size we use includes pseudo-instructions but here they shouldn't - // matter. So some opportunities may be missed because of this. if (BB.size() > 0 && - BB.size() <= kMaxInstructions && + (BB.size() - BB.getNumPseudos()) <= kMaxInstructions && BC.MIA->isReturn(LastInstruction) && !BC.MIA->isTailCall(LastInstruction)) { InliningCandidates.insert(&Function); @@ -351,8 +393,11 @@ InlineSmallFunctions::inlineCall( const MCSymbol *OldFTLabel = nullptr; MCInst *CondBranch = nullptr; MCInst *UncondBranch = nullptr; - assert(BC.MIA->analyzeBranch(Instruction, OldTargetLabel, OldFTLabel, - CondBranch, UncondBranch)); + const bool Result = BC.MIA->analyzeBranch(Instruction, OldTargetLabel, + OldFTLabel, CondBranch, + UncondBranch); + assert(Result && + "analyzeBranch failed on instruction guaranteed to be a branch"); assert(OldTargetLabel); const MCSymbol *NewTargetLabel = nullptr; for (const auto SuccBB : InlinedFunctionBB->successors()) { @@ -543,7 +588,7 @@ bool InlineSmallFunctions::inlineCallsInFunction( for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ++InstIt) { auto &Inst = *InstIt; if (BC.MIA->isCall(Inst)) { - totalDynamicCalls += BB->getExecutionCount(); + TotalDynamicCalls += BB->getExecutionCount(); } } } @@ -569,12 +614,12 @@ bool InlineSmallFunctions::inlineCallsInFunction( bool CallToInlineableFunction = InliningCandidates.count(TargetFunction); - totalInlineableCalls += + TotalInlineableCalls += CallToInlineableFunction * BB->getExecutionCount(); if (CallToInlineableFunction && TargetFunction->getSize() + ExtraSize - + Function.estimateHotSize() < 
Function.getMaxSize()) { + + Function.estimateHotSize() < Function.getMaxSize()) { auto NextInstIt = std::next(InstIt); inlineCall(BC, *BB, &Inst, *TargetFunction->begin()); DidInlining = true; @@ -583,7 +628,7 @@ bool InlineSmallFunctions::inlineCallsInFunction( << Function << "\n"); InstIt = NextInstIt; ExtraSize += TargetFunction->getSize(); - inlinedDynamicCalls += BB->getExecutionCount(); + InlinedDynamicCalls += BB->getExecutionCount(); continue; } } @@ -611,7 +656,7 @@ bool InlineSmallFunctions::inlineCallsInFunctionAggressive( for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ++InstIt) { auto &Inst = *InstIt; if (BC.MIA->isCall(Inst)) { - totalDynamicCalls += BB->getExecutionCount(); + TotalDynamicCalls += BB->getExecutionCount(); } } } @@ -638,7 +683,7 @@ bool InlineSmallFunctions::inlineCallsInFunctionAggressive( bool CallToInlineableFunction = InliningCandidates.count(TargetFunction); - totalInlineableCalls += + TotalInlineableCalls += CallToInlineableFunction * BB->getExecutionCount(); if (CallToInlineableFunction && @@ -655,7 +700,7 @@ bool InlineSmallFunctions::inlineCallsInFunctionAggressive( InstIndex = NextBB == BB ? NextInstIndex : BB->size(); InstIt = NextBB == BB ? 
BB->begin() + NextInstIndex : BB->end(); ExtraSize += TargetFunction->getSize(); - inlinedDynamicCalls += BB->getExecutionCount(); + InlinedDynamicCalls += BB->getExecutionCount(); continue; } } @@ -669,20 +714,35 @@ bool InlineSmallFunctions::inlineCallsInFunctionAggressive( return DidInlining; } +bool InlineSmallFunctions::mustConsider(const BinaryFunction &BF) { + for (auto &Name : opts::ForceInlineFunctions) { + if (BF.hasName(Name)) + return true; + } + return false; +} + void InlineSmallFunctions::runOnFunctions( BinaryContext &BC, std::map &BFs, std::set &) { - findInliningCandidates(BC, BFs); + + if (opts::AggressiveInlining) + findInliningCandidatesAggressive(BC, BFs); + else + findInliningCandidates(BC, BFs); std::vector ConsideredFunctions; + std::vector Modified; for (auto &It : BFs) { auto &Function = It.second; if (!Function.isSimple() || !opts::shouldProcess(Function)) continue; - if (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE) + if (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE && + !mustConsider(Function)) continue; ConsideredFunctions.push_back(&Function); + Modified.push_back(false); } std::sort(ConsideredFunctions.begin(), ConsideredFunctions.end(), [](BinaryFunction *A, BinaryFunction *B) { @@ -692,14 +752,34 @@ void InlineSmallFunctions::runOnFunctions( for (unsigned i = 0; i < ConsideredFunctions.size() && ModifiedFunctions <= kMaxFunctions; ++i) { auto &Function = *ConsideredFunctions[i]; - if (inlineCallsInFunction(BC, Function)) + + const bool DidInline = opts::AggressiveInlining + ? 
inlineCallsInFunctionAggressive(BC, Function) + : inlineCallsInFunction(BC, Function); + + if (DidInline) { + Modified[i] = true; ++ModifiedFunctions; + } + } + + if (opts::PrintAll || opts::PrintInline || opts::DumpDotAll) { + for (unsigned i = 0; i < ConsideredFunctions.size(); ++i) { + if (Modified[i]) { + const auto *Function = ConsideredFunctions[i]; + if (opts::PrintAll || opts::PrintInline) + Function->print(errs(), "after inlining", true); + + if (opts::DumpDotAll) + Function->dumpGraphForPass("inlining"); + } + } } - DEBUG(dbgs() << "BOLT-DEBUG: Inlined " << inlinedDynamicCalls << " of " - << totalDynamicCalls << " function calls in the profile.\n"); - DEBUG(dbgs() << "BOLT-DEBUG: Inlined calls represent " - << (100.0 * inlinedDynamicCalls / totalInlineableCalls) + DEBUG(dbgs() << "BOLT-INFO: Inlined " << InlinedDynamicCalls << " of " + << TotalDynamicCalls << " function calls in the profile.\n" + << "BOLT-INFO: Inlined calls represent " + << format("%.1f", 100.0 * InlinedDynamicCalls / TotalInlineableCalls) << "% of all inlineable calls in the profile.\n"); } diff --git a/bolt/BinaryPasses.h b/bolt/BinaryPasses.h index bd34fa7aa24b..471be11cf3c8 100644 --- a/bolt/BinaryPasses.h +++ b/bolt/BinaryPasses.h @@ -72,9 +72,11 @@ class InlineSmallFunctions : public BinaryFunctionPass { static const unsigned kMaxFunctions = 30000; /// Statistics collected for debugging. 
- uint64_t totalDynamicCalls = 0; - uint64_t inlinedDynamicCalls = 0; - uint64_t totalInlineableCalls = 0; + uint64_t TotalDynamicCalls = 0; + uint64_t InlinedDynamicCalls = 0; + uint64_t TotalInlineableCalls = 0; + + static bool mustConsider(const BinaryFunction &BF); void findInliningCandidates(BinaryContext &BC, const std::map &BFs); diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index ec7db888330e..e2a64a6882c8 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -172,41 +172,10 @@ static cl::opt PrintLoopInfo("print-loops", cl::desc("print loop related information"), cl::Hidden); -cl::opt -PrintUCE("print-uce", - cl::desc("print functions after unreachable code elimination"), - cl::Hidden); - -cl::opt -PrintPeepholes("print-peepholes", - cl::desc("print functions after peephole optimization"), - cl::Hidden); - static cl::opt PrintDisasm("print-disasm", cl::desc("print function after disassembly"), cl::Hidden); -cl::opt -PrintEHRanges("print-eh-ranges", - cl::desc("print function with updated exception ranges"), - cl::Hidden); - -cl::opt -PrintSimplifyROLoads("print-simplify-rodata-loads", - cl::desc("print functions after simplification of RO data" - " loads"), - cl::Hidden); - -cl::opt -PrintReordered("print-reordered", - cl::desc("print functions after layout optimization"), - cl::Hidden); - -cl::opt -PrintICF("print-icf", - cl::desc("print functions after ICF optimization"), - cl::Hidden); - static cl::opt KeepTmp("keep-tmp", cl::desc("preserve intermediate .o file"), @@ -1234,6 +1203,7 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, } // Emit code. 
+ auto ULT = Function.getDWARFUnitLineTable(); int64_t CurrentGnuArgsSize = 0; for (auto BB : Function.layout()) { if (EmitColdPart != BB->isCold()) @@ -1264,28 +1234,46 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, auto RowReference = DebugLineTableRowRef::fromSMLoc(Instr.getLoc()); if (RowReference != DebugLineTableRowRef::NULL_ROW && Instr.getLoc().getPointer() != LastLocSeen.getPointer()) { - auto ULT = Function.getDWARFUnitLineTable(); auto Unit = ULT.first; auto OriginalLineTable = ULT.second; + const auto OrigUnitID = Unit->getOffset(); + unsigned NewFilenum = 0; + + // If the CU id from the current instruction location does not + // match the CU id from the current function, it means that we + // have come across some inlined code. We must look up the CU + // for the instruction's original function and get the line table + // from that. We also update the current CU debug info with the + // filename of the inlined function. + if (RowReference.DwCompileUnitIndex != OrigUnitID) { + Unit = + BC.DwCtx->getCompileUnitForOffset(RowReference.DwCompileUnitIndex); + OriginalLineTable = BC.DwCtx->getLineTableForUnit(Unit); + const auto Filenum = + OriginalLineTable->Rows[RowReference.RowIndex - 1].File; + NewFilenum = + BC.addDebugFilenameToUnit(OrigUnitID, + RowReference.DwCompileUnitIndex, + Filenum); + } assert(Unit && OriginalLineTable && "Invalid CU offset set in instruction debug info."); - assert(RowReference.DwCompileUnitIndex == Unit->getOffset() && - "DWARF compile unit mismatch"); const auto &OriginalRow = - OriginalLineTable->Rows[RowReference.RowIndex - 1]; + OriginalLineTable->Rows[RowReference.RowIndex - 1]; + BC.Ctx->setCurrentDwarfLoc( - OriginalRow.File, - OriginalRow.Line, - OriginalRow.Column, - (DWARF2_FLAG_IS_STMT * OriginalRow.IsStmt) | - (DWARF2_FLAG_BASIC_BLOCK * OriginalRow.BasicBlock) | - (DWARF2_FLAG_PROLOGUE_END * OriginalRow.PrologueEnd) | - (DWARF2_FLAG_EPILOGUE_BEGIN * OriginalRow.EpilogueBegin), - 
OriginalRow.Isa, - OriginalRow.Discriminator); - BC.Ctx->setDwarfCompileUnitID(Unit->getOffset()); + NewFilenum == 0 ? OriginalRow.File : NewFilenum, + OriginalRow.Line, + OriginalRow.Column, + (DWARF2_FLAG_IS_STMT * OriginalRow.IsStmt) | + (DWARF2_FLAG_BASIC_BLOCK * OriginalRow.BasicBlock) | + (DWARF2_FLAG_PROLOGUE_END * OriginalRow.PrologueEnd) | + (DWARF2_FLAG_EPILOGUE_BEGIN * OriginalRow.EpilogueBegin), + OriginalRow.Isa, + OriginalRow.Discriminator); + BC.Ctx->setDwarfCompileUnitID(OrigUnitID); LastLocSeen = Instr.getLoc(); } } From 0e2a450ec5bc9f5a13584690702ea3f336690669 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 7 Sep 2016 14:41:56 -0700 Subject: [PATCH 162/904] BOLT: Make most command line options ZeroOrMore. Summary: This will make it easier to run experiments with the same baseline BOLT binary but different command line options. (cherry picked from commit d018c6424e4456d937d78c66d17aeee5e81b8a77) --- bolt/BinaryFunction.cpp | 4 ++-- bolt/BinaryPassManager.cpp | 14 +++++++------- bolt/BinaryPasses.cpp | 15 +++++++++++++-- bolt/Exceptions.cpp | 1 + bolt/ReorderAlgorithm.cpp | 2 +- bolt/RewriteInstance.cpp | 22 +++++++++++++++------- 6 files changed, 39 insertions(+), 19 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 507f76b1ad8d..d6940ac2c916 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -43,12 +43,12 @@ extern cl::opt Verbosity; static cl::opt AgressiveSplitting("split-all-cold", cl::desc("outline as many cold basic blocks as possible"), - cl::Optional); + cl::ZeroOrMore); static cl::opt DotToolTipCode("dot-tooltip-code", cl::desc("add basic block instructions as tool tips on nodes"), - cl::Optional, + cl::ZeroOrMore, cl::Hidden); } // namespace opts diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index af690956ffd4..2245106704a9 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -16,30 +16,30 @@ namespace opts { static llvm::cl::opt 
EliminateUnreachable("eliminate-unreachable", llvm::cl::desc("eliminate unreachable code"), - llvm::cl::Optional); + llvm::cl::ZeroOrMore); static llvm::cl::opt OptimizeBodylessFunctions( "optimize-bodyless-functions", llvm::cl::desc("optimize functions that just do a tail call"), - llvm::cl::Optional); + llvm::cl::ZeroOrMore); static llvm::cl::opt InlineSmallFunctions( "inline-small-functions", llvm::cl::desc("inline functions with a single basic block"), - llvm::cl::Optional); + llvm::cl::ZeroOrMore); static llvm::cl::opt SimplifyConditionalTailCalls("simplify-conditional-tail-calls", llvm::cl::desc("simplify conditional tail calls " "by removing unnecessary jumps"), - llvm::cl::Optional); + llvm::cl::ZeroOrMore); static llvm::cl::opt Peepholes("peepholes", llvm::cl::desc("run peephole optimizations"), - llvm::cl::Optional); + llvm::cl::ZeroOrMore); static llvm::cl::opt SimplifyRODataLoads("simplify-rodata-loads", @@ -47,13 +47,13 @@ SimplifyRODataLoads("simplify-rodata-loads", "replacing the memory operand with the " "constant found in the corresponding " "section"), - llvm::cl::Optional); + llvm::cl::ZeroOrMore); static llvm::cl::opt IdenticalCodeFolding( "icf", llvm::cl::desc("fold functions with identical code"), - llvm::cl::Optional); + llvm::cl::ZeroOrMore); } // namespace opts diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index 4bdb5c9202eb..e84c1756cdd7 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -28,37 +28,44 @@ extern bool shouldProcess(const bolt::BinaryFunction &Function); static cl::opt PrintReordered("print-reordered", cl::desc("print functions after layout optimization"), + cl::ZeroOrMore, cl::Hidden); static cl::opt PrintEHRanges("print-eh-ranges", cl::desc("print function with updated exception ranges"), + cl::ZeroOrMore, cl::Hidden); static cl::opt PrintUCE("print-uce", cl::desc("print functions after unreachable code elimination"), + cl::ZeroOrMore, cl::Hidden); static cl::opt PrintPeepholes("print-peepholes", 
cl::desc("print functions after peephole optimization"), + cl::ZeroOrMore, cl::Hidden); static cl::opt PrintSimplifyROLoads("print-simplify-rodata-loads", cl::desc("print functions after simplification of RO data" " loads"), + cl::ZeroOrMore, cl::Hidden); static cl::opt PrintICF("print-icf", cl::desc("print functions after ICF optimization"), + cl::ZeroOrMore, cl::Hidden); static cl::opt PrintInline("print-inline", cl::desc("print functions after inlining optimization"), + cl::ZeroOrMore, cl::Hidden); static cl::list @@ -66,11 +73,13 @@ ForceInlineFunctions("force-inline", cl::CommaSeparated, cl::desc("list of functions to always consider " "for inlining"), - cl::value_desc("func1,func2,func3,...")); + cl::value_desc("func1,func2,func3,..."), + cl::Hidden); static cl::opt AggressiveInlining("aggressive-inlining", cl::desc("perform aggressive inlining"), + cl::ZeroOrMore, cl::Hidden); static cl::opt @@ -95,13 +104,15 @@ ReorderBlocks( "cache", "perform optimal layout prioritizing I-cache " "behavior"), - clEnumValEnd)); + clEnumValEnd), + cl::ZeroOrMore); static cl::opt MinBranchClusters( "min-branch-clusters", cl::desc("use a modified clustering algorithm geared towards " "minimizing branches"), + cl::ZeroOrMore, cl::Hidden); } // namespace opts diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index f81fd12ec361..dd824e8cbd24 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -39,6 +39,7 @@ extern llvm::cl::opt Verbosity; static llvm::cl::opt PrintExceptions("print-exceptions", llvm::cl::desc("print exception handling data"), + llvm::cl::ZeroOrMore, llvm::cl::Hidden); } // namespace opts diff --git a/bolt/ReorderAlgorithm.cpp b/bolt/ReorderAlgorithm.cpp index fb0f043e0235..abfe983d5837 100644 --- a/bolt/ReorderAlgorithm.cpp +++ b/bolt/ReorderAlgorithm.cpp @@ -27,7 +27,7 @@ using namespace bolt; namespace opts { static cl::opt -PrintClusters("print-clusters", cl::desc("print clusters"), cl::Optional); +PrintClusters("print-clusters", cl::desc("print 
clusters"), cl::ZeroOrMore); } // namespace opts diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index e2a64a6882c8..5537da56627f 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -80,7 +80,8 @@ OutputFilename("o", cl::desc(""), cl::Required); cl::opt Verbosity("v", cl::desc("set verbosity level for diagnostic output"), - cl::init(0)); + cl::init(0), + cl::ZeroOrMore); static cl::list BreakFunctionNames("break-funcs", @@ -112,7 +113,7 @@ SkipFunctionNamesFile("skip-funcs-file", static cl::opt MaxFunctions("max-funcs", cl::desc("maximum # of functions to overwrite"), - cl::Optional); + cl::ZeroOrMore); cl::opt SplitFunctions("split-functions", @@ -127,53 +128,60 @@ SplitFunctions("split-functions", clEnumValN(BinaryFunction::ST_ALL, "3", "split all functions"), clEnumValEnd), - cl::Optional); + cl::ZeroOrMore); static cl::opt UpdateDebugSections("update-debug-sections", cl::desc("update DWARF debug sections of the executable"), - cl::Optional); + cl::ZeroOrMore); static cl::opt FixDebugInfoLargeFunctions("fix-debuginfo-large-functions", cl::init(true), cl::desc("do another pass if we encounter large " "functions, to correct their debug info."), - cl::Optional, + cl::ZeroOrMore, cl::ReallyHidden); static cl::opt AlignBlocks("align-blocks", cl::desc("try to align BBs inserting nops"), - cl::Optional); + cl::ZeroOrMore); static cl::opt UseGnuStack("use-gnu-stack", - cl::desc("use GNU_STACK program header for new segment")); + cl::desc("use GNU_STACK program header for new segment"), + cl::ZeroOrMore); static cl::opt DumpEHFrame("dump-eh-frame", cl::desc("dump parsed .eh_frame (debugging)"), + cl::ZeroOrMore, cl::Hidden); cl::opt PrintAll("print-all", cl::desc("print functions after each stage"), + cl::ZeroOrMore, cl::Hidden); cl::opt DumpDotAll("dump-dot-all", cl::desc("dump function CFGs to graphviz format after each stage"), + cl::ZeroOrMore, cl::Hidden); static cl::opt PrintCFG("print-cfg", cl::desc("print functions after CFG 
construction"), + cl::ZeroOrMore, cl::Hidden); static cl::opt PrintLoopInfo("print-loops", cl::desc("print loop related information"), + cl::ZeroOrMore, cl::Hidden); static cl::opt PrintDisasm("print-disasm", cl::desc("print function after disassembly"), + cl::ZeroOrMore, cl::Hidden); static cl::opt From b382c5ad4d947282ee3ec0becf4f6229521afa07 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 29 Aug 2016 21:11:22 -0700 Subject: [PATCH 163/904] Make BinaryFunction::fixBranches() more flexible and support CFG updates. Summary: The CFG represents "the ultimate source of truth". Transformations on functions and blocks have to update the CFG and fixBranches() would make sure the correct branch instructions are inserted at the end of basic blocks (or removed when necessary). We do require a conditional branch at the end of the basic block if the block has 2 successors as CFG currently lacks the conditional code support (it will probably stay that way). We only use this branch instruction for its conditional code, the destination is determined by CFG - first successor representing true/taken branch, while the second successor - false/fall-through branch. When we reverse the branch condition, the CFG is updated accordingly. The previous version used to insert jumps after some terminating instructions sometimes resulting in a larger code than needed. As a result with the new version 1 extra function becomes overwritten for HHVM binary. With this diff we also convert conditional branches with one successor (result of code from __builtin_unreachable()) into unconditional jumps. 
(cherry picked from commit 5b31dc18b44b72c2922f1a34f2d5e45363f7a138) --- bolt/BinaryBasicBlock.cpp | 20 ++++ bolt/BinaryBasicBlock.h | 28 ++++- bolt/BinaryFunction.cpp | 205 +++++++++---------------------------- bolt/BinaryFunction.h | 32 +++--- bolt/BinaryPassManager.cpp | 11 +- bolt/BinaryPasses.cpp | 49 ++++++--- bolt/BinaryPasses.h | 8 ++ 7 files changed, 161 insertions(+), 192 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 1215980e63e7..32d4db864b2f 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -11,6 +11,7 @@ #include "BinaryBasicBlock.h" #include "BinaryContext.h" +#include "BinaryFunction.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -30,6 +31,9 @@ bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) { } BinaryBasicBlock *BinaryBasicBlock::getSuccessor(const MCSymbol *Label) const { + if (!Label && succ_size() == 1) + return *succ_begin(); + for (BinaryBasicBlock *BB : successors()) { if (BB->getLabel() == Label) return BB; @@ -94,6 +98,22 @@ bool BinaryBasicBlock::analyzeBranch(const MCInstrAnalysis &MIA, return MIA.analyzeBranch(Instructions, TBB, FBB, CondBranch, UncondBranch); } +bool BinaryBasicBlock::swapConditionalSuccessors() { + if (succ_size() != 2) + return false; + + std::swap(Successors[0], Successors[1]); + std::swap(BranchInfo[0], BranchInfo[1]); + return true; +} + +void BinaryBasicBlock::addBranchInstruction(const BinaryBasicBlock *Successor) { + auto &BC = Function->getBinaryContext(); + MCInst NewInst; + BC.MIA->createUncondBranch(NewInst, Successor->getLabel(), BC.Ctx.get()); + Instructions.emplace_back(std::move(NewInst)); +} + void BinaryBasicBlock::dump(BinaryContext& BC) const { if (Label) outs() << Label->getName() << ":\n"; BC.printInstructions(outs(), Instructions.begin(), Instructions.end(), Offset); diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 
180f51d04e87..5f4c8ed06bfa 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -288,9 +288,31 @@ class BinaryBasicBlock { return Label; } - /// Get successor with given label. Returns nullptr if no such - /// successor is found. - BinaryBasicBlock *getSuccessor(const MCSymbol *Label) const; + /// Get successor with given \p Label if \p Label != nullptr. + /// Returns nullptr if no such successor is found. + /// If the \p Label == nullptr and the block has only one successor then + /// return the successor. + BinaryBasicBlock *getSuccessor(const MCSymbol *Label = nullptr) const; + + /// If the basic block ends with a conditional branch (possibly followed by + /// an unconditional branch) and thus has 2 successors, return a successor + /// corresponding to a jump condition which could be true or false. + /// Return nullptr if the basic block does not have a conditional jump. + const BinaryBasicBlock *getConditionalSuccessor(bool Condition) const { + if (succ_size() != 2) + return nullptr; + return Successors[Condition == true ? 0 : 1]; + } + + /// If the basic block ends with a conditional branch (possibly followed by + /// an unconditional branch) and thus has 2 successors, reverse the order of + /// its successors in CFG, update branch info, and return true. If the basic + /// block does not have 2 successors return false. + bool swapConditionalSuccessors(); + + /// Add an instruction with unconditional control transfer to \p Successor + /// basic block to the end of this basic block. + void addBranchInstruction(const BinaryBasicBlock *Successor); /// Get landing pad with given label. Returns nullptr if no such /// landing pad is found. diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index d6940ac2c916..1a2548516d4a 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -932,8 +932,7 @@ bool BinaryFunction::buildCFG() { // Add associated CFI instrs.
We always add the CFI instruction that is // located immediately after this instruction, since the next CFI // instruction reflects the change in state caused by this instruction. - auto NextInstr = I; - ++NextInstr; + auto NextInstr = std::next(I); uint64_t CFIOffset; if (NextInstr != E) CFIOffset = NextInstr->first; @@ -1375,7 +1374,7 @@ void BinaryFunction::removeConditionalTailCalls() { std::tie(LP, Action) = BC.MIA->getEHInfo(CondTailCallInst); assert(!LP && "found tail call with associated landing pad"); - // Create the uncoditional tail call instruction. + // Create the unconditional tail call instruction. const MCSymbol &TailCallTargetLabel = cast( CondTailCallInst.getOperand(0).getExpr())->getSymbol(); @@ -1386,21 +1385,22 @@ void BinaryFunction::removeConditionalTailCalls() { // direction of the jump when it is taken. We want to preserve this // direction. BinaryBasicBlock *TailCallBB = nullptr; - if (getAddress() > TCInfo.TargetAddress) { + MCSymbol *TCLabel = BC.Ctx->createTempSymbol("TC", true); + if (getAddress() >= TCInfo.TargetAddress) { // Backward jump: We will reverse the condition of the tail call, change // its target to the following (currently fall-through) block, and insert - // a new block between them that will contain the uncoditional tail call. + // a new block between them that will contain the unconditional tail call. // Reverse the condition of the tail call and update its target. unsigned InsertIdx = getIndex(BB) + 1; - assert(InsertIdx < size() && "no fall-through for condtional tail call"); + assert(InsertIdx < size() && "no fall-through for conditional tail call"); BinaryBasicBlock *NextBB = getBasicBlockAtIndex(InsertIdx); + BC.MIA->reverseBranchCondition( CondTailCallInst, NextBB->getLabel(), BC.Ctx.get()); // Create a basic block containing the unconditional tail call instruction // and place it between BB and NextBB. 
- MCSymbol *TCLabel = BC.Ctx->createTempSymbol("TC", true); std::vector> TailCallBBs; TailCallBBs.emplace_back(createBasicBlock(NextBB->getOffset(), TCLabel)); TailCallBBs[0]->addInstruction(TailCallInst); @@ -1411,15 +1411,14 @@ void BinaryFunction::removeConditionalTailCalls() { BBCFIState.insert(BBCFIState.begin() + InsertIdx, TCInfo.CFIStateBefore); } else { // Forward jump: we will create a new basic block at the end of the - // function containing the uncoditional tail call and change the target of - // the conditional tail call to this basic block. + // function containing the unconditional tail call and change the target + // of the conditional tail call to this basic block. // Create a basic block containing the unconditional tail call // instruction and place it at the end of the function. const BinaryBasicBlock *LastBB = BasicBlocks.back(); uint64_t NewBlockOffset = LastBB->Offset + BC.computeCodeSize(LastBB->begin(), LastBB->end()); - MCSymbol *TCLabel = BC.Ctx->createTempSymbol("TC", true); TailCallBB = addBasicBlock(NewBlockOffset, TCLabel); TailCallBB->addInstruction(TailCallInst); @@ -1434,9 +1433,11 @@ void BinaryFunction::removeConditionalTailCalls() { BC.MIA->replaceBranchTarget(CondTailCallInst, TCLabel, BC.Ctx.get()); } - // Add the CFG edge from BB to TailCallBB and the corresponding profile - // info. + // Add CFG edge with profile info from BB to TailCallBB info and swap + // edges if the TailCallBB corresponds to the taken branch. BB->addSuccessor(TailCallBB, TCInfo.Count, TCInfo.Mispreds); + if (getAddress() < TCInfo.TargetAddress) + BB->swapConditionalSuccessors(); // Add execution count for the block. 
if (hasValidProfile()) @@ -1698,7 +1699,6 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool MinBranchClusters, if (Split) splitFunction(); - fixBranches(); } namespace { @@ -1856,168 +1856,57 @@ void BinaryFunction::dumpGraphToFile(std::string Filename) const { dumpGraph(of); } -const BinaryBasicBlock * -BinaryFunction::getOriginalLayoutSuccessor(const BinaryBasicBlock *BB) const { - // This is commented out because it makes BOLT run too slowly. - //assert(std::is_sorted(begin(), end())); - auto I = std::upper_bound(begin(), end(), *BB); - assert(I != begin() && "first basic block not at offset 0"); - - if (I == end()) - return nullptr; - return &*I; -} - void BinaryFunction::fixBranches() { auto &MIA = BC.MIA; + auto *Ctx = BC.Ctx.get(); for (unsigned I = 0, E = BasicBlocksLayout.size(); I != E; ++I) { BinaryBasicBlock *BB = BasicBlocksLayout[I]; - const MCSymbol *TBB = nullptr; const MCSymbol *FBB = nullptr; MCInst *CondBranch = nullptr; MCInst *UncondBranch = nullptr; if (!MIA->analyzeBranch(BB->Instructions, TBB, FBB, CondBranch, - UncondBranch)) { + UncondBranch)) continue; - } - // Check if the original fall-through for this block has been moved - const MCSymbol *FT = nullptr; - bool HotColdBorder = false; - if (I + 1 != BasicBlocksLayout.size()) { - FT = BasicBlocksLayout[I + 1]->getLabel(); - if (BB->IsCold != BasicBlocksLayout[I + 1]->IsCold) - HotColdBorder = true; - } - const BinaryBasicBlock *OldFTBB = getOriginalLayoutSuccessor(BB); - const MCSymbol *OldFT = OldFTBB ? OldFTBB->getLabel() : nullptr; - - // Case 1: There are no branches in this basic block and it just falls - // through - if (CondBranch == nullptr && UncondBranch == nullptr) { - // Case 1a: Last instruction, excluding pseudos, is a return, so it does - // *not* fall through to the next block. 
- if (!BB->empty()) { - auto LastInstIter = --BB->end(); - while (BC.MII->get(LastInstIter->getOpcode()).isPseudo() && - LastInstIter != BB->begin()) - --LastInstIter; - if (MIA->isReturn(*LastInstIter)) - continue; - } - // Case 1b: Layout has changed and the fallthrough is not the same (or the - // fallthrough got moved to a cold region). Need to add a new - // unconditional branch to jump to the old fallthrough. - if ((FT != OldFT || HotColdBorder) && OldFT != nullptr) { - MCInst NewInst; - if (!MIA->createUncondBranch(NewInst, OldFT, BC.Ctx.get())) - llvm_unreachable("Target does not support creating new branches"); - BB->Instructions.emplace_back(std::move(NewInst)); - } - // Case 1c: Layout hasn't changed, nothing to do. - continue; - } - - // Case 2: There is a single jump, unconditional, in this basic block - if (CondBranch == nullptr) { - // Case 2a: It jumps to the new fall-through, so we can delete it - if (TBB == FT && !HotColdBorder) { - BB->eraseInstruction(UncondBranch); - } - // Case 2b: If 2a doesn't happen, there is nothing we can do - continue; - } + // We will create unconditional branch with correct destination if needed. + if (UncondBranch) + BB->eraseInstruction(UncondBranch); - // Case 3: There is a single jump, conditional, in this basic block - if (UncondBranch == nullptr) { - // Case 3a: If the taken branch goes to the next block in the new layout, - // invert this conditional branch logic so we can make this a fallthrough. - if (TBB == FT && !HotColdBorder) { - if (OldFT == nullptr) { - errs() << "BOLT-ERROR: malformed CFG for function " << *this - << " in basic block " << BB->getName() << '\n'; - } - assert(OldFT != nullptr && "malformed CFG"); - if (!MIA->reverseBranchCondition(*CondBranch, OldFT, BC.Ctx.get())) - llvm_unreachable("Target does not support reversing branches"); + // Basic block that follows the current one in the final layout. 
+ const BinaryBasicBlock *NextBB = nullptr; + if (I + 1 != E && BB->IsCold == BasicBlocksLayout[I + 1]->IsCold) + NextBB = BasicBlocksLayout[I + 1]; + + if (BB->succ_size() == 1) { + // __builtin_unreachable() could create a conditional branch that + // falls-through into the next function - hence the block will have only + // one valid successor. Since behaviour is undefined - we replace + // the conditional branch with an unconditional if required. + if (CondBranch) + BB->eraseInstruction(CondBranch); + if (BB->getSuccessor() == NextBB) continue; + BB->addBranchInstruction(BB->getSuccessor()); + } else if (BB->succ_size() == 2) { + assert(CondBranch && "conditional branch expected"); + const auto *TSuccessor = BB->getConditionalSuccessor(true); + const auto *FSuccessor = BB->getConditionalSuccessor(false); + if (NextBB && NextBB == TSuccessor) { + std::swap(TSuccessor, FSuccessor); + MIA->reverseBranchCondition(*CondBranch, TSuccessor->getLabel(), Ctx); + BB->swapConditionalSuccessors(); + } else { + MIA->replaceBranchTarget(*CondBranch, TSuccessor->getLabel(), Ctx); } - // Case 3b: Need to add a new unconditional branch because layout - // has changed - if ((FT != OldFT || HotColdBorder) && OldFT != nullptr) { - MCInst NewInst; - if (!MIA->createUncondBranch(NewInst, OldFT, BC.Ctx.get())) - llvm_unreachable("Target does not support creating new branches"); - BB->Instructions.emplace_back(std::move(NewInst)); - continue; + if (!NextBB || (NextBB != TSuccessor && NextBB != FSuccessor)) { + BB->addBranchInstruction(FSuccessor); } - // Case 3c: Old fall-through is the same as the new one, no need to change - continue; - } - - // Case 4: There are two jumps in this basic block, one conditional followed - // by another unconditional. - // Case 4a: If the unconditional jump target is the new fall through, - // delete it. 
- if (FBB == FT && !HotColdBorder) { - BB->eraseInstruction(UncondBranch); - continue; - } - // Case 4b: If the taken branch goes to the next block in the new layout, - // invert this conditional branch logic so we can make this a fallthrough. - // Now we don't need the unconditional jump anymore, so we also delete it. - if (TBB == FT && !HotColdBorder) { - if (!MIA->reverseBranchCondition(*CondBranch, FBB, BC.Ctx.get())) - llvm_unreachable("Target does not support reversing branches"); - BB->eraseInstruction(UncondBranch); - continue; - } - // Case 4c: Nothing interesting happening. - } -} - -void BinaryFunction::fixFallthroughBranch(BinaryBasicBlock *Block) { - // No successors, must be a return or similar. - if (Block->succ_size() == 0) return; - - const MCSymbol *TBB = nullptr; - const MCSymbol *FBB = nullptr; - MCInst *CondBranch = nullptr; - MCInst *UncondBranch = nullptr; - - if (!BC.MIA->analyzeBranch(Block->Instructions, TBB, FBB, CondBranch, - UncondBranch)) { - assert(0); - return; - } - - if (!UncondBranch) { - const BinaryBasicBlock* FallThroughBB = nullptr; - if (CondBranch) { - assert(TBB); - // Find the first successor that is not a target of the conditional - // branch. - for (auto *Succ : Block->successors()) { - if (Succ->getLabel() != TBB) { - FallThroughBB = Succ; - break; - } - } - } else { - // pick first successor as fallthrough. - FallThroughBB = *Block->succ_begin(); - } - - assert(FallThroughBB); - - const auto FallThroughLabel = FallThroughBB->getLabel(); - MCInst NewInst; - if (!BC.MIA->createUncondBranch(NewInst, FallThroughLabel, BC.Ctx.get())) { - llvm_unreachable("Target does not support creating new branches"); } - Block->addInstruction(NewInst); + // Cases where the number of successors is 0 (block ends with a + // terminator) or more than 2 (switch table) don't require branch + // instruction adjustments. 
} } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index ba1e8290a5ab..b960102ebc82 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -245,11 +245,6 @@ class BinaryFunction : public AddressRangesOwner { void recomputeLandingPads(const unsigned StartIndex, const unsigned NumBlocks); - /// Return basic block that originally was laid out immediately following - /// the given /p BB basic block. - const BinaryBasicBlock * - getOriginalLayoutSuccessor(const BinaryBasicBlock *BB) const; - using BranchListType = std::vector>; BranchListType TakenBranches; /// All local taken branches. BranchListType FTBranches; /// All fall-through branches. @@ -491,6 +486,11 @@ class BinaryFunction : public AddressRangesOwner { /// CFG after an optimization pass. void dumpGraphForPass(std::string Annotation = "") const; + /// Return BinaryContext for the function. + const BinaryContext &getBinaryContext() const { + return BC; + } + /// Get basic block index assuming it belongs to this function. unsigned getIndex(const BinaryBasicBlock *BB) const { assert(BB->Index < BasicBlocks.size()); @@ -1019,15 +1019,23 @@ class BinaryFunction : public AddressRangesOwner { /// (call instructions with non-empty landing pad). void propagateGnuArgsSizeInfo(); - /// Traverse the CFG checking branches, inverting their condition, removing or - /// adding jumps based on a new layout order. + /// Adjust branch instructions to match the CFG. + /// + /// As it comes to internal branches, the CFG represents "the ultimate source + /// of truth". Transformations on functions and blocks have to update the CFG + /// and fixBranches() would make sure the correct branch instructions are + /// inserted at the end of basic blocks. + /// + /// We do require a conditional branch at the end of the basic block if + /// the block has 2 successors as CFG currently lacks the conditional + /// code support (it will probably stay that way). 
We only use this + /// branch instruction for its conditional code, the destination is + /// determined by CFG - first successor representing true/taken branch, + /// while the second successor - false/fall-through branch. + /// + /// When we reverse the branch condition, the CFG is updated accordingly. void fixBranches(); - /// If needed, add an unconditional jmp to the original fallthrough of - /// Block. This is used by the indirect call promotion optimization - /// since it inserts new BBs after the merge block. - void fixFallthroughBranch(BinaryBasicBlock *Block); - /// Split function in two: a part with warm or hot BBs and a part with never /// executed BBs. The cold part is moved to a new BinaryFunction. void splitFunction(); diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 2245106704a9..4f507a396511 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -88,11 +88,19 @@ void BinaryFunctionPassManager::runAllPasses( std::move(llvm::make_unique(Manager.NagUser)), opts::EliminateUnreachable); + Manager.registerPass(llvm::make_unique(), + opts::OptimizeBodylessFunctions); + Manager.registerPass(llvm::make_unique(), opts::SimplifyRODataLoads); Manager.registerPass(std::move(llvm::make_unique())); + // This pass syncs local branches with CFG. If any of the following + // passes breaks the sync - they need to re-run the pass. + Manager.registerPass(std::move(llvm::make_unique())); + + // This pass should be run after FixupBranches. 
Manager.registerPass(llvm::make_unique(), opts::SimplifyConditionalTailCalls); @@ -102,9 +110,6 @@ void BinaryFunctionPassManager::runAllPasses( std::move(llvm::make_unique(Manager.NagUser)), opts::EliminateUnreachable); - Manager.registerPass(llvm::make_unique(), - opts::OptimizeBodylessFunctions); - Manager.registerPass(std::move(llvm::make_unique())); Manager.registerPass(llvm::make_unique(), opts::Peepholes); diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index e84c1756cdd7..f4721cbb14c5 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -31,11 +31,15 @@ PrintReordered("print-reordered", cl::ZeroOrMore, cl::Hidden); -static cl::opt -PrintEHRanges("print-eh-ranges", - cl::desc("print function with updated exception ranges"), - cl::ZeroOrMore, - cl::Hidden); +cl::opt +PrintAfterBranchFixup("print-after-branch-fixup", + cl::desc("print function after fixing local branches"), + cl::Hidden); + +cl::opt +PrintAfterFixup("print-after-fixup", + cl::desc("print function after fixup"), + cl::Hidden); static cl::opt PrintUCE("print-uce", @@ -463,10 +467,7 @@ InlineSmallFunctions::inlineCall( } else { InlinedInstanceBB->addSuccessor(InlinedInstance.back().get()); } - MCInst ExitBranchInst; - const MCSymbol *ExitLabel = InlinedInstance.back().get()->getLabel(); - BC.MIA->createUncondBranch(ExitBranchInst, ExitLabel, BC.Ctx.get()); - InlinedInstanceBB->addInstruction(std::move(ExitBranchInst)); + InlinedInstanceBB->addBranchInstruction(InlinedInstance.back().get()); } else if (InlinedInstanceBBIndex > 0 || !CanMergeFirstInlinedBlock) { assert(CallInstIndex == CallerBB->size() - 1); assert(CallerBB->succ_size() <= 1); @@ -478,10 +479,7 @@ InlineSmallFunctions::inlineCall( } else { InlinedInstanceBB->addSuccessor(*CallerBB->succ_begin()); } - MCInst ExitBranchInst; - const MCSymbol *ExitLabel = (*CallerBB->succ_begin())->getLabel(); - BC.MIA->createUncondBranch(ExitBranchInst, ExitLabel, BC.Ctx.get()); - 
InlinedInstanceBB->addInstruction(std::move(ExitBranchInst)); + InlinedInstanceBB->addBranchInstruction(*CallerBB->succ_begin()); } } } @@ -882,6 +880,25 @@ void ReorderBasicBlocks::runOnFunctions( } } +void FixupBranches::runOnFunctions( + BinaryContext &BC, + std::map &BFs, + std::set &) { + for (auto &It : BFs) { + auto &Function = It.second; + + if (!Function.isSimple() || !opts::shouldProcess(Function)) + continue; + + Function.fixBranches(); + + if (opts::PrintAll || opts::PrintAfterBranchFixup) + Function.print(errs(), "after branch fixup", true); + if (opts::DumpDotAll) + Function.dumpGraphForPass("after-branch-fixup"); + } +} + void FixupFunctions::runOnFunctions( BinaryContext &BC, std::map &BFs, @@ -908,10 +925,10 @@ void FixupFunctions::runOnFunctions( // Update exception handling information. Function.updateEHRanges(); - if (opts::PrintAll || opts::PrintEHRanges) - Function.print(outs(), "after updating EH ranges", true); + if (opts::PrintAll || opts::PrintAfterFixup) + Function.print(errs(), "after fixup", true); if (opts::DumpDotAll) - Function.dumpGraphForPass("update-EH-ranges"); + Function.dumpGraphForPass("after-fixup"); } } diff --git a/bolt/BinaryPasses.h b/bolt/BinaryPasses.h index 471be11cf3c8..51c37665bedc 100644 --- a/bolt/BinaryPasses.h +++ b/bolt/BinaryPasses.h @@ -141,6 +141,14 @@ class ReorderBasicBlocks : public BinaryFunctionPass { std::set &LargeFunctions) override; }; +/// Sync local branches with CFG. +class FixupBranches : public BinaryFunctionPass { + public: + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + /// Fix the CFI state and exception handling information after all other /// passes have completed. class FixupFunctions : public BinaryFunctionPass { From 559d28dbc510e5c5f6af706e96e4eaef8f7f8c95 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 29 Aug 2016 21:11:22 -0700 Subject: [PATCH 164/904] Add dyno stats to BOLT. 
Summary: Add "-dyno-stats" option that prints instruction stats based on the execution profile similar to below: BOLT-INFO: program-wide dynostats after optimizations: executed forward branches : 109706407 (+8.1%) taken forward branches : 13769074 (-55.5%) executed backward branches : 24517582 (-25.0%) taken backward branches : 15330256 (-27.2%) executed unconditional branches : 6009826 (-35.5%) function calls : 17192114 (+0.0%) executed instructions : 837733057 (-0.4%) total branches : 140233815 (-2.3%) taken branches : 35109156 (-42.8%) Also fixed pseudo instruction discrepancies and added assertions for BinaryBasicBlock::getNumPseudos() to make sure the number is synchronized with real number of pseudo instructions. (cherry picked from commit dca96e20378563d57c40d1e1819a0af70e431dd2) --- bolt/BinaryBasicBlock.cpp | 19 ++++++ bolt/BinaryBasicBlock.h | 28 ++++++-- bolt/BinaryFunction.cpp | 135 +++++++++++++++++++++++++++++++++++++- bolt/BinaryFunction.h | 93 +++++++++++++++++++++++++- bolt/BinaryPasses.cpp | 4 +- bolt/Exceptions.cpp | 2 +- bolt/RewriteInstance.cpp | 20 +++++- bolt/RewriteInstance.h | 15 ++++- 8 files changed, 303 insertions(+), 13 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 32d4db864b2f..921e69d6ddee 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -114,6 +114,25 @@ void BinaryBasicBlock::addBranchInstruction(const BinaryBasicBlock *Successor) { Instructions.emplace_back(std::move(NewInst)); } +uint32_t BinaryBasicBlock::getNumPseudos() const { +#ifndef NDEBUG + auto &BC = Function->getBinaryContext(); + uint32_t N = 0; + for (auto &Instr : Instructions) { + if (BC.MII->get(Instr.getOpcode()).isPseudo()) + ++N; + } + if (N != NumPseudos) { + errs() << "BOLT-ERROR: instructions for basic block " << getName() + << " in function " << *Function << ": calculated pseudos " + << N << ", set pseudos " << NumPseudos << ", size " << size() + << '\n'; + llvm_unreachable("pseudos mismatch"); + 
} +#endif + return NumPseudos; +} + void BinaryBasicBlock::dump(BinaryContext& BC) const { if (Label) outs() << Label->getName() << ":\n"; BC.printInstructions(outs(), Instructions.begin(), Instructions.end(), Offset); diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 5f4c8ed06bfa..df3b228dfa01 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -296,7 +296,7 @@ class BinaryBasicBlock { /// If the basic block ends with a conditional branch (possibly followed by /// an unconditional branch) and thus has 2 successors, return a successor - /// corresponding to a jump conditon which could be true or false. + /// corresponding to a jump condition which could be true or false. /// Return nullptr if the basic block does not have a conditional jump. const BinaryBasicBlock *getConditionalSuccessor(bool Condition) const { if (succ_size() != 2) @@ -304,8 +304,14 @@ class BinaryBasicBlock { return Successors[Condition == true ? 0 : 1]; } + const BinaryBranchInfo &getBranchInfo(bool Condition) const { + assert(BranchInfo.size() == 2 && + "could only be called for blocks with 2 successors"); + return BranchInfo[Condition == true ? 0 : 1]; + }; + /// If the basic block ends with a conditional branch (possibly followed by - /// an unconditonal branch) and thus has 2 successor, revese the order of + /// an unconditional branch) and thus has 2 successor, reverse the order of /// its successors in CFG, update branch info, and return true. If the basic /// block does not have 2 successors return false. bool swapConditionalSuccessors(); @@ -346,12 +352,19 @@ class BinaryBasicBlock { } /// Add instruction before Pos in this basic block. - const_iterator insertPseudoInstr(const_iterator Pos, MCInst &Instr) { + template + Itr insertPseudoInstr(Itr Pos, MCInst &Instr) { ++NumPseudos; return Instructions.emplace(Pos, Instr); } - uint32_t getNumPseudos() const { return NumPseudos; } + /// Return the number of pseudo instructions in the basic block. 
+ uint32_t getNumPseudos() const; + + /// Return the number of emitted instructions for this basic block. + uint32_t getNumNonPseudos() const { + return size() - getNumPseudos(); + } /// Set minimum alignment for the basic block. void setAlignment(uint64_t Align) { @@ -433,6 +446,13 @@ class BinaryBasicBlock { return CanOutline; } + /// Erase pseudo instruction at a given iterator. + iterator erasePseudoInstruction(iterator II) { + --NumPseudos; + return Instructions.erase(II); + } + + /// Erase given (non-pseudo) instruction if found. bool eraseInstruction(MCInst *Inst) { return replaceInstruction(Inst, std::vector()); } diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 1a2548516d4a..62e9db26ce87 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -39,6 +39,7 @@ using namespace llvm; namespace opts { extern cl::opt Verbosity; +extern cl::opt PrintDynoStats; static cl::opt AgressiveSplitting("split-all-cold", @@ -51,6 +52,12 @@ DotToolTipCode("dot-tooltip-code", cl::ZeroOrMore, cl::Hidden); +static cl::opt +DynoStatsScale("dyno-stats-scale", + cl::desc("scale to be applied while reporting dyno stats"), + cl::Optional, + cl::init(1)); + } // namespace opts namespace llvm { @@ -62,6 +69,8 @@ namespace bolt { // using it. constexpr unsigned NoRegister = 0; +constexpr const char *DynoStats::Desc[]; + namespace { /// Gets debug line information for the instruction located at the given @@ -199,6 +208,11 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, if (IdenticalFunctionAddress != Address) OS << "\n Id Fun Addr : 0x" << Twine::utohexstr(IdenticalFunctionAddress); + if (opts::PrintDynoStats && !BasicBlocksLayout.empty()) { + DynoStats dynoStats = getDynoStats(); + OS << dynoStats; + } + OS << "\n}\n"; if (!PrintInstructions || !BC.InstPrinter) @@ -2006,7 +2020,7 @@ void BinaryFunction::propagateGnuArgsSizeInfo() { // Delete DW_CFA_GNU_args_size instructions and only regenerate // during the final code emission. 
The information is embedded // inside call instructions. - II = BB->Instructions.erase(II); + II = BB->erasePseudoInstruction(II); } else { ++II; } @@ -2575,5 +2589,124 @@ void BinaryFunction::printLoopInfo(raw_ostream &OS) const { OS << "Maximum nested loop depth: " << BLI->MaximumDepth << "\n\n"; } +DynoStats BinaryFunction::getDynoStats() const { + DynoStats Stats; + + // Return empty-stats about the function we don't completely understand. + if (!isSimple()) + return Stats; + + // Basic block indices in the new layout for quick branch direction lookup. + std::unordered_map + BBToIndexMap(layout_size()); + unsigned Index = 0; + for (const auto &BB : layout()) { + BBToIndexMap[BB] = ++Index; + } + auto isForwardBranch = [&](const BinaryBasicBlock *From, + const BinaryBasicBlock *To) { + return BBToIndexMap[To] > BBToIndexMap[From]; + }; + + for (const auto &BB : layout()) { + // The basic block execution count equals to the sum of incoming branch + // frequencies. This may deviate from the sum of outgoing branches of the + // basic block especially since the block may contain a function that + // does not return or a function that throws an exception. + uint64_t BBExecutionCount = 0; + for (const auto &BI : BB->BranchInfo) + if (BI.Count != BinaryBasicBlock::COUNT_NO_PROFILE) + BBExecutionCount += BI.Count; + + // Ignore blocks that were not executed. + if (BBExecutionCount == 0) + continue; + + // Count the number of calls by iterating through all instructions. + for (const auto &Instr : *BB) { + if (BC.MIA->isCall(Instr)) { + Stats[DynoStats::FUNCTION_CALLS] += BBExecutionCount; + if (BC.MIA->getMemoryOperandNo(Instr) != -1) { + Stats[DynoStats::INDIRECT_CALLS] += BBExecutionCount; + } + } + } + + Stats[DynoStats::INSTRUCTIONS] += BB->getNumNonPseudos() * BBExecutionCount; + + // Update stats for branches. 
+ const MCSymbol *TBB = nullptr; + const MCSymbol *FBB = nullptr; + MCInst *CondBranch = nullptr; + MCInst *UncondBranch = nullptr; + if (!BC.MIA->analyzeBranch(BB->Instructions, TBB, FBB, CondBranch, + UncondBranch)) { + continue; + } + + if (!CondBranch && !UncondBranch) { + continue; + } + + // Simple unconditional branch. + if (!CondBranch) { + Stats[DynoStats::UNCOND_BRANCHES] += BBExecutionCount; + continue; + } + + // Conditional branch that could be followed by an unconditional branch. + uint64_t TakenCount = BB->getBranchInfo(true).Count; + if (TakenCount == COUNT_NO_PROFILE) + TakenCount = 0; + uint64_t NonTakenCount = BB->getBranchInfo(false).Count; + if (NonTakenCount == COUNT_NO_PROFILE) + NonTakenCount = 0; + + assert(TakenCount + NonTakenCount == BBExecutionCount && + "internal calculation error"); + + if (isForwardBranch(BB, BB->getConditionalSuccessor(true))) { + Stats[DynoStats::FORWARD_COND_BRANCHES] += BBExecutionCount; + Stats[DynoStats::FORWARD_COND_BRANCHES_TAKEN] += TakenCount; + } else { + Stats[DynoStats::BACKWARD_COND_BRANCHES] += BBExecutionCount; + Stats[DynoStats::BACKWARD_COND_BRANCHES_TAKEN] += TakenCount; + } + + if (UncondBranch) { + Stats[DynoStats::UNCOND_BRANCHES] += NonTakenCount; + } + } + + return Stats; +} + +void DynoStats::print(raw_ostream &OS, const DynoStats *Other) const { + auto printStatWithDelta = [&](const std::string &Name, uint64_t Stat, + uint64_t OtherStat) { + OS << format("%'20lld : ", Stat * opts::DynoStatsScale) << Name; + if (Other) { + OS << format(" (%+.1f%%)", + ( (float) Stat - (float) OtherStat ) * 100.0 / + (float) (OtherStat + 1) ); + } + OS << '\n'; + }; + + for (auto Stat = DynoStats::FIRST_DYNO_STAT + 1; + Stat < DynoStats::LAST_DYNO_STAT; + ++Stat) { + printStatWithDelta(Desc[Stat], Stats[Stat], Other ? 
(*Other)[Stat] : 0); + } +} + +void DynoStats::operator+=(const DynoStats &Other) { + for (auto Stat = DynoStats::FIRST_DYNO_STAT + 1; + Stat < DynoStats::LAST_DYNO_STAT; + ++Stat) { + Stats[Stat] += Other[Stat]; + } +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index b960102ebc82..eb4c4c10fae9 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -52,6 +52,88 @@ namespace bolt { using DWARFUnitLineTable = std::pair; +/// Class encapsulating runtime statistics about an execution unit. +class DynoStats { + +#define DYNO_STATS\ + D(FIRST_DYNO_STAT, "", Fn)\ + D(FORWARD_COND_BRANCHES, "executed forward branches", Fn)\ + D(FORWARD_COND_BRANCHES_TAKEN, "taken forward branches", Fn)\ + D(BACKWARD_COND_BRANCHES, "executed backward branches", Fn)\ + D(BACKWARD_COND_BRANCHES_TAKEN, "taken backward branches", Fn)\ + D(UNCOND_BRANCHES, "executed unconditional branches", Fn)\ + D(FUNCTION_CALLS, "all function calls", Fn)\ + D(INDIRECT_CALLS, "indirect calls", Fn)\ + D(INSTRUCTIONS, "executed instructions", Fn)\ + D(ALL_BRANCHES, "total branches",\ + Fadd(ALL_CONDITIONAL, UNCOND_BRANCHES))\ + D(ALL_TAKEN, "taken branches",\ + Fadd(TAKEN_CONDITIONAL, UNCOND_BRANCHES))\ + D(NONTAKEN_CONDITIONAL, "non-taken conditional branches",\ + Fsub(ALL_CONDITIONAL, TAKEN_CONDITIONAL))\ + D(TAKEN_CONDITIONAL, "taken conditional branches",\ + Fadd(FORWARD_COND_BRANCHES_TAKEN, BACKWARD_COND_BRANCHES_TAKEN))\ + D(ALL_CONDITIONAL, "all conditional branches",\ + Fadd(FORWARD_COND_BRANCHES, BACKWARD_COND_BRANCHES))\ + D(LAST_DYNO_STAT, "", Fn) + +public: +#define D(name, ...) name, + enum : uint8_t { DYNO_STATS }; +#undef D + + +private: + uint64_t Stats[LAST_DYNO_STAT]; + +#define D(name, desc, ...) 
desc, + static constexpr const char *Desc[] = { DYNO_STATS }; +#undef D + +public: + DynoStats() { + for (auto Stat = FIRST_DYNO_STAT + 0; Stat < LAST_DYNO_STAT; ++Stat) + Stats[Stat] = 0; + } + + uint64_t &operator[](size_t I) { + assert(I > FIRST_DYNO_STAT && I < LAST_DYNO_STAT && + "index out of bounds"); + return Stats[I]; + } + + uint64_t operator[](size_t I) const { + switch (I) { +#define D(name, desc, func) \ + case name: \ + return func; +#define Fn Stats[I] +#define Fadd(a, b) operator[](a) + operator[](b) +#define Fsub(a, b) operator[](a) - operator[](b) +#define F(a) operator[](a) +#define Radd(a, b) (a + b) +#define Rsub(a, b) (a - b) + DYNO_STATS +#undef Fn +#undef D + default: + llvm_unreachable("index out of bounds"); + } + return 0; + } + + void print(raw_ostream &OS, const DynoStats *Other = nullptr) const; + + void operator+=(const DynoStats &Other); +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const DynoStats &Stats) { + Stats.print(OS, nullptr); + return OS; +} + +DynoStats operator+(const DynoStats &A, const DynoStats &B); + /// BinaryFunction is a representation of machine-level function. // /// We use the term "Binary" as "Machine" was already taken. @@ -460,7 +542,7 @@ class BinaryFunction : public AddressRangesOwner { /// end of basic blocks. void modifyLayout(LayoutType Type, bool MinBranchClusters, bool Split); - /// Find the loops in the CFG of the function and store infromation about + /// Find the loops in the CFG of the function and store information about /// them. void calculateLoopInfo(); @@ -469,7 +551,7 @@ class BinaryFunction : public AddressRangesOwner { return BLI != nullptr; } - /// Print loop inforamtion about the function. + /// Print loop information about the function. void printLoopInfo(raw_ostream &OS) const; /// View CFG in graphviz program @@ -491,6 +573,13 @@ class BinaryFunction : public AddressRangesOwner { return BC; } + /// Return dynostats for the function. 
+ /// + /// The function relies on branch instructions being in-sync with CFG for + /// branch instructions stats. Thus it is better to call it after + /// fixBranches(). + DynoStats getDynoStats() const; + /// Get basic block index assuming it belongs to this function. unsigned getIndex(const BinaryBasicBlock *BB) const { assert(BB->Index < BasicBlocks.size()); diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index f4721cbb14c5..27a12ae42b3e 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -31,12 +31,12 @@ PrintReordered("print-reordered", cl::ZeroOrMore, cl::Hidden); -cl::opt +static cl::opt PrintAfterBranchFixup("print-after-branch-fixup", cl::desc("print function after fixing local branches"), cl::Hidden); -cl::opt +static cl::opt PrintAfterFixup("print-after-fixup", cl::desc("print function after fixup"), cl::Hidden); diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index dd824e8cbd24..b20ee8264c20 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -345,7 +345,7 @@ void BinaryFunction::updateEHRanges() { EHSymbol = BC.Ctx->createTempSymbol("EH", true); MCInst EHLabel; BC.MIA->createEHLabel(EHLabel, EHSymbol, BC.Ctx.get()); - II = BB->Instructions.insert(II, EHLabel); + II = BB->insertPseudoInstr(II, EHLabel); ++II; } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 5537da56627f..f5cb2fcc8b4b 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -90,6 +90,10 @@ BreakFunctionNames("break-funcs", cl::value_desc("func1,func2,func3,..."), cl::Hidden); +cl::opt +PrintDynoStats("dyno-stats", + cl::desc("print execution info based on profile")); + static cl::list FunctionNames("funcs", cl::CommaSeparated, @@ -1073,9 +1077,21 @@ void RewriteInstance::disassembleFunctions() { } void RewriteInstance::runOptimizationPasses() { - // Run optimization passes. 
- // + DynoStats dynoStatsBefore; + if (opts::PrintDynoStats) { + dynoStatsBefore = getDynoStats(); + outs() << "BOLT-INFO: program-wide dynostats before running " + "optimizations:\n\n" << dynoStatsBefore << '\n'; + } + BinaryFunctionPassManager::runAllPasses(*BC, BinaryFunctions, LargeFunctions); + + if (opts::PrintDynoStats) { + auto dynoStatsAfter = getDynoStats(); + outs() << "BOLT-INFO: program-wide dynostats after optimizaions:\n\n"; + dynoStatsAfter.print(outs(), &dynoStatsBefore); + outs() << '\n'; + } } namespace { diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 444f98449dfd..415a56d44f69 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -14,6 +14,7 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_REWRITE_INSTANCE_H #define LLVM_TOOLS_LLVM_BOLT_REWRITE_INSTANCE_H +#include "BinaryFunction.h" #include "DebugData.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" @@ -31,7 +32,6 @@ class tool_output_file; namespace bolt { class BinaryContext; -class BinaryFunction; class CFIReaderWriter; class DataReader; @@ -342,6 +342,19 @@ class RewriteInstance { uint64_t Address, uint64_t Size, bool IsSimple); + + /// Return program-wide dynostats. + DynoStats getDynoStats() const { + DynoStats dynoStats; + for (auto &BFI : BinaryFunctions) { + auto &BF = BFI.second; + if (BF.isSimple()) { + dynoStats += BF.getDynoStats(); + } + } + return dynoStats; + } + }; } // namespace bolt From 4193ccafb3952a0d15586e50b1f36d5eda092f1f Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 8 Sep 2016 14:52:26 -0700 Subject: [PATCH 165/904] Rewrite SCTC pass to do UCE and make it the last optimization pass. Summary: For now we make SCTC a special pass that runs at the end of all optimizations and transformations right after fixupBranches(). Since it's the last pass, it has to do its own UCE. 
(cherry picked from commit b50b21d4351cbae05abc3ce69bcb5b4ff45e34c4) --- bolt/BinaryBasicBlock.cpp | 9 ++ bolt/BinaryBasicBlock.h | 18 ++++ bolt/BinaryContext.cpp | 5 +- bolt/BinaryFunction.cpp | 23 ++--- bolt/BinaryFunction.h | 18 ++++ bolt/BinaryPassManager.cpp | 23 +++-- bolt/BinaryPasses.cpp | 193 +++++++++++++++++-------------------- bolt/RewriteInstance.cpp | 13 ++- 8 files changed, 160 insertions(+), 142 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 921e69d6ddee..2ac456551773 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -30,6 +30,15 @@ bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) { return LHS.Offset < RHS.Offset; } +MCInst *BinaryBasicBlock::findFirstNonPseudoInstruction() { + auto &BC = Function->getBinaryContext(); + for (auto &Inst : Instructions) { + if (!BC.MII->get(Inst.getOpcode()).isPseudo()) + return &Inst; + } + return nullptr; +} + BinaryBasicBlock *BinaryBasicBlock::getSuccessor(const MCSymbol *Label) const { if (!Label && succ_size() == 1) return *succ_begin(); diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index df3b228dfa01..78c8313cb84a 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -62,6 +62,9 @@ class BinaryBasicBlock { /// Index to BasicBlocks vector in BinaryFunction. unsigned Index{~0u}; + /// Index in the current layout. + unsigned LayoutIndex{~0u}; + /// Number of pseudo instructions in this block. uint32_t NumPseudos{0}; @@ -207,6 +210,7 @@ class BinaryBasicBlock { return (unsigned)Throwers.size(); } bool throw_empty() const { return Throwers.empty(); } + bool isLandingPad() const { return !Throwers.empty(); } lp_iterator lp_begin() { return LandingPads.begin(); } const_lp_iterator lp_begin() const { return LandingPads.begin(); } @@ -329,6 +333,8 @@ class BinaryBasicBlock { return Label->getName(); } + MCInst *findFirstNonPseudoInstruction(); + /// Add instruction at the end of this basic block. 
/// Returns the index of the instruction in the Instructions vector of the BB. uint32_t addInstruction(MCInst &&Inst) { @@ -381,6 +387,13 @@ class BinaryBasicBlock { return Offset; } + /// Return index in the current layout. The user is responsible for + /// making sure the indices are up to date, + /// e.g. by calling BinaryFunction::updateLayoutIndices(); + unsigned getLayoutIndex() const { + return LayoutIndex; + } + /// Adds block to successor list, and also updates predecessor list for /// successor block. /// Set branch info for this path. @@ -549,6 +562,11 @@ class BinaryBasicBlock { void setOffset(uint64_t NewOffset) { Offset = NewOffset; } + + /// Set layout index. To be used by BinaryFunction. + void setLayoutIndex(unsigned Index) { + LayoutIndex = Index; + } }; bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS); diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 5b9fe8e1caa6..60e5424209ec 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -282,10 +282,7 @@ void BinaryContext::printInstruction(raw_ostream &OS, const BinaryFunction* Function, bool printMCInst) const { if (MIA->isEHLabel(Instruction)) { - OS << " EH_LABEL: " - << cast(Instruction.getOperand(0).getExpr())-> - getSymbol() - << '\n'; + OS << " EH_LABEL: " << *MIA->getTargetSymbol(Instruction) << '\n'; return; } OS << format(" %08" PRIx64 ": ", Offset); diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 62e9db26ce87..7766867b3d2d 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -147,6 +147,8 @@ unsigned BinaryFunction::eraseDeadBBs( std::map &ToPreserve) { BasicBlockOrderType NewLayout; unsigned Count = 0; + assert(ToPreserve[BasicBlocksLayout.front()] == true && + "unable to remove an entry basic block"); for (auto I = BasicBlocksLayout.begin(), E = BasicBlocksLayout.end(); I != E; ++I) { if (ToPreserve[*I]) @@ -1389,11 +1391,10 @@ void BinaryFunction::removeConditionalTailCalls() { assert(!LP && 
"found tail call with associated landing pad"); // Create the unconditional tail call instruction. - const MCSymbol &TailCallTargetLabel = - cast( - CondTailCallInst.getOperand(0).getExpr())->getSymbol(); + const auto *TailCallTargetLabel = BC.MIA->getTargetSymbol(CondTailCallInst); + assert(TailCallTargetLabel && "symbol expected for direct tail call"); MCInst TailCallInst; - BC.MIA->createTailCall(TailCallInst, &TailCallTargetLabel, BC.Ctx.get()); + BC.MIA->createTailCall(TailCallInst, TailCallTargetLabel, BC.Ctx.get()); // The way we will remove this conditional tail call depends on the // direction of the jump when it is taken. We want to preserve this @@ -2596,17 +2597,9 @@ DynoStats BinaryFunction::getDynoStats() const { if (!isSimple()) return Stats; - // Basic block indices in the new layout for quick branch direction lookup. - std::unordered_map - BBToIndexMap(layout_size()); - unsigned Index = 0; - for (const auto &BB : layout()) { - BBToIndexMap[BB] = ++Index; - } - auto isForwardBranch = [&](const BinaryBasicBlock *From, - const BinaryBasicBlock *To) { - return BBToIndexMap[To] > BBToIndexMap[From]; - }; + // Update enumeration of basic blocks for correct detection of branch' + // direction. + updateLayoutIndices(); for (const auto &BB : layout()) { // The basic block execution count equals to the sum of incoming branch diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index eb4c4c10fae9..26b6b22dc70f 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -830,6 +830,14 @@ class BinaryFunction : public AddressRangesOwner { /// computed from scratch using modifyLayout. void updateLayout(LayoutType Type, bool MinBranchClusters, bool Split); + /// Make sure basic blocks' indices match the current layout. + void updateLayoutIndices() const { + unsigned Index = 0; + for (auto *BB : layout()) { + BB->setLayoutIndex(Index++); + } + } + /// Dump function information to debug output. 
If \p PrintInstructions /// is true - include instruction disassembly. void dump(std::string Annotation = "", bool PrintInstructions = true) const; @@ -1218,6 +1226,16 @@ class BinaryFunction : public AddressRangesOwner { const FragmentInfo &cold() const { return ColdFragment; } }; +/// Determine direction of the branch based on the current layout. +/// Callee is responsible of updating basic block indices prior to using +/// this function (e.g. by calling BinaryFunction::updateLayoutIndices()). +inline bool isForwardBranch(const BinaryBasicBlock *From, + const BinaryBasicBlock *To) { + assert(From->getFunction() == To->getFunction() && + "basic blocks should be in the same function"); + return To->getLayoutIndex() > From->getLayoutIndex(); +} + inline raw_ostream &operator<<(raw_ostream &OS, const BinaryFunction &Function) { OS << Function.getPrintName(); diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 4f507a396511..23cac76730f4 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -94,26 +94,29 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(llvm::make_unique(), opts::SimplifyRODataLoads); + Manager.registerPass( + std::move(llvm::make_unique(Manager.NagUser)), + opts::EliminateUnreachable); + Manager.registerPass(std::move(llvm::make_unique())); + Manager.registerPass(llvm::make_unique(), opts::Peepholes); + // This pass syncs local branches with CFG. If any of the following - // passes breakes the sync - they need to re-run the pass. + // passes breaks the sync - they either need to re-run the pass or + // fix branches consistency internally. Manager.registerPass(std::move(llvm::make_unique())); - // This pass should be run after FixupBranches. + // This pass introduces conditional jumps into external functions. + // Between extending CFG to support this and isolating this pass we chose + // the latter. 
Thus this pass will do unreachable code elimination + // if necessary and wouldn't rely on UCE for this. + // More generally this pass should be the last optimization pass. Manager.registerPass(llvm::make_unique(), opts::SimplifyConditionalTailCalls); - // The tail call fixup pass may introduce unreachable code. Add another - // instance of EliminateUnreachableBlocks here to catch it. - Manager.registerPass( - std::move(llvm::make_unique(Manager.NagUser)), - opts::EliminateUnreachable); - Manager.registerPass(std::move(llvm::make_unique())); - Manager.registerPass(llvm::make_unique(), opts::Peepholes); - Manager.runPasses(); } diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index 27a12ae42b3e..070a0c3668fb 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -128,20 +128,18 @@ void OptimizeBodylessFunctions::analyze( BinaryFunction &BF, BinaryContext &BC, std::map &BFs) { - if (BF.size() != 1 || (*BF.begin()).size() == 0) + if (BF.size() != 1 || BF.front().getNumNonPseudos() != 1) return; - auto &BB = *BF.begin(); - const auto &FirstInst = *BB.begin(); - if (!BC.MIA->isTailCall(FirstInst)) + const auto *FirstInstr = BF.front().findFirstNonPseudoInstruction(); + if (!FirstInstr) return; - auto &Op1 = FirstInst.getOperand(0); - if (!Op1.isExpr()) + if (!BC.MIA->isTailCall(*FirstInstr)) return; - auto Expr = dyn_cast(Op1.getExpr()); - if (!Expr) + const auto *TargetSymbol = BC.MIA->getTargetSymbol(*FirstInstr); + if (!TargetSymbol) return; - const auto *Function = BC.getFunctionForSymbol(&Expr->getSymbol()); + const auto *Function = BC.getFunctionForSymbol(TargetSymbol); if (!Function) return; @@ -156,14 +154,10 @@ void OptimizeBodylessFunctions::optimizeCalls(BinaryFunction &BF, auto &Inst = *InstIt; if (!BC.MIA->isCall(Inst)) continue; - auto &Op1 = Inst.getOperand(0); - if (!Op1.isExpr()) + const auto *OriginalTarget = BC.MIA->getTargetSymbol(Inst); + if (!OriginalTarget) continue; - auto Expr = dyn_cast(Op1.getExpr()); - if (!Expr) - 
continue; - auto *OriginalTarget = &Expr->getSymbol(); - auto *Target = OriginalTarget; + const auto *Target = OriginalTarget; // Iteratively update target since we could have f1() calling f2() // calling f3() calling f4() and we want to output f1() directly // calling f4(). @@ -614,11 +608,9 @@ bool InlineSmallFunctions::inlineCallsInFunction( !BC.MIA->isTailCall(Inst) && Inst.size() == 1 && Inst.getOperand(0).isExpr()) { - auto Target = dyn_cast( - Inst.getOperand(0).getExpr()); - assert(Target && "Not MCSymbolRefExpr"); - const auto *TargetFunction = - BC.getFunctionForSymbol(&Target->getSymbol()); + const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst); + assert(TargetSymbol && "target symbol expected for direct call"); + const auto *TargetFunction = BC.getFunctionForSymbol(TargetSymbol); if (TargetFunction) { bool CallToInlineableFunction = InliningCandidates.count(TargetFunction); @@ -683,11 +675,9 @@ bool InlineSmallFunctions::inlineCallsInFunctionAggressive( Inst.size() == 1 && Inst.getOperand(0).isExpr()) { assert(!BC.MIA->isInvoke(Inst)); - auto Target = dyn_cast( - Inst.getOperand(0).getExpr()); - assert(Target && "Not MCSymbolRefExpr"); - const auto *TargetFunction = - BC.getFunctionForSymbol(&Target->getSymbol()); + const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst); + assert(TargetSymbol && "target symbol expected for direct call"); + const auto *TargetFunction = BC.getFunctionForSymbol(TargetSymbol); if (TargetFunction) { bool CallToInlineableFunction = InliningCandidates.count(TargetFunction); @@ -934,89 +924,86 @@ void FixupFunctions::runOnFunctions( bool SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, BinaryFunction &BF) { - if (BF.layout_size() == 0) + if (BF.layout_size() < 2) return false; + // Need updated indices to correctly detect branch' direction. 
+ BF.updateLayoutIndices(); + auto &MIA = BC.MIA; - uint64_t NumLocalTailCalls = 0; - uint64_t NumLocalPatchedTailCalls = 0; + uint64_t NumLocalCTCCandidates = 0; + uint64_t NumLocalCTCs = 0; + std::map ToPreserve; + for (auto *BB : BF.layout()) { + ToPreserve[BB] = true; - for (auto* BB : BF.layout()) { - const MCSymbol *TBB = nullptr; - const MCSymbol *FBB = nullptr; - MCInst *CondBranch = nullptr; - MCInst *UncondBranch = nullptr; + // Locate BB with a single direct tail-call instruction. + if (BB->getNumNonPseudos() != 1) + continue; - // Determine the control flow at the end of each basic block - if (!BB->analyzeBranch(*MIA, TBB, FBB, CondBranch, UncondBranch)) { + auto *Instr = BB->findFirstNonPseudoInstruction(); + if (!MIA->isTailCall(*Instr)) + continue; + auto *CalleeSymbol = MIA->getTargetSymbol(*Instr); + if (!CalleeSymbol) continue; + + // Detect direction of the possible conditional tail call. + // XXX: Once we start reordering functions this has to change. + bool IsForwardCTC; + const auto *CalleeBF = BC.getFunctionForSymbol(CalleeSymbol); + if (CalleeBF) { + IsForwardCTC = CalleeBF->getAddress() > BF.getAddress(); + } else { + // Absolute symbol. + auto const CalleeSI = BC.GlobalSymbols.find(CalleeSymbol->getName()); + assert(CalleeSI != BC.GlobalSymbols.end() && "unregistered symbol found"); + IsForwardCTC = CalleeSI->second > BF.getAddress(); } - // TODO: do we need to test for other branch patterns? - - // For this particular case, the first basic block ends with - // a conditional branch and has two successors, one fall-through - // and one for when the condition is true. - // The target of the conditional is a basic block with a single - // unconditional branch (i.e. tail call) to another function. - // We don't care about the contents of the fall-through block. - // Note: this code makes the assumption that the fall-through - // block is the last successor. 
- if (CondBranch && !UncondBranch && BB->succ_size() == 2) { - // Find conditional branch target assuming the fall-through is - // always the last successor. - auto *CondTargetBB = *BB->succ_begin(); - - // Does the BB contain a single instruction? - if (CondTargetBB->size() - CondTargetBB->getNumPseudos() == 1) { - // Check to see if the sole instruction is a tail call. - auto const &Instr = *CondTargetBB->begin(); - - if (MIA->isTailCall(Instr)) { - ++NumTailCallCandidates; - ++NumLocalTailCalls; - - auto const &TailTargetSymExpr = - cast(Instr.getOperand(0).getExpr()); - auto const &TailTarget = TailTargetSymExpr->getSymbol(); - - // Lookup the address for the tail call target. - auto const TailAddress = BC.GlobalSymbols.find(TailTarget.getName()); - if (TailAddress == BC.GlobalSymbols.end()) - continue; + // Iterate through all predecessors. + for (auto *PredBB : BB->predecessors()) { + if (PredBB->getConditionalSuccessor(true) != BB) + continue; - // Check to make sure we would be doing a forward jump. - // This assumes the address range of the current BB and the - // tail call target address don't overlap. - if (BF.getAddress() < TailAddress->second) { - ++NumTailCallsPatched; - ++NumLocalPatchedTailCalls; - - // Is the original jump forward or backward? - const bool isForward = - TailAddress->second > BF.getAddress() + BB->getOffset(); - - if (isForward) ++NumOrigForwardBranches; - - // Patch the new target address into the conditional branch. - CondBranch->getOperand(0).setExpr(TailTargetSymExpr); - // Remove the unused successor which may be eliminated later - // if there are no other users. - BB->removeSuccessor(CondTargetBB); - DEBUG(dbgs() << "patched " << (isForward ? "(fwd)" : "(back)") - << " tail call in " << BF << ".\n";); - } - } - } + ++NumLocalCTCCandidates; + + // We don't want to reverse direction of the branch in new order + // without further profile analysis. 
+ if (isForwardBranch(PredBB, BB) != IsForwardCTC) + continue; + + // Change destination of the unconditional branch. + const MCSymbol *TBB = nullptr; + const MCSymbol *FBB = nullptr; + MCInst *CondBranch = nullptr; + MCInst *UncondBranch = nullptr; + auto Result = + PredBB->analyzeBranch(*MIA, TBB, FBB, CondBranch, UncondBranch); + assert(Result && "internal error analyzing conditional branch"); + assert(CondBranch && "conditional branch expected"); + + MIA->replaceBranchTarget(*CondBranch, CalleeSymbol, BC.Ctx.get()); + PredBB->removeSuccessor(BB); + ++NumLocalCTCs; } + + // Remove the block from CFG if all predecessors were removed. + if (BB->pred_size() == 0 && !BB->isLandingPad()) + ToPreserve[BB] = false; } - DEBUG(dbgs() << "BOLT: patched " << NumLocalPatchedTailCalls - << " tail calls (" << NumOrigForwardBranches << " forward)" - << " from a total of " << NumLocalTailCalls - << " in function " << BF << "\n";); + // Clean-up unreachable tail-call blocks. + BF.eraseDeadBBs(ToPreserve); + + DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs + << " conditional tail calls from a total of " << NumLocalCTCCandidates + << " candidates in function " << BF << "\n";); + + NumTailCallsPatched += NumLocalCTCs; + NumTailCallCandidates += NumLocalCTCCandidates; - return NumLocalPatchedTailCalls > 0; + return NumLocalCTCs > 0; } void SimplifyConditionalTailCalls::runOnFunctions( @@ -1206,20 +1193,14 @@ void IdenticalCodeFolding::discoverCallers( continue; } - const MCOperand &TargetOp = Inst.getOperand(0); - if (!TargetOp.isExpr()) { - // This is an inderect call, we cannot record - // a target. + const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst); + if (!TargetSymbol) { + // This is an indirect call, we cannot record a target. ++InstrIndex; continue; } - // Find the target function for this call. 
- const auto *TargetExpr = TargetOp.getExpr(); - assert(TargetExpr->getKind() == MCExpr::SymbolRef); - const auto &TargetSymbol = - dyn_cast(TargetExpr)->getSymbol(); - const auto *Function = BC.getFunctionForSymbol(&TargetSymbol); + const auto *Function = BC.getFunctionForSymbol(TargetSymbol); if (!Function) { // Call to a function without a BinaryFunction object. ++InstrIndex; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index f5cb2fcc8b4b..3362f1a8a32d 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1243,10 +1243,9 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, for (const auto &Instr : *BB) { // Handle pseudo instructions. if (BC.MIA->isEHLabel(Instr)) { - assert(Instr.getNumOperands() == 1 && Instr.getOperand(0).isExpr() && + const auto *Label = BC.MIA->getTargetSymbol(Instr); + assert(Instr.getNumOperands() == 1 && Label && "bad EH_LABEL instruction"); - auto Label = &(cast(Instr.getOperand(0).getExpr()) - ->getSymbol()); Streamer.EmitLabel(const_cast(Label)); continue; } @@ -1952,10 +1951,6 @@ void RewriteInstance::rewriteFile() { if (Function.getImageAddress() == 0 || Function.getImageSize() == 0) continue; - if (Function.isSplit() && (Function.cold().getImageAddress() == 0 || - Function.cold().getImageSize() == 0)) - continue; - if (Function.getImageSize() > Function.getMaxSize()) { if (opts::Verbosity >= 1) { errs() << "BOLT-WARNING: new function size (0x" @@ -1968,6 +1963,10 @@ void RewriteInstance::rewriteFile() { continue; } + if (Function.isSplit() && (Function.cold().getImageAddress() == 0 || + Function.cold().getImageSize() == 0)) + continue; + OverwrittenScore += Function.getFunctionScore(); // Overwrite function in the output file. if (opts::Verbosity >= 2) { From 9b6574e9fc386275d4dbacc0849d5b476bb09758 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 9 Sep 2016 12:37:37 -0700 Subject: [PATCH 166/904] BOLT: Add per pass dyno stats + factor out post pass printing. 
Summary: I've added dyno stats printing per pass so we can see the results of each optimization pass on the stats. I've also factored out the post pass function printing code since it was pretty much the same after each pass. (cherry picked from commit 3448601a0f36f7160c6271bd3a53a05a0b6b8045) --- bolt/BinaryFunction.h | 40 +++++++- bolt/BinaryPassManager.cpp | 186 ++++++++++++++++++++++++++++-------- bolt/BinaryPassManager.h | 8 +- bolt/BinaryPasses.cpp | 191 ++++++++----------------------------- bolt/BinaryPasses.h | 94 +++++++++++++++++- bolt/RewriteInstance.cpp | 29 +++--- bolt/RewriteInstance.h | 13 --- 7 files changed, 332 insertions(+), 229 deletions(-) diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 26b6b22dc70f..3bed5b106aca 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -135,7 +135,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const DynoStats &Stats) { DynoStats operator+(const DynoStats &A, const DynoStats &B); /// BinaryFunction is a representation of machine-level function. -// +/// /// We use the term "Binary" as "Machine" was already taken. class BinaryFunction : public AddressRangesOwner { public: @@ -1226,6 +1226,44 @@ class BinaryFunction : public AddressRangesOwner { const FragmentInfo &cold() const { return ColdFragment; } }; +/// Return program-wide dynostats. +template +inline DynoStats getDynoStats(const FuncsType &Funcs) { + DynoStats dynoStats; + for (auto &BFI : Funcs) { + auto &BF = BFI.second; + if (BF.isSimple()) { + dynoStats += BF.getDynoStats(); + } + } + return dynoStats; +} + +/// Call a function with optional before and after dynostats printing. 
+template +inline void +callWithDynoStats(FnType &&Func, + const FuncsType &Funcs, + StringRef Phase, + const bool Flag) { + DynoStats dynoStatsBefore; + if (Flag) { + dynoStatsBefore = getDynoStats(Funcs); + outs() << "BOLT-INFO: program-wide dynostats before running " + << Phase << ":\n\n" << dynoStatsBefore << '\n'; + } + + Func(); + + if (Flag) { + auto dynoStatsAfter = getDynoStats(Funcs); + outs() << "BOLT-INFO: program-wide dynostats after running " + << Phase << ":\n\n" << dynoStatsBefore << '\n'; + dynoStatsAfter.print(outs(), &dynoStatsBefore); + outs() << '\n'; + } +} + /// Determine direction of the branch based on the current layout. /// Callee is responsible of updating basic block indices prior to using /// this function (e.g. by calling BinaryFunction::updateLayoutIndices()). diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 23cac76730f4..c66d7b2af158 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -11,63 +11,165 @@ #include "BinaryPassManager.h" +using namespace llvm; + namespace opts { -static llvm::cl::opt +extern llvm::cl::opt PrintAll; +extern llvm::cl::opt DumpDotAll; +extern llvm::cl::opt DynoStatsAll; + +static cl::opt EliminateUnreachable("eliminate-unreachable", - llvm::cl::desc("eliminate unreachable code"), - llvm::cl::ZeroOrMore); + cl::desc("eliminate unreachable code"), + cl::ZeroOrMore); -static llvm::cl::opt +static cl::opt OptimizeBodylessFunctions( "optimize-bodyless-functions", - llvm::cl::desc("optimize functions that just do a tail call"), - llvm::cl::ZeroOrMore); + cl::desc("optimize functions that just do a tail call"), + cl::ZeroOrMore); -static llvm::cl::opt +static cl::opt InlineSmallFunctions( "inline-small-functions", - llvm::cl::desc("inline functions with a single basic block"), - llvm::cl::ZeroOrMore); + cl::desc("inline functions with a single basic block"), + cl::ZeroOrMore); -static llvm::cl::opt +static cl::opt 
SimplifyConditionalTailCalls("simplify-conditional-tail-calls", - llvm::cl::desc("simplify conditional tail calls " - "by removing unnecessary jumps"), - llvm::cl::ZeroOrMore); + cl::desc("simplify conditional tail calls " + "by removing unnecessary jumps"), + cl::ZeroOrMore); -static llvm::cl::opt +static cl::opt Peepholes("peepholes", - llvm::cl::desc("run peephole optimizations"), - llvm::cl::ZeroOrMore); + cl::desc("run peephole optimizations"), + cl::ZeroOrMore); -static llvm::cl::opt +static cl::opt SimplifyRODataLoads("simplify-rodata-loads", - llvm::cl::desc("simplify loads from read-only sections by " - "replacing the memory operand with the " - "constant found in the corresponding " - "section"), - llvm::cl::ZeroOrMore); + cl::desc("simplify loads from read-only sections by " + "replacing the memory operand with the " + "constant found in the corresponding " + "section"), + cl::ZeroOrMore); -static llvm::cl::opt +static cl::opt IdenticalCodeFolding( "icf", - llvm::cl::desc("fold functions with identical code"), - llvm::cl::ZeroOrMore); + cl::desc("fold functions with identical code"), + cl::ZeroOrMore); + +static cl::opt +PrintReordered("print-reordered", + cl::desc("print functions after layout optimization"), + cl::ZeroOrMore, + cl::Hidden); + +static cl::opt +PrintOptimizeBodyless("print-optimize-bodyless", + cl::desc("print functions after bodyless optimization"), + cl::ZeroOrMore, + cl::Hidden); + +static cl::opt +PrintAfterBranchFixup("print-after-branch-fixup", + cl::desc("print function after fixing local branches"), + cl::Hidden); + +static cl::opt +PrintAfterFixup("print-after-fixup", + cl::desc("print function after fixup"), + cl::Hidden); + +static cl::opt +PrintUCE("print-uce", + cl::desc("print functions after unreachable code elimination"), + cl::ZeroOrMore, + cl::Hidden); + +static cl::opt +PrintSCTC("print-sctc", + cl::desc("print functions after conditional tail call simplification"), + cl::ZeroOrMore, + cl::Hidden); + +static cl::opt 
+PrintPeepholes("print-peepholes", + cl::desc("print functions after peephole optimization"), + cl::ZeroOrMore, + cl::Hidden); + +static cl::opt +PrintSimplifyROLoads("print-simplify-rodata-loads", + cl::desc("print functions after simplification of RO data" + " loads"), + cl::ZeroOrMore, + cl::Hidden); + +static cl::opt +PrintICF("print-icf", + cl::desc("print functions after ICF optimization"), + cl::ZeroOrMore, + cl::Hidden); + +static cl::opt +PrintInline("print-inline", + cl::desc("print functions after inlining optimization"), + cl::ZeroOrMore, + cl::Hidden); } // namespace opts namespace llvm { namespace bolt { +using namespace opts; + cl::opt BinaryFunctionPassManager::AlwaysOn( "always-run-pass", - llvm::cl::desc("Used for passes that are always enabled"), + cl::desc("Used for passes that are always enabled"), cl::init(true), cl::ReallyHidden); bool BinaryFunctionPassManager::NagUser = false; +void BinaryFunctionPassManager::runPasses() { + for (const auto &OptPassPair : Passes) { + if (!OptPassPair.first) + continue; + + auto &Pass = OptPassPair.second; + + callWithDynoStats( + [this,&Pass] { + Pass->runOnFunctions(BC, BFs, LargeFunctions); + }, + BFs, + Pass->getName(), + opts::DynoStatsAll + ); + + if (!opts::PrintAll && !opts::DumpDotAll && !Pass->printPass()) + continue; + + const std::string Message = std::string("after ") + Pass->getName(); + + for (auto &It : BFs) { + auto &Function = It.second; + + if (!Pass->shouldPrint(Function)) + continue; + + Function.print(outs(), Message, true); + + if (opts::DumpDotAll) + Function.dumpGraphForPass(Pass->getName()); + } + } +} + void BinaryFunctionPassManager::runAllPasses( BinaryContext &BC, std::map &Functions, @@ -78,44 +180,48 @@ void BinaryFunctionPassManager::runAllPasses( // Here we manage dependencies/order manually, since passes are ran in the // order they're registered. 
- Manager.registerPass(llvm::make_unique(), + Manager.registerPass(llvm::make_unique(PrintICF), opts::IdenticalCodeFolding); - Manager.registerPass(llvm::make_unique(), + Manager.registerPass(llvm::make_unique(PrintInline), opts::InlineSmallFunctions); Manager.registerPass( - std::move(llvm::make_unique(Manager.NagUser)), + llvm::make_unique(PrintUCE, Manager.NagUser), opts::EliminateUnreachable); - Manager.registerPass(llvm::make_unique(), - opts::OptimizeBodylessFunctions); + Manager.registerPass( + llvm::make_unique(PrintOptimizeBodyless), + opts::OptimizeBodylessFunctions); - Manager.registerPass(llvm::make_unique(), - opts::SimplifyRODataLoads); + Manager.registerPass( + llvm::make_unique(PrintSimplifyROLoads), + opts::SimplifyRODataLoads); Manager.registerPass( - std::move(llvm::make_unique(Manager.NagUser)), + llvm::make_unique(PrintUCE, Manager.NagUser), opts::EliminateUnreachable); - Manager.registerPass(std::move(llvm::make_unique())); + Manager.registerPass(llvm::make_unique(PrintReordered)); - Manager.registerPass(llvm::make_unique(), opts::Peepholes); + Manager.registerPass(llvm::make_unique(PrintPeepholes), + opts::Peepholes); // This pass syncs local branches with CFG. If any of the following // passes breaks the sync - they either need to re-run the pass or // fix branches consistency internally. - Manager.registerPass(std::move(llvm::make_unique())); + Manager.registerPass(llvm::make_unique(PrintAfterBranchFixup)); // This pass introduces conditional jumps into external functions. // Between extending CFG to support this and isolating this pass we chose // the latter. Thus this pass will do unreachable code elimination // if necessary and wouldn't rely on UCE for this. // More generally this pass should be the last optimization pass. 
- Manager.registerPass(llvm::make_unique(), - opts::SimplifyConditionalTailCalls); + Manager.registerPass( + llvm::make_unique(PrintSCTC), + opts::SimplifyConditionalTailCalls); - Manager.registerPass(std::move(llvm::make_unique())); + Manager.registerPass(llvm::make_unique(PrintAfterFixup)); Manager.runPasses(); } diff --git a/bolt/BinaryPassManager.h b/bolt/BinaryPassManager.h index 5875a99b5a7e..fb6d34e176a7 100644 --- a/bolt/BinaryPassManager.h +++ b/bolt/BinaryPassManager.h @@ -55,13 +55,7 @@ class BinaryFunctionPassManager { } /// Run all registered passes in the order they were added. - void runPasses() { - for (const auto &OptPassPair : Passes) { - if (OptPassPair.first) { - OptPassPair.second->runOnFunctions(BC, BFs, LargeFunctions); - } - } - } + void runPasses(); /// Runs all enabled implemented passes on all functions. static void runAllPasses(BinaryContext &BC, diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index 070a0c3668fb..f4b0f02bfdd9 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -20,58 +20,9 @@ using namespace llvm; namespace opts { extern cl::opt Verbosity; -extern cl::opt PrintAll; -extern cl::opt DumpDotAll; extern cl::opt SplitFunctions; extern bool shouldProcess(const bolt::BinaryFunction &Function); -static cl::opt -PrintReordered("print-reordered", - cl::desc("print functions after layout optimization"), - cl::ZeroOrMore, - cl::Hidden); - -static cl::opt -PrintAfterBranchFixup("print-after-branch-fixup", - cl::desc("print function after fixing local branches"), - cl::Hidden); - -static cl::opt -PrintAfterFixup("print-after-fixup", - cl::desc("print function after fixup"), - cl::Hidden); - -static cl::opt -PrintUCE("print-uce", - cl::desc("print functions after unreachable code elimination"), - cl::ZeroOrMore, - cl::Hidden); - -static cl::opt -PrintPeepholes("print-peepholes", - cl::desc("print functions after peephole optimization"), - cl::ZeroOrMore, - cl::Hidden); - -static cl::opt 
-PrintSimplifyROLoads("print-simplify-rodata-loads", - cl::desc("print functions after simplification of RO data" - " loads"), - cl::ZeroOrMore, - cl::Hidden); - -static cl::opt -PrintICF("print-icf", - cl::desc("print functions after ICF optimization"), - cl::ZeroOrMore, - cl::Hidden); - -static cl::opt -PrintInline("print-inline", - cl::desc("print functions after inlining optimization"), - cl::ZeroOrMore, - cl::Hidden); - static cl::list ForceInlineFunctions("force-inline", cl::CommaSeparated, @@ -124,6 +75,14 @@ MinBranchClusters( namespace llvm { namespace bolt { +bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const { + return BF.isSimple() && opts::shouldProcess(BF); +} + +bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const { + return BF.isSimple() && opts::shouldProcess(BF); +} + void OptimizeBodylessFunctions::analyze( BinaryFunction &BF, BinaryContext &BC, @@ -181,13 +140,13 @@ void OptimizeBodylessFunctions::runOnFunctions( std::set &) { for (auto &It : BFs) { auto &Function = It.second; - if (Function.isSimple() && opts::shouldProcess(Function)) { + if (shouldOptimize(Function)) { analyze(Function, BC, BFs); } } for (auto &It : BFs) { auto &Function = It.second; - if (Function.isSimple() && opts::shouldProcess(Function)) { + if (shouldOptimize(Function)) { optimizeCalls(Function, BC); } } @@ -198,9 +157,7 @@ void InlineSmallFunctions::findInliningCandidates( const std::map &BFs) { for (const auto &BFIt : BFs) { const auto &Function = BFIt.second; - if (!Function.isSimple() || - !opts::shouldProcess(Function) || - Function.size() != 1) + if (!shouldOptimize(Function) || Function.size() != 1) continue; auto &BB = *Function.begin(); const auto &LastInstruction = *BB.rbegin(); @@ -229,8 +186,7 @@ void InlineSmallFunctions::findInliningCandidatesAggressive( }; for (const auto &BFIt : BFs) { const auto &Function = BFIt.second; - if (!Function.isSimple() || - !opts::shouldProcess(Function) || + if (!shouldOptimize(Function) 
|| OverwrittenFunctions.count(Function.getSymbol()->getName()) || Function.hasEHRanges()) continue; @@ -732,16 +688,13 @@ void InlineSmallFunctions::runOnFunctions( findInliningCandidates(BC, BFs); std::vector ConsideredFunctions; - std::vector Modified; for (auto &It : BFs) { auto &Function = It.second; - if (!Function.isSimple() || !opts::shouldProcess(Function)) - continue; - if (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE && - !mustConsider(Function)) + if (!shouldOptimize(Function) || + (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE && + !mustConsider(Function))) continue; ConsideredFunctions.push_back(&Function); - Modified.push_back(false); } std::sort(ConsideredFunctions.begin(), ConsideredFunctions.end(), [](BinaryFunction *A, BinaryFunction *B) { @@ -757,24 +710,11 @@ void InlineSmallFunctions::runOnFunctions( : inlineCallsInFunction(BC, Function); if (DidInline) { - Modified[i] = true; + Modified.insert(&Function); ++ModifiedFunctions; } } - if (opts::PrintAll || opts::PrintInline || opts::DumpDotAll) { - for (unsigned i = 0; i < ConsideredFunctions.size(); ++i) { - if (Modified[i]) { - const auto *Function = ConsideredFunctions[i]; - if (opts::PrintAll || opts::PrintInline) - Function->print(errs(), "after inlining", true); - - if (opts::DumpDotAll) - Function->dumpGraphForPass("inlining"); - } - } - } - DEBUG(dbgs() << "BOLT-INFO: Inlined " << InlinedDynamicCalls << " of " << TotalDynamicCalls << " function calls in the profile.\n" << "BOLT-INFO: Inlined calls represent " @@ -783,8 +723,6 @@ void InlineSmallFunctions::runOnFunctions( } void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) { - if (!Function.isSimple() || !opts::shouldProcess(Function)) return; - // FIXME: this wouldn't work with C++ exceptions until we implement // support for those as there will be "invisible" edges // in the graph. 
@@ -822,12 +760,6 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) { DEBUG(dbgs() << "BOLT: Removed " << Count << " dead basic block(s) in function " << Function << '\n'); } - - if (opts::PrintAll || opts::PrintUCE) - Function.print(outs(), "after unreachable code elimination", true); - - if (opts::DumpDotAll) - Function.dumpGraphForPass("unreachable-code"); } } @@ -837,10 +769,18 @@ void EliminateUnreachableBlocks::runOnFunctions( std::set & ) { for (auto &It : BFs) { - runOnFunction(It.second); + auto &Function = It.second; + if (shouldOptimize(Function)) { + runOnFunction(Function); + } } } +bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const { + return (BinaryFunctionPass::shouldPrint(BF) && + opts::ReorderBlocks != BinaryFunction::LT_NONE); +} + void ReorderBasicBlocks::runOnFunctions( BinaryContext &BC, std::map &BFs, @@ -848,10 +788,7 @@ void ReorderBasicBlocks::runOnFunctions( for (auto &It : BFs) { auto &Function = It.second; - if (!Function.isSimple()) - continue; - - if (!opts::shouldProcess(Function)) + if (!shouldOptimize(Function)) continue; if (opts::ReorderBlocks != BinaryFunction::LT_NONE) { @@ -862,10 +799,6 @@ void ReorderBasicBlocks::runOnFunctions( (LargeFunctions.find(It.first) != LargeFunctions.end()); Function.modifyLayout(opts::ReorderBlocks, opts::MinBranchClusters, ShouldSplit); - if (opts::PrintAll || opts::PrintReordered) - Function.print(outs(), "after reordering blocks", true); - if (opts::DumpDotAll) - Function.dumpGraphForPass("reordering"); } } } @@ -876,16 +809,9 @@ void FixupBranches::runOnFunctions( std::set &) { for (auto &It : BFs) { auto &Function = It.second; - - if (!Function.isSimple() || !opts::shouldProcess(Function)) - continue; - - Function.fixBranches(); - - if (opts::PrintAll || opts::PrintAfterBranchFixup) - Function.print(errs(), "after branch fixup", true); - if (opts::DumpDotAll) - Function.dumpGraphForPass("after-branch-fixup"); + if (shouldOptimize(Function)) { + 
Function.fixBranches(); + } } } @@ -897,10 +823,7 @@ void FixupFunctions::runOnFunctions( for (auto &It : BFs) { auto &Function = It.second; - if (!Function.isSimple()) - continue; - - if (!opts::shouldProcess(Function)) + if (!shouldOptimize(Function)) continue; // Fix the CFI state. @@ -915,10 +838,6 @@ void FixupFunctions::runOnFunctions( // Update exception handling information. Function.updateEHRanges(); - if (opts::PrintAll || opts::PrintAfterFixup) - Function.print(errs(), "after fixup", true); - if (opts::DumpDotAll) - Function.dumpGraphForPass("after-fixup"); } } @@ -1014,17 +933,12 @@ void SimplifyConditionalTailCalls::runOnFunctions( for (auto &It : BFs) { auto &Function = It.second; - if (!Function.isSimple()) + if (!shouldOptimize(Function)) continue; // Fix tail calls to reduce branch mispredictions. if (fixTailCalls(BC, Function)) { - if (opts::PrintAll || opts::PrintReordered) { - Function.print(outs(), "after tail call patching", true); - } - if (opts::DumpDotAll) { - Function.dumpGraphForPass("tail-call-patching"); - } + Modified.insert(&Function); } } @@ -1047,16 +961,8 @@ void Peepholes::runOnFunctions(BinaryContext &BC, std::set &LargeFunctions) { for (auto &It : BFs) { auto &Function = It.second; - if (Function.isSimple() && opts::shouldProcess(Function)) { + if (shouldOptimize(Function)) { shortenInstructions(BC, Function); - - if (opts::PrintAll || opts::PrintPeepholes) { - Function.print(outs(), "after peepholes", true); - } - - if (opts::DumpDotAll) { - Function.dumpGraphForPass("peepholes"); - } } } } @@ -1149,22 +1055,10 @@ void SimplifyRODataLoads::runOnFunctions( std::map &BFs, std::set & ) { - for (auto &It : BFs) { auto &Function = It.second; - - if (!Function.isSimple()) - continue; - - if (simplifyRODataLoads(BC, Function)) { - if (opts::PrintAll || opts::PrintSimplifyROLoads) { - Function.print(outs(), - "after simplifying read-only section loads", - true); - } - if (opts::DumpDotAll) { - 
Function.dumpGraphForPass("simplify-rodata-loads"); - } + if (shouldOptimize(Function) && simplifyRODataLoads(BC, Function)) { + Modified.insert(&Function); } } @@ -1180,7 +1074,7 @@ void IdenticalCodeFolding::discoverCallers( for (auto &I : BFs) { BinaryFunction &Caller = I.second; - if (!Caller.isSimple()) + if (!shouldOptimize(Caller)) continue; for (BinaryBasicBlock &BB : Caller) { @@ -1281,7 +1175,6 @@ void IdenticalCodeFolding::runOnFunctions( std::map &BFs, std::set & ) { - discoverCallers(BC, BFs); // This hash table is used to identify identical functions. It maps @@ -1304,9 +1197,9 @@ void IdenticalCodeFolding::runOnFunctions( // to each other. Initialized with all simple functions. std::vector Cands; for (auto &I : BFs) { - BinaryFunction *BF = &I.second; - if (BF->isSimple()) - Cands.emplace_back(BF); + auto &BF = I.second; + if (shouldOptimize(BF)) + Cands.emplace_back(&BF); } // We repeat the icf pass until no new modifications happen. @@ -1383,12 +1276,6 @@ void IdenticalCodeFolding::runOnFunctions( << "BOLT-INFO: Removing all identical functions could save " << format("%.2lf", (double) BytesSavedEstimate / 1024) << " KB of code space.\n"; - - if (opts::PrintAll || opts::PrintICF) { - for (auto &I : BFs) { - I.second.print(outs(), "after identical code folding", true); - } - } } } // namespace bolt diff --git a/bolt/BinaryPasses.h b/bolt/BinaryPasses.h index 51c37665bedc..9bc2987a3921 100644 --- a/bolt/BinaryPasses.h +++ b/bolt/BinaryPasses.h @@ -16,18 +16,40 @@ #include "BinaryContext.h" #include "BinaryFunction.h" +#include "llvm/Support/CommandLine.h" #include #include #include #include +#include namespace llvm { namespace bolt { /// An optimization/analysis pass that runs on functions. class BinaryFunctionPass { + const cl::opt &PrintPass; +protected: + explicit BinaryFunctionPass(const cl::opt &PrintPass) + : PrintPass(PrintPass) { } + + /// Control whether a specific function should be skipped during + /// optimization. 
+ bool shouldOptimize(const BinaryFunction &BF) const; public: virtual ~BinaryFunctionPass() = default; + + /// The name of this pass + virtual const char *getName() const = 0; + + /// Control whether debug info is printed after this pass is completed. + bool printPass() const { return PrintPass; } + + /// Control whether debug info is printed for an individual function after + /// this pass is completed (printPass() must have returned true). + virtual bool shouldPrint(const BinaryFunction &BF) const; + + /// Execute this pass on the given functions. virtual void runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) = 0; @@ -50,6 +72,11 @@ class OptimizeBodylessFunctions : public BinaryFunctionPass { BinaryContext &BC); public: + explicit OptimizeBodylessFunctions(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { } + const char *getName() const override { + return "optimize-bodyless"; + } void runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) override; @@ -75,6 +102,7 @@ class InlineSmallFunctions : public BinaryFunctionPass { uint64_t TotalDynamicCalls = 0; uint64_t InlinedDynamicCalls = 0; uint64_t TotalInlineableCalls = 0; + std::unordered_set Modified; static bool mustConsider(const BinaryFunction &BF); @@ -115,6 +143,15 @@ class InlineSmallFunctions : public BinaryFunctionPass { const BinaryFunction &InlinedFunction); public: + explicit InlineSmallFunctions(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { } + + const char *getName() const override { + return "inlining"; + } + bool shouldPrint(const BinaryFunction &BF) const override { + return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0; + } void runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) override; @@ -126,8 +163,12 @@ class EliminateUnreachableBlocks : public BinaryFunctionPass { bool& NagUser; void runOnFunction(BinaryFunction& Function); public: - explicit EliminateUnreachableBlocks(bool 
&nagUser) : NagUser(nagUser) { } + EliminateUnreachableBlocks(const cl::opt &PrintPass, bool &NagUser) + : BinaryFunctionPass(PrintPass), NagUser(NagUser) { } + const char *getName() const override { + return "eliminate-unreachable"; + } void runOnFunctions(BinaryContext&, std::map &BFs, std::set &LargeFunctions) override; @@ -136,6 +177,13 @@ class EliminateUnreachableBlocks : public BinaryFunctionPass { // Reorder the basic blocks for each function based on hotness. class ReorderBasicBlocks : public BinaryFunctionPass { public: + explicit ReorderBasicBlocks(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { } + + const char *getName() const override { + return "reordering"; + } + bool shouldPrint(const BinaryFunction &BF) const override; void runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) override; @@ -144,6 +192,12 @@ class ReorderBasicBlocks : public BinaryFunctionPass { /// Sync local branches with CFG. class FixupBranches : public BinaryFunctionPass { public: + explicit FixupBranches(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { } + + const char *getName() const override { + return "fix-branches"; + } void runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) override; @@ -153,6 +207,12 @@ class FixupBranches : public BinaryFunctionPass { /// passes have completed. 
class FixupFunctions : public BinaryFunctionPass { public: + explicit FixupFunctions(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { } + + const char *getName() const override { + return "fixup-functions"; + } void runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) override; @@ -176,9 +236,19 @@ class SimplifyConditionalTailCalls : public BinaryFunctionPass { uint64_t NumTailCallCandidates{0}; uint64_t NumTailCallsPatched{0}; uint64_t NumOrigForwardBranches{0}; + std::unordered_set Modified; bool fixTailCalls(BinaryContext &BC, BinaryFunction &BF); public: + explicit SimplifyConditionalTailCalls(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { } + + const char *getName() const override { + return "simplify-conditional-tail-calls"; + } + bool shouldPrint(const BinaryFunction &BF) const override { + return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0; + } void runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) override; @@ -188,6 +258,12 @@ class SimplifyConditionalTailCalls : public BinaryFunctionPass { class Peepholes : public BinaryFunctionPass { void shortenInstructions(BinaryContext &BC, BinaryFunction &Function); public: + explicit Peepholes(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { } + + const char *getName() const override { + return "peepholes"; + } void runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) override; @@ -209,10 +285,20 @@ class SimplifyRODataLoads : public BinaryFunctionPass { uint64_t NumDynamicLoadsSimplified{0}; uint64_t NumLoadsFound{0}; uint64_t NumDynamicLoadsFound{0}; + std::unordered_set Modified; bool simplifyRODataLoads(BinaryContext &BC, BinaryFunction &BF); public: + explicit SimplifyRODataLoads(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { } + + const char *getName() const override { + return "simplify-read-only-loads"; + } + bool shouldPrint(const BinaryFunction &BF) const 
override { + return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0; + } void runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) override; @@ -254,6 +340,12 @@ class IdenticalCodeFolding : public BinaryFunctionPass { std::map &BFs); public: + explicit IdenticalCodeFolding(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { } + + const char *getName() const override { + return "identical-code-folding"; + } void runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) override; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 3362f1a8a32d..84a06ed51b6c 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -94,6 +94,11 @@ cl::opt PrintDynoStats("dyno-stats", cl::desc("print execution info based on profile")); +cl::opt +DynoStatsAll("dyno-stats-all", cl::desc("print dyno stats after each stage"), + cl::ZeroOrMore, + cl::Hidden); + static cl::list FunctionNames("funcs", cl::CommaSeparated, @@ -1077,21 +1082,15 @@ void RewriteInstance::disassembleFunctions() { } void RewriteInstance::runOptimizationPasses() { - DynoStats dynoStatsBefore; - if (opts::PrintDynoStats) { - dynoStatsBefore = getDynoStats(); - outs() << "BOLT-INFO: program-wide dynostats before running " - "optimizations:\n\n" << dynoStatsBefore << '\n'; - } - - BinaryFunctionPassManager::runAllPasses(*BC, BinaryFunctions, LargeFunctions); - - if (opts::PrintDynoStats) { - auto dynoStatsAfter = getDynoStats(); - outs() << "BOLT-INFO: program-wide dynostats after optimizaions:\n\n"; - dynoStatsAfter.print(outs(), &dynoStatsBefore); - outs() << '\n'; - } + callWithDynoStats( + [this] { + BinaryFunctionPassManager::runAllPasses(*BC, + BinaryFunctions, + LargeFunctions); + }, + BinaryFunctions, + "optimizations", + opts::PrintDynoStats || opts::DynoStatsAll); } namespace { diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 415a56d44f69..a5162077a817 100644 --- a/bolt/RewriteInstance.h +++ 
b/bolt/RewriteInstance.h @@ -342,19 +342,6 @@ class RewriteInstance { uint64_t Address, uint64_t Size, bool IsSimple); - - /// Return program-wide dynostats. - DynoStats getDynoStats() const { - DynoStats dynoStats; - for (auto &BFI : BinaryFunctions) { - auto &BF = BFI.second; - if (BF.isSimple()) { - dynoStats += BF.getDynoStats(); - } - } - return dynoStats; - } - }; } // namespace bolt From 11a994e3e9153d734d347584e32470e47d3150cf Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 9 Sep 2016 14:42:35 -0700 Subject: [PATCH 167/904] Use BB.getNumNonPseudos() in more places. Summary: Use BB.getNumNonPseudos() in more places. Fix analyze_potential script to pass the new parameter. (cherry picked from commit e0a34f4f8c7ba8c871fe43c7d6d9db77150e746a) --- bolt/BinaryFunction.cpp | 2 +- bolt/BinaryPasses.cpp | 2 +- bolt/ReorderAlgorithm.cpp | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 7766867b3d2d..9217205784f3 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1469,7 +1469,7 @@ uint64_t BinaryFunction::getFunctionScore() { uint64_t BBExecCount = BB->getExecutionCount(); if (BBExecCount == BinaryBasicBlock::COUNT_NO_PROFILE) continue; - BBExecCount *= (BB->Instructions.size() - BB->getNumPseudos()); + BBExecCount *= BB->getNumNonPseudos(); TotalScore += BBExecCount; } FunctionScore = TotalScore; diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index f4b0f02bfdd9..6c97c41cd04d 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -163,7 +163,7 @@ void InlineSmallFunctions::findInliningCandidates( const auto &LastInstruction = *BB.rbegin(); // Check if the function is small enough and doesn't do a tail call. 
if (BB.size() > 0 && - (BB.size() - BB.getNumPseudos()) <= kMaxInstructions && + BB.getNumNonPseudos() <= kMaxInstructions && BC.MIA->isReturn(LastInstruction) && !BC.MIA->isTailCall(LastInstruction)) { InliningCandidates.insert(&Function); diff --git a/bolt/ReorderAlgorithm.cpp b/bolt/ReorderAlgorithm.cpp index abfe983d5837..31c00b1b5ee1 100644 --- a/bolt/ReorderAlgorithm.cpp +++ b/bolt/ReorderAlgorithm.cpp @@ -56,9 +56,8 @@ void ClusterAlgorithm::computeClusterAverageFrequency() { for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { double Freq = 0.0; for (auto BB : Clusters[I]) { - if (!BB->empty() && BB->size() != BB->getNumPseudos()) - Freq += ((double) BB->getExecutionCount()) / - (BB->size() - BB->getNumPseudos()); + if (BB->getNumNonPseudos() > 0) + Freq += ((double) BB->getExecutionCount()) / BB->getNumNonPseudos(); } AvgFreq[I] = Freq; } From 641ae204e2dd1b82408555ab7c5a8803b6c187b6 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 2 Sep 2016 18:09:07 -0700 Subject: [PATCH 168/904] BOLT: Remove double jumps peephole. Summary: Replace jumps to other unconditional jumps with the final destination, e.g. B0: ... jmp B1 (or jcc B1) B1: jmp B2 -> B0: ... jmp B2 (or jcc B1) This peephole removes 8928 double jumps from a test binary. Note: after filtering out double jumps found in EH code and infinite loops, the number of double jumps patched is 49 (24 for a clang compiled test). The 24 in the clang build are all from external libraries which have probably been compiled with gcc. This peephole is still useful for cleaning up after ICP though. 
(cherry picked from commit 994309552e1880e9d178a3770492c9c6dd9d4c84) --- bolt/BinaryBasicBlock.cpp | 35 +++++++++++++++++ bolt/BinaryBasicBlock.h | 36 +++++++++++++++++- bolt/BinaryPassManager.cpp | 4 ++ bolt/BinaryPasses.cpp | 77 ++++++++++++++++++++++++++++++++++++++ bolt/BinaryPasses.h | 2 + 5 files changed, 152 insertions(+), 2 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 2ac456551773..1930369e1bf0 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -39,6 +39,15 @@ MCInst *BinaryBasicBlock::findFirstNonPseudoInstruction() { return nullptr; } +MCInst *BinaryBasicBlock::findLastNonPseudoInstruction() { + auto &BC = Function->getBinaryContext(); + for (auto Itr = Instructions.rbegin(); Itr != Instructions.rend(); ++Itr) { + if (!BC.MII->get(Itr->getOpcode()).isPseudo()) + return &*Itr; + } + return nullptr; +} + BinaryBasicBlock *BinaryBasicBlock::getSuccessor(const MCSymbol *Label) const { if (!Label && succ_size() == 1) return *succ_begin(); @@ -68,6 +77,24 @@ void BinaryBasicBlock::addSuccessor(BinaryBasicBlock *Succ, Succ->Predecessors.push_back(this); } +void BinaryBasicBlock::replaceSuccessor(BinaryBasicBlock *Succ, + BinaryBasicBlock *NewSucc, + uint64_t Count, + uint64_t MispredictedCount) { + auto I = succ_begin(); + auto BI = BranchInfo.begin(); + for (; I != succ_end(); ++I) { + assert(BI != BranchInfo.end() && "missing BranchInfo entry"); + if (*I == Succ) + break; + ++BI; + } + assert(I != succ_end() && "no such successor!"); + + *I = NewSucc; + *BI = BinaryBranchInfo{Count, MispredictedCount}; +} + void BinaryBasicBlock::removeSuccessor(BinaryBasicBlock *Succ) { Succ->removePredecessor(this); auto I = succ_begin(); @@ -117,12 +144,20 @@ bool BinaryBasicBlock::swapConditionalSuccessors() { } void BinaryBasicBlock::addBranchInstruction(const BinaryBasicBlock *Successor) { + assert(isSuccessor(Successor)); auto &BC = Function->getBinaryContext(); MCInst NewInst; 
BC.MIA->createUncondBranch(NewInst, Successor->getLabel(), BC.Ctx.get()); Instructions.emplace_back(std::move(NewInst)); } +void BinaryBasicBlock::addTailCallInstruction(const MCSymbol *Target) { + auto &BC = Function->getBinaryContext(); + MCInst NewInst; + BC.MIA->createTailCall(NewInst, Target, BC.Ctx.get()); + Instructions.emplace_back(std::move(NewInst)); +} + uint32_t BinaryBasicBlock::getNumPseudos() const { #ifndef NDEBUG auto &BC = Function->getBinaryContext(); diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 78c8313cb84a..fba243966f19 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -308,6 +308,15 @@ class BinaryBasicBlock { return Successors[Condition == true ? 0 : 1]; } + /// Find the fallthrough successor for a block, or nullptr if there is + /// none. + const BinaryBasicBlock* getFallthrough() const { + if (succ_size() == 2) + return getConditionalSuccessor(false); + else + return getSuccessor(); + } + const BinaryBranchInfo &getBranchInfo(bool Condition) const { assert(BranchInfo.size() == 2 && "could only be called for blocks with 2 successors"); @@ -324,6 +333,10 @@ class BinaryBasicBlock { /// basic block to the end of this basic block. void addBranchInstruction(const BinaryBasicBlock *Successor); + /// Add an instruction with tail call control transfer to \p Target + /// to the end of this basic block. + void addTailCallInstruction(const MCSymbol *Target); + /// Get landing pad with given label. Returns nullptr if no such /// landing pad is found. BinaryBasicBlock *getLandingPad(const MCSymbol *Label) const; @@ -333,8 +346,6 @@ class BinaryBasicBlock { return Label->getName(); } - MCInst *findFirstNonPseudoInstruction(); - /// Add instruction at the end of this basic block. /// Returns the index of the instruction in the Instructions vector of the BB. 
uint32_t addInstruction(MCInst &&Inst) { @@ -372,6 +383,14 @@ class BinaryBasicBlock { return size() - getNumPseudos(); } + /// Return a pointer to the first non-pseudo instruction in this basic + /// block. Returns nullptr if none exists. + MCInst *findFirstNonPseudoInstruction(); + + /// Return a pointer to the last non-pseudo instruction in this basic + /// block. Returns nullptr if none exists. + MCInst *findLastNonPseudoInstruction(); + /// Set minimum alignment for the basic block. void setAlignment(uint64_t Align) { Alignment = Align; @@ -419,6 +438,13 @@ class BinaryBasicBlock { } } + /// Replace Succ with NewSucc. This routine is helpful for preserving + /// the order of conditional successors when editing the CFG. + void replaceSuccessor(BinaryBasicBlock *Succ, + BinaryBasicBlock *NewSucc, + uint64_t Count = 0, + uint64_t MispredictedCount = 0); + /// Adds block to landing pad list. void addLandingPad(BinaryBasicBlock *LPBlock); @@ -434,6 +460,12 @@ class BinaryBasicBlock { } } + /// Test if BB is a successor of this block. + bool isSuccessor(const BinaryBasicBlock *BB) const { + auto Itr = std::find(Successors.begin(), Successors.end(), BB); + return Itr != Successors.end(); + } + /// Return the information about the number of times this basic block was /// executed. /// diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index c66d7b2af158..f653c26140a3 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -207,6 +207,10 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(llvm::make_unique(PrintPeepholes), opts::Peepholes); + Manager.registerPass( + llvm::make_unique(PrintUCE, Manager.NagUser), + opts::EliminateUnreachable); + // This pass syncs local branches with CFG. If any of the following // passes breaks the sync - they either need to re-run the pass or // fix branches consistency internally. 
diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index 6c97c41cd04d..d67bda059d8d 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -956,6 +956,81 @@ void Peepholes::shortenInstructions(BinaryContext &BC, } } +// This peephole fixes jump instructions that jump to another basic +// block with a single jump instruction, e.g. +// +// B0: ... +// jmp B1 (or jcc B1) +// +// B1: jmp B2 +// +// -> +// +// B0: ... +// jmp B2 (or jcc B2) +// +void Peepholes::fixDoubleJumps(BinaryContext &BC, + BinaryFunction &Function) { + for (auto &BB : Function) { + auto checkAndPatch = [&](BinaryBasicBlock *Pred, + BinaryBasicBlock *Succ, + const MCSymbol *SuccSym) { + // Ignore infinite loop jumps or fallthrough tail jumps. + if (Pred == Succ || Succ == &BB) + return; + + if (Succ) { + Pred->replaceSuccessor(&BB, Succ, BinaryBasicBlock::COUNT_NO_PROFILE); + } else { + // Succ will be null in the tail call case. In this case we + // need to explicitly add a tail call instruction. + auto *Branch = Pred->findLastNonPseudoInstruction(); + if (Branch && BC.MIA->isUnconditionalBranch(*Branch)) { + Pred->removeSuccessor(&BB); + Pred->eraseInstruction(Branch); + Pred->addTailCallInstruction(SuccSym); + } else { + return; + } + } + + ++NumDoubleJumps; + DEBUG(dbgs() << "Removed double jump in " << Function << " from " + << Pred->getName() << " -> " << BB.getName() << " to " + << Pred->getName() << " -> " << SuccSym->getName() + << (!Succ ? 
" (tail)\n" : "\n")); + }; + + if (BB.getNumNonPseudos() != 1 || BB.isLandingPad()) + continue; + + auto *Inst = BB.findFirstNonPseudoInstruction(); + const bool IsTailCall = BC.MIA->isTailCall(*Inst); + + if (!BC.MIA->isUnconditionalBranch(*Inst) && !IsTailCall) + continue; + + const auto *SuccSym = BC.MIA->getTargetSymbol(*Inst); + auto *Succ = BB.getSuccessor(SuccSym); + + if ((!Succ || &BB == Succ) && !IsTailCall) + continue; + + std::vector Preds{BB.pred_begin(), BB.pred_end()}; + + for (auto *Pred : Preds) { + if (Pred->isLandingPad()) + continue; + + if (Pred->getSuccessor() == &BB || + (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) || + Pred->getConditionalSuccessor(false) == &BB) { + checkAndPatch(Pred, Succ, SuccSym); + } + } + } +} + void Peepholes::runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) { @@ -963,8 +1038,10 @@ void Peepholes::runOnFunctions(BinaryContext &BC, auto &Function = It.second; if (shouldOptimize(Function)) { shortenInstructions(BC, Function); + fixDoubleJumps(BC, Function); } } + outs() << "BOLT-INFO: " << NumDoubleJumps << " double jumps patched.\n"; } bool SimplifyRODataLoads::simplifyRODataLoads( diff --git a/bolt/BinaryPasses.h b/bolt/BinaryPasses.h index 9bc2987a3921..41602213bb99 100644 --- a/bolt/BinaryPasses.h +++ b/bolt/BinaryPasses.h @@ -256,7 +256,9 @@ class SimplifyConditionalTailCalls : public BinaryFunctionPass { /// Perform simple peephole optimizations. class Peepholes : public BinaryFunctionPass { + uint64_t NumDoubleJumps{0}; void shortenInstructions(BinaryContext &BC, BinaryFunction &Function); + void fixDoubleJumps(BinaryContext &BC, BinaryFunction &Function); public: explicit Peepholes(const cl::opt &PrintPass) : BinaryFunctionPass(PrintPass) { } From 99247366f739405e483fb2c07e09b4a5ed6cda49 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 12 Sep 2016 10:12:31 -0700 Subject: [PATCH 169/904] Fix switch table detection. 
Disassemble all instructions in non-simple functions. Summary: Switch table can contain __builtin_unreachable(). As a result, a compiler may place an entry into a jump table that contains an address immediately past the last instruction in the function. Sometimes it may coincide with a start of the next function in the binary. Thus when we check for switch tables in such cases we have to check more than a single entry until we see either an address inside containing function or some address outside different from the address past the last instruction. Additionally, don't stop disassembly after discovering that the function was not simple. We need to detect all outside references whenever possible. (cherry picked from commit 3b9b01431de17b1d6b9ae4c626101809a4053d46) --- bolt/BinaryFunction.cpp | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 9217205784f3..7386deacb5d5 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -472,6 +472,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { } const auto RIPRegister = BC.MRI->getProgramCounter(); + auto PtrSize = BC.AsmInfo->getPointerSize(); // Analyze contents of the memory if possible. unsigned BaseRegNum; @@ -485,8 +486,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { return IndirectBranchType::UNKNOWN; if ((BaseRegNum != bolt::NoRegister && BaseRegNum != RIPRegister) || - SegRegNum != bolt::NoRegister || - ScaleValue != BC.AsmInfo->getPointerSize()) + SegRegNum != bolt::NoRegister || ScaleValue != PtrSize) return IndirectBranchType::UNKNOWN; auto ArrayStart = DispValue; @@ -513,20 +513,35 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // Extract the value at the start of the array.
StringRef SectionContents; Section.getContents(SectionContents); - DataExtractor DE(SectionContents, - BC.AsmInfo->isLittleEndian(), - BC.AsmInfo->getPointerSize()); + DataExtractor DE(SectionContents, BC.AsmInfo->isLittleEndian(), PtrSize); auto ValueOffset = static_cast(ArrayStart - Section.getAddress()); - auto Value = DE.getAddress(&ValueOffset); - if (containsAddress(Value) && Value != getAddress()) - return IndirectBranchType::POSSIBLE_SWITCH_TABLE; - + uint64_t Value = 0; + while (ValueOffset <= Section.getSize() - PtrSize) { + DEBUG(dbgs() << "BOLT-DEBUG: indirect jmp at 0x" + << Twine::utohexstr(getAddress() + Offset) + << " is referencing address 0x" + << Twine::utohexstr(Section.getAddress() + ValueOffset)); + // Extract the value and increment the offset. + Value = DE.getAddress(&ValueOffset); + DEBUG(dbgs() << ", which contains value " + << Twine::utohexstr(Value) << '\n'); + if (containsAddress(Value) && Value != getAddress()) { + return IndirectBranchType::POSSIBLE_SWITCH_TABLE; + } + // Potentially a switch table can contain __builtin_unreachable() entry + // pointing just right after the function. In this case we have to check + // another entry. Otherwise the entry is outside of this function scope + // and it's not a switch table. + if (Value != getAddress() + getSize()) { + break; + } + } BC.InterproceduralReferences.insert(Value); return IndirectBranchType::POSSIBLE_TAIL_CALL; }; bool IsSimple = true; - for (uint64_t Offset = 0; IsSimple && (Offset < getSize()); ) { + for (uint64_t Offset = 0; Offset < getSize(); ) { MCInst Instruction; uint64_t Size; uint64_t AbsoluteInstrAddr = getAddress() + Offset; From 7192d4c46f837e290a545f62f952a2260534f21a Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Sun, 11 Sep 2016 14:33:58 -0700 Subject: [PATCH 170/904] Add cluster randomization layout algorithm. Summary: Add "-reorder-blocks=cluster-shuffle" for performance experiments. Use "-bolt-seed=" to set a randomization seed. 
(cherry picked from commit 4fe753b10acdaaa820d17d457fcf014f8ea8c846) --- bolt/BinaryFunction.cpp | 7 ++++- bolt/BinaryFunction.h | 2 ++ bolt/BinaryPasses.cpp | 3 +++ bolt/ReorderAlgorithm.cpp | 55 +++++++++++++++++++++++++++++++++++---- bolt/ReorderAlgorithm.h | 11 +++++++- 5 files changed, 71 insertions(+), 7 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 7386deacb5d5..a44a56b77352 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1691,7 +1691,8 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool MinBranchClusters, if (Type == LT_REVERSE) { Algo.reset(new ReverseReorderAlgorithm()); } - else if (BasicBlocksLayout.size() <= FUNC_SIZE_THRESHOLD) { + else if (BasicBlocksLayout.size() <= FUNC_SIZE_THRESHOLD && + Type != LT_OPTIMIZE_SHUFFLE) { // Work on optimal solution if problem is small enough DEBUG(dbgs() << "finding optimal block layout for " << *this << "\n"); Algo.reset(new OptimalReorderAlgorithm()); @@ -1718,6 +1719,10 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool MinBranchClusters, Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo))); break; + case LT_OPTIMIZE_SHUFFLE: + Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo))); + break; + default: llvm_unreachable("unexpected layout type"); } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 3bed5b106aca..500aac63c4e7 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -173,6 +173,8 @@ class BinaryFunction : public AddressRangesOwner { /// LT_OPTIMIZE_CACHE piggybacks on the idea from Ispike paper (CGO '04) /// that suggests putting frequently executed chains first in the layout. LT_OPTIMIZE_CACHE, + /// Create clusters and use random order for them. 
+ LT_OPTIMIZE_SHUFFLE, }; static constexpr uint64_t COUNT_NO_PROFILE = diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index d67bda059d8d..d7c23ec2d6d0 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -59,6 +59,9 @@ ReorderBlocks( "cache", "perform optimal layout prioritizing I-cache " "behavior"), + clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE_SHUFFLE, + "cluster-shuffle", + "perform random layout of clusters"), clEnumValEnd), cl::ZeroOrMore); diff --git a/bolt/ReorderAlgorithm.cpp b/bolt/ReorderAlgorithm.cpp index 31c00b1b5ee1..02662d5921d0 100644 --- a/bolt/ReorderAlgorithm.cpp +++ b/bolt/ReorderAlgorithm.cpp @@ -29,6 +29,12 @@ namespace opts { static cl::opt PrintClusters("print-clusters", cl::desc("print clusters"), cl::ZeroOrMore); +static cl::opt +RandomSeed("bolt-seed", + cl::desc("seed for randomization"), + cl::init(42), + cl::ZeroOrMore); + } // namespace opts namespace { @@ -617,12 +623,12 @@ void OptimizeCacheReorderAlgorithm::reorderBasicBlocks( for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) if (!Clusters[I].empty()) ClusterOrder.push_back(I); - auto Beg = ClusterOrder.begin(); // Don't reorder the first cluster, which contains the function entry point - ++Beg; - std::stable_sort(Beg, ClusterOrder.end(), [&AvgFreq](uint32_t A, uint32_t B) { - return AvgFreq[A] > AvgFreq[B]; - }); + std::stable_sort(std::next(ClusterOrder.begin()), + ClusterOrder.end(), + [&AvgFreq](uint32_t A, uint32_t B) { + return AvgFreq[A] > AvgFreq[B]; + }); if (opts::PrintClusters) { errs() << "New cluster order: "; @@ -653,3 +659,42 @@ void ReverseReorderAlgorithm::reorderBasicBlocks( } +void RandomClusterReorderAlgorithm::reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const { + if (BF.layout_empty()) + return; + + // Cluster basic blocks. 
+ CAlgo->clusterBasicBlocks(BF); + std::vector &Clusters = CAlgo->Clusters; + + if (opts::PrintClusters) + CAlgo->printClusters(); + + // Cluster layout order + std::vector ClusterOrder; + + // Order clusters based on average instruction execution frequency + for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) + if (!Clusters[I].empty()) + ClusterOrder.push_back(I); + + std::srand(opts::RandomSeed); + std::random_shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end()); + + if (opts::PrintClusters) { + errs() << "New cluster order: "; + auto Sep = ""; + for (auto O : ClusterOrder) { + errs() << Sep << O; + Sep = ", "; + } + errs() << '\n'; + } + + // Arrange basic blocks according to cluster order. + for (uint32_t ClusterIndex : ClusterOrder) { + ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; + Order.insert(Order.end(), Cluster.begin(), Cluster.end()); + } +} diff --git a/bolt/ReorderAlgorithm.h b/bolt/ReorderAlgorithm.h index 1aef053d6cf3..ff190191da0e 100644 --- a/bolt/ReorderAlgorithm.h +++ b/bolt/ReorderAlgorithm.h @@ -63,7 +63,6 @@ class ClusterAlgorithm { virtual ~ClusterAlgorithm() {} }; - /// Base class for a greedy clustering algorithm that selects edges in order /// based on some heuristic and uses them to join basic blocks into clusters. class GreedyClusterAlgorithm : public ClusterAlgorithm { @@ -252,6 +251,16 @@ class ReverseReorderAlgorithm : public ReorderAlgorithm { const BinaryFunction &BF, BasicBlockOrder &Order) const override; }; +/// Create clusters as usual and place them in random order. 
+class RandomClusterReorderAlgorithm : public ReorderAlgorithm { +public: + explicit RandomClusterReorderAlgorithm( + std::unique_ptr CAlgo) : + ReorderAlgorithm(std::move(CAlgo)) { } + + void reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const override; +}; } // namespace bolt } // namespace llvm From c902ee4c41c026c2071ace43bcfbf528a393379c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Sep 2016 17:12:00 -0700 Subject: [PATCH 171/904] BOLT: Clean up interface between BinaryFunction and BinaryBasicBlock. Summary: This is just a bit of refactoring to make sure that BinaryFunction goes through methods to get at the state in BinaryBasicBlock. I did this so that changing the way Index/LayoutIndex/Valid works will be easier. (cherry picked from commit 2c391cadee793a5c162268711f318dc8dcaf5af7) --- bolt/BinaryBasicBlock.cpp | 14 ++- bolt/BinaryBasicBlock.h | 190 +++++++++++++++++++++----------------- bolt/BinaryFunction.cpp | 135 +++++++++++++-------------- bolt/BinaryFunction.h | 12 +-- bolt/BinaryPasses.cpp | 3 +- 5 files changed, 184 insertions(+), 170 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 1930369e1bf0..e43cef2cecbc 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -126,12 +126,20 @@ void BinaryBasicBlock::addLandingPad(BinaryBasicBlock *LPBlock) { LPBlock->Throwers.insert(this); } -bool BinaryBasicBlock::analyzeBranch(const MCInstrAnalysis &MIA, - const MCSymbol *&TBB, +void BinaryBasicBlock::clearLandingPads() { + for (auto *LPBlock : LandingPads) { + auto count = LPBlock->Throwers.erase(this); + assert(count == 1); + } + LandingPads.clear(); +} + +bool BinaryBasicBlock::analyzeBranch(const MCSymbol *&TBB, const MCSymbol *&FBB, MCInst *&CondBranch, MCInst *&UncondBranch) { - return MIA.analyzeBranch(Instructions, TBB, FBB, CondBranch, UncondBranch); + auto &MIA = Function->getBinaryContext().MIA; + return MIA->analyzeBranch(Instructions, TBB, FBB, CondBranch, 
UncondBranch); } bool BinaryBasicBlock::swapConditionalSuccessors() { diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index fba243966f19..3b65f0064a6a 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -25,7 +25,6 @@ #include namespace llvm { -class MCInstrAnalysis; namespace bolt { class BinaryFunction; @@ -41,12 +40,24 @@ class BinaryBasicBlock { }; private: - /// Label associated with the block. - MCSymbol *Label{nullptr}; + /// Vector of all instructions in the block. + std::vector Instructions; + + /// CFG information. + std::vector Predecessors; + std::vector Successors; + std::set Throwers; + std::set LandingPads; + + /// Each successor has a corresponding BranchInfo entry in the list. + std::vector BranchInfo; /// Function that owns this basic block. BinaryFunction *Function; + /// Label associated with the block. + MCSymbol *Label{nullptr}; + /// Label associated with the end of the block in the output binary. const MCSymbol *EndLabel{nullptr}; @@ -59,6 +70,9 @@ class BinaryBasicBlock { /// Alignment requirements for the block. uint64_t Alignment{1}; + /// Number of times this basic block was executed. + uint64_t ExecutionCount{COUNT_NO_PROFILE}; + /// Index to BasicBlocks vector in BinaryFunction. unsigned Index{~0u}; @@ -68,9 +82,6 @@ class BinaryBasicBlock { /// Number of pseudo instructions in this block. uint32_t NumPseudos{0}; - /// Number of times this basic block was executed. - uint64_t ExecutionCount{COUNT_NO_PROFILE}; - /// In cases where the parent function has been split, IsCold == true means /// this BB will be allocated outside its parent function. bool IsCold{false}; @@ -78,25 +89,14 @@ class BinaryBasicBlock { /// Indicates if the block could be outlined. bool CanOutline{true}; - /// Vector of all instructions in the block. - std::vector Instructions; - - /// CFG information. 
- std::vector Predecessors; - std::vector Successors; - std::set Throwers; - std::set LandingPads; - - /// Each successor has a corresponding BranchInfo entry in the list. - std::vector BranchInfo; - - BinaryBasicBlock() {} +private: + BinaryBasicBlock() = delete; explicit BinaryBasicBlock( - MCSymbol *Label, BinaryFunction *Function, + MCSymbol *Label, uint64_t Offset = std::numeric_limits::max()) - : Label(Label), Function(Function), Offset(Offset) {} + : Function(Function), Label(Label), Offset(Offset) {} explicit BinaryBasicBlock(uint64_t Offset) : Offset(Offset) {} @@ -113,10 +113,10 @@ class BinaryBasicBlock { std::numeric_limits::max(); // Instructions iterators. - typedef std::vector::iterator iterator; - typedef std::vector::const_iterator const_iterator; - typedef std::reverse_iterator const_reverse_iterator; - typedef std::reverse_iterator reverse_iterator; + using iterator = std::vector::iterator; + using const_iterator = std::vector::const_iterator; + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; bool empty() const { return Instructions.empty(); } unsigned size() const { return (unsigned)Instructions.size(); } @@ -135,30 +135,19 @@ class BinaryBasicBlock { const_reverse_iterator rend () const { return Instructions.rend(); } // CFG iterators. 
- typedef std::vector::iterator pred_iterator; - typedef std::vector::const_iterator const_pred_iterator; - typedef std::vector::iterator succ_iterator; - typedef std::vector::const_iterator const_succ_iterator; - typedef std::set::iterator throw_iterator; - typedef std::set::const_iterator const_throw_iterator; - typedef std::set::iterator lp_iterator; - typedef std::set::const_iterator const_lp_iterator; - typedef std::vector::reverse_iterator - pred_reverse_iterator; - typedef std::vector::const_reverse_iterator - const_pred_reverse_iterator; - typedef std::vector::reverse_iterator - succ_reverse_iterator; - typedef std::vector::const_reverse_iterator - const_succ_reverse_iterator; - typedef std::set::reverse_iterator - throw_reverse_iterator; - typedef std::set::const_reverse_iterator - const_throw_reverse_iterator; - typedef std::set::reverse_iterator - lp_reverse_iterator; - typedef std::set::const_reverse_iterator - const_lp_reverse_iterator; + using pred_iterator = std::vector::iterator; + using const_pred_iterator = std::vector::const_iterator; + using succ_iterator = std::vector::iterator; + using const_succ_iterator = std::vector::const_iterator; + using throw_iterator = decltype(Throwers)::iterator; + using const_throw_iterator = decltype(Throwers)::const_iterator; + using lp_iterator = decltype(LandingPads)::iterator; + using const_lp_iterator = decltype(LandingPads)::const_iterator; + + using pred_reverse_iterator = std::reverse_iterator; + using const_pred_reverse_iterator = std::reverse_iterator; + using succ_reverse_iterator = std::reverse_iterator; + using const_succ_reverse_iterator = std::reverse_iterator; pred_iterator pred_begin() { return Predecessors.begin(); } const_pred_iterator pred_begin() const { return Predecessors.begin(); } @@ -198,14 +187,6 @@ class BinaryBasicBlock { const_throw_iterator throw_begin() const { return Throwers.begin(); } throw_iterator throw_end() { return Throwers.end(); } const_throw_iterator throw_end() const { 
return Throwers.end(); } - throw_reverse_iterator throw_rbegin() - { return Throwers.rbegin();} - const_throw_reverse_iterator throw_rbegin() const - { return Throwers.rbegin();} - throw_reverse_iterator throw_rend() - { return Throwers.rend(); } - const_throw_reverse_iterator throw_rend() const - { return Throwers.rend(); } unsigned throw_size() const { return (unsigned)Throwers.size(); } @@ -216,19 +197,17 @@ class BinaryBasicBlock { const_lp_iterator lp_begin() const { return LandingPads.begin(); } lp_iterator lp_end() { return LandingPads.end(); } const_lp_iterator lp_end() const { return LandingPads.end(); } - lp_reverse_iterator lp_rbegin() - { return LandingPads.rbegin(); } - const_lp_reverse_iterator lp_rbegin() const - { return LandingPads.rbegin(); } - lp_reverse_iterator lp_rend() - { return LandingPads.rend(); } - const_lp_reverse_iterator lp_rend() const - { return LandingPads.rend(); } unsigned lp_size() const { return (unsigned)LandingPads.size(); } bool lp_empty() const { return LandingPads.empty(); } + inline iterator_range instructions() { + return iterator_range(begin(), end()); + } + inline iterator_range instructions() const { + return iterator_range(begin(), end()); + } inline iterator_range predecessors() { return iterator_range(pred_begin(), pred_end()); } @@ -255,22 +234,40 @@ class BinaryBasicBlock { } // BranchInfo iterators. 
- typedef std::vector::const_iterator - const_branch_info_iterator; - - const_branch_info_iterator branch_info_begin() const - { return BranchInfo.begin(); } - const_branch_info_iterator branch_info_end() const - { return BranchInfo.end(); } + using branch_info_iterator = std::vector::iterator; + using const_branch_info_iterator = + std::vector::const_iterator; + using branch_info_reverse_iterator = + std::reverse_iterator; + using const_branch_info_reverse_iterator = + std::reverse_iterator; + + branch_info_iterator branch_info_begin() { return BranchInfo.begin(); } + branch_info_iterator branch_info_end() { return BranchInfo.end(); } + const_branch_info_iterator branch_info_begin() const + { return BranchInfo.begin(); } + const_branch_info_iterator branch_info_end() const + { return BranchInfo.end(); } + branch_info_reverse_iterator branch_info_rbegin() + { return BranchInfo.rbegin(); } + branch_info_reverse_iterator branch_info_rend() + { return BranchInfo.rend(); } + const_branch_info_reverse_iterator branch_info_rbegin() const + { return BranchInfo.rbegin(); } + const_branch_info_reverse_iterator branch_info_rend() const + { return BranchInfo.rend(); } unsigned branch_info_size() const { return (unsigned)BranchInfo.size(); } - bool branch_info_empty() const - { return BranchInfo.empty(); } + bool branch_info_empty() const { return BranchInfo.empty(); } + inline iterator_range branch_info() { + return iterator_range( + BranchInfo.begin(), BranchInfo.end()); + } inline iterator_range branch_info() const { return iterator_range( - branch_info_begin(), branch_info_end()); + BranchInfo.begin(), BranchInfo.end()); } /// Get instruction at given index. @@ -483,6 +480,10 @@ class BinaryBasicBlock { return IsCold; } + void setIsCold(const bool Flag) { + IsCold = Flag; + } + /// Return true if the block can be outlined. At the moment we disallow /// outlining of blocks that can potentially throw exceptions or are /// the beginning of a landing pad. 
The entry basic block also can @@ -491,6 +492,10 @@ class BinaryBasicBlock { return CanOutline; } + void setCanOutline(const bool Flag) { + CanOutline = Flag; + } + /// Erase pseudo instruction at a given iterator. iterator erasePseudoInstruction(iterator II) { --NumPseudos; @@ -564,8 +569,7 @@ class BinaryBasicBlock { /// Analyze and interpret the terminators of this basic block. TBB must be /// initialized with the original fall-through for this BB. - bool analyzeBranch(const MCInstrAnalysis &MIA, - const MCSymbol *&TBB, + bool analyzeBranch(const MCSymbol *&TBB, const MCSymbol *&FBB, MCInst *&CondBranch, MCInst *&UncondBranch); @@ -587,14 +591,27 @@ class BinaryBasicBlock { void addPredecessor(BinaryBasicBlock *Pred); /// Remove predecessor of the basic block. Don't use directly, instead - /// use removeSuccessor() funciton. + /// use removeSuccessor() function. void removePredecessor(BinaryBasicBlock *Pred); + /// Remove landing pads of this basic block. + void clearLandingPads(); + /// Set offset of the basic block from the function start. void setOffset(uint64_t NewOffset) { Offset = NewOffset; } + /// Get the index of this basic block. + unsigned getIndex() const { + return Index; + } + + /// Set the index of this basic block. + void setIndex(unsigned I) { + Index = I; + } + /// Set layout index. To be used by BinaryFunction. 
void setLayoutIndex(unsigned Index) { LayoutIndex = Index; @@ -609,8 +626,8 @@ bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS); // GraphTraits specializations for basic block graphs (CFGs) template <> struct GraphTraits { - typedef bolt::BinaryBasicBlock NodeType; - typedef bolt::BinaryBasicBlock::succ_iterator ChildIteratorType; + using NodeType = bolt::BinaryBasicBlock; + using ChildIteratorType = bolt::BinaryBasicBlock::succ_iterator; static NodeType *getEntryNode(bolt::BinaryBasicBlock *BB) { return BB; } static inline ChildIteratorType child_begin(NodeType *N) { @@ -622,8 +639,8 @@ template <> struct GraphTraits { }; template <> struct GraphTraits { - typedef const bolt::BinaryBasicBlock NodeType; - typedef bolt::BinaryBasicBlock::const_succ_iterator ChildIteratorType; + using NodeType = const bolt::BinaryBasicBlock; + using ChildIteratorType = bolt::BinaryBasicBlock::const_succ_iterator; static NodeType *getEntryNode(const bolt::BinaryBasicBlock *BB) { return BB; @@ -637,8 +654,8 @@ template <> struct GraphTraits { }; template <> struct GraphTraits> { - typedef bolt::BinaryBasicBlock NodeType; - typedef bolt::BinaryBasicBlock::pred_iterator ChildIteratorType; + using NodeType = bolt::BinaryBasicBlock; + using ChildIteratorType = bolt::BinaryBasicBlock::pred_iterator; static NodeType *getEntryNode(Inverse G) { return G.Graph; } @@ -651,8 +668,8 @@ template <> struct GraphTraits> { }; template <> struct GraphTraits> { - typedef const bolt::BinaryBasicBlock NodeType; - typedef bolt::BinaryBasicBlock::const_pred_iterator ChildIteratorType; + using NodeType = const bolt::BinaryBasicBlock; + using ChildIteratorType = bolt::BinaryBasicBlock::const_pred_iterator; static NodeType *getEntryNode(Inverse G) { return G.Graph; } @@ -664,7 +681,6 @@ template <> struct GraphTraits> { } }; - } // namespace llvm #endif diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index a44a56b77352..1bbd851cddd4 100644 --- a/bolt/BinaryFunction.cpp 
+++ b/bolt/BinaryFunction.cpp @@ -240,14 +240,14 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, for (uint32_t I = 0, E = BasicBlocksLayout.size(); I != E; ++I) { auto BB = BasicBlocksLayout[I]; if (I != 0 && - BB->IsCold != BasicBlocksLayout[I - 1]->IsCold) + BB->isCold() != BasicBlocksLayout[I - 1]->isCold()) OS << "------- HOT-COLD SPLIT POINT -------\n\n"; OS << BB->getName() << " (" - << BB->Instructions.size() << " instructions, align : " + << BB->size() << " instructions, align : " << BB->getAlignment() << ")\n"; - if (LandingPads.find(BB->getLabel()) != LandingPads.end()) { + if (BB->isLandingPad()) { OS << " Landing Pad\n"; } @@ -258,19 +258,19 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, if (!BBCFIState.empty()) { OS << " CFI State : " << BBCFIState[getIndex(BB)] << '\n'; } - if (!BB->Predecessors.empty()) { + if (!BB->pred_empty()) { OS << " Predecessors: "; auto Sep = ""; - for (auto Pred : BB->Predecessors) { + for (auto Pred : BB->predecessors()) { OS << Sep << Pred->getName(); Sep = ", "; } OS << '\n'; } - if (!BB->Throwers.empty()) { + if (!BB->throw_empty()) { OS << " Throwers: "; auto Sep = ""; - for (auto Throw : BB->Throwers) { + for (auto Throw : BB->throwers()) { OS << Sep << Throw->getName(); Sep = ", "; } @@ -282,12 +282,12 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, // Note: offsets are imprecise since this is happening prior to relaxation. 
Offset = BC.printInstructions(OS, BB->begin(), BB->end(), Offset, this); - if (!BB->Successors.empty()) { + if (!BB->succ_empty()) { OS << " Successors: "; - auto BI = BB->BranchInfo.begin(); + auto BI = BB->branch_info_begin(); auto Sep = ""; - for (auto Succ : BB->Successors) { - assert(BI != BB->BranchInfo.end() && "missing BranchInfo entry"); + for (auto Succ : BB->successors()) { + assert(BI != BB->branch_info_end() && "missing BranchInfo entry"); OS << Sep << Succ->getName(); if (ExecutionCount != COUNT_NO_PROFILE && BI->MispredictedCount != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) { @@ -303,13 +303,13 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << '\n'; } - if (!BB->LandingPads.empty()) { + if (!BB->lp_empty()) { OS << " Landing Pads: "; auto Sep = ""; - for (auto LP : BB->LandingPads) { + for (auto LP : BB->landing_pads()) { OS << Sep << LP->getName(); if (ExecutionCount != COUNT_NO_PROFILE) { - OS << " (count: " << LP->ExecutionCount << ")"; + OS << " (count: " << LP->getExecutionCount() << ")"; } Sep = ", "; } @@ -806,12 +806,7 @@ void BinaryFunction::clearLandingPads(const unsigned StartIndex, const unsigned NumBlocks) { // remove all landing pads/throws for the given collection of blocks for (auto I = StartIndex; I < StartIndex + NumBlocks; ++I) { - auto *BB = BasicBlocks[I]; - for (auto *LPBlock : BB->LandingPads) { - auto count = LPBlock->Throwers.erase(BB); - assert(count == 1); - } - BB->LandingPads.clear(); + BasicBlocks[I]->clearLandingPads(); } } @@ -839,14 +834,14 @@ void BinaryFunction::recomputeLandingPads(const unsigned StartIndex, for (auto I = StartIndex; I < StartIndex + NumBlocks; ++I) { auto *BB = BasicBlocks[I]; - for (auto &Instr : BB->Instructions) { + for (auto &Instr : BB->instructions()) { // Store info about associated landing pad. 
if (BC.MIA->isInvoke(Instr)) { const MCSymbol *LP; uint64_t Action; std::tie(LP, Action) = BC.MIA->getEHInfo(Instr); if (LP) { - LPToBBIndex[LP].push_back(BB->Index); + LPToBBIndex[LP].push_back(getIndex(BB)); } } } @@ -1288,14 +1283,14 @@ void BinaryFunction::inferFallThroughCounts() { // Compute preliminary execution time for each basic block for (auto CurBB : BasicBlocks) { if (CurBB == *BasicBlocks.begin()) { - CurBB->ExecutionCount = ExecutionCount; + CurBB->setExecutionCount(ExecutionCount); continue; } CurBB->ExecutionCount = 0; } for (auto CurBB : BasicBlocks) { - auto SuccCount = CurBB->BranchInfo.begin(); + auto SuccCount = CurBB->branch_info_begin(); for (auto Succ : CurBB->successors()) { // Do not update execution count of the entry block (when we have tail // calls). We already accounted for those when computing the func count. @@ -1304,7 +1299,7 @@ void BinaryFunction::inferFallThroughCounts() { continue; } if (SuccCount->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) - Succ->ExecutionCount += SuccCount->Count; + Succ->setExecutionCount(Succ->getExecutionCount() + SuccCount->Count); ++SuccCount; } } @@ -1315,7 +1310,7 @@ void BinaryFunction::inferFallThroughCounts() { for (const auto &I : BranchData.EntryData) { BinaryBasicBlock *BB = getBasicBlockAtOffset(I.To.Offset); if (BB && LandingPads.find(BB->getLabel()) != LandingPads.end()) { - BB->ExecutionCount += I.Branches; + BB->setExecutionCount(BB->getExecutionCount() + I.Branches); } } } @@ -1333,7 +1328,7 @@ void BinaryFunction::inferFallThroughCounts() { // Calculate frequency of outgoing branches from this node according to // LBR data uint64_t ReportedBranches = 0; - for (auto &SuccCount : CurBB->BranchInfo) { + for (auto &SuccCount : CurBB->branch_info()) { if (SuccCount.Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) ReportedBranches += SuccCount.Count; } @@ -1351,8 +1346,8 @@ void BinaryFunction::inferFallThroughCounts() { // for a landing pad to be associated with more than one basic 
blocks, // we may overestimate the frequency of throws for such blocks. uint64_t ReportedThrows = 0; - for (BinaryBasicBlock *LP: CurBB->LandingPads) { - ReportedThrows += LP->ExecutionCount; + for (BinaryBasicBlock *LP: CurBB->landing_pads()) { + ReportedThrows += LP->getExecutionCount(); } uint64_t TotalReportedJumps = @@ -1375,11 +1370,11 @@ void BinaryFunction::inferFallThroughCounts() { }); // If there is a FT, the last successor will be it. - auto &SuccCount = CurBB->BranchInfo.back(); - auto &Succ = CurBB->Successors.back(); + auto &SuccCount = *CurBB->branch_info_rbegin(); + auto &Succ = *CurBB->succ_rbegin(); if (SuccCount.Count == BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) { SuccCount.Count = Inferred; - Succ->ExecutionCount += Inferred; + Succ->setExecutionCount(Succ->getExecutionCount() + Inferred); } } // end for (CurBB : BasicBlocks) @@ -1448,7 +1443,7 @@ void BinaryFunction::removeConditionalTailCalls() { // instruction and place it at the end of the function. const BinaryBasicBlock *LastBB = BasicBlocks.back(); uint64_t NewBlockOffset = - LastBB->Offset + BC.computeCodeSize(LastBB->begin(), LastBB->end()); + LastBB->getOffset() + BC.computeCodeSize(LastBB->begin(), LastBB->end()); TailCallBB = addBasicBlock(NewBlockOffset, TCLabel); TailCallBB->addInstruction(TailCallInst); @@ -1471,7 +1466,7 @@ void BinaryFunction::removeConditionalTailCalls() { // Add execution count for the block. if (hasValidProfile()) - TailCallBB->ExecutionCount = TCInfo.Count; + TailCallBB->setExecutionCount(TCInfo.Count); } } @@ -1599,7 +1594,7 @@ bool BinaryFunction::fixCFIState() { // Hot-cold border: check if this is the first BB to be allocated in a cold // region (a different FDE). If yes, we need to reset the CFI state and // the FDEStartBB that is used to insert remember_state CFIs (t12863876). 
- if (I != 0 && BB->IsCold != BasicBlocksLayout[I - 1]->IsCold) { + if (I != 0 && BB->isCold() != BasicBlocksLayout[I - 1]->isCold()) { State = 0; FDEStartBB = BB; } @@ -1794,7 +1789,7 @@ void BinaryFunction::dumpGraph(raw_ostream& OS) const { BB->getName().data(), BB->getName().data(), BB->getOffset(), - BB->Index, + getIndex(BB), Layout); OS << format("\"%s\" [shape=box]\n", BB->getName().data()); if (opts::DotToolTipCode) { @@ -1811,13 +1806,12 @@ void BinaryFunction::dumpGraph(raw_ostream& OS) const { const MCSymbol *FBB = nullptr; MCInst *CondBranch = nullptr; MCInst *UncondBranch = nullptr; - const bool Success = BC.MIA->analyzeBranch(BB->Instructions, - TBB, - FBB, - CondBranch, - UncondBranch); + const bool Success = BB->analyzeBranch(TBB, + FBB, + CondBranch, + UncondBranch); - unsigned Idx = 0; + auto BI = BB->branch_info_begin(); for (auto *Succ : BB->successors()) { std::string Branch; if (Success) { @@ -1834,19 +1828,18 @@ void BinaryFunction::dumpGraph(raw_ostream& OS) const { Succ->getName().data(), Branch.c_str()); - const auto &BI = BB->BranchInfo[Idx]; - if (BB->ExecutionCount != COUNT_NO_PROFILE && - BI.MispredictedCount != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) { - OS << "\\n(M:" << BI.MispredictedCount << ",C:" << BI.Count << ")"; + if (BB->getExecutionCount() != COUNT_NO_PROFILE && + BI->MispredictedCount != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) { + OS << "\\n(M:" << BI->MispredictedCount << ",C:" << BI->Count << ")"; } else if (ExecutionCount != COUNT_NO_PROFILE && - BI.Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) { - OS << "\\n(IC:" << BI.Count << ")"; + BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) { + OS << "\\n(IC:" << BI->Count << ")"; } OS << "\"]\n"; - ++Idx; + ++BI; } - for (auto *LP : BB->LandingPads) { + for (auto *LP : BB->landing_pads()) { OS << format("\"%s\" -> \"%s\" [constraint=false style=dashed]\n", BB->getName().data(), LP->getName().data()); @@ -1901,8 +1894,7 @@ void BinaryFunction::fixBranches() { 
const MCSymbol *FBB = nullptr; MCInst *CondBranch = nullptr; MCInst *UncondBranch = nullptr; - if (!MIA->analyzeBranch(BB->Instructions, TBB, FBB, CondBranch, - UncondBranch)) + if (!BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) continue; // We will create unconditional branch with correct destination if needed. @@ -1911,7 +1903,7 @@ void BinaryFunction::fixBranches() { // Basic block that follows the current one in the final layout. const BinaryBasicBlock *NextBB = nullptr; - if (I + 1 != E && BB->IsCold == BasicBlocksLayout[I + 1]->IsCold) + if (I + 1 != E && BB->isCold() == BasicBlocksLayout[I + 1]->isCold()) NextBB = BasicBlocksLayout[I + 1]; if (BB->succ_size() == 1) { @@ -1961,19 +1953,19 @@ void BinaryFunction::splitFunction() { assert(BasicBlocksLayout.size() > 0); // Never outline the first basic block. - BasicBlocks.front()->CanOutline = false; + BasicBlocks.front()->setCanOutline(false); for (auto BB : BasicBlocks) { - if (!BB->CanOutline) + if (!BB->canOutline()) continue; if (BB->getExecutionCount() != 0) { - BB->CanOutline = false; + BB->setCanOutline(false); continue; } if (hasEHRanges()) { // We cannot move landing pads (or rather entry points for landing // pads). - if (LandingPads.find(BB->getLabel()) != LandingPads.end()) { - BB->CanOutline = false; + if (BB->isLandingPad()) { + BB->setCanOutline(false); continue; } // We cannot move a block that can throw since exception-handling @@ -1982,7 +1974,7 @@ void BinaryFunction::splitFunction() { // decrease the size of the function. for (auto &Instr : *BB) { if (BC.MIA->isInvoke(Instr)) { - BB->CanOutline = false; + BB->setCanOutline(false); break; } } @@ -2000,7 +1992,7 @@ void BinaryFunction::splitFunction() { // We cannot move beginning of landing pads, but we can move 0-count blocks // comprising landing pads to the end and thus facilitating splitting. 
auto FirstLP = BasicBlocksLayout.begin(); - while (LandingPads.find((*FirstLP)->getLabel()) != LandingPads.end()) + while ((*FirstLP)->isLandingPad()) ++FirstLP; std::stable_sort(FirstLP, BasicBlocksLayout.end(), @@ -2015,7 +2007,7 @@ void BinaryFunction::splitFunction() { BinaryBasicBlock *BB = *I; if (!BB->canOutline()) break; - BB->IsCold = true; + BB->setIsCold(true); IsSplit = true; } } @@ -2085,13 +2077,13 @@ void BinaryFunction::mergeProfileDataInto(BinaryFunction &BF) const { OldExecCount == BinaryBasicBlock::COUNT_NO_PROFILE ? MyBBExecutionCount : MyBBExecutionCount + OldExecCount; - BBMerge->ExecutionCount = NewExecCount; + BBMerge->setExecutionCount(NewExecCount); } // Update BF's edge count for successors of this basic block. auto BBMergeSI = BBMerge->succ_begin(); - auto BII = BB->BranchInfo.begin(); - auto BIMergeI = BBMerge->BranchInfo.begin(); + auto BII = BB->branch_info_begin(); + auto BIMergeI = BBMerge->branch_info_begin(); for (BinaryBasicBlock *BBSucc : BB->successors()) { BinaryBasicBlock *BBMergeSucc = *BBMergeSI; assert(getIndex(BBSucc) == BF.getIndex(BBMergeSucc)); @@ -2449,7 +2441,7 @@ void BinaryFunction::insertBasicBlocks( auto *BB = BasicBlocks[I]; BB->setOffset(Offset); Offset += BC.computeCodeSize(BB->begin(), BB->end()); - BB->Index = I; + BB->setIndex(I); } if (UpdateCFIState) { @@ -2470,8 +2462,8 @@ void BinaryFunction::updateLayout(BinaryBasicBlock* Start, // Insert new blocks in the layout immediately after Start. 
auto Pos = std::find(layout_begin(), layout_end(), Start); assert(Pos != layout_end()); - auto Begin = &BasicBlocks[Start->Index + 1]; - auto End = &BasicBlocks[Start->Index + NumNewBlocks + 1]; + auto Begin = &BasicBlocks[getIndex(Start) + 1]; + auto End = &BasicBlocks[getIndex(Start) + NumNewBlocks + 1]; BasicBlocksLayout.insert(Pos + 1, Begin, End); } @@ -2527,7 +2519,7 @@ void BinaryFunction::calculateLoopInfo() { L->getLoopLatches(Latches); for (BinaryBasicBlock *Latch : Latches) { - auto BI = Latch->BranchInfo.begin(); + auto BI = Latch->branch_info_begin(); for (BinaryBasicBlock *Succ : Latch->successors()) { if (Succ == L->getHeader()) { assert(BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE && @@ -2547,7 +2539,7 @@ void BinaryFunction::calculateLoopInfo() { for (BinaryLoop::Edge &Exit : ExitEdges) { const BinaryBasicBlock *Exiting = Exit.first; const BinaryBasicBlock *ExitTarget = Exit.second; - auto BI = Exiting->BranchInfo.begin(); + auto BI = Exiting->branch_info_begin(); for (BinaryBasicBlock *Succ : Exiting->successors()) { if (Succ == ExitTarget) { assert(BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE && @@ -2627,7 +2619,7 @@ DynoStats BinaryFunction::getDynoStats() const { // basic block especially since the block may contain a function that // does not return or a function that throws an exception. 
uint64_t BBExecutionCount = 0; - for (const auto &BI : BB->BranchInfo) + for (const auto &BI : BB->branch_info()) if (BI.Count != BinaryBasicBlock::COUNT_NO_PROFILE) BBExecutionCount += BI.Count; @@ -2652,8 +2644,7 @@ DynoStats BinaryFunction::getDynoStats() const { const MCSymbol *FBB = nullptr; MCInst *CondBranch = nullptr; MCInst *UncondBranch = nullptr; - if (!BC.MIA->analyzeBranch(BB->Instructions, TBB, FBB, CondBranch, - UncondBranch)) { + if (!BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) { continue; } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 500aac63c4e7..10138bae4ede 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -584,8 +584,8 @@ class BinaryFunction : public AddressRangesOwner { /// Get basic block index assuming it belongs to this function. unsigned getIndex(const BinaryBasicBlock *BB) const { - assert(BB->Index < BasicBlocks.size()); - return BB->Index; + assert(BB->getIndex() < BasicBlocks.size()); + return BB->getIndex(); } /// Returns the n-th basic block in this function in its original layout, or @@ -764,7 +764,7 @@ class BinaryFunction : public AddressRangesOwner { Label = BC.Ctx->createTempSymbol("BB", true); } auto BB = std::unique_ptr( - new BinaryBasicBlock(Label, this, Offset)); + new BinaryBasicBlock(this, Label, Offset)); if (DeriveAlignment) { uint64_t DerivedAlignment = Offset & (1 + ~Offset); @@ -788,7 +788,7 @@ class BinaryFunction : public AddressRangesOwner { BasicBlocks.emplace_back(BBPtr.release()); auto BB = BasicBlocks.back(); - BB->Index = BasicBlocks.size() - 1; + BB->setIndex(BasicBlocks.size() - 1); assert(CurrentState == State::CFG || std::is_sorted(begin(), end())); @@ -802,7 +802,7 @@ class BinaryFunction : public AddressRangesOwner { /// Return basic block that started at offset \p Offset. 
BinaryBasicBlock *getBasicBlockAtOffset(uint64_t Offset) { BinaryBasicBlock *BB = getBasicBlockContainingOffset(Offset); - if (BB && BB->Offset == Offset) + if (BB && BB->getOffset() == Offset) return BB; return nullptr; @@ -1192,7 +1192,7 @@ class BinaryFunction : public AddressRangesOwner { size_t estimateHotSize() const { size_t Estimate = 0; for (const auto *BB : BasicBlocksLayout) { - if (BB->ExecutionCount != 0) { + if (BB->getExecutionCount() != 0) { Estimate += BC.computeCodeSize(BB->begin(), BB->end()); } } diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index d7c23ec2d6d0..f2fa6c8d869e 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -900,8 +900,7 @@ bool SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, const MCSymbol *FBB = nullptr; MCInst *CondBranch = nullptr; MCInst *UncondBranch = nullptr; - auto Result = - PredBB->analyzeBranch(*MIA, TBB, FBB, CondBranch, UncondBranch); + auto Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); assert(Result && "internal error analyzing conditional branch"); assert(CondBranch && "conditional branch expected"); From 5a2071866abe52b1838eac2908b8a4e9eb016d54 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 14 Sep 2016 16:45:40 -0700 Subject: [PATCH 172/904] Add experimental jump table support. Summary: Option "-jump-tables=1" enables experimental support for jump tables. The option hasn't been tested with optimizations other than block re-ordering. Only non-PIC jump tables are supported at the moment. 
(cherry picked from commit fcd194ad62efe520a67b50f955e0ca1e203b7453) --- bolt/BinaryContext.cpp | 6 ++ bolt/BinaryFunction.cpp | 149 ++++++++++++++++++++++++++------------ bolt/BinaryFunction.h | 20 +++-- bolt/BinaryPasses.cpp | 16 ++-- bolt/ReorderAlgorithm.cpp | 4 +- bolt/RewriteInstance.cpp | 12 ++- 6 files changed, 141 insertions(+), 66 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 60e5424209ec..f256ad9b1f20 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -313,6 +313,12 @@ void BinaryContext::printInstruction(raw_ostream &OS, OS << "; GNU_args_size = " << GnuArgsSize; } } + if (MIA->isIndirectBranch(Instruction)) { + auto JTIndex = MIA->getJumpTableIndex(Instruction); + if (JTIndex != -1LL) { + OS << " # JUMPTABLE " << JTIndex; + } + } const DWARFDebugLine::LineTable *LineTable = Function && opts::PrintDebugInfo ? Function->getDWARFUnitLineTable().second diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 1bbd851cddd4..45d8dd4e5a3c 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -41,6 +42,11 @@ namespace opts { extern cl::opt Verbosity; extern cl::opt PrintDynoStats; +static cl::opt +JumpTables("jump-tables", + cl::desc("enable jump table support (experimental)"), + cl::ZeroOrMore); + static cl::opt AgressiveSplitting("split-all-cold", cl::desc("outline as many cold basic blocks as possible"), @@ -333,6 +339,16 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << '\n'; } + for(unsigned Index = 0; Index < JumpTables.size(); ++Index) { + const auto &JumpTable = JumpTables[Index]; + OS << "Jump Table #" << (Index + 1) << '\n'; + for (unsigned EIndex = 0; EIndex < JumpTable.Entries.size(); ++EIndex) { + 
const auto *Entry = JumpTable.Entries[EIndex]; + OS << " entry " << EIndex << ": " << Entry->getName() << '\n'; + } + OS << '\n'; + } + OS << "DWARF CFI Instructions:\n"; if (OffsetToCFI.size()) { // Pre-buildCFG information @@ -389,8 +405,8 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { [&](MCInst &Instruction, uint64_t Address, uint64_t Size) { uint64_t TargetAddress{0}; MCSymbol *TargetSymbol{nullptr}; - if (!BC.MIA->evaluateRIPOperandTarget(Instruction, Address, Size, - TargetAddress)) { + if (!BC.MIA->evaluateMemOperandTarget(Instruction, TargetAddress, Address, + Size)) { DEBUG(dbgs() << "BOLT: rip-relative operand can't be evaluated:\n"; BC.InstPrinter->printInst(&Instruction, dbgs(), "", *BC.STI); dbgs() << '\n'; @@ -418,7 +434,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { } if (!TargetSymbol) TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, "DATAat"); - BC.MIA->replaceRIPOperandDisp( + BC.MIA->replaceMemOperandDisp( Instruction, MCOperand::createExpr(MCSymbolRefExpr::create( TargetSymbol, MCSymbolRefExpr::VK_None, *BC.Ctx))); return true; @@ -427,7 +443,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { enum class IndirectBranchType : char { UNKNOWN = 0, /// Unable to determine type. POSSIBLE_TAIL_CALL, /// Possibly a tail call. - POSSIBLE_SWITCH_TABLE, /// Possibly a switch/jump table + POSSIBLE_JUMP_TABLE, /// Possibly a switch/jump table POSSIBLE_GOTO /// Possibly a gcc's computed goto. }; @@ -441,7 +457,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // // We are interested in the cases where Scale == sizeof(uintptr_t) and // the contents of the memory are presumably a function array. - const auto *MemLocInstr = &Instruction; + auto *MemLocInstr = &Instruction; if (Instruction.getNumOperands() == 1) { // If the indirect jump is on register - try to detect if the // register value is loaded from a memory location. 
@@ -452,7 +468,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // in postProcessIndirectBranches(). for (auto PrevII = Instructions.rbegin(); PrevII != Instructions.rend(); ++PrevII) { - const auto &PrevInstr = PrevII->second; + auto &PrevInstr = PrevII->second; const auto &PrevInstrDesc = BC.MII->get(PrevInstr.getOpcode()); if (!PrevInstrDesc.hasDefOfPhysReg(PrevInstr, JmpRegNum, *BC.MRI)) continue; @@ -516,6 +532,8 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { DataExtractor DE(SectionContents, BC.AsmInfo->isLittleEndian(), PtrSize); auto ValueOffset = static_cast(ArrayStart - Section.getAddress()); uint64_t Value = 0; + auto Result = IndirectBranchType::UNKNOWN; + std::vector JTLabelCandidates; while (ValueOffset <= Section.getSize() - PtrSize) { DEBUG(dbgs() << "BOLT-DEBUG: indirect jmp at 0x" << Twine::utohexstr(getAddress() + Offset) @@ -526,7 +544,12 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { DEBUG(dbgs() << ", which contains value " << Twine::utohexstr(Value) << '\n'); if (containsAddress(Value) && Value != getAddress()) { - return IndirectBranchType::POSSIBLE_SWITCH_TABLE; + // Is it possible to have a jump table with function start as an entry? + auto *JTEntry = getOrCreateLocalLabel(Value); + JTLabelCandidates.push_back(JTEntry); + TakenBranches.emplace_back(Offset, Value - getAddress()); + Result = IndirectBranchType::POSSIBLE_JUMP_TABLE; + continue; } // Potentially a switch table can contain __builtin_unreachable() entry // pointing just right after the function. 
In this case we have to check @@ -535,6 +558,21 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { if (Value != getAddress() + getSize()) { break; } + JTLabelCandidates.push_back(getFunctionEndLabel()); + } + if (Result == IndirectBranchType::POSSIBLE_JUMP_TABLE) { + assert(JTLabelCandidates.size() > 2 && + "expected more than 2 jump table entries"); + auto *JTStartLabel = BC.Ctx->createTempSymbol("JUMP_TABLE", true); + JumpTables.emplace_back(JumpTable{JTStartLabel, + std::move(JTLabelCandidates)}); + BC.MIA->replaceMemOperandDisp(*MemLocInstr, JTStartLabel, BC.Ctx.get()); + BC.MIA->setJumpTableIndex(Instruction, JumpTables.size()); + DEBUG(dbgs() << "BOLT-DEBUG: creating jump table " + << JTStartLabel->getName() + << " in function " << *this << " with " + << JTLabelCandidates.size() << " entries.\n"); + return Result; } BC.InterproceduralReferences.insert(Value); return IndirectBranchType::POSSIBLE_TAIL_CALL; @@ -654,11 +692,11 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { *Ctx))); if (!IsCall) { // Add taken branch info. - TakenBranches.push_back({Offset, TargetAddress - getAddress()}); + TakenBranches.emplace_back(Offset, TargetAddress - getAddress()); } if (IsCondBranch) { // Add fallthrough branch info. - FTBranches.push_back({Offset, Offset + Size}); + FTBranches.emplace_back(Offset, Offset + Size); } } else { // Could not evaluate branch. Should be an indirect call or an @@ -671,8 +709,9 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { case IndirectBranchType::POSSIBLE_TAIL_CALL: MIA->convertJmpToTailCall(Instruction); break; - case IndirectBranchType::POSSIBLE_SWITCH_TABLE: - IsSimple = false; + case IndirectBranchType::POSSIBLE_JUMP_TABLE: + if (!opts::JumpTables) + IsSimple = false; break; case IndirectBranchType::UNKNOWN: // Keep processing. We'll do more checks and fixes in @@ -739,17 +778,10 @@ bool BinaryFunction::postProcessIndirectBranches() { } // Validate the tail call assumptions. 
- if (BC.MIA->isTailCall(Instr)) { - unsigned BaseRegNum; - int64_t ScaleValue; - unsigned IndexRegNum; - int64_t DispValue; - unsigned SegRegNum; - if (BC.MIA->evaluateX86MemoryOperand(Instr, BaseRegNum, - ScaleValue, IndexRegNum, - DispValue, SegRegNum)) { - // We have validated the memory contents addressed by the - // jump instruction already. + if (BC.MIA->isTailCall(Instr) || (BC.MIA->getJumpTableIndex(Instr) > 0)) { + if (BC.MIA->getMemoryOperandNo(Instr) != -1) { + // We have validated memory contents addressed by the jump + // instruction already. continue; } // This is jump on register. Just make sure the register is defined @@ -767,13 +799,16 @@ bool BinaryFunction::postProcessIndirectBranches() { } if (IsJmpRegSetInBB) continue; - DEBUG(dbgs() << "BOLT-INFO: rejected potential indirect tail call in " - << "function " << *this << " because the jump-on register " - << "was not defined in basic block " - << BB->getName() << ":\n"; - BC.printInstructions(dbgs(), BB->begin(), BB->end(), - BB->getOffset(), this); - ); + if (opts::Verbosity >= 2) { + outs() << "BOLT-INFO: rejected potential " + << (BC.MIA->isTailCall(Instr) ? 
"indirect tail call" + : "jump table") + << " in function " << *this + << " because the jump-on register was not defined in " + << " basic block " << BB->getName() << ".\n"; + DEBUG(dbgs() << BC.printInstructions(dbgs(), BB->begin(), BB->end(), + BB->getOffset(), this)); + } return false; } @@ -788,12 +823,13 @@ bool BinaryFunction::postProcessIndirectBranches() { } } if (!IsEpilogue) { - DEBUG(dbgs() << "BOLT-INFO: rejected potential indirect tail call in " - << "function " << *this << " in basic block " - << BB->getName() << ":\n"; - BC.printInstructions(dbgs(), BB->begin(), BB->end(), - BB->getOffset(), this); - ); + if (opts::Verbosity >= 2) { + outs() << "BOLT-INFO: rejected potential indirect tail call in " + << "function " << *this << " in basic block " + << BB->getName() << ".\n"; + DEBUG(BC.printInstructions(dbgs(), BB->begin(), BB->end(), + BB->getOffset(), this)); + } return false; } BC.MIA->convertJmpToTailCall(Instr); @@ -1369,12 +1405,14 @@ void BinaryFunction::inferFallThroughCounts() { << Twine::utohexstr(getAddress() + CurBB->getOffset()) << '\n'; }); - // If there is a FT, the last successor will be it. - auto &SuccCount = *CurBB->branch_info_rbegin(); - auto &Succ = *CurBB->succ_rbegin(); - if (SuccCount.Count == BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) { - SuccCount.Count = Inferred; - Succ->setExecutionCount(Succ->getExecutionCount() + Inferred); + if (CurBB->succ_size() <= 2) { + // If there is an FT it will be the last successor. + auto &SuccCount = *CurBB->branch_info_rbegin(); + auto &Succ = *CurBB->succ_rbegin(); + if (SuccCount.Count == BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) { + SuccCount.Count = Inferred; + Succ->ExecutionCount += Inferred; + } } } // end for (CurBB : BasicBlocks) @@ -1441,9 +1479,11 @@ void BinaryFunction::removeConditionalTailCalls() { // Create a basic block containing the unconditional tail call // instruction and place it at the end of the function. 
+ // We have to add 1 byte as there's potentially an existing branch past + // the end of the code as a result of __builtin_unreachable(). const BinaryBasicBlock *LastBB = BasicBlocks.back(); - uint64_t NewBlockOffset = - LastBB->getOffset() + BC.computeCodeSize(LastBB->begin(), LastBB->end()); + uint64_t NewBlockOffset = LastBB->getOffset() + + BC.computeCodeSize(LastBB->begin(), LastBB->end()) + 1; TailCallBB = addBasicBlock(NewBlockOffset, TCLabel); TailCallBB->addInstruction(TailCallInst); @@ -1566,9 +1606,9 @@ bool BinaryFunction::fixCFIState() { // because this happens rarely. if (NestedLevel != 0) { if (opts::Verbosity >= 1) { - errs() << "BOLT-WARNING: CFI rewriter detected nested CFI state while" - << " replaying CFI instructions for BB " << InBB->getName() - << " in function " << *this << '\n'; + errs() << "BOLT-WARNING: CFI rewriter detected nested CFI state" + << " while replaying CFI instructions for BB " + << InBB->getName() << " in function " << *this << '\n'; } return false; } @@ -2481,6 +2521,23 @@ BinaryFunction::~BinaryFunction() { } } +void BinaryFunction::emitJumpTables(MCStreamer *Streamer) { + if (JumpTables.empty()) + return; + + Streamer->SwitchSection(BC.MOFI->getReadOnlySection()); + for (auto &JumpTable : JumpTables) { + DEBUG(dbgs() << "BOLT-DEBUG: emitting jump table " + << JumpTable.StartLabel->getName() << '\n'); + Streamer->EmitLabel(JumpTable.StartLabel); + // TODO (#9806207): based on jump table type (PIC vs non-PIC etc.) + // we would need to emit different references. + for (auto *Entry : JumpTable.Entries) { + Streamer->EmitSymbolValue(Entry, BC.AsmInfo->getPointerSize()); + } + } +} + void BinaryFunction::calculateLoopInfo() { // Discover loops. 
BinaryDominatorTree DomTree(false); diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 10138bae4ede..bbf7d843080c 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -404,6 +404,13 @@ class BinaryFunction : public AddressRangesOwner { /// function and that apply before the entry basic block). CFIInstrMapType CIEFrameInstructions; + /// Representation of a jump table. + struct JumpTable { + MCSymbol *StartLabel; + std::vector Entries; + }; + std::vector JumpTables; + // Blocks are kept sorted in the layout order. If we need to change the // layout (if BasicBlocksLayout stores a different order than BasicBlocks), // the terminating instructions need to be modified. @@ -673,19 +680,19 @@ class BinaryFunction : public AddressRangesOwner { return MaxSize; } - /// Return MC symbol associtated with the function. + /// Return MC symbol associated with the function. /// All references to the function should use this symbol. MCSymbol *getSymbol() { return OutputSymbol; } - /// Return MC symbol associtated with the function (const version). + /// Return MC symbol associated with the function (const version). /// All references to the function should use this symbol. const MCSymbol *getSymbol() const { return OutputSymbol; } - /// Return MC symbol associtated with the end of the function. + /// Return MC symbol associated with the end of the function. MCSymbol *getFunctionEndLabel() { assert(BC.Ctx && "cannot be called with empty context"); if (!FunctionEndLabel) { @@ -782,8 +789,8 @@ class BinaryFunction : public AddressRangesOwner { /// Returns NULL if basic block already exists at the \p Offset. 
BinaryBasicBlock *addBasicBlock(uint64_t Offset, MCSymbol *Label, bool DeriveAlignment = false) { - assert(CurrentState == State::CFG || - (!getBasicBlockAtOffset(Offset) && "basic block already exists")); + assert((CurrentState == State::CFG || !getBasicBlockAtOffset(Offset)) && + "basic block already exists in pre-CFG state"); auto BBPtr = createBasicBlock(Offset, Label, DeriveAlignment); BasicBlocks.emplace_back(BBPtr.release()); @@ -1148,6 +1155,9 @@ class BinaryFunction : public AddressRangesOwner { /// Emit exception handling ranges for the function. void emitLSDA(MCStreamer *Streamer); + /// Emit jump tables for the function. + void emitJumpTables(MCStreamer *Streamer); + /// Merge profile data of this function into those of the given /// function. The functions should have been proven identical with /// isIdenticalWith. diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index f2fa6c8d869e..7ee3869e390b 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -1068,17 +1068,15 @@ bool SimplifyRODataLoads::simplifyRODataLoads( uint64_t TargetAddress; if (MIA->hasRIPOperand(Inst)) { - // Try to find the symbol that corresponds to the rip-relative operand. - MCOperand DisplOp; - if (!MIA->getRIPOperandDisp(Inst, DisplOp)) - continue; - - assert(DisplOp.isExpr() && - "found rip-relative with non-symbolic displacement"); + // Try to find the symbol that corresponds to the RIP-relative operand. + auto DispOpI = MIA->getMemOperandDisp(Inst); + assert(DispOpI != Inst.end() && "expected RIP-relative displacement"); + assert(DispOpI->isExpr() && + "found RIP-relative with non-symbolic displacement"); // Get displacement symbol. 
const MCSymbolRefExpr *DisplExpr; - if (!(DisplExpr = dyn_cast(DisplOp.getExpr()))) + if (!(DisplExpr = dyn_cast(DispOpI->getExpr()))) continue; const MCSymbol &DisplSymbol = DisplExpr->getSymbol(); @@ -1092,7 +1090,7 @@ bool SimplifyRODataLoads::simplifyRODataLoads( continue; } - // Get the contents of the section containing the target addresss of the + // Get the contents of the section containing the target address of the // memory operand. We are only interested in read-only sections. ErrorOr DataSectionOrErr = BC.getSectionForAddress(TargetAddress); diff --git a/bolt/ReorderAlgorithm.cpp b/bolt/ReorderAlgorithm.cpp index 02662d5921d0..24f22604b08b 100644 --- a/bolt/ReorderAlgorithm.cpp +++ b/bolt/ReorderAlgorithm.cpp @@ -277,7 +277,7 @@ int64_t MinBranchGreedyClusterAlgorithm::calculateWeight( break; ++BI; } - assert(BI != PredBB->branch_info_end() && "invalied control flow graph"); + assert(BI != PredBB->branch_info_end() && "invalid control flow graph"); assert(BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE && "attempted reordering blocks of function with no profile data"); assert(BI->Count <= std::numeric_limits::max() && @@ -304,7 +304,7 @@ void MinBranchGreedyClusterAlgorithm::adjustQueue( auto Comp = [&] (const EdgeTy &A, const EdgeTy &B) { // With equal weights, prioritize branches with lower index // source/destination. This helps to keep original block order for blocks - // when optimal order cannot be deducted from a profile. + // when optimal order cannot be deduced from a profile. if (Weight[A] == Weight[B]) { uint32_t ASrcBBIndex = BF.getIndex(A.Src); uint32_t BSrcBBIndex = BF.getIndex(B.Src); diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 84a06ed51b6c..6b65a0fd27c8 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -283,7 +283,8 @@ uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size, IsReadOnly); } - DEBUG(dbgs() << "BOLT: allocating " << (IsCode ? 
"code" : "data") + DEBUG(dbgs() << "BOLT: allocating " + << (IsCode ? "code" : (IsReadOnly ? "read-only data" : "data")) << " section : " << SectionName << " with size " << Size << ", alignment " << Alignment << " at 0x" << ret << "\n"); @@ -1025,7 +1026,7 @@ void RewriteInstance::disassembleFunctions() { auto *ContainingFunction = getBinaryFunctionContainingAddress(Addr); if (ContainingFunction && ContainingFunction->getAddress() != Addr) { if (opts::Verbosity >= 1) { - errs() << "BOLT-WARNING: Function " << ContainingFunction + errs() << "BOLT-WARNING: Function " << *ContainingFunction << " has internal BBs that are target of a reference located in " << "another function. Skipping the function.\n"; } @@ -1325,8 +1326,10 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, Streamer.EmitLabel(Function.getFunctionEndLabel()); // Emit LSDA before anything else? - if (!EmitColdPart) + if (!EmitColdPart) { Function.emitLSDA(&Streamer); + Function.emitJumpTables(&Streamer); + } // TODO: is there any use in emiting end of function? // Perhaps once we have a support for C++ exceptions. @@ -1521,7 +1524,8 @@ void RewriteInstance::emitFunctions() { // Map special sections to their addresses in the output image. // // TODO: perhaps we should process all the allocated sections here? - std::vector Sections = { ".eh_frame", ".gcc_except_table" }; + std::vector Sections = { ".eh_frame", ".gcc_except_table", + ".rodata" }; for (auto &SectionName : Sections) { auto SMII = EFMM->SectionMapInfo.find(SectionName); if (SMII != EFMM->SectionMapInfo.end()) { From 129d1901ff25ddc6c4a129355373293c5999fb91 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 15 Sep 2016 10:24:22 -0700 Subject: [PATCH 173/904] Add dyno stats for jump tables. Summary: Add dyno stats for jump tables. 
(cherry picked from commit 9b62a65bd563a4b70441706b11160fe4d3905563) --- bolt/BinaryFunction.cpp | 22 +++++++++++++++++++--- bolt/BinaryFunction.h | 1 + 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 45d8dd4e5a3c..33d01a959202 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1364,7 +1364,7 @@ void BinaryFunction::inferFallThroughCounts() { // Calculate frequency of outgoing branches from this node according to // LBR data uint64_t ReportedBranches = 0; - for (auto &SuccCount : CurBB->branch_info()) { + for (const auto &SuccCount : CurBB->branch_info()) { if (SuccCount.Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) ReportedBranches += SuccCount.Count; } @@ -2680,8 +2680,8 @@ DynoStats BinaryFunction::getDynoStats() const { if (BI.Count != BinaryBasicBlock::COUNT_NO_PROFILE) BBExecutionCount += BI.Count; - // Ignore blocks that were not executed. - if (BBExecutionCount == 0) + // Ignore empty blocks and blocks that were not executed. + if (BB->getNumNonPseudos() == 0 || BBExecutionCount == 0) continue; // Count the number of calls by iterating through all instructions. @@ -2696,6 +2696,22 @@ DynoStats BinaryFunction::getDynoStats() const { Stats[DynoStats::INSTRUCTIONS] += BB->getNumNonPseudos() * BBExecutionCount; + // Jump tables. + const auto *LastInstr = BB->findLastNonPseudoInstruction(); + if (BC.MIA->getJumpTableIndex(*LastInstr) > 0) { + Stats[DynoStats::JUMP_TABLE_BRANCHES] += BBExecutionCount; + DEBUG( + static uint64_t MostFrequentJT; + if (BBExecutionCount > MostFrequentJT) { + MostFrequentJT = BBExecutionCount; + dbgs() << "BOLT-INFO: most frequently executed jump table is in " + << "function " << *this << " in basic block " << BB->getName() + << " executed totally " << BBExecutionCount << " times.\n"; + } + ); + continue; + } + // Update stats for branches. 
const MCSymbol *TBB = nullptr; const MCSymbol *FBB = nullptr; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index bbf7d843080c..2b1838864925 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -65,6 +65,7 @@ class DynoStats { D(FUNCTION_CALLS, "all function calls", Fn)\ D(INDIRECT_CALLS, "indirect calls", Fn)\ D(INSTRUCTIONS, "executed instructions", Fn)\ + D(JUMP_TABLE_BRANCHES, "taken jump table branches", Fn)\ D(ALL_BRANCHES, "total branches",\ Fadd(ALL_CONDITIONAL, UNCOND_BRANCHES))\ D(ALL_TAKEN, "taken branches",\ From 420018e90390a76f8a90a733facc2c257238c2b5 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 15 Sep 2016 15:47:10 -0700 Subject: [PATCH 174/904] Fix issue with zero-size duplicate function symbols. Summary: While working on PLT dyno stats I've noticed that we were missing BinaryFunctions for some symbols that were not PLT. Upon closer inspection turned out that those symbols were marked as zero-sized functions in symbol table, but they had duplicates with non-zero size. Since the zero-size symbols were preceding other duplicates, we were not creating BinaryFunction for them and they were not added as duplicates. The 2 most prominent functions that were missing for a test were free() and malloc(). There's not much to optimize in these functions, but they were contributing quite significantly to dyno stats. As a result dyno stats for this test needed an adjustment. Also several assembly functions (e.g. _init()) had zero size, and now we set the size to the max size and start processing those. It's good for coverage but will not affect the performance. 
(cherry picked from commit d334f5422892280284050db5c397592818a033aa) --- bolt/BinaryFunction.cpp | 13 +++---- bolt/BinaryFunction.h | 5 +++ bolt/RewriteInstance.cpp | 75 ++++++++++++++++++++++++++-------------- 3 files changed, 60 insertions(+), 33 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 33d01a959202..9c688918d0cf 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -578,7 +578,6 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { return IndirectBranchType::POSSIBLE_TAIL_CALL; }; - bool IsSimple = true; for (uint64_t Offset = 0; Offset < getSize(); ) { MCInst Instruction; uint64_t Size; @@ -754,8 +753,6 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { Offset += Size; } - setSimple(IsSimple); - // TODO: clear memory if not simple function? // Update state. @@ -2686,11 +2683,11 @@ DynoStats BinaryFunction::getDynoStats() const { // Count the number of calls by iterating through all instructions. for (const auto &Instr : *BB) { - if (BC.MIA->isCall(Instr)) { - Stats[DynoStats::FUNCTION_CALLS] += BBExecutionCount; - if (BC.MIA->getMemoryOperandNo(Instr) != -1) { - Stats[DynoStats::INDIRECT_CALLS] += BBExecutionCount; - } + if (!BC.MIA->isCall(Instr)) + continue; + Stats[DynoStats::FUNCTION_CALLS] += BBExecutionCount; + if (BC.MIA->getMemoryOperandNo(Instr) != -1) { + Stats[DynoStats::INDIRECT_CALLS] += BBExecutionCount; } } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 2b1838864925..13f31e632882 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -932,6 +932,11 @@ class BinaryFunction : public AddressRangesOwner { return *this; } + BinaryFunction &setSize(uint64_t S) { + Size = S; + return *this; + } + BinaryFunction &setMaxSize(uint64_t Size) { MaxSize = Size; return *this; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 6b65a0fd27c8..7a14ae8f070b 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -743,11 +743,6 
@@ void RewriteInstance::discoverFileObjects() { // TODO: populate address map with PLT entries for better readability. - // Ignore function with 0 size for now (possibly coming from assembly). - auto SymbolSize = ELFSymbolRef(Symbol).getSize(); - if (SymbolSize == 0) - continue; - ErrorOr SectionOrErr = Symbol.getSection(); check_error(SectionOrErr.getError(), "cannot get symbol section"); section_iterator Section = *SectionOrErr; @@ -756,6 +751,8 @@ void RewriteInstance::discoverFileObjects() { continue; } + auto SymbolSize = ELFSymbolRef(Symbol).getSize(); + // Checkout for conflicts with function data from FDEs. bool IsSimple = true; auto FDEI = CFIRdWrt->getFDEs().lower_bound(Address); @@ -769,26 +766,33 @@ void RewriteInstance::discoverFileObjects() { auto &PrevFDE = *FDEI->second; auto PrevStart = PrevFDE.getInitialLocation(); auto PrevLength = PrevFDE.getAddressRange(); - if (opts::Verbosity >= 1 && - Address > PrevStart && Address < PrevStart + PrevLength) { - errs() << "BOLT-WARNING: function " << UniqueName - << " is in conflict with FDE [" - << Twine::utohexstr(PrevStart) << ", " - << Twine::utohexstr(PrevStart + PrevLength) - << "). Skipping.\n"; + if (Address > PrevStart && Address < PrevStart + PrevLength) { + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: function " << UniqueName + << " is in conflict with FDE [" + << Twine::utohexstr(PrevStart) << ", " + << Twine::utohexstr(PrevStart + PrevLength) + << "). Skipping.\n"; + } IsSimple = false; } } } else if (FDE.getAddressRange() != SymbolSize) { - // Function addresses match but sizes differ. + if (SymbolSize) { + // Function addresses match but sizes differ. + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: sizes differ for function " << UniqueName + << ". FDE : " << FDE.getAddressRange() + << "; symbol table : " << SymbolSize << ". Skipping.\n"; + } + + // Create maximum size non-simple function. 
+ IsSimple = false; + } if (opts::Verbosity >= 1) { - errs() << "BOLT-WARNING: sizes differ for function " << UniqueName - << ". FDE : " << FDE.getAddressRange() - << "; symbol table : " << SymbolSize << ". Skipping.\n"; + outs() << "BOLT-INFO: adjusting size of function " << UniqueName + << " using FDE data.\n"; } - - // Create maximum size non-simple function. - IsSimple = false; SymbolSize = std::max(SymbolSize, FDE.getAddressRange()); } } @@ -799,10 +803,16 @@ void RewriteInstance::discoverFileObjects() { BF = &BFI->second; // Duplicate function name. Make sure everything matches before we add // an alternative name. - if (opts::Verbosity >= 1 && SymbolSize != BF->getSize()) { - errs() << "BOLT-WARNING: size mismatch for duplicate entries " - << UniqueName << ':' << SymbolSize << " and " - << *BF << ':' << BF->getSize() << '\n'; + if (SymbolSize != BF->getSize()) { + if (opts::Verbosity >= 1) { + if (SymbolSize && BF->getSize()) { + errs() << "BOLT-WARNING: size mismatch for duplicate entries " + << *BF << " and " << UniqueName << '\n'; + } + outs() << "BOLT-INFO: adjusting size of function " << *BF + << " old " << BF->getSize() << " new " << SymbolSize << "\n"; + } + BF->setSize(std::max(SymbolSize, BF->getSize())); } BF->addAlternativeName(UniqueName); } else { @@ -915,8 +925,8 @@ void RewriteInstance::disassembleFunctions() { if (!Section.isText() || Section.isVirtual() || !Section.getSize()) { // When could it happen? if (opts::Verbosity >= 1) { - errs() << "BOLT-WARNING: corresponding section is non-executable or empty " - << "for function " << Function; + errs() << "BOLT-WARNING: corresponding section is non-executable or " + << "empty for function " << Function << '\n'; } continue; } @@ -955,6 +965,21 @@ void RewriteInstance::disassembleFunctions() { continue; } Function.setMaxSize(MaxSize); + if (!Function.getSize() && Function.getMaxSize()) { + // Some assembly functions have their size set to 0, use the max + // size as their real size. 
+ if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: setting size of function " << Function + << " to " << Function.getMaxSize() << " (was 0)\n"; + } + Function.setSize(Function.getMaxSize()); + } + } + + // Treat zero-sized functions as non-simple ones. + if (Function.getSize() == 0) { + Function.setSimple(false); + continue; } StringRef SectionContents; From 069f011b7953b29c5cf6c1ba997c0a0f74e28316 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 15 Sep 2016 15:47:10 -0700 Subject: [PATCH 175/904] Add PLT dyno stats. Summary: Get PLT call stats. (cherry picked from commit ae85b3839e4cd733b5f5ac4b8745cff4c04dae80) --- bolt/BinaryFunction.cpp | 14 ++++++++++++++ bolt/BinaryFunction.h | 1 + 2 files changed, 15 insertions(+) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 9c688918d0cf..ed40e059f7b5 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -2688,6 +2688,20 @@ DynoStats BinaryFunction::getDynoStats() const { Stats[DynoStats::FUNCTION_CALLS] += BBExecutionCount; if (BC.MIA->getMemoryOperandNo(Instr) != -1) { Stats[DynoStats::INDIRECT_CALLS] += BBExecutionCount; + } else if (const auto *CallSymbol = BC.MIA->getTargetSymbol(Instr)) { + if (BC.getFunctionForSymbol(CallSymbol)) + continue; + auto GSI = BC.GlobalSymbols.find(CallSymbol->getName()); + if (GSI == BC.GlobalSymbols.end()) + continue; + auto Section = BC.getSectionForAddress(GSI->second); + if (!Section) + continue; + StringRef SectionName; + Section->getName(SectionName); + if (SectionName == ".plt") { + Stats[DynoStats::PLT_CALLS] += BBExecutionCount; + } } } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 13f31e632882..ecee06d30753 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -64,6 +64,7 @@ class DynoStats { D(UNCOND_BRANCHES, "executed unconditional branches", Fn)\ D(FUNCTION_CALLS, "all function calls", Fn)\ D(INDIRECT_CALLS, "indirect calls", Fn)\ + D(PLT_CALLS, "PLT calls", Fn)\ D(INSTRUCTIONS, "executed 
instructions", Fn)\ D(JUMP_TABLE_BRANCHES, "taken jump table branches", Fn)\ D(ALL_BRANCHES, "total branches",\ From ad9e7d03e9510c7ca9e3eae29b80f4d709a7fc8b Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 16 Sep 2016 13:13:16 -0700 Subject: [PATCH 176/904] Do no collect dyno stats on functions with stale profile. Summary: Dyno stats collected on functions with invalid profile may appear completely bogus. Skip them. (cherry picked from commit e05ed3f3877feebe2cdb0f0836e9755f9f35ed51) --- bolt/BinaryFunction.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index ed40e059f7b5..b21991799ca6 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -2660,7 +2660,7 @@ DynoStats BinaryFunction::getDynoStats() const { DynoStats Stats; // Return empty-stats about the function we don't completely understand. - if (!isSimple()) + if (!isSimple() || !hasValidProfile()) return Stats; // Update enumeration of basic blocks for correct detection of branch' From e0ece690ac7fc3eb850c2a22bd0094766507297f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 20 Sep 2016 20:55:49 -0700 Subject: [PATCH 177/904] BOLT: Add feature to sort functions by dyno stats. Summary: Add -print-sorted-by and -print-sorted-by-order command line options. The first option takes a list of dyno stats keys used to sort functions that are printed at the end of all optimization passes. Only the top 100 functions are printed. The -print-sorted-by-order option can be either ascending or descending (descending is the default). 
(cherry picked from commit d979db11e4990bf43c6d49167af93b10592cd6ab) --- bolt/BinaryFunction.cpp | 18 +++++ bolt/BinaryFunction.h | 8 ++- bolt/BinaryPassManager.cpp | 10 +++ bolt/BinaryPasses.cpp | 143 +++++++++++++++++++++++++++++++++++++ bolt/BinaryPasses.h | 17 +++++ 5 files changed, 195 insertions(+), 1 deletion(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index b21991799ca6..f8882b56fdfc 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -119,6 +119,24 @@ SMLoc findDebugLineInformationForInstructionAt( } // namespace +bool DynoStats::operator<(const DynoStats &Other) const { + return std::lexicographical_compare( + &Stats[FIRST_DYNO_STAT], &Stats[LAST_DYNO_STAT], + &Other.Stats[FIRST_DYNO_STAT], &Other.Stats[LAST_DYNO_STAT] + ); +} + +bool DynoStats::lessThan(const DynoStats &Other, + ArrayRef Keys) const { + return std::lexicographical_compare( + Keys.begin(), Keys.end(), + Keys.begin(), Keys.end(), + [this,&Other](const Category A, const Category) { + return Stats[A] < Other.Stats[A]; + } + ); +} + uint64_t BinaryFunction::Count = 0; BinaryBasicBlock * diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index ecee06d30753..26ec8aa719eb 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -81,7 +81,7 @@ class DynoStats { public: #define D(name, ...) 
name, - enum : uint8_t { DYNO_STATS }; + enum Category : uint8_t { DYNO_STATS }; #undef D @@ -127,6 +127,12 @@ class DynoStats { void print(raw_ostream &OS, const DynoStats *Other = nullptr) const; void operator+=(const DynoStats &Other); + bool operator<(const DynoStats &Other) const; + bool lessThan(const DynoStats &Other, ArrayRef Keys) const; + + static const char* Description(const Category C) { + return Desc[C]; + } }; inline raw_ostream &operator<<(raw_ostream &OS, const DynoStats &Stats) { diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index f653c26140a3..ed8bb11b5e28 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -120,6 +120,13 @@ PrintInline("print-inline", cl::ZeroOrMore, cl::Hidden); +static cl::opt +NeverPrint("never-print", + cl::desc("never print"), + cl::init(false), + cl::ZeroOrMore, + cl::ReallyHidden); + } // namespace opts namespace llvm { @@ -180,6 +187,9 @@ void BinaryFunctionPassManager::runAllPasses( // Here we manage dependencies/order manually, since passes are ran in the // order they're registered. + // Run this pass first to use stats for the original functions. 
+ Manager.registerPass(llvm::make_unique(NeverPrint)); + Manager.registerPass(llvm::make_unique(PrintICF), opts::IdenticalCodeFolding); diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index 7ee3869e390b..3a95839b7ca3 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -17,6 +17,34 @@ using namespace llvm; +namespace { + +const char* dynoStatsOptName(const bolt::DynoStats::Category C) { + if (C == bolt::DynoStats::FIRST_DYNO_STAT) + return "none"; + else if (C == bolt::DynoStats::LAST_DYNO_STAT) + return "all"; + + static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT+1]; + + OptNames[C] = bolt::DynoStats::Description(C); + + std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-'); + + return OptNames[C].c_str(); +} + +const char* dynoStatsOptDesc(const bolt::DynoStats::Category C) { + if (C == bolt::DynoStats::FIRST_DYNO_STAT) + return "unsorted"; + else if (C == bolt::DynoStats::LAST_DYNO_STAT) + return "sorted by all stats"; + + return bolt::DynoStats::Description(C); +} + +} + namespace opts { extern cl::opt Verbosity; @@ -73,6 +101,35 @@ MinBranchClusters( cl::ZeroOrMore, cl::Hidden); +static cl::list +PrintSortedBy( + "print-sorted-by", + cl::CommaSeparated, + cl::desc("print functions sorted by order of dyno stats"), + cl::value_desc("key1,key2,key3,..."), + cl::values( +#define D(name, ...) 
\ + clEnumValN(bolt::DynoStats::name, \ + dynoStatsOptName(bolt::DynoStats::name), \ + dynoStatsOptDesc(bolt::DynoStats::name)), + DYNO_STATS +#undef D + clEnumValEnd), + cl::ZeroOrMore); + +enum DynoStatsSortOrder : char { + Ascending, + Descending +}; + +static cl::opt +DynoStatsSortOrderOpt( + "print-sorted-by-order", + cl::desc("use ascending or descending order when printing " + "functions ordered by dyno stats"), + cl::ZeroOrMore, + cl::init(DynoStatsSortOrder::Descending)); + } // namespace opts namespace llvm { @@ -1355,5 +1412,91 @@ void IdenticalCodeFolding::runOnFunctions( << " KB of code space.\n"; } +void PrintSortedBy::runOnFunctions( + BinaryContext &, + std::map &BFs, + std::set & +) { + if (!opts::PrintSortedBy.empty() && + std::find(opts::PrintSortedBy.begin(), + opts::PrintSortedBy.end(), + DynoStats::FIRST_DYNO_STAT) == opts::PrintSortedBy.end()) { + + std::vector Functions; + std::map Stats; + + for (const auto &BFI : BFs) { + const auto &BF = BFI.second; + if (shouldOptimize(BF) && BF.hasValidProfile()) { + Functions.push_back(&BF); + Stats.emplace(&BF, BF.getDynoStats()); + } + } + + const bool SortAll = + std::find(opts::PrintSortedBy.begin(), + opts::PrintSortedBy.end(), + DynoStats::LAST_DYNO_STAT) != opts::PrintSortedBy.end(); + + const bool Ascending = + opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending; + + if (SortAll) { + std::stable_sort( + Functions.begin(), + Functions.end(), + [Ascending,&Stats](const BinaryFunction *A, const BinaryFunction *B) { + return Ascending ? + Stats.at(A) < Stats.at(B) : Stats.at(B) < Stats.at(A); + } + ); + } else { + std::stable_sort( + Functions.begin(), + Functions.end(), + [Ascending,&Stats](const BinaryFunction *A, const BinaryFunction *B) { + const auto &StatsA = Stats.at(A); + const auto &StatsB = Stats.at(B); + return Ascending + ? 
StatsA.lessThan(StatsB, opts::PrintSortedBy) + : StatsB.lessThan(StatsA, opts::PrintSortedBy); + } + ); + } + + outs() << "BOLT-INFO: top functions sorted by "; + if (SortAll) { + outs() << "dyno stats"; + } else { + outs() << "("; + bool PrintComma = false; + for (const auto Category : opts::PrintSortedBy) { + if (PrintComma) outs() << ", "; + outs() << DynoStats::Description(Category); + PrintComma = true; + } + outs() << ")"; + } + + outs() << " are:\n"; + auto SFI = Functions.begin(); + for (unsigned i = 0; i < 100 && SFI != Functions.end(); ++SFI, ++i) { + const auto Stats = (*SFI)->getDynoStats(); + outs() << " " << **SFI; + if (!SortAll) { + outs() << " ("; + bool PrintComma = false; + for (const auto Category : opts::PrintSortedBy) { + if (PrintComma) outs() << ", "; + outs() << dynoStatsOptName(Category) << "=" << Stats[Category]; + PrintComma = true; + } + outs() << ")"; + } + outs() << "\n"; + } + } +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryPasses.h b/bolt/BinaryPasses.h index 41602213bb99..e0ad0ff18fa7 100644 --- a/bolt/BinaryPasses.h +++ b/bolt/BinaryPasses.h @@ -353,6 +353,23 @@ class IdenticalCodeFolding : public BinaryFunctionPass { std::set &LargeFunctions) override; }; +/// +/// Prints a list of the top 100 functions sorted by a set of +/// dyno stats categories. +/// +class PrintSortedBy : public BinaryFunctionPass { + public: + explicit PrintSortedBy(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { } + + const char *getName() const override { + return "print-sorted-by"; + } + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + } // namespace bolt } // namespace llvm From 9f368861fb6c4a77117cf3470cdadfc8565d273f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Sep 2016 20:32:12 -0700 Subject: [PATCH 178/904] BOLT: Refactoring BinaryFunction interface. Summary: Get rid of all uses of getIndex/getLayoutIndex/getOffset outside of BinaryFunction. 
Also made some other offset related methods private. (cherry picked from commit 6d56de152717e2ddeb22acb19548e221a2b29e70) --- bolt/BinaryBasicBlock.h | 24 ++++++------ bolt/BinaryFunction.cpp | 24 +++++++++--- bolt/BinaryFunction.h | 80 +++++++++++++++++++++------------------ bolt/BinaryPasses.cpp | 51 +++++++++++-------------- bolt/BinaryPasses.h | 11 ++++-- bolt/DebugData.cpp | 18 ++++----- bolt/ReorderAlgorithm.cpp | 18 ++++----- 7 files changed, 122 insertions(+), 104 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 3b65f0064a6a..71c3d6aa06bf 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -398,18 +398,6 @@ class BinaryBasicBlock { return Alignment; } - /// Return offset of the basic block from the function start. - uint64_t getOffset() const { - return Offset; - } - - /// Return index in the current layout. The user is responsible for - /// making sure the indices are up to date, - /// e.g. by calling BinaryFunction::updateLayoutIndices(); - unsigned getLayoutIndex() const { - return LayoutIndex; - } - /// Adds block to successor list, and also updates predecessor list for /// successor block. /// Set branch info for this path. @@ -597,6 +585,11 @@ class BinaryBasicBlock { /// Remove landing pads of this basic block. void clearLandingPads(); + /// Return offset of the basic block from the function start. + uint64_t getOffset() const { + return Offset; + } + /// Set offset of the basic block from the function start. void setOffset(uint64_t NewOffset) { Offset = NewOffset; @@ -612,6 +605,13 @@ class BinaryBasicBlock { Index = I; } + /// Return index in the current layout. The user is responsible for + /// making sure the indices are up to date, + /// e.g. by calling BinaryFunction::updateLayoutIndices(); + unsigned getLayoutIndex() const { + return LayoutIndex; + } + /// Set layout index. To be used by BinaryFunction. 
void setLayoutIndex(unsigned Index) { LayoutIndex = Index; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index f8882b56fdfc..f90ea956b1dc 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1472,7 +1472,7 @@ void BinaryFunction::removeConditionalTailCalls() { // Reverse the condition of the tail call and update its target. unsigned InsertIdx = getIndex(BB) + 1; assert(InsertIdx < size() && "no fall-through for conditional tail call"); - BinaryBasicBlock *NextBB = getBasicBlockAtIndex(InsertIdx); + BinaryBasicBlock *NextBB = BasicBlocks[InsertIdx]; BC.MIA->reverseBranchCondition( CondTailCallInst, NextBB->getLabel(), BC.Ctx.get()); @@ -1483,7 +1483,7 @@ void BinaryFunction::removeConditionalTailCalls() { TailCallBBs.emplace_back(createBasicBlock(NextBB->getOffset(), TCLabel)); TailCallBBs[0]->addInstruction(TailCallInst); insertBasicBlocks(BB, std::move(TailCallBBs), /* UpdateCFIState */ false); - TailCallBB = getBasicBlockAtIndex(InsertIdx); + TailCallBB = BasicBlocks[InsertIdx]; // Add the correct CFI state for the new block. BBCFIState.insert(BBCFIState.begin() + InsertIdx, TCInfo.CFIStateBefore); @@ -1857,6 +1857,8 @@ void BinaryFunction::dumpGraph(raw_ostream& OS) const { Code.c_str()); } + // analyzeBranch is just used to get the names of the branch + // opcodes. 
const MCSymbol *TBB = nullptr; const MCSymbol *FBB = nullptr; MCInst *CondBranch = nullptr; @@ -1866,18 +1868,28 @@ void BinaryFunction::dumpGraph(raw_ostream& OS) const { CondBranch, UncondBranch); + const auto *LastInstr = BB->findLastNonPseudoInstruction(); + const bool IsJumpTable = LastInstr && BC.MIA->getJumpTableIndex(*LastInstr) > 0; + auto BI = BB->branch_info_begin(); for (auto *Succ : BB->successors()) { std::string Branch; if (Success) { - if (CondBranch && Succ->getLabel() == TBB) { - Branch = BC.InstPrinter->getOpcodeName(CondBranch->getOpcode()); - } else if(UncondBranch && Succ->getLabel() == TBB) { - Branch = BC.InstPrinter->getOpcodeName(UncondBranch->getOpcode()); + if (Succ == BB->getConditionalSuccessor(true)) { + Branch = CondBranch + ? BC.InstPrinter->getOpcodeName(CondBranch->getOpcode()) + : "TB"; + } else if (Succ == BB->getConditionalSuccessor(false)) { + Branch = UncondBranch + ? BC.InstPrinter->getOpcodeName(UncondBranch->getOpcode()) + : "FB"; } else { Branch = "FT"; } } + if (IsJumpTable) { + Branch = "JT"; + } OS << format("\"%s\" -> \"%s\" [label=\"%s", BB->getName().data(), Succ->getName().data(), diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 26ec8aa719eb..775ab769aef4 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -285,6 +285,22 @@ class BinaryFunction : public AddressRangesOwner { /// the output binary. uint32_t AddressRangesOffset{-1U}; + /// Get basic block index assuming it belongs to this function. + unsigned getIndex(const BinaryBasicBlock *BB) const { + assert(BB->getIndex() < BasicBlocks.size()); + return BB->getIndex(); + } + + /// Return basic block that originally contained offset \p Offset + /// from the function start. + BinaryBasicBlock *getBasicBlockContainingOffset(uint64_t Offset); + + /// Return basic block that started at offset \p Offset. 
+ BinaryBasicBlock *getBasicBlockAtOffset(uint64_t Offset) { + BinaryBasicBlock *BB = getBasicBlockContainingOffset(Offset); + return BB && BB->getOffset() == Offset ? BB : nullptr; + } + /// Release memory taken by the list. template BinaryFunction &clearList(T& List) { T TempList; @@ -597,22 +613,6 @@ class BinaryFunction : public AddressRangesOwner { /// fixBranches(). DynoStats getDynoStats() const; - /// Get basic block index assuming it belongs to this function. - unsigned getIndex(const BinaryBasicBlock *BB) const { - assert(BB->getIndex() < BasicBlocks.size()); - return BB->getIndex(); - } - - /// Returns the n-th basic block in this function in its original layout, or - /// nullptr if n >= size(). - const BinaryBasicBlock *getBasicBlockAtIndex(unsigned Index) const { - return BasicBlocks.at(Index); - } - - BinaryBasicBlock *getBasicBlockAtIndex(unsigned Index) { - return BasicBlocks.at(Index); - } - /// Returns the basic block after the given basic block in the layout or /// nullptr the last basic block is given. const BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *BB) const { @@ -673,6 +673,11 @@ class BinaryFunction : public AddressRangesOwner { return Address; } + /// Get the original address for the given basic block within this function. + uint64_t getBasicBlockOriginalAddress(const BinaryBasicBlock *BB) const { + return Address + BB->getOffset(); + } + /// Return offset of the function body in the binary file. uint64_t getFileOffset() const { return FileOffset; @@ -814,18 +819,21 @@ class BinaryFunction : public AddressRangesOwner { /// BBs. unsigned eraseDeadBBs(std::map &ToPreserve); - /// Return basic block that started at offset \p Offset. - BinaryBasicBlock *getBasicBlockAtOffset(uint64_t Offset) { - BinaryBasicBlock *BB = getBasicBlockContainingOffset(Offset); - if (BB && BB->getOffset() == Offset) - return BB; - - return nullptr; + /// Get the relative order between two basic blocks in the original + /// layout. 
The result is > 0 if B occurs before A and < 0 if B + /// occurs after A. If A and B are the same block, the result is 0. + signed getOriginalLayoutRelativeOrder(const BinaryBasicBlock *A, + const BinaryBasicBlock *B) const { + return getIndex(A) - getIndex(B); } - /// Return basic block that originally contained offset \p Offset - /// from the function start. - BinaryBasicBlock *getBasicBlockContainingOffset(uint64_t Offset); + /// Return basic block range that originally contained offset \p Offset + /// from the function start to the function end. + iterator_range getBasicBlockRangeFromOffsetToEnd(uint64_t Offset) { + auto *BB = getBasicBlockContainingOffset(Offset); + return BB ? iterator_range(BasicBlocks.begin() + getIndex(BB), end()) + : iterator_range(end(), end()); + } /// Insert the BBs contained in NewBBs into the basic blocks for this /// function. Update the associated state of all blocks as needed, i.e. @@ -855,6 +863,16 @@ class BinaryFunction : public AddressRangesOwner { } } + /// Determine direction of the branch based on the current layout. + /// Callee is responsible of updating basic block indices prior to using + /// this function (e.g. by calling BinaryFunction::updateLayoutIndices()). + static bool isForwardBranch(const BinaryBasicBlock *From, + const BinaryBasicBlock *To) { + assert(From->getFunction() == To->getFunction() && + "basic blocks should be in the same function"); + return To->getLayoutIndex() > From->getLayoutIndex(); + } + /// Dump function information to debug output. If \p PrintInstructions /// is true - include instruction disassembly. void dump(std::string Annotation = "", bool PrintInstructions = true) const; @@ -1289,16 +1307,6 @@ callWithDynoStats(FnType &&Func, } } -/// Determine direction of the branch based on the current layout. -/// Callee is responsible of updating basic block indices prior to using -/// this function (e.g. by calling BinaryFunction::updateLayoutIndices()). 
-inline bool isForwardBranch(const BinaryBasicBlock *From, - const BinaryBasicBlock *To) { - assert(From->getFunction() == To->getFunction() && - "basic blocks should be in the same function"); - return To->getLayoutIndex() > From->getLayoutIndex(); -} - inline raw_ostream &operator<<(raw_ostream &OS, const BinaryFunction &Function) { OS << Function.getPrintName(); diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index 3a95839b7ca3..b33a0a480a21 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -363,33 +363,31 @@ InlineSmallFunctions::inlineCall( // keep a mapping from basic block index to the corresponding block in the // inlined instance. std::vector> InlinedInstance; - std::vector - BBIndexToInlinedInstanceBB(InlinedFunction.size(), nullptr); + std::unordered_map InlinedBBMap; + for (const auto InlinedFunctionBB : InlinedFunction.layout()) { InlinedInstance.emplace_back(CallerFunction.createBasicBlock(0)); - BBIndexToInlinedInstanceBB[InlinedFunction.getIndex(InlinedFunctionBB)] = - InlinedInstance.back().get(); - if (InlinedFunction.hasValidProfile()) - InlinedInstance.back()->setExecutionCount( - InlinedFunctionBB->getExecutionCount()); + InlinedBBMap[InlinedFunctionBB] = InlinedInstance.back().get(); + if (InlinedFunction.hasValidProfile()) { + const auto Count = InlinedFunctionBB->getExecutionCount(); + InlinedInstance.back()->setExecutionCount(Count); + } } if (ShouldSplitCallerBB) { // Add one extra block at the inlined instance for the removed part of the // caller block. 
InlinedInstance.emplace_back(CallerFunction.createBasicBlock(0)); - BBIndexToInlinedInstanceBB.push_back(InlinedInstance.back().get()); - if (CallerFunction.hasValidProfile()) - InlinedInstance.back()->setExecutionCount(CallerBB->getExecutionCount()); + if (CallerFunction.hasValidProfile()) { + const auto Count = CallerBB->getExecutionCount(); + InlinedInstance.back()->setExecutionCount(Count); + } } // Copy instructions to the basic blocks of the inlined instance. - unsigned InlinedInstanceBBIndex = 0; + bool First = true; for (const auto InlinedFunctionBB : InlinedFunction.layout()) { // Get the corresponding block of the inlined instance. - auto *InlinedInstanceBB = InlinedInstance[InlinedInstanceBBIndex].get(); - assert(InlinedInstanceBB == - BBIndexToInlinedInstanceBB[InlinedFunction.getIndex(InlinedFunctionBB)]); - + auto *InlinedInstanceBB = InlinedBBMap.at(InlinedFunctionBB); bool IsExitingBlock = false; // Copy instructions into the inlined instance. @@ -427,9 +425,7 @@ InlineSmallFunctions::inlineCall( const MCSymbol *NewTargetLabel = nullptr; for (const auto SuccBB : InlinedFunctionBB->successors()) { if (SuccBB->getLabel() == OldTargetLabel) { - const auto InlinedInstanceSuccBB = - BBIndexToInlinedInstanceBB[InlinedFunction.getIndex(SuccBB)]; - NewTargetLabel = InlinedInstanceSuccBB->getLabel(); + NewTargetLabel = InlinedBBMap.at(SuccBB)->getLabel(); break; } } @@ -447,14 +443,15 @@ InlineSmallFunctions::inlineCall( // Add CFG edges to the basic blocks of the inlined instance. 
std::vector Successors(InlinedFunctionBB->succ_size(), nullptr); + std::transform( InlinedFunctionBB->succ_begin(), InlinedFunctionBB->succ_end(), Successors.begin(), - [&InlinedFunction, &BBIndexToInlinedInstanceBB] - (const BinaryBasicBlock *BB) { - return BBIndexToInlinedInstanceBB[InlinedFunction.getIndex(BB)]; + [&InlinedBBMap](const BinaryBasicBlock *BB) { + return InlinedBBMap.at(BB); }); + if (InlinedFunction.hasValidProfile()) { InlinedInstanceBB->addSuccessors( Successors.begin(), @@ -478,7 +475,7 @@ InlineSmallFunctions::inlineCall( InlinedInstanceBB->addSuccessor(InlinedInstance.back().get()); } InlinedInstanceBB->addBranchInstruction(InlinedInstance.back().get()); - } else if (InlinedInstanceBBIndex > 0 || !CanMergeFirstInlinedBlock) { + } else if (!First || !CanMergeFirstInlinedBlock) { assert(CallInstIndex == CallerBB->size() - 1); assert(CallerBB->succ_size() <= 1); if (CallerBB->succ_size() == 1) { @@ -494,7 +491,7 @@ InlineSmallFunctions::inlineCall( } } - ++InlinedInstanceBBIndex; + First = false; } if (ShouldSplitCallerBB) { @@ -949,7 +946,7 @@ bool SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, // We don't want to reverse direction of the branch in new order // without further profile analysis. - if (isForwardBranch(PredBB, BB) != IsForwardCTC) + if (BF.isForwardBranch(PredBB, BB) != IsForwardCTC) continue; // Change destination of the unconditional branch. @@ -1212,7 +1209,6 @@ void IdenticalCodeFolding::discoverCallers( continue; for (BinaryBasicBlock &BB : Caller) { - unsigned BlockIndex = Caller.getIndex(&BB); unsigned InstrIndex = 0; for (MCInst &Inst : BB) { @@ -1235,8 +1231,7 @@ void IdenticalCodeFolding::discoverCallers( continue; } // Insert a tuple in the Callers map. 
- Callers[Function].emplace_back( - CallSite(&Caller, BlockIndex, InstrIndex)); + Callers[Function].emplace_back(CallSite(&Caller, &BB, InstrIndex)); ++InstrIndex; } } @@ -1274,7 +1269,7 @@ void IdenticalCodeFolding::foldFunction( for (const CallSite &CS : BFToFoldCallers) { // Get call instruction. BinaryFunction *Caller = CS.Caller; - BinaryBasicBlock *CallBB = Caller->getBasicBlockAtIndex(CS.BlockIndex); + BinaryBasicBlock *CallBB = CS.Block; MCInst &CallInst = CallBB->getInstructionAtIndex(CS.InstrIndex); // Replace call target with BFToReplaceWith. diff --git a/bolt/BinaryPasses.h b/bolt/BinaryPasses.h index e0ad0ff18fa7..266cc2762407 100644 --- a/bolt/BinaryPasses.h +++ b/bolt/BinaryPasses.h @@ -318,11 +318,13 @@ class IdenticalCodeFolding : public BinaryFunctionPass { /// Map from a binary function to its callers. struct CallSite { BinaryFunction *Caller; - unsigned BlockIndex; + BinaryBasicBlock *Block; unsigned InstrIndex; - CallSite(BinaryFunction *Caller, unsigned BlockIndex, unsigned InstrIndex) : - Caller(Caller), BlockIndex(BlockIndex), InstrIndex(InstrIndex) { } + CallSite(BinaryFunction *Caller, + BinaryBasicBlock *Block, + unsigned InstrIndex) : + Caller(Caller), Block(Block), InstrIndex(InstrIndex) { } }; using CallerMap = std::map>; CallerMap Callers; @@ -365,6 +367,9 @@ class PrintSortedBy : public BinaryFunctionPass { const char *getName() const override { return "print-sorted-by"; } + bool shouldPrint(const BinaryFunction &) const override { + return false; + } void runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) override; diff --git a/bolt/DebugData.cpp b/bolt/DebugData.cpp index 577473a778a3..5938c793f05e 100644 --- a/bolt/DebugData.cpp +++ b/bolt/DebugData.cpp @@ -31,9 +31,10 @@ void BasicBlockOffsetRanges::addAddressRange(BinaryFunction &Function, uint64_t BeginAddress, uint64_t EndAddress, const BinaryData *Data) { - auto FirstBB = Function.getBasicBlockContainingOffset( - BeginAddress - Function.getAddress()); 
- if (!FirstBB) { + auto BBRange = Function.getBasicBlockRangeFromOffsetToEnd( + BeginAddress - Function.getAddress()); + + if (BBRange.begin() == BBRange.end()) { if (opts::Verbosity >= 2) { errs() << "BOLT-WARNING: no basic blocks in function " << Function << " intersect with debug range [0x" @@ -43,23 +44,22 @@ void BasicBlockOffsetRanges::addAddressRange(BinaryFunction &Function, return; } - for (auto I = Function.getIndex(FirstBB), S = Function.size(); I != S; ++I) { - auto BB = Function.getBasicBlockAtIndex(I); - uint64_t BBAddress = Function.getAddress() + BB->getOffset(); + for (auto &BB : BBRange) { + uint64_t BBAddress = Function.getBasicBlockOriginalAddress(&BB); // Note the special handling for [a, a) address range. if (BBAddress >= EndAddress && BeginAddress != EndAddress) break; uint64_t InternalAddressRangeBegin = std::max(BBAddress, BeginAddress); - assert(BB->getFunction() == &Function && + assert(BB.getFunction() == &Function && "Mismatching functions.\n"); uint64_t InternalAddressRangeEnd = - std::min(BBAddress + Function.getBasicBlockOriginalSize(BB), + std::min(BBAddress + Function.getBasicBlockOriginalSize(&BB), EndAddress); AddressRanges.emplace_back( BBAddressRange{ - BB, + &BB, static_cast(InternalAddressRangeBegin - BBAddress), static_cast(InternalAddressRangeEnd - BBAddress), Data}); diff --git a/bolt/ReorderAlgorithm.cpp b/bolt/ReorderAlgorithm.cpp index 24f22604b08b..ebc79a4106ee 100644 --- a/bolt/ReorderAlgorithm.cpp +++ b/bolt/ReorderAlgorithm.cpp @@ -218,11 +218,10 @@ void PHGreedyClusterAlgorithm::initQueue( // source/destination. This helps to keep original block order for blocks // when optimal order cannot be deducted from a profile. 
if (A.Count == B.Count) { - uint32_t ASrcBBIndex = BF.getIndex(A.Src); - uint32_t BSrcBBIndex = BF.getIndex(B.Src); - if (ASrcBBIndex != BSrcBBIndex) - return ASrcBBIndex > BSrcBBIndex; - return BF.getIndex(A.Dst) > BF.getIndex(B.Dst); + const auto SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src); + return (SrcOrder != 0) + ? SrcOrder > 0 + : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0; } return A.Count < B.Count; }; @@ -306,11 +305,10 @@ void MinBranchGreedyClusterAlgorithm::adjustQueue( // source/destination. This helps to keep original block order for blocks // when optimal order cannot be deduced from a profile. if (Weight[A] == Weight[B]) { - uint32_t ASrcBBIndex = BF.getIndex(A.Src); - uint32_t BSrcBBIndex = BF.getIndex(B.Src); - if (ASrcBBIndex != BSrcBBIndex) - return ASrcBBIndex > BSrcBBIndex; - return BF.getIndex(A.Dst) > BF.getIndex(B.Dst); + const auto SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src); + return (SrcOrder != 0) + ? SrcOrder > 0 + : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0; } return Weight[A] < Weight[B]; }; From 7e574b90b5eb345b04cb4b4564f5e9c21804ed66 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Sep 2016 15:16:11 -0700 Subject: [PATCH 179/904] BOLT: Add ud2 after indirect tailcalls. Summary: Insert ud2 instructions after indirect tailcalls to prevent the CPU from decoding instructions following the callsite. A simple counter in the peephole pass shows 3260 tail call traps inserted. 
(cherry picked from commit 43bb818e27c9dfc1e11820ea8aa557c6c95d5aae) --- bolt/BinaryPasses.cpp | 16 ++++++++++++++++ bolt/BinaryPasses.h | 12 ++++++++++++ 2 files changed, 28 insertions(+) diff --git a/bolt/BinaryPasses.cpp b/bolt/BinaryPasses.cpp index b33a0a480a21..0a9e8d280d85 100644 --- a/bolt/BinaryPasses.cpp +++ b/bolt/BinaryPasses.cpp @@ -1087,6 +1087,20 @@ void Peepholes::fixDoubleJumps(BinaryContext &BC, } } +void Peepholes::addTailcallTraps(BinaryContext &BC, + BinaryFunction &Function) { + for (auto &BB : Function) { + auto *Inst = BB.findLastNonPseudoInstruction(); + if (Inst && BC.MIA->isTailCall(*Inst) && BC.MIA->isIndirectBranch(*Inst)) { + MCInst Trap; + if (BC.MIA->createTrap(Trap)) { + BB.addInstruction(Trap); + ++TailCallTraps; + } + } + } +} + void Peepholes::runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) { @@ -1095,9 +1109,11 @@ void Peepholes::runOnFunctions(BinaryContext &BC, if (shouldOptimize(Function)) { shortenInstructions(BC, Function); fixDoubleJumps(BC, Function); + addTailcallTraps(BC, Function); } } outs() << "BOLT-INFO: " << NumDoubleJumps << " double jumps patched.\n"; + outs() << "BOLT-INFO: " << TailCallTraps << " tail call traps inserted.\n"; } bool SimplifyRODataLoads::simplifyRODataLoads( diff --git a/bolt/BinaryPasses.h b/bolt/BinaryPasses.h index 266cc2762407..660d967dcfdb 100644 --- a/bolt/BinaryPasses.h +++ b/bolt/BinaryPasses.h @@ -257,8 +257,20 @@ class SimplifyConditionalTailCalls : public BinaryFunctionPass { /// Perform simple peephole optimizations. class Peepholes : public BinaryFunctionPass { uint64_t NumDoubleJumps{0}; + uint64_t TailCallTraps{0}; + + /// Attempt to use the minimum operand width for arithmetic, branch and + /// move instructions. void shortenInstructions(BinaryContext &BC, BinaryFunction &Function); + + /// Replace double jumps with a jump directly to the target, i.e. + /// jmp/jcc L1; L1: jmp L2 -> jmp/jcc L2. 
void fixDoubleJumps(BinaryContext &BC, BinaryFunction &Function); + + /// Add trap instructions immediately after indirect tail calls to prevent + /// the processor from decoding instructions immediate following the + /// tailcall. + void addTailcallTraps(BinaryContext &BC, BinaryFunction &Function); public: explicit Peepholes(const cl::opt &PrintPass) : BinaryFunctionPass(PrintPass) { } From 0dacfbe3833861b33521562b507a0c72f916ecee Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 16 Sep 2016 15:54:32 -0700 Subject: [PATCH 180/904] Support for splitting jump tables. Summary: Add level for "-jump-tables=" option: 1 - all jump tables are output in the same section (default). 2 - basic splitting, if the table is used it is output to hot section otherwise to cold one. 3 - aggressively split compound jump tables and collect profile for all entries. Option "-print-jump-tables" outputs all jump tables for debugging and/or analyzing purposes. Use with "-jump-tables=3" to get profile values for every entry in a jump table. 
(cherry picked from commit 8ff060bef5c0352b05b36df8b0d458f8905f9841) --- bolt/BinaryBasicBlock.cpp | 4 +- bolt/BinaryBasicBlock.h | 4 +- bolt/BinaryContext.cpp | 5 +- bolt/BinaryFunction.cpp | 245 +++++++++++++++++++++++++++++++++----- bolt/BinaryFunction.h | 76 +++++++++++- bolt/BinaryPasses.cpp | 10 +- bolt/RewriteInstance.cpp | 88 +++++++------- bolt/RewriteInstance.h | 15 ++- 8 files changed, 353 insertions(+), 94 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index e43cef2cecbc..9e0b3ff9b9d1 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -30,7 +30,7 @@ bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) { return LHS.Offset < RHS.Offset; } -MCInst *BinaryBasicBlock::findFirstNonPseudoInstruction() { +MCInst *BinaryBasicBlock::getFirstNonPseudo() { auto &BC = Function->getBinaryContext(); for (auto &Inst : Instructions) { if (!BC.MII->get(Inst.getOpcode()).isPseudo()) @@ -39,7 +39,7 @@ MCInst *BinaryBasicBlock::findFirstNonPseudoInstruction() { return nullptr; } -MCInst *BinaryBasicBlock::findLastNonPseudoInstruction() { +MCInst *BinaryBasicBlock::getLastNonPseudo() { auto &BC = Function->getBinaryContext(); for (auto Itr = Instructions.rbegin(); Itr != Instructions.rend(); ++Itr) { if (!BC.MII->get(Itr->getOpcode()).isPseudo()) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 71c3d6aa06bf..a96f510560e3 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -382,11 +382,11 @@ class BinaryBasicBlock { /// Return a pointer to the first non-pseudo instruction in this basic /// block. Returns nullptr if none exists. - MCInst *findFirstNonPseudoInstruction(); + MCInst *getFirstNonPseudo(); /// Return a pointer to the last non-pseudo instruction in this basic /// block. Returns nullptr if none exists. - MCInst *findLastNonPseudoInstruction(); + MCInst *getLastNonPseudo(); /// Set minimum alignment for the basic block. 
void setAlignment(uint64_t Align) { diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index f256ad9b1f20..c8815ba8084c 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -314,9 +314,8 @@ void BinaryContext::printInstruction(raw_ostream &OS, } } if (MIA->isIndirectBranch(Instruction)) { - auto JTIndex = MIA->getJumpTableIndex(Instruction); - if (JTIndex != -1LL) { - OS << " # JUMPTABLE " << JTIndex; + if (auto JTAddress = MIA->getJumpTable(Instruction)) { + OS << " # JUMPTABLE @0x" << Twine::utohexstr(JTAddress); } } diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index f90ea956b1dc..be4d5d8a79ff 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -36,17 +36,34 @@ #define DEBUG_TYPE "bolt" using namespace llvm; +using namespace bolt; namespace opts { extern cl::opt Verbosity; extern cl::opt PrintDynoStats; -static cl::opt +static cl::opt JumpTables("jump-tables", - cl::desc("enable jump table support (experimental)"), + cl::desc("jump tables support"), + cl::init(BinaryFunction::JTS_NONE), + cl::values(clEnumValN(BinaryFunction::JTS_NONE, "0", + "do not optimize functions with jump tables"), + clEnumValN(BinaryFunction::JTS_BASIC, "1", + "optimize functions with jump tables"), + clEnumValN(BinaryFunction::JTS_SPLIT, "2", + "split jump tables into hot and cold"), + clEnumValN(BinaryFunction::JTS_AGGRESSIVE, "3", + "aggressively split jump tables (unsafe)"), + clEnumValEnd), cl::ZeroOrMore); +static cl::opt +PrintJumpTables("print-jump-tables", + cl::desc("print jump tables"), + cl::ZeroOrMore, + cl::Hidden); + static cl::opt AgressiveSplitting("split-all-cold", cl::desc("outline as many cold basic blocks as possible"), @@ -235,6 +252,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << "\n Id Fun Addr : 0x" << Twine::utohexstr(IdenticalFunctionAddress); if (opts::PrintDynoStats && !BasicBlocksLayout.empty()) { + OS << '\n'; DynoStats dynoStats = getDynoStats(); OS << dynoStats; } 
@@ -357,14 +375,9 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << '\n'; } - for(unsigned Index = 0; Index < JumpTables.size(); ++Index) { - const auto &JumpTable = JumpTables[Index]; - OS << "Jump Table #" << (Index + 1) << '\n'; - for (unsigned EIndex = 0; EIndex < JumpTable.Entries.size(); ++EIndex) { - const auto *Entry = JumpTable.Entries[EIndex]; - OS << " entry " << EIndex << ": " << Entry->getName() << '\n'; - } - OS << '\n'; + // Print all jump tables. + for (auto &JTI : JumpTables) { + JTI.second.print(OS); } OS << "DWARF CFI Instructions:\n"; @@ -373,7 +386,8 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, for (auto &Elmt : OffsetToCFI) { OS << format(" %08x:\t", Elmt.first); assert(Elmt.second < FrameInstructions.size() && "Incorrect CFI offset"); - BinaryContext::printCFI(OS, FrameInstructions[Elmt.second].getOperation()); + BinaryContext::printCFI(OS, + FrameInstructions[Elmt.second].getOperation()); OS << "\n"; } } else { @@ -523,10 +537,30 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { SegRegNum != bolt::NoRegister || ScaleValue != PtrSize) return IndirectBranchType::UNKNOWN; - auto ArrayStart = DispValue; + auto ArrayStart = static_cast(DispValue); if (BaseRegNum == RIPRegister) ArrayStart += getAddress() + Offset + Size; + // Check if there's already a jump table registered at this address. + if (auto *JT = getJumpTableContainingAddress(ArrayStart)) { + auto JTOffset = ArrayStart - JT->Address; + // Get or create a label. 
+ auto LI = JT->Labels.find(JTOffset); + if (LI == JT->Labels.end()) { + auto *JTStartLabel = BC.Ctx->createTempSymbol("JUMP_TABLE", true); + auto Result = JT->Labels.emplace(JTOffset, JTStartLabel); + assert(Result.second && "error adding jump table label"); + LI = Result.first; + } + + BC.MIA->replaceMemOperandDisp(*MemLocInstr, LI->second, BC.Ctx.get()); + BC.MIA->setJumpTable(Instruction, ArrayStart); + + JTSites.emplace_back(Offset, ArrayStart); + + return IndirectBranchType::POSSIBLE_JUMP_TABLE; + } + auto SectionOrError = BC.getSectionForAddress(ArrayStart); if (!SectionOrError) { // No section - possibly an absolute address. Since we don't allow @@ -552,6 +586,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { uint64_t Value = 0; auto Result = IndirectBranchType::UNKNOWN; std::vector JTLabelCandidates; + std::vector JTOffsetCandidates; while (ValueOffset <= Section.getSize() - PtrSize) { DEBUG(dbgs() << "BOLT-DEBUG: indirect jmp at 0x" << Twine::utohexstr(getAddress() + Offset) @@ -565,7 +600,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { // Is it possible to have a jump table with function start as an entry? 
auto *JTEntry = getOrCreateLocalLabel(Value); JTLabelCandidates.push_back(JTEntry); - TakenBranches.emplace_back(Offset, Value - getAddress()); + JTOffsetCandidates.push_back(Value - getAddress()); Result = IndirectBranchType::POSSIBLE_JUMP_TABLE; continue; } @@ -577,19 +612,26 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { break; } JTLabelCandidates.push_back(getFunctionEndLabel()); + JTOffsetCandidates.push_back(Value - getAddress()); } if (Result == IndirectBranchType::POSSIBLE_JUMP_TABLE) { assert(JTLabelCandidates.size() > 2 && "expected more than 2 jump table entries"); auto *JTStartLabel = BC.Ctx->createTempSymbol("JUMP_TABLE", true); - JumpTables.emplace_back(JumpTable{JTStartLabel, - std::move(JTLabelCandidates)}); - BC.MIA->replaceMemOperandDisp(*MemLocInstr, JTStartLabel, BC.Ctx.get()); - BC.MIA->setJumpTableIndex(Instruction, JumpTables.size()); DEBUG(dbgs() << "BOLT-DEBUG: creating jump table " << JTStartLabel->getName() << " in function " << *this << " with " << JTLabelCandidates.size() << " entries.\n"); + JumpTables.emplace(ArrayStart, JumpTable{ArrayStart, + PtrSize, + std::move(JTLabelCandidates), + std::move(JTOffsetCandidates), + {{0, JTStartLabel}}}); + BC.MIA->replaceMemOperandDisp(*MemLocInstr, JTStartLabel, BC.Ctx.get()); + BC.MIA->setJumpTable(Instruction, ArrayStart); + + JTSites.emplace_back(Offset, ArrayStart); + return Result; } BC.InterproceduralReferences.insert(Value); @@ -727,7 +769,7 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { MIA->convertJmpToTailCall(Instruction); break; case IndirectBranchType::POSSIBLE_JUMP_TABLE: - if (!opts::JumpTables) + if (opts::JumpTables == JTS_NONE) IsSimple = false; break; case IndirectBranchType::UNKNOWN: @@ -771,6 +813,40 @@ bool BinaryFunction::disassemble(ArrayRef FunctionData) { Offset += Size; } + // Update TakenBranches from JumpTables. + // + // We want to do it after initial processing since we don't know jump tables + // boundaries until we process them all. 
+ for (auto &JTSite : JTSites) { + auto JTSiteOffset = JTSite.first; + auto JTAddress = JTSite.second; + auto *JT = getJumpTableContainingAddress(JTAddress); + assert(JT && "cannot find jump table for address"); + uint32_t EI = (JTAddress - JT->Address) / JT->EntrySize; + while (EI < JT->Entries.size()) { + auto TargetOffset = JT->OffsetEntries[EI]; + if (TargetOffset < getSize()) + TakenBranches.emplace_back(JTSiteOffset, TargetOffset); + ++EI; + // A label at the next entry means the end of this jump table. + if (JT->Labels.count(EI * JT->EntrySize)) + break; + } + } + + // Free memory used by jump table offsets. + for (auto &JTI : JumpTables) { + auto &JT = JTI.second; + clearList(JT.OffsetEntries); + } + + // Remove duplicates branches. We can get a bunch of them from jump tables. + // Without doing jump table value profiling we don't have use for extra + // (duplicate) branches. + std::sort(TakenBranches.begin(), TakenBranches.end()); + auto NewEnd = std::unique(TakenBranches.begin(), TakenBranches.end()); + TakenBranches.erase(NewEnd, TakenBranches.end()); + // TODO: clear memory if not simple function? // Update state. @@ -793,7 +869,7 @@ bool BinaryFunction::postProcessIndirectBranches() { } // Validate the tail call assumptions. - if (BC.MIA->isTailCall(Instr) || (BC.MIA->getJumpTableIndex(Instr) > 0)) { + if (BC.MIA->isTailCall(Instr) || BC.MIA->getJumpTable(Instr)) { if (BC.MIA->getMemoryOperandNo(Instr) != -1) { // We have validated memory contents addressed by the jump // instruction already. @@ -1063,6 +1139,34 @@ bool BinaryFunction::buildCFG() { } else { const BranchInfo &BInfo = BranchInfoOrErr.get(); FromBB->addSuccessor(ToBB, BInfo.Branches, BInfo.Mispreds); + // Populate profile counts for the jump table. 
+ auto *LastInstr = FromBB->getLastNonPseudo(); + if (!LastInstr) + continue; + auto JTAddress = BC.MIA->getJumpTable(*LastInstr); + if (!JTAddress) + continue; + auto *JT = getJumpTableContainingAddress(JTAddress); + if (!JT) + continue; + JT->Count += BInfo.Branches; + if (opts::JumpTables < JTS_AGGRESSIVE) + continue; + if (JT->Counts.empty()) + JT->Counts.resize(JT->Entries.size()); + auto EI = JT->Entries.begin(); + auto Delta = (JTAddress - JT->Address) / JT->EntrySize; + EI += Delta; + while (EI != JT->Entries.end()) { + if (ToBB->getLabel() == *EI) { + JT->Counts[Delta] += BInfo.Branches; + } + ++Delta; + ++EI; + // A label marks the start of another jump table. + if (JT->Labels.count(Delta * JT->EntrySize)) + break; + } } } } @@ -1311,7 +1415,7 @@ void BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { (float) (LocalProfileBranches.size() - OrphanBranches.size()) / (float) LocalProfileBranches.size(); - if (opts::Verbosity >= 2 && !OrphanBranches.empty()) { + if (opts::Verbosity >= 1 && !OrphanBranches.empty()) { errs() << "BOLT-WARNING: profile branches match only " << format("%.1f%%", ProfileMatchRatio * 100.0f) << " (" << (LocalProfileBranches.size() - OrphanBranches.size()) << '/' @@ -1322,6 +1426,7 @@ void BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { errs() << "\t0x" << Twine::utohexstr(OBranch.first) << " -> 0x" << Twine::utohexstr(OBranch.second) << " (0x" << Twine::utohexstr(OBranch.first + getAddress()) << " -> 0x" + << Twine::utohexstr(OBranch.second + getAddress()) << ")\n"; ); } } @@ -1868,8 +1973,8 @@ void BinaryFunction::dumpGraph(raw_ostream& OS) const { CondBranch, UncondBranch); - const auto *LastInstr = BB->findLastNonPseudoInstruction(); - const bool IsJumpTable = LastInstr && BC.MIA->getJumpTableIndex(*LastInstr) > 0; + const auto *LastInstr = BB->getLastNonPseudo(); + const bool IsJumpTable = LastInstr && BC.MIA->getJumpTable(*LastInstr); auto BI = BB->branch_info_begin(); for (auto *Succ 
: BB->successors()) { @@ -2551,18 +2656,92 @@ BinaryFunction::~BinaryFunction() { void BinaryFunction::emitJumpTables(MCStreamer *Streamer) { if (JumpTables.empty()) return; + if (opts::PrintJumpTables) { + outs() << "BOLT-INFO: jump tables for function " << *this << ":\n"; + } + for (auto &JTI : JumpTables) { + auto &JT = JTI.second; + if (opts::PrintJumpTables) + JT.print(outs()); + JT.emit(Streamer, + BC.MOFI->getReadOnlySection(), + BC.MOFI->getReadOnlyColdSection()); + } +} + +// TODO (#9806207): based on jump table type (PIC vs non-PIC etc.) we will +// need to emit different references. +uint64_t BinaryFunction::JumpTable::emit(MCStreamer *Streamer, + MCSection *HotSection, + MCSection *ColdSection) { + // Pre-process entries for aggressive splitting. + // Each label represents a separate switch table and gets its own count + // determining its destination. + std::map LabelCounts; + if (opts::JumpTables > JTS_SPLIT && !Counts.empty()) { + MCSymbol *CurrentLabel = Labels[0]; + uint64_t CurrentLabelCount = 0; + for (unsigned Index = 0; Index < Entries.size(); ++Index) { + auto LI = Labels.find(Index * EntrySize); + if (LI != Labels.end()) { + LabelCounts[CurrentLabel] = CurrentLabelCount; + CurrentLabel = LI->second; + CurrentLabelCount = 0; + } + CurrentLabelCount += Counts[Index]; + } + LabelCounts[CurrentLabel] = CurrentLabelCount; + } else { + Streamer->SwitchSection(Count > 0 ? HotSection : ColdSection); + Streamer->EmitValueToAlignment(EntrySize); + } + uint64_t Offset = 0; + for (auto *Entry : Entries) { + auto LI = Labels.find(Offset); + if (LI != Labels.end()) { + DEBUG(dbgs() << "BOLT-DEBUG: emitting jump table " + << LI->second->getName() << " (originally was at address 0x" + << Twine::utohexstr(Address + Offset) + << (Offset ? 
"as part of larger jump table\n" : "\n")); + if (!LabelCounts.empty()) { + DEBUG(dbgs() << "BOLT-DEBUG: jump table count: " + << LabelCounts[LI->second] << '\n'); + if (LabelCounts[LI->second] > 0) { + Streamer->SwitchSection(HotSection); + } else { + Streamer->SwitchSection(ColdSection); + } + Streamer->EmitValueToAlignment(EntrySize); + } + Streamer->EmitLabel(LI->second); + } + Streamer->EmitSymbolValue(Entry, EntrySize); + Offset += EntrySize; + } - Streamer->SwitchSection(BC.MOFI->getReadOnlySection()); - for (auto &JumpTable : JumpTables) { - DEBUG(dbgs() << "BOLT-DEBUG: emitting jump table " - << JumpTable.StartLabel->getName() << '\n'); - Streamer->EmitLabel(JumpTable.StartLabel); - // TODO (#9806207): based on jump table type (PIC vs non-PIC etc.) - // we would need to emit different references. - for (auto *Entry : JumpTable.Entries) { - Streamer->EmitSymbolValue(Entry, BC.AsmInfo->getPointerSize()); + return Offset; +} + +void BinaryFunction::JumpTable::print(raw_ostream &OS) const { + uint64_t Offset = 0; + for (const auto *Entry : Entries) { + auto LI = Labels.find(Offset); + if (LI != Labels.end()) { + OS << "Jump Table " << LI->second->getName() << " at @0x" + << Twine::utohexstr(Address+Offset); + if (Offset) { + OS << " (possibly part of larger jump table):\n"; + } else { + OS << " with total count of " << Count << ":\n"; + } } + OS << format(" 0x%04" PRIx64 " : ", Offset) << Entry->getName(); + if (!Counts.empty()) + OS << " : " << Counts[Offset / EntrySize]; + OS << '\n'; + Offset += EntrySize; } + OS << "\n\n"; } void BinaryFunction::calculateLoopInfo() { @@ -2738,8 +2917,8 @@ DynoStats BinaryFunction::getDynoStats() const { Stats[DynoStats::INSTRUCTIONS] += BB->getNumNonPseudos() * BBExecutionCount; // Jump tables. 
- const auto *LastInstr = BB->findLastNonPseudoInstruction(); - if (BC.MIA->getJumpTableIndex(*LastInstr) > 0) { + const auto *LastInstr = BB->getLastNonPseudo(); + if (BC.MIA->getJumpTable(*LastInstr)) { Stats[DynoStats::JUMP_TABLE_BRANCHES] += BBExecutionCount; DEBUG( static uint64_t MostFrequentJT; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 775ab769aef4..d4b8ac9a4ddd 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -185,6 +185,13 @@ class BinaryFunction : public AddressRangesOwner { LT_OPTIMIZE_SHUFFLE, }; + enum JumpTableSupportLevel : char { + JTS_NONE = 0, /// Disable jump tables support + JTS_BASIC = 1, /// Enable basic jump tables support + JTS_SPLIT = 2, /// Enable hot/cold splitting of jump tables + JTS_AGGRESSIVE = 3, /// Aggressive splitting of jump tables + }; + static constexpr uint64_t COUNT_NO_PROFILE = std::numeric_limits::max(); // Function size, in number of BBs, above which we fallback to a heuristic @@ -429,11 +436,76 @@ class BinaryFunction : public AddressRangesOwner { CFIInstrMapType CIEFrameInstructions; /// Representation of a jump table. + /// + /// The jump table may include other jump tables that are referenced by + /// a different label at a different offset in this jump table. struct JumpTable { - MCSymbol *StartLabel; + /// Original address. + uint64_t Address; + + /// Size of the entry used for storage. + std::size_t EntrySize; + + /// All the entries as labels. std::vector Entries; + + /// All the entries as offsets into a function. Invalid after CFG is built. 
+ std::vector OffsetEntries; + + /// Map -> Hasher; + size_t Seed = Hasher(Val.first); + hashCombine(Seed, Val.second); + return Seed; + } +}; + +} + +void ClusterAlgorithm::computeClusterAverageFrequency() { + AvgFreq.resize(Clusters.size(), 0.0); + for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { + double Freq = 0.0; + for (auto BB : Clusters[I]) { + if (BB->getNumNonPseudos() > 0) + Freq += ((double) BB->getExecutionCount()) / BB->getNumNonPseudos(); + } + AvgFreq[I] = Freq; + } +} + +void ClusterAlgorithm::printClusters() const { + for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { + errs() << "Cluster number " << I; + if (AvgFreq.size() == Clusters.size()) + errs() << " (frequency: " << AvgFreq[I] << ")"; + errs() << " : "; + auto Sep = ""; + for (auto BB : Clusters[I]) { + errs() << Sep << BB->getName(); + Sep = ", "; + } + errs() << "\n"; + } +} + +void ClusterAlgorithm::reset() { + Clusters.clear(); + ClusterEdges.clear(); + AvgFreq.clear(); +} + +void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const { + OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count; +} + +size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const { + HashPair Hasher; + return Hasher(std::make_pair(E.Src, E.Dst)); +} + +bool GreedyClusterAlgorithm::EdgeEqual::operator()( + const EdgeTy &A, const EdgeTy &B) const { + return A.Src == B.Src && A.Dst == B.Dst; +} + +void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF, + bool ComputeEdges) { + reset(); + + // Greedy heuristic implementation for the TSP, applied to BB layout. Try to + // maximize weight during a path traversing all BBs. In this way, we will + // convert the hottest branches into fall-throughs. + + // This is the queue of edges from which we will pop edges and use them to + // cluster basic blocks in a greedy fashion. + std::vector Queue; + + // Initialize inter-cluster weights. 
+ if (ComputeEdges) + ClusterEdges.resize(BF.layout_size()); + + // Initialize clusters and edge queue. + for (auto BB : BF.layout()) { + // Create a cluster for this BB. + uint32_t I = Clusters.size(); + Clusters.emplace_back(); + auto &Cluster = Clusters.back(); + Cluster.push_back(BB); + BBToClusterMap[BB] = I; + // Populate priority queue with edges. + auto BI = BB->branch_info_begin(); + for (auto &I : BB->successors()) { + assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + "attempted reordering blocks of function with no profile data"); + Queue.emplace_back(EdgeTy(BB, I, BI->Count)); + ++BI; + } + } + // Sort and adjust the edge queue. + initQueue(Queue, BF); + + // Grow clusters in a greedy fashion. + while (!Queue.empty()) { + auto E = Queue.back(); + Queue.pop_back(); + + const auto *SrcBB = E.Src; + const auto *DstBB = E.Dst; + + DEBUG(dbgs() << "Popped edge "; + E.print(dbgs()); + dbgs() << "\n"); + + // Case 1: BBSrc and BBDst are the same. Ignore this edge + if (SrcBB == DstBB || DstBB == *BF.layout_begin()) { + DEBUG(dbgs() << "\tIgnored (same src, dst)\n"); + continue; + } + + int I = BBToClusterMap[SrcBB]; + int J = BBToClusterMap[DstBB]; + + // Case 2: If they are already allocated at the same cluster, just increase + // the weight of this cluster + if (I == J) { + if (ComputeEdges) + ClusterEdges[I][I] += E.Count; + DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n"); + continue; + } + + auto &ClusterA = Clusters[I]; + auto &ClusterB = Clusters[J]; + if (areClustersCompatible(ClusterA, ClusterB, E)) { + // Case 3: SrcBB is at the end of a cluster and DstBB is at the start, + // allowing us to merge two clusters. 
+ for (auto BB : ClusterB) + BBToClusterMap[BB] = I; + ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end()); + ClusterB.clear(); + if (ComputeEdges) { + // Increase the intra-cluster edge count of cluster A with the count of + // this edge as well as with the total count of previously visited edges + // from cluster B cluster A. + ClusterEdges[I][I] += E.Count; + ClusterEdges[I][I] += ClusterEdges[J][I]; + // Iterate through all inter-cluster edges and transfer edges targeting + // cluster B to cluster A. + for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K) + ClusterEdges[K][I] += ClusterEdges[K][J]; + } + // Adjust the weights of the remaining edges and re-sort the queue. + adjustQueue(Queue, BF); + DEBUG(dbgs() << "\tMerged clusters of src, dst\n"); + } else { + // Case 4: Both SrcBB and DstBB are allocated in positions we cannot + // merge them. Add the count of this edge to the inter-cluster edge count + // between clusters A and B to help us decide ordering between these + // clusters. + if (ComputeEdges) + ClusterEdges[I][J] += E.Count; + DEBUG(dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n"); + } + } +} + +void GreedyClusterAlgorithm::reset() { + ClusterAlgorithm::reset(); + BBToClusterMap.clear(); +} + +void PHGreedyClusterAlgorithm::initQueue( + std::vector &Queue, const BinaryFunction &BF) { + // Define a comparison function to establish SWO between edges. + auto Comp = [&BF] (const EdgeTy &A, const EdgeTy &B) { + // With equal weights, prioritize branches with lower index + // source/destination. This helps to keep original block order for blocks + // when optimal order cannot be deducted from a profile. + if (A.Count == B.Count) { + const auto SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src); + return (SrcOrder != 0) + ? SrcOrder > 0 + : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0; + } + return A.Count < B.Count; + }; + + // Sort edges in increasing profile count order. 
+ std::sort(Queue.begin(), Queue.end(), Comp); +} + +void PHGreedyClusterAlgorithm::adjustQueue( + std::vector &Queue, const BinaryFunction &BF) { + // Nothing to do. + return; +} + +bool PHGreedyClusterAlgorithm::areClustersCompatible( + const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const { + return Front.back() == E.Src && Back.front() == E.Dst; +} + +int64_t MinBranchGreedyClusterAlgorithm::calculateWeight( + const EdgeTy &E, const BinaryFunction &BF) const { + const BinaryBasicBlock *SrcBB = E.Src; + const BinaryBasicBlock *DstBB = E.Dst; + + // Initial weight value. + int64_t W = (int64_t)E.Count; + + // Adjust the weight by taking into account other edges with the same source. + auto BI = SrcBB->branch_info_begin(); + for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) { + assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + "attempted reordering blocks of function with no profile data"); + assert(BI->Count <= std::numeric_limits::max() && + "overflow detected"); + // Ignore edges with same source and destination, edges that target the + // entry block as well as the edge E itself. + if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB) + W -= (int64_t)BI->Count; + ++BI; + } + + // Adjust the weight by taking into account other edges with the same + // destination. + for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) { + // Ignore edges with same source and destination as well as the edge E + // itself. 
+ if (PredBB == DstBB || PredBB == SrcBB) + continue; + auto BI = PredBB->branch_info_begin(); + for (const BinaryBasicBlock *SuccBB : PredBB->successors()) { + if (SuccBB == DstBB) + break; + ++BI; + } + assert(BI != PredBB->branch_info_end() && "invalid control flow graph"); + assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + "attempted reordering blocks of function with no profile data"); + assert(BI->Count <= std::numeric_limits::max() && + "overflow detected"); + W -= (int64_t)BI->Count; + } + + return W; +} + +void MinBranchGreedyClusterAlgorithm::initQueue( + std::vector &Queue, const BinaryFunction &BF) { + // Initialize edge weights. + for (const EdgeTy &E : Queue) + Weight.emplace(std::make_pair(E, calculateWeight(E, BF))); + + // Sort edges in increasing weight order. + adjustQueue(Queue, BF); +} + +void MinBranchGreedyClusterAlgorithm::adjustQueue( + std::vector &Queue, const BinaryFunction &BF) { + // Define a comparison function to establish SWO between edges. + auto Comp = [&] (const EdgeTy &A, const EdgeTy &B) { + // With equal weights, prioritize branches with lower index + // source/destination. This helps to keep original block order for blocks + // when optimal order cannot be deduced from a profile. + if (Weight[A] == Weight[B]) { + const auto SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src); + return (SrcOrder != 0) + ? SrcOrder > 0 + : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0; + } + return Weight[A] < Weight[B]; + }; + + // Iterate through all remaining edges to find edges that have their + // source and destination in the same cluster. + std::vector NewQueue; + for (const EdgeTy &E : Queue) { + const auto *SrcBB = E.Src; + const auto *DstBB = E.Dst; + + // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore + // this edge. 
+ if (SrcBB == DstBB || DstBB == *BF.layout_begin()) { + DEBUG(dbgs() << "\tAdjustment: Ignored edge "; + E.print(dbgs()); + dbgs() << " (same src, dst)\n"); + continue; + } + + int I = BBToClusterMap[SrcBB]; + int J = BBToClusterMap[DstBB]; + auto &ClusterA = Clusters[I]; + auto &ClusterB = Clusters[J]; + + // Case 2: They are already allocated at the same cluster or incompatible + // clusters. Adjust the weights of edges with the same source or + // destination, so that this edge has no effect on them any more, and ignore + // this edge. Also increase the intra- (or inter-) cluster edge count. + if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) { + if (!ClusterEdges.empty()) + ClusterEdges[I][J] += E.Count; + DEBUG(dbgs() << "\tAdjustment: Ignored edge "; + E.print(dbgs()); + dbgs() << " (src, dst belong to same cluster or incompatible " + "clusters)\n"); + for (const auto *SuccBB : SrcBB->successors()) { + if (SuccBB == DstBB) + continue; + auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0)); + assert(WI != Weight.end() && "CFG edge not found in Weight map"); + WI->second += (int64_t)E.Count; + } + for (const auto *PredBB : DstBB->predecessors()) { + if (PredBB == SrcBB) + continue; + auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0)); + assert(WI != Weight.end() && "CFG edge not found in Weight map"); + WI->second += (int64_t)E.Count; + } + continue; + } + + // Case 3: None of the previous cases is true, so just keep this edge in + // the queue. + NewQueue.emplace_back(E); + } + + // Sort remaining edges in increasing weight order. 
+ Queue.swap(NewQueue); + std::sort(Queue.begin(), Queue.end(), Comp); +} + +bool MinBranchGreedyClusterAlgorithm::areClustersCompatible( + const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const { + return Front.back() == E.Src && Back.front() == E.Dst; +} + +void MinBranchGreedyClusterAlgorithm::reset() { + GreedyClusterAlgorithm::reset(); + Weight.clear(); +} + +void OptimalReorderAlgorithm::reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const { + std::vector> Weight; + std::unordered_map BBToIndex; + std::vector IndexToBB; + + unsigned N = BF.layout_size(); + // Populating weight map and index map + for (auto BB : BF.layout()) { + BBToIndex[BB] = IndexToBB.size(); + IndexToBB.push_back(BB); + } + Weight.resize(N); + for (auto BB : BF.layout()) { + auto BI = BB->branch_info_begin(); + Weight[BBToIndex[BB]].resize(N); + for (auto I : BB->successors()) { + if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) + Weight[BBToIndex[BB]][BBToIndex[I]] = BI->Count; + ++BI; + } + } + + std::vector> DP; + DP.resize(1 << N); + for (auto &Elmt : DP) { + Elmt.resize(N, -1); + } + // Start with the entry basic block being allocated with cost zero + DP[1][0] = 0; + // Walk through TSP solutions using a bitmask to represent state (current set + // of BBs in the layout) + unsigned BestSet = 1; + unsigned BestLast = 0; + int64_t BestWeight = 0; + for (unsigned Set = 1; Set < (1U << N); ++Set) { + // Traverse each possibility of Last BB visited in this layout + for (unsigned Last = 0; Last < N; ++Last) { + // Case 1: There is no possible layout with this BB as Last + if (DP[Set][Last] == -1) + continue; + + // Case 2: There is a layout with this Set and this Last, and we try + // to expand this set with New + for (unsigned New = 1; New < N; ++New) { + // Case 2a: BB "New" is already in this Set + if ((Set & (1 << New)) != 0) + continue; + + // Case 2b: BB "New" is not in this set and we add it to this Set and + // record total weight of this 
layout with "New" as the last BB. + unsigned NewSet = (Set | (1 << New)); + if (DP[NewSet][New] == -1) + DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New]; + DP[NewSet][New] = std::max(DP[NewSet][New], + DP[Set][Last] + (int64_t)Weight[Last][New]); + + if (DP[NewSet][New] > BestWeight) { + BestWeight = DP[NewSet][New]; + BestSet = NewSet; + BestLast = New; + } + } + } + } + + // Define final function layout based on layout that maximizes weight + unsigned Last = BestLast; + unsigned Set = BestSet; + std::vector Visited; + Visited.resize(N); + Visited[Last] = true; + Order.push_back(IndexToBB[Last]); + Set = Set & ~(1U << Last); + while (Set != 0) { + int64_t Best = -1; + for (unsigned I = 0; I < N; ++I) { + if (DP[Set][I] == -1) + continue; + if (DP[Set][I] > Best) { + Last = I; + Best = DP[Set][I]; + } + } + Visited[Last] = true; + Order.push_back(IndexToBB[Last]); + Set = Set & ~(1U << Last); + } + std::reverse(Order.begin(), Order.end()); + + // Finalize layout with BBs that weren't assigned to the layout + for (auto BB : BF.layout()) { + if (Visited[BBToIndex[BB]] == false) + Order.push_back(BB); + } +} + +void OptimizeReorderAlgorithm::reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const { + if (BF.layout_empty()) + return; + + // Cluster basic blocks. + CAlgo->clusterBasicBlocks(BF); + + if (opts::PrintClusters) + CAlgo->printClusters(); + + // Arrange basic blocks according to clusters. + for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters) + Order.insert(Order.end(), Cluster.begin(), Cluster.end()); +} + +void OptimizeBranchReorderAlgorithm::reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const { + if (BF.layout_empty()) + return; + + // Cluster basic blocks. + CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */true); + std::vector &Clusters = CAlgo->Clusters; + auto &ClusterEdges = CAlgo->ClusterEdges; + + // Compute clusters' average frequencies. 
+ CAlgo->computeClusterAverageFrequency(); + std::vector &AvgFreq = CAlgo->AvgFreq; + + if (opts::PrintClusters) + CAlgo->printClusters(); + + // Cluster layout order + std::vector ClusterOrder; + + // Do a topological sort for clusters, prioritizing frequently-executed BBs + // during the traversal. + std::stack Stack; + std::vector Status; + std::vector Parent; + Status.resize(Clusters.size(), 0); + Parent.resize(Clusters.size(), 0); + constexpr uint32_t STACKED = 1; + constexpr uint32_t VISITED = 2; + Status[0] = STACKED; + Stack.push(0); + while (!Stack.empty()) { + uint32_t I = Stack.top(); + if (!(Status[I] & VISITED)) { + Status[I] |= VISITED; + // Order successors by weight + auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) { + return ClusterEdges[I][A] > ClusterEdges[I][B]; + }; + std::priority_queue, + decltype(ClusterComp)> SuccQueue(ClusterComp); + for (auto &Target: ClusterEdges[I]) { + if (Target.second > 0 && !(Status[Target.first] & STACKED) && + !Clusters[Target.first].empty()) { + Parent[Target.first] = I; + Status[Target.first] = STACKED; + SuccQueue.push(Target.first); + } + } + while (!SuccQueue.empty()) { + Stack.push(SuccQueue.top()); + SuccQueue.pop(); + } + continue; + } + // Already visited this node + Stack.pop(); + ClusterOrder.push_back(I); + } + std::reverse(ClusterOrder.begin(), ClusterOrder.end()); + // Put unreachable clusters at the end + for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) + if (!(Status[I] & VISITED) && !Clusters[I].empty()) + ClusterOrder.push_back(I); + + // Sort nodes with equal precedence + auto Beg = ClusterOrder.begin(); + // Don't reorder the first cluster, which contains the function entry point + ++Beg; + std::stable_sort(Beg, ClusterOrder.end(), + [&AvgFreq, &Parent](uint32_t A, uint32_t B) { + uint32_t P = Parent[A]; + while (Parent[P] != 0) { + if (Parent[P] == B) + return false; + P = Parent[P]; + } + P = Parent[B]; + while (Parent[P] != 0) { + if (Parent[P] == A) + return true; + P = 
Parent[P]; + } + return AvgFreq[A] > AvgFreq[B]; + }); + + if (opts::PrintClusters) { + errs() << "New cluster order: "; + auto Sep = ""; + for (auto O : ClusterOrder) { + errs() << Sep << O; + Sep = ", "; + } + errs() << '\n'; + } + + // Arrange basic blocks according to cluster order. + for (uint32_t ClusterIndex : ClusterOrder) { + ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; + Order.insert(Order.end(), Cluster.begin(), Cluster.end()); + } +} + +void OptimizeCacheReorderAlgorithm::reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const { + if (BF.layout_empty()) + return; + + // Cluster basic blocks. + CAlgo->clusterBasicBlocks(BF); + std::vector &Clusters = CAlgo->Clusters; + + // Compute clusters' average frequencies. + CAlgo->computeClusterAverageFrequency(); + std::vector &AvgFreq = CAlgo->AvgFreq; + + if (opts::PrintClusters) + CAlgo->printClusters(); + + // Cluster layout order + std::vector ClusterOrder; + + // Order clusters based on average instruction execution frequency + for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) + if (!Clusters[I].empty()) + ClusterOrder.push_back(I); + // Don't reorder the first cluster, which contains the function entry point + std::stable_sort(std::next(ClusterOrder.begin()), + ClusterOrder.end(), + [&AvgFreq](uint32_t A, uint32_t B) { + return AvgFreq[A] > AvgFreq[B]; + }); + + if (opts::PrintClusters) { + errs() << "New cluster order: "; + auto Sep = ""; + for (auto O : ClusterOrder) { + errs() << Sep << O; + Sep = ", "; + } + errs() << '\n'; + } + + // Arrange basic blocks according to cluster order. 
+ for (uint32_t ClusterIndex : ClusterOrder) { + ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; + Order.insert(Order.end(), Cluster.begin(), Cluster.end()); + } +} + +void ReverseReorderAlgorithm::reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const { + if (BF.layout_empty()) + return; + + auto FirstBB = *BF.layout_begin(); + Order.push_back(FirstBB); + for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI) + Order.push_back(*RLI); +} + + +void RandomClusterReorderAlgorithm::reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const { + if (BF.layout_empty()) + return; + + // Cluster basic blocks. + CAlgo->clusterBasicBlocks(BF); + std::vector &Clusters = CAlgo->Clusters; + + if (opts::PrintClusters) + CAlgo->printClusters(); + + // Cluster layout order + std::vector ClusterOrder; + + // Order clusters based on average instruction execution frequency + for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) + if (!Clusters[I].empty()) + ClusterOrder.push_back(I); + + std::srand(opts::RandomSeed); + std::random_shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end()); + + if (opts::PrintClusters) { + errs() << "New cluster order: "; + auto Sep = ""; + for (auto O : ClusterOrder) { + errs() << Sep << O; + Sep = ", "; + } + errs() << '\n'; + } + + // Arrange basic blocks according to cluster order. + for (uint32_t ClusterIndex : ClusterOrder) { + ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; + Order.insert(Order.end(), Cluster.begin(), Cluster.end()); + } +} diff --git a/bolt/Passes/ReorderAlgorithm.h b/bolt/Passes/ReorderAlgorithm.h new file mode 100644 index 000000000000..fd50a6c311e5 --- /dev/null +++ b/bolt/Passes/ReorderAlgorithm.h @@ -0,0 +1,268 @@ +// Passes/ReorderAlgorithm.h - Interface for basic block reorderng algorithms // +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Interface to different basic block reordering algorithms. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_ALGORITHM_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_ALGORITHM_H + +#include "BinaryFunction.h" +#include "llvm/Support/ErrorHandling.h" +#include +#include +#include + + +namespace llvm { + +class raw_ostream; + + +namespace bolt { + +class BinaryBasicBlock; +class BinaryFunction; + +/// Objects of this class implement various basic block clustering algorithms. +/// Basic block clusters are chains of basic blocks that should be laid out +/// in this order to maximize performace. These algorithms group basic blocks +/// into clusters using execution profile data and various heuristics. +class ClusterAlgorithm { +public: + using ClusterTy = std::vector; + std::vector Clusters; + std::vector> ClusterEdges; + std::vector AvgFreq; + + /// Group the basic blocks in the given function into clusters stored in the + /// Clusters vector. Also encode relative weights between two clusters in + /// the ClusterEdges vector if requested. This vector is indexed by + /// the clusters indices in the Clusters vector. + virtual void clusterBasicBlocks(const BinaryFunction &BF, + bool ComputeEdges = false) = 0; + + /// Compute for each cluster its averagae execution frequency, that is + /// the sum of average frequencies of its blocks (execution count / # instrs). + /// The average frequencies are stored in the AvgFreq vector, index by the + /// cluster indices in the Clusters vector. + void computeClusterAverageFrequency(); + + /// Clear clusters and related info. 
+ virtual void reset(); + + void printClusters() const; + + virtual ~ClusterAlgorithm() {} +}; + +/// Base class for a greedy clustering algorithm that selects edges in order +/// based on some heuristic and uses them to join basic blocks into clusters. +class GreedyClusterAlgorithm : public ClusterAlgorithm { +protected: + // Represents an edge between two basic blocks, with source, destination, and + // profile count. + struct EdgeTy { + const BinaryBasicBlock *Src; + const BinaryBasicBlock *Dst; + uint64_t Count; + + EdgeTy(const BinaryBasicBlock *Src, const BinaryBasicBlock *Dst, + uint64_t Count) : + Src(Src), Dst(Dst), Count(Count) {} + + void print(raw_ostream &OS) const; + }; + + struct EdgeHash { + size_t operator() (const EdgeTy &E) const; + }; + + struct EdgeEqual { + bool operator() (const EdgeTy &A, const EdgeTy &B) const; + }; + + // Virtual methods that allow custom specialization of the heuristic used by + // the algorithm to select edges. + virtual void initQueue( + std::vector &Queue, const BinaryFunction &BF) = 0; + virtual void adjustQueue( + std::vector &Queue, const BinaryFunction &BF) = 0; + virtual bool areClustersCompatible( + const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const = 0; + + // Map from basic block to owning cluster index. + using BBToClusterMapTy = std::unordered_map; + BBToClusterMapTy BBToClusterMap; + +public: + void clusterBasicBlocks(const BinaryFunction &BF, + bool ComputeEdges = false) override; + void reset() override; +}; + + +/// This clustering algorithm is based on a greedy heuristic suggested by +/// Pettis and Hansen (PLDI '90). 
+class PHGreedyClusterAlgorithm : public GreedyClusterAlgorithm { +protected: + void initQueue( + std::vector &Queue, const BinaryFunction &BF) override; + void adjustQueue( + std::vector &Queue, const BinaryFunction &BF) override; + bool areClustersCompatible( + const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const + override; +}; + + +/// This clustering algorithm is based on a greedy heuristic that is a +/// modification of the heuristic suggested by Pettis (PLDI '90). It is +/// geared towards minimizing branches. +class MinBranchGreedyClusterAlgorithm : public GreedyClusterAlgorithm { +private: + // Map from an edge to its weight which is used by the algorithm to sort the + // edges. + std::unordered_map Weight; + + // The weight of an edge is calculated as the win in branches if we choose + // to layout this edge as a fall-through. For example, consider the edges + // A -> B with execution count 500, + // A -> C with execution count 100, and + // D -> B with execution count 150 + // wher B, C are the only successors of A and A, D are thr only predessecors + // of B. Then if we choose to layout edge A -> B as a fallthrough, the win in + // branches would be 500 - 100 - 150 = 250. That is the weight of edge A->B. + int64_t calculateWeight(const EdgeTy &E, const BinaryFunction &BF) const; + +protected: + void initQueue( + std::vector &Queue, const BinaryFunction &BF) override; + void adjustQueue( + std::vector &Queue, const BinaryFunction &BF) override; + bool areClustersCompatible( + const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const + override; + +public: + void reset() override; +}; + + +/// Objects of this class implement various basic block reordering alogrithms. +/// Most of these algorithms depend on a clustering alogrithm. +/// Here we have 3 conflicting goals as to how to layout clusters. If we want +/// to minimize jump offsets, we should put clusters with heavy inter-cluster +/// dependence as close as possible. 
If we want to maximize the probability +/// that all inter-cluster edges are predicted as not-taken, we should enforce +/// a topological order to make targets appear after sources, creating forward +/// branches. If we want to separate hot from cold blocks to maximize the +/// probability that unfrequently executed code doesn't pollute the cache, we +/// should put clusters in descending order of hotness. +class ReorderAlgorithm { +protected: + std::unique_ptr CAlgo; + +public: + ReorderAlgorithm() { } + explicit ReorderAlgorithm(std::unique_ptr CAlgo) : + CAlgo(std::move(CAlgo)) { } + + using BasicBlockOrder = BinaryFunction::BasicBlockOrderType; + + /// Reorder the basic blocks of the given function and store the new order in + /// the new Clusters vector. + virtual void reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const = 0; + + void setClusterAlgorithm(ClusterAlgorithm *CAlgo) { + this->CAlgo.reset(CAlgo); + } + + virtual ~ReorderAlgorithm() { } +}; + + +/// Dynamic programming implementation for the TSP, applied to BB layout. Find +/// the optimal way to maximize weight during a path traversing all BBs. In +/// this way, we will convert the hottest branches into fall-throughs. +/// +/// Uses exponential amount of memory on the number of basic blocks and should +/// only be used for small functions. +class OptimalReorderAlgorithm : public ReorderAlgorithm { +public: + void reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const override; +}; + + +/// Simple algorithm that groups basic blocks into clusters and then +/// lays them out cluster after cluster. 
+class OptimizeReorderAlgorithm : public ReorderAlgorithm { +public: + explicit OptimizeReorderAlgorithm(std::unique_ptr CAlgo) : + ReorderAlgorithm(std::move(CAlgo)) { } + + void reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const override; +}; + + +/// This reorder algorithm tries to ensure that all inter-cluster edges are +/// predicted as not-taken, by enforcing a topological order to make +/// targets appear after sources, creating forward branches. +class OptimizeBranchReorderAlgorithm : public ReorderAlgorithm { +public: + explicit OptimizeBranchReorderAlgorithm( + std::unique_ptr CAlgo) : + ReorderAlgorithm(std::move(CAlgo)) { } + + void reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const override; +}; + + +/// This reorder tries to separate hot from cold blocks to maximize the +/// probability that unfrequently executed code doesn't pollute the cache, by +/// putting clusters in descending order of hotness. +class OptimizeCacheReorderAlgorithm : public ReorderAlgorithm { +public: + explicit OptimizeCacheReorderAlgorithm( + std::unique_ptr CAlgo) : + ReorderAlgorithm(std::move(CAlgo)) { } + + void reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const override; +}; + + +/// Toy example that simply reverses the original basic block order. +class ReverseReorderAlgorithm : public ReorderAlgorithm { +public: + void reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const override; +}; + +/// Create clusters as usual and place them in random order. 
+class RandomClusterReorderAlgorithm : public ReorderAlgorithm { +public: + explicit RandomClusterReorderAlgorithm( + std::unique_ptr CAlgo) : + ReorderAlgorithm(std::move(CAlgo)) { } + + void reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const override; +}; + +} // namespace bolt +} // namespace llvm + +#endif From d735292725f6a4db618007b0b86bdcc17b657cc6 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 21 Feb 2017 16:15:15 -0800 Subject: [PATCH 218/904] [BOLT] Fix -jump-tables=basic in relocation mode. Summary: In a prev diff I added an option to update jump tables in-place (on by default) and accidentally broke the default handling of jump tables in relocation mode. The update should be happening semi-automatically, but because we ignore relocations for jump tables it wasn't happening (derp). Since we mostly use '-jump-tables=move' this hasn't been noticed for some time. This diff gets rid of IgnoredRelocations and removes relocations from a relocation set when they are no longer needed. If relocations are created later for jump tables they are no longer ignored. 
(cherry picked from commit de6f44ee6b265318d0242529c6748bbcad56e294) --- bolt/BinaryContext.cpp | 100 +++++++++++++++++++++++++++++++++++++-- bolt/BinaryContext.h | 23 ++++----- bolt/BinaryFunction.cpp | 70 +-------------------------- bolt/RewriteInstance.cpp | 30 +----------- 4 files changed, 112 insertions(+), 111 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 15850dd5e929..dfbfdb1dbe2b 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/DebugInfo/DWARF/DWARFUnit.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" @@ -415,14 +416,107 @@ ErrorOr BinaryContext::getSectionForAddress(uint64_t Address) const{ return std::make_error_code(std::errc::bad_address); } -void BinaryContext::addSectionRelocation(SectionRef Section, uint64_t Address, +void BinaryContext::addSectionRelocation(SectionRef Section, uint64_t Offset, MCSymbol *Symbol, uint64_t Type, uint64_t Addend) { auto RI = SectionRelocations.find(Section); if (RI == SectionRelocations.end()) { auto Result = - SectionRelocations.emplace(Section, std::vector()); + SectionRelocations.emplace(Section, std::set()); RI = Result.first; } - RI->second.emplace_back(Relocation{Address, Symbol, Type, Addend}); + RI->second.emplace(Relocation{Offset, Symbol, Type, Addend}); +} + +void BinaryContext::addRelocation(uint64_t Address, MCSymbol *Symbol, + uint64_t Type, uint64_t Addend) { + auto ContainingSection = getSectionForAddress(Address); + assert(ContainingSection && "cannot find section for address"); + addSectionRelocation(*ContainingSection, + Address - ContainingSection->getAddress(), + Symbol, + Type, + Addend); +} + +void BinaryContext::removeRelocationAt(uint64_t Address) { + auto ContainingSection = getSectionForAddress(Address); + assert(ContainingSection && "cannot find section for address"); + auto RI = 
SectionRelocations.find(*ContainingSection); + if (RI == SectionRelocations.end()) + return; + + auto &Relocations = RI->second; + auto RelocI = Relocations.find( + Relocation{Address - ContainingSection->getAddress(), 0, 0, 0, 0}); + if (RelocI == Relocations.end()) + return; + + Relocations.erase(RelocI); +} + +size_t Relocation::getSizeForType(uint64_t Type) { + switch (Type) { + default: + llvm_unreachable("unsupported relocation type"); + case ELF::R_X86_64_PC8: + return 1; + case ELF::R_X86_64_PLT32: + case ELF::R_X86_64_PC32: + case ELF::R_X86_64_32S: + case ELF::R_X86_64_32: + case ELF::R_X86_64_GOTPCREL: + case ELF::R_X86_64_GOTTPOFF: + case ELF::R_X86_64_TPOFF32: + case ELF::R_X86_64_GOTPCRELX: + case ELF::R_X86_64_REX_GOTPCRELX: + return 4; + case ELF::R_X86_64_PC64: + case ELF::R_X86_64_64: + return 8; + } +} + +bool Relocation::isPCRelative(uint64_t Type) { + switch (Type) { + default: + llvm_unreachable("Unknown relocation type"); + + case ELF::R_X86_64_64: + case ELF::R_X86_64_32: + case ELF::R_X86_64_32S: + case ELF::R_X86_64_TPOFF32: + return false; + + case ELF::R_X86_64_PC8: + case ELF::R_X86_64_PC32: + case ELF::R_X86_64_GOTPCREL: + case ELF::R_X86_64_PLT32: + case ELF::R_X86_64_GOTTPOFF: + case ELF::R_X86_64_GOTPCRELX: + case ELF::R_X86_64_REX_GOTPCRELX: + return true; + } +} + +size_t Relocation::emit(MCStreamer *Streamer) const { + const auto Size = getSizeForType(Type); + auto &Ctx = Streamer->getContext(); + if (isPCRelative(Type)) { + auto *TempLabel = Ctx.createTempSymbol(); + Streamer->EmitLabel(TempLabel); + auto Value = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(Symbol, Ctx), + MCSymbolRefExpr::create(TempLabel, Ctx), + Ctx); + if (Addend) { + Value = MCBinaryExpr::createAdd(Value, + MCConstantExpr::create(Addend, Ctx), + Ctx); + } + Streamer->EmitValue(Value, Size); + } else { + Streamer->EmitSymbolValue(Symbol, Size); + } + return Size; } diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index df07636ca0d2..969f434f2049 
100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -56,7 +56,7 @@ class DataReader; /// Relocation class. struct Relocation { uint64_t Offset; - MCSymbol *Symbol; + mutable MCSymbol *Symbol; /// mutable to allow modification by emitter. uint64_t Type; uint64_t Addend; uint64_t Value; @@ -69,7 +69,7 @@ struct Relocation { /// Emit relocation at a current \p Streamer' position. The caller is /// responsible for setting the position correctly. - size_t emit(MCStreamer *Streamer); + size_t emit(MCStreamer *Streamer) const; }; /// Relocation ordering by offset. @@ -109,14 +109,8 @@ class BinaryContext { /// List of DWARF location lists in .debug_loc. std::vector LocationLists; - /// List of relocation offsets where relocations should be ignored. - std::set IgnoredRelocations; - - /// List of PC-relative relocations from data to code. - std::set PCRelativeDataRelocations; - /// Section relocations. - std::map> SectionRelocations; + std::map> SectionRelocations; /// List of DWARF entries in .debug_info that have address ranges to be /// updated. These include lexical blocks (DW_TAG_lexical_block) and concrete @@ -231,11 +225,18 @@ class BinaryContext { BinaryFunction &ParentBF, std::map &BFs); - /// Add section relocation. - void addSectionRelocation(SectionRef Section, uint64_t Address, + /// Add relocation for \p Section at a given \p Offset. + void addSectionRelocation(SectionRef Section, uint64_t Offset, MCSymbol *Symbol, uint64_t Type, uint64_t Addend = 0); + /// Add a relocation at a given \p Address. + void addRelocation(uint64_t Address, MCSymbol *Symbol, uint64_t Type, + uint64_t Addend = 0); + + /// Remove registered relocation at a given \p Address. + void removeRelocationAt(uint64_t Address); + const BinaryFunction *getFunctionForSymbol(const MCSymbol *Symbol) const { auto BFI = SymbolToFunctionMap.find(Symbol); return BFI == SymbolToFunctionMap.end() ? 
nullptr : BFI->second; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index a45ef35cfd37..53baed360dfe 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1261,9 +1261,9 @@ void BinaryFunction::postProcessJumpTables() { if (TargetOffset < getSize()) TakenBranches.emplace_back(JTSiteOffset, TargetOffset); - // Ignore relocations for jump tables. + // Take ownership of jump table relocations. if (opts::Relocs) - BC.IgnoredRelocations.emplace(JT->Address + EntryOffset); + BC.removeRelocationAt(JT->Address + EntryOffset); EntryOffset += JT->EntrySize; @@ -3821,71 +3821,5 @@ void DynoStats::operator+=(const DynoStats &Other) { } } -size_t Relocation::getSizeForType(uint64_t Type) { - switch (Type) { - default: - llvm_unreachable("unsupported relocation type"); - case ELF::R_X86_64_PC8: - return 1; - case ELF::R_X86_64_PLT32: - case ELF::R_X86_64_PC32: - case ELF::R_X86_64_32S: - case ELF::R_X86_64_32: - case ELF::R_X86_64_GOTPCREL: - case ELF::R_X86_64_GOTTPOFF: - case ELF::R_X86_64_TPOFF32: - case ELF::R_X86_64_GOTPCRELX: - case ELF::R_X86_64_REX_GOTPCRELX: - return 4; - case ELF::R_X86_64_PC64: - case ELF::R_X86_64_64: - return 8; - } -} - -bool Relocation::isPCRelative(uint64_t Type) { - switch (Type) { - default: - llvm_unreachable("Unknown relocation type"); - - case ELF::R_X86_64_64: - case ELF::R_X86_64_32: - case ELF::R_X86_64_32S: - case ELF::R_X86_64_TPOFF32: - return false; - - case ELF::R_X86_64_PC8: - case ELF::R_X86_64_PC32: - case ELF::R_X86_64_GOTPCREL: - case ELF::R_X86_64_PLT32: - case ELF::R_X86_64_GOTTPOFF: - case ELF::R_X86_64_GOTPCRELX: - case ELF::R_X86_64_REX_GOTPCRELX: - return true; - } -} - -size_t Relocation::emit(MCStreamer *Streamer) { - const auto Size = getSizeForType(Type); - auto &Ctx = Streamer->getContext(); - if (isPCRelative(Type)) { - auto *TempLabel = Ctx.createTempSymbol(); - Streamer->EmitLabel(TempLabel); - auto Value = - MCBinaryExpr::createSub(MCSymbolRefExpr::create(Symbol, Ctx), - 
MCSymbolRefExpr::create(TempLabel, Ctx), - Ctx); - if (Addend) { - Value = MCBinaryExpr::createAdd(Value, - MCConstantExpr::create(Addend, Ctx), - Ctx); - } - Streamer->EmitValue(Value, Size); - } else { - Streamer->EmitSymbolValue(Symbol, Size); - } - return Size; -} - } // namespace bolt } // namespace llvm diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 27c17c9837ba..f9f012c5d1cb 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1494,8 +1494,6 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { // from linker data alone. if (IsFromCode) { ContainingBF->addPCRelativeRelocationAddress(Rel.getOffset()); - } else { - BC->PCRelativeDataRelocations.emplace(Rel.getOffset()); } DEBUG(dbgs() << "BOLT-DEBUG: not creating PC-relative relocation\n"); continue; @@ -1551,14 +1549,8 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { DEBUG(dbgs() << "BOLT-DEBUG: ignoring relocation from code to data\n"); } } else if (ToCode) { - auto ContainingSection = BC->getSectionForAddress(Rel.getOffset()); - assert(ContainingSection && "cannot find section for address"); assert(Addend == 0 && "did not expect addend"); - BC->addSectionRelocation(*ContainingSection, - Rel.getOffset()- ContainingSection->getAddress(), - ReferencedSymbol, - Rel.getType()); - + BC->addRelocation(Rel.getOffset(), ReferencedSymbol, Rel.getType()); } else { DEBUG(dbgs() << "BOLT-DEBUG: ignoring relocation from data to data\n"); } @@ -1998,21 +1990,6 @@ void RewriteInstance::emitFunctions() { } if (opts::Relocs) { - // Make sure all original PC-relative relocations from data are ignored. 
- std::vector MissedAddresses; - std::set_difference(BC->PCRelativeDataRelocations.begin(), - BC->PCRelativeDataRelocations.end(), - BC->IgnoredRelocations.begin(), - BC->IgnoredRelocations.end(), - std::back_inserter(MissedAddresses)); - - if (!MissedAddresses.empty()) { - errs() << "BOLT-ERROR: " << MissedAddresses.size() - << " missed addresses:\n"; - for (auto Address : MissedAddresses) - errs() << "\t0x" << Twine::utohexstr(Address) << '\n'; - } - emitDataSections(Streamer.get()); } @@ -2324,12 +2301,7 @@ void RewriteInstance::emitDataSection(MCStreamer *Streamer, SectionRef Section, auto &Relocations = SRI->second; uint64_t SectionOffset = 0; - std::sort(Relocations.begin(), Relocations.end()); for (auto &Relocation : Relocations) { - auto RelocationAddress = Section.getAddress() + Relocation.Offset; - if (BC->IgnoredRelocations.count(RelocationAddress)) { - continue; - } assert(Relocation.Offset < Section.getSize() && "overflow detected"); if (SectionOffset < Relocation.Offset) { Streamer->EmitBytes( From b692fd39bcabfee843e4945d0c763ee4c84d9e1a Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 22 Feb 2017 11:29:52 -0800 Subject: [PATCH 219/904] [BOLT] Don't set code skew in relocations mode. Summary: We use code skew in non-relocation mode since functions have fixed addresses, and internal alignment has to be adjusted wrt the skew. However in relocation mode it interferes with effective code alignment, and has to be disabled. I missed it when was re-basing the relocation diff. 
(cherry picked from commit 63a847947fff6f5e8600b938842f6b810cd6d153) --- bolt/RewriteInstance.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index f9f012c5d1cb..63b0aecdf143 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -133,7 +133,7 @@ TopCalledLimit("top-called-limit", cl::init(100), cl::ZeroOrMore, cl::Hidden); - + cl::opt HotText("hot-text", cl::desc("hot text symbols support"), @@ -1776,6 +1776,9 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, Streamer.SwitchSection(Section); + if (!opts::Relocs) + Streamer.setCodeSkew(EmitColdPart ? 0 : Function.getAddress()); + if (opts::Relocs) { // We have to use at least 2-byte alignment because of C++ ABI. Streamer.EmitCodeAlignment(2); @@ -2747,7 +2750,7 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { // // New section header string table goes last. - + // Fix ELF header. auto NewEhdr = *Obj->getHeader(); From 9e393525e4cfb5721db4ac59782d836c0895403c Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 23 Feb 2017 18:09:10 -0800 Subject: [PATCH 220/904] [BOLT] Strip 'repz' prefix from 'repz retq'. Summary: Add pass to strip 'repz' prefix from 'repz retq' sequence. The prefix is not used in Intel CPUs afaik. The pass is on by default. 
(cherry picked from commit 8df00a103a91b059bc7c2d7c7e8751cf35784908) --- bolt/BinaryBasicBlock.cpp | 25 ++++++++++----------- bolt/BinaryBasicBlock.h | 23 ++++++++++++++++++-- bolt/BinaryFunction.cpp | 6 +++--- bolt/BinaryPassManager.cpp | 9 ++++++++ bolt/Passes/BinaryPasses.cpp | 42 +++++++++++++++++++++++++++++++----- bolt/Passes/BinaryPasses.h | 15 +++++++++++++ 6 files changed, 98 insertions(+), 22 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index c31c4b886e51..fc005e1087cd 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -35,22 +35,23 @@ void BinaryBasicBlock::adjustNumPseudos(const MCInst &Inst, int Sign) { NumPseudos += Sign; } -MCInst *BinaryBasicBlock::getFirstNonPseudo() { - auto &BC = Function->getBinaryContext(); - for (auto &Inst : Instructions) { - if (!BC.MII->get(Inst.getOpcode()).isPseudo()) - return &Inst; +BinaryBasicBlock::iterator BinaryBasicBlock::getFirstNonPseudo() { + const auto &BC = Function->getBinaryContext(); + for (auto II = Instructions.begin(), E = Instructions.end(); II != E; ++II) { + if (!BC.MII->get(II->getOpcode()).isPseudo()) + return II; } - return nullptr; + return end(); } -MCInst *BinaryBasicBlock::getLastNonPseudo() { - auto &BC = Function->getBinaryContext(); - for (auto Itr = Instructions.rbegin(); Itr != Instructions.rend(); ++Itr) { - if (!BC.MII->get(Itr->getOpcode()).isPseudo()) - return &*Itr; +BinaryBasicBlock::reverse_iterator BinaryBasicBlock::getLastNonPseudo() { + const auto &BC = Function->getBinaryContext(); + for (auto RII = Instructions.rbegin(), E = Instructions.rend(); + RII != E; ++RII) { + if (!BC.MII->get(RII->getOpcode()).isPseudo()) + return RII; } - return nullptr; + return rend(); } bool BinaryBasicBlock::validateSuccessorInvariants() { diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index df230a081efc..402389b3f007 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -412,13 +412,27 @@ class 
BinaryBasicBlock { return size() - getNumPseudos(); } + /// Return iterator to the first non-pseudo instruction or end() + /// if no such instruction was found. + iterator getFirstNonPseudo(); + /// Return a pointer to the first non-pseudo instruction in this basic /// block. Returns nullptr if none exists. - MCInst *getFirstNonPseudo(); + MCInst *getFirstNonPseudoInstr() { + auto II = getFirstNonPseudo(); + return II == Instructions.end() ? nullptr : &*II; + } + + /// Return reverse iterator to the last non-pseudo instruction or rend() + /// if no such instruction was found. + reverse_iterator getLastNonPseudo(); /// Return a pointer to the last non-pseudo instruction in this basic /// block. Returns nullptr if none exists. - MCInst *getLastNonPseudo(); + MCInst *getLastNonPseudoInstr() { + auto RII = getLastNonPseudo(); + return RII == Instructions.rend() ? nullptr : &*RII; + } /// Set minimum alignment for the basic block. void setAlignment(uint64_t Align) { @@ -553,6 +567,11 @@ class BinaryBasicBlock { return replaceInstruction(Inst, std::vector()); } + /// Erase non-pseudo instruction at a given iterator \p II. + iterator eraseInstruction(iterator II) { + return Instructions.erase(II); + } + /// Replace an instruction with a sequence of instructions. Returns true /// if the instruction to be replaced was found and replaced. template diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 53baed360dfe..aa8d56d6f45a 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1617,7 +1617,7 @@ bool BinaryFunction::buildCFG() { const BranchInfo &BInfo = BranchInfoOrErr.get(); FromBB->addSuccessor(ToBB, BInfo.Branches, BInfo.Mispreds); // Populate profile counts for the jump table. 
- auto *LastInstr = FromBB->getLastNonPseudo(); + auto *LastInstr = FromBB->getLastNonPseudoInstr(); if (!LastInstr) continue; auto JTAddress = BC.MIA->getJumpTable(*LastInstr); @@ -2711,7 +2711,7 @@ void BinaryFunction::dumpGraph(raw_ostream& OS) const { CondBranch, UncondBranch); - const auto *LastInstr = BB->getLastNonPseudo(); + const auto *LastInstr = BB->getLastNonPseudoInstr(); const bool IsJumpTable = LastInstr && BC.MIA->getJumpTable(*LastInstr); auto BI = BB->branch_info_begin(); @@ -3717,7 +3717,7 @@ DynoStats BinaryFunction::getDynoStats() const { Stats[DynoStats::INSTRUCTIONS] += BB->getNumNonPseudos() * BBExecutionCount; // Jump tables. - const auto *LastInstr = BB->getLastNonPseudo(); + const auto *LastInstr = BB->getLastNonPseudoInstr(); if (BC.MIA->getJumpTable(*LastInstr)) { Stats[DynoStats::JUMP_TABLE_BRANCHES] += BBExecutionCount; DEBUG( diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 6592e18c27c8..ade1298e887e 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -68,6 +68,12 @@ SimplifyRODataLoads("simplify-rodata-loads", "section"), cl::ZeroOrMore); +static cl::opt +StripRepRet("strip-rep-ret", + cl::desc("strip 'repz' prefix from 'repz retq' sequence (on by default)"), + cl::init(true), + cl::ZeroOrMore); + static cl::opt OptimizeFrameAccesses( "frame-opt", cl::desc("optimize stack frame accesses"), cl::ZeroOrMore); @@ -220,6 +226,9 @@ void BinaryFunctionPassManager::runAllPasses( // Run this pass first to use stats for the original functions. 
Manager.registerPass(llvm::make_unique(NeverPrint)); + Manager.registerPass(llvm::make_unique(NeverPrint), + opts::StripRepRet); + Manager.registerPass(llvm::make_unique(PrintICF)); Manager.registerPass(llvm::make_unique(PrintICP), diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 5fc6ea4e8a84..d693bbb85555 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -221,7 +221,7 @@ void OptimizeBodylessFunctions::analyze( if (BF.size() != 1 || BF.front().getNumNonPseudos() != 1) return; - const auto *FirstInstr = BF.front().getFirstNonPseudo(); + const auto *FirstInstr = BF.front().getFirstNonPseudoInstr(); if (!FirstInstr) return; if (!BC.MIA->isTailCall(*FirstInstr)) @@ -461,7 +461,7 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, if (BB->getNumNonPseudos() != 1) continue; - auto *Instr = BB->getFirstNonPseudo(); + auto *Instr = BB->getFirstNonPseudoInstr(); if (!MIA->isTailCall(*Instr)) continue; @@ -621,7 +621,7 @@ void Peepholes::fixDoubleJumps(BinaryContext &BC, } else { // Succ will be null in the tail call case. In this case we // need to explicitly add a tail call instruction. 
- auto *Branch = Pred->getLastNonPseudo(); + auto *Branch = Pred->getLastNonPseudoInstr(); if (Branch && BC.MIA->isUnconditionalBranch(*Branch)) { Pred->removeSuccessor(&BB); Pred->eraseInstruction(Branch); @@ -641,7 +641,7 @@ void Peepholes::fixDoubleJumps(BinaryContext &BC, if (BB.getNumNonPseudos() != 1 || BB.isLandingPad()) continue; - auto *Inst = BB.getFirstNonPseudo(); + auto *Inst = BB.getFirstNonPseudoInstr(); const bool IsTailCall = BC.MIA->isTailCall(*Inst); if (!BC.MIA->isUnconditionalBranch(*Inst) && !IsTailCall) @@ -671,7 +671,7 @@ void Peepholes::fixDoubleJumps(BinaryContext &BC, void Peepholes::addTailcallTraps(BinaryContext &BC, BinaryFunction &Function) { for (auto &BB : Function) { - auto *Inst = BB.getLastNonPseudo(); + auto *Inst = BB.getLastNonPseudoInstr(); if (Inst && BC.MIA->isTailCall(*Inst) && BC.MIA->isIndirectBranch(*Inst)) { MCInst Trap; if (BC.MIA->createTrap(Trap)) { @@ -1577,5 +1577,37 @@ void InstructionLowering::runOnFunctions( } } +void StripRepRet::runOnFunctions( + BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) { + uint64_t NumPrefixesRemoved = 0; + uint64_t NumBytesSaved = 0; + for (auto &BFI : BFs) { + for (auto &BB : BFI.second) { + auto LastInstRIter = BB.getLastNonPseudo(); + if (LastInstRIter == BB.rend() || + !BC.MIA->isReturn(*LastInstRIter)) + continue; + + auto NextToLastInstRIter = std::next(LastInstRIter); + if (NextToLastInstRIter == BB.rend() || + !BC.MIA->isPrefix(*NextToLastInstRIter)) + continue; + + BB.eraseInstruction(std::next(NextToLastInstRIter).base()); + + NumPrefixesRemoved += BB.getKnownExecutionCount(); + ++NumBytesSaved; + } + } + + if (NumBytesSaved) { + outs() << "BOLT-INFO: removed " << NumBytesSaved << " 'repz' prefixes" + " with estimated execution count of " << NumPrefixesRemoved + << " times.\n"; + } +} + } // namespace bolt } // namespace llvm diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index ea23c7bb9d02..a2c61cb1dafa 100644 --- 
a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -484,6 +484,21 @@ class InstructionLowering : public BinaryFunctionPass { std::set &LargeFunctions) override; }; +/// Pass for stripping 'repz' from 'repz retq' sequence of instructions. +class StripRepRet : public BinaryFunctionPass { +public: + explicit StripRepRet(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) {} + + const char *getName() const override { + return "strip-rep-ret"; + } + + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + } // namespace bolt } // namespace llvm From 723b5f5560f2d4bdab9cf0fc0d0c58c6e27f0f1e Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 27 Feb 2017 13:09:27 -0800 Subject: [PATCH 221/904] Fix warnings when compiling with clang (NFC) Summary: Fix inconsistent override keyword usages and initializes a missing field of a Relocation object when using braced initializers. (cherry picked from commit 99991460f677a82422488a789bc7fcb2fc600cf1) --- bolt/BinaryContext.cpp | 2 +- bolt/Passes/BinaryPasses.h | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index dfbfdb1dbe2b..2a9dde0ccb53 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -425,7 +425,7 @@ void BinaryContext::addSectionRelocation(SectionRef Section, uint64_t Offset, SectionRelocations.emplace(Section, std::set()); RI = Result.first; } - RI->second.emplace(Relocation{Offset, Symbol, Type, Addend}); + RI->second.emplace(Relocation{Offset, Symbol, Type, Addend, 0}); } void BinaryContext::addRelocation(uint64_t Address, MCSymbol *Symbol, diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index a2c61cb1dafa..7ed78e6eea71 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -326,13 +326,13 @@ class PrintSortedBy : public BinaryFunctionPass { /// from that callsite exceed the specified threshold 
(default 90%), /// the call is promoted. Otherwise, it is ignored. By default, /// only one target is considered at each callsite. -/// +/// /// When an candidate callsite is processed, we modify the callsite /// to test for the most common call targets before calling through /// the original generic call mechanism. -/// +/// /// The CFG and layout are modified by ICP. -/// +/// /// A few new command line options have been added: /// -indirect-call-promotion /// -indirect-call-promotion-threshold= @@ -346,7 +346,7 @@ class PrintSortedBy : public BinaryFunctionPass { /// any callsite where the branch predictor does a good enough job /// that ICP wouldn't help regardless of the frequency of the most /// common target. -/// +/// /// The topn option controls the number of targets to consider for /// each callsite, e.g. ICP is triggered if topn=2 and the total /// frequency of the top two call targets exceeds the threshold. @@ -356,22 +356,22 @@ class PrintSortedBy : public BinaryFunctionPass { /// (callq $foo). /// /// Example of ICP: -/// +/// /// C++ code: -/// +/// /// int B_count = 0; /// int C_count = 0; -/// +/// /// struct A { virtual void foo() = 0; } /// struct B : public A { virtual void foo() { ++B_count; }; }; /// struct C : public A { virtual void foo() { ++C_count; }; }; -/// +/// /// A* a = ... /// a->foo(); /// ... -/// +/// /// original assembly: -/// +/// /// B0: 49 8b 07 mov (%r15),%rax /// 4c 89 ff mov %r15,%rdi /// ff 10 callq *(%rax) @@ -380,9 +380,9 @@ class PrintSortedBy : public BinaryFunctionPass { /// 4c 0f 44 f5 cmove %rbp,%r14 /// 4c 89 f7 mov %r14,%rdi /// ... -/// +/// /// after ICP: -/// +/// /// B0: 49 8b 07 mov (%r15),%rax /// 4c 89 ff mov %r15,%rdi /// 48 81 38 e0 0b 40 00 cmpq $B::foo,(%rax) @@ -393,7 +393,7 @@ class PrintSortedBy : public BinaryFunctionPass { /// 4c 0f 44 f5 cmove %rbp,%r14 /// 4c 89 f7 mov %r14,%rdi /// ... 
-/// +/// /// B3: ff 10 callq *(%rax) /// eb d6 jmp B2 /// @@ -457,7 +457,7 @@ class IndirectCallPromotion : public BinaryFunctionPass { explicit IndirectCallPromotion(const cl::opt &PrintPass) : BinaryFunctionPass(PrintPass) { } - const char *getName() const { + const char *getName() const override { return "indirect-call-promotion"; } bool shouldPrint(const BinaryFunction &BF) const override { From cc036da4a27415bb5ef81faf68aa969839fad644 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 24 Feb 2017 21:59:33 -0800 Subject: [PATCH 222/904] [BOLT] New CFI handling policy. Summary: The new interface for handling Call Frame Information: * CFI state at any point in a function (in CFG state) is defined by CFI state at basic block entry and CFI instructions inside the block. The state is independent of basic blocks layout order (this is implied by CFG state but wasn't always true in the past). * Use BinaryBasicBlock::getCFIStateAtInstr(const MCInst *Inst) to get CFI state at any given instruction in the program. * No need to call fixCFIState() after any given pass. fixCFIState() is called only once during function finalization, and any function transformations after that point are prohibited. * When introducing new basic blocks, make sure CFI state at entry is set correctly and matches CFI instructions in the basic block (if any). * When splitting basic blocks, use getCFIStateAtInstr() to get a state at the split point, and set the new basic block's CFI state to this value. Introduce CFG_Finalized state to indicate that no further optimizations are allowed on the function. This state is reached after we have synced CFI instructions and updated EH info. Rename "-print-after-fixup" option to "-print-finalized". This diffs fixes CFI for cases when we split conditional tail calls, and for indirect call promotion optimization. 
(cherry picked from commit 32571fba226c4153d582f6841276d26e94f7395d) --- bolt/BinaryBasicBlock.cpp | 72 +++++++++- bolt/BinaryBasicBlock.h | 28 ++++ bolt/BinaryFunction.cpp | 255 +++++++++++++++++------------------ bolt/BinaryFunction.h | 68 +++++++--- bolt/BinaryPassManager.cpp | 8 +- bolt/Exceptions.cpp | 2 +- bolt/Passes/BinaryPasses.cpp | 16 +-- bolt/Passes/BinaryPasses.h | 6 +- bolt/RewriteInstance.cpp | 1 - 9 files changed, 285 insertions(+), 171 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index fc005e1087cd..3934ae25cf15 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -81,7 +81,7 @@ bool BinaryBasicBlock::validateSuccessorInvariants() { } return true; } - + BinaryBasicBlock *BinaryBasicBlock::getSuccessor(const MCSymbol *Label) const { if (!Label && succ_size() == 1) return *succ_begin(); @@ -103,6 +103,76 @@ BinaryBasicBlock *BinaryBasicBlock::getLandingPad(const MCSymbol *Label) const { return nullptr; } +int32_t BinaryBasicBlock::getCFIStateAtInstr(const MCInst *Instr) const { + assert(getFunction()->getState() == BinaryFunction::State::CFG && + "can only calculate CFI state when function is in active CFG state"); + + const auto &FDEProgram = getFunction()->getFDEProgram(); + + // Find the last CFI preceding Instr in this basic block. + const MCInst *LastCFI = nullptr; + bool InstrSeen = (Instr == nullptr); + for (auto RII = Instructions.rbegin(), E = Instructions.rend(); + RII != E; ++RII) { + if (!InstrSeen) { + InstrSeen = (&*RII == Instr); + continue; + } + if (Function->getBinaryContext().MIA->isCFI(*RII)) { + LastCFI = &*RII; + break; + } + } + + assert(InstrSeen && "instruction expected in basic block"); + + // CFI state is the same as at basic block entry point. + if (!LastCFI) + return getCFIState(); + + // Fold all RememberState/RestoreState sequences, such as for: + // + // [ CFI #(K-1) ] + // RememberState (#K) + // .... + // RestoreState + // RememberState + // .... 
+ // RestoreState + // [ GNU_args_size ] + // RememberState + // .... + // RestoreState <- LastCFI + // + // we return K - the most efficient state to (re-)generate. + int64_t State = LastCFI->getOperand(0).getImm(); + while (State >= 0 && + FDEProgram[State].getOperation() == MCCFIInstruction::OpRestoreState) { + int32_t Depth = 1; + --State; + assert(State >= 0 && "first CFI cannot be RestoreState"); + while (Depth && State >= 0) { + const auto &CFIInstr = FDEProgram[State]; + if (CFIInstr.getOperation() == MCCFIInstruction::OpRestoreState) { + ++Depth; + } else if (CFIInstr.getOperation() == MCCFIInstruction::OpRememberState) { + --Depth; + } + --State; + } + assert(Depth == 0 && "unbalanced RememberState/RestoreState stack"); + + // Skip any GNU_args_size. + while (State >= 0 && + FDEProgram[State].getOperation() == MCCFIInstruction::OpGnuArgsSize){ + --State; + } + } + + assert((State + 1 >= 0) && "miscalculated CFI state"); + return State + 1; +} + void BinaryBasicBlock::addSuccessor(BinaryBasicBlock *Succ, uint64_t Count, uint64_t MispredictedCount) { diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 402389b3f007..8fe86129f189 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -95,6 +95,9 @@ class BinaryBasicBlock { /// Number of pseudo instructions in this block. uint32_t NumPseudos{0}; + /// CFI state at the entry to this basic block. + int32_t CFIState{-1}; + /// True if this basic block is (potentially) an external entry point into /// the function. bool IsEntryPoint{false}; @@ -434,6 +437,31 @@ class BinaryBasicBlock { return RII == Instructions.rend() ? nullptr : &*RII; } + /// Set CFI state at entry to this basic block. + void setCFIState(int32_t NewCFIState) { + assert((CFIState == -1 || NewCFIState == CFIState) && + "unexpected change of CFI state for basic block"); + CFIState = NewCFIState; + } + + /// Return CFI state (expected) at entry of this basic block. 
+ int32_t getCFIState() const { + assert(CFIState >= 0 && "unknown CFI state"); + return CFIState; + } + + /// Calculate and return CFI state right before instruction \p Instr in + /// this basic block. If \p Instr is nullptr then return the state at + /// the end of the basic block. + int32_t getCFIStateAtInstr(const MCInst *Instr) const; + + /// Calculate and return CFI state after execution of this basic block. + /// The state depends on CFI state at entry and CFI instructions inside the + /// basic block. + int32_t getCFIStateAtExit() const { + return getCFIStateAtInstr(nullptr); + } + /// Set minimum alignment for the basic block. void setAlignment(uint64_t Align) { Alignment = Align; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index aa8d56d6f45a..e69316abb5e9 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -203,7 +203,7 @@ BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) { size_t BinaryFunction::getBasicBlockOriginalSize(const BinaryBasicBlock *BB) const { - if (CurrentState != State::CFG) + if (!hasCFG()) return 0; auto Index = getIndex(BB); @@ -276,8 +276,6 @@ std::pair BinaryFunction::eraseInvalidBBs() { if (Count > 0) { updateBBIndices(0); recomputeLandingPads(0, BasicBlocks.size()); - BBCFIState = annotateCFIState(); - fixCFIState(); } return std::make_pair(Count, Bytes); @@ -331,7 +329,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, << "\n IsSplit : " << IsSplit << "\n BB Count : " << BasicBlocksLayout.size(); - if (CurrentState == State::CFG) { + if (hasCFG()) { OS << "\n Hash : " << Twine::utohexstr(hash()); } if (FrameInstructions.size()) { @@ -400,8 +398,8 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, if (hasValidProfile()) { OS << " Exec Count : " << BBExecCount << "\n"; } - if (!BBCFIState.empty()) { - OS << " CFI State : " << BBCFIState[getIndex(BB)] << '\n'; + if (BB->getCFIState() >= 0) { + OS << " CFI State : " << BB->getCFIState() << '\n'; 
} if (!BB->pred_empty()) { OS << " Predecessors: "; @@ -461,6 +459,13 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << '\n'; } + // In CFG_Finalized state we can miscalculate CFI state at exit. + if (CurrentState == State::CFG) { + const auto CFIStateAtExit = BB->getCFIStateAtExit(); + if (CFIStateAtExit >= 0) + OS << " CFI State: " << CFIStateAtExit << '\n'; + } + OS << '\n'; } @@ -1768,8 +1773,8 @@ bool BinaryFunction::buildCFG() { else clearProfile(); - // Update CFI information for each BB - BBCFIState = annotateCFIState(); + // Assign CFI information to each BB entry. + annotateCFIState(); // Convert conditional tail call branches to conditional branches that jump // to a tail call. @@ -1791,10 +1796,6 @@ bool BinaryFunction::buildCFG() { setSimple(false); } - // Fix the possibly corrupted CFI state. CFI state may have been corrupted - // because of the CFG modifications while removing conditional tail calls. - fixCFIState(); - // Clean-up memory taken by instructions and labels. // // NB: don't clear Labels list as we may need them if we mark the function @@ -2146,7 +2147,7 @@ void BinaryFunction::removeConditionalTailCalls() { TailCallBB = BasicBlocks[InsertIdx]; // Add the correct CFI state for the new block. - BBCFIState.insert(BBCFIState.begin() + InsertIdx, TCInfo.CFIStateBefore); + TailCallBB->setCFIState(TCInfo.CFIStateBefore); } else { // Forward jump: we will create a new basic block at the end of the // function containing the unconditional tail call and change the target @@ -2158,15 +2159,15 @@ void BinaryFunction::removeConditionalTailCalls() { // the end of the code as a result of __builtin_unreachable(). 
const BinaryBasicBlock *LastBB = BasicBlocks.back(); uint64_t NewBlockOffset = - LastBB->getOffset() + BC.computeCodeSize(LastBB->begin(), LastBB->end()) + 1; + LastBB->getOffset() + + BC.computeCodeSize(LastBB->begin(), LastBB->end()) + 1; TailCallBB = addBasicBlock(NewBlockOffset, TCLabel); TailCallBB->addInstruction(TailCallInst); // Add the correct CFI state for the new block. It has to be inserted in // the one before last position (the last position holds the CFI state // after the last block). - BBCFIState.insert(BBCFIState.begin() + BBCFIState.size() - 1, - TCInfo.CFIStateBefore); + TailCallBB->setCFIState(TCInfo.CFIStateBefore); // Replace the target of the conditional tail call with the label of the // new basic block. @@ -2201,64 +2202,62 @@ uint64_t BinaryFunction::getFunctionScore() { return FunctionScore; } -BinaryFunction::CFIStateVector -BinaryFunction::annotateCFIState(const MCInst *Stop) { +void BinaryFunction::annotateCFIState() { + assert(CurrentState == State::Disassembled && "unexpected function state"); assert(!BasicBlocks.empty() && "basic block list should not be empty"); - uint32_t State = 0; - uint32_t HighestState = 0; - std::stack StateStack; - CFIStateVector CFIState; + // This is an index of the last processed CFI in FDE CFI program. + int32_t State = 0; + + // This is an index of RememberState CFI reflecting effective state right + // after execution of RestoreState CFI. + // + // It differs from State iff the CFI at (State-1) + // was RestoreState (modulo GNU_args_size CFIs, which are ignored). + // + // This allows us to generate shorter replay sequences when producing new + // CFI programs. + int32_t EffectiveState = 0; - for (auto CI = BasicBlocks.begin(), CE = BasicBlocks.end(); CI != CE; ++CI) { - BinaryBasicBlock *CurBB = *CI; - // Annotate this BB entry - CFIState.emplace_back(State); + // For tracking RememberState/RestoreState sequences. 
+ std::stack StateStack; + + for (auto *BB : BasicBlocks) { + BB->setCFIState(EffectiveState); // While building the CFG, we want to save the CFI state before a tail call - // instruction, so that we can correctly remove condtional tail calls - auto TCI = TailCallTerminatedBlocks.find(CurBB); + // instruction, so that we can correctly remove conditional tail calls. + auto TCI = TailCallTerminatedBlocks.find(BB); bool SaveState = TCI != TailCallTerminatedBlocks.end(); - // Advance state - uint32_t Idx = 0; - for (const auto &Instr : *CurBB) { - auto *CFI = getCFIFor(Instr); - if (CFI == nullptr) { - if (SaveState && Idx == TCI->second.Index) - TCI->second.CFIStateBefore = State; - ++Idx; - if (&Instr == Stop) { - CFIState.emplace_back(State); - return CFIState; - } - continue; + uint32_t Idx = 0; // instruction index in a current basic block + for (const auto &Instr : *BB) { + ++Idx; + if (SaveState && Idx == TCI->second.Index) { + TCI->second.CFIStateBefore = EffectiveState; + SaveState = false; } - ++HighestState; + + const auto *CFI = getCFIFor(Instr); + if (!CFI) + continue; + + ++State; + if (CFI->getOperation() == MCCFIInstruction::OpRememberState) { - StateStack.push(State); + StateStack.push(EffectiveState); } else if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) { - assert(!StateStack.empty() && "Corrupt CFI stack"); - State = StateStack.top(); + assert(!StateStack.empty() && "corrupt CFI stack"); + EffectiveState = StateStack.top(); StateStack.pop(); } else if (CFI->getOperation() != MCCFIInstruction::OpGnuArgsSize) { - State = HighestState; - } - assert(State <= FrameInstructions.size()); - ++Idx; - if (&Instr == Stop) { - CFIState.emplace_back(State); - return CFIState; + // OpGnuArgsSize CFIs do not affect the CFI state. 
+ EffectiveState = State; } } } - // Store the state after the last BB - CFIState.emplace_back(State); - - assert(StateStack.empty() && "Corrupt CFI stack"); - - return CFIState; + assert(StateStack.empty() && "corrupt CFI stack"); } bool BinaryFunction::fixCFIState() { @@ -2268,74 +2267,72 @@ bool BinaryFunction::fixCFIState() { << ": "); auto replayCFIInstrs = - [this](uint32_t FromState, uint32_t ToState, BinaryBasicBlock *InBB, - BinaryBasicBlock::iterator InsertIt) -> bool { - if (FromState == ToState) - return true; - assert(FromState < ToState); - - std::vector NewCFIs; - uint32_t NestedLevel = 0; - for (uint32_t CurState = FromState; CurState < ToState; ++CurState) { - assert(CurState < FrameInstructions.size()); - MCCFIInstruction *Instr = &FrameInstructions[CurState]; - if (Instr->getOperation() == MCCFIInstruction::OpRememberState) - ++NestedLevel; - if (!NestedLevel) - NewCFIs.push_back(CurState); - if (Instr->getOperation() == MCCFIInstruction::OpRestoreState) - --NestedLevel; - } - - // TODO: If in replaying the CFI instructions to reach this state we - // have state stack instructions, we could still work out the logic - // to extract only the necessary instructions to reach this state - // without using the state stack. Not sure if it is worth the effort - // because this happens rarely. 
- if (NestedLevel != 0) { - if (opts::Verbosity >= 1) { - errs() << "BOLT-WARNING: CFI rewriter detected nested CFI state" - << " while replaying CFI instructions for BB " - << InBB->getName() << " in function " << *this << '\n'; - } - return false; - } + [this](int32_t FromState, int32_t ToState, BinaryBasicBlock *InBB, + BinaryBasicBlock::iterator InsertIt) -> bool { + if (FromState == ToState) + return true; + assert(FromState < ToState && "can only replay CFIs forward"); + + std::vector NewCFIs; + uint32_t NestedLevel = 0; + for (auto CurState = FromState; CurState < ToState; ++CurState) { + MCCFIInstruction *Instr = &FrameInstructions[CurState]; + if (Instr->getOperation() == MCCFIInstruction::OpRememberState) + ++NestedLevel; + if (!NestedLevel) + NewCFIs.push_back(CurState); + if (Instr->getOperation() == MCCFIInstruction::OpRestoreState) + --NestedLevel; + } + + // TODO: If in replaying the CFI instructions to reach this state we + // have state stack instructions, we could still work out the logic + // to extract only the necessary instructions to reach this state + // without using the state stack. Not sure if it is worth the effort + // because this happens rarely. + if (NestedLevel != 0) { + errs() << "BOLT-WARNING: CFI rewriter detected nested CFI state" + << " while replaying CFI instructions for BB " + << InBB->getName() << " in function " << *this << '\n'; + return false; + } - for (auto CFI : NewCFIs) { - // Ignore GNU_args_size instructions. - if (FrameInstructions[CFI].getOperation() != - MCCFIInstruction::OpGnuArgsSize) { - InsertIt = addCFIPseudo(InBB, InsertIt, CFI); - ++InsertIt; - } - } + for (auto CFI : NewCFIs) { + // Ignore GNU_args_size instructions. 
+ if (FrameInstructions[CFI].getOperation() != + MCCFIInstruction::OpGnuArgsSize) { + InsertIt = addCFIPseudo(InBB, InsertIt, CFI); + ++InsertIt; + } + } - return true; - }; + return true; + }; - uint32_t State = 0; + int32_t State = 0; auto *FDEStartBB = BasicBlocksLayout[0]; - for (uint32_t I = 0, E = BasicBlocksLayout.size(); I != E; ++I) { - auto *BB = BasicBlocksLayout[I]; - uint32_t BBIndex = getIndex(BB); + bool SeenCold = false; + for (auto *BB : BasicBlocksLayout) { + const auto CFIStateAtExit = BB->getCFIStateAtExit(); // Hot-cold border: check if this is the first BB to be allocated in a cold - // region (a different FDE). If yes, we need to reset the CFI state and - // the FDEStartBB that is used to insert remember_state CFIs (t12863876). - if (I != 0 && BB->isCold() != BasicBlocksLayout[I - 1]->isCold()) { + // region (with a different FDE). If yes, we need to reset the CFI state and + // the FDEStartBB that is used to insert remember_state CFIs. + if (!SeenCold && BB->isCold()) { State = 0; FDEStartBB = BB; + SeenCold = true; } // We need to recover the correct state if it doesn't match expected // state at BB entry point. - if (BBCFIState[BBIndex] < State) { + if (BB->getCFIState() < State) { // In this case, State is currently higher than what this BB expect it // to be. To solve this, we need to insert a CFI instruction to remember // the old state at function entry, then another CFI instruction to // restore it at the entry of this BB and replay CFI instructions to // reach the desired state. - uint32_t OldState = BBCFIState[BBIndex]; + int32_t OldState = BB->getCFIState(); // Remember state at function entry point (our reference state). 
auto InsertIt = FDEStartBB->begin(); while (InsertIt != FDEStartBB->end() && BC.MIA->isCFI(*InsertIt)) @@ -2375,25 +2372,22 @@ bool BinaryFunction::fixCFIState() { } if (StackOffset != 0) { - if (opts::Verbosity >= 1) { - errs() << "BOLT-WARNING: not possible to remember/recover state" - << " without corrupting CFI state stack in function " - << *this << " @ " << BB->getName() << "\n"; - } + errs() << "BOLT-WARNING: not possible to remember/recover state" + << " without corrupting CFI state stack in function " + << *this << " @ " << BB->getName() << "\n"; return false; } - } else if (BBCFIState[BBIndex] > State) { - // If BBCFIState[BBIndex] > State, it means we are behind in the + } else if (BB->getCFIState() > State) { + // If BB's CFI state is greater than State, it means we are behind in the // state. Just emit all instructions to reach this state at the // beginning of this BB. If this sequence of instructions involve // remember state or restore state, bail out. - if (!replayCFIInstrs(State, BBCFIState[BBIndex], BB, BB->begin())) + if (!replayCFIInstrs(State, BB->getCFIState(), BB, BB->begin())) return false; } - State = BBCFIState[BBIndex + 1]; - DEBUG(dbgs() << Sep << State); - DEBUG(Sep = ", "); + State = CFIStateAtExit; + DEBUG(dbgs() << Sep << State; Sep = ", "); } DEBUG(dbgs() << "\n"); return true; @@ -2543,9 +2537,6 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { // Emit GNU_args_size CFIs as necessary. 
if (usesGnuArgsSize() && BC.MIA->isInvoke(Instr)) { auto NewGnuArgsSize = BC.MIA->getGnuArgsSize(Instr); - if (NewGnuArgsSize < 0) { - errs() << "XXX: in function " << *this << '\n'; - } assert(NewGnuArgsSize >= 0 && "expected non-negative GNU_args_size"); if (NewGnuArgsSize != CurrentGnuArgsSize) { CurrentGnuArgsSize = NewGnuArgsSize; @@ -2688,7 +2679,7 @@ void BinaryFunction::dumpGraph(raw_ostream& OS) const { BB->getOffset(), getIndex(BB), Layout, - BBCFIState[getIndex(BB)]); + BB->getCFIState()); OS << format("\"%s\" [shape=box]\n", BB->getName().data()); if (opts::DotToolTipCode) { std::string Str; @@ -3095,7 +3086,7 @@ __attribute__((noinline)) BinaryFunction::BasicBlockOrderType BinaryFunction::df bool BinaryFunction::isIdenticalWith(const BinaryFunction &OtherBF, bool IgnoreSymbols, bool UseDFS) const { - assert(CurrentState == State::CFG && OtherBF.CurrentState == State::CFG); + assert(hasCFG() && OtherBF.hasCFG() && "both functions should have CFG"); // Compare the two functions, one basic block at a time. 
// Currently we require two identical basic blocks to have identical @@ -3261,7 +3252,7 @@ bool BinaryFunction::equalJumpTables(const JumpTable *JumpTableA, } std::size_t BinaryFunction::hash(bool Recompute, bool UseDFS) const { - assert(CurrentState == State::CFG); + assert(hasCFG() && "function is expected to have CFG"); if (!Recompute) return Hash; @@ -3343,13 +3334,11 @@ void BinaryFunction::updateBBIndices(const unsigned StartIndex) { void BinaryFunction::updateCFIState(BinaryBasicBlock *Start, const unsigned NumNewBlocks) { assert(TailCallTerminatedBlocks.empty()); - auto PartialCFIState = annotateCFIState(&(*Start->rbegin())); - const auto StartIndex = getIndex(Start); - BBCFIState.insert(BBCFIState.begin() + StartIndex + 1, - NumNewBlocks, - PartialCFIState.back()); - assert(BBCFIState.size() == BasicBlocks.size() + 1); - fixCFIState(); + const auto CFIState = Start->getCFIStateAtExit(); + const auto StartIndex = getIndex(Start) + 1; + for (unsigned I = 0; I < NumNewBlocks; ++I) { + BasicBlocks[StartIndex + I]->setCFIState(CFIState); + } } void BinaryFunction::updateLayout(BinaryBasicBlock* Start, diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 7952b0561cb2..03b4a82dc350 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -159,10 +159,11 @@ enum JumpTableSupportLevel : char { class BinaryFunction : public AddressRangesOwner { public: enum class State : char { - Empty = 0, /// Function body is empty - Disassembled, /// Function have been disassembled - CFG, /// Control flow graph have been built - Assembled, /// Function has been assembled in memory + Empty = 0, /// Function body is empty. + Disassembled, /// Function have been disassembled. + CFG, /// Control flow graph have been built. + CFG_Finalized, /// CFG is finalized. No optimizations allowed. + Assembled, /// Function has been assembled in memory. }; /// Settings for splitting function bodies into hot/cold partitions. 
@@ -336,6 +337,11 @@ class BinaryFunction : public AddressRangesOwner { /// Update the indices of all the basic blocks starting at StartIndex. void updateBBIndices(const unsigned StartIndex); + /// Annotate each basic block entry with its current CFI state. This is + /// run right after the construction of CFG while basic blocks are in their + /// original order. + void annotateCFIState(); + /// Helper function that compares an instruction of this function to the /// given instruction of the given function. The functions should have /// identical CFG. @@ -450,6 +456,28 @@ class BinaryFunction : public AddressRangesOwner { using CFIInstrMapType = std::vector; using cfi_iterator = CFIInstrMapType::iterator; using const_cfi_iterator = CFIInstrMapType::const_iterator; + + /// We don't decode Call Frame Info encoded in DWARF program state + /// machine. Instead we define a "CFI State" - a frame information that + /// is a result of executing FDE CFI program up to a given point. The + /// program consists of opaque Call Frame Instructions: + /// + /// CFI #0 + /// CFI #1 + /// .... + /// CFI #N + /// + /// When we refer to "CFI State K" - it corresponds to a row in an abstract + /// Call Frame Info table. This row is reached right before executing CFI #K. + /// + /// At any point of execution in a function we are in any one of (N + 2) + /// states described in the original FDE program. We can't have more states + /// without intelligent processing of CFIs. + /// + /// When the final layout of basic blocks is known, and we finalize CFG, + /// we modify the original program to make sure the same state could be + /// reached even when basic blocks containing CFI instructions are executed + /// in a different order. CFIInstrMapType FrameInstructions; /// Exception handling ranges. 
@@ -615,13 +643,6 @@ class BinaryFunction : public AddressRangesOwner { }; std::vector BasicBlockOffsets; - // At each basic block entry we attach a CFI state to detect if reordering - // corrupts the CFI state for a block. The CFI state is simply the index in - // FrameInstructions for the CFI responsible for creating this state. - // This vector is indexed by BB index. - using CFIStateVector = std::vector; - CFIStateVector BBCFIState; - /// Symbol in the output. /// /// NB: function can have multiple symbols associated with it. We will emit @@ -895,10 +916,18 @@ class BinaryFunction : public AddressRangesOwner { return Names; } + /// Return a state the function is in (see BinaryFunction::State definition + /// for description). State getState() const { return CurrentState; } + /// Return true if function has a control flow graph available. + bool hasCFG() const { + return getState() == State::CFG || + getState() == State::CFG_Finalized; + } + /// Return containing file section. SectionRef getSection() const { return Section; @@ -1511,14 +1540,9 @@ class BinaryFunction : public AddressRangesOwner { /// and size. uint64_t getFunctionScore(); - /// Annotate each basic block entry with its current CFI state. This is used - /// to detect when reordering changes the CFI state seen by a basic block and - /// fix this. - /// The CFI state is simply the index in FrameInstructions for the - /// MCCFIInstruction object responsible for this state. - /// If Stop is not null, the annotation will exit early once the scan finishes - /// with the Stop instruction. - CFIStateVector annotateCFIState(const MCInst *Stop = nullptr); + const CFIInstrMapType &getFDEProgram() const { + return FrameInstructions; + } /// After reordering, this function checks the state of CFI and fixes it if it /// is corrupted. If it is unable to fix it, it returns false. 
@@ -1545,6 +1569,11 @@ class BinaryFunction : public AddressRangesOwner { /// When we reverse the branch condition, the CFG is updated accordingly. void fixBranches(); + /// Mark function as finalized. No further optimizations are permitted. + void setFinalized() { + CurrentState = State::CFG_Finalized; + } + /// Split function in two: a part with warm or hot BBs and a part with never /// executed BBs. The cold part is moved to a new BinaryFunction. void splitFunction(); @@ -1712,6 +1741,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, case BinaryFunction::State::Empty: OS << "empty"; break; case BinaryFunction::State::Disassembled: OS << "disassembled"; break; case BinaryFunction::State::CFG: OS << "CFG constructed"; break; + case BinaryFunction::State::CFG_Finalized:OS << "CFG finalized"; break; case BinaryFunction::State::Assembled: OS << "assembled"; break; } diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index ade1298e887e..d68d0378c2d0 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -95,8 +95,8 @@ PrintAfterBranchFixup("print-after-branch-fixup", cl::Hidden); static cl::opt -PrintAfterFixup("print-after-fixup", - cl::desc("print function after fixup"), +PrintFinalized("print-finalized", + cl::desc("print function after CFG is finalized"), cl::Hidden); static cl::opt @@ -140,7 +140,7 @@ PrintICP("print-icp", cl::desc("print functions after indirect call promotion"), cl::ZeroOrMore, cl::Hidden); - + static cl::opt PrintInline("print-inline", cl::desc("print functions after inlining optimization"), @@ -283,7 +283,7 @@ void BinaryFunctionPassManager::runAllPasses( llvm::make_unique(PrintUCE), opts::EliminateUnreachable); - Manager.registerPass(llvm::make_unique(PrintAfterFixup)); + Manager.registerPass(llvm::make_unique(PrintFinalized)); Manager.registerPass( llvm::make_unique(PrintAfterLowering)); diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 5c713941c43f..4e7a39f1cb00 100644 --- 
a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -302,7 +302,7 @@ void BinaryFunction::updateEHRanges() { if (getSize() == 0) return; - assert(CurrentState == State::CFG && "unexpected state"); + assert(CurrentState == State::CFG_Finalized && "unexpected state"); // Build call sites table. struct EHInfo { diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index d693bbb85555..b20aa19d2333 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -378,7 +378,7 @@ void FixupBranches::runOnFunctions( } } -void FixupFunctions::runOnFunctions( +void FinalizeFunctions::runOnFunctions( BinaryContext &BC, std::map &BFs, std::set & @@ -394,17 +394,15 @@ void FixupFunctions::runOnFunctions( if (shouldOptimize(Function) && !Function.fixCFIState()) { if (opts::Relocs) { errs() << "BOLT-ERROR: unable to fix CFI state for function " - << Function << ". Aborting.\n"; - abort(); - } - if (opts::Verbosity >= 1) { - errs() << "BOLT-WARNING: unable to fix CFI state for function " - << Function << ". Skipping.\n"; + << Function << ". Exiting.\n"; + exit(1); } Function.setSimple(false); continue; } + Function.setFinalized(); + // Update exception handling information. Function.updateEHRanges(); } @@ -592,7 +590,7 @@ void Peepholes::shortenInstructions(BinaryContext &BC, void debugDump(BinaryFunction *BF) { BF->dump(); } - + // This peephole fixes jump instructions that jump to another basic // block with a single jump instruction, e.g. 
// @@ -1396,7 +1394,7 @@ IndirectCallPromotion::printCallsiteInfo(const BinaryBasicBlock *BB, BC.printInstruction(dbgs(), Inst, Targets[0].From.Offset, nullptr, true); }); } - + void IndirectCallPromotion::runOnFunctions( BinaryContext &BC, std::map &BFs, diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index 7ed78e6eea71..9caecc1a3520 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -139,13 +139,13 @@ class FixupBranches : public BinaryFunctionPass { /// Fix the CFI state and exception handling information after all other /// passes have completed. -class FixupFunctions : public BinaryFunctionPass { +class FinalizeFunctions : public BinaryFunctionPass { public: - explicit FixupFunctions(const cl::opt &PrintPass) + explicit FinalizeFunctions(const cl::opt &PrintPass) : BinaryFunctionPass(PrintPass) { } const char *getName() const override { - return "fixup-functions"; + return "finalize-functions"; } void runOnFunctions(BinaryContext &BC, std::map &BFs, diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 63b0aecdf143..61b6b95f5ebb 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2751,7 +2751,6 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { // New section header string table goes last. - // Fix ELF header. auto NewEhdr = *Obj->getHeader(); NewEhdr.e_entry = EntryPoint; From 40b3643f2e9f2f30906cc512934fe153db11eadb Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 3 Mar 2017 11:35:41 -0800 Subject: [PATCH 223/904] [BOLT] Detect and handle __builtin_unreachable(). Summary: Calls to __builtin_unreachable() can result in a inconsistent CFG. It was possible for basic block to end with a conditional branche and have a single successor. Or there could exist non-terminated basic block without successors. We also often treated conditional jumps with destination past the end of a function as conditional tail calls. 
This can be prevented reliably at least when the byte past the end of the function does not belong to the next function. This diff includes several changes: * At disassembly stage jumps past the end of a function are converted into 'nops'. This is done only for cases when we can guarantee that the jump is not a tail call. Conversion to nop is required since the instruction could be referenced either by exception handling tables and/or debug info. Nops are later removed. * In CFG insert 'ret' into non-terminated basic blocks without successors (this almost never happens). * Conditional jumps at the end of the function are removed from CFG. The block will still have a single successor. * Cases where a destination of a jump instruction is the start of the next function, are still conservatively handled as (conditional) tail calls. (cherry picked from commit 5daf0410008d70b1c672938aa50ca5149dcb8647) --- bolt/BinaryBasicBlock.h | 7 +- bolt/BinaryContext.h | 7 +- bolt/BinaryFunction.cpp | 119 ++++++-- bolt/BinaryFunction.h | 57 +++- bolt/BinaryPassManager.cpp | 9 + bolt/CMakeLists.txt | 2 +- bolt/Passes/BinaryPasses.cpp | 421 ++++++++++++++++++++++++- bolt/Passes/BinaryPasses.h | 24 ++ bolt/Passes/CMakeLists.txt | 2 + bolt/Passes/HFSort.cpp | 489 +++++++++++++++++++++++++++++ bolt/Passes/HFSort.h | 195 ++++++++++++ bolt/Passes/HFSortPlus.cpp | 508 +++++++++++++++++++++++++++++++ bolt/Passes/ReorderAlgorithm.cpp | 2 +- bolt/RewriteInstance.cpp | 62 ++-- 14 files changed, 1836 insertions(+), 68 deletions(-) create mode 100644 bolt/Passes/HFSort.cpp create mode 100644 bolt/Passes/HFSort.h create mode 100644 bolt/Passes/HFSortPlus.cpp diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 8fe86129f189..5caf1e8f1ed5 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -528,6 +528,11 @@ class BinaryBasicBlock { return Itr != Successors.end(); } + /// Test if this BB has a valid execution count. 
+ bool hasProfile() const { + return ExecutionCount != COUNT_NO_PROFILE; + } + /// Return the information about the number of times this basic block was /// executed. /// @@ -539,7 +544,7 @@ class BinaryBasicBlock { /// Return the execution count for blocks with known profile. /// Return 0 if the block has no profile. uint64_t getKnownExecutionCount() const { - return ExecutionCount == COUNT_NO_PROFILE ? 0 : ExecutionCount; + return !hasProfile() ? 0 : ExecutionCount; } /// Set the execution count for this block. diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 969f434f2049..a98c5c9f7cb6 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -96,7 +96,7 @@ class BinaryContext { /// As we fold identical functions, multiple symbols can point /// to the same BinaryFunction. std::unordered_map SymbolToFunctionMap; + BinaryFunction *> SymbolToFunctionMap; /// Map virtual address to a section. std::map AllocatableSections; @@ -242,6 +242,11 @@ class BinaryContext { return BFI == SymbolToFunctionMap.end() ? nullptr : BFI->second; } + BinaryFunction *getFunctionForSymbol(const MCSymbol *Symbol) { + auto BFI = SymbolToFunctionMap.find(Symbol); + return BFI == SymbolToFunctionMap.end() ? nullptr : BFI->second; + } + /// Populate some internal data structures with debug info. 
void preprocessDebugInfo( std::map &BinaryFunctions); diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index e69316abb5e9..726c78d66d41 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -970,9 +970,9 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { return true; }; - for (uint64_t Offset = 0; Offset < getSize(); ) { + uint64_t Size = 0; // instruction size + for (uint64_t Offset = 0; Offset < getSize(); Offset += Size) { MCInst Instruction; - uint64_t Size; const uint64_t AbsoluteInstrAddr = getAddress() + Offset; if (!BC.DisAsm->getInstruction(Instruction, @@ -1073,6 +1073,20 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { if (containsAddress(TargetAddress)) { TargetSymbol = getOrCreateLocalLabel(TargetAddress); } else { + if (TargetAddress == getAddress() + getSize() && + TargetAddress < getAddress() + getMaxSize()) { + // Result of __builtin_unreachable(). + DEBUG(dbgs() << "BOLT-DEBUG: jump past end detected at 0x" + << Twine::utohexstr(AbsoluteInstrAddr) + << " in function " << *this + << " : replacing with nop.\n"); + BC.MIA->createNoop(Instruction); + if (IsCondBranch) { + // Register FT branch for passing function profile validation. + FTBranches.emplace_back(Offset, Offset + Size); + } + goto add_instruction; + } BC.InterproceduralReferences.insert(TargetAddress); if (opts::Verbosity >= 2 && !IsCall && Size == 2 && !opts::Relocs) { errs() << "BOLT-WARNING: relaxed tail call detected at 0x" @@ -1159,13 +1173,19 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { MCSymbolRefExpr::VK_None, *Ctx))); - if (isIndirect && BranchDataOrErr) { - MIA->addAnnotation(Ctx.get(), Instruction, "IndirectBranchData", - Offset); + if (BranchDataOrErr) { + if (IsCall) { + MIA->addAnnotation(Ctx.get(), Instruction, "EdgeCountData", Offset); + } + if (isIndirect) { + MIA->addAnnotation(Ctx.get(), Instruction, "IndirectBranchData", + Offset); + } } } else { // Could not evaluate branch. 
Should be an indirect call or an // indirect branch. Bail out on the latter case. + bool MaybeEdgeCountData = false; if (MIA->isIndirectBranch(Instruction)) { auto Result = analyzeIndirectBranch(Instruction, Size, Offset); switch (Result) { @@ -1185,10 +1205,12 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { case IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE: if (opts::JumpTables == JTS_NONE) IsSimple = false; + MaybeEdgeCountData = true; break; case IndirectBranchType::UNKNOWN: // Keep processing. We'll do more checks and fixes in // postProcessIndirectBranches(). + MaybeEdgeCountData = true; if (BranchDataOrErr) { MIA->addAnnotation(Ctx.get(), Instruction, @@ -1203,6 +1225,11 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { Offset); } } + if (BranchDataOrErr) { + const char* AttrName = + MaybeEdgeCountData ? "MaybeEdgeCountData" : "EdgeCountData"; + MIA->addAnnotation(Ctx.get(), Instruction, AttrName, Offset); + } // Indirect call. We only need to fix it if the operand is RIP-relative if (IsSimple && MIA->hasRIPOperand(Instruction)) { if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { @@ -1224,14 +1251,13 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { } } +add_instruction: if (ULT.first && ULT.second) { Instruction.setLoc( findDebugLineInformationForInstructionAt(AbsoluteInstrAddr, ULT)); } addInstruction(Offset, std::move(Instruction)); - - Offset += Size; } postProcessJumpTables(); @@ -1304,15 +1330,12 @@ bool BinaryFunction::postProcessIndirectBranches() { // it must be a tail call. 
if (layout_size() == 1) { BC.MIA->convertJmpToTailCall(Instr); - - if (BC.MIA->hasAnnotation(Instr, "MaybeIndirectBranchData")) { - auto Offset = - BC.MIA->getAnnotationAs(Instr, "MaybeIndirectBranchData"); - BC.MIA->addAnnotation(BC.Ctx.get(), - Instr, - "IndirectBranchData", - Offset); - } + BC.MIA->renameAnnotation(Instr, + "MaybeEdgeCountData", + "EdgeCountData"); + BC.MIA->renameAnnotation(Instr, + "MaybeIndirectBranchData", + "IndirectBranchData"); return true; } @@ -1392,15 +1415,12 @@ bool BinaryFunction::postProcessIndirectBranches() { return false; } BC.MIA->convertJmpToTailCall(Instr); - - if (BranchDataOrErr) { - auto Offset = - BC.MIA->getAnnotationAs(Instr, "MaybeIndirectBranchData"); - BC.MIA->addAnnotation(BC.Ctx.get(), - Instr, - "IndirectBranchData", - Offset); - } + BC.MIA->renameAnnotation(Instr, + "MaybeEdgeCountData", + "EdgeCountData"); + BC.MIA->renameAnnotation(Instr, + "MaybeIndirectBranchData", + "IndirectBranchData"); } } return true; @@ -1662,7 +1682,7 @@ bool BinaryFunction::buildCFG() { // Try to find the destination basic block. If the jump instruction was // followed by a no-op then the destination offset recorded in FTBranches // will point to that no-op but the destination basic block will start - // after the no-op due to ingoring no-ops when creating basic blocks. + // after the no-op due to ignoring no-ops when creating basic blocks. // So we have to skip any no-ops when trying to find the destination // basic block. auto *ToBB = getBasicBlockAtOffset(Branch.second); @@ -1678,6 +1698,8 @@ bool BinaryFunction::buildCFG() { // We have a fall-through that does not point to another BB, ignore it // as it may happen in cases where we have a BB finished by two // branches. + // This can also happen when we delete a branch past the end of a + // function in case of a call to __builtin_unreachable(). 
continue; } } @@ -1796,6 +1818,9 @@ bool BinaryFunction::buildCFG() { setSimple(false); } + // Eliminate inconsistencies between branch instructions and CFG. + postProcessBranches(); + // Clean-up memory taken by instructions and labels. // // NB: don't clear Labels list as we may need them if we mark the function @@ -2984,6 +3009,48 @@ void BinaryFunction::propagateGnuArgsSizeInfo() { } } +void BinaryFunction::postProcessBranches() { + if (!isSimple()) + return; + for (auto *BB : BasicBlocksLayout) { + auto LastInstrRI = BB->getLastNonPseudo(); + if (BB->succ_size() == 1) { + if (LastInstrRI != BB->rend() && + BC.MIA->isConditionalBranch(*LastInstrRI)) { + // __builtin_unreachable() could create a conditional branch that + // falls-through into the next function - hence the block will have only + // one valid successor. Such behaviour is undefined and thus we remove + // the conditional branch while leaving a valid successor. + assert(BB == BasicBlocksLayout.back() && "last basic block expected"); + BB->eraseInstruction(std::next(LastInstrRI.base())); + DEBUG(dbgs() << "BOLT-DEBUG: erasing conditional branch in " + << BB->getName() << " in function " << *this << '\n'); + } + } else if (BB->succ_size() == 0) { + // Ignore unreachable basic blocks. + if (BB->pred_size() == 0 || BB->isLandingPad()) + continue; + + // If it's the basic block that does not end up with a terminator - we + // insert a return instruction unless it's a call instruction. 
+ if (LastInstrRI == BB->rend()) { + DEBUG(dbgs() << "BOLT-DEBUG: at least one instruction expected in BB " + << BB->getName() << " in function " << *this << '\n'); + continue; + } + if (!BC.MIA->isTerminator(*LastInstrRI) && + !BC.MIA->isCall(*LastInstrRI)) { + DEBUG(dbgs() << "BOLT-DEBUG: adding return to basic block " + << BB->getName() << " in function " << *this << '\n'); + MCInst ReturnInstr; + BC.MIA->createReturn(ReturnInstr); + BB->addInstruction(ReturnInstr); + } + } + } + assert(validateCFG() && "invalid CFG"); +} + void BinaryFunction::mergeProfileDataInto(BinaryFunction &BF) const { // No reason to merge invalid or empty profiles into BF. if (!hasValidProfile()) diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 03b4a82dc350..70c93792f74f 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -197,6 +197,23 @@ class BinaryFunction : public AddressRangesOwner { LT_OPTIMIZE_SHUFFLE, }; + enum JumpTableSupportLevel : char { + JTS_NONE = 0, /// Disable jump tables support + JTS_BASIC = 1, /// Enable basic jump tables support + JTS_SPLIT = 2, /// Enable hot/cold splitting of jump tables + JTS_AGGRESSIVE = 3, /// Aggressive splitting of jump tables + }; + + enum ReorderType : char { + RT_NONE = 0, + RT_EXEC_COUNT, + RT_HFSORT, + RT_HFSORT_PLUS, + RT_PETTIS_HANSEN, + RT_RANDOM, + RT_USER + }; + static constexpr uint64_t COUNT_NO_PROFILE = BinaryBasicBlock::COUNT_NO_PROFILE; // Function size, in number of BBs, above which we fallback to a heuristic @@ -301,6 +318,9 @@ class BinaryFunction : public AddressRangesOwner { /// Last computed hash value. mutable uint64_t Hash{0}; + /// Function order for streaming into the destination binary. + uint32_t Index{-1U}; + /// Get basic block index assuming it belongs to this function. unsigned getIndex(const BinaryBasicBlock *BB) const { assert(BB->getIndex() < BasicBlocks.size()); @@ -342,6 +362,13 @@ class BinaryFunction : public AddressRangesOwner { /// original order. 
void annotateCFIState(); + /// Associate DW_CFA_GNU_args_size info with invoke instructions + /// (call instructions with non-empty landing pad). + void propagateGnuArgsSizeInfo(); + + /// Synchronize branch instructions with CFG. + void postProcessBranches(); + /// Helper function that compares an instruction of this function to the /// given instruction of the given function. The functions should have /// identical CFG. @@ -938,6 +965,22 @@ class BinaryFunction : public AddressRangesOwner { return Address; } + /// Does this function have a valid streaming order index? + bool hasValidIndex() const { + return Index != -1U; + } + + /// Get the streaming order index for this function. + uint32_t getIndex() const { + return Index; + } + + /// Set the streaming order index for this function. + void setIndex(uint32_t Idx) { + assert(!hasValidIndex()); + Index = Idx; + } + /// Get the original address for the given basic block within this function. uint64_t getBasicBlockOriginalAddress(const BinaryBasicBlock *BB) const { return Address + BB->getOffset(); @@ -1548,10 +1591,6 @@ class BinaryFunction : public AddressRangesOwner { /// is corrupted. If it is unable to fix it, it returns false. bool fixCFIState(); - /// Associate DW_CFA_GNU_args_size info with invoke instructions - /// (call instructions with non-empty landing pad). - void propagateGnuArgsSizeInfo(); - /// Adjust branch instructions to match the CFG. 
/// /// As it comes to internal branches, the CFG represents "the ultimate source @@ -1655,13 +1694,21 @@ class BinaryFunction : public AddressRangesOwner { size_t estimateHotSize() const { size_t Estimate = 0; for (const auto *BB : BasicBlocksLayout) { - if (BB->getExecutionCount() != 0) { + if (BB->getKnownExecutionCount() != 0) { Estimate += BC.computeCodeSize(BB->begin(), BB->end()); } } return Estimate; } + size_t estimateSize() const { + size_t Estimate = 0; + for (const auto *BB : BasicBlocksLayout) { + Estimate += BC.computeCodeSize(BB->begin(), BB->end()); + } + return Estimate; + } + virtual ~BinaryFunction(); /// Info for fragmented functions. diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index d68d0378c2d0..1364452ba35b 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -83,6 +83,12 @@ PrintReordered("print-reordered", cl::ZeroOrMore, cl::Hidden); +static cl::opt +PrintReorderedFunctions("print-reordered-functions", + cl::desc("print functions after clustering"), + cl::ZeroOrMore, + cl::Hidden); + static cl::opt PrintOptimizeBodyless("print-optimize-bodyless", cl::desc("print functions after bodyless optimization"), @@ -283,6 +289,9 @@ void BinaryFunctionPassManager::runAllPasses( llvm::make_unique(PrintUCE), opts::EliminateUnreachable); + Manager.registerPass( + llvm::make_unique(PrintReorderedFunctions)); + Manager.registerPass(llvm::make_unique(PrintFinalized)); Manager.registerPass( diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index fd9aa6884b72..95b9e1fe4019 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -23,7 +23,7 @@ add_llvm_tool(llvm-bolt BinaryPassManager.cpp DataReader.cpp DebugData.cpp + DWARFRewriter.cpp Exceptions.cpp RewriteInstance.cpp - DWARFRewriter.cpp ) diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index b20aa19d2333..fad3437f2417 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -10,8 +10,11 @@ 
//===----------------------------------------------------------------------===// #include "BinaryPasses.h" +#include "HFSort.h" #include "llvm/Support/Options.h" +#include + #define DEBUG_TYPE "bolt" using namespace llvm; @@ -47,9 +50,11 @@ const char* dynoStatsOptDesc(const bolt::DynoStats::Category C) { namespace opts { extern cl::opt Verbosity; +extern cl::opt RandomSeed; extern cl::opt Relocs; -extern cl::opt SplitFunctions; +extern cl::opt SplitFunctions; extern bool shouldProcess(const bolt::BinaryFunction &Function); +extern size_t padFunction(const bolt::BinaryFunction &Function); static cl::opt IndirectCallPromotionThreshold( @@ -198,6 +203,53 @@ UseDFSForICF( cl::ReallyHidden, cl::ZeroOrMore); +cl::opt +ReorderFunctions( + "reorder-functions", + cl::desc("reorder and cluster functions (works only with relocations)"), + cl::init(bolt::BinaryFunction::RT_NONE), + cl::values(clEnumValN(bolt::BinaryFunction::RT_NONE, + "none", + "do not reorder functions"), + clEnumValN(bolt::BinaryFunction::RT_EXEC_COUNT, + "exec-count", + "order by execution count"), + clEnumValN(bolt::BinaryFunction::RT_HFSORT, + "hfsort", + "use hfsort algorithm"), + clEnumValN(bolt::BinaryFunction::RT_HFSORT_PLUS, + "hfsort+", + "use hfsort+ algorithm"), + clEnumValN(bolt::BinaryFunction::RT_PETTIS_HANSEN, + "pettis-hansen", + "use Pettis-Hansen algorithm"), + clEnumValN(bolt::BinaryFunction::RT_RANDOM, + "random", + "reorder functions randomly"), + clEnumValN(bolt::BinaryFunction::RT_USER, + "user", + "use function order specified by -function-order"), + clEnumValEnd)); + +static cl::opt +FunctionOrderFile("function-order", + cl::desc("file containing an ordered list of functions to use" + " for function reordering")); + +static cl::opt +ReorderFunctionsUseHotSize( + "reorder-functions-use-hot-size", + cl::desc("use a function's hot size when doing clustering"), + cl::init(true), + cl::ZeroOrMore); + +static cl::opt +UseEdgeCounts( + "use-edge-counts", + cl::desc("use edge count data 
when doing clustering"), + cl::init(true), + cl::ZeroOrMore); + } // namespace opts namespace llvm { @@ -257,9 +309,7 @@ void OptimizeBodylessFunctions::optimizeCalls(BinaryFunction &BF, if (Target == OriginalTarget) continue; DEBUG(dbgs() << "BOLT-DEBUG: Optimizing " << BB->getName() - << " (executed " << (BB->getExecutionCount() == - BinaryFunction::COUNT_NO_PROFILE ? - 0 : BB->getExecutionCount()) + << " (executed " << BB->getKnownExecutionCount() << " times) in " << BF << ": replacing call to " << OriginalTarget->getName() << " by call to " << Target->getName() @@ -267,7 +317,7 @@ void OptimizeBodylessFunctions::optimizeCalls(BinaryFunction &BF, BC.MIA->replaceCallTargetOperand(Inst, Target, BC.Ctx.get()); NumOptimizedCallSites += CallSites; - if (BB->getExecutionCount() != BinaryFunction::COUNT_NO_PROFILE) { + if (BB->hasProfile()) { NumEliminatedCalls += CallSites * BB->getExecutionCount(); } } @@ -757,12 +807,12 @@ bool SimplifyRODataLoads::simplifyRODataLoads( } ++NumLocalLoadsFound; - if (BB->getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE) + if (BB->hasProfile()) NumDynamicLocalLoadsFound += BB->getExecutionCount(); if (MIA->replaceMemOperandWithImm(Inst, ConstantData, Offset)) { ++NumLocalLoadsSimplified; - if (BB->getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE) + if (BB->hasProfile()) NumDynamicLocalLoadsSimplified += BB->getExecutionCount(); } } @@ -1607,5 +1657,362 @@ void StripRepRet::runOnFunctions( } } +void ReorderFunctions::buildCallGraph(BinaryContext &BC, + std::map &BFs) { + // Add call graph nodes. + auto lookupNode = [&](BinaryFunction *Function) { + auto It = FuncToTargetId.find(Function); + if (It == FuncToTargetId.end()) { + // It's ok to use the hot size here when the function is split. This is + // because emitFunctions will emit the hot part first in the order that is + // computed by ReorderFunctions. The cold part will be emitted with the + // rest of the cold functions and code. 
+ const auto Size = opts::ReorderFunctionsUseHotSize && Function->isSplit() + ? Function->estimateHotSize() + : Function->estimateSize(); + const auto Id = Cg.addTarget(Size); + assert(size_t(Id) == Funcs.size()); + Funcs.push_back(Function); + FuncToTargetId[Function] = Id; + // NOTE: for functions without a profile, we set the number of samples + // to zero. This will keep these functions from appearing in the hot + // section. This is a little weird because we wouldn't be trying to + // create a node for a function unless it was the target of a call from + // a hot block. The alternative would be to set the count to one or + // accumulate the number of calls from the callsite into the function + // samples. Results from perfomance testing seem to favor the zero + // count though, so I'm leaving it this way for now. + Cg.Targets[Id].Samples = Function->hasProfile() ? Function->getExecutionCount() : 0; + assert(Funcs[Id] == Function); + return Id; + } else { + return It->second; + } + }; + + // Add call graph edges. 
+ uint64_t NotFound = 0; + uint64_t TotalCalls = 0; + for (auto &It : BFs) { + auto *Function = &It.second; + + if(!shouldOptimize(*Function) || !Function->hasProfile()) { + continue; + } + + auto BranchDataOrErr = BC.DR.getFuncBranchData(Function->getNames()); + const auto SrcId = lookupNode(Function); + uint64_t Offset = Function->getAddress(); + + auto recordCall = [&](const MCSymbol *DestSymbol, const uint64_t Count) { + if (auto *DstFunc = BC.getFunctionForSymbol(DestSymbol)) { + const auto DstId = lookupNode(DstFunc); + auto &A = Cg.incArcWeight(SrcId, DstId, Count); + if (!opts::UseEdgeCounts) { + A.AvgCallOffset += (Offset - DstFunc->getAddress()); + } + DEBUG(dbgs() << "BOLT-DEBUG: Reorder functions: call " << *Function + << " -> " << *DstFunc << " @ " << Offset << "\n"); + return true; + } + return false; + }; + + for (auto *BB : Function->layout()) { + if (!BB->isCold()) { // Don't count calls from cold blocks + for (auto &Inst : *BB) { + // Find call instructions and extract target symbols from each one. + bool Success = false; + if (BC.MIA->isCall(Inst)) + ++TotalCalls; + + if (const auto *DstSym = BC.MIA->getTargetSymbol(Inst)) { + // For direct calls, just use the BB execution count. + assert(BB->hasProfile()); + const auto Count = opts::UseEdgeCounts ? BB->getExecutionCount() : 1; + Success = recordCall(DstSym, Count); + } else if (BC.MIA->hasAnnotation(Inst, "EdgeCountData")) { + // For indirect calls and jump tables, use branch data. 
+ assert(BranchDataOrErr); + const FuncBranchData &BranchData = BranchDataOrErr.get(); + const auto DataOffset = + BC.MIA->getAnnotationAs(Inst, "EdgeCountData"); + + for (const auto &BI : BranchData.getBranchRange(DataOffset)) { + if (!BI.To.IsSymbol) { + continue; + } + + auto Itr = BC.GlobalSymbols.find(BI.To.Name); + if (Itr == BC.GlobalSymbols.end()) { + continue; + } + + const auto *DstSym = + BC.getOrCreateGlobalSymbol(Itr->second, "FUNCat"); + + assert(BI.Branches > 0); + Success = recordCall(DstSym, opts::UseEdgeCounts ? BI.Branches : 1); + } + } + + if (!Success) + ++NotFound; + + if (!opts::UseEdgeCounts) { + Offset += BC.computeCodeSize(&Inst, &Inst + 1); + } + } + } + } + } + outs() << "BOLT-INFO: ReorderFunctions: " << NotFound << " calls not " + << " processed out of " << TotalCalls << "\n"; + + // Normalize arc weights. + if (!opts::UseEdgeCounts) { + for (TargetId FuncId = 0; FuncId < Cg.Targets.size(); ++FuncId) { + auto& Func = Cg.Targets[FuncId]; + for (auto Caller : Func.Preds) { + auto& A = *Cg.Arcs.find(Arc(Caller, FuncId)); + A.NormalizedWeight = A.Weight / Func.Samples; + A.AvgCallOffset /= A.Weight; + assert(A.AvgCallOffset < Cg.Targets[Caller].Size); + } + } + } else { + for (TargetId FuncId = 0; FuncId < Cg.Targets.size(); ++FuncId) { + auto &Func = Cg.Targets[FuncId]; + for (auto Caller : Func.Preds) { + auto& A = *Cg.Arcs.find(Arc(Caller, FuncId)); + A.NormalizedWeight = A.Weight / Func.Samples; + } + } + } +} + +void ReorderFunctions::reorder(std::vector &&Clusters, + std::map &BFs) { + std::vector FuncAddr(Cg.Targets.size()); // Just for computing stats + uint64_t TotalSize = 0; + uint32_t Index = 0; + + // Set order of hot functions based on clusters. 
+ for (const auto& Cluster : Clusters) { + for (const auto FuncId : Cluster.Targets) { + assert(Cg.Targets[FuncId].Samples > 0); + Funcs[FuncId]->setIndex(Index++); + FuncAddr[FuncId] = TotalSize; + TotalSize += Cg.Targets[FuncId].Size; + } + } + + if (opts::Verbosity > 0 || (DebugFlag && isCurrentDebugType("hfsort"))) { + uint64_t TotalSize = 0; + uint64_t CurPage = 0; + uint64_t Hotfuncs = 0; + double TotalDistance = 0; + double TotalCalls = 0; + double TotalCalls64B = 0; + double TotalCalls4KB = 0; + double TotalCalls2MB = 0; + dbgs() << "============== page 0 ==============\n"; + for (auto& Cluster : Clusters) { + dbgs() << + format("-------- density = %.3lf (%u / %u) --------\n", + (double) Cluster.Samples / Cluster.Size, + Cluster.Samples, Cluster.Size); + + for (auto FuncId : Cluster.Targets) { + if (Cg.Targets[FuncId].Samples > 0) { + Hotfuncs++; + + dbgs() << "BOLT-INFO: hot func " << *Funcs[FuncId] + << " (" << Cg.Targets[FuncId].Size << ")\n"; + + uint64_t Dist = 0; + uint64_t Calls = 0; + for (auto Dst : Cg.Targets[FuncId].Succs) { + auto& A = *Cg.Arcs.find(Arc(FuncId, Dst)); + auto D = + std::abs(FuncAddr[A.Dst] - (FuncAddr[FuncId] + A.AvgCallOffset)); + auto W = A.Weight; + Calls += W; + if (D < 64) TotalCalls64B += W; + if (D < 4096) TotalCalls4KB += W; + if (D < (2 << 20)) TotalCalls2MB += W; + Dist += A.Weight * D; + dbgs() << format("arc: %u [@%lu+%.1lf] -> %u [@%lu]: " + "weight = %.0lf, callDist = %f\n", + A.Src, FuncAddr[A.Src], A.AvgCallOffset, + A.Dst, FuncAddr[A.Dst], A.Weight, D); + } + TotalCalls += Calls; + TotalDistance += Dist; + dbgs() << format("start = %6u : avgCallDist = %lu : %s\n", + TotalSize, + Calls ? 
Dist / Calls : 0, + Funcs[FuncId]->getPrintName().c_str()); + TotalSize += Cg.Targets[FuncId].Size; + auto NewPage = TotalSize / PageSize; + if (NewPage != CurPage) { + CurPage = NewPage; + dbgs() << format("============== page %u ==============\n", CurPage); + } + } + } + } + dbgs() << format(" Number of hot functions: %u\n" + " Number of clusters: %lu\n", + Hotfuncs, Clusters.size()) + << format(" Final average call distance = %.1lf (%.0lf / %.0lf)\n", + TotalCalls ? TotalDistance / TotalCalls : 0, + TotalDistance, TotalCalls) + << format(" Total Calls = %.0lf\n", TotalCalls); + if (TotalCalls) { + dbgs() << format(" Total Calls within 64B = %.0lf (%.2lf%%)\n", + TotalCalls64B, 100 * TotalCalls64B / TotalCalls) + << format(" Total Calls within 4KB = %.0lf (%.2lf%%)\n", + TotalCalls4KB, 100 * TotalCalls4KB / TotalCalls) + << format(" Total Calls within 2MB = %.0lf (%.2lf%%)\n", + TotalCalls2MB, 100 * TotalCalls2MB / TotalCalls); + } + } +} + +namespace { + +std::vector readFunctionOrderFile() { + std::vector FunctionNames; + std::ifstream FuncsFile(opts::FunctionOrderFile, std::ios::in); + if (!FuncsFile) { + errs() << "Ordered functions file \"" << opts::FunctionOrderFile + << "\" can't be opened.\n"; + exit(1); + } + std::string FuncName; + while (std::getline(FuncsFile, FuncName)) { + FunctionNames.push_back(FuncName); + } + return FunctionNames; +} + +} + +void ReorderFunctions::runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) { + if (!opts::Relocs && opts::ReorderFunctions != BinaryFunction::RT_NONE) { + errs() << "BOLT-ERROR: Function reordering only works when " + << "relocs are enabled.\n"; + exit(1); + } + + if (opts::ReorderFunctions != BinaryFunction::RT_NONE && + opts::ReorderFunctions != BinaryFunction::RT_EXEC_COUNT && + opts::ReorderFunctions != BinaryFunction::RT_USER) { + buildCallGraph(BC, BFs); + } + + std::vector Clusters; + + switch(opts::ReorderFunctions) { + case BinaryFunction::RT_NONE: + break; + case 
BinaryFunction::RT_EXEC_COUNT: + { + std::vector SortedFunctions(BFs.size()); + uint32_t Index = 0; + std::transform(BFs.begin(), + BFs.end(), + SortedFunctions.begin(), + [](std::pair &BFI) { + return &BFI.second; + }); + std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(), + [&](const BinaryFunction *A, const BinaryFunction *B) { + if (!opts::shouldProcess(*A)) + return false; + const auto PadA = opts::padFunction(*A); + const auto PadB = opts::padFunction(*B); + if (!PadA || !PadB) { + if (PadA) + return true; + if (PadB) + return false; + } + return !A->hasProfile() && + (B->hasProfile() || + (A->getExecutionCount() > B->getExecutionCount())); + }); + for (auto *BF : SortedFunctions) { + if (BF->hasProfile()) + BF->setIndex(Index++); + } + } + break; + case BinaryFunction::RT_HFSORT: + Clusters = clusterize(Cg); + break; + case BinaryFunction::RT_HFSORT_PLUS: + Clusters = hfsortPlus(Cg); + break; + case BinaryFunction::RT_PETTIS_HANSEN: + Clusters = pettisAndHansen(Cg); + break; + case BinaryFunction::RT_RANDOM: + std::srand(opts::RandomSeed); + Clusters = randomClusters(Cg); + break; + case BinaryFunction::RT_USER: + { + uint32_t Index = 0; + for (const auto &Function : readFunctionOrderFile()) { + std::vector FuncAddrs; + + auto Itr = BC.GlobalSymbols.find(Function); + if (Itr == BC.GlobalSymbols.end()) { + uint32_t LocalID = 1; + while(1) { + // If we can't find the main symbol name, look for alternates. 
+ Itr = BC.GlobalSymbols.find(Function + "/" + std::to_string(LocalID)); + if (Itr != BC.GlobalSymbols.end()) + FuncAddrs.push_back(Itr->second); + else + break; + LocalID++; + } + } else { + FuncAddrs.push_back(Itr->second); + } + + if (FuncAddrs.empty()) { + errs() << "BOLT-WARNING: Reorder functions: can't find function for " + << Function << "\n"; + continue; + } + + for (const auto FuncAddr : FuncAddrs) { + const auto *FuncSym = BC.getOrCreateGlobalSymbol(FuncAddr, "FUNCat"); + assert(FuncSym); + + auto *BF = BC.getFunctionForSymbol(FuncSym); + if (!BF) { + errs() << "BOLT-WARNING: Reorder functions: can't find function for " + << Function << "\n"; + break; + } + if (!BF->hasValidIndex()) { + BF->setIndex(Index++); + } + } + } + } + break; + } + + reorder(std::move(Clusters), BFs); +} + } // namespace bolt } // namespace llvm diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index 9caecc1a3520..37866f9e36b1 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -16,6 +16,7 @@ #include "BinaryContext.h" #include "BinaryFunction.h" +#include "HFSort.h" #include "llvm/Support/CommandLine.h" #include #include @@ -499,6 +500,29 @@ class StripRepRet : public BinaryFunctionPass { std::set &LargeFunctions) override; }; +/// Modify function order for streaming based on hotness. 
+class ReorderFunctions : public BinaryFunctionPass { + static constexpr uint32_t PageSize = 2 << 20; + std::vector Funcs; + std::unordered_map FuncToTargetId; + TargetGraph Cg; + + void buildCallGraph(BinaryContext &BC, + std::map &BFs); + void reorder(std::vector &&Clusters, + std::map &BFs); + public: + explicit ReorderFunctions(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { } + + const char *getName() const override { + return "reorder-functions"; + } + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + } // namespace bolt } // namespace llvm diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt index 46684474b39e..01b61bc4c6a5 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/Passes/CMakeLists.txt @@ -1,6 +1,8 @@ add_llvm_library(LLVMBOLTPasses BinaryPasses.cpp FrameOptimizer.cpp + HFSort.cpp + HFSortPlus.cpp Inliner.cpp ReorderAlgorithm.cpp ) diff --git a/bolt/Passes/HFSort.cpp b/bolt/Passes/HFSort.cpp new file mode 100644 index 000000000000..9cab6f6f21dd --- /dev/null +++ b/bolt/Passes/HFSort.cpp @@ -0,0 +1,489 @@ +//===--- HFSort.cpp - Cluster functions by hotness ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +// TODO: copyright/license msg. + +/* + +----------------------------------------------------------------------+ + | HipHop for PHP | + +----------------------------------------------------------------------+ + | Copyright (c) 2010-2016 Facebook, Inc. 
(http://www.facebook.com) | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | http://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ +*/ + +#include "HFSort.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +#undef DEBUG_TYPE +#define DEBUG_TYPE "hfsort" + +namespace llvm { +namespace bolt { + +namespace { + +// The number of pages to reserve for the functions with highest +// density (samples / size). The functions put in these pages are not +// considered for clustering. +constexpr uint32_t FrozenPages = 0; + +// The minimum approximate probability of a callee being called from a +// particular arc to consider merging with the caller's cluster. +constexpr double MinArcProbability = 0.1; + +// This is a factor to determine by how much a caller cluster is +// willing to degrade it's density by merging a callee. +constexpr int CallerDegradeFactor = 8; + +// Maximum size of a cluster, in bytes. 
+constexpr uint32_t MaxClusterSize = 1 << 20; + +constexpr uint32_t PageSize = 2 << 20; + +} +//////////////////////////////////////////////////////////////////////////////// + +TargetId TargetGraph::addTarget(uint32_t Size, uint32_t Samples) { + auto Id = Targets.size(); + Targets.emplace_back(Size, Samples); + return Id; +} + +const Arc &TargetGraph::incArcWeight(TargetId Src, TargetId Dst, double W) { + auto Res = Arcs.emplace(Src, Dst, W); + if (!Res.second) { + Res.first->Weight += W; + return *Res.first; + } + Targets[Src].Succs.push_back(Dst); + Targets[Dst].Preds.push_back(Src); + return *Res.first; +} + +Cluster::Cluster(TargetId Id, const TargetNode &Func) { + Targets.push_back(Id); + Size = Func.Size; + Samples = Func.Samples; + Frozen = false; + DEBUG(dbgs() << "new Cluster: " << toString() << "\n"); +} + +std::string Cluster::toString() const { + std::string Str; + raw_string_ostream CS(Str); + bool PrintComma = false; + CS << "funcs = ["; + for (auto &Target : Targets) { + if (PrintComma) CS << ", "; + CS << Target; + PrintComma = true; + } + CS << "]"; + return CS.str(); +} + +namespace { +//////////////////////////////////////////////////////////////////////////////// + +bool compareClustersDensity(const Cluster &C1, const Cluster &C2) { + return C1.density() > C2.density(); +} + +//////////////////////////////////////////////////////////////////////////////// + +void freezeClusters(const TargetGraph &Cg, std::vector &Clusters) { + uint32_t TotalSize = 0; + std::sort(Clusters.begin(), Clusters.end(), compareClustersDensity); + for (auto &C : Clusters) { + uint32_t NewSize = TotalSize + C.Size; + if (NewSize > FrozenPages * PageSize) break; + C.Frozen = true; + TotalSize = NewSize; + auto Fid = C.Targets[0]; + DEBUG(dbgs() << + format("freezing cluster for func %d, size = %u, samples = %u)\n", + Fid, Cg.Targets[Fid].Size, Cg.Targets[Fid].Samples);); + } +} + +void mergeInto(Cluster &Into, Cluster&& Other, const double Aw = 0) { + 
Into.Targets.insert(Into.Targets.end(),
+                      Other.Targets.begin(),
+                      Other.Targets.end());
+  Into.Size += Other.Size;
+  Into.Samples += Other.Samples;
+
+  Other.Size = 0;
+  Other.Samples = 0;
+  Other.Targets.clear();
+}
+}
+
+std::vector<Cluster> clusterize(const TargetGraph &Cg) {
+  std::vector<TargetId> SortedFuncs;
+
+  // indexed by TargetId, keeps its current cluster
+  std::vector<Cluster*> FuncCluster(Cg.Targets.size(), nullptr);
+  std::vector<Cluster> Clusters;
+  Clusters.reserve(Cg.Targets.size());
+
+  for (TargetId F = 0; F < Cg.Targets.size(); F++) {
+    if (Cg.Targets[F].Samples == 0) continue;
+    Clusters.emplace_back(F, Cg.Targets[F]);
+    SortedFuncs.push_back(F);
+  }
+
+  freezeClusters(Cg, Clusters);
+
+  // The size and order of Clusters is fixed until we reshuffle it immediately
+  // before returning.
+  for (auto &Cluster : Clusters) {
+    FuncCluster[Cluster.Targets.front()] = &Cluster;
+  }
+
+  std::sort(
+    SortedFuncs.begin(),
+    SortedFuncs.end(),
+    [&] (const TargetId F1, const TargetId F2) {
+      const auto &Func1 = Cg.Targets[F1];
+      const auto &Func2 = Cg.Targets[F2];
+      return
+        (uint64_t)Func1.Samples * Func2.Size > // TODO: is this correct?
+        (uint64_t)Func2.Samples * Func1.Size;
+    }
+  );
+
+  // Process each function, and consider merging its cluster with the
+  // one containing its most likely predecessor.
+  for (const auto Fid : SortedFuncs) {
+    auto Cluster = FuncCluster[Fid];
+    if (Cluster->Frozen) continue;
+
+    // Find best predecessor.
+    TargetId BestPred = InvalidId;
+    double BestProb = 0;
+
+    for (const auto Src : Cg.Targets[Fid].Preds) {
+      auto &A = *Cg.Arcs.find(Arc(Src, Fid));
+      if (BestPred == InvalidId || A.NormalizedWeight > BestProb) {
+        BestPred = A.Src;
+        BestProb = A.NormalizedWeight;
+      }
+    }
+
+    // Check if the merge is good for the callee.
+    // Don't merge if the probability of getting to the callee from the
+    // caller is too low.
+    if (BestProb < MinArcProbability) continue;
+
+    assert(BestPred != InvalidId);
+
+    auto PredCluster = FuncCluster[BestPred];
+
+    // Skip if no predCluster (predecessor w/ no samples), or if same
+    // as cluster, or if it's frozen.
+    if (PredCluster == nullptr || PredCluster == Cluster ||
+        PredCluster->Frozen) {
+      continue;
+    }
+
+    // Skip if merged cluster would be bigger than the threshold.
+    if (Cluster->Size + PredCluster->Size > MaxClusterSize) continue;
+
+    // Check if the merge is good for the caller.
+    // Don't merge if the caller's density is significantly better
+    // than the density resulting from the merge.
+    const double NewDensity =
+      ((double)PredCluster->Samples + Cluster->Samples) /
+      (PredCluster->Size + Cluster->Size);
+    if (PredCluster->density() > NewDensity * CallerDegradeFactor) {
+      continue;
+    }
+
+    DEBUG(dbgs() << format("merging %s -> %s: %u\n",
+                           PredCluster->toString().c_str(),
+                           Cluster->toString().c_str(),
+                           Cg.Targets[Fid].Samples););
+
+    for (auto F : Cluster->Targets) {
+      FuncCluster[F] = PredCluster;
+    }
+
+    mergeInto(*PredCluster, std::move(*Cluster));
+  }
+
+  // Return the set of Clusters that are left, which are the ones that
+  // didn't get merged (so their first func is its original func).
+ std::vector SortedClusters; + for (const auto Func : SortedFuncs) { + auto Cluster = FuncCluster[Func]; + if (!Cluster || Cluster->Targets.empty()) continue; + if (Cluster->Targets[0] != Func) continue; + SortedClusters.emplace_back(std::move(*Cluster)); + Cluster->Targets.clear(); + } + + std::sort(SortedClusters.begin(), + SortedClusters.end(), + compareClustersDensity); + + return SortedClusters; +} + +//////////////////////////////////////////////////////////////////////////////// + +namespace { +class ClusterArc { +public: + ClusterArc(Cluster *Ca, Cluster *Cb, double W = 0) + : C1(std::min(Ca, Cb)) + , C2(std::max(Ca, Cb)) + , Weight(W) + {} + + friend bool operator==(const ClusterArc &Lhs, const ClusterArc &Rhs) { + return Lhs.C1 == Rhs.C1 && Lhs.C2 == Rhs.C2; + } + + Cluster *const C1; + Cluster *const C2; + mutable double Weight; +}; + +class ClusterArcHash { +public: + int64_t operator()(const ClusterArc &Arc) const { + std::hash Hasher; + return hashCombine(Hasher(int64_t(Arc.C1)), int64_t(Arc.C2)); + } +}; + +using ClusterArcSet = std::unordered_set; + +void orderFuncs(const TargetGraph &Cg, Cluster *C1, Cluster *C2) { + TargetId C1head = C1->Targets.front(); + TargetId C1tail = C1->Targets.back(); + TargetId C2head = C2->Targets.front(); + TargetId C2tail = C2->Targets.back(); + + double C1headC2head = 0; + double C1headC2tail = 0; + double C1tailC2head = 0; + double C1tailC2tail = 0; + + for (const auto &Arc : Cg.Arcs) { + if ((Arc.Src == C1head && Arc.Dst == C2head) || + (Arc.Dst == C1head && Arc.Src == C2head)) { + C1headC2head += Arc.Weight; + } else if ((Arc.Src == C1head && Arc.Dst == C2tail) || + (Arc.Dst == C1head && Arc.Src == C2tail)) { + C1headC2tail += Arc.Weight; + } else if ((Arc.Src == C1tail && Arc.Dst == C2head) || + (Arc.Dst == C1tail && Arc.Src == C2head)) { + C1tailC2head += Arc.Weight; + } else if ((Arc.Src == C1tail && Arc.Dst == C2tail) || + (Arc.Dst == C1tail && Arc.Src == C2tail)) { + C1tailC2tail += Arc.Weight; + } + } + + 
const double Max = std::max(std::max(C1headC2head, C1headC2tail), + std::max(C1tailC2head, C1tailC2tail)); + + if (C1headC2head == Max) { + // flip C1 + std::reverse(C1->Targets.begin(), C1->Targets.end()); + } else if (C1headC2tail == Max) { + // flip C1 C2 + std::reverse(C1->Targets.begin(), C1->Targets.end()); + std::reverse(C2->Targets.begin(), C2->Targets.end()); + } else if (C1tailC2tail == Max) { + // flip C2 + std::reverse(C2->Targets.begin(), C2->Targets.end()); + } +} +} + +std::vector pettisAndHansen(const TargetGraph &Cg) { + // indexed by TargetId, keeps its current cluster + std::vector FuncCluster(Cg.Targets.size(), nullptr); + std::vector Clusters; + std::vector Funcs; + + Clusters.reserve(Cg.Targets.size()); + + for (TargetId F = 0; F < Cg.Targets.size(); F++) { + if (Cg.Targets[F].Samples == 0) continue; + Clusters.emplace_back(F, Cg.Targets[F]); + FuncCluster[F] = &Clusters.back(); + Funcs.push_back(F); + } + + ClusterArcSet Carcs; + + auto insertOrInc = [&](Cluster *C1, Cluster *C2, double Weight) { + auto Res = Carcs.emplace(C1, C2, Weight); + if (!Res.second) { + Res.first->Weight += Weight; + } + }; + + // Create a std::vector of cluster arcs + + for (auto &Arc : Cg.Arcs) { + if (Arc.Weight == 0) continue; + + auto const S = FuncCluster[Arc.Src]; + auto const D = FuncCluster[Arc.Dst]; + + // ignore if s or d is nullptr + + if (S == nullptr || D == nullptr) continue; + + // ignore self-edges + + if (S == D) continue; + + insertOrInc(S, D, Arc.Weight); + } + + // Find an arc with max weight and merge its nodes + + while (!Carcs.empty()) { + auto Maxpos = std::max_element( + Carcs.begin(), + Carcs.end(), + [&] (const ClusterArc &Carc1, const ClusterArc &Carc2) { + return Carc1.Weight < Carc2.Weight; + } + ); + + auto Max = *Maxpos; + Carcs.erase(Maxpos); + + auto const C1 = Max.C1; + auto const C2 = Max.C2; + + if (C1->Size + C2->Size > MaxClusterSize) continue; + + if (C1->Frozen || C2->Frozen) continue; + + // order functions and merge cluster 
+ + orderFuncs(Cg, C1, C2); + + DEBUG(dbgs() << format("merging %s -> %s: %.1f\n", C2->toString().c_str(), + C1->toString().c_str(), Max.Weight);); + + // update carcs: merge C1arcs to C2arcs + + std::unordered_map C2arcs; + for (auto &Carc : Carcs) { + if (Carc.C1 == C2) C2arcs.emplace(Carc, Carc.C2); + if (Carc.C2 == C2) C2arcs.emplace(Carc, Carc.C1); + } + + for (auto It : C2arcs) { + auto const C = It.second; + auto const C2arc = It.first; + + insertOrInc(C, C1, C2arc.Weight); + Carcs.erase(C2arc); + } + + // update FuncCluster + + for (auto F : C2->Targets) { + FuncCluster[F] = C1; + } + mergeInto(*C1, std::move(*C2), Max.Weight); + } + + // Return the set of Clusters that are left, which are the ones that + // didn't get merged. + + std::set LiveClusters; + std::vector OutClusters; + + for (auto Fid : Funcs) { + LiveClusters.insert(FuncCluster[Fid]); + } + for (auto C : LiveClusters) { + OutClusters.push_back(std::move(*C)); + } + + std::sort(OutClusters.begin(), + OutClusters.end(), + compareClustersDensity); + + return OutClusters; +} + +std::vector randomClusters(const TargetGraph &Cg) { + std::vector FuncIds(Cg.Targets.size(), 0); + std::vector Clusters; + Clusters.reserve(Cg.Targets.size()); + + for (TargetId F = 0; F < Cg.Targets.size(); F++) { + if (Cg.Targets[F].Samples == 0) continue; + Clusters.emplace_back(F, Cg.Targets[F]); + } + + std::sort(Clusters.begin(), + Clusters.end(), + [](const Cluster &A, const Cluster &B) { + return A.Size < B.Size; + }); + + auto pickMergeCluster = [&Clusters](const size_t Idx) { + size_t MaxIdx = Idx + 1; + + while (MaxIdx < Clusters.size() && + Clusters[Idx].Size + Clusters[MaxIdx].Size <= MaxClusterSize) { + ++MaxIdx; + } + + if (MaxIdx - Idx > 1) { + size_t MergeIdx = (std::rand() % (MaxIdx - Idx - 1)) + Idx + 1; + assert(Clusters[MergeIdx].Size + Clusters[Idx].Size <= MaxClusterSize); + return MergeIdx; + } + return Clusters.size(); + }; + + size_t Idx = 0; + while (Idx < Clusters.size()) { + auto MergeIdx = 
pickMergeCluster(Idx); + if (MergeIdx == Clusters.size()) { + ++Idx; + } else { + mergeInto(Clusters[Idx], std::move(Clusters[MergeIdx])); + Clusters.erase(Clusters.begin() + MergeIdx); + } + } + + return Clusters; +} + +} +} diff --git a/bolt/Passes/HFSort.h b/bolt/Passes/HFSort.h new file mode 100644 index 000000000000..acf4e87d2b84 --- /dev/null +++ b/bolt/Passes/HFSort.h @@ -0,0 +1,195 @@ +//===--- HFSort.h - Cluster functions by hotness --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Cluster functions by hotness. There are four clustering algorithms: +// 1. clusterize +// 2. HFsort+ +// 3. pettisAndHansen +// 4. randomClusters +// +// See original code in hphp/utils/hfsort.[h,cpp] +//===----------------------------------------------------------------------===// + +// TODO: copyright/license msg. + +/* + +----------------------------------------------------------------------+ + | HipHop for PHP | + +----------------------------------------------------------------------+ + | Copyright (c) 2010-2016 Facebook, Inc. (http://www.facebook.com) | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | http://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. 
| + +----------------------------------------------------------------------+ +*/ + +#ifndef LLVM_TOOLS_LLVM_BOLT_HFSORT_H +#define LLVM_TOOLS_LLVM_BOLT_HFSORT_H + +#include +#include +#include +#include + +#if defined(__x86_64__) && !defined(_MSC_VER) +# if (!defined USE_SSECRC) +# define USE_SSECRC +# endif +#else +# undef USE_SSECRC +#endif + +namespace llvm { +namespace bolt { + +using TargetId = size_t; +constexpr TargetId InvalidId = -1; + +class Arc { +public: + Arc(TargetId S, TargetId D, double W = 0) + : Src(S) + , Dst(D) + , Weight(W) + {} + Arc(const Arc&) = delete; + + friend bool operator==(const Arc &Lhs, const Arc &Rhs) { + return Lhs.Src == Rhs.Src && Lhs.Dst == Rhs.Dst; + } + + const TargetId Src; + const TargetId Dst; + mutable double Weight; + mutable double NormalizedWeight{0}; + mutable double AvgCallOffset{0}; +}; + +namespace { + +inline int64_t hashCombine(const int64_t Seed, const int64_t Val) { + std::hash Hasher; + return Seed ^ (Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2)); +} + +inline size_t hash_int64_fallback(int64_t key) { + // "64 bit Mix Functions", from Thomas Wang's "Integer Hash Function." + // http://www.concentric.net/~ttwang/tech/inthash.htm + key = (~key) + (key << 21); // key = (key << 21) - key - 1; + key = key ^ ((unsigned long long)key >> 24); + key = (key + (key << 3)) + (key << 8); // key * 265 + key = key ^ ((unsigned long long)key >> 14); + key = (key + (key << 2)) + (key << 4); // key * 21 + key = key ^ ((unsigned long long)key >> 28); + return static_cast(static_cast(key)); +} + +inline size_t hash_int64(int64_t k) { +#if defined(USE_SSECRC) && defined(__SSE4_2__) + size_t h = 0; + __asm("crc32q %1, %0\n" : "+r"(h) : "rm"(k)); + return h; +#else + return hash_int64_fallback(k); +#endif +} + +inline size_t hash_int64_pair(int64_t k1, int64_t k2) { +#if defined(USE_SSECRC) && defined(__SSE4_2__) + // crc32 is commutative, so we need to perturb k1 so that (k1, k2) hashes + // differently from (k2, k1). 
+ k1 += k1; + __asm("crc32q %1, %0\n" : "+r" (k1) : "rm"(k2)); + return k1; +#else + return (hash_int64(k1) << 1) ^ hash_int64(k2); +#endif +} + +} + +class ArcHash { +public: + int64_t operator()(const Arc &Arc) const { +#ifdef USE_STD_HASH + std::hash Hasher; + return hashCombine(Hasher(Arc.Src), Arc.Dst); +#else + return hash_int64_pair(int64_t(Arc.Src), int64_t(Arc.Dst)); +#endif + } +}; + +class TargetNode { +public: + explicit TargetNode(uint32_t Size, uint32_t Samples = 0) + : Size(Size), Samples(Samples) + {} + + uint32_t Size; + uint32_t Samples; + + // preds and succs contain no duplicate elements and self arcs are not allowed + std::vector Preds; + std::vector Succs; +}; + +class TargetGraph { +public: + TargetId addTarget(uint32_t Size, uint32_t Samples = 0); + const Arc &incArcWeight(TargetId Src, TargetId Dst, double W = 1.0); + + std::vector Targets; + std::unordered_set Arcs; +}; + +class Cluster { +public: + Cluster(TargetId Id, const TargetNode &F); + + std::string toString() const; + double density() const { + return (double)Samples / Size; + } + + std::vector Targets; + uint32_t Samples; + uint32_t Size; + bool Frozen; // not a candidate for merging +}; + +/* + * Cluster functions in order to minimize call distance. + */ +std::vector clusterize(const TargetGraph &Cg); + +/* + * Optimize function placement for iTLB cache and i-cache. + */ +std::vector hfsortPlus(const TargetGraph &Cg); + +/* + * Pettis-Hansen code layout algorithm + * reference: K. Pettis and R. C. Hansen, "Profile Guided Code Positioning", + * PLDI '90 + */ +std::vector pettisAndHansen(const TargetGraph &Cg); + +/* Group functions into clusters randomly. 
*/ +std::vector randomClusters(const TargetGraph &Cg); + +} +} + +#endif diff --git a/bolt/Passes/HFSortPlus.cpp b/bolt/Passes/HFSortPlus.cpp new file mode 100644 index 000000000000..523841210a56 --- /dev/null +++ b/bolt/Passes/HFSortPlus.cpp @@ -0,0 +1,508 @@ +//===--- HFSort.cpp - Cluster functions by hotness ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +// TODO: copyright/license msg. + +/* + +----------------------------------------------------------------------+ + | HipHop for PHP | + +----------------------------------------------------------------------+ + | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | http://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. 
| + +----------------------------------------------------------------------+ +*/ + +#include "HFSort.h" +#include "llvm/Support/Format.h" + +#include +#include +#include +#include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" + +#undef DEBUG_TYPE +#define DEBUG_TYPE "hfsort" + +namespace llvm { +namespace bolt { + +namespace { + +// The size of a cache page +// Since we optimize both for iTLB cache (2MB pages) and i-cache (64b pages), +// using a value that fits both +constexpr uint32_t PageSize = uint32_t(1) << 12; + +// Capacity of the iTLB cache: larger values yield more iTLB-friendly result, +// while smaller values result in better i-cache performance +constexpr uint32_t ITLBEntries = 16; + +constexpr size_t InvalidAddr = -1; + +template +class HashPair { +public: + size_t operator()(const std::pair &P) const { + size_t Seed(0); + Seed = hashCombine(Seed, (int64_t)P.first); + Seed = hashCombine(Seed, (int64_t)P.second); + return Seed; + } +}; + +// A cache of precomputed results for a pair of clusters +class PrecomputedResults { + public: + PrecomputedResults() {} + + bool contains(Cluster *First, Cluster *Second) const { + if (InvalidKeys.count(First) || InvalidKeys.count(Second)) { + return false; + } + const auto Key = std::make_pair(First, Second); + return Cache.find(Key) != Cache.end(); + } + + double get(Cluster *First, Cluster *Second) const { + assert(contains(First, Second)); + const auto Key = std::make_pair(First, Second); // TODO: use min/max? 
+ return Cache.find(Key)->second; + } + + void set(Cluster *First, Cluster *Second, double Value) { + const auto Key = std::make_pair(First, Second); + Cache[Key] = Value; + validate(First); + validate(Second); + } + + void validate(Cluster *C) { + auto Itr = InvalidKeys.find(C); + if (Itr != InvalidKeys.end()) + InvalidKeys.erase(Itr); + } + + void validateAll() { + InvalidKeys.clear(); + } + + void invalidate(Cluster *Cluster) { + InvalidKeys.insert(Cluster); + } + + private: + std::unordered_map, + double, + HashPair> Cache; + std::unordered_set InvalidKeys; +}; + +// A wrapper for algorthm-wide variables +struct AlgoState { + // the call graph + const TargetGraph *Cg; + // the total number of samples in the graph + double TotalSamples; + // target_id => cluster + std::vector FuncCluster; + // current address of the function from the beginning of its cluster + std::vector Addr; +}; + +bool compareClustersDensity(const Cluster &C1, const Cluster &C2) { + return C1.density() > C2.density(); +} + +} + +/* + * Sorting clusters by their density in decreasing order + */ +void sortByDensity(std::vector &Clusters) { + std::sort( + Clusters.begin(), + Clusters.end(), + [&] (const Cluster *C1, const Cluster *C2) { + const double D1 = C1->density(); + const double D2 = C2->density(); + // making sure the sorting is deterministic + if (D1 != D2) return D1 > D2; + if (C1->Size != C2->Size) return C1->Size < C2->Size; + if (C1->Samples != C2->Samples) return C1->Samples > C2->Samples; + return C1->Targets[0] < C2->Targets[0]; + } + ); +} + +/* + * Density of a cluster formed by merging a given pair of clusters + */ +double density(Cluster *ClusterPred, Cluster *ClusterSucc) { + const double CombinedSamples = ClusterPred->Samples + ClusterSucc->Samples; + const double CombinedSize = ClusterPred->Size + ClusterSucc->Size; + return CombinedSamples / CombinedSize; +} + +/* + * The probability that a page with a given weight is not present in the cache. 
+ *
+ * Assume that the hot functions are called in a random order; then the
+ * probability of a TLB page being accessed after a function call is
+ * p=pageSamples/totalSamples. The probability that the page is not accessed
+ * is (1-p), and the probability that it is not in the cache (i.e. not accessed
+ * during the last kITLBEntries function calls) is (1-p)^kITLBEntries
+ */
+double missProbability(const AlgoState &State, double PageSamples) {
+  double P = PageSamples / State.TotalSamples;
+  double X = ITLBEntries;
+  // avoiding precision issues for small values
+  if (P < 0.0001) return (1.0 - X * P + X * (X - 1.0) * P * P / 2.0);
+  return pow(1.0 - P, X);
+}
+
+/*
+ * Expected hit ratio of the iTLB cache under the given order of clusters
+ *
+ * Given an ordering of hot functions (and hence, their assignment to the
+ * iTLB pages), we can divide all function calls into two categories:
+ * - 'short' ones that have a caller-callee distance less than a page;
+ * - 'long' ones where the distance exceeds a page.
+ * The short calls are likely to result in an iTLB cache hit. For the long
+ * ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
+ * often the page is accessed). Assuming that functions are sent to the iTLB
+ * cache in a random order, the probability that a page is present in the
+ * cache is proportional to the number of samples corresponding to the
+ * functions on the page. The following procedure detects short and long
+ * calls, and estimates the expected number of cache misses for the long ones.
+ */ +double expectedCacheHitRatio(const AlgoState &State, + const std::vector &Clusters_) { + // copy and sort by density + std::vector Clusters(Clusters_); + sortByDensity(Clusters); + + // generate function addresses with an alignment + std::vector Addr(State.Cg->Targets.size(), InvalidAddr); + size_t CurAddr = 0; + // 'hotness' of the pages + std::vector PageSamples; + for (auto Cluster : Clusters) { + for (auto TargetId : Cluster->Targets) { + if (CurAddr & 0xf) CurAddr = (CurAddr & ~0xf) + 16; + Addr[TargetId] = CurAddr; + CurAddr += State.Cg->Targets[TargetId].Size; + // update page weight + size_t Page = Addr[TargetId] / PageSize; + while (PageSamples.size() <= Page) PageSamples.push_back(0.0); + PageSamples[Page] += State.Cg->Targets[TargetId].Samples; + } + } + + // computing expected number of misses for every function + double Misses = 0; + for (auto Cluster : Clusters) { + for (auto TargetId : Cluster->Targets) { + size_t Page = Addr[TargetId] / PageSize; + double Samples = State.Cg->Targets[TargetId].Samples; + // probability that the page is not present in the cache + double MissProb = missProbability(State, PageSamples[Page]); + + for (auto Pred : State.Cg->Targets[TargetId].Preds) { + if (State.Cg->Targets[Pred].Samples == 0) continue; + auto A = State.Cg->Arcs.find(Arc(Pred, TargetId)); + + // the source page + size_t SrcPage = (Addr[Pred] + (size_t)A->AvgCallOffset) / PageSize; + if (Page != SrcPage) { + // this is a miss + Misses += A->Weight * MissProb; + } + Samples -= A->Weight; + } + + // the remaining samples come from the jitted code + Misses += Samples * MissProb; + } + } + + return 100.0 * (1.0 - Misses / State.TotalSamples); +} + +/* + * Get adjacent clusters (the ones that share an arc) with the given one + */ +std::unordered_set adjacentClusters(const AlgoState &State, + Cluster *C) { + std::unordered_set Result; + for (auto TargetId : C->Targets) { + for (auto Succ : State.Cg->Targets[TargetId].Succs) { + auto SuccCluster = 
State.FuncCluster[Succ]; + if (SuccCluster != nullptr && SuccCluster != C) { + Result.insert(SuccCluster); + } + } + for (auto Pred : State.Cg->Targets[TargetId].Preds) { + auto PredCluster = State.FuncCluster[Pred]; + if (PredCluster != nullptr && PredCluster != C) { + Result.insert(PredCluster); + } + } + } + return Result; +} + +/* + * The expected number of calls for an edge withing the same TLB page + */ +double expectedCalls(int64_t SrcAddr, int64_t DstAddr, double EdgeWeight) { + auto Dist = std::abs(SrcAddr - DstAddr); + if (Dist > PageSize) { + return 0; + } + return (double(PageSize - Dist) / PageSize) * EdgeWeight; +} + +/* + * The expected number of calls within a given cluster with both endpoints on + * the same TLB cache page + */ +double shortCalls(const AlgoState &State, Cluster *Cluster) { + double Calls = 0; + for (auto TargetId : Cluster->Targets) { + for (auto Succ : State.Cg->Targets[TargetId].Succs) { + if (State.FuncCluster[Succ] == Cluster) { + auto A = State.Cg->Arcs.find(Arc(TargetId, Succ)); + + auto SrcAddr = State.Addr[TargetId] + A->AvgCallOffset; + auto DstAddr = State.Addr[Succ]; + + Calls += expectedCalls(SrcAddr, DstAddr, A->Weight); + } + } + } + + return Calls; +} + +/* + * The number of calls between the two clusters with both endpoints on + * the same TLB page, assuming that a given pair of clusters gets merged + */ +double shortCalls(const AlgoState &State, + Cluster *ClusterPred, + Cluster *ClusterSucc) { + double Calls = 0; + for (auto TargetId : ClusterPred->Targets) { + for (auto Succ : State.Cg->Targets[TargetId].Succs) { + if (State.FuncCluster[Succ] == ClusterSucc) { + auto A = State.Cg->Arcs.find(Arc(TargetId, Succ)); + + auto SrcAddr = State.Addr[TargetId] + A->AvgCallOffset; + auto DstAddr = State.Addr[Succ] + ClusterPred->Size; + + Calls += expectedCalls(SrcAddr, DstAddr, A->Weight); + } + } + } + + for (auto TargetId : ClusterPred->Targets) { + for (auto Pred : State.Cg->Targets[TargetId].Preds) { + if 
(State.FuncCluster[Pred] == ClusterSucc) { + auto A = State.Cg->Arcs.find(Arc(Pred, TargetId)); + + auto SrcAddr = State.Addr[Pred] + A->AvgCallOffset + + ClusterPred->Size; + auto DstAddr = State.Addr[TargetId]; + + Calls += expectedCalls(SrcAddr, DstAddr, A->Weight); + } + } + } + + return Calls; +} + +/* + * The gain of merging two clusters. + * + * We assume that the final clusters are sorted by their density, and hence + * every cluster is likely to be adjacent with clusters of the same density. + * Thus, the 'hotness' of every cluster can be estimated by density*pageSize, + * which is used to compute the probability of cache misses for long calls + * of a given cluster. + * The result is also scaled by the size of the resulting cluster in order to + * increse the chance of merging short clusters, which is helpful for + * the i-cache performance. + */ +double mergeGain(const AlgoState &State, + Cluster *ClusterPred, + Cluster *ClusterSucc) { + // cache misses on the first cluster + double LongCallsPred = ClusterPred->Samples - shortCalls(State, ClusterPred); + double ProbPred = missProbability(State, ClusterPred->density() * PageSize); + double ExpectedMissesPred = LongCallsPred * ProbPred; + + // cache misses on the second cluster + double LongCallsSucc = ClusterSucc->Samples - shortCalls(State, ClusterSucc); + double ProbSucc = missProbability(State, ClusterSucc->density() * PageSize); + double ExpectedMissesSucc = LongCallsSucc * ProbSucc; + + // cache misses on the merged cluster + double LongCallsNew = LongCallsPred + LongCallsSucc - + shortCalls(State, ClusterPred, ClusterSucc); + double NewDensity = density(ClusterPred, ClusterSucc); + double ProbNew = missProbability(State, NewDensity * PageSize); + double MissesNew = LongCallsNew * ProbNew; + + double Gain = ExpectedMissesPred + ExpectedMissesSucc - MissesNew; + // scaling the result to increase the importance of merging short clusters + return Gain / (ClusterPred->Size + ClusterSucc->Size); +} + + /* 
+ * Merge two clusters + */ +void mergeInto(AlgoState &State, Cluster *Into, Cluster *Other) { + auto &Targets = Other->Targets; + Into->Targets.insert(Into->Targets.end(), Targets.begin(), Targets.end()); + Into->Size += Other->Size; + Into->Samples += Other->Samples; + + size_t CurAddr = 0; + for (auto TargetId : Into->Targets) { + State.FuncCluster[TargetId] = Into; + State.Addr[TargetId] = CurAddr; + CurAddr += State.Cg->Targets[TargetId].Size; + } + + Other->Size = 0; + Other->Samples = 0; + Other->Targets.clear(); +} + +/* + * HFSortPlus - layout of hot functions with iTLB cache optimization + */ +std::vector hfsortPlus(const TargetGraph &Cg) { + // create a cluster for every function + std::vector AllClusters; + AllClusters.reserve(Cg.Targets.size()); + for (TargetId F = 0; F < Cg.Targets.size(); F++) { + AllClusters.emplace_back(F, Cg.Targets[F]); + } + + // initialize objects used by the algorithm + std::vector Clusters; + Clusters.reserve(Cg.Targets.size()); + AlgoState State; + State.Cg = &Cg; + State.TotalSamples = 0; + State.FuncCluster = std::vector(Cg.Targets.size(), nullptr); + State.Addr = std::vector(Cg.Targets.size(), InvalidAddr); + for (TargetId F = 0; F < Cg.Targets.size(); F++) { + if (Cg.Targets[F].Samples == 0) continue; + + Clusters.push_back(&AllClusters[F]); + State.FuncCluster[F] = &AllClusters[F]; + State.Addr[F] = 0; + State.TotalSamples += Cg.Targets[F].Samples; + } + + DEBUG(dbgs() << "Starting hfsort+ for " << Clusters.size() << " clusters\n" + << format("Initial expected iTLB cache hit ratio: %.4lf\n", + expectedCacheHitRatio(State, Clusters))); + + // the cache keeps precomputed values of mergeGain for pairs of clusters; + // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs + // containing both x and y (and recompute them on the next iteration) + PrecomputedResults Cache; + + int Steps = 0; + // merge pairs of clusters while there is an improvement + while (Clusters.size() > 1) { + DEBUG( + if (Steps % 
500 == 0) { + dbgs() << format("step = %d clusters = %lu expected_hit_rate = %.4lf\n", + Steps, + Clusters.size(), + expectedCacheHitRatio(State, Clusters)); + } + ); + Steps++; + + Cluster *BestClusterPred = nullptr; + Cluster *BestClusterSucc = nullptr; + double BestGain = -1; + for (auto ClusterPred : Clusters) { + // get candidates for merging with the current cluster + auto CandidateClusters = adjacentClusters(State, ClusterPred); + + // find the best candidate + for (auto ClusterSucc : CandidateClusters) { + // get a cost of merging two clusters + if (!Cache.contains(ClusterPred, ClusterSucc)) { + double Value = mergeGain(State, ClusterPred, ClusterSucc); + Cache.set(ClusterPred, ClusterSucc, Value); + assert(Cache.contains(ClusterPred, ClusterSucc)); + } + + double Gain = Cache.get(ClusterPred, ClusterSucc); + // breaking ties by density to make the hottest clusters be merged first + if (Gain > BestGain || (std::abs(Gain - BestGain) < 1e-8 && + density(ClusterPred, ClusterSucc) > + density(BestClusterPred, BestClusterSucc))) { + BestGain = Gain; + BestClusterPred = ClusterPred; + BestClusterSucc = ClusterSucc; + } + } + } + Cache.validateAll(); + + if (BestGain <= 0.0) break; + + Cache.invalidate(BestClusterPred); + Cache.invalidate(BestClusterSucc); + + // merge the best pair of clusters + mergeInto(State, BestClusterPred, BestClusterSucc); + // remove BestClusterSucc from the list of active clusters + auto Iter = std::remove(Clusters.begin(), Clusters.end(), BestClusterSucc); + Clusters.erase(Iter, Clusters.end()); + } + + DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size() << " clusters\n" + << format("Final expected iTLB cache hit ratio: %.4lf\n", + expectedCacheHitRatio(State, Clusters))); + + // Return the set of clusters that are left, which are the ones that + // didn't get merged (so their first func is its original func). 
+ sortByDensity(Clusters); + std::vector Result; + for (auto Cluster : Clusters) { + Result.emplace_back(std::move(*Cluster)); + } + + std::sort(Result.begin(), Result.end(), compareClustersDensity); + + return Result; +} + +}} diff --git a/bolt/Passes/ReorderAlgorithm.cpp b/bolt/Passes/ReorderAlgorithm.cpp index 8f632be49404..295d286aed23 100644 --- a/bolt/Passes/ReorderAlgorithm.cpp +++ b/bolt/Passes/ReorderAlgorithm.cpp @@ -29,7 +29,7 @@ namespace opts { static cl::opt PrintClusters("print-clusters", cl::desc("print clusters"), cl::ZeroOrMore); -static cl::opt +cl::opt RandomSeed("bolt-seed", cl::desc("seed for randomization"), cl::init(42), diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 61b6b95f5ebb..14b785895c98 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -67,6 +67,7 @@ using namespace bolt; namespace opts { extern cl::opt JumpTables; +extern cl::opt ReorderFunctions; static cl::opt OutputFilename("o", cl::desc(""), cl::Required); @@ -154,11 +155,6 @@ Relocs("relocs", cl::desc("relocation support (experimental)"), cl::ZeroOrMore); -cl::opt -ReorderFunctions("reorder-functions", - cl::desc("reorder function (works only with relocations)"), - cl::ZeroOrMore); - static cl::list FunctionPadSpec("pad-funcs", cl::CommaSeparated, @@ -1923,22 +1919,38 @@ void RewriteInstance::emitFunctions() { return &BFI.second; }); - if (opts::Relocs && opts::ReorderFunctions) { + if (opts::ReorderFunctions != BinaryFunction::RT_NONE) { std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(), - [&](const BinaryFunction *A, const BinaryFunction *B) { - const auto PadA = opts::padFunction(*A); - const auto PadB = opts::padFunction(*B); - if (!PadA || !PadB) { - if (PadA) - return true; - if (PadB) - return false; - } - return (A->getExecutionCount() != BinaryFunction::COUNT_NO_PROFILE) && - ((B->getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE)|| - (A->getExecutionCount() > B->getExecutionCount())); - }); + [](const 
BinaryFunction *A, const BinaryFunction *B) { + if (A->hasValidIndex() && B->hasValidIndex()) { + return A->getIndex() < B->getIndex(); + } else { + return A->hasValidIndex(); + } + }); + } + + DEBUG( + if (!opts::Relocs) { + auto SortedIt = SortedFunctions.begin(); + for (auto &It : BinaryFunctions) { + assert(&It.second == *SortedIt); + ++SortedIt; + } + }); + + uint32_t LastHotIndex = -1u; + uint32_t CurrentIndex = 0; + for (auto *BF : SortedFunctions) { + if (!BF->hasValidIndex() && LastHotIndex == -1u) { + LastHotIndex = CurrentIndex; + } + assert(LastHotIndex == -1u || !BF->hasValidIndex()); + assert(!BF->hasValidIndex() || CurrentIndex == BF->getIndex()); + ++CurrentIndex; } + CurrentIndex = 0; + DEBUG(dbgs() << "BOLT-DEBUG: LastHotIndex = " << LastHotIndex << "\n"); bool ColdFunctionSeen = false; @@ -1948,12 +1960,7 @@ void RewriteInstance::emitFunctions() { // Emit all cold function split parts at the border of hot and // cold functions. - // - // FIXME: this only works with reordered functions. What do we do - // if there's no functions reordering in place? - if (opts::Relocs && - !ColdFunctionSeen && - Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE) { + if (opts::Relocs && !ColdFunctionSeen && CurrentIndex >= LastHotIndex) { // Mark the end of "hot" stuff. if (opts::HotText) { Streamer->SwitchSection(BC->MOFI->getTextSection()); @@ -1974,6 +1981,7 @@ void RewriteInstance::emitFunctions() { if (!opts::Relocs && (!Function.isSimple() || !opts::shouldProcess(Function))) { + ++CurrentIndex; continue; } @@ -1985,6 +1993,8 @@ void RewriteInstance::emitFunctions() { if (!opts::Relocs && Function.isSplit()) emitFunction(*Streamer, Function, *BC.get(), /*EmitColdPart=*/true); + + ++CurrentIndex; } if (!ColdFunctionSeen && opts::HotText) { @@ -2106,7 +2116,7 @@ void RewriteInstance::mapFileSections( } else { if (opts::UseOldText) { errs() << "BOLT-ERROR: original .text too small to fit the new code. 
" - << SI.Size << " byte needed, have " << OldTextSectionSize + << SI.Size << " bytes needed, have " << OldTextSectionSize << " bytes available.\n"; } auto Padding = OffsetToAlignment(NewTextSectionStartAddress, PageAlign); From a6e3703f7b89e3570b0d099f0bbef85b3d53268e Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 21 Feb 2017 14:18:09 -0800 Subject: [PATCH 224/904] [BOLT] Detect unmarked data in text. Summary: Sometimes code written in assembly will have unmarked data (such as constants) embedded into text. Typically such data falls into a "padding" address space of a function. This diff detects such references, and adjusts the padding space to prevent overwriting of code in data. Note that in relocation mode we prefer to overwrite the original code (-use-old-text) and thus cannot simply ignore data in text. (cherry picked from commit 22451b639e934dfb93d54741c4058d782ac263a7) --- bolt/BinaryFunction.cpp | 2 +- bolt/RewriteInstance.cpp | 45 +++++++++++++++++++++++++++++++++++++--- bolt/RewriteInstance.h | 14 +++++++++++-- 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 726c78d66d41..f31b1a8105ca 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1061,7 +1061,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { TargetSymbol = getSymbol(); } else { // Possibly an old-style PIC code - errs() << "BOLT-ERROR: internal call detected at 0x" + errs() << "BOLT-WARNING: internal call detected at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << " in function " << *this << ". Skipping.\n"; IsSimple = false; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 14b785895c98..18b0271503ed 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1631,7 +1631,7 @@ void RewriteInstance::disassembleFunctions() { // Post-process inter-procedural references ASAP as it may affect // functions we are about to disassemble next. 
- for (auto Addr : BC->InterproceduralReferences) { + for (const auto Addr : BC->InterproceduralReferences) { auto *ContainingFunction = getBinaryFunctionContainingAddress(Addr); if (ContainingFunction && ContainingFunction->getAddress() != Addr) { ContainingFunction->addEntryPoint(Addr); @@ -1643,6 +1643,40 @@ void RewriteInstance::disassembleFunctions() { } ContainingFunction->setSimple(false); } + } else if (!ContainingFunction && Addr) { + // Check if address falls in function padding space - this could be + // unmarked data in code. In this case adjust the padding space size. + auto Section = BC->getSectionForAddress(Addr); + assert(Section && "cannot get section for referenced address"); + + if (!Section->isText()) + continue; + + // PLT requires special handling and could be ignored in this context. + StringRef SectionName; + Section->getName(SectionName); + if (SectionName == ".plt") + continue; + + if (opts::Relocs) { + errs() << "BOLT-ERROR: cannot process binaries with unmarked " + << "object in code at address 0x" + << Twine::utohexstr(Addr) << " belonging to section " + << SectionName << " in relocation mode.\n"; + exit(1); + } + + ContainingFunction = + getBinaryFunctionContainingAddress(Addr, + /*CheckPastEnd=*/false, + /*UseMaxSize=*/true); + if (ContainingFunction) { + errs() << "BOLT-WARNING: function " << *ContainingFunction + << " has an object detected in a padding region at address 0x" + << Twine::utohexstr(Addr) << '\n'; + ContainingFunction->setMaxSize( + Addr - ContainingFunction->getAddress()); + } } } BC->InterproceduralReferences.clear(); @@ -3295,12 +3329,17 @@ bool RewriteInstance::willOverwriteSection(StringRef SectionName) { BinaryFunction * RewriteInstance::getBinaryFunctionContainingAddress(uint64_t Address, - bool CheckPastEnd) { + bool CheckPastEnd, + bool UseMaxSize) { auto FI = BinaryFunctions.upper_bound(Address); if (FI == BinaryFunctions.begin()) return nullptr; --FI; - if (Address >= FI->first + FI->second.getSize() + 
CheckPastEnd) + + const auto UsedSize = UseMaxSize ? FI->second.getMaxSize() + : FI->second.getSize(); + + if (Address >= FI->first + UsedSize + (CheckPastEnd ? 1 : 0)) return nullptr; return &FI->second; } diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 3e4da2122dda..d1d799f3d8f6 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -227,16 +227,26 @@ class RewriteInstance { return OLT.findSymbol(Name, false).getAddress(); } - /// Return BinaryFunction containing the given \p Address or nullptr if + /// Return BinaryFunction containing a given \p Address or nullptr if /// no registered function has it. /// + /// In a binary a function has somewhat vague boundaries. E.g. a function can + /// refer to the first byte past the end of the function, and it will still be + /// referring to this function, not the function following it in the address + /// space. Thus we have the following flags that allow to lookup for + /// a function where a caller has more context for the search. + /// /// If \p CheckPastEnd is true and the \p Address falls on a byte /// immediately following the last byte of some function and there's no other /// function that starts there, then return the function as the one containing /// the \p Address. This is useful when we need to locate functions for /// references pointing immediately past a function body. + /// + /// If \p UseMaxSize is true, then include the space between this function + /// body and the next object in address ranges that we check. 
BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address, - bool CheckPastEnd = false); + bool CheckPastEnd = false, + bool UseMaxSize = false); const BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address) const; From 354a8933a0b4890e7ec4eb0e5fa4dabc9243b50b Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 27 Feb 2017 21:44:38 -0800 Subject: [PATCH 225/904] [BOLT] Update tests Summary: Fix validateCFG to handle BBs that were generated from code that used _builtin_unreachable(). Add -verify-cfg option to run CFG validation after every optimization pass. (cherry picked from commit 629e4535ef53c4747c8f472a3e18e1115b970876) --- bolt/BinaryBasicBlock.cpp | 12 ++++++------ bolt/BinaryFunction.cpp | 15 ++++++++++----- bolt/BinaryFunction.h | 2 +- bolt/BinaryPassManager.cpp | 20 ++++++++++++++++++++ 4 files changed, 37 insertions(+), 12 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 3934ae25cf15..d845ba4c7f2a 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -69,12 +69,12 @@ bool BinaryBasicBlock::validateSuccessorInvariants() { case 1: return !CondBranch; case 2: - if (CondBranch) { - return (TBB == getConditionalSuccessor(true)->getLabel() && - ((!UncondBranch && !FBB) || - (UncondBranch && FBB == getConditionalSuccessor(false)->getLabel()))); - } - return true; + return + (!CondBranch || + (TBB == getConditionalSuccessor(true)->getLabel() && + ((!UncondBranch && !FBB) || + (UncondBranch && + FBB == getConditionalSuccessor(false)->getLabel())))); default: return true; } diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index f31b1a8105ca..eb326cfcb032 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1840,6 +1840,8 @@ bool BinaryFunction::buildCFG() { // Annotate invoke instructions with GNU_args_size data. 
propagateGnuArgsSizeInfo(); + assert(validateCFG() && "Invalid CFG detected after disassembly"); + return true; } @@ -2810,12 +2812,13 @@ void BinaryFunction::dumpGraphToFile(std::string Filename) const { dumpGraph(of); } -bool BinaryFunction::validateCFG() { +bool BinaryFunction::validateCFG() const { bool Valid = true; for (auto *BB : BasicBlocks) { Valid &= BB->validateSuccessorInvariants(); if (!Valid) { - errs() << "BOLT-WARNING: CFG invalid @ " << BB->getName() << "\n"; + errs() << "BOLT-WARNING: CFG invalid in " << *this << " @ " + << BB->getName() << "\n"; } } @@ -2827,14 +2830,16 @@ bool BinaryFunction::validateCFG() { for (auto *LPBlock : BB->LandingPads) { Valid &= Seen.count(LPBlock) == 0; if (!Valid) { - errs() << "Duplicate LP seen " << LPBlock->getName() << "\n"; + errs() << "BOLT-WARNING: Duplicate LP seen " << LPBlock->getName() + << "in " << *this << "\n"; break; } Seen.insert(LPBlock); auto count = LPBlock->Throwers.count(BB); Valid &= (count == 1); if (!Valid) { - errs() << "Inconsistent landing pad detected " << LPBlock->getName() + errs() << "BOLT-WARNING: Inconsistent landing pad detected in " + << *this << ": " << LPBlock->getName() << " is in LandingPads but not in " << BB->getName() << "->Throwers\n"; break; @@ -2896,7 +2901,7 @@ void BinaryFunction::fixBranches() { // terminator) or more than 2 (switch table) don't require branch // instruction adjustments. } - assert(validateCFG()); + assert(validateCFG() && "Invalid CFG detected after fixing branches"); } void BinaryFunction::splitFunction() { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 70c93792f74f..75443ec132fd 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -888,7 +888,7 @@ class BinaryFunction : public AddressRangesOwner { } /// Attempt to validate CFG invariants. - bool validateCFG(); + bool validateCFG() const; /// Return dynostats for the function. 
/// diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 1364452ba35b..98b62a7c0716 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -26,6 +26,13 @@ llvm::cl::opt TimeOpts("time-opts", cl::desc("print time spent in each optimization"), cl::init(false), cl::ZeroOrMore); +static llvm::cl::opt +VerifyCFG("verify-cfg", + cl::desc("verify the CFG after every pass"), + cl::init(false), + cl::Hidden, + cl::ZeroOrMore); + static cl::opt EliminateUnreachable("eliminate-unreachable", cl::desc("eliminate unreachable code"), @@ -200,6 +207,19 @@ void BinaryFunctionPassManager::runPasses() { opts::DynoStatsAll ); + if (opts::VerifyCFG && + !std::accumulate( + BFs.begin(), BFs.end(), + true, + [](const bool Valid, + const std::pair &It) { + return Valid && It.second.validateCFG(); + })) { + errs() << "BOLT-ERROR: Invalid CFG detected after pass " + << Pass->getName() << "\n"; + exit(1); + } + if (!opts::PrintAll && !opts::DumpDotAll && !Pass->printPass()) continue; From 334e4aaae64bbaf5c559c7a1287ab0c615ae48f7 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 7 Mar 2017 14:22:15 -0800 Subject: [PATCH 226/904] [BOLT] Fix verbose output. Summary: Inadvertently, output of BOLT became way too verbose. Discovered while building HHVM on master. 
(cherry picked from commit 127c64135a326b62e68d73c66922eb0968f63ce8) --- bolt/RewriteInstance.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 18b0271503ed..cb8a79eecfaf 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -738,9 +738,11 @@ void RewriteInstance::run() { auto FunctionIt = BinaryFunctions.find(Address); assert(FunctionIt != BinaryFunctions.end() && "Invalid large function address."); - errs() << "BOLT-WARNING: Function " << FunctionIt->second - << " is larger than its orginal size: emitting again marking it " - << "as not simple.\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: Function " << FunctionIt->second + << " is larger than its orginal size: emitting again marking it " + << "as not simple.\n"; + } FunctionIt->second.setSimple(false); } @@ -1670,7 +1672,9 @@ void RewriteInstance::disassembleFunctions() { getBinaryFunctionContainingAddress(Addr, /*CheckPastEnd=*/false, /*UseMaxSize=*/true); - if (ContainingFunction) { + // We are not going to overwrite non-simple functions, but for simple + // ones - adjust the padding size. + if (ContainingFunction && ContainingFunction->isSimple()) { errs() << "BOLT-WARNING: function " << *ContainingFunction << " has an object detected in a padding region at address 0x" << Twine::utohexstr(Addr) << '\n'; From 45e0b4d6277b4c2ac9e3189c895249bb2f1d4136 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 7 Mar 2017 18:09:09 -0800 Subject: [PATCH 227/904] [BOLT] Fix gcc5 build. Summary: An include is required for gcc5 build. 
(cherry picked from commit b3dd5a0b6d879e6a2368e17fcc4e1852a2a84745) --- bolt/BinaryPassManager.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 98b62a7c0716..7246a0ccaef6 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -13,6 +13,7 @@ #include "Passes/FrameOptimizer.h" #include "Passes/Inliner.h" #include "llvm/Support/Timer.h" +#include using namespace llvm; From 1895eb8bca12fcd7046935e151a650f3dcb2ecb1 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 7 Mar 2017 11:45:07 -0800 Subject: [PATCH 228/904] Fix hfsort callgraph stats, add hfsort test. Summary: The stats for call sites that are not included in the call graph were broken. The intention is to count the total number of call sites vs. the number of call sites that are ignored because they have targets that are not BinaryFunctions. Also add a new test for hfsort. (cherry picked from commit ff5a11a1ac991a93c4463c11d00c98d24d6816db) --- bolt/Passes/BinaryPasses.cpp | 40 ++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index fad3437f2417..81dc7f2e4030 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -1691,7 +1691,7 @@ void ReorderFunctions::buildCallGraph(BinaryContext &BC, }; // Add call graph edges. - uint64_t NotFound = 0; + uint64_t NotProcessed = 0; uint64_t TotalCalls = 0; for (auto &It : BFs) { auto *Function = &It.second; @@ -1719,18 +1719,20 @@ void ReorderFunctions::buildCallGraph(BinaryContext &BC, }; for (auto *BB : Function->layout()) { - if (!BB->isCold()) { // Don't count calls from cold blocks - for (auto &Inst : *BB) { - // Find call instructions and extract target symbols from each one. 
- bool Success = false; - if (BC.MIA->isCall(Inst)) - ++TotalCalls; + // Don't count calls from cold blocks + if (BB->isCold()) + continue; + for (auto &Inst : *BB) { + // Find call instructions and extract target symbols from each one. + if (BC.MIA->isCall(Inst)) { + ++TotalCalls; if (const auto *DstSym = BC.MIA->getTargetSymbol(Inst)) { // For direct calls, just use the BB execution count. assert(BB->hasProfile()); const auto Count = opts::UseEdgeCounts ? BB->getExecutionCount() : 1; - Success = recordCall(DstSym, Count); + if (!recordCall(DstSym, Count)) + ++NotProcessed; } else if (BC.MIA->hasAnnotation(Inst, "EdgeCountData")) { // For indirect calls and jump tables, use branch data. assert(BranchDataOrErr); @@ -1739,35 +1741,37 @@ void ReorderFunctions::buildCallGraph(BinaryContext &BC, BC.MIA->getAnnotationAs(Inst, "EdgeCountData"); for (const auto &BI : BranchData.getBranchRange(DataOffset)) { + // Count each target as a separate call. + ++TotalCalls; + if (!BI.To.IsSymbol) { + ++NotProcessed; continue; } auto Itr = BC.GlobalSymbols.find(BI.To.Name); if (Itr == BC.GlobalSymbols.end()) { + ++NotProcessed; continue; } const auto *DstSym = BC.getOrCreateGlobalSymbol(Itr->second, "FUNCat"); - assert(BI.Branches > 0); - Success = recordCall(DstSym, opts::UseEdgeCounts ? BI.Branches : 1); + if (!recordCall(DstSym, opts::UseEdgeCounts ? BI.Branches : 1)) + ++NotProcessed; } } + } - if (!Success) - ++NotFound; - - if (!opts::UseEdgeCounts) { - Offset += BC.computeCodeSize(&Inst, &Inst + 1); - } + if (!opts::UseEdgeCounts) { + Offset += BC.computeCodeSize(&Inst, &Inst + 1); } } } } - outs() << "BOLT-INFO: ReorderFunctions: " << NotFound << " calls not " - << " processed out of " << TotalCalls << "\n"; + outs() << "BOLT-WARNING: ReorderFunctions: " << NotProcessed + << " callsites not processed out of " << TotalCalls << "\n"; // Normalize arc weights. 
if (!opts::UseEdgeCounts) { From ea1c3478618af3b936cd18c61148364d1d8487fe Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Sun, 12 Mar 2017 11:30:05 -0700 Subject: [PATCH 229/904] [BOLT] Do not process empty functions. Summary: While running on a recent test binary BOLT failed with an error. We were trying to process '__hot_end' (which is not really a function), and asserted that it had no basic blocks. This diff marks functions with an empty basic block list as non-simple since there's no need to process them. (cherry picked from commit 949b38939d9a6c959f14547013ed821c76568c0f) --- bolt/BinaryFunction.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index eb326cfcb032..2a6dba1bc8a2 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1612,6 +1612,11 @@ bool BinaryFunction::buildCFG() { } } + if (BasicBlocks.empty()) { + setSimple(false); + return false; + } + // Intermediate dump. DEBUG(print(dbgs(), "after creating basic blocks")); From 4f1e4fa159b0b67f5b5a1abd0a5c20a1a1da37d5 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 14 Mar 2017 09:03:23 -0700 Subject: [PATCH 230/904] [BOLT] Improve dynostats output. Summary: Reduce verbosity of dynostats to make them more readable. * Don't print "before" dynostats twice. * Detect if dynostats have changed after optimization and print before/after only if at least one metric has changed. Otherwise just print dynostats once and indicate "no change". * If any given metric hasn't changed, then print the difference as "(=)" as opposed to (+0.0%). 
(cherry picked from commit e5641eb047fde090c6a28c611401c465f6d5ee39) --- bolt/BinaryFunction.cpp | 11 +++++++++++ bolt/BinaryFunction.h | 18 +++++++++++------- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 2a6dba1bc8a2..c1c1ffaca5e3 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -165,6 +165,13 @@ bool DynoStats::operator<(const DynoStats &Other) const { ); } +bool DynoStats::operator==(const DynoStats &Other) const { + return std::equal( + &Stats[FIRST_DYNO_STAT], &Stats[LAST_DYNO_STAT], + &Other.Stats[FIRST_DYNO_STAT] + ); +} + bool DynoStats::lessThan(const DynoStats &Other, ArrayRef Keys) const { return std::lexicographical_compare( @@ -3865,9 +3872,13 @@ void DynoStats::print(raw_ostream &OS, const DynoStats *Other) const { uint64_t OtherStat) { OS << format("%'20lld : ", Stat * opts::DynoStatsScale) << Name; if (Other) { + if (Stat != OtherStat) { OS << format(" (%+.1f%%)", ( (float) Stat - (float) OtherStat ) * 100.0 / (float) (OtherStat + 1) ); + } else { + OS << " (=)"; + } } OS << '\n'; }; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 75443ec132fd..e41e946db511 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -131,6 +131,8 @@ class DynoStats { void operator+=(const DynoStats &Other); bool operator<(const DynoStats &Other) const; + bool operator==(const DynoStats &Other) const; + bool operator!=(const DynoStats &Other) const { return !operator==(Other); } bool lessThan(const DynoStats &Other, ArrayRef Keys) const; static const char* Description(const Category C) { @@ -1758,20 +1760,22 @@ callWithDynoStats(FnType &&Func, const FuncsType &Funcs, StringRef Phase, const bool Flag) { - DynoStats dynoStatsBefore; + DynoStats DynoStatsBefore; if (Flag) { - dynoStatsBefore = getDynoStats(Funcs); - outs() << "BOLT-INFO: program-wide dynostats before running " - << Phase << ":\n\n" << dynoStatsBefore << '\n'; + DynoStatsBefore = 
getDynoStats(Funcs); } Func(); if (Flag) { - auto dynoStatsAfter = getDynoStats(Funcs); + const auto DynoStatsAfter = getDynoStats(Funcs); + const auto Changed = (DynoStatsAfter != DynoStatsBefore); outs() << "BOLT-INFO: program-wide dynostats after running " - << Phase << ":\n\n" << dynoStatsBefore << '\n'; - dynoStatsAfter.print(outs(), &dynoStatsBefore); + << Phase << (Changed ? "" : " (no change)") << ":\n\n" + << DynoStatsBefore << '\n'; + if (Changed) { + DynoStatsAfter.print(outs(), &DynoStatsBefore); + } outs() << '\n'; } } From 8f314e0fc39157821a4bee2d24af42d3785de4f2 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 15 Mar 2017 19:31:20 -0700 Subject: [PATCH 231/904] [BOLT] Do not overwrite starting address in non-relocation mode. Summary: In non-relocation mode we shouldn't attempt to change ELF entry point. What made matters worse - it broke '-max-funcs=' and '-funcs=' options since an entry function more often than not was excluded from the list of processed functions, and we were setting entry point to 0. 
(cherry picked from commit 08b662e8e48a9b53d68138bb613bee2d97f4d2d7) --- bolt/RewriteInstance.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index cb8a79eecfaf..0a12489c5d37 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2121,10 +2121,13 @@ void RewriteInstance::emitFunctions() { OLT.emitAndFinalize(ObjectsHandle); - const auto *EntryFunction = getBinaryFunctionContainingAddress(EntryPoint); - assert(EntryFunction && "cannot find function for entry point"); - auto JITS = OLT.findSymbol(EntryFunction->getSymbol()->getName(), false); - EntryPoint = JITS.getAddress(); + if (opts::Relocs) { + const auto *EntryFunction = getBinaryFunctionContainingAddress(EntryPoint); + assert(EntryFunction && "cannot find function for entry point"); + auto JITS = OLT.findSymbol(EntryFunction->getSymbol()->getName(), false); + EntryPoint = JITS.getAddress(); + assert(EntryPoint && "entry point cannot be NULL"); + } if (opts::KeepTmp) TempOut->keep(); From d81b898cd9b1b56bead2ed5fa629009b232f9c2c Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 17 Mar 2017 19:05:11 -0700 Subject: [PATCH 232/904] [BOLT] Add option to print only specific functions. Summary: Add option '-print-only=func1,func2,...' to print only functions of interest. The rest of the functions are still processed and optimized (e.g. inlined), but only the ones on the list are printed. 
(cherry picked from commit 2c403defbe4136d63bbe4ac139a70eb0e9c8efb3) --- bolt/BinaryFunction.cpp | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index c1c1ffaca5e3..57d1cd8cbbb3 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -97,12 +97,32 @@ PrintJumpTables("print-jump-tables", cl::ZeroOrMore, cl::Hidden); +static cl::list +PrintOnly("print-only", + cl::CommaSeparated, + cl::desc("list of functions to print"), + cl::value_desc("func1,func2,func3,..."), + cl::Hidden); + static cl::opt SplitEH("split-eh", cl::desc("split C++ exception handling code (experimental)"), cl::ZeroOrMore, cl::Hidden); +bool shouldPrint(const BinaryFunction &Function) { + if (PrintOnly.empty()) + return true; + + for (auto &Name : opts::PrintOnly) { + if (Function.hasName(Name)) { + return true; + } + } + + return false; +} + } // namespace opts namespace llvm { @@ -309,7 +329,7 @@ void BinaryFunction::dump(std::string Annotation, void BinaryFunction::print(raw_ostream &OS, std::string Annotation, bool PrintInstructions) const { // FIXME: remove after #15075512 is done. - if (!opts::shouldProcess(*this)) + if (!opts::shouldProcess(*this) || !opts::shouldPrint(*this)) return; StringRef SectionName; From 8dd239faa61fa857b213d1aa78e19923e21644a1 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 18 Mar 2017 11:55:45 -0700 Subject: [PATCH 233/904] [BOLT] Don't allow non-symbol targets in ICP Summary: ICP was letting through call targets that weren't symbols. This diff filters out the non-symbol targets before running ICP. 
(cherry picked from commit 0b6c0d3596d53b7874548d4366a30189b0017779) --- bolt/Passes/BinaryPasses.cpp | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 81dc7f2e4030..c720e65c44c2 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -1092,6 +1092,14 @@ std::vector IndirectCallPromotion::getCallTargets( return A.Branches > B.Branches; }); + // Remove non-symbol targets + auto Last = std::remove_if(Targets.begin(), + Targets.end(), + [](const BranchInfo &BI) { + return !BI.To.IsSymbol; + }); + Targets.erase(Last, Targets.end()); + return Targets; } @@ -1104,20 +1112,15 @@ IndirectCallPromotion::findCallTargetSymbols( std::vector> SymTargets; for (size_t I = 0; I < N; ++I) { - MCSymbol* Symbol = nullptr; - uint64_t Addr = 0; - if (Targets[I].To.IsSymbol) { - auto itr = BC.GlobalSymbols.find(Targets[I].To.Name); - if (itr == BC.GlobalSymbols.end()) { - // punt if we can't find a symbol. - break; - } - Symbol = BC.getOrCreateGlobalSymbol(itr->second, "FUNCat"); - assert(Symbol); - } else { - Addr = Targets[I].To.Offset; + assert(Targets[I].To.IsSymbol && "All ICP targets must be symbols."); + auto Itr = BC.GlobalSymbols.find(Targets[I].To.Name); + if (Itr == BC.GlobalSymbols.end()) { + // punt if we can't find a symbol. + break; } - SymTargets.push_back(std::make_pair(Symbol, Addr)); + MCSymbol* Symbol = BC.getOrCreateGlobalSymbol(Itr->second, "FUNCat"); + assert(Symbol && "All ICP targets must be known symbols."); + SymTargets.push_back(std::make_pair(Symbol, 0)); } return SymTargets; From d56a08e50b71017213bf56773544a8e1805c0032 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 17 Mar 2017 10:32:56 -0700 Subject: [PATCH 234/904] Change dynostats dynamic instruction count policy Summary: Also add LOAD/STORE counters. 
(cherry picked from commit bdd26488bbf222057b0943f6dd5c925dc91cdd51) --- bolt/BinaryFunction.cpp | 16 +++++++--------- bolt/BinaryFunction.h | 2 ++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 57d1cd8cbbb3..4b7fa4644588 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -3772,12 +3772,7 @@ DynoStats BinaryFunction::getDynoStats() const { // frequencies. This may deviate from the sum of outgoing branches of the // basic block especially since the block may contain a function that // does not return or a function that throws an exception. - uint64_t BBExecutionCount = 0; - for (const auto &BI : BB->branch_info()) { - assert(BI.Count != BinaryBasicBlock::COUNT_NO_PROFILE && - "unexpected empty profile"); - BBExecutionCount += BI.Count; - } + const uint64_t BBExecutionCount = BB->getKnownExecutionCount(); // Ignore empty blocks and blocks that were not executed. if (BB->getNumNonPseudos() == 0 || BBExecutionCount == 0) @@ -3785,6 +3780,12 @@ DynoStats BinaryFunction::getDynoStats() const { // Count the number of calls by iterating through all instructions. 
for (const auto &Instr : *BB) { + if (BC.MIA->isStore(Instr)) { + Stats[DynoStats::STORES] += BBExecutionCount; + } + if (BC.MIA->isLoad(Instr)) { + Stats[DynoStats::LOADS] += BBExecutionCount; + } if (!BC.MIA->isCall(Instr)) continue; Stats[DynoStats::FUNCTION_CALLS] += BBExecutionCount; @@ -3868,9 +3869,6 @@ DynoStats BinaryFunction::getDynoStats() const { if (NonTakenCount == COUNT_NO_PROFILE) NonTakenCount = 0; - assert(TakenCount + NonTakenCount == BBExecutionCount && - "internal calculation error"); - if (IsForwardBranch) { Stats[DynoStats::FORWARD_COND_BRANCHES] += BBExecutionCount; Stats[DynoStats::FORWARD_COND_BRANCHES_TAKEN] += TakenCount; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index e41e946db511..3aa6bc16f6ab 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -69,6 +69,8 @@ class DynoStats { D(INDIRECT_CALLS, "indirect calls", Fn)\ D(PLT_CALLS, "PLT calls", Fn)\ D(INSTRUCTIONS, "executed instructions", Fn)\ + D(LOADS, "executed load instructions", Fn)\ + D(STORES, "executed store instructions", Fn)\ D(JUMP_TABLE_BRANCHES, "taken jump table branches", Fn)\ D(ALL_BRANCHES, "total branches",\ Fadd(ALL_CONDITIONAL, UNCOND_BRANCHES))\ From 56e1f32b25bbd2b978d77477df1779627dd1bd8d Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 22 Mar 2017 22:05:50 -0700 Subject: [PATCH 235/904] [BOLT] Issue error in relocs mode if input is lacking relocations. Summary: If we specify "-relocs" flag and an input has no relocations we proceed with assumptions that relocations were there and break the binary. Detect the condition above, and reject the input. 
(cherry picked from commit 60ae1cd0b072082b187cafe8c67dfe0da6d2c052) --- bolt/RewriteInstance.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 0a12489c5d37..29447f63f049 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1250,6 +1250,8 @@ BinaryFunction *RewriteInstance::createBinaryFunction( } void RewriteInstance::readSpecialSections() { + bool HasTextRelocations = false; + // Process special sections. for (const auto &Section : InputFile->sections()) { StringRef SectionName; @@ -1268,6 +1270,8 @@ void RewriteInstance::readSpecialSections() { DebugLocSize = Section.getSize(); } else if (SectionName == ".eh_frame") { EHFrameSection = Section; + } else if (SectionName == ".rela.text") { + HasTextRelocations = true; } // Ignore zero-size allocatable sections as they present no interest to us. @@ -1278,6 +1282,12 @@ void RewriteInstance::readSpecialSections() { } } + if (opts::Relocs && !HasTextRelocations) { + errs() << "BOLT-ERROR: relocations against code are missing from the input " + "file. Cannot proceed in relocations mode (-relocs).\n"; + exit(1); + } + // Process debug sections. EHFrame = BC->DwCtx->getEHFrame(); if (opts::DumpEHFrame) { From 1bb2034c976490d72708eb90c1322766f131ed36 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 28 Mar 2017 14:40:20 -0700 Subject: [PATCH 236/904] [BOLT] Organize options in categories for pretty printing (near NFC). Summary: Each BOLT-specific option now belongs to BoltCategory or BoltOptCategory. Use alphabetical order for options in source code (does not affect output). The result is a cleaner output of "llvm-bolt -help" which does not include any unrelated llvm options and is close to the following: ..... 
BOLT generic options: -data= - -dyno-stats - print execution info based on profile -hot-text - hot text symbols support (relocation mode) -o= - -relocs - relocation mode - use relocations to move functions in the binary -update-debug-sections - update DWARF debug sections of the executable -use-gnu-stack - use GNU_STACK program header for new segment (workaround for issues with strip/objcopy) -use-old-text - re-use space in old .text if possible (relocation mode) -v= - set verbosity level for diagnostic output BOLT optimization options: -align-blocks - try to align BBs inserting nops -align-functions= - align functions at a given value (relocation mode) -align-functions-max-bytes= - maximum number of bytes to use to align functions -boost-macroops - try to boost macro-op fusions by avoiding the cache-line boundary -eliminate-unreachable - eliminate unreachable code -frame-opt - optimize stack frame accesses ...... (cherry picked from commit e9d253b7626bc7c5c4051c769e06f5f0359badfb) --- bolt/BinaryContext.cpp | 7 +- bolt/BinaryFunction.cpp | 49 +++-- bolt/BinaryPassManager.cpp | 230 +++++++++++---------- bolt/Exceptions.cpp | 9 +- bolt/Passes/BinaryPasses.cpp | 332 ++++++++++++++++--------------- bolt/Passes/Inliner.cpp | 24 ++- bolt/Passes/ReorderAlgorithm.cpp | 16 +- bolt/RewriteInstance.cpp | 328 +++++++++++++++++------------- bolt/llvm-bolt.cpp | 31 ++- bolt/merge-fdata/merge-fdata.cpp | 53 ++--- 10 files changed, 609 insertions(+), 470 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 2a9dde0ccb53..660d597a3a93 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -24,12 +24,15 @@ using namespace bolt; namespace opts { +extern cl::OptionCategory BoltCategory; + extern cl::opt Relocs; static cl::opt PrintDebugInfo("print-debug-info", - cl::desc("print debug info when printing functions"), - cl::Hidden); + cl::desc("print debug info when printing functions"), + cl::Hidden, + cl::cat(BoltCategory)); } // namespace opts 
diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 4b7fa4644588..39affc6455bb 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -42,6 +42,10 @@ using namespace bolt; namespace opts { +extern cl::OptionCategory BoltCategory; +extern cl::OptionCategory BoltOptCategory; +extern cl::OptionCategory BoltRelocCategory; + extern bool shouldProcess(const BinaryFunction &); extern cl::opt PrintDynoStats; @@ -51,25 +55,30 @@ extern cl::opt Verbosity; static cl::opt AggressiveSplitting("split-all-cold", - cl::desc("outline as many cold basic blocks as possible"), - cl::ZeroOrMore); + cl::desc("outline as many cold basic blocks as possible"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); static cl::opt AlignBlocks("align-blocks", - cl::desc("try to align BBs inserting nops"), - cl::ZeroOrMore); + cl::desc("try to align BBs inserting nops"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); static cl::opt DotToolTipCode("dot-tooltip-code", - cl::desc("add basic block instructions as tool tips on nodes"), - cl::ZeroOrMore, - cl::Hidden); + cl::desc("add basic block instructions as tool tips on nodes"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); static cl::opt DynoStatsScale("dyno-stats-scale", - cl::desc("scale to be applied while reporting dyno stats"), - cl::Optional, - cl::init(1)); + cl::desc("scale to be applied while reporting dyno stats"), + cl::Optional, + cl::init(1), + cl::Hidden, + cl::cat(BoltCategory)); cl::opt JumpTables("jump-tables", @@ -89,26 +98,30 @@ JumpTables("jump-tables", "aggressively split jump tables section based on usage " "of the tables"), clEnumValEnd), - cl::ZeroOrMore); + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); static cl::opt PrintJumpTables("print-jump-tables", - cl::desc("print jump tables"), - cl::ZeroOrMore, - cl::Hidden); + cl::desc("print jump tables"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); static cl::list PrintOnly("print-only", cl::CommaSeparated, cl::desc("list of 
functions to print"), cl::value_desc("func1,func2,func3,..."), - cl::Hidden); + cl::Hidden, + cl::cat(BoltCategory)); static cl::opt SplitEH("split-eh", - cl::desc("split C++ exception handling code (experimental)"), - cl::ZeroOrMore, - cl::Hidden); + cl::desc("split C++ exception handling code (experimental)"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); bool shouldPrint(const BinaryFunction &Function) { if (PrintOnly.empty()) diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 7246a0ccaef6..b3f1d306ee4a 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -19,160 +19,186 @@ using namespace llvm; namespace opts { +extern cl::OptionCategory BoltOptCategory; + extern cl::opt PrintAll; extern cl::opt DumpDotAll; extern cl::opt DynoStatsAll; -llvm::cl::opt TimeOpts("time-opts", - cl::desc("print time spent in each optimization"), - cl::init(false), cl::ZeroOrMore); - -static llvm::cl::opt -VerifyCFG("verify-cfg", - cl::desc("verify the CFG after every pass"), - cl::init(false), - cl::Hidden, - cl::ZeroOrMore); - static cl::opt EliminateUnreachable("eliminate-unreachable", - cl::desc("eliminate unreachable code"), - cl::init(true), - cl::ZeroOrMore); + cl::desc("eliminate unreachable code"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); static cl::opt -OptimizeBodylessFunctions( - "optimize-bodyless-functions", - cl::desc("optimize functions that just do a tail call"), - cl::ZeroOrMore); +IndirectCallPromotion("indirect-call-promotion", + cl::desc("indirect call promotion"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); static cl::opt -IndirectCallPromotion("indirect-call-promotion", - cl::desc("indirect call promotion"), - cl::ZeroOrMore); +InlineSmallFunctions("inline-small-functions", + cl::desc("inline functions with a single basic block"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); static cl::opt -InlineSmallFunctions( - "inline-small-functions", - cl::desc("inline functions 
with a single basic block"), - cl::ZeroOrMore); +NeverPrint("never-print", + cl::desc("never print"), + cl::init(false), + cl::ZeroOrMore, + cl::ReallyHidden, + cl::cat(BoltOptCategory)); static cl::opt -SimplifyConditionalTailCalls("simplify-conditional-tail-calls", - cl::desc("simplify conditional tail calls " - "by removing unnecessary jumps"), - cl::ZeroOrMore); +OptimizeBodylessFunctions("optimize-bodyless-functions", + cl::desc("optimize functions that just do a tail call"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +OptimizeFrameAccesses("frame-opt", + cl::desc("optimize stack frame accesses"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); static cl::opt Peepholes("peepholes", - cl::desc("run peephole optimizations"), - cl::ZeroOrMore); + cl::desc("run peephole optimizations"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); static cl::opt -SimplifyRODataLoads("simplify-rodata-loads", - cl::desc("simplify loads from read-only sections by " - "replacing the memory operand with the " - "constant found in the corresponding " - "section"), - cl::ZeroOrMore); +PrintAfterBranchFixup("print-after-branch-fixup", + cl::desc("print function after fixing local branches"), + cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt -StripRepRet("strip-rep-ret", - cl::desc("strip 'repz' prefix from 'repz retq' sequence (on by default)"), - cl::init(true), - cl::ZeroOrMore); +PrintAfterLowering("print-after-lowering", + cl::desc("print function after instruction lowering"), + cl::Hidden, + cl::cat(BoltOptCategory)); -static cl::opt OptimizeFrameAccesses( - "frame-opt", cl::desc("optimize stack frame accesses"), cl::ZeroOrMore); +static cl::opt +PrintFOP("print-fop", + cl::desc("print functions after frame optimizer pass"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt -PrintReordered("print-reordered", - cl::desc("print functions after layout optimization"), - cl::ZeroOrMore, - cl::Hidden); 
+PrintFinalized("print-finalized", + cl::desc("print function after CFG is finalized"), + cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt -PrintReorderedFunctions("print-reordered-functions", - cl::desc("print functions after clustering"), - cl::ZeroOrMore, - cl::Hidden); +PrintICF("print-icf", + cl::desc("print functions after ICF optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt -PrintOptimizeBodyless("print-optimize-bodyless", - cl::desc("print functions after bodyless optimization"), - cl::ZeroOrMore, - cl::Hidden); +PrintICP("print-icp", + cl::desc("print functions after indirect call promotion"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt -PrintAfterBranchFixup("print-after-branch-fixup", - cl::desc("print function after fixing local branches"), - cl::Hidden); +PrintInline("print-inline", + cl::desc("print functions after inlining optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt -PrintFinalized("print-finalized", - cl::desc("print function after CFG is finalized"), - cl::Hidden); +PrintOptimizeBodyless("print-optimize-bodyless", + cl::desc("print functions after bodyless optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt -PrintAfterLowering("print-after-lowering", - cl::desc("print function after instruction lowering"), - cl::Hidden); +PrintPeepholes("print-peepholes", + cl::desc("print functions after peephole optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt -PrintUCE("print-uce", - cl::desc("print functions after unreachable code elimination"), - cl::ZeroOrMore, - cl::Hidden); +PrintReordered("print-reordered", + cl::desc("print functions after layout optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt -PrintSCTC("print-sctc", - cl::desc("print functions after conditional tail call simplification"), - 
cl::ZeroOrMore, - cl::Hidden); +PrintReorderedFunctions("print-reordered-functions", + cl::desc("print functions after clustering"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt -PrintPeepholes("print-peepholes", - cl::desc("print functions after peephole optimization"), - cl::ZeroOrMore, - cl::Hidden); +PrintSCTC("print-sctc", + cl::desc("print functions after conditional tail call simplification"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt PrintSimplifyROLoads("print-simplify-rodata-loads", - cl::desc("print functions after simplification of RO data" - " loads"), - cl::ZeroOrMore, - cl::Hidden); + cl::desc("print functions after simplification of RO data loads"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt -PrintICF("print-icf", - cl::desc("print functions after ICF optimization"), - cl::ZeroOrMore, - cl::Hidden); +PrintUCE("print-uce", + cl::desc("print functions after unreachable code elimination"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt -PrintICP("print-icp", - cl::desc("print functions after indirect call promotion"), - cl::ZeroOrMore, - cl::Hidden); +SimplifyConditionalTailCalls("simplify-conditional-tail-calls", + cl::desc("simplify conditional tail calls by removing unnecessary jumps"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); static cl::opt -PrintInline("print-inline", - cl::desc("print functions after inlining optimization"), - cl::ZeroOrMore, - cl::Hidden); +SimplifyRODataLoads("simplify-rodata-loads", + cl::desc("simplify loads from read-only sections by replacing the memory " + "operand with the constant found in the corresponding section"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); static cl::opt -PrintFOP("print-fop", - cl::desc("print functions after frame optimizer pass"), - cl::ZeroOrMore, - cl::Hidden); +StripRepRet("strip-rep-ret", + cl::desc("strip 'repz' prefix from 'repz retq' sequence (on by 
default)"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); -static cl::opt -NeverPrint("never-print", - cl::desc("never print"), - cl::init(false), - cl::ZeroOrMore, - cl::ReallyHidden); +static llvm::cl::opt +TimeOpts("time-opts", + cl::desc("print time spent in each optimization"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static llvm::cl::opt +VerifyCFG("verify-cfg", + cl::desc("verify the CFG after every pass"), + cl::init(false), + cl::Hidden, + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); } // namespace opts diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 4e7a39f1cb00..4fd5e7494aed 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -37,13 +37,16 @@ using namespace llvm::dwarf; namespace opts { +extern llvm::cl::OptionCategory BoltCategory; + extern llvm::cl::opt Verbosity; static llvm::cl::opt PrintExceptions("print-exceptions", - llvm::cl::desc("print exception handling data"), - llvm::cl::ZeroOrMore, - llvm::cl::Hidden); + llvm::cl::desc("print exception handling data"), + llvm::cl::ZeroOrMore, + llvm::cl::Hidden, + llvm::cl::cat(BoltCategory)); } // namespace opts diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index c720e65c44c2..17bd0b475986 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -49,6 +49,8 @@ const char* dynoStatsOptDesc(const bolt::DynoStats::Category C) { namespace opts { +extern cl::OptionCategory BoltOptCategory; + extern cl::opt Verbosity; extern cl::opt RandomSeed; extern cl::opt Relocs; @@ -56,117 +58,174 @@ extern cl::opt SplitFunctions; extern bool shouldProcess(const bolt::BinaryFunction &Function); extern size_t padFunction(const bolt::BinaryFunction &Function); -static cl::opt -IndirectCallPromotionThreshold( - "indirect-call-promotion-threshold", - cl::desc("threshold for optimizing a frequently taken indirect call"), - cl::init(90), - cl::ZeroOrMore); +enum DynoStatsSortOrder : char { + Ascending, + 
Descending +}; -static cl::opt -IndirectCallPromotionMispredictThreshold( - "indirect-call-promotion-mispredict-threshold", - cl::desc("misprediction threshold for skipping ICP on an " - "indirect call"), - cl::init(2), - cl::ZeroOrMore); +static cl::opt +DynoStatsSortOrderOpt("print-sorted-by-order", + cl::desc("use ascending or descending order when printing functions " + "ordered by dyno stats"), + cl::ZeroOrMore, + cl::init(DynoStatsSortOrder::Descending), + cl::cat(BoltOptCategory)); + +static cl::opt +FunctionOrderFile("function-order", + cl::desc("file containing an ordered list of functions to use for function " + "reordering"), + cl::cat(BoltOptCategory)); static cl::opt -IndirectCallPromotionUseMispredicts( - "indirect-call-promotion-use-mispredicts", - cl::desc("use misprediction frequency for determining whether or not ICP " - "should be applied at a callsite. The " - "-indirect-call-promotion-mispredict-threshold value will be used " - "by this heuristic"), - cl::ZeroOrMore); +ICF("icf", + cl::desc("fold functions with identical code"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); -static cl::opt -IndirectCallPromotionTopN( - "indirect-call-promotion-topn", - cl::desc("number of targets to consider when doing indirect " - "call promotion"), - cl::init(1), - cl::ZeroOrMore); +static cl::opt +ICFUseDFS("icf-dfs", + cl::desc("use DFS ordering when using -icf option"), + cl::ReallyHidden, + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); static cl::list ICPFuncsList("icp-funcs", - cl::CommaSeparated, - cl::desc("list of functions to enable ICP for"), - cl::value_desc("func1,func2,func3,..."), - cl::Hidden); + cl::CommaSeparated, + cl::desc("list of functions to enable ICP for"), + cl::value_desc("func1,func2,func3,..."), + cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt -ICPOldCodeSequence( - "icp-old-code-sequence", - cl::desc("use old code sequence for promoted calls"), - cl::init(false), - cl::ZeroOrMore, - cl::Hidden); 
+ICPOldCodeSequence("icp-old-code-sequence", + cl::desc("use old code sequence for promoted calls"), + cl::init(false), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); -static cl::opt -ReorderBlocks( - "reorder-blocks", - cl::desc("change layout of basic blocks in a function"), - cl::init(bolt::BinaryFunction::LT_NONE), - cl::values(clEnumValN(bolt::BinaryFunction::LT_NONE, - "none", - "do not reorder basic blocks"), - clEnumValN(bolt::BinaryFunction::LT_REVERSE, - "reverse", - "layout blocks in reverse order"), - clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE, - "normal", - "perform optimal layout based on profile"), - clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE_BRANCH, - "branch-predictor", - "perform optimal layout prioritizing branch " - "predictions"), - clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE_CACHE, - "cache", - "perform optimal layout prioritizing I-cache " - "behavior"), - clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE_SHUFFLE, - "cluster-shuffle", - "perform random layout of clusters"), - clEnumValEnd), - cl::ZeroOrMore); +static cl::opt +IndirectCallPromotionMispredictThreshold( + "indirect-call-promotion-mispredict-threshold", + cl::desc("misprediction threshold for skipping ICP on an " + "indirect call"), + cl::init(2), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +IndirectCallPromotionThreshold("indirect-call-promotion-threshold", + cl::desc("threshold for optimizing a frequently taken indirect call"), + cl::init(90), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +IndirectCallPromotionTopN("indirect-call-promotion-topn", + cl::desc("number of targets to consider when doing indirect " + "call promotion"), + cl::init(1), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +IndirectCallPromotionUseMispredicts("indirect-call-promotion-use-mispredicts", + cl::desc("use misprediction frequency for determining whether or not ICP " + "should be applied at a callsite. 
The " + "-indirect-call-promotion-mispredict-threshold value will be used " + "by this heuristic"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); static cl::opt -MinBranchClusters( - "min-branch-clusters", - cl::desc("use a modified clustering algorithm geared towards " - "minimizing branches"), - cl::ZeroOrMore, - cl::Hidden); +MinBranchClusters("min-branch-clusters", + cl::desc("use a modified clustering algorithm geared towards minimizing " + "branches"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); static cl::list -PrintSortedBy( - "print-sorted-by", - cl::CommaSeparated, - cl::desc("print functions sorted by order of dyno stats"), - cl::value_desc("key1,key2,key3,..."), - cl::values( -#define D(name, ...) \ +PrintSortedBy("print-sorted-by", + cl::CommaSeparated, + cl::desc("print functions sorted by order of dyno stats"), + cl::value_desc("key1,key2,key3,..."), + cl::values( +#define D(name, ...) \ clEnumValN(bolt::DynoStats::name, \ dynoStatsOptName(bolt::DynoStats::name), \ dynoStatsOptDesc(bolt::DynoStats::name)), DYNO_STATS #undef D clEnumValEnd), - cl::ZeroOrMore); + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); -enum DynoStatsSortOrder : char { - Ascending, - Descending -}; +static cl::opt +ReorderBlocks("reorder-blocks", + cl::desc("change layout of basic blocks in a function"), + cl::init(bolt::BinaryFunction::LT_NONE), + cl::values( + clEnumValN(bolt::BinaryFunction::LT_NONE, + "none", + "do not reorder basic blocks"), + clEnumValN(bolt::BinaryFunction::LT_REVERSE, + "reverse", + "layout blocks in reverse order"), + clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE, + "normal", + "perform optimal layout based on profile"), + clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE_BRANCH, + "branch-predictor", + "perform optimal layout prioritizing branch " + "predictions"), + clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE_CACHE, + "cache", + "perform optimal layout prioritizing I-cache " + "behavior"), + 
clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE_SHUFFLE, + "cluster-shuffle", + "perform random layout of clusters"), + clEnumValEnd), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); -static cl::opt -DynoStatsSortOrderOpt( - "print-sorted-by-order", - cl::desc("use ascending or descending order when printing " - "functions ordered by dyno stats"), - cl::ZeroOrMore, - cl::init(DynoStatsSortOrder::Descending)); +cl::opt +ReorderFunctions("reorder-functions", + cl::desc("reorder and cluster functions (works only with relocations)"), + cl::init(bolt::BinaryFunction::RT_NONE), + cl::values(clEnumValN(bolt::BinaryFunction::RT_NONE, + "none", + "do not reorder functions"), + clEnumValN(bolt::BinaryFunction::RT_EXEC_COUNT, + "exec-count", + "order by execution count"), + clEnumValN(bolt::BinaryFunction::RT_HFSORT, + "hfsort", + "use hfsort algorithm"), + clEnumValN(bolt::BinaryFunction::RT_HFSORT_PLUS, + "hfsort+", + "use hfsort+ algorithm"), + clEnumValN(bolt::BinaryFunction::RT_PETTIS_HANSEN, + "pettis-hansen", + "use Pettis-Hansen algorithm"), + clEnumValN(bolt::BinaryFunction::RT_RANDOM, + "random", + "reorder functions randomly"), + clEnumValN(bolt::BinaryFunction::RT_USER, + "user", + "use function order specified by -function-order"), + clEnumValEnd), + cl::cat(BoltOptCategory)); + +static cl::opt +ReorderFunctionsUseHotSize("reorder-functions-use-hot-size", + cl::desc("use a function's hot size when doing clustering"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); enum SctcModes : char { SctcAlways, @@ -175,80 +234,27 @@ enum SctcModes : char { }; static cl::opt -SctcMode( - "sctc-mode", - cl::desc("mode for simplify conditional tail calls"), - cl::init(SctcHeuristic), - cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"), - clEnumValN(SctcPreserveDirection, - "preserve", - "only perform sctc when branch direction is " - "preserved"), - clEnumValN(SctcHeuristic, - "heuristic", - "use branch prediction data to control sctc"), - 
clEnumValEnd), - cl::ZeroOrMore); - -static cl::opt -IdenticalCodeFolding( - "icf", - cl::desc("fold functions with identical code"), - cl::ZeroOrMore); - -static cl::opt -UseDFSForICF( - "icf-dfs", - cl::desc("use DFS ordering when using -icf option"), - cl::ReallyHidden, - cl::ZeroOrMore); - -cl::opt -ReorderFunctions( - "reorder-functions", - cl::desc("reorder and cluster functions (works only with relocations)"), - cl::init(bolt::BinaryFunction::RT_NONE), - cl::values(clEnumValN(bolt::BinaryFunction::RT_NONE, - "none", - "do not reorder functions"), - clEnumValN(bolt::BinaryFunction::RT_EXEC_COUNT, - "exec-count", - "order by execution count"), - clEnumValN(bolt::BinaryFunction::RT_HFSORT, - "hfsort", - "use hfsort algorithm"), - clEnumValN(bolt::BinaryFunction::RT_HFSORT_PLUS, - "hfsort+", - "use hfsort+ algorithm"), - clEnumValN(bolt::BinaryFunction::RT_PETTIS_HANSEN, - "pettis-hansen", - "use Pettis-Hansen algorithm"), - clEnumValN(bolt::BinaryFunction::RT_RANDOM, - "random", - "reorder functions randomly"), - clEnumValN(bolt::BinaryFunction::RT_USER, - "user", - "use function order specified by -function-order"), - clEnumValEnd)); - -static cl::opt -FunctionOrderFile("function-order", - cl::desc("file containing an ordered list of functions to use" - " for function reordering")); - -static cl::opt -ReorderFunctionsUseHotSize( - "reorder-functions-use-hot-size", - cl::desc("use a function's hot size when doing clustering"), - cl::init(true), - cl::ZeroOrMore); +SctcMode("sctc-mode", + cl::desc("mode for simplify conditional tail calls"), + cl::init(SctcHeuristic), + cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"), + clEnumValN(SctcPreserveDirection, + "preserve", + "only perform sctc when branch direction is " + "preserved"), + clEnumValN(SctcHeuristic, + "heuristic", + "use branch prediction data to control sctc"), + clEnumValEnd), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); static cl::opt -UseEdgeCounts( - "use-edge-counts", - 
cl::desc("use edge count data when doing clustering"), - cl::init(true), - cl::ZeroOrMore); +UseEdgeCounts("use-edge-counts", + cl::desc("use edge count data when doing clustering"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); } // namespace opts @@ -848,7 +854,7 @@ void SimplifyRODataLoads::runOnFunctions( void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &) { - if (!opts::IdenticalCodeFolding) + if (!opts::ICF) return; const auto OriginalFunctionCount = BFs.size(); @@ -856,7 +862,7 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC, uint64_t NumJTFunctionsFolded = 0; uint64_t BytesSavedEstimate = 0; uint64_t CallsSavedEstimate = 0; - static bool UseDFS = opts::UseDFSForICF; + static bool UseDFS = opts::ICFUseDFS; // This hash table is used to identify identical functions. It maps // a function to a bucket of functions identical to it. diff --git a/bolt/Passes/Inliner.cpp b/bolt/Passes/Inliner.cpp index abe3b9fb3c91..c376700f0526 100644 --- a/bolt/Passes/Inliner.cpp +++ b/bolt/Passes/Inliner.cpp @@ -17,19 +17,23 @@ using namespace llvm; namespace opts { -static cl::list -ForceInlineFunctions("force-inline", - cl::CommaSeparated, - cl::desc("list of functions to always consider " - "for inlining"), - cl::value_desc("func1,func2,func3,..."), - cl::Hidden); + +extern cl::OptionCategory BoltOptCategory; static cl::opt AggressiveInlining("aggressive-inlining", - cl::desc("perform aggressive inlining"), - cl::ZeroOrMore, - cl::Hidden); + cl::desc("perform aggressive inlining"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::list +ForceInlineFunctions("force-inline", + cl::CommaSeparated, + cl::desc("list of functions to always consider for inlining"), + cl::value_desc("func1,func2,func3,..."), + cl::Hidden, + cl::cat(BoltOptCategory)); } diff --git a/bolt/Passes/ReorderAlgorithm.cpp b/bolt/Passes/ReorderAlgorithm.cpp index 295d286aed23..bb976acb5e41 100644 --- 
a/bolt/Passes/ReorderAlgorithm.cpp +++ b/bolt/Passes/ReorderAlgorithm.cpp @@ -26,14 +26,22 @@ using namespace bolt; namespace opts { +extern cl::OptionCategory BoltOptCategory; + static cl::opt -PrintClusters("print-clusters", cl::desc("print clusters"), cl::ZeroOrMore); +PrintClusters("print-clusters", + cl::desc("print clusters"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); cl::opt RandomSeed("bolt-seed", - cl::desc("seed for randomization"), - cl::init(42), - cl::ZeroOrMore); + cl::desc("seed for randomization"), + cl::init(42), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); } // namespace opts diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 29447f63f049..efcded2443a0 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -66,200 +66,250 @@ using namespace bolt; namespace opts { +extern cl::OptionCategory BoltCategory; +extern cl::OptionCategory BoltOptCategory; + extern cl::opt JumpTables; extern cl::opt ReorderFunctions; static cl::opt -OutputFilename("o", cl::desc(""), cl::Required); - -// The default verbosity level (0) is pretty terse, level 1 is fairly -// verbose and usually prints some informational message for every -// function processed. Level 2 is for the noisiest of messages and -// often prints a message per basic block. -// Error messages should never be suppressed by the verbosity level. -// Only warnings and info messages should be affected. -// -// The rational behind stream usage is as follows: -// outs() for info and debugging controlled by command line flags. -// errs() for errors and warnings. -// dbgs() for output within DEBUG(). 
-cl::opt -Verbosity("v", - cl::desc("set verbosity level for diagnostic output"), - cl::init(0), - cl::ZeroOrMore); +OutputFilename("o", + cl::desc(""), + cl::Required, + cl::cat(BoltCategory)); static cl::opt AlignFunctions("align-functions", - cl::desc("align functions at a given value"), - cl::init(64), - cl::ZeroOrMore); + cl::desc("align functions at a given value (relocation mode)"), + cl::init(64), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); static cl::opt AlignFunctionsMaxBytes("align-functions-max-bytes", - cl::desc("maximum number of bytes to use to align functions"), - cl::init(7), - cl::ZeroOrMore); - -static cl::list -BreakFunctionNames("break-funcs", - cl::CommaSeparated, - cl::desc("list of functions to core dump on (debugging)"), - cl::value_desc("func1,func2,func3,..."), - cl::Hidden); + cl::desc("maximum number of bytes to use to align functions"), + cl::init(7), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); cl::opt -UseOldText("use-old-text", - cl::desc("re-use space in old .text if possible"), - cl::Hidden); +AllowStripped("allow-stripped", + cl::desc("allow processing of stripped binaries"), + cl::Hidden, + cl::cat(BoltCategory)); cl::opt -TrapOldCode("trap-old-code", - cl::desc("insert traps in old function bodies"), - cl::Hidden); +BoostMacroops("boost-macroops", + cl::desc("try to boost macro-op fusions by avoiding the cache-line boundary"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); -cl::opt -PrintDynoStats("dyno-stats", - cl::desc("print execution info based on profile")); +static cl::list +BreakFunctionNames("break-funcs", + cl::CommaSeparated, + cl::desc("list of functions to core dump on (debugging)"), + cl::value_desc("func1,func2,func3,..."), + cl::Hidden, + cl::cat(BoltCategory)); cl::opt -DynoStatsAll("dyno-stats-all", cl::desc("print dyno stats after each stage"), - cl::ZeroOrMore, - cl::Hidden); +DumpDotAll("dump-dot-all", + cl::desc("dump function CFGs to graphviz format after each stage"), + cl::ZeroOrMore, + cl::Hidden, 
+ cl::cat(BoltCategory)); -static cl::opt -TopCalledLimit("top-called-limit", - cl::desc("maximum number of functions to print in top called " - "functions section"), - cl::init(100), - cl::ZeroOrMore, - cl::Hidden); +static cl::opt +DumpEHFrame("dump-eh-frame", + cl::desc("dump parsed .eh_frame (debugging)"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); cl::opt -HotText("hot-text", - cl::desc("hot text symbols support"), - cl::ZeroOrMore); +DynoStatsAll("dyno-stats-all", + cl::desc("print dyno stats after each stage"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); + +static cl::opt +FixDebugInfoLargeFunctions("fix-debuginfo-large-functions", + cl::init(true), + cl::desc("do another pass if we encounter large functions, to correct their " + "debug info."), + cl::ZeroOrMore, + cl::ReallyHidden, + cl::cat(BoltCategory)); static cl::list FunctionNames("funcs", - cl::CommaSeparated, - cl::desc("list of functions to optimize"), - cl::value_desc("func1,func2,func3,...")); + cl::CommaSeparated, + cl::desc("list of functions to optimize"), + cl::value_desc("func1,func2,func3,..."), + cl::Hidden, + cl::cat(BoltCategory)); static cl::opt FunctionNamesFile("funcs-file", - cl::desc("file with list of functions to optimize")); - -cl::opt -Relocs("relocs", - cl::desc("relocation support (experimental)"), - cl::ZeroOrMore); + cl::desc("file with list of functions to optimize"), + cl::Hidden, + cl::cat(BoltCategory)); static cl::list FunctionPadSpec("pad-funcs", - cl::CommaSeparated, - cl::desc("list of functions to pad with amount of bytes"), - cl::value_desc("func1:pad1,func2:pad2,func3:pad3,...")); + cl::CommaSeparated, + cl::desc("list of functions to pad with amount of bytes"), + cl::value_desc("func1:pad1,func2:pad2,func3:pad3,..."), + cl::Hidden, + cl::cat(BoltCategory)); -static cl::list -SkipFunctionNames("skip-funcs", - cl::CommaSeparated, - cl::desc("list of functions to skip"), - cl::value_desc("func1,func2,func3,...")); +cl::opt 
+HotText("hot-text", + cl::desc("hot text symbols support (relocation mode)"), + cl::ZeroOrMore, + cl::cat(BoltCategory)); + +static cl::opt +KeepTmp("keep-tmp", + cl::desc("preserve intermediate .o file"), + cl::Hidden, + cl::cat(BoltCategory)); -static cl::opt -SkipFunctionNamesFile("skip-funcs-file", - cl::desc("file with list of functions to skip")); static cl::opt MarkFuncs("mark-funcs", - cl::desc("mark function boundaries with break instruction to make " - "sure we accidentally don't cross them"), - cl::ReallyHidden, - cl::ZeroOrMore); + cl::desc("mark function boundaries with break instruction to make " + "sure we accidentally don't cross them"), + cl::ReallyHidden, + cl::ZeroOrMore, + cl::cat(BoltCategory)); static cl::opt MaxFunctions("max-funcs", - cl::desc("maximum # of functions to overwrite"), - cl::ZeroOrMore); - -cl::opt -SplitFunctions("split-functions", - cl::desc("split functions into hot and cold regions"), - cl::init(BinaryFunction::ST_NONE), - cl::values(clEnumValN(BinaryFunction::ST_NONE, "0", - "do not split any function"), - clEnumValN(BinaryFunction::ST_EH, "1", - "split all landing pads"), - clEnumValN(BinaryFunction::ST_LARGE, "2", - "also split if function too large to fit"), - clEnumValN(BinaryFunction::ST_ALL, "3", - "split all functions"), - clEnumValEnd), - cl::ZeroOrMore); + cl::desc("maximum number of functions to overwrite"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); cl::opt -UpdateDebugSections("update-debug-sections", - cl::desc("update DWARF debug sections of the executable"), - cl::ZeroOrMore); +PrintAll("print-all", + cl::desc("print functions after each stage"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); static cl::opt -FixDebugInfoLargeFunctions("fix-debuginfo-large-functions", - cl::init(true), - cl::desc("do another pass if we encounter large " - "functions, to correct their debug info."), - cl::ZeroOrMore, - cl::ReallyHidden); +PrintCFG("print-cfg", + cl::desc("print functions after CFG 
construction"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); static cl::opt -AlignBlocks("align-blocks", - cl::desc("try to align BBs inserting nops"), - cl::ZeroOrMore); +PrintDisasm("print-disasm", + cl::desc("print function after disassembly"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); -static cl::opt -UseGnuStack("use-gnu-stack", - cl::desc("use GNU_STACK program header for new segment"), - cl::ZeroOrMore); +cl::opt +PrintDynoStats("dyno-stats", + cl::desc("print execution info based on profile"), + cl::cat(BoltCategory)); static cl::opt -DumpEHFrame("dump-eh-frame", cl::desc("dump parsed .eh_frame (debugging)"), - cl::ZeroOrMore, - cl::Hidden); +PrintLoopInfo("print-loops", + cl::desc("print loop related information"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); cl::opt -PrintAll("print-all", cl::desc("print functions after each stage"), - cl::ZeroOrMore, - cl::Hidden); +Relocs("relocs", + cl::desc("relocation mode - use relocations to move functions in the binary"), + cl::ZeroOrMore, + cl::cat(BoltCategory)); -cl::opt -DumpDotAll("dump-dot-all", - cl::desc("dump function CFGs to graphviz format after each stage"), - cl::ZeroOrMore, - cl::Hidden); +static cl::list +SkipFunctionNames("skip-funcs", + cl::CommaSeparated, + cl::desc("list of functions to skip"), + cl::value_desc("func1,func2,func3,..."), + cl::Hidden, + cl::cat(BoltCategory)); -static cl::opt -PrintCFG("print-cfg", cl::desc("print functions after CFG construction"), - cl::ZeroOrMore, - cl::Hidden); +static cl::opt +SkipFunctionNamesFile("skip-funcs-file", + cl::desc("file with list of functions to skip"), + cl::Hidden, + cl::cat(BoltCategory)); -static cl::opt -PrintLoopInfo("print-loops", cl::desc("print loop related information"), - cl::ZeroOrMore, - cl::Hidden); +cl::opt +SplitFunctions("split-functions", + cl::desc("split functions into hot and cold regions"), + cl::init(BinaryFunction::ST_NONE), + cl::values(clEnumValN(BinaryFunction::ST_NONE, "0", + 
"do not split any function"), + clEnumValN(BinaryFunction::ST_EH, "1", + "split all landing pads"), + clEnumValN(BinaryFunction::ST_LARGE, "2", + "also split if function too large to fit"), + clEnumValN(BinaryFunction::ST_ALL, "3", + "split all functions"), + clEnumValEnd), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); -static cl::opt -PrintDisasm("print-disasm", cl::desc("print function after disassembly"), - cl::ZeroOrMore, - cl::Hidden); +static cl::opt +TopCalledLimit("top-called-limit", + cl::desc("maximum number of functions to print in top called " + "functions section"), + cl::init(100), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); + +cl::opt +TrapOldCode("trap-old-code", + cl::desc("insert traps in old function bodies (relocation mode)"), + cl::Hidden, + cl::cat(BoltCategory)); + +cl::opt +UpdateDebugSections("update-debug-sections", + cl::desc("update DWARF debug sections of the executable"), + cl::ZeroOrMore, + cl::cat(BoltCategory)); static cl::opt -KeepTmp("keep-tmp", - cl::desc("preserve intermediate .o file"), - cl::Hidden); +UseGnuStack("use-gnu-stack", + cl::desc("use GNU_STACK program header for new segment (workaround for " + "issues with strip/objcopy)"), + cl::ZeroOrMore, + cl::cat(BoltCategory)); cl::opt -AllowStripped("allow-stripped", - cl::desc("allow processing of stripped binaries"), - cl::Hidden); +UseOldText("use-old-text", + cl::desc("re-use space in old .text if possible (relocation mode)"), + cl::cat(BoltCategory)); + +// The default verbosity level (0) is pretty terse, level 1 is fairly +// verbose and usually prints some informational message for every +// function processed. Level 2 is for the noisiest of messages and +// often prints a message per basic block. +// Error messages should never be suppressed by the verbosity level. +// Only warnings and info messages should be affected. +// +// The rational behind stream usage is as follows: +// outs() for info and debugging controlled by command line flags. 
+// errs() for errors and warnings. +// dbgs() for output within DEBUG(). +cl::opt +Verbosity("v", + cl::desc("set verbosity level for diagnostic output"), + cl::init(0), + cl::ZeroOrMore, + cl::cat(BoltCategory)); // Check against lists of functions from options if we should // optimize the function with a given name. diff --git a/bolt/llvm-bolt.cpp b/bolt/llvm-bolt.cpp index e1a5ad80c693..5f75b2ff5817 100644 --- a/bolt/llvm-bolt.cpp +++ b/bolt/llvm-bolt.cpp @@ -32,15 +32,32 @@ using namespace bolt; namespace opts { -static cl::opt -InputFilename(cl::Positional, cl::desc(""), cl::Required); +cl::OptionCategory BoltCategory("BOLT generic options"); +cl::OptionCategory BoltOptCategory("BOLT optimization options"); +cl::OptionCategory BoltRelocCategory("BOLT options in relocation mode"); -static cl::opt -InputDataFilename("data", cl::desc(""), cl::Optional); +static cl::OptionCategory *BoltCategories[] = {&BoltCategory, + &BoltOptCategory, + &BoltRelocCategory}; static cl::opt -DumpData("dump-data", cl::desc("dump parsed bolt data and exit (debugging)"), - cl::Hidden); +DumpData("dump-data", + cl::desc("dump parsed bolt data and exit (debugging)"), + cl::Hidden, + cl::cat(BoltCategory)); + +static cl::opt +InputDataFilename("data", + cl::desc(""), + cl::Optional, + cl::cat(BoltCategory)); + +static cl::opt +InputFilename( + cl::Positional, + cl::desc(""), + cl::Required, + cl::cat(BoltCategory)); } // namespace opts @@ -68,6 +85,8 @@ int main(int argc, char **argv) { llvm::InitializeAllTargets(); llvm::InitializeAllAsmPrinters(); + cl::HideUnrelatedOptions(makeArrayRef(opts::BoltCategories)); + // Register the target printer for --version. 
cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion); diff --git a/bolt/merge-fdata/merge-fdata.cpp b/bolt/merge-fdata/merge-fdata.cpp index 83e3fd338fa9..8e847ca3a3ad 100644 --- a/bolt/merge-fdata/merge-fdata.cpp +++ b/bolt/merge-fdata/merge-fdata.cpp @@ -27,6 +27,8 @@ using namespace bolt; namespace opts { +cl::OptionCategory MergeFdataCategory("merge-fdata options"); + enum SortType : char { ST_NONE, ST_EXEC_COUNT, /// Sort based on function execution count. @@ -34,32 +36,35 @@ enum SortType : char { }; static cl::list -InputDataFilenames(cl::Positional, - cl::CommaSeparated, - cl::desc(" []..."), - cl::OneOrMore); +InputDataFilenames( + cl::Positional, + cl::CommaSeparated, + cl::desc(" []..."), + cl::OneOrMore, + cl::cat(MergeFdataCategory)); + +static cl::opt +PrintFunctionList("print", + cl::desc("print the list of objects with count to stderr"), + cl::init(ST_NONE), + cl::values(clEnumValN(ST_NONE, + "none", + "do not print objects/functions"), + clEnumValN(ST_EXEC_COUNT, + "exec", + "print functions sorted by execution count"), + clEnumValN(ST_TOTAL_BRANCHES, + "branches", + "print functions sorted by total branch count"), + clEnumValEnd), + cl::cat(MergeFdataCategory)); static cl::opt SuppressMergedDataOutput("q", - cl::desc("do not print merged data to stdout"), - cl::init(false), - cl::Optional); - -static cl::opt -PrintFunctionList( - "print", - cl::desc("print the list of objects with count to stderr"), - cl::init(ST_NONE), - cl::values(clEnumValN(ST_NONE, - "none", - "do not print objects/functions"), - clEnumValN(ST_EXEC_COUNT, - "exec", - "print functions sorted by execution count"), - clEnumValN(ST_TOTAL_BRANCHES, - "branches", - "print functions sorted by total branch count"), - clEnumValEnd)); + cl::desc("do not print merged data to stdout"), + cl::init(false), + cl::Optional, + cl::cat(MergeFdataCategory)); } // namespace opts @@ -78,6 +83,8 @@ int main(int argc, char **argv) { llvm_shutdown_obj Y; // Call 
llvm_shutdown() on exit. + cl::HideUnrelatedOptions(opts::MergeFdataCategory); + cl::ParseCommandLineOptions(argc, argv, "merge fdata into a single file"); From 525e84ed44961b15b9cba3d7cea6637ed0f33d6b Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 3 Apr 2017 16:24:26 -0700 Subject: [PATCH 237/904] [BOLT] Fix debug info update for inlining. Summary: When inlining, if a callee has debug info and a caller does not (i.e. a containing compilation unit was compiled without "-g"), we try to update a nonexistent compilation unit. Instead we should skip updating debug info in such cases. Minor refactoring of line number emitting code. (cherry picked from commit 40eafc197688c3b9f36d0f5d3766978949c8e293) --- bolt/BinaryFunction.cpp | 106 ++++++++++++++++++++-------------------- bolt/BinaryFunction.h | 30 +++++++++--- 2 files changed, 75 insertions(+), 61 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 39affc6455bb..e937e3a6f1cc 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -2526,7 +2526,6 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool MinBranchClusters, } void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { - auto ULT = getDWARFUnitLineTable(); int64_t CurrentGnuArgsSize = 0; for (auto BB : layout()) { if (EmitColdPart != BB->isCold()) @@ -2536,14 +2535,9 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { Streamer.EmitCodeAlignment(BB->getAlignment()); Streamer.EmitLabel(BB->getLabel()); - // Remember last .debug_line entry emitted so that we don't repeat them in - // subsequent instructions, as gdb can figure it out by looking at the - // previous instruction with available line number info. - SMLoc LastLocSeen; - // Remember if last instruction emitted was a prefix bool LastIsPrefix = false; - + SMLoc LastLocSeen; for (auto I = BB->begin(), E = BB->end(); I != E; ++I) { auto &Instr = *I; // Handle pseudo instructions. 
@@ -2558,52 +2552,8 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { Streamer.EmitCFIInstruction(*getCFIFor(Instr)); continue; } - if (opts::UpdateDebugSections) { - auto RowReference = DebugLineTableRowRef::fromSMLoc(Instr.getLoc()); - if (RowReference != DebugLineTableRowRef::NULL_ROW && - Instr.getLoc().getPointer() != LastLocSeen.getPointer()) { - auto Unit = ULT.first; - auto OriginalLineTable = ULT.second; - const auto OrigUnitID = Unit->getOffset(); - unsigned NewFilenum = 0; - - // If the CU id from the current instruction location does not - // match the CU id from the current function, it means that we - // have come across some inlined code. We must look up the CU - // for the instruction's original function and get the line table - // from that. We also update the current CU debug info with the - // filename of the inlined function. - if (RowReference.DwCompileUnitIndex != OrigUnitID) { - Unit = BC.DwCtx-> - getCompileUnitForOffset(RowReference.DwCompileUnitIndex); - OriginalLineTable = BC.DwCtx->getLineTableForUnit(Unit); - const auto Filenum = - OriginalLineTable->Rows[RowReference.RowIndex - 1].File; - NewFilenum = - BC.addDebugFilenameToUnit(OrigUnitID, - RowReference.DwCompileUnitIndex, - Filenum); - } - - assert(Unit && OriginalLineTable && - "Invalid CU offset set in instruction debug info."); - - const auto &OriginalRow = - OriginalLineTable->Rows[RowReference.RowIndex - 1]; - - BC.Ctx->setCurrentDwarfLoc( - NewFilenum == 0 ? 
OriginalRow.File : NewFilenum, - OriginalRow.Line, - OriginalRow.Column, - (DWARF2_FLAG_IS_STMT * OriginalRow.IsStmt) | - (DWARF2_FLAG_BASIC_BLOCK * OriginalRow.BasicBlock) | - (DWARF2_FLAG_PROLOGUE_END * OriginalRow.PrologueEnd) | - (DWARF2_FLAG_EPILOGUE_BEGIN * OriginalRow.EpilogueBegin), - OriginalRow.Isa, - OriginalRow.Discriminator); - BC.Ctx->setDwarfCompileUnitID(OrigUnitID); - LastLocSeen = Instr.getLoc(); - } + if (opts::UpdateDebugSections && UnitLineTable.first) { + LastLocSeen = emitLineInfo(Instr.getLoc(), LastLocSeen); } // Emit GNU_args_size CFIs as necessary. @@ -3497,6 +3447,56 @@ bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol, return true; } +SMLoc BinaryFunction::emitLineInfo(SMLoc NewLoc, SMLoc PrevLoc) const { + auto *FunctionCU = UnitLineTable.first; + const auto *FunctionLineTable = UnitLineTable.second; + assert(FunctionCU && "cannot emit line info for function without CU"); + + auto RowReference = DebugLineTableRowRef::fromSMLoc(NewLoc); + + // Check if no new line info needs to be emitted. + if (RowReference == DebugLineTableRowRef::NULL_ROW || + NewLoc.getPointer() == PrevLoc.getPointer()) + return PrevLoc; + + unsigned CurrentFilenum = 0; + const auto *CurrentLineTable = FunctionLineTable; + + // If the CU id from the current instruction location does not + // match the CU id from the current function, it means that we + // have come across some inlined code. We must look up the CU + // for the instruction's original function and get the line table + // from that. + const auto FunctionUnitIndex = FunctionCU->getOffset(); + const auto CurrentUnitIndex = RowReference.DwCompileUnitIndex; + if (CurrentUnitIndex != FunctionUnitIndex) { + CurrentLineTable = BC.DwCtx->getLineTableForUnit( + BC.DwCtx->getCompileUnitForOffset(CurrentUnitIndex)); + // Add filename from the inlined function to the current CU. 
+ CurrentFilenum = + BC.addDebugFilenameToUnit(FunctionUnitIndex, CurrentUnitIndex, + CurrentLineTable->Rows[RowReference.RowIndex - 1].File); + } + + const auto &CurrentRow = CurrentLineTable->Rows[RowReference.RowIndex - 1]; + if (!CurrentFilenum) + CurrentFilenum = CurrentRow.File; + + BC.Ctx->setCurrentDwarfLoc( + CurrentFilenum, + CurrentRow.Line, + CurrentRow.Column, + (DWARF2_FLAG_IS_STMT * CurrentRow.IsStmt) | + (DWARF2_FLAG_BASIC_BLOCK * CurrentRow.BasicBlock) | + (DWARF2_FLAG_PROLOGUE_END * CurrentRow.PrologueEnd) | + (DWARF2_FLAG_EPILOGUE_BEGIN * CurrentRow.EpilogueBegin), + CurrentRow.Isa, + CurrentRow.Discriminator); + BC.Ctx->setDwarfCompileUnitID(FunctionUnitIndex); + + return NewLoc; +} + BinaryFunction::~BinaryFunction() { for (auto BB : BasicBlocks) { delete BB; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 3aa6bc16f6ab..ac90f7f6313f 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -315,6 +315,12 @@ class BinaryFunction : public AddressRangesOwner { std::vector> SubprogramDIEs; + /// Line table for the function with containing compilation unit. + /// Because of identical code folding the function could have multiple + /// associated compilation units. The first of them with line number info + /// is referenced by UnitLineTable. + DWARFUnitLineTable UnitLineTable{nullptr, nullptr}; + /// Offset of this function's address ranges in the .debug_ranges section of /// the output binary. uint32_t AddressRangesOffset{-1U}; @@ -753,6 +759,12 @@ class BinaryFunction : public AddressRangesOwner { unsigned Size, uint64_t Offset); + /// Emit line number information corresponding to \p NewLoc. \p PrevLoc + /// provides a context for de-duplication of line number info. + /// + /// Return new current location which is either \p NewLoc or \p PrevLoc. 
+ SMLoc emitLineInfo(SMLoc NewLoc, SMLoc PrevLoc) const; + BinaryFunction& operator=(const BinaryFunction &) = delete; BinaryFunction(const BinaryFunction &) = delete; @@ -1668,22 +1680,24 @@ class BinaryFunction : public AddressRangesOwner { /// Sets the associated .debug_info entry. void addSubprogramDIE(DWARFCompileUnit *Unit, - const DWARFDebugInfoEntryMinimal *DIE) { + const DWARFDebugInfoEntryMinimal *DIE) { SubprogramDIEs.emplace_back(DIE, Unit); + if (!UnitLineTable.first) { + if (const auto *LineTable = BC.DwCtx->getLineTableForUnit(Unit)) { + UnitLineTable = std::make_pair(Unit, LineTable); + } + } } + /// Return all compilation units with entry for this function. + /// Because of identical code folding there could be multiple of these. const decltype(SubprogramDIEs) &getSubprogramDIEs() const { return SubprogramDIEs; } - /// Return DWARF compile unit with line info. + /// Return DWARF compile unit with line info for this function. DWARFUnitLineTable getDWARFUnitLineTable() const { - for (auto &DIEUnitPair : SubprogramDIEs) { - if (auto *LT = BC.DwCtx->getLineTableForUnit(DIEUnitPair.second)) { - return std::make_pair(DIEUnitPair.second, LT); - } - } - return std::make_pair(nullptr, nullptr); + return UnitLineTable; } /// Returns the size of the basic block in the original binary. From dc2e54202aab8fe850abf0f37114d39f08460c57 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 31 Mar 2017 07:51:30 -0700 Subject: [PATCH 238/904] [BOLT] Detect and reject binaries built for coverage. Summary: Don't attempt to optimize binaries built with coverage support. (cherry picked from commit c76fcefa8e1b8f9156dbe9e7e166bd6c69c76aad) --- bolt/RewriteInstance.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index efcded2443a0..8e428709247f 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -837,6 +837,11 @@ void RewriteInstance::discoverFileObjects() { "support. 
Cannot optimize.\n"; exit(1); } + if (NameOrError && NameOrError->startswith("__llvm_coverage_mapping")) { + errs() << "BOLT-ERROR: input file was compiled or linked with coverage " + "support. Cannot optimize.\n"; + exit(1); + } if (Symbol.getFlags() & SymbolRef::SF_Undefined) continue; From d72ba2ffa62bbd5dbb7ef0296e3b13c865702d16 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 20 Mar 2017 22:44:25 -0700 Subject: [PATCH 239/904] [BOLT] Fix double jump peephole, remove useless conditional branches. Summary: I split some of this out from the jumptable diff since it fixes the double jump peephole. I've changed the pass manager so that UCE and peepholes are not called after SCTC. I've incorporated a call to the double jump fixer to SCTC since it is needed to fix things up afterwards. While working on fixing the double jump peephole I discovered a few useless conditional branches that could be removed as well. I highly doubt that removing them will improve perf at all but it does seem odd to leave in useless conditional branches. There are also some minor logging improvements. 
(cherry picked from commit c73a1121698c81c306e8c0a631b67c0a08459f35) --- bolt/BinaryBasicBlock.cpp | 30 +++- bolt/BinaryBasicBlock.h | 11 ++ bolt/BinaryFunction.cpp | 19 ++- bolt/BinaryFunction.h | 15 +- bolt/BinaryPassManager.cpp | 51 +++++-- bolt/Passes/BinaryPasses.cpp | 273 +++++++++++++++++++++-------------- bolt/Passes/BinaryPasses.h | 15 +- 7 files changed, 281 insertions(+), 133 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index d845ba4c7f2a..c3c9e0a4f193 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -55,6 +55,8 @@ BinaryBasicBlock::reverse_iterator BinaryBasicBlock::getLastNonPseudo() { } bool BinaryBasicBlock::validateSuccessorInvariants() { + auto *Func = getFunction(); + auto &BC = Func->getBinaryContext(); const MCSymbol *TBB = nullptr; const MCSymbol *FBB = nullptr; MCInst *CondBranch = nullptr; @@ -67,7 +69,9 @@ bool BinaryBasicBlock::validateSuccessorInvariants() { case 0: return !CondBranch && !UncondBranch; case 1: - return !CondBranch; + return !CondBranch || + (CondBranch && + !Func->getBasicBlockForLabel(BC.MIA->getTargetSymbol(*CondBranch))); case 2: return (!CondBranch || @@ -185,6 +189,7 @@ void BinaryBasicBlock::replaceSuccessor(BinaryBasicBlock *Succ, BinaryBasicBlock *NewSucc, uint64_t Count, uint64_t MispredictedCount) { + Succ->removePredecessor(this); auto I = succ_begin(); auto BI = BranchInfo.begin(); for (; I != succ_end(); ++I) { @@ -197,6 +202,7 @@ void BinaryBasicBlock::replaceSuccessor(BinaryBasicBlock *Succ, *I = NewSucc; *BI = BinaryBranchInfo{Count, MispredictedCount}; + NewSucc->addPredecessor(this); } void BinaryBasicBlock::removeSuccessor(BinaryBasicBlock *Succ) { @@ -225,6 +231,28 @@ void BinaryBasicBlock::removePredecessor(BinaryBasicBlock *Pred) { Predecessors.erase(I); } +void BinaryBasicBlock::removeDuplicateConditionalSuccessor(MCInst *CondBranch) { + assert(succ_size() == 2); + + auto *Succ = Successors[0]; + assert(Succ == Successors[1]); + + const auto 
CondBI = BranchInfo[0]; + const auto UncondBI = BranchInfo[1]; + + eraseInstruction(CondBranch); + + Successors.clear(); + BranchInfo.clear(); + + Successors.push_back(Succ); + BranchInfo.push_back({CondBI.Count + UncondBI.Count, + CondBI.MispredictedCount + UncondBI.MispredictedCount}); + + assert(isSuccessor(Succ)); + assert(Succ->isPredecessor(this)); +} + void BinaryBasicBlock::addLandingPad(BinaryBasicBlock *LPBlock) { if (std::find(LandingPads.begin(), LandingPads.end(), LPBlock) == LandingPads.end()) { LandingPads.push_back(LPBlock); diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 5caf1e8f1ed5..8e6a75ba1424 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -522,6 +522,17 @@ class BinaryBasicBlock { } } + /// Remove useless duplicate successors. When the conditional + /// successor is the same as the unconditional successor, we can + /// remove the conditional successor and branch instruction. + void removeDuplicateConditionalSuccessor(MCInst *CondBranch); + + /// Test if BB is a predecessor of this block. + bool isPredecessor(const BinaryBasicBlock *BB) const { + auto Itr = std::find(Predecessors.begin(), Predecessors.end(), BB); + return Itr != Predecessors.end(); + } + /// Test if BB is a successor of this block. bool isSuccessor(const BinaryBasicBlock *BB) const { auto Itr = std::find(Successors.begin(), Successors.end(), BB); diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index e937e3a6f1cc..73a66e0c3dd9 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -322,10 +322,22 @@ std::pair BinaryFunction::eraseInvalidBBs() { } bool BinaryFunction::isForwardCall(const MCSymbol *CalleeSymbol) const { - // TODO: Once we start reordering functions this has to change. #15031238 + // This function should work properly before and after function reordering. + // In order to accomplish this, we use the function index (if it is valid). 
+ // If the function indices are not valid, we fall back to the original + // addresses. This should be ok because the functions without valid indices + // should have been ordered with a stable sort. const auto *CalleeBF = BC.getFunctionForSymbol(CalleeSymbol); if (CalleeBF) { - return CalleeBF->getAddress() > getAddress(); + if (hasValidIndex() && CalleeBF->hasValidIndex()) { + return getIndex() < CalleeBF->getIndex(); + } else if (hasValidIndex() && !CalleeBF->hasValidIndex()) { + return true; + } else if (!hasValidIndex() && CalleeBF->hasValidIndex()) { + return false; + } else { + return getAddress() < CalleeBF->getAddress(); + } } else { // Absolute symbol. auto const CalleeSI = BC.GlobalSymbols.find(CalleeSymbol->getName()); @@ -2888,6 +2900,9 @@ void BinaryFunction::fixBranches() { } else { MIA->replaceBranchTarget(*CondBranch, TSuccessor->getLabel(), Ctx); } + if (TSuccessor == FSuccessor) { + BB->removeDuplicateConditionalSuccessor(CondBranch); + } if (!NextBB || (NextBB != TSuccessor && NextBB != FSuccessor)) { BB->addBranchInstruction(FSuccessor); } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index ac90f7f6313f..2f903b760de4 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -337,11 +337,6 @@ class BinaryFunction : public AddressRangesOwner { return BB->getIndex(); } - BinaryBasicBlock *getBasicBlockForLabel(const MCSymbol *Label) const { - auto I = LabelToBB.find(Label); - return I == LabelToBB.end() ? nullptr : I->second; - } - /// Return basic block that originally contained offset \p Offset /// from the function start. BinaryBasicBlock *getBasicBlockContainingOffset(uint64_t Offset); @@ -913,6 +908,16 @@ class BinaryFunction : public AddressRangesOwner { /// fixBranches(). DynoStats getDynoStats() const; + BinaryBasicBlock *getBasicBlockForLabel(const MCSymbol *Label) { + auto I = LabelToBB.find(Label); + return I == LabelToBB.end() ? 
nullptr : I->second; + } + + const BinaryBasicBlock *getBasicBlockForLabel(const MCSymbol *Label) const { + auto I = LabelToBB.find(Label); + return I == LabelToBB.end() ? nullptr : I->second; + } + /// Returns the basic block after the given basic block in the layout or /// nullptr the last basic block is given. const BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *BB) const { diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index b3f1d306ee4a..ae2029be0984 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -13,6 +13,7 @@ #include "Passes/FrameOptimizer.h" #include "Passes/Inliner.h" #include "llvm/Support/Timer.h" +#include "llvm/Support/raw_ostream.h" #include using namespace llvm; @@ -21,10 +22,17 @@ namespace opts { extern cl::OptionCategory BoltOptCategory; +extern cl::opt Verbosity; extern cl::opt PrintAll; extern cl::opt DumpDotAll; extern cl::opt DynoStatsAll; +static cl::opt +ICF("icf", + cl::desc("fold functions with identical code"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + static cl::opt EliminateUnreachable("eliminate-unreachable", cl::desc("eliminate unreachable code"), @@ -223,6 +231,10 @@ void BinaryFunctionPassManager::runPasses() { auto &Pass = OptPassPair.second; + if (opts::Verbosity > 0) { + outs() << "BOLT-INFO: Starting pass: " << Pass->getName() << "\n"; + } + NamedRegionTimer T(Pass->getName(), TimerGroupName, TimeOpts); callWithDynoStats( @@ -247,6 +259,10 @@ void BinaryFunctionPassManager::runPasses() { exit(1); } + if (opts::Verbosity > 0) { + outs() << "BOLT-INFO: Finished pass: " << Pass->getName() << "\n"; + } + if (!opts::PrintAll && !opts::DumpDotAll && !Pass->printPass()) continue; @@ -282,7 +298,8 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(llvm::make_unique(NeverPrint), opts::StripRepRet); - Manager.registerPass(llvm::make_unique(PrintICF)); + Manager.registerPass(llvm::make_unique(PrintICF), + opts::ICF); 
Manager.registerPass(llvm::make_unique(PrintICP), opts::IndirectCallPromotion); @@ -301,7 +318,8 @@ void BinaryFunctionPassManager::runAllPasses( llvm::make_unique(PrintSimplifyROLoads), opts::SimplifyRODataLoads); - Manager.registerPass(llvm::make_unique(PrintICF)); + Manager.registerPass(llvm::make_unique(PrintICF), + opts::ICF); Manager.registerPass(llvm::make_unique(PrintReordered)); @@ -320,27 +338,30 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(llvm::make_unique(PrintFOP), OptimizeFrameAccesses); + // This pass should come close to last since it uses the estimated hot + // size of a function to determine the order. It should definitely + // also happen after any changes to the call graph are made, e.g. inlining. + Manager.registerPass( + llvm::make_unique(PrintReorderedFunctions)); + // This pass introduces conditional jumps into external functions. // Between extending CFG to support this and isolating this pass we chose - // the latter. Thus this pass will do unreachable code elimination - // if necessary and wouldn't rely on UCE for this. - // More generally this pass should be the last optimization pass. + // the latter. Thus this pass will do double jump removal and unreachable + // code elimination if necessary and won't rely on peepholes/UCE for these + // optimizations. + // More generally this pass should be the last optimization pass that + // modifies branches/control flow. This pass is run after function + // reordering so that it can tell whether calls are forward/backward + // accurately. 
Manager.registerPass( llvm::make_unique(PrintSCTC), opts::SimplifyConditionalTailCalls); - Manager.registerPass(llvm::make_unique(PrintPeepholes), - opts::Peepholes); - - Manager.registerPass( - llvm::make_unique(PrintUCE), - opts::EliminateUnreachable); - - Manager.registerPass( - llvm::make_unique(PrintReorderedFunctions)); - + // This pass should always run last.* Manager.registerPass(llvm::make_unique(PrintFinalized)); + // *except for this pass. TODO: figure out why moving this before function + // reordering breaks things badly. Manager.registerPass( llvm::make_unique(PrintAfterLowering)); diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 17bd0b475986..76028dd037b0 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -77,12 +77,6 @@ FunctionOrderFile("function-order", "reordering"), cl::cat(BoltOptCategory)); -static cl::opt -ICF("icf", - cl::desc("fold functions with identical code"), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - static cl::opt ICFUseDFS("icf-dfs", cl::desc("use DFS ordering when using -icf option"), @@ -371,7 +365,7 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) { std::tie(Count, Bytes) = Function.eraseInvalidBBs(); DeletedBlocks += Count; DeletedBytes += Bytes; - if (Count) { + if (Count && opts::Verbosity > 0) { Modified.insert(&Function); outs() << "BOLT-INFO: Removed " << Count << " dead basic block(s) accounting for " << Bytes @@ -404,21 +398,22 @@ void ReorderBasicBlocks::runOnFunctions( BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) { + if (opts::ReorderBlocks == BinaryFunction::LT_NONE) + return; + for (auto &It : BFs) { auto &Function = It.second; if (!shouldOptimize(Function)) continue; - if (opts::ReorderBlocks != BinaryFunction::LT_NONE) { - bool ShouldSplit = - (opts::SplitFunctions == BinaryFunction::ST_ALL) || - (opts::SplitFunctions == BinaryFunction::ST_EH && - Function.hasEHRanges()) || - (LargeFunctions.find(It.first) != 
LargeFunctions.end()); - Function.modifyLayout(opts::ReorderBlocks, opts::MinBranchClusters, - ShouldSplit); - } + const bool ShouldSplit = + (opts::SplitFunctions == BinaryFunction::ST_ALL) || + (opts::SplitFunctions == BinaryFunction::ST_EH && + Function.hasEHRanges()) || + (LargeFunctions.find(It.first) != LargeFunctions.end()); + Function.modifyLayout(opts::ReorderBlocks, opts::MinBranchClusters, + ShouldSplit); } } @@ -441,13 +436,14 @@ void FinalizeFunctions::runOnFunctions( ) { for (auto &It : BFs) { auto &Function = It.second; + const auto ShouldOptimize = shouldOptimize(Function); // Always fix functions in relocation mode. - if (!opts::Relocs && !shouldOptimize(Function)) + if (!opts::Relocs && !ShouldOptimize) continue; // Fix the CFI state. - if (shouldOptimize(Function) && !Function.fixCFIState()) { + if (ShouldOptimize && !Function.fixCFIState()) { if (opts::Relocs) { errs() << "BOLT-ERROR: unable to fix CFI state for function " << Function << ". Exiting.\n"; @@ -464,6 +460,111 @@ void FinalizeFunctions::runOnFunctions( } } +namespace { + +// This peephole fixes jump instructions that jump to another basic +// block with a single jump instruction, e.g. +// +// B0: ... +// jmp B1 (or jcc B1) +// +// B1: jmp B2 +// +// -> +// +// B0: ... +// jmp B2 (or jcc B2) +// +uint64_t fixDoubleJumps(BinaryContext &BC, BinaryFunction &Function) { + uint64_t NumDoubleJumps = 0; + + for (auto &BB : Function) { + auto checkAndPatch = [&](BinaryBasicBlock *Pred, + BinaryBasicBlock *Succ, + const MCSymbol *SuccSym) { + // Ignore infinite loop jumps or fallthrough tail jumps. 
+ if (Pred == Succ || Succ == &BB) + return; + + if (Succ) { + const MCSymbol *TBB = nullptr; + const MCSymbol *FBB = nullptr; + MCInst *CondBranch = nullptr; + MCInst *UncondBranch = nullptr; + auto Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); + if(!Res) { + DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n"; + Pred->dump()); + return; + } + Pred->replaceSuccessor(&BB, Succ); + + // We must patch up any existing branch instructions to match up + // with the new successor. + auto *Ctx = BC.Ctx.get(); + if (CondBranch && + BC.MIA->getTargetSymbol(*CondBranch) == BB.getLabel()) { + BC.MIA->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx); + } else if (UncondBranch && + BC.MIA->getTargetSymbol(*UncondBranch) == BB.getLabel()) { + BC.MIA->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx); + } + } else { + // Succ will be null in the tail call case. In this case we + // need to explicitly add a tail call instruction. + auto *Branch = Pred->getLastNonPseudoInstr(); + if (Branch && BC.MIA->isUnconditionalBranch(*Branch)) { + assert(BC.MIA->getTargetSymbol(*Branch) == BB.getLabel()); + Pred->removeSuccessor(&BB); + Pred->eraseInstruction(Branch); + Pred->addTailCallInstruction(SuccSym); + } else { + return; + } + } + + ++NumDoubleJumps; + DEBUG(dbgs() << "Removed double jump in " << Function << " from " + << Pred->getName() << " -> " << BB.getName() << " to " + << Pred->getName() << " -> " << SuccSym->getName() + << (!Succ ? 
" (tail)\n" : "\n")); + }; + + if (BB.getNumNonPseudos() != 1 || BB.isLandingPad()) + continue; + + auto *Inst = BB.getFirstNonPseudoInstr(); + const bool IsTailCall = BC.MIA->isTailCall(*Inst); + + if (!BC.MIA->isUnconditionalBranch(*Inst) && !IsTailCall) + continue; + + const auto *SuccSym = BC.MIA->getTargetSymbol(*Inst); + auto *Succ = BB.getSuccessor(); + + if ((!Succ || &BB == Succ) && !IsTailCall) + continue; + + std::vector Preds{BB.pred_begin(), BB.pred_end()}; + + for (auto *Pred : Preds) { + if (Pred->isLandingPad()) + continue; + + if (Pred->getSuccessor() == &BB || + (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) || + Pred->getConditionalSuccessor(false) == &BB) { + checkAndPatch(Pred, Succ, SuccSym); + assert(Function.validateCFG()); + } + } + } + + return NumDoubleJumps; +} + +} + bool SimplifyConditionalTailCalls::shouldRewriteBranch(const BinaryBasicBlock *PredBB, const MCInst &CondBranch, @@ -597,8 +698,11 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, } if (NumLocalCTCs > 0) { + NumDoubleJumps += fixDoubleJumps(BC, BF); // Clean-up unreachable tail-call blocks. 
- BF.eraseInvalidBBs(); + const auto Stats = BF.eraseInvalidBBs(); + DeletedBlocks += Stats.first; + DeletedBytes += Stats.second; } DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs @@ -631,7 +735,10 @@ void SimplifyConditionalTailCalls::runOnFunctions( outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched << " tail calls (" << NumOrigForwardBranches << " forward)" << " tail calls (" << NumOrigBackwardBranches << " backward)" - << " from a total of " << NumCandidateTailCalls << "\n"; + << " from a total of " << NumCandidateTailCalls + << " while removing " << NumDoubleJumps << " double jumps" + << " and removing " << DeletedBlocks << " basic blocks" + << " totalling " << DeletedBytes << " bytes of code.\n"; } void Peepholes::shortenInstructions(BinaryContext &BC, @@ -643,94 +750,46 @@ void Peepholes::shortenInstructions(BinaryContext &BC, } } -void debugDump(BinaryFunction *BF) { - BF->dump(); -} - -// This peephole fixes jump instructions that jump to another basic -// block with a single jump instruction, e.g. -// -// B0: ... -// jmp B1 (or jcc B1) -// -// B1: jmp B2 -// -// -> -// -// B0: ... -// jmp B2 (or jcc B2) -// -void Peepholes::fixDoubleJumps(BinaryContext &BC, - BinaryFunction &Function) { +void Peepholes::addTailcallTraps(BinaryContext &BC, + BinaryFunction &Function) { for (auto &BB : Function) { - auto checkAndPatch = [&](BinaryBasicBlock *Pred, - BinaryBasicBlock *Succ, - const MCSymbol *SuccSym) { - // Ignore infinite loop jumps or fallthrough tail jumps. - if (Pred == Succ || Succ == &BB) - return; - - if (Succ) { - Pred->replaceSuccessor(&BB, Succ); - } else { - // Succ will be null in the tail call case. In this case we - // need to explicitly add a tail call instruction. 
- auto *Branch = Pred->getLastNonPseudoInstr(); - if (Branch && BC.MIA->isUnconditionalBranch(*Branch)) { - Pred->removeSuccessor(&BB); - Pred->eraseInstruction(Branch); - Pred->addTailCallInstruction(SuccSym); - } else { - return; - } + auto *Inst = BB.getLastNonPseudoInstr(); + if (Inst && BC.MIA->isTailCall(*Inst) && BC.MIA->isIndirectBranch(*Inst)) { + MCInst Trap; + if (BC.MIA->createTrap(Trap)) { + BB.addInstruction(Trap); + ++TailCallTraps; } + } + } +} - ++NumDoubleJumps; - DEBUG(dbgs() << "Removed double jump in " << Function << " from " - << Pred->getName() << " -> " << BB.getName() << " to " - << Pred->getName() << " -> " << SuccSym->getName() - << (!Succ ? " (tail)\n" : "\n")); - }; - - if (BB.getNumNonPseudos() != 1 || BB.isLandingPad()) +void Peepholes::removeUselessCondBranches(BinaryContext &BC, + BinaryFunction &Function) { + for (auto &BB : Function) { + if (BB.succ_size() != 2) continue; - auto *Inst = BB.getFirstNonPseudoInstr(); - const bool IsTailCall = BC.MIA->isTailCall(*Inst); - - if (!BC.MIA->isUnconditionalBranch(*Inst) && !IsTailCall) - continue; + auto *CondBB = BB.getConditionalSuccessor(true); + auto *UncondBB = BB.getConditionalSuccessor(false); - const auto *SuccSym = BC.MIA->getTargetSymbol(*Inst); - auto *Succ = BB.getSuccessor(); - - if ((!Succ || &BB == Succ) && !IsTailCall) - continue; - - std::vector Preds{BB.pred_begin(), BB.pred_end()}; + if (CondBB == UncondBB) { + const MCSymbol *TBB = nullptr; + const MCSymbol *FBB = nullptr; + MCInst *CondBranch = nullptr; + MCInst *UncondBranch = nullptr; + auto Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch); - for (auto *Pred : Preds) { - if (Pred->isLandingPad()) + // analyzeBranch can fail due to unusual branch instructions, e.g. 
jrcxz + if (!Result) { + DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n"; + BB.dump()); continue; - - if (Pred->getSuccessor() == &BB || - (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) || - Pred->getConditionalSuccessor(false) == &BB) { - checkAndPatch(Pred, Succ, SuccSym); } - } - } -} -void Peepholes::addTailcallTraps(BinaryContext &BC, - BinaryFunction &Function) { - for (auto &BB : Function) { - auto *Inst = BB.getLastNonPseudoInstr(); - if (Inst && BC.MIA->isTailCall(*Inst) && BC.MIA->isIndirectBranch(*Inst)) { - MCInst Trap; - if (BC.MIA->createTrap(Trap)) { - BB.addInstruction(Trap); - ++TailCallTraps; + if (CondBranch) { + BB.removeDuplicateConditionalSuccessor(CondBranch); + ++NumUselessCondBranches; } } } @@ -743,12 +802,17 @@ void Peepholes::runOnFunctions(BinaryContext &BC, auto &Function = It.second; if (shouldOptimize(Function)) { shortenInstructions(BC, Function); - fixDoubleJumps(BC, Function); + NumDoubleJumps += fixDoubleJumps(BC, Function); addTailcallTraps(BC, Function); + removeUselessCondBranches(BC, Function); } } - outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps << " double jumps patched.\n"; - outs() << "BOLT-INFO: Peephole: " << TailCallTraps << " tail call traps inserted.\n"; + outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps + << " double jumps patched.\n" + << "BOLT-INFO: Peephole: " << TailCallTraps + << " tail call traps inserted.\n" + << "BOLT-INFO: Peephole: " << NumUselessCondBranches + << " useless conditional branches removed.\n"; } bool SimplifyRODataLoads::simplifyRODataLoads( @@ -854,9 +918,6 @@ void SimplifyRODataLoads::runOnFunctions( void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &) { - if (!opts::ICF) - return; - const auto OriginalFunctionCount = BFs.size(); uint64_t NumFunctionsFolded = 0; uint64_t NumJTFunctionsFolded = 0; @@ -1820,7 +1881,9 @@ void ReorderFunctions::reorder(std::vector &&Clusters, } } - if (opts::Verbosity > 0 || (DebugFlag && 
isCurrentDebugType("hfsort"))) { + if (opts::ReorderFunctions != BinaryFunction::RT_NONE && + (opts::Verbosity > 0 || + (DebugFlag && isCurrentDebugType("hfsort")))) { uint64_t TotalSize = 0; uint64_t CurPage = 0; uint64_t Hotfuncs = 0; diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index 37866f9e36b1..fe420594828e 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -198,6 +198,9 @@ class SimplifyConditionalTailCalls : public BinaryFunctionPass { uint64_t NumTailCallsPatched{0}; uint64_t NumOrigForwardBranches{0}; uint64_t NumOrigBackwardBranches{0}; + uint64_t NumDoubleJumps{0}; + uint64_t DeletedBlocks{0}; + uint64_t DeletedBytes{0}; std::unordered_set Modified; bool shouldRewriteBranch(const BinaryBasicBlock *PredBB, @@ -225,20 +228,22 @@ class SimplifyConditionalTailCalls : public BinaryFunctionPass { class Peepholes : public BinaryFunctionPass { uint64_t NumDoubleJumps{0}; uint64_t TailCallTraps{0}; + uint64_t NumUselessCondBranches{0}; /// Attempt to use the minimum operand width for arithmetic, branch and /// move instructions. void shortenInstructions(BinaryContext &BC, BinaryFunction &Function); - /// Replace double jumps with a jump directly to the target, i.e. - /// jmp/jcc L1; L1: jmp L2 -> jmp/jcc L2. - void fixDoubleJumps(BinaryContext &BC, BinaryFunction &Function); - /// Add trap instructions immediately after indirect tail calls to prevent /// the processor from decoding instructions immediate following the /// tailcall. void addTailcallTraps(BinaryContext &BC, BinaryFunction &Function); - public: + + /// Remove useless duplicate successors. When the conditional + /// successor is the same as the unconditional successor, we can + /// remove the conditional successor and branch instruction. 
+ void removeUselessCondBranches(BinaryContext &BC, BinaryFunction &Function); +public: explicit Peepholes(const cl::opt &PrintPass) : BinaryFunctionPass(PrintPass) { } From 51ec43746af5429f8103af39255cc945436f5c37 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 5 Apr 2017 13:00:20 -0700 Subject: [PATCH 240/904] [BOLT] Fix branch count in removeDuplicateConditionalSuccessor(). Summary: When we merge the original branch counts we have to make sure both of them have a profile. Otherwise set the count to COUNT_NO_PROFILE. The misprediction count should be 0. (cherry picked from commit 41fc7a52b0fa7d7e712372656138c97da448bd94) --- bolt/BinaryBasicBlock.cpp | 13 ++++++------- bolt/Passes/BinaryPasses.cpp | 32 ++++++++++++++------------------ 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index c3c9e0a4f193..359000fa6b07 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -232,11 +232,10 @@ void BinaryBasicBlock::removePredecessor(BinaryBasicBlock *Pred) { } void BinaryBasicBlock::removeDuplicateConditionalSuccessor(MCInst *CondBranch) { - assert(succ_size() == 2); + assert(succ_size() == 2 && Successors[0] == Successors[1] && + "conditional successors expected"); auto *Succ = Successors[0]; - assert(Succ == Successors[1]); - const auto CondBI = BranchInfo[0]; const auto UncondBI = BranchInfo[1]; @@ -246,11 +245,11 @@ void BinaryBasicBlock::removeDuplicateConditionalSuccessor(MCInst *CondBranch) { BranchInfo.clear(); Successors.push_back(Succ); - BranchInfo.push_back({CondBI.Count + UncondBI.Count, - CondBI.MispredictedCount + UncondBI.MispredictedCount}); - assert(isSuccessor(Succ)); - assert(Succ->isPredecessor(this)); + uint64_t Count = COUNT_NO_PROFILE; + if (CondBI.Count != COUNT_NO_PROFILE && UncondBI.Count != COUNT_NO_PROFILE) + Count = CondBI.Count + UncondBI.Count; + BranchInfo.push_back({Count, 0}); } void BinaryBasicBlock::addLandingPad(BinaryBasicBlock 
*LPBlock) { diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 76028dd037b0..515af23077e7 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -641,7 +641,7 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, MCInst *UncondBranch = nullptr; auto Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); - // analyzeBranch can fail due to unusual branch instructions, e.g. jrcxz + // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz if (!Result) { DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n"; PredBB->dump()); @@ -772,26 +772,22 @@ void Peepholes::removeUselessCondBranches(BinaryContext &BC, auto *CondBB = BB.getConditionalSuccessor(true); auto *UncondBB = BB.getConditionalSuccessor(false); + if (CondBB != UncondBB) + continue; - if (CondBB == UncondBB) { - const MCSymbol *TBB = nullptr; - const MCSymbol *FBB = nullptr; - MCInst *CondBranch = nullptr; - MCInst *UncondBranch = nullptr; - auto Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch); + const MCSymbol *TBB = nullptr; + const MCSymbol *FBB = nullptr; + MCInst *CondBranch = nullptr; + MCInst *UncondBranch = nullptr; + auto Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch); - // analyzeBranch can fail due to unusual branch instructions, e.g. jrcxz - if (!Result) { - DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n"; - BB.dump()); - continue; - } + // analyzeBranch() can fail due to unusual branch instructions, + // e.g. jrcxz, or jump tables (indirect jump). 
+ if (!Result || !CondBranch) + continue; - if (CondBranch) { - BB.removeDuplicateConditionalSuccessor(CondBranch); - ++NumUselessCondBranches; - } - } + BB.removeDuplicateConditionalSuccessor(CondBranch); + ++NumUselessCondBranches; } } From e41d141ed3621bfcaecbc45c53b7ad95e8cc2fdb Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 5 Apr 2017 09:29:24 -0700 Subject: [PATCH 241/904] [BOLT] Relocation support for non-allocatable sections. Summary: Relocations can be created for non-allocatable (aka Note) sections. To start using this for debug info, the emission has to be moved earlier in the pipeline for relocation processing to kick in. (cherry picked from commit c562df02b105e02a81da2e5305bec7032dbc3076) --- bolt/DWARFRewriter.cpp | 5 ++-- bolt/RewriteInstance.cpp | 55 +++++++++++++++++++--------------------- bolt/RewriteInstance.h | 2 +- 3 files changed, 29 insertions(+), 33 deletions(-) diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp index 351460a23161..128408052021 100644 --- a/bolt/DWARFRewriter.cpp +++ b/bolt/DWARFRewriter.cpp @@ -396,9 +396,8 @@ void RewriteInstance::updateFunctionRanges() { } void RewriteInstance::generateDebugRanges() { - using RangeType = enum { RANGES, ARANGES }; - for (int IntRT = RANGES; IntRT <= ARANGES; ++IntRT) { - RangeType RT = static_cast(IntRT); + enum { RANGES, ARANGES }; + for (auto RT = RANGES + 0; RT <= ARANGES; ++RT) { const char *SectionName = (RT == RANGES) ? 
".debug_ranges" : ".debug_aranges"; SmallVector RangesBuffer; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 8e428709247f..587774488054 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -455,27 +455,21 @@ uint8_t *ExecutableFileMemoryManager::recordNoteSection( << " with size " << Size << ", alignment " << Alignment << " at 0x" << Twine::utohexstr(reinterpret_cast(Data)) << '\n'); - if (SectionName == ".debug_line") { // We need to make a copy of the section contents if we'll need it for - // a future reference. - uint8_t *DataCopy = new uint8_t[Size]; - memcpy(DataCopy, Data, Size); - NoteSectionInfo[SectionName] = - SectionInfo(reinterpret_cast(DataCopy), - Size, - Alignment, - /*IsCode=*/false, - /*IsReadOnly=*/true, - /*IsLocal=*/false, - 0, - 0, - SectionID); - return DataCopy; - } else { - DEBUG(dbgs() << "BOLT-DEBUG: ignoring section " << SectionName - << " in recordNoteSection()\n"); - return nullptr; - } + // a future reference. RuntimeDyld will not allocate the space forus. + uint8_t *DataCopy = new uint8_t[Size]; + memcpy(DataCopy, Data, Size); + NoteSectionInfo[SectionName] = + SectionInfo(reinterpret_cast(DataCopy), + Size, + Alignment, + /*IsCode=*/false, + /*IsReadOnly=*/true, + /*IsLocal=*/false, + 0, + 0, + SectionID); + return DataCopy; } bool ExecutableFileMemoryManager::finalizeMemory(std::string *ErrMsg) { @@ -2105,14 +2099,11 @@ void RewriteInstance::emitFunctions() { Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("__hot_end")); } - if (opts::Relocs) { - emitDataSections(Streamer.get()); - } - - if (opts::UpdateDebugSections) updateDebugLineInfoForNonSimpleFunctions(); + emitDataSections(Streamer.get()); + // Relocate .eh_frame to .eh_frame_old. 
if (EHFrameSection.getObject() != nullptr) { relocateEHFrameSection(); @@ -2401,16 +2392,22 @@ void RewriteInstance::emitDataSection(MCStreamer *Streamer, SectionRef Section, SectionName = Name; else Section.getName(SectionName); + + const auto SectionFlags = ELFSectionRef(Section).getFlags(); + const auto SectionType = ELFSectionRef(Section).getType(); auto *ELFSection = BC->Ctx->getELFSection(SectionName, - ELF::SHT_PROGBITS, - ELF::SHF_WRITE | ELF::SHF_ALLOC); + SectionType, + SectionFlags); + StringRef SectionContents; Section.getContents(SectionContents); Streamer->SwitchSection(ELFSection); Streamer->EmitValueToAlignment(Section.getAlignment()); - DEBUG(dbgs() << "BOLT-DEBUG: emitting section " << SectionName << '\n'); + DEBUG(dbgs() << "BOLT-DEBUG: emitting " + << (SectionFlags & ELF::SHF_ALLOC ? "" : "non-") + << "allocatable data section " << SectionName << '\n'); auto SRI = BC->SectionRelocations.find(Section); if (SRI == BC->SectionRelocations.end()) { @@ -2634,7 +2631,7 @@ void RewriteInstance::rewriteNoteSections() { // Write section extension. Address = SI.AllocAddress; if (Address) { - DEBUG(dbgs() << "BOLT: " << (Size ? "appending" : "writing") + DEBUG(dbgs() << "BOLT-DEBUG: " << (Size ? "appending" : "writing") << " contents to section " << *SectionName << '\n'); OS.write(reinterpret_cast(Address), SI.Size); diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index d1d799f3d8f6..233d09dded87 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -384,7 +384,7 @@ class RewriteInstance { bool IsSimple); private: - /// If we are updating debug info, these are the section we need to overwrite. + /// When updating debug info, these are the sections we overwrite. static constexpr const char *DebugSectionsToOverwrite[] = { ".debug_aranges", ".debug_line", From 783b20f4fdf754bc4268e128132164cee30efd49 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 5 Apr 2017 13:23:58 -0700 Subject: [PATCH 242/904] [BOLT] Enable SCTC by default. 
(cherry picked from commit daa4801f3a1b256d10ca4d9daaa10305d58233c2) --- bolt/BinaryFunction.cpp | 7 ++++++- bolt/BinaryPassManager.cpp | 1 + bolt/Passes/BinaryPasses.cpp | 26 +++++++++++++++++--------- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 73a66e0c3dd9..9a2dd8180e79 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -3889,7 +3889,12 @@ DynoStats BinaryFunction::getDynoStats() const { NonTakenCount = BBExecutionCount - TakenCount; else NonTakenCount = 0; - IsForwardBranch = isForwardBranch(BB, BB->getFallthrough()); + + // If succ_size == 0 then we are branching to a function + // rather than a BB label. + IsForwardBranch = BB->succ_size() == 0 + ? isForwardCall(BC.MIA->getTargetSymbol(*CondBranch)) + : isForwardBranch(BB, BB->getFallthrough()); } if (TakenCount == COUNT_NO_PROFILE) diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index ae2029be0984..4f65cfa2bc16 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -176,6 +176,7 @@ PrintUCE("print-uce", static cl::opt SimplifyConditionalTailCalls("simplify-conditional-tail-calls", cl::desc("simplify conditional tail calls by removing unnecessary jumps"), + cl::init(true), cl::ZeroOrMore, cl::cat(BoltOptCategory)); diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 515af23077e7..69ca62f6e9c8 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -230,7 +230,7 @@ enum SctcModes : char { static cl::opt SctcMode("sctc-mode", cl::desc("mode for simplify conditional tail calls"), - cl::init(SctcHeuristic), + cl::init(SctcAlways), cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"), clEnumValN(SctcPreserveDirection, "preserve", @@ -475,7 +475,9 @@ namespace { // B0: ... 
// jmp B2 (or jcc B2) // -uint64_t fixDoubleJumps(BinaryContext &BC, BinaryFunction &Function) { +uint64_t fixDoubleJumps(BinaryContext &BC, + BinaryFunction &Function, + bool MarkInvalid) { uint64_t NumDoubleJumps = 0; for (auto &BB : Function) { @@ -484,7 +486,7 @@ uint64_t fixDoubleJumps(BinaryContext &BC, BinaryFunction &Function) { const MCSymbol *SuccSym) { // Ignore infinite loop jumps or fallthrough tail jumps. if (Pred == Succ || Succ == &BB) - return; + return false; if (Succ) { const MCSymbol *TBB = nullptr; @@ -495,7 +497,7 @@ uint64_t fixDoubleJumps(BinaryContext &BC, BinaryFunction &Function) { if(!Res) { DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n"; Pred->dump()); - return; + return false; } Pred->replaceSuccessor(&BB, Succ); @@ -519,7 +521,7 @@ uint64_t fixDoubleJumps(BinaryContext &BC, BinaryFunction &Function) { Pred->eraseInstruction(Branch); Pred->addTailCallInstruction(SuccSym); } else { - return; + return false; } } @@ -528,6 +530,8 @@ uint64_t fixDoubleJumps(BinaryContext &BC, BinaryFunction &Function) { << Pred->getName() << " -> " << BB.getName() << " to " << Pred->getName() << " -> " << SuccSym->getName() << (!Succ ? 
" (tail)\n" : "\n")); + + return true; }; if (BB.getNumNonPseudos() != 1 || BB.isLandingPad()) @@ -542,7 +546,7 @@ uint64_t fixDoubleJumps(BinaryContext &BC, BinaryFunction &Function) { const auto *SuccSym = BC.MIA->getTargetSymbol(*Inst); auto *Succ = BB.getSuccessor(); - if ((!Succ || &BB == Succ) && !IsTailCall) + if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym)) continue; std::vector Preds{BB.pred_begin(), BB.pred_end()}; @@ -554,7 +558,11 @@ uint64_t fixDoubleJumps(BinaryContext &BC, BinaryFunction &Function) { if (Pred->getSuccessor() == &BB || (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) || Pred->getConditionalSuccessor(false) == &BB) { - checkAndPatch(Pred, Succ, SuccSym); + if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid) { + BB.markValid(BB.pred_size() != 0 || + BB.isLandingPad() || + BB.isEntryPoint()); + } assert(Function.validateCFG()); } } @@ -698,7 +706,7 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, } if (NumLocalCTCs > 0) { - NumDoubleJumps += fixDoubleJumps(BC, BF); + NumDoubleJumps += fixDoubleJumps(BC, BF, true); // Clean-up unreachable tail-call blocks. 
const auto Stats = BF.eraseInvalidBBs(); DeletedBlocks += Stats.first; @@ -798,7 +806,7 @@ void Peepholes::runOnFunctions(BinaryContext &BC, auto &Function = It.second; if (shouldOptimize(Function)) { shortenInstructions(BC, Function); - NumDoubleJumps += fixDoubleJumps(BC, Function); + NumDoubleJumps += fixDoubleJumps(BC, Function, false); addTailcallTraps(BC, Function); removeUselessCondBranches(BC, Function); } From 7a960174ed0a19e8e40ec814ef9edbcb872eaff1 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 6 Apr 2017 10:49:59 -0700 Subject: [PATCH 243/904] [BOLT] Don't abort on processing binaries with .gdb_index section Summary: While writing non-allocatable sections we had an assumption that the size of such section is congruent to the alignment, as typically such sections are a collections of fixed-sized elements. .gdb_index breaks this assumption. This diff removes the assertion that was triggered by a presence of .gdb_index section, and makes sure that we insert a padding if we are appending to a section with a size not congruent to section alignment. (cherry picked from commit f72d26af2bd6570a32dbd0a5814f0219f8af0939) --- bolt/RewriteInstance.cpp | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 587774488054..94bde03636f4 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2555,10 +2555,18 @@ void RewriteInstance::patchELFPHDRTable() { } namespace { -void writePadding(raw_pwrite_stream &OS, unsigned BytesToWrite) { - for (unsigned I = 0; I < BytesToWrite; ++I) + +/// Write padding to \p OS such that its current \p Offset becomes aligned +/// at \p Alignment. Return new (aligned) offset. 
+uint64_t appendPadding(raw_pwrite_stream &OS, + uint64_t Offset, + uint64_t Alignment) { + const auto PaddingSize = OffsetToAlignment(Offset, Alignment); + for (unsigned I = 0; I < PaddingSize; ++I) OS.write((unsigned char)0); + return Offset + PaddingSize; } + } void RewriteInstance::rewriteNoteSections() { @@ -2586,15 +2594,8 @@ void RewriteInstance::rewriteNoteSections() { continue; // Insert padding as needed. - if (Section.sh_addralign > 1) { - auto PaddingSize = OffsetToAlignment(NextAvailableOffset, - Section.sh_addralign); - writePadding(OS, PaddingSize); - NextAvailableOffset += PaddingSize; - - assert(Section.sh_size % Section.sh_addralign == 0 && - "section size does not match section alignment"); - } + NextAvailableOffset = + appendPadding(OS, NextAvailableOffset, Section.sh_addralign); ErrorOr SectionName = Obj->getSectionName(&Section); check_error(SectionName.getError(), "cannot get section name"); @@ -2611,6 +2612,9 @@ void RewriteInstance::rewriteNoteSections() { (*SectionPatchersIt->second).patchBinary(Data); } OS << Data; + + // Add padding as the section extension might rely on the alignment. + Size = appendPadding(OS, Size, Section.sh_addralign); } if (Section.sh_type == ELF::SHT_SYMTAB) { @@ -2724,9 +2728,7 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { NewSectionIndex.resize(Obj->getNumSections()); - auto PaddingSize = OffsetToAlignment(SHTOffset, sizeof(Elf_Shdr)); - writePadding(OS, PaddingSize); - SHTOffset += PaddingSize; + SHTOffset = appendPadding(OS, SHTOffset, sizeof(Elf_Shdr)); // Copy over entries for original allocatable sections with minor // modifications (e.g. name). 
@@ -3330,9 +3332,8 @@ void RewriteInstance::writeEHFrameHeader(SectionInfo &EHFrameSecInfo) { DEBUG(dbgs() << "BOLT: writing a new .eh_frame_hdr\n"); - auto PaddingSize = OffsetToAlignment(NextAvailableAddress, EHFrameHdrAlign); - writePadding(Out->os(), PaddingSize); - NextAvailableAddress += PaddingSize; + NextAvailableAddress = + appendPadding(Out->os(), NextAvailableAddress, EHFrameHdrAlign); SectionInfo EHFrameHdrSecInfo; EHFrameHdrSecInfo.FileAddress = NextAvailableAddress; From e74110e065927cee203a624ab249acf0fa946993 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 18 Apr 2017 23:32:11 -0700 Subject: [PATCH 244/904] [BOLT] Fix branch data for __builtin_unreachable(). Summary: When we have a conditional branch past the end of a function (a result of a call to __builtin_unreachable()), we replace the branch with a nop, but keep branch information for validation purposes. If that branch has a recorded profile, we mistakenly create an additional successor to the containing basic block (a 3rd successor). Instead of adding the branch to the FTBranches list, we should be adding it to IgnoredBranches. (cherry picked from commit e1ab752775e3260047c7eef2a5c01e2e4ccf2fd6) --- bolt/BinaryFunction.cpp | 8 ++++++-- bolt/BinaryFunction.h | 5 +++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 9a2dd8180e79..26e7375363b3 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1134,8 +1134,8 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { << " : replacing with nop.\n"); BC.MIA->createNoop(Instruction); if (IsCondBranch) { - // Register FT branch for passing function profile validation. + // Register branch for function profile validation. 
+ IgnoredBranches.emplace_back(Offset, Offset + Size); } goto add_instruction; } @@ -1888,6 +1888,7 @@ bool BinaryFunction::buildCFG() { clearList(OffsetToCFI); clearList(TakenBranches); clearList(FTBranches); + clearList(IgnoredBranches); clearList(LPToBBIndex); clearList(EntryOffsets); @@ -1986,6 +1987,9 @@ void BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { FunctionBranches.insert(FunctionBranches.end(), FTBranches.begin(), FTBranches.end()); + FunctionBranches.insert(FunctionBranches.end(), + IgnoredBranches.begin(), + IgnoredBranches.end()); std::sort(FunctionBranches.begin(), FunctionBranches.end()); BranchListType DiffBranches; // Branches in profile without a match. diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 2f903b760de4..70a0e99be35b 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -439,8 +439,9 @@ class BinaryFunction : public AddressRangesOwner { std::unordered_map LabelToBB; using BranchListType = std::vector>; - BranchListType TakenBranches; /// All local taken branches. - BranchListType FTBranches; /// All fall-through branches. + BranchListType TakenBranches; /// All local taken branches. + BranchListType FTBranches; /// All fall-through branches. + BranchListType IgnoredBranches; /// Branches ignored by CFG purposes. /// Storage for all landing pads and their corresponding invokes. using LandingPadsMapType = std::map >; From 381e837d3c06b1594b0c28c3bd94ede6cdf98d1c Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 8 May 2017 22:51:36 -0700 Subject: [PATCH 245/904] [BOLT] Update function address and size in relocation mode. Summary: Set function addresses after code emission but before we update debug info and symbol table entries. 
(cherry picked from commit 73241adaa85ae3b2bc4ec29722df9a9ca6854e26) --- bolt/BinaryBasicBlock.cpp | 5 +- bolt/BinaryBasicBlock.h | 39 ++++------ bolt/BinaryContext.cpp | 18 ++++- bolt/BinaryFunction.cpp | 6 -- bolt/BinaryFunction.h | 49 +++++++++--- bolt/RewriteInstance.cpp | 152 +++++++++++++++++++++++--------------- bolt/RewriteInstance.h | 3 + 7 files changed, 167 insertions(+), 105 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 359000fa6b07..e6448cf767ea 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -25,6 +25,8 @@ namespace llvm { namespace bolt { +constexpr uint32_t BinaryBasicBlock::INVALID_OFFSET; + bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) { return LHS.Index < RHS.Index; } @@ -347,7 +349,8 @@ BinaryBasicBlock::getBranchStats(const BinaryBasicBlock *Succ) const { void BinaryBasicBlock::dump() const { auto &BC = Function->getBinaryContext(); if (Label) outs() << Label->getName() << ":\n"; - BC.printInstructions(outs(), Instructions.begin(), Instructions.end(), Offset); + BC.printInstructions(outs(), Instructions.begin(), Instructions.end(), + getOffset()); outs() << "preds:"; for (auto itr = pred_begin(); itr != pred_end(); ++itr) { outs() << " " << (*itr)->getName(); diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 8e6a75ba1424..3ddfeeb7ef80 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -50,6 +50,9 @@ class BinaryBasicBlock { uint64_t MispredictedCount; /// number of branches mispredicted }; + static constexpr uint32_t INVALID_OFFSET = + std::numeric_limits::max(); + private: /// Vector of all instructions in the block. std::vector Instructions; @@ -69,14 +72,11 @@ class BinaryBasicBlock { /// Label associated with the block. MCSymbol *Label{nullptr}; - /// Label associated with the end of the block in the output binary. 
- const MCSymbol *EndLabel{nullptr}; - /// [Begin, End) address range for this block in the output binary. std::pair OutputAddressRange{0, 0}; - /// Original offset in the function. - uint64_t Offset{std::numeric_limits::max()}; + /// Original range of the basic block in the function. + std::pair InputRange{INVALID_OFFSET, INVALID_OFFSET}; /// Alignment requirements for the block. uint64_t Alignment{1}; @@ -119,9 +119,10 @@ class BinaryBasicBlock { explicit BinaryBasicBlock( BinaryFunction *Function, MCSymbol *Label, - uint64_t Offset = std::numeric_limits::max()) - : Function(Function), Label(Label), Offset(Offset) { + uint32_t Offset = INVALID_OFFSET) + : Function(Function), Label(Label) { assert(Function && "Function must be non-null"); + InputRange.first = Offset; } // Exclusively managed by BinaryFunction. @@ -655,19 +656,14 @@ class BinaryBasicBlock { return SplitInst; } - /// Sets the symbol pointing to the end of the BB in the output binary. - void setEndLabel(const MCSymbol *Symbol) { - EndLabel = Symbol; - } - - /// Gets the symbol pointing to the end of the BB in the output binary. - const MCSymbol *getEndLabel() const { - return EndLabel; + /// Sets address of the basic block in the output. + void setOutputStartAddress(uint64_t Address) { + OutputAddressRange.first = Address; } - /// Sets the memory address range of this BB in the output binary. - void setOutputAddressRange(std::pair Range) { - OutputAddressRange = Range; + /// Sets address past the end of the basic block in the output. + void setOutputEndAddress(uint64_t Address) { + OutputAddressRange.second = Address; } /// Gets the memory address range of this BB in the output binary. @@ -722,12 +718,7 @@ class BinaryBasicBlock { /// Return offset of the basic block from the function start. uint64_t getOffset() const { - return Offset; - } - - /// Set offset of the basic block from the function start. 
- void setOffset(uint64_t NewOffset) { - Offset = NewOffset; + return InputRange.first; } /// Get the index of this basic block. diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 660d597a3a93..3c4c80cdbbb9 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -155,7 +155,7 @@ void findAddressRangesObjects( // and if we convert one DIE, it may affect the rest. Thus // the conservative approach that does not involve expanding // .debug_abbrev, is to switch all DIEs to use .debug_ranges, even if - // they use a single [a,b) range. The secondary reason is that it allows + // they have a simple [a,b) range. The secondary reason is that it allows // us to get rid of the original portion of .debug_ranges to save // space in the binary. auto Function = getBinaryFunctionContainingAddress(Ranges.front().first, @@ -195,6 +195,13 @@ void findSubprograms(DWARFCompileUnit *Unit, } else { Unknown.emplace_back(DIE, Unit); } + } else { + const auto RangesVector = DIE->getAddressRanges(Unit); + if (!RangesVector.empty()) { + errs() << "BOLT-ERROR: split function detected in .debug_info. 
" + "Split functions are not supported.\n"; + exit(1); + } } } @@ -296,7 +303,7 @@ void BinaryContext::preprocessFunctionDebugInfo( for (const auto &DebugLocEntry : DebugLoc->getLocationLists()) { if (DebugLocEntry.Entries.empty()) continue; - auto StartAddress = DebugLocEntry.Entries.front().Begin; + const auto StartAddress = DebugLocEntry.Entries.front().Begin; auto *Function = getBinaryFunctionContainingAddress(StartAddress, BinaryFunctions); if (!Function || !Function->isSimple()) @@ -304,8 +311,11 @@ void BinaryContext::preprocessFunctionDebugInfo( LocationLists.emplace_back(DebugLocEntry.Offset); auto &LocationList = LocationLists.back(); for (const auto &Location : DebugLocEntry.Entries) { - LocationList.addLocation(&Location.Loc, *Function, Location.Begin, - Location.End); + LocationList.addLocation( + &Location.Loc, + *Function, + Location.Begin, + Location.End); } } } diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 26e7375363b3..cfacff073d80 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -2585,12 +2585,6 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { Streamer.EmitInstruction(Instr, *BC.STI); LastIsPrefix = BC.MIA->isPrefix(Instr); } - - if (opts::UpdateDebugSections) { - MCSymbol *BBEndLabel = BC.Ctx->createTempSymbol(); - BB->setEndLabel(BBEndLabel); - Streamer.EmitLabel(BBEndLabel); - } } } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 70a0e99be35b..9120f42bd41c 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -167,7 +167,7 @@ class BinaryFunction : public AddressRangesOwner { Disassembled, /// Function have been disassembled. CFG, /// Control flow graph have been built. CFG_Finalized, /// CFG is finalized. No optimizations allowed. - Assembled, /// Function has been assembled in memory. + Emitted, /// Instructions have been emitted to output. }; /// Settings for splitting function bodies into hot/cold partitions. 
@@ -201,13 +201,6 @@ class BinaryFunction : public AddressRangesOwner { LT_OPTIMIZE_SHUFFLE, }; - enum JumpTableSupportLevel : char { - JTS_NONE = 0, /// Disable jump tables support - JTS_BASIC = 1, /// Enable basic jump tables support - JTS_SPLIT = 2, /// Enable hot/cold splitting of jump tables - JTS_AGGRESSIVE = 3, /// Aggressive splitting of jump tables - }; - enum ReorderType : char { RT_NONE = 0, RT_EXEC_COUNT, @@ -224,6 +217,9 @@ class BinaryFunction : public AddressRangesOwner { // solution to the layout problem instead of seeking the optimal one. static constexpr uint64_t FUNC_SIZE_THRESHOLD = 10; + /// We have to use at least 2-byte alignment for functions because of C++ ABI. + static constexpr unsigned MinAlign = 2; + using BasicBlockOrderType = std::vector; private: @@ -244,6 +240,12 @@ class BinaryFunction : public AddressRangesOwner { /// Original size of the function. uint64_t Size; + /// Address of the function in output. + uint64_t OutputAddress{0}; + + /// Size of the function in the output file. + uint64_t OutputSize{0}; + /// Offset in the file. uint64_t FileOffset{0}; @@ -974,7 +976,12 @@ class BinaryFunction : public AddressRangesOwner { /// Return true if function has a control flow graph available. bool hasCFG() const { return getState() == State::CFG || - getState() == State::CFG_Finalized; + getState() == State::CFG_Finalized || + getState() == State::Emitted; + } + + bool isEmitted() const { + return getState() == State::Emitted; } /// Return containing file section. @@ -987,6 +994,14 @@ class BinaryFunction : public AddressRangesOwner { return Address; } + uint64_t getOutputAddress() const { + return OutputAddress; + } + + uint64_t getOutputSize() const { + return OutputSize; + } + /// Does this function have a valid streaming order index? 
bool hasValidIndex() const { return Index != -1U; @@ -1405,6 +1420,16 @@ class BinaryFunction : public AddressRangesOwner { return *this; } + BinaryFunction &setOutputAddress(uint64_t Address) { + OutputAddress = Address; + return *this; + } + + BinaryFunction &setOutputSize(uint64_t Size) { + OutputSize = Size; + return *this; + } + BinaryFunction &setSimple(bool Simple) { IsSimple = Simple; return *this; @@ -1635,6 +1660,10 @@ class BinaryFunction : public AddressRangesOwner { CurrentState = State::CFG_Finalized; } + void setEmitted() { + CurrentState = State::Emitted; + } + /// Split function in two: a part with warm or hot BBs and a part with never /// executed BBs. The cold part is moved to a new BinaryFunction. void splitFunction(); @@ -1815,7 +1844,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, case BinaryFunction::State::Disassembled: OS << "disassembled"; break; case BinaryFunction::State::CFG: OS << "CFG constructed"; break; case BinaryFunction::State::CFG_Finalized:OS << "CFG finalized"; break; - case BinaryFunction::State::Assembled: OS << "assembled"; break; + case BinaryFunction::State::Emitted: OS << "emitted"; break; } return OS; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 94bde03636f4..ed35bdfbfaa7 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1873,10 +1873,10 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, Streamer.setCodeSkew(EmitColdPart ? 0 : Function.getAddress()); if (opts::Relocs) { - // We have to use at least 2-byte alignment because of C++ ABI. 
- Streamer.EmitCodeAlignment(2); - Streamer.EmitCodeAlignment(opts::AlignFunctions, - opts::AlignFunctionsMaxBytes); + Streamer.EmitCodeAlignment(std::max((unsigned)opts::AlignFunctions, + BinaryFunction::MinAlign), + std::max((unsigned)opts::AlignFunctionsMaxBytes, + BinaryFunction::MinAlign - 1)); } else { Streamer.EmitCodeAlignment(Function.getAlignment()); } @@ -1953,14 +1953,13 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, Streamer.EmitLabel(EmitColdPart ? Function.getFunctionColdEndLabel() : Function.getFunctionEndLabel()); - if (!Function.isSimple() && !opts::Relocs) - return; - // Exception handling info for the function. Function.emitLSDA(&Streamer, EmitColdPart); if (!EmitColdPart && opts::JumpTables > JTS_NONE) Function.emitJumpTables(&Streamer); + + Function.setEmitted(); } template @@ -2113,7 +2112,7 @@ void RewriteInstance::emitFunctions() { Streamer->Finish(); ////////////////////////////////////////////////////////////////////////////// - // Assign addresses to new functions/sections. + // Assign addresses to new sections. ////////////////////////////////////////////////////////////////////////////// if (opts::UpdateDebugSections) { @@ -2150,41 +2149,16 @@ void RewriteInstance::emitFunctions() { std::move(Resolver), /* ProcessAllSections = */true); - // Is there benefit in using notifyObjectLoaded() to remap sections? + // Assign addresses to all sections. mapFileSections(ObjectsHandle); - if (opts::UpdateDebugSections) { - MCAsmLayout Layout( + // Update output addresses based on the new section map and layout. + MCAsmLayout FinalLayout( static_cast(Streamer.get())->getAssembler()); - - for (auto &BFI : BinaryFunctions) { - auto &Function = BFI.second; - for (auto &BB : Function) { - if (!(BB.getLabel()->isDefined(false) && - BB.getEndLabel() && BB.getEndLabel()->isDefined(false))) { - continue; - } - uint64_t BaseAddress = (BB.isCold() ? 
Function.cold().getAddress() - : Function.getAddress()); - uint64_t BeginAddress = - BaseAddress + Layout.getSymbolOffset(*BB.getLabel()); - uint64_t EndAddress = - BaseAddress + Layout.getSymbolOffset(*BB.getEndLabel()); - BB.setOutputAddressRange(std::make_pair(BeginAddress, EndAddress)); - } - } - } + updateOutputValues(FinalLayout); OLT.emitAndFinalize(ObjectsHandle); - if (opts::Relocs) { - const auto *EntryFunction = getBinaryFunctionContainingAddress(EntryPoint); - assert(EntryFunction && "cannot find function for entry point"); - auto JITS = OLT.findSymbol(EntryFunction->getSymbol()->getName(), false); - EntryPoint = JITS.getAddress(); - assert(EntryPoint && "entry point cannot be NULL"); - } - if (opts::KeepTmp) TempOut->keep(); } @@ -2385,6 +2359,73 @@ void RewriteInstance::mapFileSections( } } +void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + + if (!Function.isEmitted()) + continue; + + if (opts::Relocs) { + const auto BaseAddress = NewTextSectionStartAddress; + const auto StartOffset = Layout.getSymbolOffset(*Function.getSymbol()); + const auto EndOffset = + Layout.getSymbolOffset(*Function.getFunctionEndLabel()); + Function.setOutputAddress(BaseAddress + StartOffset); + Function.setOutputSize(EndOffset - StartOffset); + if (Function.isSplit()) { + const auto *ColdStartSymbol = Function.getColdSymbol(); + assert(ColdStartSymbol && ColdStartSymbol->isDefined(false) && + "split function should have defined cold symbol"); + const auto *ColdEndSymbol = Function.getFunctionColdEndLabel(); + assert(ColdEndSymbol && ColdEndSymbol->isDefined(false) && + "split function should have defined cold end symbol"); + const auto ColdStartOffset = Layout.getSymbolOffset(*ColdStartSymbol); + const auto ColdEndOffset = Layout.getSymbolOffset(*ColdEndSymbol); + Function.cold().setAddress(BaseAddress + ColdStartOffset); + Function.cold().setImageSize(ColdEndOffset - ColdStartOffset); 
+ + } + } else { + Function.setOutputAddress(Function.getAddress()); + Function.setOutputSize( + Layout.getSymbolOffset(*Function.getFunctionEndLabel())); + } + + // Update basic block output ranges only for the debug info. + if (!opts::UpdateDebugSections) + continue; + + // Output ranges should match the input if the body hasn't changed. + if (!Function.isSimple()) + continue; + + BinaryBasicBlock *PrevBB = nullptr; + for (auto BBI = Function.layout_begin(), BBE = Function.layout_end(); + BBI != BBE; ++BBI) { + auto *BB = *BBI; + assert(BB->getLabel()->isDefined(false) && "symbol should be defined"); + uint64_t BaseAddress = BB->isCold() ? Function.cold().getAddress() + : Function.getOutputAddress(); + uint64_t Address = BaseAddress + Layout.getSymbolOffset(*BB->getLabel()); + BB->setOutputStartAddress(Address); + + if (PrevBB) { + auto PrevBBEndAddress = Address; + if (BB->isCold() != PrevBB->isCold()) { + PrevBBEndAddress = + Function.getOutputAddress() + Function.getOutputSize(); + } + PrevBB->setOutputEndAddress(PrevBBEndAddress); + } + PrevBB = BB; + } + PrevBB->setOutputEndAddress(Function.isSplit() ? + Function.cold().getAddress() + Function.cold().getImageSize() : + Function.getOutputAddress() + Function.getOutputSize()); + } +} + void RewriteInstance::emitDataSection(MCStreamer *Streamer, SectionRef Section, std::string Name) { StringRef SectionName; @@ -2868,7 +2909,11 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { // Fix ELF header. auto NewEhdr = *Obj->getHeader(); - NewEhdr.e_entry = EntryPoint; + + if (opts::Relocs) { + NewEhdr.e_entry = getNewFunctionAddress(NewEhdr.e_entry); + assert(NewEhdr.e_entry && "cannot find new address for entry point"); + } NewEhdr.e_phoff = PHDRTableOffset; NewEhdr.e_phnum = Phnum; NewEhdr.e_shoff = SHTOffset; @@ -2880,8 +2925,6 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { "internal calculation error"); } -// FIXME: proper size for symbols based on output. 
Current method doesn't -// work well with split functions. template void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { if (!opts::Relocs) @@ -2897,19 +2940,9 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { auto StringSectionOrError = Obj->getStringTableForSymtab(*Section); for (const Elf_Sym &Symbol : Obj->symbols(Section)) { auto NewSymbol = Symbol; - if (auto NewAddress = getNewFunctionAddress(Symbol.st_value)) { - std::size_t Size = 0; - auto BFI = BinaryFunctions.upper_bound(NewAddress); - if (BFI != BinaryFunctions.end()) { - Size = BFI->first - NewAddress; - } else { - Size = BFI->second.getSize(); - } - DEBUG(dbgs() << "BOLT-DEBUG: patching symbol address 0x" - << Twine::utohexstr(Symbol.st_value) << " with 0x" - << Twine::utohexstr(NewAddress) - << " size " << Size << '\n'); - NewSymbol.st_value = NewAddress; + if (const auto *Function = getBinaryFunctionAtAddress(Symbol.st_value)) { + NewSymbol.st_value = Function->getOutputAddress(); + NewSymbol.st_size = Function->getOutputSize(); NewSymbol.st_shndx = NewTextSectionIndex; } else { if (NewSymbol.st_shndx < ELF::SHN_LORESERVE) { @@ -3089,8 +3122,7 @@ uint64_t RewriteInstance::getNewFunctionAddress(uint64_t OldAddress) { const auto *Function = getBinaryFunctionAtAddress(OldAddress); if (!Function) return 0; - auto JITS = OLT.findSymbol(Function->getSymbol()->getName(), false); - return JITS.getAddress(); + return Function->getOutputAddress(); } void RewriteInstance::rewriteFile() { @@ -3152,7 +3184,8 @@ void RewriteInstance::rewriteFile() { outs() << "BOLT: rewriting function \"" << Function << "\"\n"; } OS.pwrite(reinterpret_cast(Function.getImageAddress()), - Function.getImageSize(), Function.getFileOffset()); + Function.getImageSize(), + Function.getFileOffset()); // Write nops at the end of the function. 
auto Pos = OS.tell(); @@ -3190,10 +3223,9 @@ void RewriteInstance::rewriteFile() { outs() << "BOLT: rewriting function \"" << Function << "\" (cold part)\n"; } - OS.pwrite(reinterpret_cast - (Function.cold().getImageAddress()), - Function.cold().getImageSize(), - Function.cold().getFileOffset()); + OS.pwrite(reinterpret_cast(Function.cold().getImageAddress()), + Function.cold().getImageSize(), + Function.cold().getFileOffset()); // FIXME: write nops after cold part too. diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 233d09dded87..406d5e02fca3 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -200,6 +200,9 @@ class RewriteInstance { /// Map all sections to their final addresses. void mapFileSections(orc::ObjectLinkingLayer<>::ObjSetHandleT &ObjectsHandle); + /// Update output object's values based on the final \p Layout. + void updateOutputValues(const MCAsmLayout &Layout); + /// Check which functions became larger than their original version and /// annotate function splitting information. /// From 89105a356f6569b67edbb88f57c15e3a6ec77701 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 15 May 2017 15:21:59 -0700 Subject: [PATCH 246/904] [BOLT] Update .gdb_index section. Summary: Update address table in .gdb_index section. 
(cherry picked from commit 85485e21c2261a0dced2ccff900b2c286d30c140) --- bolt/DWARFRewriter.cpp | 137 ++++++++++++++++++++++++++++++++++++--- bolt/DebugData.cpp | 25 +++---- bolt/DebugData.h | 10 +-- bolt/RewriteInstance.cpp | 2 + bolt/RewriteInstance.h | 10 ++- 5 files changed, 158 insertions(+), 26 deletions(-) diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp index 128408052021..c8726a30e7c7 100644 --- a/bolt/DWARFRewriter.cpp +++ b/bolt/DWARFRewriter.cpp @@ -30,6 +30,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/Errc.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/TimeValue.h" @@ -40,6 +41,7 @@ #define DEBUG_TYPE "bolt" using namespace llvm; +using namespace llvm::support::endian; using namespace object; using namespace bolt; @@ -62,6 +64,8 @@ void RewriteInstance::updateDebugInfo() { updateLocationLists(); updateDWARFAddressRanges(); + + updateGdbIndexSection(); } void RewriteInstance::updateEmptyModuleRanges() { @@ -71,7 +75,7 @@ void RewriteInstance::updateEmptyModuleRanges() { continue; auto const &Ranges = CU->getUnitDIE(true)->getAddressRanges(CU.get()); for (auto const &Range : Ranges) { - RangesSectionsWriter.AddRange(CU->getOffset(), + RangesSectionsWriter.addRange(CU->getOffset(), Range.first, Range.second - Range.first); } @@ -294,7 +298,7 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { void RewriteInstance::updateAddressRangesObjects() { for (auto &Obj : BC->AddressRangesObjects) { for (const auto &Range : Obj.getAbsoluteAddressRanges()) { - RangesSectionsWriter.AddRange(&Obj, Range.first, + RangesSectionsWriter.addRange(&Obj, Range.first, Range.second - Range.first); } } @@ -366,9 +370,9 @@ void RewriteInstance::updateFunctionRanges() { // the identical code folding optimization. Update all of them with // the range. 
for (const auto DIECompileUnitPair : Function.getSubprogramDIEs()) { - auto CUOffset = DIECompileUnitPair.second->getOffset(); - if (CUOffset != -1U) - RangesSectionsWriter.AddRange(CUOffset, RangeBegin, RangeSize); + const auto CU = DIECompileUnitPair.second; + if (CU->getOffset() != -1U) + RangesSectionsWriter.addRange(CU->getOffset(), RangeBegin, RangeSize); } }; @@ -383,12 +387,12 @@ void RewriteInstance::updateFunctionRanges() { addDebugArangesEntry(Function, Function.getAddress(), Size); - RangesSectionsWriter.AddRange(&Function, Function.getAddress(), Size); + RangesSectionsWriter.addRange(&Function, Function.getAddress(), Size); if (Function.isSimple() && Function.cold().getImageSize()) { addDebugArangesEntry(Function, Function.cold().getAddress(), Function.cold().getImageSize()); - RangesSectionsWriter.AddRange(&Function, + RangesSectionsWriter.addRange(&Function, Function.cold().getAddress(), Function.cold().getImageSize()); } @@ -398,6 +402,10 @@ void RewriteInstance::updateFunctionRanges() { void RewriteInstance::generateDebugRanges() { enum { RANGES, ARANGES }; for (auto RT = RANGES + 0; RT <= ARANGES; ++RT) { + // Skip .debug_aranges if we are re-generating .gdb_index. + if (GdbIndexSection.getObject() && RT == ARANGES) + continue; + const char *SectionName = (RT == RANGES) ? 
".debug_ranges" : ".debug_aranges"; SmallVector RangesBuffer; @@ -408,9 +416,9 @@ void RewriteInstance::generateDebugRanges() { auto Writer = std::unique_ptr(MAB->createObjectWriter(OS)); if (RT == RANGES) { - RangesSectionsWriter.WriteRangesSection(Writer.get()); + RangesSectionsWriter.writeRangesSection(Writer.get()); } else { - RangesSectionsWriter.WriteArangesSection(Writer.get()); + RangesSectionsWriter.writeArangesSection(Writer.get()); } const auto &DebugRangesContents = OS.str(); @@ -508,3 +516,114 @@ void RewriteInstance::updateLocationListPointers( updateLocationListPointers(Unit, Child, UpdatedOffsets); } } + +void RewriteInstance::updateGdbIndexSection() { + if (!GdbIndexSection.getObject()) + return; + + StringRef GdbIndexContents; + GdbIndexSection.getContents(GdbIndexContents); + + const auto *Data = GdbIndexContents.data(); + + // Parse the header. + const auto Version = read32le(Data); + if (Version != 7 && Version != 8) { + errs() << "BOLT-ERROR: can only process .gdb_index versions 7 and 8\n"; + exit(1); + } + + // Some .gdb_index generators use file offsets while others use section + // offsets. Hence we can only rely on offsets relative to each other, + // and ignore their absolute values. + const auto CUListOffset = read32le(Data + 4); + const auto CUTypesOffset = read32le(Data + 8); + const auto AddressTableOffset = read32le(Data + 12); + const auto SymbolTableOffset = read32le(Data + 16); + const auto ConstantPoolOffset = read32le(Data + 20); + Data += 24; + + assert(CUTypesOffset == AddressTableOffset && + "CU types in .gdb_index should be empty"); + + // Map CUs offsets to indices and verify existing index table. 
+ std::map OffsetToIndexMap; + const auto CUListSize = CUTypesOffset - CUListOffset; + const auto NumCUs = BC->DwCtx->getNumCompileUnits(); + if (CUListSize != NumCUs * 16) { + errs() << "BOLT-ERROR: .gdb_index: CU count mismatch\n"; + exit(1); + } + for (unsigned Index = 0; Index < NumCUs; ++Index, Data += 16) { + const auto *CU = BC->DwCtx->getCompileUnitAtIndex(Index); + const auto Offset = read64le(Data); + if (CU->getOffset() != Offset) { + errs() << "BOLT-ERROR: .gdb_index CU offset mismatch\n"; + exit(1); + } + + OffsetToIndexMap[Offset] = Index; + } + + // Ignore old address table. + const auto OldAddressTableSize = SymbolTableOffset - AddressTableOffset; + Data += OldAddressTableSize; + + // Calculate the size of the new address table. + uint32_t NewAddressTableSize = 0; + for (const auto &CURangesPair : RangesSectionsWriter.getCUAddressRanges()) { + const auto &Ranges = CURangesPair.second; + NewAddressTableSize += Ranges.size() * 20; + } + + // Difference between old and new table (and section) sizes. + // Could be negative. + int32_t Delta = NewAddressTableSize - OldAddressTableSize; + + size_t NewGdbIndexSize = GdbIndexContents.size() + Delta; + + // Free'd by ExecutableFileMemoryManager. + auto * const NewGdbIndexContents = new uint8_t[NewGdbIndexSize]; + auto *Buffer = NewGdbIndexContents; + + write32le(Buffer, Version); + write32le(Buffer + 4, CUListOffset); + write32le(Buffer + 8, CUTypesOffset); + write32le(Buffer + 12, AddressTableOffset); + write32le(Buffer + 16, SymbolTableOffset + Delta); + write32le(Buffer + 20, ConstantPoolOffset + Delta); + Buffer += 24; + + // Copy over CU list. + memcpy(Buffer, GdbIndexContents.data() + 24, CUListSize); + Buffer += CUListSize; + + // Generate new address table. 
+ for (const auto &CURangesPair : RangesSectionsWriter.getCUAddressRanges()) { + const auto CUIndex = OffsetToIndexMap[CURangesPair.first]; + const auto &Ranges = CURangesPair.second; + for (const auto &Range : Ranges) { + write64le(Buffer, Range.first); + write64le(Buffer + 8, Range.first + Range.second); + write32le(Buffer + 16, CUIndex); + Buffer += 20; + } + } + + const auto TrailingSize = + GdbIndexContents.data() + GdbIndexContents.size() - Data; + assert(Buffer + TrailingSize == NewGdbIndexContents + NewGdbIndexSize && + "size calculation error"); + + // Copy over the rest of the original data. + memcpy(Buffer, Data, TrailingSize); + + // Register the new section. + EFMM->NoteSectionInfo[".gdb_index"] = SectionInfo( + reinterpret_cast(NewGdbIndexContents), + NewGdbIndexSize, + /*Alignment=*/0, + /*IsCode=*/false, + /*IsReadOnly=*/true, + /*IsLocal=*/false); +} diff --git a/bolt/DebugData.cpp b/bolt/DebugData.cpp index 0cb363cde3bb..fd19e6a3fc47 100644 --- a/bolt/DebugData.cpp +++ b/bolt/DebugData.cpp @@ -126,13 +126,14 @@ BasicBlockOffsetRanges::getAbsoluteAddressRanges() const { return MergedRanges; } -void DebugRangesSectionsWriter::AddRange(uint32_t CompileUnitOffset, +void DebugRangesSectionsWriter::addRange(uint32_t CompileUnitOffset, uint64_t Address, uint64_t Size) { - CUAddressRanges[CompileUnitOffset].emplace_back(std::make_pair(Address, Size)); + CUAddressRanges[CompileUnitOffset].emplace_back(std::make_pair(Address, + Size)); } -void DebugRangesSectionsWriter::AddRange(AddressRangesOwner *BF, +void DebugRangesSectionsWriter::addRange(AddressRangesOwner *BF, uint64_t Address, uint64_t Size) { ObjectAddressRanges[BF].emplace_back(std::make_pair(Address, Size)); @@ -145,7 +146,7 @@ namespace { // the form (begin address, range size), otherwise (begin address, end address). // Terminates the list by writing a pair of two zeroes. // Returns the number of written bytes. 
-uint32_t WriteAddressRanges( +uint32_t writeAddressRanges( MCObjectWriter *Writer, const std::vector> &AddressRanges, bool RelativeRange) { @@ -162,37 +163,37 @@ uint32_t WriteAddressRanges( } // namespace -void DebugRangesSectionsWriter::WriteRangesSection(MCObjectWriter *Writer) { +void DebugRangesSectionsWriter::writeRangesSection(MCObjectWriter *Writer) { uint32_t SectionOffset = 0; for (const auto &CUOffsetAddressRangesPair : CUAddressRanges) { - uint64_t CUOffset = CUOffsetAddressRangesPair.first; + const auto CUOffset = CUOffsetAddressRangesPair.first; RangesSectionOffsetCUMap[CUOffset] = SectionOffset; const auto &AddressRanges = CUOffsetAddressRangesPair.second; - SectionOffset += WriteAddressRanges(Writer, AddressRanges, false); + SectionOffset += writeAddressRanges(Writer, AddressRanges, false); } for (const auto &BFAddressRangesPair : ObjectAddressRanges) { BFAddressRangesPair.first->setAddressRangesOffset(SectionOffset); const auto &AddressRanges = BFAddressRangesPair.second; - SectionOffset += WriteAddressRanges(Writer, AddressRanges, false); + SectionOffset += writeAddressRanges(Writer, AddressRanges, false); } // Write an empty address list to be used for objects with unknown address // ranges. EmptyRangesListOffset = SectionOffset; - SectionOffset += WriteAddressRanges( + SectionOffset += writeAddressRanges( Writer, std::vector>{}, false); } void -DebugRangesSectionsWriter::WriteArangesSection(MCObjectWriter *Writer) const { +DebugRangesSectionsWriter::writeArangesSection(MCObjectWriter *Writer) const { // For reference on the format of the .debug_aranges section, see the DWARF4 // specification, section 6.1.4 Lookup by Address // http://www.dwarfstd.org/doc/DWARF4.pdf for (const auto &CUOffsetAddressRangesPair : CUAddressRanges) { - uint64_t Offset = CUOffsetAddressRangesPair.first; + const auto Offset = CUOffsetAddressRangesPair.first; const auto &AddressRanges = CUOffsetAddressRangesPair.second; // Emit header. 
@@ -222,7 +223,7 @@ DebugRangesSectionsWriter::WriteArangesSection(MCObjectWriter *Writer) const { // Padding before address table - 4 bytes in the 64-bit-pointer case. Writer->writeLE32(0); - WriteAddressRanges(Writer, AddressRanges, true); + writeAddressRanges(Writer, AddressRanges, true); } } diff --git a/bolt/DebugData.h b/bolt/DebugData.h index 77cadfd2879b..b1f984207a1c 100644 --- a/bolt/DebugData.h +++ b/bolt/DebugData.h @@ -16,6 +16,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/Support/SMLoc.h" #include #include @@ -224,20 +225,20 @@ class DebugRangesSectionsWriter { DebugRangesSectionsWriter() = default; /// Adds a range to the .debug_arange section. - void AddRange(uint32_t CompileUnitOffset, uint64_t Address, uint64_t Size); + void addRange(uint32_t CompileUnitOffset, uint64_t Address, uint64_t Size); /// Adds an address range that belongs to a given object. /// When .debug_ranges is written, the offset of the range corresponding /// to the function will be set using BF->setAddressRangesOffset(). - void AddRange(AddressRangesOwner *ARO, uint64_t Address, uint64_t Size); + void addRange(AddressRangesOwner *ARO, uint64_t Address, uint64_t Size); using RangesCUMapType = std::map; /// Writes .debug_aranges with the added ranges to the MCObjectWriter. - void WriteArangesSection(MCObjectWriter *Writer) const; + void writeArangesSection(MCObjectWriter *Writer) const; /// Writes .debug_ranges with the added ranges to the MCObjectWriter. - void WriteRangesSection(MCObjectWriter *Writer); + void writeRangesSection(MCObjectWriter *Writer); /// Resets the writer to a clear state. void reset() { @@ -255,6 +256,7 @@ class DebugRangesSectionsWriter { /// to .debug_ranges uint32_t getEmptyRangesListOffset() const { return EmptyRangesListOffset; } + /// Map DWARFCompileUnit index to ranges. 
using CUAddressRangesType = std::map>>; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index ed35bdfbfaa7..22ebaba1b021 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1321,6 +1321,8 @@ void RewriteInstance::readSpecialSections() { EHFrameSection = Section; } else if (SectionName == ".rela.text") { HasTextRelocations = true; + } else if (SectionName == ".gdb_index") { + GdbIndexSection = Section; } // Ignore zero-size allocatable sections as they present no interest to us. diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 406d5e02fca3..e024c0e5247e 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -346,6 +346,9 @@ class RewriteInstance { /// blocks) to be updated. void updateDWARFAddressRanges(); + /// Rewrite .gdb_index section if present. + void updateGdbIndexSection(); + /// Patches the binary for an object's address ranges to be updated. /// The object can be a anything that has associated address ranges via either /// DW_AT_low/high_pc or DW_AT_ranges (i.e. functions, lexical blocks, etc). @@ -391,7 +394,9 @@ class RewriteInstance { static constexpr const char *DebugSectionsToOverwrite[] = { ".debug_aranges", ".debug_line", - ".debug_ranges"}; + ".debug_ranges", + ".gdb_index", + }; /// Huge page size used for alignment. static constexpr unsigned PageAlign = 0x200000; @@ -463,6 +468,9 @@ class RewriteInstance { const llvm::DWARFFrame *EHFrame{nullptr}; SectionRef EHFrameSection; + /// .gdb_index section. + SectionRef GdbIndexSection; + uint64_t NewSymTabOffset{0}; /// Keep track of functions we fail to write in the binary. We need to avoid From 278e8919ee2599b4b8cff5aca4fbe3008dbb6102 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 16 May 2017 17:29:31 -0700 Subject: [PATCH 247/904] [BOLT] Support adding new non-allocatable sections. Summary: We had the ability to add allocatable sections before. This diff expands this capability to non-allocatable sections. 
(cherry picked from commit 31600af6fdea7137df061d74661a57e32020ba13) --- bolt/DWARFRewriter.cpp | 2 +- bolt/RewriteInstance.cpp | 140 +++++++++++++++++++++++++++------------ bolt/RewriteInstance.h | 8 ++- 3 files changed, 105 insertions(+), 45 deletions(-) diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp index c8726a30e7c7..1b817491796d 100644 --- a/bolt/DWARFRewriter.cpp +++ b/bolt/DWARFRewriter.cpp @@ -422,7 +422,7 @@ void RewriteInstance::generateDebugRanges() { } const auto &DebugRangesContents = OS.str(); - // Free'd by ExecutableFileMemoryManager. + // Freed by ExecutableFileMemoryManager. uint8_t *SectionData = new uint8_t[DebugRangesContents.size()]; memcpy(SectionData, DebugRangesContents.data(), DebugRangesContents.size()); diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 22ebaba1b021..7c04024a755c 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2604,6 +2604,9 @@ namespace { uint64_t appendPadding(raw_pwrite_stream &OS, uint64_t Offset, uint64_t Alignment) { + if (!Alignment) + return Offset; + const auto PaddingSize = OffsetToAlignment(Offset, Alignment); for (unsigned I = 0; I < PaddingSize; ++I) OS.write((unsigned char)0); @@ -2716,12 +2719,30 @@ void RewriteInstance::rewriteNoteSections() { NextAvailableOffset += Size; } + + // Write new note sections. 
+ for (auto &SII : EFMM->NoteSectionInfo) { + auto &SI = SII.second; + if (SI.FileOffset || !SI.AllocAddress) + continue; + + assert(SI.PendingRelocs.empty() && "cannot have pending relocs"); + + NextAvailableOffset = appendPadding(OS, NextAvailableOffset, SI.Alignment); + SI.FileOffset = NextAvailableOffset; + + DEBUG(dbgs() << "BOLT-DEBUG: writing out new section " << SII.first + << " of size " << SI.Size << " at offset 0x" + << Twine::utohexstr(SI.FileOffset) << '\n'); + + OS.write(reinterpret_cast(SI.AllocAddress), SI.Size); + NextAvailableOffset += SI.Size; + } } template -void RewriteInstance::writeStringTable(ELFObjectFile *File) { +void RewriteInstance::finalizeSectionStringTable(ELFObjectFile *File) { auto *Obj = File->getELFFile(); - auto &OS = Out->os(); // Pre-populate section header string table. for (auto &Section : Obj->sections()) { @@ -2734,16 +2755,22 @@ void RewriteInstance::writeStringTable(ELFObjectFile *File) { for (auto &SMII : EFMM->SectionMapInfo) { SHStrTab.add(SMII.first); } + for (auto &SMII : EFMM->NoteSectionInfo) { + SHStrTab.add(SMII.first); + } SHStrTab.finalize(StringTableBuilder::ELF); - auto SII = EFMM->NoteSectionInfo.find(".shstrtab"); - assert(SII != EFMM->NoteSectionInfo.end() && "cannot find .shstrtab"); - auto &SI = SII->second; - SI.FileOffset = OS.tell(); - SI.Size = SHStrTab.data().size(); - - // Write data for the table. - OS << SHStrTab.data(); + const auto SHStrTabSize = SHStrTab.data().size(); + uint8_t *DataCopy = new uint8_t[SHStrTabSize]; + memcpy(DataCopy, SHStrTab.data().data(), SHStrTabSize); + EFMM->NoteSectionInfo[".shstrtab"] = + SectionInfo(reinterpret_cast(DataCopy), + SHStrTabSize, + /*Alignment*/1, + /*IsCode=*/false, + /*IsReadOnly=*/false, + /*IsLocal=*/false); + EFMM->NoteSectionInfo[".shstrtab"].IsStrTab = true; } // Rewrite section header table inserting new entries as needed. 
The sections @@ -2751,10 +2778,12 @@ void RewriteInstance::writeStringTable(ELFObjectFile *File) { // so we are placing it at the end of the binary. // // As we rewrite entries we need to track how many sections were inserted -// as it changes the sh_link value. +// as it changes the sh_link value. We map old indices to new ones for +// existing sections. // // The following are assumptions about file modifications: -// * There are no modifications done to existing allocatable sections. +// * There are no modifications done to address and/or size of existing +// allocatable sections. // * All new allocatable sections are written immediately after existing // allocatable sections. // * There could be modifications done to non-allocatable sections, e.g. @@ -2811,12 +2840,9 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { OS.write(reinterpret_cast(&NewSection), sizeof(NewSection)); NewSectionIndex[std::distance(Obj->section_begin(), &Section)] = CurrentSectionIndex++; - } // Create entries for new allocatable sections. - // - // Skip sections we overwrite in-place (like data sections). std::vector SectionsToRewrite; for (auto &SMII : EFMM->SectionMapInfo) { const auto &SectionName = SMII.first; @@ -2829,7 +2855,7 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { continue; } if (opts::Verbosity >= 1) - outs() << "BOLT-INFO: writing section header for " << SMII.first << '\n'; + outs() << "BOLT-INFO: writing section header for " << SectionName << '\n'; Elf_Shdr NewSection; NewSection.sh_name = SHStrTab.getOffset(SectionName); NewSection.sh_type = ELF::SHT_PROGBITS; @@ -2859,7 +2885,8 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { ++CurrentSectionIndex; } - int64_t NumNewSections = SectionsToRewrite.size(); + int64_t SectionCountDelta = SectionsToRewrite.size(); + uint64_t LastFileOffset = 0; // Copy over entries for non-allocatable sections performing necessary // adjustments. 
@@ -2868,46 +2895,77 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { continue; if (Section.sh_flags & ELF::SHF_ALLOC) continue; - - ErrorOr SectionName = Obj->getSectionName(&Section); - check_error(SectionName.getError(), "cannot get section name"); - if (Section.sh_type == ELF::SHT_RELA) { - if (opts::Verbosity) - outs() << "BOLT-INFO: omitting section header for relocation section " - << *SectionName << '\n'; - --NumNewSections; + --SectionCountDelta; continue; } + ErrorOr SectionName = Obj->getSectionName(&Section); + check_error(SectionName.getError(), "cannot get section name"); + auto SII = EFMM->NoteSectionInfo.find(*SectionName); assert(SII != EFMM->NoteSectionInfo.end() && "missing section info for non-allocatable section"); + const auto &SI = SII->second; auto NewSection = Section; - NewSection.sh_offset = SII->second.FileOffset; - NewSection.sh_size = SII->second.Size; + NewSection.sh_offset = SI.FileOffset; + NewSection.sh_size = SI.Size; NewSection.sh_name = SHStrTab.getOffset(*SectionName); // Adjust sh_link for sections that use it. if (Section.sh_link) - NewSection.sh_link = Section.sh_link + NumNewSections; + NewSection.sh_link = Section.sh_link + SectionCountDelta; // Adjust sh_info for relocation sections. if (Section.sh_type == ELF::SHT_REL || Section.sh_type == ELF::SHT_RELA) { if (Section.sh_info) - NewSection.sh_info = Section.sh_info + NumNewSections; + NewSection.sh_info = Section.sh_info + SectionCountDelta; } OS.write(reinterpret_cast(&NewSection), sizeof(NewSection)); NewSectionIndex[std::distance(Obj->section_begin(), &Section)] = CurrentSectionIndex++; + + LastFileOffset = SI.FileOffset; } - // Using new section indices map updates sh_link and sh_info where needed. - // + // Create entries for new non-allocatable sections. 
+ SectionsToRewrite.clear(); + for (auto &SII : EFMM->NoteSectionInfo) { + const auto &SectionName = SII.first; + const auto &SI = SII.second; + + if (SI.FileOffset <= LastFileOffset) + continue; - // New section header string table goes last. + if (opts::Verbosity >= 1) + outs() << "BOLT-INFO: writing section header for " << SectionName << '\n'; + Elf_Shdr NewSection; + NewSection.sh_name = SHStrTab.getOffset(SectionName); + NewSection.sh_type = (SI.IsStrTab ? ELF::SHT_STRTAB : ELF::SHT_PROGBITS); + NewSection.sh_addr = 0; + NewSection.sh_offset = SI.FileOffset; + NewSection.sh_size = SI.Size; + NewSection.sh_entsize = 0; + NewSection.sh_flags = 0; + NewSection.sh_link = 0; + NewSection.sh_info = 0; + NewSection.sh_addralign = SI.Alignment ? SI.Alignment : 1; + SectionsToRewrite.emplace_back(NewSection); + } + + // Write section header entries for new non-allocatable sections. + std::stable_sort(SectionsToRewrite.begin(), SectionsToRewrite.end(), + [] (Elf_Shdr A, Elf_Shdr B) { + return A.sh_offset < B.sh_offset; + }); + for (auto &SI : SectionsToRewrite) { + OS.write(reinterpret_cast(&SI), sizeof(SI)); + ++CurrentSectionIndex; + } + const auto AllocSectionCountDelta = SectionCountDelta; + SectionCountDelta += SectionsToRewrite.size(); // Fix ELF header. auto NewEhdr = *Obj->getHeader(); @@ -2919,8 +2977,8 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { NewEhdr.e_phoff = PHDRTableOffset; NewEhdr.e_phnum = Phnum; NewEhdr.e_shoff = SHTOffset; - NewEhdr.e_shnum = NewEhdr.e_shnum + NumNewSections; - NewEhdr.e_shstrndx = NewEhdr.e_shstrndx + NumNewSections; + NewEhdr.e_shnum = NewEhdr.e_shnum + SectionCountDelta; + NewEhdr.e_shstrndx = NewEhdr.e_shstrndx + AllocSectionCountDelta; OS.pwrite(reinterpret_cast(&NewEhdr), sizeof(NewEhdr), 0); assert(NewEhdr.e_shnum == CurrentSectionIndex && @@ -3291,12 +3349,12 @@ void RewriteInstance::rewriteFile() { // Patch program header table. 
patchELFPHDRTable(); + // Finalize memory image of section string table. + finalizeSectionStringTable(); + // Copy non-allocatable sections once allocatable part is finished. rewriteNoteSections(); - // Write string. - writeStringTable(); - if (opts::Relocs) { // Patch dynamic section/segment. patchELFDynamic(); @@ -3417,11 +3475,9 @@ uint64_t RewriteInstance::getFileOffsetForAddress(uint64_t Address) const { } bool RewriteInstance::willOverwriteSection(StringRef SectionName) { - if (opts::UpdateDebugSections) { - for (auto &OverwriteName : DebugSectionsToOverwrite) { - if (SectionName == OverwriteName) - return true; - } + for (auto &OverwriteName : DebugSectionsToOverwrite) { + if (SectionName == OverwriteName) + return true; } auto SMII = EFMM->SectionMapInfo.find(SectionName); diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index e024c0e5247e..1253cd9cec06 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -46,6 +46,7 @@ struct SectionInfo { bool IsReadOnly{false}; /// Is the section read-only? bool IsLocal{false}; /// Is this section local to a function, and /// should only be emitted with the function? + bool IsStrTab{false}; /// Is this a string table section. uint64_t FileAddress{0}; /// Address for the output file (final address). uint64_t FileOffset{0}; /// Offset in the output file. unsigned SectionID{0}; /// Unique ID used for address mapping. @@ -309,8 +310,8 @@ class RewriteInstance { /// Patch .rela.plt section. ELF_FUNCTION(patchELFRelaPLT); - /// Write .shstrtab. - ELF_FUNCTION(writeStringTable); + /// Finalize memory image of section header string table. + ELF_FUNCTION(finalizeSectionStringTable); /// Computes output .debug_line line table offsets for each compile unit, /// and updates stmt_list for a corresponding compile unit. @@ -392,6 +393,7 @@ class RewriteInstance { /// When updating debug info, these are the sections we overwrite. 
static constexpr const char *DebugSectionsToOverwrite[] = { + ".shstrtab", ".debug_aranges", ".debug_line", ".debug_ranges", @@ -456,6 +458,8 @@ class RewriteInstance { /// Maps section name -> patcher. std::map> SectionPatchers; + /// [old section index] -> [new section index] map. Used for adjusting + /// referenced section indices. std::vector NewSectionIndex; uint64_t NewTextSectionStartAddress{0}; From 66646f45714bf8f0eeb946031ecfdb3ce2ad09e6 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 17 May 2017 18:35:00 -0700 Subject: [PATCH 248/904] [BOLT] Add option to keep/generate .debug_aranges. Summary: GOLD linker removes .debug_aranges while generating .gdb_index. Some tools however rely on the presence of this section. Add an option to generate .debug_aranges if it was removed, or keep it in the file if it was present. Generally speaking .debug_aranges duplicates information present in .gdb_index addresses table. (cherry picked from commit ac37e246b690bb1dcc2212453bf7d0ddc6977bcb) --- bolt/DWARFRewriter.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp index 1b817491796d..a7c59c05bd70 100644 --- a/bolt/DWARFRewriter.cpp +++ b/bolt/DWARFRewriter.cpp @@ -46,8 +46,18 @@ using namespace object; using namespace bolt; namespace opts { + +extern cl::OptionCategory BoltCategory; extern cl::opt Verbosity; -} + +static cl::opt +KeepARanges("keep-aranges", + cl::desc("keep or generate .debug_aranges section if .gdb_index is written"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); + +} // namespace opts void RewriteInstance::updateDebugInfo() { SectionPatchers[".debug_abbrev"] = llvm::make_unique(); @@ -403,7 +413,7 @@ void RewriteInstance::generateDebugRanges() { enum { RANGES, ARANGES }; for (auto RT = RANGES + 0; RT <= ARANGES; ++RT) { // Skip .debug_aranges if we are re-generating .gdb_index. 
- if (GdbIndexSection.getObject() && RT == ARANGES) + if (!opts::KeepARanges && GdbIndexSection.getObject() && RT == ARANGES) continue; const char *SectionName = (RT == RANGES) ? ".debug_ranges" From 98fb7e6973f5947882a9e9e4086e8d53aa076d8c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 8 Mar 2017 19:58:33 -0800 Subject: [PATCH 249/904] [BOLT] Add jump table support to ICP Summary: Add jump table support to ICP. The optimization is basically the same as ICP for tail calls. The big difference is that the profiling data comes from the jump table and the targets are local symbols rather than global. I've removed an instruction from ICP for tail calls. The code used to have a conditional jump to a block with a direct jump to the target, i.e. B1: cmp foo,(%rax) jne B3 B2: jmp foo B3: ... this code is now: B1: cmp foo,(%rax) je foo B2: ... The other changes in this diff: - Move ICP + new jump table support to separate file in Passes. - Improve the CFG validation to handle jump tables. - Fix the double jump peephole so that the successor of the modified block is updated properly. Also make sure that any existing branches in the block are modified to properly reflect the new CFG. - Add an invocation of the double jump peephole to SCTC. This allows us to remove a call to peepholes/UCE occurring after fixBranches() in the pass manager. - Miscellaneous cleanups to BOLT output. 
(cherry picked from commit 6162d5e8f65fdae74d64cdf6f96de2546d8eece7) --- bolt/BinaryBasicBlock.cpp | 92 ++- bolt/BinaryFunction.cpp | 53 +- bolt/BinaryFunction.h | 57 +- bolt/BinaryPassManager.cpp | 14 +- bolt/Passes/BinaryPasses.cpp | 593 +----------------- bolt/Passes/BinaryPasses.h | 148 ----- bolt/Passes/CMakeLists.txt | 1 + bolt/Passes/IndirectCallPromotion.cpp | 829 ++++++++++++++++++++++++++ bolt/Passes/IndirectCallPromotion.h | 217 +++++++ bolt/Passes/Inliner.cpp | 1 - 10 files changed, 1201 insertions(+), 804 deletions(-) create mode 100644 bolt/Passes/IndirectCallPromotion.cpp create mode 100644 bolt/Passes/IndirectCallPromotion.h diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index e6448cf767ea..40d55e91ffa4 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -57,35 +57,73 @@ BinaryBasicBlock::reverse_iterator BinaryBasicBlock::getLastNonPseudo() { } bool BinaryBasicBlock::validateSuccessorInvariants() { - auto *Func = getFunction(); - auto &BC = Func->getBinaryContext(); - const MCSymbol *TBB = nullptr; - const MCSymbol *FBB = nullptr; - MCInst *CondBranch = nullptr; - MCInst *UncondBranch = nullptr; - - assert(getNumPseudos() == getNumPseudos()); - - if (analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) { - switch (Successors.size()) { - case 0: - return !CondBranch && !UncondBranch; - case 1: - return !CondBranch || - (CondBranch && - !Func->getBasicBlockForLabel(BC.MIA->getTargetSymbol(*CondBranch))); - case 2: - return - (!CondBranch || - (TBB == getConditionalSuccessor(true)->getLabel() && - ((!UncondBranch && !FBB) || - (UncondBranch && - FBB == getConditionalSuccessor(false)->getLabel())))); - default: - return true; + const auto *Inst = getLastNonPseudoInstr(); + const auto *JT = Inst ? 
Function->getJumpTable(*Inst) : nullptr; + auto &BC = Function->getBinaryContext(); + bool Valid = true; + + if (JT) { + // Note: for now we assume that successors do not reference labels from + // any overlapping jump tables. We only look at the entries for the jump + // table that is referenced at the last instruction. + const auto Range = JT->getEntriesForAddress(BC.MIA->getJumpTable(*Inst)); + const std::vector Entries(&JT->Entries[Range.first], + &JT->Entries[Range.second]); + std::set UniqueSyms(Entries.begin(), Entries.end()); + for (auto *Succ : Successors) { + auto Itr = UniqueSyms.find(Succ->getLabel()); + if (Itr != UniqueSyms.end()) { + UniqueSyms.erase(Itr); + } else { + // Work on the assumption that jump table blocks don't + // have a conditional successor. + Valid = false; + } + } + // If there are any leftover entries in the jump table, they + // must be one of the function end labels. + for (auto *Sym : UniqueSyms) { + Valid &= (Sym == Function->getFunctionEndLabel() || + Sym == Function->getFunctionColdEndLabel()); + } + } else { + const MCSymbol *TBB = nullptr; + const MCSymbol *FBB = nullptr; + MCInst *CondBranch = nullptr; + MCInst *UncondBranch = nullptr; + + if (analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) { + switch (Successors.size()) { + case 0: + Valid = !CondBranch && !UncondBranch; + break; + case 1: + Valid = !CondBranch || + (CondBranch && + !Function->getBasicBlockForLabel(BC.MIA->getTargetSymbol(*CondBranch))); + break; + case 2: + Valid = + (!CondBranch || + (TBB == getConditionalSuccessor(true)->getLabel() && + ((!UncondBranch && !FBB) || + (UncondBranch && + FBB == getConditionalSuccessor(false)->getLabel())))); + break; + } } } - return true; + if (!Valid) { + errs() << "BOLT-WARNING: CFG invalid in " << *getFunction() << " @ " + << getName() << "\n"; + if (JT) { + errs() << "Jump Table instruction addr = 0x" + << Twine::utohexstr(BC.MIA->getJumpTable(*Inst)) << "\n"; + JT->print(errs()); + } + dump(); + } + return Valid; 
} BinaryBasicBlock *BinaryBasicBlock::getSuccessor(const MCSymbol *Label) const { diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index cfacff073d80..9df58686f751 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -51,6 +51,7 @@ extern bool shouldProcess(const BinaryFunction &); extern cl::opt PrintDynoStats; extern cl::opt Relocs; extern cl::opt UpdateDebugSections; +extern cl::opt IndirectCallPromotion; extern cl::opt Verbosity; static cl::opt @@ -346,9 +347,8 @@ bool BinaryFunction::isForwardCall(const MCSymbol *CalleeSymbol) const { } } -void BinaryFunction::dump(std::string Annotation, - bool PrintInstructions) const { - print(dbgs(), Annotation, PrintInstructions); +void BinaryFunction::dump(bool PrintInstructions) const { + print(dbgs(), "", PrintInstructions); } void BinaryFunction::print(raw_ostream &OS, std::string Annotation, @@ -1709,7 +1709,8 @@ bool BinaryFunction::buildCFG() { if (!JT) continue; JT->Count += BInfo.Branches; - if (opts::JumpTables < JTS_AGGRESSIVE) + if (opts::IndirectCallPromotion < ICP_JUMP_TABLES && + opts::JumpTables < JTS_AGGRESSIVE) continue; if (JT->Counts.empty()) JT->Counts.resize(JT->Entries.size()); @@ -1718,7 +1719,9 @@ bool BinaryFunction::buildCFG() { EI += Delta; while (EI != JT->Entries.end()) { if (ToBB->getLabel() == *EI) { - JT->Counts[Delta] += BInfo.Branches; + assert(Delta < JT->Counts.size()); + JT->Counts[Delta].Mispreds += BInfo.Mispreds; + JT->Counts[Delta].Count += BInfo.Branches; } ++Delta; ++EI; @@ -2821,10 +2824,6 @@ bool BinaryFunction::validateCFG() const { bool Valid = true; for (auto *BB : BasicBlocks) { Valid &= BB->validateSuccessorInvariants(); - if (!Valid) { - errs() << "BOLT-WARNING: CFG invalid in " << *this << " @ " - << BB->getName() << "\n"; - } } if (!Valid) @@ -3549,6 +3548,34 @@ void BinaryFunction::emitJumpTables(MCStreamer *Streamer) { } } +std::pair +BinaryFunction::JumpTable::getEntriesForAddress(const uint64_t Addr) const { + const uint64_t 
InstOffset = Addr - Address; + size_t StartIndex = 0, EndIndex = 0; + uint64_t Offset = 0; + + for (size_t I = 0; I < Entries.size(); ++I) { + auto LI = Labels.find(Offset); + if (LI != Labels.end()) { + const auto NextLI = std::next(LI); + const auto NextOffset = + NextLI == Labels.end() ? getSize() : NextLI->first; + if (InstOffset >= LI->first && InstOffset < NextOffset) { + StartIndex = I; + EndIndex = I; + while (Offset < NextOffset) { + ++EndIndex; + Offset += EntrySize; + } + break; + } + } + Offset += EntrySize; + } + + return std::make_pair(StartIndex, EndIndex); +} + void BinaryFunction::JumpTable::updateOriginal(BinaryContext &BC) { // In non-relocation mode we have to emit jump tables in local sections. // This way we only overwrite them when a corresponding function is @@ -3591,7 +3618,7 @@ uint64_t BinaryFunction::JumpTable::emit(MCStreamer *Streamer, CurrentLabel = LI->second; CurrentLabelCount = 0; } - CurrentLabelCount += Counts[Index]; + CurrentLabelCount += Counts[Index].Count; } LabelCounts[CurrentLabel] = CurrentLabelCount; } else { @@ -3648,8 +3675,10 @@ void BinaryFunction::JumpTable::print(raw_ostream &OS) const { } } OS << format(" 0x%04" PRIx64 " : ", Offset) << Entry->getName(); - if (!Counts.empty()) - OS << " : " << Counts[Offset / EntrySize]; + if (!Counts.empty()) { + OS << " : " << Counts[Offset / EntrySize].Mispreds + << "/" << Counts[Offset / EntrySize].Count; + } OS << '\n'; Offset += EntrySize; } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 9120f42bd41c..05d7d273459e 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -157,6 +157,13 @@ enum JumpTableSupportLevel : char { JTS_AGGRESSIVE = 4, /// Aggressive splitting of jump tables. }; +enum IndirectCallPromotionType : char { + ICP_NONE, /// Don't perform ICP. + ICP_CALLS, /// Perform ICP on indirect calls. + ICP_JUMP_TABLES, /// Perform ICP on jump tables. + ICP_ALL /// Perform ICP on calls and jump tables. 
+}; + /// BinaryFunction is a representation of machine-level function. /// /// We use the term "Binary" as "Machine" was already taken. @@ -211,6 +218,12 @@ class BinaryFunction : public AddressRangesOwner { RT_USER }; + /// Branch statistics for jump table entries. + struct JumpInfo { + uint64_t Mispreds{0}; + uint64_t Count{0}; + }; + static constexpr uint64_t COUNT_NO_PROFILE = BinaryBasicBlock::COUNT_NO_PROFILE; // Function size, in number of BBs, above which we fallback to a heuristic @@ -583,6 +596,10 @@ class BinaryFunction : public AddressRangesOwner { return std::max(OffsetEntries.size(), Entries.size()) * EntrySize; } + /// Get the indexes for symbol entries that correspond to the jump table + /// starting at (or containing) 'Addr'. + std::pair getEntriesForAddress(const uint64_t Addr) const; + /// Constructor. JumpTable(uint64_t Address, std::size_t EntrySize, @@ -596,7 +613,7 @@ class BinaryFunction : public AddressRangesOwner { /// Dynamic number of times each entry in the table was referenced. /// Identical entries will have a shared count (identical for every /// entry in the set). - std::vector Counts; + std::vector Counts; /// Total number of times this jump table was used. uint64_t Count{0}; @@ -742,6 +759,19 @@ class BinaryFunction : public AddressRangesOwner { return EntryOffsets.count(Offset); } + void addInstruction(uint64_t Offset, MCInst &&Instruction) { + Instructions.emplace(Offset, std::forward(Instruction)); + } + + /// Return instruction at a given offset in the function. Valid before + /// CFG is constructed. + MCInst *getInstructionAtOffset(uint64_t Offset) { + assert(CurrentState == State::Disassembled && + "can only call function in Disassembled state"); + auto II = Instructions.find(Offset); + return (II == Instructions.end()) ? nullptr : &II->second; + } + /// Different types of indirect branches encountered during disassembly. enum class IndirectBranchType : char { UNKNOWN = 0, /// Unable to determine type. 
@@ -901,6 +931,11 @@ class BinaryFunction : public AddressRangesOwner { return BC; } + /// Return BinaryContext for the function. + BinaryContext &getBinaryContext() { + return BC; + } + /// Attempt to validate CFG invariants. bool validateCFG() const; @@ -1164,6 +1199,11 @@ class BinaryFunction : public AddressRangesOwner { return JumpTables.size(); } + const JumpTable *getJumpTable(const MCInst &Inst) const { + const auto Address = BC.MIA->getJumpTable(Inst); + return getJumpTableContainingAddress(Address); + } + const MCSymbol *getPersonalityFunction() const { return PersonalityFunction; } @@ -1322,25 +1362,12 @@ class BinaryFunction : public AddressRangesOwner { /// Dump function information to debug output. If \p PrintInstructions /// is true - include instruction disassembly. - void dump(std::string Annotation = "", bool PrintInstructions = true) const; + void dump(bool PrintInstructions = true) const; /// Print function information to the \p OS stream. void print(raw_ostream &OS, std::string Annotation = "", bool PrintInstructions = true) const; - void addInstruction(uint64_t Offset, MCInst &&Instruction) { - Instructions.emplace(Offset, std::forward(Instruction)); - } - - /// Return instruction at a given offset in the function. Valid before - /// CFG is constructed. - MCInst *getInstructionAtOffset(uint64_t Offset) { - assert(CurrentState == State::Disassembled && - "can only call function in Disassembled state"); - auto II = Instructions.find(Offset); - return (II == Instructions.end()) ? nullptr : &II->second; - } - /// Return true if function has a profile, even if the profile does not /// match CFG 100%. 
bool hasProfile() const { diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 4f65cfa2bc16..ceb5160af23c 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -11,6 +11,7 @@ #include "BinaryPassManager.h" #include "Passes/FrameOptimizer.h" +#include "Passes/IndirectCallPromotion.h" #include "Passes/Inliner.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" @@ -40,12 +41,6 @@ EliminateUnreachable("eliminate-unreachable", cl::ZeroOrMore, cl::cat(BoltOptCategory)); -static cl::opt -IndirectCallPromotion("indirect-call-promotion", - cl::desc("indirect call promotion"), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - static cl::opt InlineSmallFunctions("inline-small-functions", cl::desc("inline functions with a single basic block"), @@ -302,8 +297,7 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(llvm::make_unique(PrintICF), opts::ICF); - Manager.registerPass(llvm::make_unique(PrintICP), - opts::IndirectCallPromotion); + Manager.registerPass(llvm::make_unique(PrintICP)); Manager.registerPass(llvm::make_unique(PrintPeepholes), opts::Peepholes); @@ -361,8 +355,8 @@ void BinaryFunctionPassManager::runAllPasses( // This pass should always run last.* Manager.registerPass(llvm::make_unique(PrintFinalized)); - // *except for this pass. TODO: figure out why moving this before function - // reordering breaks things badly. + // *except for this pass. This pass turns tail calls into jumps which + // makes them invisible to function reordering. 
Manager.registerPass( llvm::make_unique(PrintAfterLowering)); diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 69ca62f6e9c8..d425bc8eee57 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -84,55 +84,6 @@ ICFUseDFS("icf-dfs", cl::ZeroOrMore, cl::cat(BoltOptCategory)); -static cl::list -ICPFuncsList("icp-funcs", - cl::CommaSeparated, - cl::desc("list of functions to enable ICP for"), - cl::value_desc("func1,func2,func3,..."), - cl::Hidden, - cl::cat(BoltOptCategory)); - -static cl::opt -ICPOldCodeSequence("icp-old-code-sequence", - cl::desc("use old code sequence for promoted calls"), - cl::init(false), - cl::ZeroOrMore, - cl::Hidden, - cl::cat(BoltOptCategory)); - -static cl::opt -IndirectCallPromotionMispredictThreshold( - "indirect-call-promotion-mispredict-threshold", - cl::desc("misprediction threshold for skipping ICP on an " - "indirect call"), - cl::init(2), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - -static cl::opt -IndirectCallPromotionThreshold("indirect-call-promotion-threshold", - cl::desc("threshold for optimizing a frequently taken indirect call"), - cl::init(90), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - -static cl::opt -IndirectCallPromotionTopN("indirect-call-promotion-topn", - cl::desc("number of targets to consider when doing indirect " - "call promotion"), - cl::init(1), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - -static cl::opt -IndirectCallPromotionUseMispredicts("indirect-call-promotion-use-mispredicts", - cl::desc("use misprediction frequency for determining whether or not ICP " - "should be applied at a callsite. 
The " - "-indirect-call-promotion-mispredict-threshold value will be used " - "by this heuristic"), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - static cl::opt MinBranchClusters("min-branch-clusters", cl::desc("use a modified clustering algorithm geared towards minimizing " @@ -404,8 +355,8 @@ void ReorderBasicBlocks::runOnFunctions( for (auto &It : BFs) { auto &Function = It.second; - if (!shouldOptimize(Function)) - continue; + if (!shouldOptimize(Function)) + continue; const bool ShouldSplit = (opts::SplitFunctions == BinaryFunction::ST_ALL) || @@ -1146,546 +1097,6 @@ void PrintSortedBy::runOnFunctions( } } -// Get list of targets for a given call sorted by most frequently -// called first. -std::vector IndirectCallPromotion::getCallTargets( - BinaryContext &BC, - const FuncBranchData &BranchData, - const MCInst &Inst -) const { - auto Offset = BC.MIA->getAnnotationAs(Inst, "IndirectBranchData"); - auto Branches = BranchData.getBranchRange(Offset); - std::vector Targets(Branches.begin(), Branches.end()); - - // Sort by most commonly called targets. - std::sort(Targets.begin(), Targets.end(), - [](const BranchInfo &A, const BranchInfo &B) { - return A.Branches > B.Branches; - }); - - // Remove non-symbol targets - auto Last = std::remove_if(Targets.begin(), - Targets.end(), - [](const BranchInfo &BI) { - return !BI.To.IsSymbol; - }); - Targets.erase(Last, Targets.end()); - - return Targets; -} - -std::vector> -IndirectCallPromotion::findCallTargetSymbols( - BinaryContext &BC, - const std::vector &Targets, - const size_t N -) const { - std::vector> SymTargets; - - for (size_t I = 0; I < N; ++I) { - assert(Targets[I].To.IsSymbol && "All ICP targets must be symbols."); - auto Itr = BC.GlobalSymbols.find(Targets[I].To.Name); - if (Itr == BC.GlobalSymbols.end()) { - // punt if we can't find a symbol. 
- break; - } - MCSymbol* Symbol = BC.getOrCreateGlobalSymbol(Itr->second, "FUNCat"); - assert(Symbol && "All ICP targets must be known symbols."); - SymTargets.push_back(std::make_pair(Symbol, 0)); - } - - return SymTargets; -} - -std::vector> -IndirectCallPromotion::rewriteCall(BinaryContext &BC, - BinaryFunction &Function, - BinaryBasicBlock *IndCallBlock, - const MCInst &CallInst, - MCInstrAnalysis::ICPdata &&ICPcode) const { - // Create new basic blocks with correct code in each one first. - std::vector> NewBBs; - const bool IsTailCall = BC.MIA->isTailCall(CallInst); - - // Move instructions from the tail of the original call block - // to the merge block. - - // Remember any pseudo instructions following a tail call. These - // must be preserved and moved to the original block. - std::vector TailInsts; - const auto *TailInst= &CallInst; - if (IsTailCall) { - while (TailInst + 1 < &(*IndCallBlock->end()) && - BC.MII->get((TailInst + 1)->getOpcode()).isPseudo()) { - TailInsts.push_back(*++TailInst); - } - } - - auto MovedInst = IndCallBlock->splitInstructions(&CallInst); - - IndCallBlock->replaceInstruction(&CallInst, ICPcode.front().second); - IndCallBlock->addInstructions(TailInsts.begin(), TailInsts.end()); - - for (auto Itr = ICPcode.begin() + 1; Itr != ICPcode.end(); ++Itr) { - auto &Sym = Itr->first; - auto &Insts = Itr->second; - assert(Sym); - auto TBB = Function.createBasicBlock(0, Sym); - for (auto &Inst : Insts) { // sanitize new instructions. - if (BC.MIA->isCall(Inst)) - BC.MIA->removeAnnotation(Inst, "IndirectBranchData"); - } - TBB->addInstructions(Insts.begin(), Insts.end()); - NewBBs.emplace_back(std::move(TBB)); - } - - // Move tail of instructions from after the original call to - // the merge block. 
- if (!IsTailCall) { - NewBBs.back()->addInstructions(MovedInst.begin(), MovedInst.end()); - } - - return NewBBs; -} - -BinaryBasicBlock *IndirectCallPromotion::fixCFG( - BinaryContext &BC, - BinaryFunction &Function, - BinaryBasicBlock *IndCallBlock, - const bool IsTailCall, - IndirectCallPromotion::BasicBlocksVector &&NewBBs, - const std::vector &Targets -) const { - BinaryBasicBlock *MergeBlock = !IsTailCall ? NewBBs.back().get() : nullptr; - assert(NewBBs.size() >= 2); - assert(NewBBs.size() % 2 == 1 || IndCallBlock->succ_empty()); - assert(NewBBs.size() % 2 == 1 || IsTailCall); - using BinaryBranchInfo = BinaryBasicBlock::BinaryBranchInfo; - - if (MergeBlock) { - std::vector OldSucc(IndCallBlock->successors().begin(), - IndCallBlock->successors().end()); - std::vector BranchInfo(IndCallBlock->branch_info_begin(), - IndCallBlock->branch_info_end()); - - // Remove all successors from block doing the indirect call. - IndCallBlock->removeSuccessors(OldSucc.begin(), OldSucc.end()); - assert(IndCallBlock->succ_empty()); - - // Move them to the merge block. - MergeBlock->addSuccessors(OldSucc.begin(), - OldSucc.end(), - BranchInfo.begin(), - BranchInfo.end()); - - // Update the execution count on the MergeBlock. - MergeBlock->setExecutionCount(IndCallBlock->getExecutionCount()); - } - - // Scale indirect call counts to the execution count of the original - // basic block containing the indirect call. - uint64_t TotalIndirectBranches = 0; - uint64_t TotalIndirectMispreds = 0; - for (const auto &BI : Targets) { - TotalIndirectBranches += BI.Branches; - TotalIndirectMispreds += BI.Mispreds; - } - - uint64_t TotalCount = 0; - uint64_t TotalMispreds = 0; - - if (Function.hasValidProfile()) { - TotalCount = IndCallBlock->getExecutionCount(); - TotalMispreds = - TotalCount * ((double)TotalIndirectMispreds / TotalIndirectBranches); - assert(TotalCount != BinaryBasicBlock::COUNT_NO_PROFILE); - } - - // New BinaryBranchInfo scaled to the execution count of the original BB. 
- std::vector BBI; - for (auto Itr = Targets.begin(); Itr != Targets.end(); ++Itr) { - BBI.push_back( - BinaryBranchInfo{ - uint64_t(TotalCount * ((double)Itr->Branches / TotalIndirectBranches)), - uint64_t(TotalMispreds * ((double)Itr->Mispreds / TotalIndirectMispreds)) - } - ); - } - auto BI = BBI.begin(); - auto updateCurrentBranchInfo = [&]{ - assert(BI < BBI.end()); - TotalCount -= BI->Count; - TotalMispreds -= BI->MispredictedCount; - ++BI; - }; - - // Fix up successors and execution counts. - updateCurrentBranchInfo(); - IndCallBlock->addSuccessor(NewBBs[1].get(), TotalCount); // uncond branch - IndCallBlock->addSuccessor(NewBBs[0].get(), BBI[0]); // conditional branch - - size_t Adj = 1 + (!IsTailCall ? 1 : 0); - for (size_t I = 0; I < NewBBs.size() - Adj; ++I) { - assert(TotalCount <= IndCallBlock->getExecutionCount() || - TotalCount <= uint64_t(TotalIndirectBranches)); - uint64_t ExecCount = BBI[(I+1)/2].Count; - NewBBs[I]->setCanOutline(IndCallBlock->canOutline()); - NewBBs[I]->setIsCold(IndCallBlock->isCold()); - if (I % 2 == 0) { - if (MergeBlock) { - NewBBs[I]->addSuccessor(MergeBlock, BBI[(I+1)/2].Count); // uncond - } - } else { - assert(I + 2 < NewBBs.size()); - updateCurrentBranchInfo(); - NewBBs[I]->addSuccessor(NewBBs[I+2].get(), TotalCount); // uncond branch - NewBBs[I]->addSuccessor(NewBBs[I+1].get(), BBI[(I+1)/2]); // cond. branch - ExecCount += TotalCount; - } - NewBBs[I]->setExecutionCount(ExecCount); - } - - // Arrange for the MergeBlock to be the fallthrough for the first - // promoted call block. - if (MergeBlock) { - MergeBlock->setCanOutline(IndCallBlock->canOutline()); - MergeBlock->setIsCold(IndCallBlock->isCold()); - std::unique_ptr MBPtr; - std::swap(MBPtr, NewBBs.back()); - NewBBs.pop_back(); - NewBBs.emplace(NewBBs.begin() + 1, std::move(MBPtr)); - // TODO: is COUNT_FALLTHROUGH_EDGE the right thing here? 
- NewBBs.back()->addSuccessor(MergeBlock, TotalCount); // uncond branch - } - - // cold call block - // TODO: should be able to outline/cold this block. - NewBBs.back()->setExecutionCount(TotalCount); - NewBBs.back()->setCanOutline(IndCallBlock->canOutline()); - NewBBs.back()->setIsCold(IndCallBlock->isCold()); - - // update BB and BB layout. - Function.insertBasicBlocks(IndCallBlock, std::move(NewBBs)); - assert(Function.validateCFG()); - - return MergeBlock; -} - -size_t -IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB, - const MCInst &Inst, - const std::vector &Targets, - uint64_t NumCalls) { - // If we have no targets (or no calls), skip this callsite. - if (Targets.empty() || !NumCalls) { - if (opts::Verbosity >= 1) { - const auto InstIdx = &Inst - &(*BB->begin()); - outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ " - << InstIdx << " in " << BB->getName() - << ", calls = " << NumCalls - << ", targets empty or NumCalls == 0.\n"; - } - return 0; - } - - const auto TrialN = std::min(size_t(opts::IndirectCallPromotionTopN), - Targets.size()); - - if (!opts::ICPFuncsList.empty()) { - for (auto &Name : opts::ICPFuncsList) { - if (BB->getFunction()->hasName(Name)) - return TrialN; - } - return 0; - } - - // Pick the top N targets. - uint64_t TotalCallsTopN = 0; - uint64_t TotalMispredictsTopN = 0; - size_t N = 0; - - if (opts::IndirectCallPromotionUseMispredicts) { - // Count total number of mispredictions for (at most) the top N targets. - // We may choose a smaller N (TrialN vs. N) if the frequency threshold - // is exceeded by fewer targets. 
- double Threshold = double(opts::IndirectCallPromotionMispredictThreshold); - for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) { - const auto Frequency = (100.0 * Targets[I].Mispreds) / NumCalls; - TotalMispredictsTopN += Targets[I].Mispreds; - TotalNumFrequentCalls += Targets[I].Branches; - Threshold -= Frequency; - } - - // Compute the misprediction frequency of the top N call targets. If this - // frequency is greater than the threshold, we should try ICP on this callsite. - const double TopNFrequency = (100.0 * TotalMispredictsTopN) / NumCalls; - - if (TopNFrequency == 0 || - TopNFrequency < opts::IndirectCallPromotionMispredictThreshold) { - if (opts::Verbosity >= 1) { - const auto InstIdx = &Inst - &(*BB->begin()); - outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ " - << InstIdx << " in " << BB->getName() << ", calls = " - << NumCalls << ", top N mis. frequency " - << format("%.1f", TopNFrequency) << "% < " - << opts::IndirectCallPromotionMispredictThreshold << "%\n"; - } - return 0; - } - } else { - // Count total number of calls for (at most) the top N targets. - // We may choose a smaller N (TrialN vs. N) if the frequency threshold - // is exceeded by fewer targets. - double Threshold = double(opts::IndirectCallPromotionThreshold); - for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) { - const auto Frequency = (100.0 * Targets[I].Branches) / NumCalls; - TotalCallsTopN += Targets[I].Branches; - TotalMispredictsTopN += Targets[I].Mispreds; - TotalNumFrequentCalls += Targets[I].Branches; - Threshold -= Frequency; - } - - // Compute the frequency of the top N call targets. If this frequency - // is greater than the threshold, we should try ICP on this callsite. 
- const double TopNFrequency = (100.0 * TotalCallsTopN) / NumCalls; - - if (TopNFrequency == 0 || - TopNFrequency < opts::IndirectCallPromotionThreshold) { - if (opts::Verbosity >= 1) { - const auto InstIdx = &Inst - &(*BB->begin()); - outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ " - << InstIdx << " in " << BB->getName() << ", calls = " - << NumCalls << ", top N frequency " - << format("%.1f", TopNFrequency) << "% < " - << opts::IndirectCallPromotionThreshold << "%\n"; - } - return 0; - } - - // Compute the misprediction frequency of the top N call targets. If - // this frequency is less than the threshold, we should skip ICP at - // this callsite. - const double TopNMispredictFrequency = - (100.0 * TotalMispredictsTopN) / NumCalls; - - if (TopNMispredictFrequency < - opts::IndirectCallPromotionMispredictThreshold) { - if (opts::Verbosity >= 1) { - const auto InstIdx = &Inst - &(*BB->begin()); - outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ " - << InstIdx << " in " << BB->getName() << ", calls = " - << NumCalls << ", top N mispredict frequency " - << format("%.1f", TopNMispredictFrequency) << "% < " - << opts::IndirectCallPromotionMispredictThreshold << "%\n"; - } - return 0; - } - } - - return N; -} - -void -IndirectCallPromotion::printCallsiteInfo(const BinaryBasicBlock *BB, - const MCInst &Inst, - const std::vector &Targets, - const size_t N, - uint64_t NumCalls) const { - auto &BC = BB->getFunction()->getBinaryContext(); - const auto InstIdx = &Inst - &(*BB->begin()); - bool Separator = false; - - outs() << "BOLT-INFO: ICP candidate branch info: " - << *BB->getFunction() << " @ " << InstIdx - << " in " << BB->getName() - << " -> calls = " << NumCalls - << (BC.MIA->isTailCall(Inst) ? " (tail)" : ""); - for (size_t I = 0; I < N; I++) { - const auto Frequency = 100.0 * Targets[I].Branches / NumCalls; - const auto MisFrequency = 100.0 * Targets[I].Mispreds / NumCalls; - outs() << (Separator ? 
" | " : ", "); - Separator = true; - outs() << Targets[I].To.Name - << ", calls = " << Targets[I].Branches - << ", mispreds = " << Targets[I].Mispreds - << ", taken freq = " << format("%.1f", Frequency) << "%" - << ", mis. freq = " << format("%.1f", MisFrequency) << "%"; - } - outs() << "\n"; - - DEBUG({ - dbgs() << "BOLT-INFO: ICP original call instruction:\n"; - BC.printInstruction(dbgs(), Inst, Targets[0].From.Offset, nullptr, true); - }); -} - -void IndirectCallPromotion::runOnFunctions( - BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions -) { - for (auto &BFIt : BFs) { - auto &Function = BFIt.second; - - if (!Function.isSimple() || !opts::shouldProcess(Function)) - continue; - - const auto BranchDataOrErr = BC.DR.getFuncBranchData(Function.getNames()); - if (const auto EC = BranchDataOrErr.getError()) { - DEBUG(dbgs() << "BOLT-INFO: no branch data found for \"" - << Function << "\"\n"); - continue; - } - const FuncBranchData &BranchData = BranchDataOrErr.get(); - const bool HasLayout = !Function.layout_empty(); - - // Note: this is not just counting calls. - TotalCalls += BranchData.ExecutionCount; - - // Total number of indirect calls issued from the current Function. - // (a fraction of TotalIndirectCalls) - uint64_t FuncTotalIndirectCalls = 0; - - std::vector BBs; - for (auto &BB : Function) { - // Skip indirect calls in cold blocks. - if (!HasLayout || !Function.isSplit() || !BB.isCold()) { - BBs.push_back(&BB); - } - } - - while (!BBs.empty()) { - auto *BB = BBs.back(); - BBs.pop_back(); - - for (unsigned Idx = 0; Idx < BB->size(); ++Idx) { - auto &Inst = BB->getInstructionAtIndex(Idx); - const auto InstIdx = &Inst - &(*BB->begin()); - - if (!BC.MIA->hasAnnotation(Inst, "IndirectBranchData")) - continue; - - assert(BC.MIA->isCall(Inst)); - - ++TotalIndirectCallsites; - - const auto Targets = getCallTargets(BC, BranchData, Inst); - - // Compute the total number of calls from this particular callsite. 
- uint64_t NumCalls = 0; - for (const auto &BInfo : Targets) { - NumCalls += BInfo.Branches; - } - FuncTotalIndirectCalls += NumCalls; - - // Should this callsite be optimized? Return the number of targets - // to use when promoting this call. A value of zero means to skip - // this callsite. - size_t N = canPromoteCallsite(BB, Inst, Targets, NumCalls); - - if (!N) - continue; - - if (opts::Verbosity >= 1) { - printCallsiteInfo(BB, Inst, Targets, N, NumCalls); - } - - // Find MCSymbols or absolute addresses for each call target. - const auto SymTargets = findCallTargetSymbols(BC, Targets, N); - - // If we can't resolve any of the target symbols, punt on this callsite. - if (SymTargets.size() < N) { - const auto LastTarget = SymTargets.size(); - if (opts::Verbosity >= 1) { - outs() << "BOLT-INFO: ICP failed to find target symbol for " - << Targets[LastTarget].To.Name << " in " - << Function << " @ " << InstIdx << " in " - << BB->getName() << ", calls = " << NumCalls << "\n"; - } - continue; - } - - // Generate new promoted call code for this callsite. - auto ICPcode = - BC.MIA->indirectCallPromotion(Inst, - SymTargets, - opts::ICPOldCodeSequence, - BC.Ctx.get()); - - if (ICPcode.empty()) { - if (opts::Verbosity >= 1) { - outs() << "BOLT-INFO: ICP failed in " << Function << " @ " - << InstIdx << " in " << BB->getName() - << ", calls = " << NumCalls - << ", unable to generate promoted call code.\n"; - } - continue; - } - - DEBUG({ - auto Offset = Targets[0].From.Offset; - dbgs() << "BOLT-INFO: ICP indirect call code:\n"; - for (const auto &entry : ICPcode) { - const auto &Sym = entry.first; - const auto &Insts = entry.second; - if (Sym) dbgs() << Sym->getName() << ":\n"; - Offset = BC.printInstructions(dbgs(), - Insts.begin(), - Insts.end(), - Offset); - } - dbgs() << "---------------------------------------------------\n"; - }); - - // Rewrite the CFG with the newly generated ICP code. 
- const bool IsTailCall = BC.MIA->isTailCall(Inst); - auto NewBBs = rewriteCall(BC, Function, BB, Inst, std::move(ICPcode)); - - // Fix the CFG after inserting the new basic blocks. - auto MergeBlock = fixCFG(BC, Function, BB, IsTailCall, - std::move(NewBBs), Targets); - - // Since the tail of the original block was split off and it may contain - // additional indirect calls, we must add the merge block to the set of - // blocks to process. - if (MergeBlock) { - BBs.push_back(MergeBlock); - } - - if (opts::Verbosity >= 1) { - outs() << "BOLT-INFO: ICP succeeded in " - << Function << " @ " << InstIdx - << " in " << BB->getName() - << " -> calls = " << NumCalls << "\n"; - } - - ++TotalOptimizedIndirectCallsites; - - Modified.insert(&Function); - } - } - TotalIndirectCalls += FuncTotalIndirectCalls; - } - - outs() << "BOLT-INFO: ICP total indirect callsites = " - << TotalIndirectCallsites - << "\n" - << "BOLT-INFO: ICP total number of calls = " - << TotalCalls - << "\n" - << "BOLT-INFO: ICP percentage of calls that are indirect = " - << format("%.1f", (100.0 * TotalIndirectCalls) / TotalCalls) - << "%\n" - << "BOLT-INFO: ICP percentage of indirect calls that can be " - "optimized = " - << format("%.1f", (100.0 * TotalNumFrequentCalls) / TotalIndirectCalls) - << "%\n" - << "BOLT-INFO: ICP percentage of indirect calls that are optimized = " - << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) / - TotalIndirectCallsites) - << "%\n"; -} - void InstructionLowering::runOnFunctions( BinaryContext &BC, std::map &BFs, diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index fe420594828e..f7345419a995 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -326,154 +326,6 @@ class PrintSortedBy : public BinaryFunctionPass { std::set &LargeFunctions) override; }; -/// Optimize indirect calls. -/// The indirect call promotion pass visits each indirect call and -/// examines the BranchData for each. 
If the most frequent targets -/// from that callsite exceed the specified threshold (default 90%), -/// the call is promoted. Otherwise, it is ignored. By default, -/// only one target is considered at each callsite. -/// -/// When an candidate callsite is processed, we modify the callsite -/// to test for the most common call targets before calling through -/// the original generic call mechanism. -/// -/// The CFG and layout are modified by ICP. -/// -/// A few new command line options have been added: -/// -indirect-call-promotion -/// -indirect-call-promotion-threshold= -/// -indirect-call-promotion-mispredict-threshold= -/// -indirect-call-promotion-topn= -/// -/// The threshold is the minimum frequency of a call target needed -/// before ICP is triggered. -/// -/// The mispredict threshold is used to disable the optimization at -/// any callsite where the branch predictor does a good enough job -/// that ICP wouldn't help regardless of the frequency of the most -/// common target. -/// -/// The topn option controls the number of targets to consider for -/// each callsite, e.g. ICP is triggered if topn=2 and the total -/// frequency of the top two call targets exceeds the threshold. -/// -/// The minimize code size option controls whether or not the hot -/// calls are to registers (callq %r10) or to function addresses -/// (callq $foo). -/// -/// Example of ICP: -/// -/// C++ code: -/// -/// int B_count = 0; -/// int C_count = 0; -/// -/// struct A { virtual void foo() = 0; } -/// struct B : public A { virtual void foo() { ++B_count; }; }; -/// struct C : public A { virtual void foo() { ++C_count; }; }; -/// -/// A* a = ... -/// a->foo(); -/// ... -/// -/// original assembly: -/// -/// B0: 49 8b 07 mov (%r15),%rax -/// 4c 89 ff mov %r15,%rdi -/// ff 10 callq *(%rax) -/// 41 83 e6 01 and $0x1,%r14d -/// 4d 89 e6 mov %r12,%r14 -/// 4c 0f 44 f5 cmove %rbp,%r14 -/// 4c 89 f7 mov %r14,%rdi -/// ... 
-/// -/// after ICP: -/// -/// B0: 49 8b 07 mov (%r15),%rax -/// 4c 89 ff mov %r15,%rdi -/// 48 81 38 e0 0b 40 00 cmpq $B::foo,(%rax) -/// 75 29 jne B3 -/// B1: e8 45 03 00 00 callq $B::foo -/// B2: 41 83 e6 01 and $0x1,%r14d -/// 4d 89 e6 mov %r12,%r14 -/// 4c 0f 44 f5 cmove %rbp,%r14 -/// 4c 89 f7 mov %r14,%rdi -/// ... -/// -/// B3: ff 10 callq *(%rax) -/// eb d6 jmp B2 -/// -class IndirectCallPromotion : public BinaryFunctionPass { - using BasicBlocksVector = std::vector>; - std::unordered_set Modified; - // Total number of calls from all callsites. - uint64_t TotalCalls{0}; - - // Total number of indirect calls from all callsites. - // (a fraction of TotalCalls) - uint64_t TotalIndirectCalls{0}; - - // Total number of callsites that use indirect calls. - // (the total number of callsites is not recorded) - uint64_t TotalIndirectCallsites{0}; - - // Total number of indirect callsites that are optimized by ICP. - // (a fraction of TotalIndirectCallsites) - uint64_t TotalOptimizedIndirectCallsites{0}; - - // Total number of indirect calls that are optimized by ICP. 
- // (a fraction of TotalCalls) - uint64_t TotalNumFrequentCalls{0}; - - std::vector getCallTargets(BinaryContext &BC, - const FuncBranchData &BranchData, - const MCInst &Inst) const; - - size_t canPromoteCallsite(const BinaryBasicBlock *BB, - const MCInst &Inst, - const std::vector &Targets, - uint64_t NumCalls); - - void printCallsiteInfo(const BinaryBasicBlock *BB, - const MCInst &Inst, - const std::vector &Targets, - const size_t N, - uint64_t NumCalls) const; - - std::vector> - findCallTargetSymbols(BinaryContext &BC, - const std::vector &Targets, - const size_t N) const; - - std::vector> - rewriteCall(BinaryContext &BC, - BinaryFunction &Function, - BinaryBasicBlock *IndCallBlock, - const MCInst &CallInst, - MCInstrAnalysis::ICPdata &&ICPcode) const; - - BinaryBasicBlock *fixCFG(BinaryContext &BC, - BinaryFunction &Function, - BinaryBasicBlock *IndCallBlock, - const bool IsTailCall, - BasicBlocksVector &&NewBBs, - const std::vector &Targets) const; - - public: - explicit IndirectCallPromotion(const cl::opt &PrintPass) - : BinaryFunctionPass(PrintPass) { } - - const char *getName() const override { - return "indirect-call-promotion"; - } - bool shouldPrint(const BinaryFunction &BF) const override { - return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0; - } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; -}; - /// Pass for lowering any instructions that we have raised and that have /// to be lowered. 
class InstructionLowering : public BinaryFunctionPass { diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt index 01b61bc4c6a5..61ce50fefbb8 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/Passes/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_library(LLVMBOLTPasses FrameOptimizer.cpp HFSort.cpp HFSortPlus.cpp + IndirectCallPromotion.cpp Inliner.cpp ReorderAlgorithm.cpp ) diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/Passes/IndirectCallPromotion.cpp new file mode 100644 index 000000000000..7fb92df815ec --- /dev/null +++ b/bolt/Passes/IndirectCallPromotion.cpp @@ -0,0 +1,829 @@ +//===--- IndirectCallPromotion.cpp - Promote indirect calls ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "IndirectCallPromotion.h" +#include "llvm/Support/Options.h" + +#define DEBUG_TYPE "ICP" + +using namespace llvm; +using namespace bolt; + +namespace opts { + +extern cl::OptionCategory BoltOptCategory; + +extern cl::opt Verbosity; +extern bool shouldProcess(const bolt::BinaryFunction &Function); + +cl::opt +IndirectCallPromotion("indirect-call-promotion", + cl::init(ICP_NONE), + cl::desc("indirect call promotion"), + cl::values( + clEnumValN(ICP_NONE, "none", "do not perform indirect call promotion"), + clEnumValN(ICP_CALLS, "calls", "perform ICP on indirect calls"), + clEnumValN(ICP_JUMP_TABLES, "jump-tables", "perform ICP on jump tables"), + clEnumValN(ICP_ALL, "all", "perform ICP on calls and jump tables"), + clEnumValEnd), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +IndirectCallPromotionThreshold( + "indirect-call-promotion-threshold", + cl::desc("threshold for optimizing a frequently taken indirect call"), + cl::init(90), 
+ cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +IndirectCallPromotionMispredictThreshold( + "indirect-call-promotion-mispredict-threshold", + cl::desc("misprediction threshold for skipping ICP on an " + "indirect call"), + cl::init(2), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +IndirectCallPromotionUseMispredicts( + "indirect-call-promotion-use-mispredicts", + cl::desc("use misprediction frequency for determining whether or not ICP " + "should be applied at a callsite. The " + "-indirect-call-promotion-mispredict-threshold value will be used " + "by this heuristic"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +IndirectCallPromotionTopN( + "indirect-call-promotion-topn", + cl::desc("number of targets to consider when doing indirect " + "call promotion"), + cl::init(1), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::list +ICPFuncsList("icp-funcs", + cl::CommaSeparated, + cl::desc("list of functions to enable ICP for"), + cl::value_desc("func1,func2,func3,..."), + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +ICPOldCodeSequence( + "icp-old-code-sequence", + cl::desc("use old code sequence for promoted calls"), + cl::init(false), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +} // namespace opts + +namespace llvm { +namespace bolt { + +IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF, + const BranchInfo &BI) +: From(BF.getSymbol()), + To(uint64_t(BI.To.Offset)), + Mispreds{uint64_t(BI.Mispreds)}, + Branches{uint64_t(BI.Branches)}, + Histories{BI.Histories} { + if (BI.To.IsSymbol) { + auto &BC = BF.getBinaryContext(); + auto Itr = BC.GlobalSymbols.find(BI.To.Name); + if (Itr != BC.GlobalSymbols.end()) { + To.IsSymbol = true; + To.Sym = BC.getOrCreateGlobalSymbol(Itr->second, "FUNCat"); + To.Addr = 0; + assert(To.Sym); + } + } +} + +// Get list of targets for a given call sorted by most frequently +// called first. 
+std::vector +IndirectCallPromotion::getCallTargets( + BinaryFunction &BF, + const MCInst &Inst +) const { + auto &BC = BF.getBinaryContext(); + std::vector Targets; + + if (const auto *JT = BF.getJumpTable(Inst)) { + const Location From(BF.getSymbol()); + const auto Range = JT->getEntriesForAddress(BC.MIA->getJumpTable(Inst)); + assert(JT->Counts.empty() || JT->Counts.size() >= Range.second); + BinaryFunction::JumpInfo DefaultJI; + const auto *JI = JT->Counts.empty() ? &DefaultJI : &JT->Counts[Range.first]; + const size_t JIAdj = JT->Counts.empty() ? 0 : 1; + for (size_t I = Range.first; I < Range.second; ++I, JI += JIAdj) { + auto *Entry = JT->Entries[I]; + assert(BF.getBasicBlockForLabel(Entry) || + Entry == BF.getFunctionEndLabel() || + Entry == BF.getFunctionColdEndLabel()); + const Location To(Entry); + Callsite CS{From, To, JI->Mispreds, JI->Count, BranchHistories()}; + Targets.emplace_back(CS); + } + + // Sort by symbol then addr. + std::sort(Targets.begin(), Targets.end(), + [](const Callsite &A, const Callsite &B) { + if (A.To.IsSymbol && B.To.IsSymbol) + return A.To.Sym < B.To.Sym; + else if (A.To.IsSymbol && !B.To.IsSymbol) + return true; + else if (!A.To.IsSymbol && B.To.IsSymbol) + return false; + else + return A.To.Addr < B.To.Addr; + }); + + // TODO: I'm going to leave this as is since it will be fixed in + // D5005620 and it ought to make merging easier if there are fewer + // changes. 
+ auto First = Targets.begin(); + auto Last = Targets.end(); + auto Result = First; + while (++First != Last) { + auto &A = *Result; + const auto &B = *First; + if (A.To.IsSymbol && B.To.IsSymbol && A.To.Sym == B.To.Sym) { + A.Mispreds += B.Mispreds; + A.Branches += B.Branches; + } else { + *(++Result) = *First; + } + } + ++Result; + + DEBUG( + if (Targets.end() - Result > 0) { + dbgs() << "BOLT-INFO: ICP: " << (Targets.end() - Result) + << " duplicate targets removed\n"; + }); + + Targets.erase(Result, Targets.end()); + } else { + const auto BranchDataOrErr = BC.DR.getFuncBranchData(BF.getNames()); + const auto &BranchData = BranchDataOrErr.get(); + auto Offset = BC.MIA->getAnnotationAs(Inst, "IndirectBranchData"); + for (const auto &BI : BranchData.getBranchRange(Offset)) { + Callsite Site(BF, BI); + if (Site.isValid()) { + Targets.emplace_back(std::move(Site)); + } + } + } + + // Sort by most commonly called targets. + std::sort(Targets.begin(), Targets.end(), + [](const Callsite &A, const Callsite &B) { + return A.Branches > B.Branches; + }); + + // Remove non-symbol targets + auto Last = std::remove_if(Targets.begin(), + Targets.end(), + [](const Callsite &CS) { + return !CS.To.IsSymbol; + }); + Targets.erase(Last, Targets.end()); + + DEBUG( + if (BF.getJumpTable(Inst)) { + uint64_t TotalCount = 0; + uint64_t TotalMispreds = 0; + for (const auto &S : Targets) { + TotalCount += S.Branches; + TotalMispreds += S.Mispreds; + } + if (!TotalCount) TotalCount = 1; + if (!TotalMispreds) TotalMispreds = 1; + + dbgs() << "BOLT-INFO: ICP: jump table size = " << Targets.size() + << ", Count = " << TotalCount + << ", Mispreds = " << TotalMispreds << "\n"; + + size_t I = 0; + for (const auto &S : Targets) { + dbgs () << "Count[" << I << "] = " << S.Branches << ", " + << format("%.1f", (100.0*S.Branches)/TotalCount) << ", " + << "Mispreds[" << I << "] = " << S.Mispreds << ", " + << format("%.1f", (100.0*S.Mispreds)/TotalMispreds) << "\n"; + ++I; + } + }); + + return Targets; 
+} + +std::vector> +IndirectCallPromotion::findCallTargetSymbols( + BinaryContext &BC, + const std::vector &Targets, + const size_t N +) const { + std::vector> SymTargets; + + for (size_t I = 0; I < N; ++I) { + assert(Targets[I].To.IsSymbol && "All ICP targets must be to known symbols"); + SymTargets.push_back(std::make_pair(Targets[I].To.Sym, 0)); + } + + return SymTargets; +} + +std::vector> +IndirectCallPromotion::rewriteCall(BinaryContext &BC, + BinaryFunction &Function, + BinaryBasicBlock *IndCallBlock, + const MCInst &CallInst, + MCInstrAnalysis::ICPdata &&ICPcode) const { + // Create new basic blocks with correct code in each one first. + std::vector> NewBBs; + const bool IsTailCallOrJT = (BC.MIA->isTailCall(CallInst) || + Function.getJumpTable(CallInst)); + + // Move instructions from the tail of the original call block + // to the merge block. + + // Remember any pseudo instructions following a tail call. These + // must be preserved and moved to the original block. + std::vector TailInsts; + const auto *TailInst= &CallInst; + if (IsTailCallOrJT) { + while (TailInst + 1 < &(*IndCallBlock->end()) && + BC.MII->get((TailInst + 1)->getOpcode()).isPseudo()) { + TailInsts.push_back(*++TailInst); + } + } + + auto MovedInst = IndCallBlock->splitInstructions(&CallInst); + + IndCallBlock->replaceInstruction(&CallInst, ICPcode.front().second); + IndCallBlock->addInstructions(TailInsts.begin(), TailInsts.end()); + + for (auto Itr = ICPcode.begin() + 1; Itr != ICPcode.end(); ++Itr) { + auto &Sym = Itr->first; + auto &Insts = Itr->second; + assert(Sym); + auto TBB = Function.createBasicBlock(0, Sym); + for (auto &Inst : Insts) { // sanitize new instructions. + if (BC.MIA->isCall(Inst)) + BC.MIA->removeAnnotation(Inst, "IndirectBranchData"); + } + TBB->addInstructions(Insts.begin(), Insts.end()); + NewBBs.emplace_back(std::move(TBB)); + } + + // Move tail of instructions from after the original call to + // the merge block. 
+ if (!IsTailCallOrJT) { + NewBBs.back()->addInstructions(MovedInst.begin(), MovedInst.end()); + } else { + // assert(MovedInst.empty()); empty or just CFI + } + + return NewBBs; +} + +BinaryBasicBlock *IndirectCallPromotion::fixCFG( + BinaryContext &BC, + BinaryFunction &Function, + BinaryBasicBlock *IndCallBlock, + const bool IsTailCall, + const bool IsJumpTable, + IndirectCallPromotion::BasicBlocksVector &&NewBBs, + const std::vector &Targets +) const { + using BinaryBranchInfo = BinaryBasicBlock::BinaryBranchInfo; + BinaryBasicBlock *MergeBlock = nullptr; + + auto moveSuccessors = [](BinaryBasicBlock *Old, BinaryBasicBlock *New) { + std::vector OldSucc(Old->successors().begin(), + Old->successors().end()); + std::vector BranchInfo(Old->branch_info_begin(), + Old->branch_info_end()); + + // Remove all successors from the old block. + Old->removeSuccessors(OldSucc.begin(), OldSucc.end()); + assert(Old->succ_empty()); + + // Move them to the new block. + New->addSuccessors(OldSucc.begin(), + OldSucc.end(), + BranchInfo.begin(), + BranchInfo.end()); + + // Update the execution count on the new block. + New->setExecutionCount(Old->getExecutionCount()); + }; + + // Scale indirect call counts to the execution count of the original + // basic block containing the indirect call. + uint64_t TotalIndirectBranches = 0; + uint64_t TotalIndirectMispreds = 0; + for (const auto &BI : Targets) { + TotalIndirectBranches += BI.Branches; + TotalIndirectMispreds += BI.Mispreds; + } + + uint64_t TotalCount = 0; + uint64_t TotalMispreds = 0; + + if (Function.hasValidProfile()) { + TotalCount = IndCallBlock->getExecutionCount(); + TotalMispreds = + TotalCount * ((double)TotalIndirectMispreds / TotalIndirectBranches); + assert(TotalCount != BinaryBasicBlock::COUNT_NO_PROFILE); + } + + // New BinaryBranchInfo scaled to the execution count of the original BB. 
+ std::vector BBI; + for (auto Itr = Targets.begin(); Itr != Targets.end(); ++Itr) { + BBI.push_back( + BinaryBranchInfo{ + uint64_t(TotalCount * ((double)Itr->Branches / TotalIndirectBranches)), + uint64_t(TotalMispreds * ((double)Itr->Mispreds / TotalIndirectMispreds)) + }); + } + + auto BI = BBI.begin(); + auto updateCurrentBranchInfo = [&]{ + assert(BI < BBI.end()); + TotalCount -= BI->Count; + TotalMispreds -= BI->MispredictedCount; + ++BI; + }; + + if (IsTailCall || IsJumpTable) { + if (IsJumpTable) { + moveSuccessors(IndCallBlock, NewBBs.back().get()); + } + + // Fix up successors and execution counts. + updateCurrentBranchInfo(); + if (IsJumpTable) { + assert(Targets[0].To.IsSymbol); + auto *Succ = Function.getBasicBlockForLabel(Targets[0].To.Sym); + IndCallBlock->addSuccessor(Succ, BBI[0]); // cond branch + } + IndCallBlock->addSuccessor(NewBBs[0].get(), TotalCount); // fallthru branch + + for (size_t I = 0; I < NewBBs.size() - 1; ++I) { + assert(TotalCount <= IndCallBlock->getExecutionCount() || + TotalCount <= uint64_t(TotalIndirectBranches)); + uint64_t ExecCount = BBI[I+1].Count; + updateCurrentBranchInfo(); + if (IsJumpTable) { + assert(Targets[I+1].To.IsSymbol); + auto *Succ = Function.getBasicBlockForLabel(Targets[I+1].To.Sym); + NewBBs[I]->addSuccessor(Succ, BBI[I+1]); + } + NewBBs[I]->addSuccessor(NewBBs[I+1].get(), TotalCount); // fallthru + ExecCount += TotalCount; + NewBBs[I]->setCanOutline(IndCallBlock->canOutline()); + NewBBs[I]->setIsCold(IndCallBlock->isCold()); + NewBBs[I]->setExecutionCount(ExecCount); + } + + } else { + assert(NewBBs.size() >= 2); + assert(NewBBs.size() % 2 == 1 || IndCallBlock->succ_empty()); + assert(NewBBs.size() % 2 == 1); + + MergeBlock = NewBBs.back().get(); + + moveSuccessors(IndCallBlock, MergeBlock); + + // Fix up successors and execution counts. 
+ updateCurrentBranchInfo(); + IndCallBlock->addSuccessor(NewBBs[1].get(), TotalCount); // cond branch + IndCallBlock->addSuccessor(NewBBs[0].get(), BBI[0]); // uncond branch + + for (size_t I = 0; I < NewBBs.size() - 2; ++I) { + assert(TotalCount <= IndCallBlock->getExecutionCount() || + TotalCount <= uint64_t(TotalIndirectBranches)); + uint64_t ExecCount = BBI[(I+1)/2].Count; + NewBBs[I]->setCanOutline(IndCallBlock->canOutline()); + NewBBs[I]->setIsCold(IndCallBlock->isCold()); + if (I % 2 == 0) { + NewBBs[I]->addSuccessor(MergeBlock, BBI[(I+1)/2].Count); // uncond + } else { + assert(I + 2 < NewBBs.size()); + updateCurrentBranchInfo(); + NewBBs[I]->addSuccessor(NewBBs[I+2].get(), TotalCount); // uncond branch + NewBBs[I]->addSuccessor(NewBBs[I+1].get(), BBI[(I+1)/2]); // cond. branch + ExecCount += TotalCount; + } + NewBBs[I]->setExecutionCount(ExecCount); + } + + // Arrange for the MergeBlock to be the fallthrough for the first + // promoted call block. + MergeBlock->setCanOutline(IndCallBlock->canOutline()); + MergeBlock->setIsCold(IndCallBlock->isCold()); + std::unique_ptr MBPtr; + std::swap(MBPtr, NewBBs.back()); + NewBBs.pop_back(); + NewBBs.emplace(NewBBs.begin() + 1, std::move(MBPtr)); + // TODO: is COUNT_FALLTHROUGH_EDGE the right thing here? + NewBBs.back()->addSuccessor(MergeBlock, TotalCount); // uncond branch + } + + // cold call block + // TODO: should be able to outline/cold this block. + NewBBs.back()->setExecutionCount(TotalCount); + NewBBs.back()->setCanOutline(IndCallBlock->canOutline()); + NewBBs.back()->setIsCold(IndCallBlock->isCold()); + + // update BB and BB layout. 
+ Function.insertBasicBlocks(IndCallBlock, std::move(NewBBs)); + assert(Function.validateCFG()); + + return MergeBlock; +} + +size_t +IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB, + const MCInst &Inst, + const std::vector &Targets, + uint64_t NumCalls) { + const bool IsJumpTable = BB->getFunction()->getJumpTable(Inst); + + // If we have no targets (or no calls), skip this callsite. + if (Targets.empty() || !NumCalls) { + if (opts::Verbosity >= 1) { + const auto InstIdx = &Inst - &(*BB->begin()); + outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ " + << InstIdx << " in " << BB->getName() + << ", calls = " << NumCalls + << ", targets empty or NumCalls == 0.\n"; + } + return 0; + } + + const auto TrialN = std::min(size_t(opts::IndirectCallPromotionTopN), + Targets.size()); + + if (!opts::ICPFuncsList.empty()) { + for (auto &Name : opts::ICPFuncsList) { + if (BB->getFunction()->hasName(Name)) + return TrialN; + } + return 0; + } + + // Pick the top N targets. + uint64_t TotalCallsTopN = 0; + uint64_t TotalMispredictsTopN = 0; + size_t N = 0; + + if (opts::IndirectCallPromotionUseMispredicts) { + // Count total number of mispredictions for (at most) the top N targets. + // We may choose a smaller N (TrialN vs. N) if the frequency threshold + // is exceeded by fewer targets. + double Threshold = double(opts::IndirectCallPromotionMispredictThreshold); + for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) { + const auto Frequency = (100.0 * Targets[I].Mispreds) / NumCalls; + TotalMispredictsTopN += Targets[I].Mispreds; + if (!IsJumpTable) + TotalNumFrequentCalls += Targets[I].Branches; + else + TotalNumFrequentJmps += Targets[I].Branches; + Threshold -= Frequency; + } + + // Compute the misprediction frequency of the top N call targets. If this + // frequency is greater than the threshold, we should try ICP on this callsite. 
+ const double TopNFrequency = (100.0 * TotalMispredictsTopN) / NumCalls; + + if (TopNFrequency == 0 || + TopNFrequency < opts::IndirectCallPromotionMispredictThreshold) { + if (opts::Verbosity >= 1) { + const auto InstIdx = &Inst - &(*BB->begin()); + outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ " + << InstIdx << " in " << BB->getName() << ", calls = " + << NumCalls << ", top N mis. frequency " + << format("%.1f", TopNFrequency) << "% < " + << opts::IndirectCallPromotionMispredictThreshold << "%\n"; + } + return 0; + } + } else { + // Count total number of calls for (at most) the top N targets. + // We may choose a smaller N (TrialN vs. N) if the frequency threshold + // is exceeded by fewer targets. + double Threshold = double(opts::IndirectCallPromotionThreshold); + for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) { + const auto Frequency = (100.0 * Targets[I].Branches) / NumCalls; + TotalCallsTopN += Targets[I].Branches; + TotalMispredictsTopN += Targets[I].Mispreds; + if (!IsJumpTable) + TotalNumFrequentCalls += Targets[I].Branches; + else + TotalNumFrequentJmps += Targets[I].Branches; + Threshold -= Frequency; + } + + // Compute the frequency of the top N call targets. If this frequency + // is greater than the threshold, we should try ICP on this callsite. + const double TopNFrequency = (100.0 * TotalCallsTopN) / NumCalls; + + if (TopNFrequency == 0 || + TopNFrequency < opts::IndirectCallPromotionThreshold) { + if (opts::Verbosity >= 1) { + const auto InstIdx = &Inst - &(*BB->begin()); + outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ " + << InstIdx << " in " << BB->getName() << ", calls = " + << NumCalls << ", top N frequency " + << format("%.1f", TopNFrequency) << "% < " + << opts::IndirectCallPromotionThreshold << "%\n"; + } + return 0; + } + + // Compute the misprediction frequency of the top N call targets. If + // this frequency is less than the threshold, we should skip ICP at + // this callsite. 
+ const double TopNMispredictFrequency = + (100.0 * TotalMispredictsTopN) / NumCalls; + + if (TopNMispredictFrequency < + opts::IndirectCallPromotionMispredictThreshold) { + if (opts::Verbosity >= 1) { + const auto InstIdx = &Inst - &(*BB->begin()); + outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ " + << InstIdx << " in " << BB->getName() << ", calls = " + << NumCalls << ", top N mispredict frequency " + << format("%.1f", TopNMispredictFrequency) << "% < " + << opts::IndirectCallPromotionMispredictThreshold << "%\n"; + } + return 0; + } + } + + return N; +} + +void +IndirectCallPromotion::printCallsiteInfo(const BinaryBasicBlock *BB, + const MCInst &Inst, + const std::vector &Targets, + const size_t N, + uint64_t NumCalls) const { + auto &BC = BB->getFunction()->getBinaryContext(); + const bool IsTailCall = BC.MIA->isTailCall(Inst); + const bool IsJumpTable = BB->getFunction()->getJumpTable(Inst); + const auto InstIdx = &Inst - &(*BB->begin()); + bool Separator = false; + + outs() << "BOLT-INFO: ICP candidate branch info: " + << *BB->getFunction() << " @ " << InstIdx + << " in " << BB->getName() + << " -> calls = " << NumCalls + << (IsTailCall ? " (tail)" : (IsJumpTable ? " (jump table)" : "")); + for (size_t I = 0; I < N; I++) { + const auto Frequency = 100.0 * Targets[I].Branches / NumCalls; + const auto MisFrequency = 100.0 * Targets[I].Mispreds / NumCalls; + outs() << (Separator ? " | " : ", "); + Separator = true; + if (Targets[I].To.IsSymbol) + outs() << Targets[I].To.Sym->getName(); + else + outs() << Targets[I].To.Addr; + outs() << ", calls = " << Targets[I].Branches + << ", mispreds = " << Targets[I].Mispreds + << ", taken freq = " << format("%.1f", Frequency) << "%" + << ", mis. 
freq = " << format("%.1f", MisFrequency) << "%"; + } + outs() << "\n"; + + DEBUG({ + dbgs() << "BOLT-INFO: ICP original call instruction:\n"; + BC.printInstruction(dbgs(), Inst, Targets[0].From.Addr, nullptr, true); + }); +} + +void IndirectCallPromotion::runOnFunctions( + BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions +) { + if (opts::IndirectCallPromotion == ICP_NONE) + return; + + for (auto &BFIt : BFs) { + auto &Function = BFIt.second; + + if (!Function.isSimple() || !opts::shouldProcess(Function)) + continue; + + const auto BranchDataOrErr = BC.DR.getFuncBranchData(Function.getNames()); + if (const auto EC = BranchDataOrErr.getError()) { + DEBUG(dbgs() << "BOLT-INFO: no branch data found for \"" + << Function << "\"\n"); + continue; + } + const FuncBranchData &BranchData = BranchDataOrErr.get(); + const bool HasLayout = !Function.layout_empty(); + + // Note: this is not just counting calls. + TotalCalls += BranchData.ExecutionCount; + + // Total number of indirect calls issued from the current Function. + // (a fraction of TotalIndirectCalls) + uint64_t FuncTotalIndirectCalls = 0; + uint64_t FuncTotalIndirectJmps = 0; + + std::vector BBs; + for (auto &BB : Function) { + // Skip indirect calls in cold blocks. 
+ if (!HasLayout || !Function.isSplit() || !BB.isCold()) { + BBs.push_back(&BB); + } + } + + while (!BBs.empty()) { + auto *BB = BBs.back(); + BBs.pop_back(); + + for (unsigned Idx = 0; Idx < BB->size(); ++Idx) { + auto &Inst = BB->getInstructionAtIndex(Idx); + const auto InstIdx = &Inst - &(*BB->begin()); + const bool IsTailCall = BC.MIA->isTailCall(Inst); + const bool IsJumpTable = Function.getJumpTable(Inst); + const bool HasBranchData = + BC.MIA->hasAnnotation(Inst, "IndirectBranchData"); + const bool OptimizeCalls = + (opts::IndirectCallPromotion == ICP_CALLS || + opts::IndirectCallPromotion == ICP_ALL); + const bool OptimizeJumpTables = + (opts::IndirectCallPromotion == ICP_JUMP_TABLES || + opts::IndirectCallPromotion == ICP_ALL); + + if (!((HasBranchData && OptimizeCalls) || + (IsJumpTable && OptimizeJumpTables))) + continue; + + assert(BC.MIA->isCall(Inst) || BC.MIA->isIndirectBranch(Inst)); + + if (IsJumpTable) + ++TotalJumpTableCallsites; + else + ++TotalIndirectCallsites; + + const auto Targets = getCallTargets(Function, Inst); + + // Compute the total number of calls from this particular callsite. + uint64_t NumCalls = 0; + for (const auto &BInfo : Targets) { + NumCalls += BInfo.Branches; + } + if (!IsJumpTable) + FuncTotalIndirectCalls += NumCalls; + else + FuncTotalIndirectJmps += NumCalls; + + // Should this callsite be optimized? Return the number of targets + // to use when promoting this call. A value of zero means to skip + // this callsite. + size_t N = canPromoteCallsite(BB, Inst, Targets, NumCalls); + + if (!N) + continue; + + if (opts::Verbosity >= 1) { + printCallsiteInfo(BB, Inst, Targets, N, NumCalls); + } + + // Find MCSymbols or absolute addresses for each call target. + const auto SymTargets = findCallTargetSymbols(BC, Targets, N); + + // If we can't resolve any of the target symbols, punt on this callsite. 
+ if (SymTargets.size() < N) { + const auto LastTarget = SymTargets.size(); + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: ICP failed to find target symbol for " + << Targets[LastTarget].To.Sym->getName() << " in " + << Function << " @ " << InstIdx << " in " + << BB->getName() << ", calls = " << NumCalls << "\n"; + } + continue; + } + + // Generate new promoted call code for this callsite. + auto ICPcode = + BC.MIA->indirectCallPromotion(Inst, + SymTargets, + opts::ICPOldCodeSequence, + BC.Ctx.get()); + + if (ICPcode.empty()) { + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: ICP failed in " << Function << " @ " + << InstIdx << " in " << BB->getName() + << ", calls = " << NumCalls + << ", unable to generate promoted call code.\n"; + } + continue; + } + + DEBUG({ + auto Offset = Targets[0].From.Addr; + dbgs() << "BOLT-INFO: ICP indirect call code:\n"; + for (const auto &entry : ICPcode) { + const auto &Sym = entry.first; + const auto &Insts = entry.second; + if (Sym) dbgs() << Sym->getName() << ":\n"; + Offset = BC.printInstructions(dbgs(), + Insts.begin(), + Insts.end(), + Offset); + } + dbgs() << "---------------------------------------------------\n"; + }); + + // Rewrite the CFG with the newly generated ICP code. + auto NewBBs = rewriteCall(BC, Function, BB, Inst, std::move(ICPcode)); + + // Fix the CFG after inserting the new basic blocks. + auto MergeBlock = fixCFG(BC, Function, BB, IsTailCall, IsJumpTable, + std::move(NewBBs), Targets); + + // Since the tail of the original block was split off and it may contain + // additional indirect calls, we must add the merge block to the set of + // blocks to process. 
+ if (MergeBlock) { + BBs.push_back(MergeBlock); + } + + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: ICP succeeded in " + << Function << " @ " << InstIdx + << " in " << BB->getName() + << " -> calls = " << NumCalls << "\n"; + } + + if (IsJumpTable) + ++TotalOptimizedJumpTableCallsites; + else + ++TotalOptimizedIndirectCallsites; + + Modified.insert(&Function); + } + } + TotalIndirectCalls += FuncTotalIndirectCalls; + TotalIndirectJmps += FuncTotalIndirectJmps; + } + + outs() << "BOLT-INFO: ICP total indirect callsites = " + << TotalIndirectCallsites + << "\n" + << "BOLT-INFO: ICP total jump table callsites = " + << TotalJumpTableCallsites + << "\n" + << "BOLT-INFO: ICP total number of calls = " + << TotalCalls + << "\n" + << "BOLT-INFO: ICP percentage of calls that are indirect = " + << format("%.1f", (100.0 * TotalIndirectCalls) / TotalCalls) + << "%\n" + << "BOLT-INFO: ICP percentage of indirect calls that can be " + "optimized = " + << format("%.1f", (100.0 * TotalNumFrequentCalls) / + std::max(TotalIndirectCalls, 1ul)) + << "%\n" + << "BOLT-INFO: ICP percentage of indirect calls that are optimized = " + << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) / + std::max(TotalIndirectCallsites, 1ul)) + << "%\n" + << "BOLT-INFO: ICP percentage of jump table calls that can be " + "optimized = " + << format("%.1f", (100.0 * TotalNumFrequentJmps) / + std::max(TotalIndirectJmps, 1ul)) + << "%\n" + << "BOLT-INFO: ICP percentage of jump table calls that are optimized = " + << format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) / + std::max(TotalJumpTableCallsites, 1ul)) + << "%\n"; +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/Passes/IndirectCallPromotion.h b/bolt/Passes/IndirectCallPromotion.h new file mode 100644 index 000000000000..eb7e583d6ec1 --- /dev/null +++ b/bolt/Passes/IndirectCallPromotion.h @@ -0,0 +1,217 @@ +//===--- BinaryPasses.h - Binary-level analysis/optimization passes -------===// +// +// The LLVM Compiler 
Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The indirect call promotion (ICP) optimization pass. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_INDIRECT_CALL_PROMOTION_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_INDIRECT_CALL_PROMOTION_H + +#include "BinaryPasses.h" +#include "DataReader.h" + +namespace llvm { +namespace bolt { + +/// Optimize indirect calls. +/// The indirect call promotion pass visits each indirect call and +/// examines the BranchData for each. If the most frequent targets +/// from that callsite exceed the specified threshold (default 90%), +/// the call is promoted. Otherwise, it is ignored. By default, +/// only one target is considered at each callsite. +/// +/// When a candidate callsite is processed, we modify the callsite +/// to test for the most common call targets before calling through +/// the original generic call mechanism. +/// +/// The CFG and layout are modified by ICP. +/// +/// A few new command line options have been added: +/// -indirect-call-promotion=[none,call,jump-tables,all] +/// -indirect-call-promotion-threshold= +/// -indirect-call-promotion-mispredict-threshold= +/// -indirect-call-promotion-topn= +/// +/// The threshold is the minimum frequency of a call target needed +/// before ICP is triggered. +/// +/// The mispredict threshold is used to disable the optimization at +/// any callsite where the branch predictor does a good enough job +/// that ICP wouldn't help regardless of the frequency of the most +/// common target. +/// +/// The topn option controls the number of targets to consider for +/// each callsite, e.g. ICP is triggered if topn=2 and the total +/// frequency of the top two call targets exceeds the threshold. 
+/// +/// The minimize code size option controls whether or not the hot +/// calls are to registers (callq %r10) or to function addresses +/// (callq $foo). +/// +/// Example of ICP: +/// +/// C++ code: +/// +/// int B_count = 0; +/// int C_count = 0; +/// +/// struct A { virtual void foo() = 0; } +/// struct B : public A { virtual void foo() { ++B_count; }; }; +/// struct C : public A { virtual void foo() { ++C_count; }; }; +/// +/// A* a = ... +/// a->foo(); +/// ... +/// +/// original assembly: +/// +/// B0: 49 8b 07 mov (%r15),%rax +/// 4c 89 ff mov %r15,%rdi +/// ff 10 callq *(%rax) +/// 41 83 e6 01 and $0x1,%r14d +/// 4d 89 e6 mov %r12,%r14 +/// 4c 0f 44 f5 cmove %rbp,%r14 +/// 4c 89 f7 mov %r14,%rdi +/// ... +/// +/// after ICP: +/// +/// B0: 49 8b 07 mov (%r15),%rax +/// 4c 89 ff mov %r15,%rdi +/// 48 81 38 e0 0b 40 00 cmpq $B::foo,(%rax) +/// 75 29 jne B3 +/// B1: e8 45 03 00 00 callq $B::foo +/// B2: 41 83 e6 01 and $0x1,%r14d +/// 4d 89 e6 mov %r12,%r14 +/// 4c 0f 44 f5 cmove %rbp,%r14 +/// 4c 89 f7 mov %r14,%rdi +/// ... 
+/// +/// B3: ff 10 callq *(%rax) +/// eb d6 jmp B2 +/// +class IndirectCallPromotion : public BinaryFunctionPass { + using BasicBlocksVector = std::vector>; + struct Location { + bool IsSymbol{false}; + MCSymbol *Sym{nullptr}; + uint64_t Addr{0}; + bool isValid() const { + return (IsSymbol && Sym) || (!IsSymbol && Addr != 0); + } + Location() { } + explicit Location(MCSymbol *Sym) : IsSymbol(true), Sym(Sym) { } + explicit Location(uint64_t Addr) : Addr(Addr) { } + }; + + struct Callsite { + Location From; + Location To; + uint64_t Mispreds{0}; + uint64_t Branches{0}; + BranchHistories Histories; + bool isValid() const { + return From.isValid() && To.isValid(); + } + Callsite(BinaryFunction &BF, const BranchInfo &BI); + Callsite(const Location &From, const Location &To, + uint64_t Mispreds, uint64_t Branches, + const BranchHistories &Histories) + : From(From), To(To), Mispreds(Mispreds), Branches(Branches), + Histories(Histories) { } + }; + + std::unordered_set Modified; + // Total number of calls from all callsites. + uint64_t TotalCalls{0}; + + // Total number of indirect calls from all callsites. + // (a fraction of TotalCalls) + uint64_t TotalIndirectCalls{0}; + + // Total number of jmp table calls from all callsites. + // (a fraction of TotalCalls) + uint64_t TotalIndirectJmps{0}; + + // Total number of callsites that use indirect calls. + // (the total number of callsites is not recorded) + uint64_t TotalIndirectCallsites{0}; + + // Total number of callsites that are jump tables. + uint64_t TotalJumpTableCallsites{0}; + + // Total number of indirect callsites that are optimized by ICP. + // (a fraction of TotalIndirectCallsites) + uint64_t TotalOptimizedIndirectCallsites{0}; + + // Total number of jump table callsites that are optimized by ICP. + uint64_t TotalOptimizedJumpTableCallsites{0}; + + // Total number of indirect calls that are optimized by ICP. 
+ // (a fraction of TotalCalls) + uint64_t TotalNumFrequentCalls{0}; + + // Total number of jump table calls that are optimized by ICP. + // (a fraction of TotalCalls) + uint64_t TotalNumFrequentJmps{0}; + + std::vector getCallTargets(BinaryFunction &BF, + const MCInst &Inst) const; + + size_t canPromoteCallsite(const BinaryBasicBlock *BB, + const MCInst &Inst, + const std::vector &Targets, + uint64_t NumCalls); + + void printCallsiteInfo(const BinaryBasicBlock *BB, + const MCInst &Inst, + const std::vector &Targets, + const size_t N, + uint64_t NumCalls) const; + + std::vector> + findCallTargetSymbols(BinaryContext &BC, + const std::vector &Targets, + const size_t N) const; + + std::vector> + rewriteCall(BinaryContext &BC, + BinaryFunction &Function, + BinaryBasicBlock *IndCallBlock, + const MCInst &CallInst, + MCInstrAnalysis::ICPdata &&ICPcode) const; + + BinaryBasicBlock *fixCFG(BinaryContext &BC, + BinaryFunction &Function, + BinaryBasicBlock *IndCallBlock, + const bool IsTailCall, + const bool IsJumpTable, + BasicBlocksVector &&NewBBs, + const std::vector &Targets) const; + + public: + explicit IndirectCallPromotion(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { } + + const char *getName() const override { + return "indirect-call-promotion"; + } + bool shouldPrint(const BinaryFunction &BF) const override { + return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0; + } + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/Passes/Inliner.cpp b/bolt/Passes/Inliner.cpp index c376700f0526..3374e6d8bc8c 100644 --- a/bolt/Passes/Inliner.cpp +++ b/bolt/Passes/Inliner.cpp @@ -37,7 +37,6 @@ ForceInlineFunctions("force-inline", } - namespace llvm { namespace bolt { From d8dad45b05f56ce2af3d86be55b334266de655b3 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 22 May 2017 17:17:04 -0700 Subject: [PATCH 250/904] [BOLT] 
Fix debug info for input with continuous range. Summary: When we see a compilation unit with continuous range on input, it has two attributes: DW_AT_low_pc and DW_AT_high_pc. We convert the range to a non-continuous one and change the attributes to DW_AT_ranges and DW_AT_producer. However, gdb seems to expect every compilation unit to have a base address specified via DW_AT_low_pc, even when its value is always 0. Otherwise gdb will not show proper debug info for such modules. With this diff we produce DW_AT_ranges followed by DW_AT_low_pc. The problem is that the first attribute takes DW_FORM_sec_offset which is exactly 4 bytes, and in many cases we are left with 12 bytes to fill in. We used to fill this space with DW_AT_producer, which took an arbitrary-length field. For DW_AT_low_pc we can use a trick of using DW_FORM_udata (unsigned ULEB128 encoded integer) which can take up to 12 bytes, even when the value is 0. (cherry picked from commit 55d8927846e5b0f6191bcbdc709af35936897b70) --- bolt/DWARFRewriter.cpp | 51 +++++++++++++++++++----------------------- bolt/DebugData.cpp | 13 +++++++++++ bolt/DebugData.h | 5 +++++ 3 files changed, 41 insertions(+), 28 deletions(-) diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp index a7c59c05bd70..c3e1eca911e6 100644 --- a/bolt/DWARFRewriter.cpp +++ b/bolt/DWARFRewriter.cpp @@ -174,15 +174,18 @@ void RewriteInstance::updateDWARFObjectAddressRanges( DIE->getAttributeValue(Unit, dwarf::DW_AT_ranges, FormValue, &AttrOffset); DebugInfoPatcher->addLE32Patch(AttrOffset, DebugRangesOffset); } else { - // Case 2: The object has both DW_AT_low_pc and DW_AT_high_pc. - // We require the compiler to put both attributes one after the other - // for our approach to work. low_pc and high_pc both occupy 8 bytes - // as we're dealing with a 64-bit ELF. We basically change low_pc to - // DW_AT_ranges and high_pc to DW_AT_producer. 
ranges spans only 4 bytes - in 32-bit DWARF, which we assume to be used, which leaves us with 12 - more bytes. We then set the value of DW_AT_producer as an arbitrary - 12-byte string that fills the remaining space and leaves the rest of - the abbreviation layout unchanged. + // Case 2: The object has both DW_AT_low_pc and DW_AT_high_pc emitted back + // to back. We replace the attributes with DW_AT_ranges and DW_AT_low_pc. + // The low_pc attribute is required for DW_TAG_compile_units to set a base + // address. + // + // Since DW_AT_ranges takes 4-byte DW_FORM_sec_offset value, we have to fill + // in up to 12-bytes left after removal of low/high pc field from + // .debug_info. + // + // To fill in the gap we use a variable length DW_FORM_udata encoding for + // DW_AT_low_pc. We exploit the fact that the encoding can take an arbitrarily + // large size. if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_low_pc) != -1U && AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_high_pc) != -1U) { uint32_t LowPCOffset = -1U; @@ -197,19 +200,15 @@ void RewriteInstance::updateDWARFObjectAddressRanges( (HighPCFormValue.getForm() != dwarf::DW_FORM_addr && HighPCFormValue.getForm() != dwarf::DW_FORM_data8 && HighPCFormValue.getForm() != dwarf::DW_FORM_data4)) { - if (opts::Verbosity >= 1) { - errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE " + errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE " << "at offset 0x" << Twine::utohexstr(DIE->getOffset()) << "\n"; - } return; } if (LowPCOffset == -1U || (LowPCOffset + 8 != HighPCOffset)) { - if (opts::Verbosity >= 1) { - errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. " - << "Cannot update DIE at offset 0x" - << Twine::utohexstr(DIE->getOffset()) << '\n'; - } + errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. 
" + << "Cannot update DIE at offset 0x" + << Twine::utohexstr(DIE->getOffset()) << '\n'; return; } @@ -221,23 +220,19 @@ void RewriteInstance::updateDWARFObjectAddressRanges( AbbrevPatcher->addAttributePatch(Unit, AbbrevCode, dwarf::DW_AT_high_pc, - dwarf::DW_AT_producer, - dwarf::DW_FORM_string); - unsigned StringSize = 0; + dwarf::DW_AT_low_pc, + dwarf::DW_FORM_udata); + unsigned LowPCSize = 0; if (HighPCFormValue.getForm() == dwarf::DW_FORM_addr || HighPCFormValue.getForm() == dwarf::DW_FORM_data8) { - StringSize = 12; + LowPCSize = 12; } else if (HighPCFormValue.getForm() == dwarf::DW_FORM_data4) { - StringSize = 8; + LowPCSize = 8; } else { - assert(0 && "unexpected form"); + llvm_unreachable("unexpected form"); } - DebugInfoPatcher->addLE32Patch(LowPCOffset, DebugRangesOffset); - std::string ProducerString{"LLVM-BOLT"}; - ProducerString.resize(StringSize, ' '); - ProducerString.back() = '\0'; - DebugInfoPatcher->addBinaryPatch(LowPCOffset + 4, ProducerString); + DebugInfoPatcher->addUDataPatch(LowPCOffset + 4, 0, LowPCSize); } else { if (opts::Verbosity >= 1) { errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x" diff --git a/bolt/DebugData.cpp b/bolt/DebugData.cpp index fd19e6a3fc47..2a18d207716b 100644 --- a/bolt/DebugData.cpp +++ b/bolt/DebugData.cpp @@ -15,6 +15,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/LEB128.h" #include #include @@ -270,6 +271,18 @@ void SimpleBinaryPatcher::addLEPatch(uint32_t Offset, uint64_t NewValue, Patches.emplace_back(std::make_pair(Offset, LE64)); } +void SimpleBinaryPatcher::addUDataPatch(uint32_t Offset, uint64_t Value, uint64_t Size) { + const auto EncodedSize = getULEB128Size(Value); + assert(EncodedSize <= Size && "value did not fit"); + + const auto Padding = Size - EncodedSize; + std::string Buff; + raw_string_ostream OS(Buff); + encodeULEB128(Value, OS, Padding); + + Patches.emplace_back(Offset, OS.str()); +} + void 
SimpleBinaryPatcher::addLE64Patch(uint32_t Offset, uint64_t NewValue) { addLEPatch(Offset, NewValue, 8); } diff --git a/bolt/DebugData.h b/bolt/DebugData.h index b1f984207a1c..7108d506aca7 100644 --- a/bolt/DebugData.h +++ b/bolt/DebugData.h @@ -345,6 +345,11 @@ class SimpleBinaryPatcher : public BinaryPatcher { /// little-endian value at offset \p Offset. void addLE32Patch(uint32_t Offset, uint32_t NewValue); + /// Add a patch at \p Offset with \p Value using unsigned LEB128 encoding with + /// size \p Size. \p Size should not be less than a minimum number of bytes + /// needed to encode \p Value. + void addUDataPatch(uint32_t Offset, uint64_t Value, uint64_t Size); + void patchBinary(std::string &BinaryContents) override; }; From e3bc35e51ad69c90ed72e31a524155b07cff20bc Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 1 May 2017 16:51:27 -0700 Subject: [PATCH 251/904] [BOLT] Add dataflow infrastructure Summary: This diff introduces a common infrastructure for performing dataflow analyses in BinaryFunctions as well as a few analyses that are useful in a variety of scenarios. The largest user of this infrastructure so far is shrink wrapping, which will be added in a separate diff. 
(cherry picked from commit e4e76133a915fb7ca81a6626b69ce9d28984dcca) --- bolt/BinaryFunction.h | 30 ++ bolt/Passes/CMakeLists.txt | 5 + bolt/Passes/DataflowAnalysis.cpp | 40 ++ bolt/Passes/DataflowAnalysis.h | 529 ++++++++++++++++++++ bolt/Passes/DataflowInfoManager.cpp | 170 +++++++ bolt/Passes/DataflowInfoManager.h | 72 +++ bolt/Passes/DominatorAnalysis.h | 141 ++++++ bolt/Passes/FrameAnalysis.cpp | 703 +++++++++++++++++++++++++++ bolt/Passes/FrameAnalysis.h | 262 ++++++++++ bolt/Passes/FrameOptimizer.cpp | 12 +- bolt/Passes/LivenessAnalysis.cpp | 19 + bolt/Passes/LivenessAnalysis.h | 79 +++ bolt/Passes/ReachingDefOrUse.h | 126 +++++ bolt/Passes/ReachingInsns.h | 84 ++++ bolt/Passes/StackPointerTracking.cpp | 28 ++ bolt/Passes/StackPointerTracking.h | 203 ++++++++ 16 files changed, 2499 insertions(+), 4 deletions(-) create mode 100644 bolt/Passes/DataflowAnalysis.cpp create mode 100644 bolt/Passes/DataflowAnalysis.h create mode 100644 bolt/Passes/DataflowInfoManager.cpp create mode 100644 bolt/Passes/DataflowInfoManager.h create mode 100644 bolt/Passes/DominatorAnalysis.h create mode 100644 bolt/Passes/FrameAnalysis.cpp create mode 100644 bolt/Passes/FrameAnalysis.h create mode 100644 bolt/Passes/LivenessAnalysis.cpp create mode 100644 bolt/Passes/LivenessAnalysis.h create mode 100644 bolt/Passes/ReachingDefOrUse.h create mode 100644 bolt/Passes/ReachingInsns.h create mode 100644 bolt/Passes/StackPointerTracking.cpp create mode 100644 bolt/Passes/StackPointerTracking.h diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 05d7d273459e..15a48d3ee0bc 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -974,6 +974,20 @@ class BinaryFunction : public AddressRangesOwner { return nullptr; } + /// Retrieve the landing pad BB associated with invoke instruction \p Invoke + /// that is in \p BB. 
Return nullptr if none exists + BinaryBasicBlock *getLandingPadBBFor(const BinaryBasicBlock &BB, + const MCInst &InvokeInst) { + assert(BC.MIA->isInvoke(InvokeInst) && "must be invoke instruction"); + MCLandingPad LP = BC.MIA->getEHInfo(InvokeInst); + if (LP.first) { + auto *LBB = BB.getLandingPad(LP.first); + assert (LBB && "Landing pad should be defined"); + return LBB; + } + return nullptr; + } + /// Return the name of the function as extracted from the binary file. /// If the function has multiple names - return the last one /// followed by "(*#)". @@ -1412,6 +1426,14 @@ class BinaryFunction : public AddressRangesOwner { return; } + BinaryBasicBlock::iterator addCFIInstruction(BinaryBasicBlock *BB, + BinaryBasicBlock::iterator Pos, + MCCFIInstruction &&Inst) { + auto Idx = FrameInstructions.size(); + FrameInstructions.emplace_back(std::forward(Inst)); + return addCFIPseudo(BB, Pos, Idx); + } + /// Insert a CFI pseudo instruction in a basic block. This pseudo instruction /// is a placeholder that refers to a real MCCFIInstruction object kept by /// this function that will be emitted at that position. @@ -1424,6 +1446,14 @@ class BinaryFunction : public AddressRangesOwner { } /// Retrieve the MCCFIInstruction object associated with a CFI pseudo. 
+ MCCFIInstruction* getCFIFor(const MCInst &Instr) { + if (!BC.MIA->isCFI(Instr)) + return nullptr; + uint32_t Offset = Instr.getOperand(0).getImm(); + assert(Offset < FrameInstructions.size() && "Invalid CFI offset"); + return &FrameInstructions[Offset]; + } + const MCCFIInstruction* getCFIFor(const MCInst &Instr) const { if (!BC.MIA->isCFI(Instr)) return nullptr; diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt index 61ce50fefbb8..b764de69c4c5 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/Passes/CMakeLists.txt @@ -1,11 +1,16 @@ add_llvm_library(LLVMBOLTPasses BinaryPasses.cpp + DataflowAnalysis.cpp + DataflowInfoManager.cpp + FrameAnalysis.cpp FrameOptimizer.cpp HFSort.cpp HFSortPlus.cpp IndirectCallPromotion.cpp Inliner.cpp + LivenessAnalysis.cpp ReorderAlgorithm.cpp + StackPointerTracking.cpp ) include_directories( ${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt ) diff --git a/bolt/Passes/DataflowAnalysis.cpp b/bolt/Passes/DataflowAnalysis.cpp new file mode 100644 index 000000000000..e3a1894a930b --- /dev/null +++ b/bolt/Passes/DataflowAnalysis.cpp @@ -0,0 +1,40 @@ +#include "DataflowAnalysis.h" + +namespace llvm { +namespace bolt { + +void doForAllPreds(const BinaryContext &BC, const BinaryBasicBlock &BB, + std::function Task) { + for (auto Pred : BB.predecessors()) { + if (Pred->isValid()) + Task(ProgramPoint::getLastPointAt(*Pred)); + } + if (!BB.isLandingPad()) + return; + for (auto Thrower : BB.throwers()) { + for (auto &Inst : *Thrower) { + if (!BC.MIA->isInvoke(Inst) || + BC.MIA->getEHInfo(Inst).first != BB.getLabel()) + continue; + Task(ProgramPoint(&Inst)); + } + } +} + +/// Operates on all successors of a basic block. 
+void doForAllSuccs(const BinaryBasicBlock &BB, + std::function Task) { + for (auto Succ : BB.successors()) { + if (Succ->isValid()) + Task(ProgramPoint::getFirstPointAt(*Succ)); + } +} + +} // namespace bolt +} // namespace llvm + +llvm::raw_ostream &llvm::operator<<(llvm::raw_ostream &OS, + const BitVector &Val) { + OS << "BitVector"; + return OS; +} diff --git a/bolt/Passes/DataflowAnalysis.h b/bolt/Passes/DataflowAnalysis.h new file mode 100644 index 000000000000..1252be07eaa2 --- /dev/null +++ b/bolt/Passes/DataflowAnalysis.h @@ -0,0 +1,529 @@ +//===--- Passes/DataflowAnalysis.h ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_DATAFLOWANALYSIS_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_DATAFLOWANALYSIS_H + +#include "BinaryContext.h" +#include "BinaryFunction.h" +#include "llvm/Support/Timer.h" +#include + +namespace llvm { +namespace bolt { + +/// Represents a given program point as viewed by a dataflow analysis. This +/// point is a location that may be either an instruction or a basic block. +/// Example: +/// +/// BB1: --> ProgramPoint 1 (stored as bb *) +/// add --> ProgramPoint 2 (stored as inst *) +/// sub --> ProgramPoint 3 (stored as inst *) +/// jmp --> ProgramPoint 4 (stored as inst *) +/// +/// ProgramPoints allow us to attach a state to any location in the program +/// and is a core concept used in the dataflow analysis engine. +/// +/// A dataflow analysis will associate a state with a program point. 
In +/// analyses whose direction is forward, this state tracks what happened after +/// the execution of an instruction, and the BB tracks the state of what +/// happened before the execution of the first instruction in this BB. For +/// backwards dataflow analyses, state tracks what happened before the +/// execution of a given instruction, while the state associated with a BB +/// tracks what happened after the execution of the last instruction of a BB. +class ProgramPoint { + enum IDTy : bool { BB = 0, Inst } ID; + + union DataU { + BinaryBasicBlock *BB; + MCInst *Inst; + DataU(BinaryBasicBlock *BB) : BB(BB) {} + DataU(MCInst *Inst) : Inst(Inst) {} + } Data; + +public: + ProgramPoint() : ID(IDTy::BB), Data((MCInst *)nullptr) {} + ProgramPoint(BinaryBasicBlock *BB) : ID(IDTy::BB), Data(BB) {} + ProgramPoint(MCInst *Inst) : ID(IDTy::Inst), Data(Inst) {} + + /// Convenience function to access the last program point of a basic block, + /// which is equal to its last instruction. If it is empty, it is equal to + /// itself. + static ProgramPoint getLastPointAt(BinaryBasicBlock &BB) { + auto Last = BB.rbegin(); + if (Last != BB.rend()) + return ProgramPoint(&*Last); + return ProgramPoint(&BB); + } + + /// Similar to getLastPointAt. 
+ static ProgramPoint getFirstPointAt(BinaryBasicBlock &BB) { + auto First = BB.begin(); + if (First != BB.end()) + return ProgramPoint(&*First); + return ProgramPoint(&BB); + } + + void operator=(const ProgramPoint &PP) { + ID = PP.ID; + Data.BB = PP.Data.BB; + } + bool operator<(const ProgramPoint &PP) const { return Data.BB < PP.Data.BB; } + bool operator==(const ProgramPoint &PP) const { + return Data.BB == PP.Data.BB; + } + + bool isBB() const { return ID == IDTy::BB; } + bool isInst() const { return ID == IDTy::Inst; } + + BinaryBasicBlock *getBB() const { + assert(isBB()); + return Data.BB; + } + MCInst *getInst() const { + assert(isInst()); + return Data.Inst; + } + + friend DenseMapInfo; +}; + +/// Convenience function to operate on all predecessors of a BB, as viewed +/// by a dataflow analysis. This includes throw sites if it is a landing pad. +void doForAllPreds(const BinaryContext &BC, const BinaryBasicBlock &BB, + std::function Task); + +/// Operates on all successors of a basic block. +void doForAllSuccs(const BinaryBasicBlock &BB, + std::function Task); + +/// Base class for dataflow analyses. Depends on the type of whatever object is +/// stored as the state (StateTy) at each program point. The dataflow then +/// updates the state at each program point depending on the instruction being +/// processed, iterating until all points converge and agree on a state value. +/// Remember that depending on how you formulate your dataflow equation, this +/// may not converge and will loop indefinitely. +/// /p Backward indicates the direction of the dataflow. If false, direction is +/// forward. +/// +/// Example: Compute the set of live registers at each program point. +/// +/// Modelling: +/// Let State be the set of registers that are live. The kill set of a +/// point is the set of all registers clobbered by the instruction at this +/// program point. The gen set is the set of all registers read by it. 
+/// +/// out{b} = Union (s E succs{b}) {in{s}} +/// in{b} = (out{b} - kill{b}) U gen{b} +/// +/// Template parameters: +/// StateTy = BitVector, where each index corresponds to a machine register +/// Backward = true (live reg operates in reverse order) +/// +/// Subclass implementation notes: +/// Confluence operator = union (if a reg is alive in any succ, it is alive +/// in the current block). +/// +template +class DataflowAnalysis { + /// CRTP convenience methods + Derived &derived() { + return *static_cast(this); + } + + const Derived &const_derived() const { + return *static_cast(this); + } + +protected: + const BinaryContext &BC; + /// Reference to the function being analysed + BinaryFunction &Func; + + /// Tracks the state at basic block start (end) if direction of the dataflow + /// is forward (backward). + std::unordered_map StateAtBBEntry; + /// Map a point to its previous (succeeding) point if the direction of the + /// dataflow is forward (backward). This is used to support convenience + /// methods to access the resulting state before (after) a given instruction, + /// otherwise our clients need to keep "prev" pointers themselves. 
+ DenseMap PrevPoint; + + /// Perform any bookkeeping before dataflow starts + void preflight() { + llvm_unreachable("Unimplemented method"); + } + + /// Sets initial state for each BB + StateTy getStartingStateAtBB(const BinaryBasicBlock &BB) { + llvm_unreachable("Unimplemented method"); + } + + /// Sets initial state for each instruction (out set) + StateTy getStartingStateAtPoint(const MCInst &Point) { + llvm_unreachable("Unimplemented method"); + } + + /// Computes the in set for the first instruction in a BB by applying the + /// confluence operator to the out sets of the last instruction of each pred + /// (in case of a backwards dataflow, we will operate on the in sets of each + /// successor to determine the starting state of the last instruction of the + /// current BB) + void doConfluence(StateTy &StateOut, const StateTy &StateIn) { + llvm_unreachable("Unimplemented method"); + } + + /// In case of a forwards dataflow, compute the in set for the first + /// instruction in a Landing Pad considering all out sets for associated + /// throw sites. + /// In case of a backwards dataflow, compute the in set of a invoke + /// instruction considering in sets for the first instructions of its + /// landing pads. + void doConfluenceWithLP(StateTy &StateOut, const StateTy &StateIn, + const MCInst &Invoke) { + return derived().doConfluence(StateOut, StateIn); + } + + /// Returns the out set of an instruction given its in set. + /// If backwards, computes the in set given its out set. 
+ StateTy computeNext(const MCInst &Point, const StateTy &Cur) { + llvm_unreachable("Unimplemented method"); + return StateTy(); + } + + /// Returns the MCAnnotation name + StringRef getAnnotationName() const { + llvm_unreachable("Unimplemented method"); + return StringRef(""); + } + + /// Private getter methods accessing state in a read-write fashion + StateTy &getOrCreateStateAt(const BinaryBasicBlock &BB) { + return StateAtBBEntry[&BB]; + } + + StateTy &getOrCreateStateAt(MCInst &Point) { + return BC.MIA->getOrCreateAnnotationAs( + BC.Ctx.get(), Point, derived().getAnnotationName()); + } + + StateTy &getOrCreateStateAt(ProgramPoint Point) { + if (Point.isBB()) + return getOrCreateStateAt(*Point.getBB()); + return getOrCreateStateAt(*Point.getInst()); + } + +public: + /// If the direction of the dataflow is forward, operates on the last + /// instruction of all predecessors when performing an iteration of the + /// dataflow equation for the start of this BB. If backwards, operates on + /// the first instruction of all successors. + void doForAllSuccsOrPreds(const BinaryBasicBlock &BB, + std::function Task) { + if (!Backward) + return doForAllPreds(BC, BB, Task); + return doForAllSuccs(BB, Task); + } + + /// We need the current binary context and the function that will be processed + /// in this dataflow analysis. + DataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF) + : BC(BC), Func(BF) {} + virtual ~DataflowAnalysis() { + cleanAnnotations(); + } + + /// Track the state at basic block start (end) if direction of the dataflow + /// is forward (backward). + ErrorOr getStateAt(const BinaryBasicBlock &BB) const { + auto Iter = StateAtBBEntry.find(&BB); + if (Iter == StateAtBBEntry.end()) + return make_error_code(errc::result_out_of_range); + return Iter->second; + } + + /// Track the state at the end (start) of each MCInst in this function if + /// the direction of the dataflow is forward (backward). 
+ ErrorOr getStateAt(const MCInst &Point) const { + return BC.MIA->tryGetAnnotationAs( + Point, const_derived().getAnnotationName()); + } + + /// Return the out set (in set) of a given program point if the direction of + /// the dataflow is forward (backward). + ErrorOr getStateAt(ProgramPoint Point) const { + if (Point.isBB()) + return getStateAt(*Point.getBB()); + return getStateAt(*Point.getInst()); + } + + ErrorOr getStateBefore(const MCInst &Point) { + return getStateAt(PrevPoint[&Point]); + } + + /// Return the in set (out set) of a given program point if the direction of + /// the dataflow is forward (backward). + ErrorOrgetStateBefore(ProgramPoint Point) { + if (Point.isBB()) + return getStateAt(*Point.getBB()); + return getStateAt(PrevPoint[Point.getInst()]); + } + + /// Remove any state annotations left by this analysis + void cleanAnnotations() { + for (auto &BB : Func) { + for (auto &Inst : BB) { + BC.MIA->removeAnnotation(Inst, derived().getAnnotationName()); + } + } + } + + /// Public entry point that will perform the entire analysis form start to + /// end. + void run() { + derived().preflight(); + + // Initialize state for all points of the function + for (auto &BB : Func) { + auto &St = getOrCreateStateAt(BB); + St = derived().getStartingStateAtBB(BB); + for (auto &Inst : BB) { + auto &St = getOrCreateStateAt(Inst); + St = derived().getStartingStateAtPoint(Inst); + } + } + assert(Func.begin() != Func.end() && "Unexpected empty function"); + + std::queue Worklist; + // TODO: Pushing this in a DFS ordering will greatly speed up the dataflow + // performance. + if (!Backward) { + for (auto &BB : Func) { + Worklist.push(&BB); + MCInst *Prev = nullptr; + for (auto &Inst : BB) { + PrevPoint[&Inst] = Prev ? 
ProgramPoint(Prev) : ProgramPoint(&BB); + Prev = &Inst; + } + } + } else { + for (auto I = Func.rbegin(), E = Func.rend(); I != E; ++I) { + Worklist.push(&*I); + MCInst *Prev = nullptr; + for (auto J = (*I).rbegin(), E2 = (*I).rend(); J != E2; ++J) { + auto &Inst = *J; + PrevPoint[&Inst] = Prev ? ProgramPoint(Prev) : ProgramPoint(&*I); + Prev = &Inst; + } + } + } + + // Main dataflow loop + while (!Worklist.empty()) { + auto *BB = Worklist.front(); + Worklist.pop(); + + // Calculate state at the entry of first instruction in BB + StateTy StateAtEntry = getOrCreateStateAt(*BB); + if (BB->isLandingPad()) { + doForAllSuccsOrPreds(*BB, [&](ProgramPoint P) { + if (P.isInst() && BC.MIA->isInvoke(*P.getInst())) + derived().doConfluenceWithLP(StateAtEntry, *getStateAt(P), + *P.getInst()); + else + derived().doConfluence(StateAtEntry, *getStateAt(P)); + }); + } else { + doForAllSuccsOrPreds(*BB, [&](ProgramPoint P) { + derived().doConfluence(StateAtEntry, *getStateAt(P)); + }); + } + + bool Changed = false; + StateTy &St = getOrCreateStateAt(*BB); + if (St != StateAtEntry) { + Changed = true; + St = std::move(StateAtEntry); + } + + // Propagate information from first instruction down to the last one + StateTy *PrevState = &St; + const MCInst *LAST = nullptr; + if (!Backward) + LAST = &*BB->rbegin(); + else + LAST = &*BB->begin(); + + auto doNext = [&] (MCInst &Inst, const BinaryBasicBlock &BB) { + StateTy CurState = derived().computeNext(Inst, *PrevState); + + if (Backward && BC.MIA->isInvoke(Inst)) { + auto *LBB = Func.getLandingPadBBFor(BB, Inst); + if (LBB) { + auto First = LBB->begin(); + if (First != LBB->end()) { + derived().doConfluenceWithLP(CurState, + getOrCreateStateAt(&*First), Inst); + } else { + derived().doConfluenceWithLP(CurState, getOrCreateStateAt(LBB), + Inst); + } + } + } + + StateTy &St = getOrCreateStateAt(Inst); + if (St != CurState) { + St = CurState; + if (&Inst == LAST) + Changed = true; + } + PrevState = &St; + }; + + if (!Backward) { + for (auto 
&Inst : *BB) { + doNext(Inst, *BB); + } + } else { + for (auto I = BB->rbegin(), E = BB->rend(); I != E; ++I) { + doNext(*I, *BB); + } + } + + if (Changed) { + if (!Backward) { + for (auto Succ : BB->successors()) { + Worklist.push(Succ); + } + for (auto LandingPad : BB->landing_pads()) { + Worklist.push(LandingPad); + } + } else { + for (auto Pred : BB->predecessors()) { + Worklist.push(Pred); + } + for (auto Thrower : BB->throwers()) { + Worklist.push(Thrower); + } + } + } + } // end while (!Worklist.empty()) + } +}; + +/// Define an iterator for navigating the expressions calculated by a +/// dataflow analysis at each program point, when they are backed by a +/// BitVector. +class ExprIterator + : public std::iterator { + const BitVector *BV; + const std::vector &Expressions; + int Idx; + +public: + ExprIterator &operator++() { + assert(Idx != -1 && "Iterator already at the end"); + Idx = BV->find_next(Idx); + return *this; + } + ExprIterator operator++(int) { + assert(Idx != -1 && "Iterator already at the end"); + ExprIterator Ret = *this; + ++(*this); + return Ret; + } + bool operator==(const ExprIterator &Other) const { return Idx == Other.Idx; } + bool operator!=(const ExprIterator &Other) const { return Idx != Other.Idx; } + const MCInst *operator*() { + assert(Idx != -1 && "Invalid access to end iterator"); + return Expressions[Idx]; + } + ExprIterator(const BitVector *BV, const std::vector &Exprs) + : BV(BV), Expressions(Exprs) { + Idx = BV->find_first(); + } + ExprIterator(const BitVector *BV, const std::vector &Exprs, + int Idx) + : BV(BV), Expressions(Exprs), Idx(Idx) {} + + int getBitVectorIndex() const { + return Idx; + } +}; + +/// Specialization of DataflowAnalysis whose state specifically stores +/// a set of instructions. 
+template +class InstrsDataflowAnalysis + : public DataflowAnalysis { +public: + /// These iterator functions offer access to the set of pointers to + /// instructions in a given program point + template + ExprIterator expr_begin(T &Point) const { + if (auto State = this->getStateAt(Point)) + return ExprIterator(&*State, Expressions); + return expr_end(); + } + ExprIterator expr_begin(BitVector &BV) const { + return ExprIterator(&BV, Expressions); + } + ExprIterator expr_end() const { + return ExprIterator(nullptr, Expressions, -1); + } + + /// Used to size the set of expressions/definitions being tracked by the + /// dataflow analysis + uint64_t NumInstrs{0}; + /// We put every MCInst we want to track (which one representing an + /// expression/def) into a vector because we need to associate them with + /// small numbers. They will be tracked via BitVectors throughout the + /// dataflow analysis. + std::vector Expressions; + /// Maps expressions defs (MCInsts) to its index in the Expressions vector + std::unordered_map ExprToIdx; + + InstrsDataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF) + : DataflowAnalysis(BC, BF) {} + virtual ~InstrsDataflowAnalysis() {} +}; + +} // namespace bolt + +/// DenseMapInfo allows us to use the DenseMap LLVM data structure to store +/// ProgramPoints. 
+template<> struct DenseMapInfo { + static inline bolt::ProgramPoint getEmptyKey() { + uintptr_t Val = static_cast(-1); + Val <<= PointerLikeTypeTraits::NumLowBitsAvailable; + return bolt::ProgramPoint(reinterpret_cast(Val)); + } + static inline bolt::ProgramPoint getTombstoneKey() { + uintptr_t Val = static_cast(-2); + Val <<= PointerLikeTypeTraits::NumLowBitsAvailable; + return bolt::ProgramPoint(reinterpret_cast(Val)); + } + static unsigned getHashValue(const bolt::ProgramPoint &PP) { + return (unsigned((uintptr_t)PP.Data.BB) >> 4) ^ + (unsigned((uintptr_t)PP.Data.BB) >> 9); + } + static bool isEqual(const bolt::ProgramPoint &LHS, + const bolt::ProgramPoint &RHS) { + return LHS.Data.BB == RHS.Data.BB; + } +}; + +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const BitVector &Val); + +} // namespace llvm + +#endif diff --git a/bolt/Passes/DataflowInfoManager.cpp b/bolt/Passes/DataflowInfoManager.cpp new file mode 100644 index 000000000000..0c4cdbe99e06 --- /dev/null +++ b/bolt/Passes/DataflowInfoManager.cpp @@ -0,0 +1,170 @@ +//===--- Passes/DataflowInfoManager.cpp -----------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "DataflowInfoManager.h" + + +namespace llvm { +namespace bolt { + +ReachingDefOrUse &DataflowInfoManager::getReachingDefs() { + if (RD) + return *RD; + assert(FA && "FrameAnalysis required"); + RD.reset(new ReachingDefOrUse(*FA, BC, BF)); + { + NamedRegionTimer T1("RD", "Dataflow", true); + RD->run(); + } + return *RD; +} + +void DataflowInfoManager::invalidateReachingDefs() { + RD.reset(nullptr); +} + +ReachingDefOrUse &DataflowInfoManager::getReachingUses() { + if (RU) + return *RU; + assert(FA && "FrameAnalysis required"); + RU.reset(new ReachingDefOrUse(*FA, BC, BF)); + { + NamedRegionTimer T1("RU", "Dataflow", true); + RU->run(); + } + return *RU; +} + +void DataflowInfoManager::invalidateReachingUses() { + RU.reset(nullptr); +} + +LivenessAnalysis &DataflowInfoManager::getLivenessAnalysis() { + if (LA) + return *LA; + assert(FA && "FrameAnalysis required"); + LA.reset(new LivenessAnalysis(*FA, BC, BF)); + { + NamedRegionTimer T1("LA", "Dataflow", true); + LA->run(); + } + return *LA; +} + +void DataflowInfoManager::invalidateLivenessAnalysis() { + LA.reset(nullptr); +} + +DominatorAnalysis &DataflowInfoManager::getDominatorAnalysis() { + if (DA) + return *DA; + DA.reset(new DominatorAnalysis(BC, BF)); + { + NamedRegionTimer T1("DA", "Dataflow", true); + DA->run(); + } + return *DA; +} + +void DataflowInfoManager::invalidateDominatorAnalysis() { + DA.reset(nullptr); +} + +DominatorAnalysis &DataflowInfoManager::getPostDominatorAnalysis() { + if (PDA) + return *PDA; + PDA.reset(new DominatorAnalysis(BC, BF)); + { + NamedRegionTimer T1("PDA", "Dataflow", true); + PDA->run(); + } + return *PDA; +} + +void DataflowInfoManager::invalidatePostDominatorAnalysis() { + PDA.reset(nullptr); +} + +StackPointerTracking &DataflowInfoManager::getStackPointerTracking() { + if (SPT) + return 
*SPT; + SPT.reset(new StackPointerTracking(BC, BF)); + { + NamedRegionTimer T1("SPT", "Dataflow", true); + SPT->run(); + } + return *SPT; +} + +void DataflowInfoManager::invalidateStackPointerTracking() { + SPT.reset(nullptr); +} + +ReachingInsns &DataflowInfoManager::getReachingInsns() { + if (RI) + return *RI; + RI.reset(new ReachingInsns(BC, BF)); + { + NamedRegionTimer T1("RI", "Dataflow", true); + RI->run(); + } + return *RI; +} + +void DataflowInfoManager::invalidateReachingInsns() { + RI.reset(nullptr); +} + +ReachingInsns &DataflowInfoManager::getReachingInsnsBackwards() { + if (RIB) + return *RIB; + RIB.reset(new ReachingInsns(BC, BF)); + { + NamedRegionTimer T1("RIB", "Dataflow", true); + RIB->run(); + } + return *RIB; +} + +void DataflowInfoManager::invalidateReachingInsnsBackwards() { + RIB.reset(nullptr); +} + +std::unordered_map & +DataflowInfoManager::getInsnToBBMap() { + if (InsnToBB) + return *InsnToBB; + InsnToBB.reset(new std::unordered_map()); + for (auto &BB : BF) { + for (auto &Inst : BB) + (*InsnToBB)[&Inst] = &BB; + } + return *InsnToBB; +} + +void DataflowInfoManager::invalidateInsnToBBMap() { + InsnToBB.reset(nullptr); +} + +void DataflowInfoManager::invalidateAll() { + invalidateReachingDefs(); + invalidateReachingUses(); + invalidateLivenessAnalysis(); + invalidateDominatorAnalysis(); + invalidatePostDominatorAnalysis(); + invalidateStackPointerTracking(); + invalidateReachingInsns(); + invalidateReachingInsnsBackwards(); + invalidateInsnToBBMap(); +} + +} // end namespace bolt +} // end namespace llvm diff --git a/bolt/Passes/DataflowInfoManager.h b/bolt/Passes/DataflowInfoManager.h new file mode 100644 index 000000000000..a9ef9f7d897d --- /dev/null +++ b/bolt/Passes/DataflowInfoManager.h @@ -0,0 +1,72 @@ +//===--- Passes/DataflowInfoManager.h -------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_DATAFLOWINFOMANAGER_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_DATAFLOWINFOMANAGER_H + +#include "FrameAnalysis.h" +#include "ReachingDefOrUse.h" +#include "DominatorAnalysis.h" +#include "StackPointerTracking.h" +#include "ReachingInsns.h" +#include "LivenessAnalysis.h" + +namespace llvm { +namespace bolt { + +/// Manages instances for dataflow analyses and try to preserve the data +/// calculated by each analysis as much as possible, saving the need to +/// recompute it. Also provide an interface for data invalidation when the +/// analysis is outdated after a transform pass modified the function. +class DataflowInfoManager { + const FrameAnalysis *FA; + const BinaryContext &BC; + BinaryFunction &BF; + std::unique_ptr> RD; + std::unique_ptr> RU; + std::unique_ptr LA; + std::unique_ptr> DA; + std::unique_ptr> PDA; + std::unique_ptr SPT; + std::unique_ptr> RI; + std::unique_ptr> RIB; + std::unique_ptr> + InsnToBB; + +public: + DataflowInfoManager(const FrameAnalysis *FA, const BinaryContext &BC, + BinaryFunction &BF) : FA(FA), BC(BC), BF(BF) {}; + + ReachingDefOrUse &getReachingDefs(); + void invalidateReachingDefs(); + ReachingDefOrUse &getReachingUses(); + void invalidateReachingUses(); + LivenessAnalysis &getLivenessAnalysis(); + void invalidateLivenessAnalysis(); + DominatorAnalysis &getDominatorAnalysis(); + void invalidateDominatorAnalysis(); + DominatorAnalysis &getPostDominatorAnalysis(); + void invalidatePostDominatorAnalysis(); + StackPointerTracking &getStackPointerTracking(); + void invalidateStackPointerTracking(); + ReachingInsns &getReachingInsns(); + void invalidateReachingInsns(); + ReachingInsns &getReachingInsnsBackwards(); + void invalidateReachingInsnsBackwards(); + std::unordered_map &getInsnToBBMap(); + void 
invalidateInsnToBBMap(); + void invalidateAll(); +}; + +} // end namespace bolt +} // end namespace llvm + +#endif diff --git a/bolt/Passes/DominatorAnalysis.h b/bolt/Passes/DominatorAnalysis.h new file mode 100644 index 000000000000..87eef5f7662f --- /dev/null +++ b/bolt/Passes/DominatorAnalysis.h @@ -0,0 +1,141 @@ +//===--- Passes/DominatorAnalysis.h ---------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_DOMINATORANALYSIS_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_DOMINATORANALYSIS_H + +#include "DataflowAnalysis.h" + +namespace llvm { +namespace bolt { + +/// The whole reason for running a dominator analysis at the instruction level +/// (that is much more expensive than at the BB level) is because of invoke +/// instructions that may cause early exits in the middle of the BB, making half +/// of the BB potentially dominate the landing pad but not instructions after +/// the invoke. 
+template +class DominatorAnalysis + : public InstrsDataflowAnalysis, Backward> { + friend class DataflowAnalysis, BitVector, + Backward>; + +public: + DominatorAnalysis(const BinaryContext &BC, BinaryFunction &BF) + : InstrsDataflowAnalysis, Backward>(BC, BF) {} + virtual ~DominatorAnalysis() {} + + SmallVector getDominanceFrontierFor(const MCInst &Dom) { + SmallVector Result; + auto DomIdx = this->ExprToIdx[&Dom]; + assert(!Backward && "Post-dom frontier not implemented"); + for (auto &BB : this->Func) { + bool HasDominatedPred = false; + bool HasNonDominatedPred = false; + SmallVector Candidates; + this->doForAllSuccsOrPreds(BB, [&](ProgramPoint P) { + if ((*this->getStateAt(P))[DomIdx]) { + Candidates.emplace_back(P); + HasDominatedPred = true; + return; + } + HasNonDominatedPred = true; + }); + if (HasDominatedPred && HasNonDominatedPred) + Result.append(Candidates.begin(), Candidates.end()); + if ((*this->getStateAt(ProgramPoint::getLastPointAt(BB)))[DomIdx] && + BB.succ_begin() == BB.succ_end()) + Result.emplace_back(ProgramPoint::getLastPointAt(BB)); + } + std::sort(Result.begin(), Result.end()); + Result.erase(std::unique(Result.begin(), Result.end()), Result.end()); + return Result; + } + + bool doesADominatesB(const MCInst &A, const MCInst &B) { + return (*this->getStateAt(B))[this->ExprToIdx[&A]]; + } + + bool doesADominatesB(ProgramPoint A, const MCInst &B) { + if (A.isInst()) + return doesADominatesB(*A.getInst(), B); + + // This analysis keep track of which instructions dominates another + // instruction, it doesn't keep track of BBs. So we need a non-empty + // BB if we want to know whether this BB dominates something. 
+ BinaryBasicBlock *BB = A.getBB(); + while (BB->size() == 0) { + if (BB->succ_size() == 0) + return false; + assert (BB->succ_size() == 1); + BB = *BB->succ_begin(); + } + const MCInst &InstA = *BB->begin(); + return doesADominatesB(InstA, B); + } + + void doForAllDominators(const MCInst &Inst, + std::function Task) { + for (auto I = this->expr_begin(Inst), E = this->expr_end(); I != E; ++I) { + Task(**I); + } + } + +private: + void preflight() { + // Populate our universe of tracked expressions with all instructions + // except pseudos + for (auto &BB : this->Func) { + for (auto &Inst : BB) { + this->Expressions.push_back(&Inst); + this->ExprToIdx[&Inst] = this->NumInstrs++; + } + } + } + + BitVector getStartingStateAtBB(const BinaryBasicBlock &BB) { + // Entry points start with empty set + // All others start with the full set. + if (!Backward && BB.pred_size() == 0 && BB.throw_size() == 0) + return BitVector(this->NumInstrs, false); + if (Backward && BB.succ_size() == 0) + return BitVector(this->NumInstrs, false); + return BitVector(this->NumInstrs, true); + } + + BitVector getStartingStateAtPoint(const MCInst &Point) { + return BitVector(this->NumInstrs, true); + } + + void doConfluence(BitVector &StateOut, const BitVector &StateIn) { + StateOut &= StateIn; + } + + BitVector computeNext(const MCInst &Point, const BitVector &Cur) { + BitVector Next = Cur; + // Gen + if (!this->BC.MIA->isCFI(Point)) { + Next.set(this->ExprToIdx[&Point]); + } + return Next; + } + + StringRef getAnnotationName() const { + if (Backward) + return StringRef("PostDominatorAnalysis"); + return StringRef("DominatorAnalysis"); + } +}; + +} // end namespace bolt +} // end namespace llvm + +#endif diff --git a/bolt/Passes/FrameAnalysis.cpp b/bolt/Passes/FrameAnalysis.cpp new file mode 100644 index 000000000000..a5aadf9522f9 --- /dev/null +++ b/bolt/Passes/FrameAnalysis.cpp @@ -0,0 +1,703 @@ +//===--- Passes/FrameAnalysis.h -------------------------------------------===// +// +// The LLVM 
Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +#include "FrameAnalysis.h" +#include + +#define DEBUG_TYPE "fa" + +using namespace llvm; + +namespace opts { +extern cl::opt Verbosity; +extern bool shouldProcess(const bolt::BinaryFunction &Function); + +static cl::list + FrameOptFunctionNames("funcs-fop", cl::CommaSeparated, + cl::desc("list of functions to apply frame opts"), + cl::value_desc("func1,func2,func3,...")); + +static cl::opt FrameOptFunctionNamesFile( + "funcs-file-fop", + cl::desc("file with list of functions to frame optimize")); + +bool shouldFrameOptimize(const llvm::bolt::BinaryFunction &Function) { + if (!FrameOptFunctionNamesFile.empty()) { + assert(!FrameOptFunctionNamesFile.empty() && "unexpected empty file name"); + std::ifstream FuncsFile(FrameOptFunctionNamesFile, std::ios::in); + std::string FuncName; + while (std::getline(FuncsFile, FuncName)) { + FrameOptFunctionNames.push_back(FuncName); + } + FrameOptFunctionNamesFile = ""; + } + + bool IsValid = true; + if (!FrameOptFunctionNames.empty()) { + IsValid = false; + for (auto &Name : FrameOptFunctionNames) { + if (Function.hasName(Name)) { + IsValid = true; + break; + } + } + } + if (!IsValid) + return false; + + return IsValid; +} +} // namespace opts + +namespace llvm { +namespace bolt { + +raw_ostream &operator<<(raw_ostream &OS, const FrameIndexEntry &FIE) { + OS << "FrameIndexEntry"; + return OS; +} + +namespace { + +/// This class should be used to iterate through basic blocks in layout order +/// to analyze instructions for frame accesses. The user should call +/// enterNewBB() whenever starting analyzing a new BB and doNext() for each +/// instruction. 
After doNext(), if isValidAccess() returns true, it means the +/// current instruction accesses the frame and getFIE() may be used to obtain +/// details about this access. +class FrameAccessAnalysis { + /// We depend on Stack Pointer Tracking to figure out the current SP offset + /// value at a given program point + StackPointerTracking SPT; + /// Context vars + const BinaryContext &BC; + const BinaryFunction &BF; + // Vars used for storing useful CFI info to give us a hint about how the stack + // is used in this function + int SPOffset{0}; + int FPOffset{0}; + int64_t CfaOffset{-8}; + uint16_t CfaReg{7}; + std::stack> CFIStack; + /// Our pointer to access SPT info + const MCInst *Prev{nullptr}; + /// Info about the last frame access + bool IsValidAccess{false}; + FrameIndexEntry FIE; + + bool decodeFrameAccess(const MCInst &Inst) { + int32_t SrcImm{0}; + MCPhysReg Reg{0}; + int64_t StackOffset{0}; + bool IsIndexed{false}; + if (!BC.MIA->isStackAccess( + Inst, FIE.IsLoad, FIE.IsStore, FIE.IsStoreFromReg, Reg, SrcImm, + FIE.StackPtrReg, StackOffset, FIE.Size, FIE.IsSimple, IsIndexed)) { + return true; + } + + if (IsIndexed) { + DEBUG(dbgs() << "Giving up on indexed memory access in the frame\n"); + return false; + } + + assert(FIE.Size != 0); + + FIE.RegOrImm = SrcImm; + if (FIE.IsLoad || FIE.IsStoreFromReg) + FIE.RegOrImm = Reg; + + if (FIE.StackPtrReg == BC.MIA->getStackPointer() && SPOffset != SPT.EMPTY && + SPOffset != SPT.SUPERPOSITION) { + DEBUG(dbgs() << "Adding access via SP while CFA reg is another one\n"); + FIE.StackOffset = SPOffset + StackOffset; + } else if (FIE.StackPtrReg == BC.MIA->getFramePointer() && + FPOffset != SPT.EMPTY && FPOffset != SPT.SUPERPOSITION) { + DEBUG(dbgs() << "Adding access via FP while CFA reg is another one\n"); + FIE.StackOffset = FPOffset + StackOffset; + } else if (FIE.StackPtrReg == + BC.MRI->getLLVMRegNum(CfaReg, /*isEH=*/false)) { + FIE.StackOffset = CfaOffset + StackOffset; + } else { + DEBUG(dbgs() << "Found stack 
access with reg different than cfa reg.\n"); + DEBUG(dbgs() << "\tCurrent CFA reg: " << CfaReg + << "\n\tStack access reg: " << FIE.StackPtrReg << "\n"); + DEBUG(dbgs() << "Blame insn: "); + DEBUG(Inst.dump()); + return false; + } + IsValidAccess = true; + return true; + } + +public: + FrameAccessAnalysis(const BinaryContext &BC, BinaryFunction &BF) + : SPT(BC, BF), BC(BC), BF(BF) { + { + NamedRegionTimer T1("SPT", "Dataflow", true); + SPT.run(); + } + } + + void enterNewBB() { Prev = nullptr; } + const FrameIndexEntry &getFIE() const { return FIE; } + int getSPOffset() const { return SPOffset; } + bool isValidAccess() const { return IsValidAccess; } + + bool doNext(const BinaryBasicBlock &BB, const MCInst &Inst) { + IsValidAccess = false; + std::tie(SPOffset, FPOffset) = + Prev ? *SPT.getStateAt(*Prev) : *SPT.getStateAt(BB); + Prev = &Inst; + // Use CFI information to keep track of which register is being used to + // access the frame + if (BC.MIA->isCFI(Inst)) { + const auto *CFI = BF.getCFIFor(Inst); + switch (CFI->getOperation()) { + case MCCFIInstruction::OpDefCfa: + CfaOffset = CFI->getOffset(); + // Fall-through + case MCCFIInstruction::OpDefCfaRegister: + CfaReg = CFI->getRegister(); + break; + case MCCFIInstruction::OpDefCfaOffset: + CfaOffset = CFI->getOffset(); + break; + case MCCFIInstruction::OpRememberState: + CFIStack.push(std::make_pair(CfaOffset, CfaReg)); + break; + case MCCFIInstruction::OpRestoreState: { + if (CFIStack.empty()) { + dbgs() << "Assertion is about to fail: " << BF.getPrintName() << "\n"; + } + assert(!CFIStack.empty() && "Corrupt CFI stack"); + auto &Elem = CFIStack.top(); + CFIStack.pop(); + CfaOffset = Elem.first; + CfaReg = Elem.second; + break; + } + case MCCFIInstruction::OpAdjustCfaOffset: + llvm_unreachable("Unhandled AdjustCfaOffset"); + break; + default: + break; + } + return true; + } + + if (BC.MIA->leaksStackAddress(Inst, *BC.MRI, SPT.HasFramePointer)) { + DEBUG(dbgs() << "Leaked stack address, giving up on this 
function.\n"); + DEBUG(dbgs() << "Blame insn: "); + DEBUG(Inst.dump()); + return false; + } + + return decodeFrameAccess(Inst); + } +}; + +} // end anonymous namespace + +void FrameAnalysis::addArgAccessesFor(const BinaryContext &BC, MCInst &Inst, + ArgAccesses &&AA) { + if (AA.AssumeEverything) { + // Index 0 in ArgAccessesVector represents an "assumeeverything" entry + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "ArgAccessEntry", 0U); + return; + } + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "ArgAccessEntry", + (unsigned)ArgAccessesVector.size()); + ArgAccessesVector.emplace_back(AA); +} + +void FrameAnalysis::addArgInStackAccessFor(const BinaryContext &BC, + MCInst &Inst, + const ArgInStackAccess &Arg) { + auto AA = getArgAccessesFor(BC, Inst); + if (!AA) { + addArgAccessesFor(BC, Inst, ArgAccesses(false)); + AA = getArgAccessesFor(BC, Inst); + assert(AA && "Object setup failed"); + } + auto &Set = AA->Set; + assert(!AA->AssumeEverything && "Adding arg to AssumeEverything set"); + Set.emplace(Arg); +} + +void FrameAnalysis::addFIEFor(const BinaryContext &BC, MCInst &Inst, + const FrameIndexEntry &FIE) { + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "FrameAccessEntry", + (unsigned)FIEVector.size()); + FIEVector.emplace_back(FIE); +} + +ErrorOr +FrameAnalysis::getArgAccessesFor(const BinaryContext &BC, const MCInst &Inst) { + if (auto Idx = BC.MIA->tryGetAnnotationAs(Inst, "ArgAccessEntry")) { + assert(ArgAccessesVector.size() > *Idx && "Out of bounds"); + return ArgAccessesVector[*Idx]; + } + return make_error_code(errc::result_out_of_range); +} + +ErrorOr +FrameAnalysis::getArgAccessesFor(const BinaryContext &BC, + const MCInst &Inst) const { + if (auto Idx = BC.MIA->tryGetAnnotationAs(Inst, "ArgAccessEntry")) { + assert(ArgAccessesVector.size() > *Idx && "Out of bounds"); + return ArgAccessesVector[*Idx]; + } + return make_error_code(errc::result_out_of_range); +} + +ErrorOr +FrameAnalysis::getFIEFor(const BinaryContext &BC, const MCInst &Inst) const { + if (auto 
Idx = + BC.MIA->tryGetAnnotationAs(Inst, "FrameAccessEntry")) { + assert(FIEVector.size() > *Idx && "Out of bounds"); + return FIEVector[*Idx]; + } + return make_error_code(errc::result_out_of_range); +} + +void FrameAnalysis::buildCallGraph(BinaryContext &BC, + std::map &BFs) { + for (auto &I : BFs) { + BinaryFunction &Caller = I.second; + + Functions.emplace(&Caller); + + for (BinaryBasicBlock &BB : Caller) { + for (MCInst &Inst : BB) { + if (!BC.MIA->isCall(Inst)) + continue; + + auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst); + if (!TargetSymbol) { + // This is an indirect call, we cannot record a target. + continue; + } + + auto *Function = BC.getFunctionForSymbol(TargetSymbol); + if (!Function) { + // Call to a function without a BinaryFunction object. + continue; + } + // Create a new edge in the call graph + CallGraphEdges[&Caller].emplace_back(Function); + ReverseCallGraphEdges[Function].emplace_back(&Caller); + } + } + } +} + +void FrameAnalysis::buildCGTraversalOrder() { + enum NodeStatus { NEW, VISITING, VISITED }; + std::unordered_map NodeStatus; + std::stack Worklist; + + for (auto *Func : Functions) { + Worklist.push(Func); + NodeStatus[Func] = NEW; + } + + while (!Worklist.empty()) { + auto *Func = Worklist.top(); + Worklist.pop(); + + if (NodeStatus[Func] == VISITED) + continue; + + if (NodeStatus[Func] == VISITING) { + TopologicalCGOrder.push_back(Func); + NodeStatus[Func] = VISITED; + continue; + } + + assert(NodeStatus[Func] == NEW); + NodeStatus[Func] = VISITING; + Worklist.push(Func); + for (auto *Callee : CallGraphEdges[Func]) { + if (NodeStatus[Callee] == VISITING || NodeStatus[Callee] == VISITED) + continue; + Worklist.push(Callee); + } + } +} + +void FrameAnalysis::getInstClobberList(const BinaryContext &BC, + const MCInst &Inst, + BitVector &KillSet) const { + if (!BC.MIA->isCall(Inst)) { + BC.MIA->getClobberedRegs(Inst, KillSet, *BC.MRI); + return; + } + + const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst); + // If indirect 
call, kill set should have all elements + if (TargetSymbol == nullptr) { + KillSet.set(0, KillSet.size()); + return; + } + + const auto *Function = BC.getFunctionForSymbol(TargetSymbol); + if (Function == nullptr) { + // Call to a function without a BinaryFunction object. + // This should be a call to a PLT entry, and since it is a trampoline to + // a DSO, we can't really know the code in advance. Conservatively assume + // everything is clobbered. + KillSet.set(0, KillSet.size()); + return; + } + auto BV = RegsKilledMap.find(Function); + if (BV != RegsKilledMap.end()) { + KillSet |= BV->second; + return; + } + // Ignore calls to function whose clobber list wasn't yet calculated. This + // instruction will be evaluated again once we have info for the callee. + return; +} + +BitVector FrameAnalysis::getFunctionClobberList(const BinaryContext &BC, + const BinaryFunction *Func) { + BitVector RegsKilled = BitVector(BC.MRI->getNumRegs(), false); + + if (!Func->isSimple() || !Func->hasCFG()) { + RegsKilled.set(0, RegsKilled.size()); + return RegsKilled; + } + + for (const auto &BB : *Func) { + for (const auto &Inst : BB) { + getInstClobberList(BC, Inst, RegsKilled); + } + } + + return RegsKilled; +} + +void FrameAnalysis::buildClobberMap(const BinaryContext &BC) { + std::queue Queue; + + for (auto *Func : TopologicalCGOrder) { + Queue.push(Func); + } + + while (!Queue.empty()) { + auto *Func = Queue.front(); + Queue.pop(); + + BitVector RegsKilled = getFunctionClobberList(BC, Func); + bool Updated = computeArgsAccessed(BC, *Func); + + if (RegsKilledMap.find(Func) == RegsKilledMap.end()) { + RegsKilledMap[Func] = std::move(RegsKilled); + continue; + } + + if (RegsKilledMap[Func] != RegsKilled || Updated) { + for (auto Caller : ReverseCallGraphEdges[Func]) { + Queue.push(Caller); + } + } + RegsKilledMap[Func] = std::move(RegsKilled); + } + + if (opts::Verbosity == 0 && (!DebugFlag || !isCurrentDebugType("fa"))) + return; + + // This loop is for computing statistics only + 
for (auto *Func : TopologicalCGOrder) { + auto Iter = RegsKilledMap.find(Func); + assert(Iter != RegsKilledMap.end() && + "Failed to compute all clobbers list"); + if (Iter->second.all()) { + auto Count = Func->getExecutionCount(); + if (Count != BinaryFunction::COUNT_NO_PROFILE) + CountFunctionsAllClobber += Count; + ++NumFunctionsAllClobber; + } + if (!DebugFlag || !isCurrentDebugType("fa")) + continue; + // DEBUG only + dbgs() << "Killed regs set for func: " << Func->getPrintName() << "\n"; + const BitVector &RegsKilled = Iter->second; + int RegIdx = RegsKilled.find_first(); + while (RegIdx != -1) { + dbgs() << "\tREG" << RegIdx; + RegIdx = RegsKilled.find_next(RegIdx); + }; + dbgs() << "\n"; + } +} + +bool FrameAnalysis::updateArgsTouchedFor(const BinaryContext &BC, + const BinaryFunction &BF, MCInst &Inst, + int CurOffset) { + if (!BC.MIA->isCall(Inst)) + return false; + + std::set Res; + const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst); + // If indirect call, we conservatively assume it accesses all stack positions + if (TargetSymbol == nullptr) { + addArgAccessesFor(BC, Inst, ArgAccesses(/*AssumeEverything=*/true)); + bool Updated{false}; + if (!FunctionsRequireAlignment.count(&BF)) { + Updated = true; + FunctionsRequireAlignment.insert(&BF); + } + return Updated; + } + + const auto *Function = BC.getFunctionForSymbol(TargetSymbol); + // Call to a function without a BinaryFunction object. 
Conservatively assume + // it accesses all stack positions + if (Function == nullptr) { + addArgAccessesFor(BC, Inst, ArgAccesses(/*AssumeEverything=*/true)); + bool Updated{false}; + if (!FunctionsRequireAlignment.count(&BF)) { + Updated = true; + FunctionsRequireAlignment.insert(&BF); + } + return Updated; + } + + auto Iter = ArgsTouchedMap.find(Function); + if (Iter == ArgsTouchedMap.end()) + return false; + + bool Changed = false; + if (BC.MIA->isTailCall(Inst)) { + // Ignore checking CurOffset because we can't always reliably determine the + // offset specially after an epilogue, where tailcalls happen. It should be + // -8. + for (auto Elem : Iter->second) { + if (ArgsTouchedMap[&BF].find(Elem) == ArgsTouchedMap[&BF].end()) { + ArgsTouchedMap[&BF].emplace(Elem); + Changed = true; + } + } + } + if (FunctionsRequireAlignment.count(Function) && + !FunctionsRequireAlignment.count(&BF)) { + Changed = true; + FunctionsRequireAlignment.insert(&BF); + } + + if (CurOffset == StackPointerTracking::EMPTY || + CurOffset == StackPointerTracking::SUPERPOSITION) { + addArgAccessesFor(BC, Inst, ArgAccesses(/*AssumeEverything=*/true)); + return Changed; + } + + for (auto Elem : Iter->second) { + if (Elem.first == -1) { + addArgAccessesFor(BC, Inst, ArgAccesses(/*AssumeEverything=*/true)); + break; + } + DEBUG(dbgs() << "Added arg in stack access annotation " + << CurOffset + Elem.first << "\n"); + addArgInStackAccessFor( + BC, Inst, ArgInStackAccess{/*StackOffset=*/CurOffset + Elem.first, + /*Size=*/Elem.second}); + } + return Changed; +} + +bool FrameAnalysis::computeArgsAccessed(const BinaryContext &BC, + BinaryFunction &BF) { + if (!BF.isSimple() || !BF.hasCFG()) { + DEBUG(dbgs() << "Treating " << BF.getPrintName() << " conservatively.\n"); + bool Updated = false; + ArgsTouchedMap[&BF].emplace(std::make_pair(-1, 0)); + if (!FunctionsRequireAlignment.count(&BF)) { + Updated = true; + FunctionsRequireAlignment.insert(&BF); + } + return Updated; + } + + bool 
UpdatedArgsTouched = false; + FrameAccessAnalysis FAA(BC, BF); + + for (auto BB : BF.layout()) { + FAA.enterNewBB(); + + for (auto &Inst : *BB) { + if (!FAA.doNext(*BB, Inst)) { + ArgsTouchedMap[&BF].emplace(std::make_pair(-1, 0)); + break; + } + + // Check for calls -- attach stack accessing info to them regarding their + // target + if (updateArgsTouchedFor(BC, BF, Inst, FAA.getSPOffset())) + UpdatedArgsTouched = true; + + // Check for stack accesses that affect callers + if (!FAA.isValidAccess()) + continue; + + const FrameIndexEntry &FIE = FAA.getFIE(); + if (FIE.StackOffset < 0) + continue; + if (ArgsTouchedMap[&BF].find(std::make_pair(FIE.StackOffset, FIE.Size)) != + ArgsTouchedMap[&BF].end()) + continue; + + // Record accesses to the previous stack frame + ArgsTouchedMap[&BF].emplace(std::make_pair(FIE.StackOffset, FIE.Size)); + UpdatedArgsTouched = true; + DEBUG({ + dbgs() << "Arg access offset " << FIE.StackOffset << " added to:\n"; + BC.printInstruction(dbgs(), Inst, 0, &BF, true); + }); + } + } + if (FunctionsRequireAlignment.count(&BF)) + return UpdatedArgsTouched; + + bool UpdatedAlignedStatus = false; + for (auto &BB : BF) { + if (UpdatedAlignedStatus) + break; + for (auto &Inst : BB) { + if (BC.MIA->requiresAlignedAddress(Inst)) { + if (!FunctionsRequireAlignment.count(&BF)) { + UpdatedAlignedStatus = true; + FunctionsRequireAlignment.insert(&BF); + break; + } + } + } + } + return UpdatedArgsTouched || UpdatedAlignedStatus; +} + +bool FrameAnalysis::restoreFrameIndex(const BinaryContext &BC, + BinaryFunction &BF) { + FrameAccessAnalysis FAA(BC, BF); + + DEBUG(dbgs() << "Restoring frame indices for \"" << BF.getPrintName() + << "\"\n"); + for (auto BB : BF.layout()) { + DEBUG(dbgs() << "\tNow at BB " << BB->getName() << "\n"); + FAA.enterNewBB(); + + for (auto &Inst : *BB) { + if (!FAA.doNext(*BB, Inst)) + return false; + DEBUG({ + dbgs() << "\t\tNow at "; + Inst.dump(); + dbgs() << "\t\t\tSP offset is " << FAA.getSPOffset() << "\n"; + }); + + if 
(!FAA.isValidAccess()) + continue; + + const FrameIndexEntry &FIE = FAA.getFIE(); + + addFIEFor(BC, Inst, FIE); + DEBUG({ + dbgs() << "Frame index annotation " << FIE << " added to:\n"; + BC.printInstruction(dbgs(), Inst, 0, &BF, true); + }); + } + } + return true; +} + +void FrameAnalysis::cleanAnnotations(const BinaryContext &BC, + std::map &BFs) { + for (auto &I : BFs) { + for (auto &BB : I.second) { + for (auto &Inst : BB) { + BC.MIA->removeAnnotation(Inst, "ArgAccessEntry"); + BC.MIA->removeAnnotation(Inst, "FrameAccessEntry"); + } + } + } +} + +void FrameAnalysis::runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &) { + { + NamedRegionTimer T1("Callgraph construction", "FOP breakdown", true); + buildCallGraph(BC, BFs); + } + { + NamedRegionTimer T1("build cg traversal order", "FOP breakdown", true); + buildCGTraversalOrder(); + } + { + NamedRegionTimer T1("build clobber map", "FOP breakdown", true); + buildClobberMap(BC); + } + for (auto &I : BFs) { + auto Count = I.second.getExecutionCount(); + if (Count != BinaryFunction::COUNT_NO_PROFILE) + CountDenominator += Count; + + // "shouldOptimize" for passes that run after finalize + if (!(I.second.isSimple() && I.second.hasCFG() && + opts::shouldProcess(I.second) && (I.second.getSize() > 0)) || + !opts::shouldFrameOptimize(I.second)) { + ++NumFunctionsNotOptimized; + if (Count != BinaryFunction::COUNT_NO_PROFILE) + CountFunctionsNotOptimized += Count; + continue; + } + + { + NamedRegionTimer T1("restore frame index", "FOP breakdown", true); + if (!restoreFrameIndex(BC, I.second)) { + ++NumFunctionsFailedRestoreFI; + auto Count = I.second.getExecutionCount(); + if (Count != BinaryFunction::COUNT_NO_PROFILE) + CountFunctionsFailedRestoreFI += Count; + continue; + } + } + AnalyzedFunctions.insert(&I.second); + } +} + +void FrameAnalysis::printStats() { + outs() << "BOLT-INFO FRAME ANALYSIS: Number of functions conservatively " + "treated as clobbering all registers: " + << NumFunctionsAllClobber + << 
format(" (%.1lf%% dyn cov)\n", + (100.0 * CountFunctionsAllClobber / CountDenominator)) + << "BOLT-INFO FRAME ANALYSIS: " << NumFunctionsNotOptimized + << " function(s) " + << format("(%.1lf%% dyn cov)", + (100.0 * CountFunctionsNotOptimized / CountDenominator)) + << " were not optimized.\n" + << "BOLT-INFO FRAME ANALYSIS: " << NumFunctionsFailedRestoreFI + << " function(s) " + << format("(%.1lf%% dyn cov)", + (100.0 * CountFunctionsFailedRestoreFI / CountDenominator)) + << " could not have its frame indices restored.\n"; +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/Passes/FrameAnalysis.h b/bolt/Passes/FrameAnalysis.h new file mode 100644 index 000000000000..c2c2938d60b2 --- /dev/null +++ b/bolt/Passes/FrameAnalysis.h @@ -0,0 +1,262 @@ +//===--- Passes/FrameAnalysis.h -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEANALYSIS_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEANALYSIS_H + +#include "BinaryPasses.h" +#include "StackPointerTracking.h" + +namespace llvm { +namespace bolt { + +/// Alias analysis information attached to each instruction that accesses a +/// frame position. This is called a "frame index" by LLVM Target libs when +/// it is building a MachineFunction frame, and we use the same name here +/// because we are essentially doing the job of frame reconstruction. +struct FrameIndexEntry { + /// If both IsLoad and IsStore are set, it means this is an instruction that + /// reads and updates this frame location. + bool IsLoad; + bool IsStore; + /// If a store, this controls whether the store uses a register os an imm + /// as the source value. 
+ bool IsStoreFromReg; + /// If load, this holds the destination register. If store, this holds + /// either the source register or source immediate. + int32_t RegOrImm; + + /// StackOffset and Size are the two aspects that identify this frame access + /// for the purposes of alias analysis. + int64_t StackOffset; + uint8_t Size; + + /// If this is false, we will never atempt to remove or optimize this + /// instruction. We just use it to keep track of stores we don't fully + /// understand but we know it may write to a frame position. + bool IsSimple; + + uint16_t StackPtrReg; +}; + +/// Record an access to an argument in stack. This should be attached to +/// call instructions, so StackOffset and Size are determined in the context +/// of the caller. This information helps the caller understand how the callee +/// may access its private stack. +struct ArgInStackAccess { + int64_t StackOffset; + uint8_t Size; + + bool operator<(const ArgInStackAccess &RHS) const { + if (StackOffset != RHS.StackOffset) + return StackOffset < RHS.StackOffset; + return Size < RHS.Size; + } +}; + +/// The set of all args-in-stack accesses for a given instruction. If +/// AssumeEverything is true, then the set should be ignored and the +/// corresponding instruction should be treated as accessing the entire +/// stack for the purposes of analysis and optimization. +struct ArgAccesses { + bool AssumeEverything; + std::set Set; + + explicit ArgAccesses(bool AssumeEverything) + : AssumeEverything(AssumeEverything) {} +}; + +raw_ostream &operator<<(raw_ostream &OS, + const FrameIndexEntry &FIE); + +/// This pass attaches stack access information to instructions. If a load/store +/// instruction accesses a stack position, it will identify the CFA offset and +/// size information of this access, where CFA is the Canonical Frame Address +/// (using DWARF terminology). 
+/// +/// This pass also computes frame usage information obtained by a bottom-up call +/// graph traversal: which registers are clobbered by functions (including their +/// callees as determined by the call graph), whether a function accesses its +/// caller's stack frame and whether a function demands its stack to be aligned +/// due to the use of SSE aligned load/store operations present in itself or any +/// of its direct or indirect callees. +/// +/// Initialization: +/// +/// FrameAnalysis FA(PrintPass); +/// RA.runOnFunctions(BC, BFs, LargeFunctions); +/// +/// Usage (fetching frame access information about a given instruction): +/// +/// auto FIE = FA.getFIEFor(BC, Instruction); +/// if (FIE && FIE->IsSimple) { +/// ... = FIE->StackOffset +/// ... = FIE->Size +/// } +/// +/// Usage (determining the set of stack positions accessed by the target of a +/// call: +/// +/// auto Args = FA.getArgAccessesFor(BC, CallInst); +/// if (Args && Args->AssumeEverything) { +/// ... callee may access any position of our current stack frame +/// } +/// +class FrameAnalysis : public BinaryFunctionPass { + /// Call graph info + /// The set of functions analyzed by our call graph + std::set Functions; + /// Model the "function calls function" edges + std::map> + CallGraphEdges; + /// Model the "function called by function" edges + std::map> + ReverseCallGraphEdges; + /// DFS or reverse post-ordering of the call graph nodes to allow us to + /// traverse the call graph bottom-up + std::deque TopologicalCGOrder; + + /// Map functions to the set of registers they may overwrite starting at when + /// it is called until it returns to the caller. + std::map RegsKilledMap; + + /// Map functions to the set of tuples representing + /// accesses to stack positions that belongs to caller + std::map>> + ArgsTouchedMap; + + /// The set of functions we were able to perform the full analysis up to + /// restoring frame indexes for all load/store instructions. 
+ DenseSet AnalyzedFunctions; + + /// Set of functions that require the stack to be 16B aligned + DenseSet FunctionsRequireAlignment; + + /// Owns ArgAccesses for all instructions. References to elements are + /// attached to instructions as indexes to this vector, in MCAnnotations. + std::vector ArgAccessesVector; + /// Same for FrameIndexEntries. + std::vector FIEVector; + + /// Analysis stats counters + uint64_t NumFunctionsAllClobber{0}; + uint64_t CountFunctionsAllClobber{0}; + uint64_t NumFunctionsNotOptimized{0}; + uint64_t NumFunctionsFailedRestoreFI{0}; + uint64_t CountFunctionsNotOptimized{0}; + uint64_t CountFunctionsFailedRestoreFI{0}; + uint64_t CountDenominator{0}; + + /// Convenience functions for appending MCAnnotations to instructions with + /// our specific data + void addArgAccessesFor(const BinaryContext &BC, MCInst &Inst, + ArgAccesses &&AA); + void addArgInStackAccessFor(const BinaryContext &BC, MCInst &Inst, + const ArgInStackAccess &Arg); + void addFIEFor(const BinaryContext &BC, MCInst &Inst, + const FrameIndexEntry &FIE); + + /// Perform the initial step of populating CallGraphEdges and + /// ReverseCallGraphEdges for all functions in BFs. + void buildCallGraph(BinaryContext &BC, + std::map &BFs); + + /// Compute a DFS traversal of the call graph in Functions, CallGraphEdges + /// and ReverseCallGraphEdges and stores it in TopologicalCGOrder. + void buildCGTraversalOrder(); + + /// Compute the set of registers \p Func may write to during its execution, + /// starting at the point when it is called up until when it returns. Returns + /// a BitVector the size of the target number of registers, representing the + /// set of clobbered registers. + BitVector getFunctionClobberList(const BinaryContext &BC, + const BinaryFunction *Func); + + /// Perform the step of building the set of registers clobbered by each + /// function execution, populating RegsKilledMap. 
+ void buildClobberMap(const BinaryContext &BC); + + /// Analyzes an instruction and if it is a call, checks the called function + /// to record which args in stack are accessed, if any. Returns true if + /// the args data associated with this instruction were updated. + bool updateArgsTouchedFor(const BinaryContext &BC, const BinaryFunction &BF, + MCInst &Inst, int CurOffset); + + /// Performs a pass over \p BF to check for accesses to arguments in stack, + /// flagging those as accessing the caller stack frame. All functions called + /// by \p BF must have been previously analyzed. Returns true if updated + /// args data about this function. + bool computeArgsAccessed(const BinaryContext &BC, BinaryFunction &BF); + + /// Alias analysis to disambiguate which frame position is accessed by each + /// instruction in function \p BF. Add MCAnnotation to + /// instructions that access a frame position. Return false if it failed + /// to analyze and this information can't be safely determined for \p BF. + bool restoreFrameIndex(const BinaryContext &BC, BinaryFunction &BF); + +public: + explicit FrameAnalysis(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { + // Position 0 of the vector should be always associated with "assume access + // everything". + ArgAccessesVector.emplace_back(ArgAccesses(/*AssumeEverything*/ true)); + } + + const char *getName() const override { + return "frame-analysis"; + } + + /// Return true if we could fully analyze \p Func + bool hasFrameInfo(const BinaryFunction &Func) const { + return AnalyzedFunctions.count(&Func); + } + + /// Return true if \p Func cannot operate with a misaligned CFA + bool requiresAlignment(const BinaryFunction &Func) const { + return FunctionsRequireAlignment.count(&Func); + } + + /// Compute the set of registers \p Inst may write to, marking them in + /// \p KillSet. If this is a call, try to get the set of registers the call + /// target will write to. 
+ void getInstClobberList(const BinaryContext &BC, const MCInst &Inst, + BitVector &KillSet) const; + + /// Functions for retrieving our specific MCAnnotation data from instructions + ErrorOr getArgAccessesFor(const BinaryContext &BC, + const MCInst &Inst); + + ErrorOr getArgAccessesFor(const BinaryContext &BC, + const MCInst &Inst) const; + + ErrorOr getFIEFor(const BinaryContext &BC, + const MCInst &Inst) const; + + /// Pass entry point + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; + + /// Remove all MCAnnotations attached by this pass + void cleanAnnotations(const BinaryContext &BC, + std::map &BFs); + + + /// Print to standard output statistics about the analysis performed by this + /// pass + void printStats(); +}; + +} // namespace bolt +} // namespace llvm + + +#endif diff --git a/bolt/Passes/FrameOptimizer.cpp b/bolt/Passes/FrameOptimizer.cpp index 7e87b15f22d0..2eb3fafc38eb 100644 --- a/bolt/Passes/FrameOptimizer.cpp +++ b/bolt/Passes/FrameOptimizer.cpp @@ -523,7 +523,8 @@ class StackPointerTracking : public ForwardDataflow { if (BC.MII->get(Point.getOpcode()) .hasDefOfPhysReg(Point, MIA->getStackPointer(), *BC.MRI)) { int64_t Offset = Cur; - if (!MIA->updateStackPointerUpdate(Point, Offset)) + if (!MIA->evaluateSimple(Point, Offset, std::make_pair(0, 0), + std::make_pair(0, 0))) return SUPERPOSITION; return static_cast(Offset); @@ -606,7 +607,7 @@ bool FrameOptimizerPass::restoreFrameIndex(const BinaryContext &BC, continue; } - if (BC.MIA->leaksStackAddress(Inst, *BC.MRI)) { + if (BC.MIA->leaksStackAddress(Inst, *BC.MRI, false)) { DEBUG(dbgs() << "Leaked stack address, giving up on this function.\n"); DEBUG(dbgs() << "Blame insn: "); DEBUG(Inst.dump()); @@ -614,6 +615,7 @@ bool FrameOptimizerPass::restoreFrameIndex(const BinaryContext &BC, } bool IsLoad = false; + bool IsStore = false; bool IsStoreFromReg = false; bool IsSimple = false; int32_t SrcImm{0}; @@ -621,8 +623,10 @@ bool 
FrameOptimizerPass::restoreFrameIndex(const BinaryContext &BC, MCPhysReg StackPtrReg{0}; int64_t StackOffset{0}; uint8_t Size{0}; - if (BC.MIA->isStackAccess(Inst, IsLoad, IsStoreFromReg, Reg, SrcImm, - StackPtrReg, StackOffset, Size, IsSimple)) { + bool IsIndexed = false; + if (BC.MIA->isStackAccess(Inst, IsLoad, IsStore, IsStoreFromReg, Reg, + SrcImm, StackPtrReg, StackOffset, Size, + IsSimple, IsIndexed)) { assert(Size != 0); if (CfaRegLocked && CfaRegLockedVal != CfaReg) { DEBUG(dbgs() << "CFA reg changed, giving up on this function.\n"); diff --git a/bolt/Passes/LivenessAnalysis.cpp b/bolt/Passes/LivenessAnalysis.cpp new file mode 100644 index 000000000000..db8156cc1ed4 --- /dev/null +++ b/bolt/Passes/LivenessAnalysis.cpp @@ -0,0 +1,19 @@ +//===--- Passes/LivenessAnalysis.cpp --------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +#include "LivenessAnalysis.h" + +namespace llvm { +namespace bolt { + +LivenessAnalysis::~LivenessAnalysis() {} + +} // end namespace bolt +} // end namespace llvm diff --git a/bolt/Passes/LivenessAnalysis.h b/bolt/Passes/LivenessAnalysis.h new file mode 100644 index 000000000000..f95a9ef12503 --- /dev/null +++ b/bolt/Passes/LivenessAnalysis.h @@ -0,0 +1,79 @@ +//===--- Passes/LivenessAnalysis.h ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_LIVENESSANALYSIS_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_LIVENESSANALYSIS_H + +#include "DataflowAnalysis.h" +#include "FrameAnalysis.h" + +namespace llvm { +namespace bolt { + +class LivenessAnalysis + : public DataflowAnalysis { + friend class DataflowAnalysis; + +public: + LivenessAnalysis(const FrameAnalysis &FA, const BinaryContext &BC, + BinaryFunction &BF) + : DataflowAnalysis(BC, BF), FA(FA), + NumRegs(BC.MRI->getNumRegs()) {} + virtual ~LivenessAnalysis(); + +protected: + /// Reference to the result of stack frame analysis + const FrameAnalysis &FA; + const uint16_t NumRegs; + + void preflight() {} + + BitVector getStartingStateAtBB(const BinaryBasicBlock &BB) { + return BitVector(NumRegs, false); + } + + BitVector getStartingStateAtPoint(const MCInst &Point) { + return BitVector(NumRegs, false); + } + + void doConfluence(BitVector &StateOut, const BitVector &StateIn) { + StateOut |= StateIn; + } + + BitVector computeNext(const MCInst &Point, const BitVector &Cur) { + BitVector Next = Cur; + // Kill + auto Written = BitVector(NumRegs, false); + if (this->BC.MIA->isCall(Point)) + FA.getInstClobberList(this->BC, Point, Written); + else + this->BC.MIA->getWrittenRegs(Point, Written, *this->BC.MRI); + Written.flip(); + Next &= Written; + // Gen + if (!this->BC.MIA->isCFI(Point)) { + auto Used = BitVector(NumRegs, false); + this->BC.MIA->getUsedRegs(Point, Used, *this->BC.MRI); + Next |= Used; + } + return Next; + } + + StringRef getAnnotationName() const { + return StringRef("LivenessAnalysis"); + } +}; + +} // end namespace bolt +} // end namespace llvm + + +#endif diff --git a/bolt/Passes/ReachingDefOrUse.h b/bolt/Passes/ReachingDefOrUse.h new file mode 100644 index 000000000000..ca67389b281a --- /dev/null +++ b/bolt/Passes/ReachingDefOrUse.h @@ -0,0 
+1,126 @@ +//===--- Passes/ReachingDefOrUse.h ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REACHINGDEFORUSE_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_REACHINGDEFORUSE_H + +#include "DataflowAnalysis.h" + +namespace llvm { +namespace bolt { + +/// If \p Def is true, this computes a forward dataflow equation to +/// propagate reaching definitions. +/// If false, this computes a backward dataflow equation propagating +/// uses to their definitions. +template +class ReachingDefOrUse + : public InstrsDataflowAnalysis, !Def> { + friend class DataflowAnalysis, BitVector, !Def>; + +public: + ReachingDefOrUse(const FrameAnalysis &FA, const BinaryContext &BC, + BinaryFunction &BF) + : InstrsDataflowAnalysis, !Def>(BC, BF), FA(FA) {} + virtual ~ReachingDefOrUse() {} + + bool isReachedBy(MCPhysReg Reg, ExprIterator Candidates) { + for (auto I = Candidates; I != this->expr_end(); ++I) { + auto BV = BitVector(this->BC.MRI->getNumRegs(), false); + if (Def) { + FA.getInstClobberList(this->BC, **I, BV); + } else { + this->BC.MIA->getTouchedRegs(**I, BV, *this->BC.MRI); + } + if (BV[Reg]) + return true; + } + return false; + } + + bool doesAReachesB(const MCInst &A, const MCInst &B) { + return (*this->getStateAt(B))[this->ExprToIdx[&A]]; + } + +protected: + /// Reference to the result of stack frame analysis + const FrameAnalysis &FA; + + void preflight() { + // Populate our universe of tracked expressions with all instructions + // except pseudos + for (auto &BB : this->Func) { + for (auto &Inst : BB) { + this->Expressions.push_back(&Inst); + this->ExprToIdx[&Inst] = this->NumInstrs++; + } + } + } + + BitVector 
getStartingStateAtBB(const BinaryBasicBlock &BB) { + return BitVector(this->NumInstrs, false); + } + + BitVector getStartingStateAtPoint(const MCInst &Point) { + return BitVector(this->NumInstrs, false); + } + + void doConfluence(BitVector &StateOut, const BitVector &StateIn) { + StateOut |= StateIn; + } + + /// Define the function computing the kill set -- whether expression Y, a + /// tracked expression, will be considered to be dead after executing X. + bool doesXKillsY(const MCInst *X, const MCInst *Y) { + // getClobberedRegs for X and Y. If they intersect, return true + auto XClobbers = BitVector(this->BC.MRI->getNumRegs(), false); + auto YClobbers = BitVector(this->BC.MRI->getNumRegs(), false); + FA.getInstClobberList(this->BC, *X, XClobbers); + // In defs, write after write -> kills first write + // In uses, write after access (read or write) -> kills access + if (Def) + FA.getInstClobberList(this->BC, *Y, YClobbers); + else + this->BC.MIA->getTouchedRegs(*Y, YClobbers, *this->BC.MRI); + // X kills Y if it clobbers Y completely -- this is a conservative approach. + // In practice, we may produce use-def links that may not exist. 
+ XClobbers &= YClobbers; + return XClobbers == YClobbers; + } + + BitVector computeNext(const MCInst &Point, const BitVector &Cur) { + BitVector Next = Cur; + // Kill + for (auto I = this->expr_begin(Next), E = this->expr_end(); I != E; ++I) { + assert(*I != nullptr && "Lost pointers"); + if (doesXKillsY(&Point, *I)) { + Next.reset(I.getBitVectorIndex()); + } + } + // Gen + if (!this->BC.MIA->isCFI(Point)) { + Next.set(this->ExprToIdx[&Point]); + } + return Next; + } + + StringRef getAnnotationName() const { + if (Def) + return StringRef("ReachingDefs"); + return StringRef("ReachingUses"); + } +}; + +} // end namespace bolt +} // end namespace llvm + + +#endif diff --git a/bolt/Passes/ReachingInsns.h b/bolt/Passes/ReachingInsns.h new file mode 100644 index 000000000000..4bcdb3d843dd --- /dev/null +++ b/bolt/Passes/ReachingInsns.h @@ -0,0 +1,84 @@ +//===--- Passes/ReachingInsns.h -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REACHINGINSNS_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_REACHINGINSNS_H + +namespace llvm { +namespace bolt { + +template +class ReachingInsns + : public InstrsDataflowAnalysis, Backward> { + friend class DataflowAnalysis, BitVector, Backward>; + +public: + ReachingInsns(const BinaryContext &BC, BinaryFunction &BF) + : InstrsDataflowAnalysis(BC, BF) {} + virtual ~ReachingInsns() {} + + bool isInLoop(const BinaryBasicBlock &BB) { + const MCInst *First = BB.begin() != BB.end() ? 
&*BB.begin() : nullptr; + assert(First && "This analysis does not work for empty BB"); + return ((*this->getStateAt(BB))[this->ExprToIdx[First]]); + } + + bool isInLoop(const MCInst &Inst) { + const BinaryBasicBlock *BB = InsnToBB[&Inst]; + assert(BB && "Unknown instruction"); + return isInLoop(*BB); + } + +protected: + std::unordered_map InsnToBB; + + void preflight() { + for (auto &BB : this->Func) { + for (auto &Inst : BB) { + this->Expressions.push_back(&Inst); + this->ExprToIdx[&Inst] = this->NumInstrs++; + InsnToBB[&Inst] = &BB; + } + } + } + + BitVector getStartingStateAtBB(const BinaryBasicBlock &BB) { + return BitVector(this->NumInstrs, false); + } + + BitVector getStartingStateAtPoint(const MCInst &Point) { + return BitVector(this->NumInstrs, false); + } + + void doConfluence(BitVector &StateOut, const BitVector &StateIn) { + StateOut |= StateIn; + } + + BitVector computeNext(const MCInst &Point, const BitVector &Cur) { + BitVector Next = Cur; + // Gen + if (!this->BC.MIA->isCFI(Point)) { + Next.set(this->ExprToIdx[&Point]); + } + return Next; + } + + StringRef getAnnotationName() const { + if (Backward) + return StringRef("ReachingInsnsBackward"); + return StringRef("ReachingInsns"); + } +}; + +} // end namespace bolt +} // end namespace llvm + +#endif diff --git a/bolt/Passes/StackPointerTracking.cpp b/bolt/Passes/StackPointerTracking.cpp new file mode 100644 index 000000000000..ce12627242cb --- /dev/null +++ b/bolt/Passes/StackPointerTracking.cpp @@ -0,0 +1,28 @@ +//===--- Passes/StackPointerTracking.cpp ----------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "StackPointerTracking.h" + +namespace llvm { +namespace bolt { + +StackPointerTracking::StackPointerTracking(const BinaryContext &BC, + BinaryFunction &BF) + : StackPointerTrackingBase(BC, BF) {} + +} // end namespace bolt +} // end namespace llvm + +llvm::raw_ostream &llvm::operator<<(llvm::raw_ostream &OS, + const std::pair &Val) { + OS << Val.first << ", " << Val.second; + return OS; +} diff --git a/bolt/Passes/StackPointerTracking.h b/bolt/Passes/StackPointerTracking.h new file mode 100644 index 000000000000..7f02e766dfc9 --- /dev/null +++ b/bolt/Passes/StackPointerTracking.h @@ -0,0 +1,203 @@ +//===--- Passes/StackPointerTracking.h ------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_STACKPOINTERTRACKING_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_STACKPOINTERTRACKING_H + +#include "DataflowAnalysis.h" + +namespace llvm { +namespace bolt { + +/// Perform a dataflow analysis to track the value of SP as an offset relative +/// to the CFA. +template +class StackPointerTrackingBase + : public DataflowAnalysis> { + friend class DataflowAnalysis>; + +protected: + void preflight() {} + + int getEmpty() { return EMPTY; } + + std::pair getStartingStateAtBB(const BinaryBasicBlock &BB) { + // Entry BB start with offset 8 from CFA. + // All others start with EMPTY (meaning we don't know anything). 
+ if (BB.isEntryPoint()) + return std::make_pair(-8, getEmpty()); + return std::make_pair(getEmpty(), getEmpty()); + } + + std::pair getStartingStateAtPoint(const MCInst &Point) { + return std::make_pair(getEmpty(), getEmpty()); + } + + void doConfluenceSingleReg(int &StateOut, const int &StateIn) { + if (StateOut == EMPTY) { + StateOut = StateIn; + return; + } + if (StateIn == EMPTY || StateIn == StateOut) + return; + + // We can't agree on a specific value from this point on + StateOut = SUPERPOSITION; + } + + void doConfluence(std::pair &StateOut, + const std::pair &StateIn) { + doConfluenceSingleReg(StateOut.first, StateIn.first); + doConfluenceSingleReg(StateOut.second, StateIn.second); + } + + void doConfluenceWithLP(std::pair &StateOut, + const std::pair &StateIn, + const MCInst &Invoke) { + int SPVal = StateIn.first; + for (const auto &Operand : Invoke) { + if (Operand.isGnuArgsSize()) { + auto ArgsSize = Operand.getGnuArgsSize(); + if (SPVal != EMPTY && SPVal != SUPERPOSITION) { + SPVal += ArgsSize; + } + } + } + doConfluenceSingleReg(StateOut.first, SPVal); + doConfluenceSingleReg(StateOut.second, StateIn.second); + } + + int computeNextSP(const MCInst &Point, int SPVal, int FPVal) { + const auto &MIA = this->BC.MIA; + + if (int Sz = MIA->getPushSize(Point)) { + if (SPVal == EMPTY || SPVal == SUPERPOSITION) + return SPVal; + + return SPVal - Sz; + } + + if (int Sz = MIA->getPopSize(Point)) { + if (SPVal == EMPTY || SPVal == SUPERPOSITION) + return SPVal; + + return SPVal + Sz; + } + + MCPhysReg From, To; + if (MIA->isRegToRegMove(Point, From, To) && To == MIA->getStackPointer() && + From == MIA->getFramePointer()) { + if (FPVal == EMPTY || FPVal == SUPERPOSITION) + return FPVal; + + if (MIA->isLeave(Point)) + return FPVal + 8; + else + return FPVal; + } + + if (this->BC.MII->get(Point.getOpcode()) + .hasDefOfPhysReg(Point, MIA->getStackPointer(), *this->BC.MRI)) { + std::pair SP; + if (SPVal != EMPTY && SPVal != SUPERPOSITION) + SP = 
std::make_pair(MIA->getStackPointer(), SPVal); + else + SP = std::make_pair(0, 0); + std::pair FP; + if (FPVal != EMPTY && FPVal != SUPERPOSITION) + FP = std::make_pair(MIA->getFramePointer(), FPVal); + else + FP = std::make_pair(0, 0); + int64_t Output; + if (!MIA->evaluateSimple(Point, Output, SP, FP)) + return SUPERPOSITION; + + return static_cast(Output); + } + + return SPVal; + } + + int computeNextFP(const MCInst &Point, int SPVal, int FPVal) { + const auto &MIA = this->BC.MIA; + + MCPhysReg From, To; + if (MIA->isRegToRegMove(Point, From, To) && To == MIA->getFramePointer() && + From == MIA->getStackPointer()) { + HasFramePointer = true; + return SPVal; + } + + if (this->BC.MII->get(Point.getOpcode()) + .hasDefOfPhysReg(Point, MIA->getFramePointer(), *this->BC.MRI)) { + std::pair FP; + if (FPVal != EMPTY && FPVal != SUPERPOSITION) + FP = std::make_pair(MIA->getFramePointer(), FPVal); + else + FP = std::make_pair(0, 0); + std::pair SP; + if (SPVal != EMPTY && SPVal != SUPERPOSITION) + SP = std::make_pair(MIA->getStackPointer(), SPVal); + else + SP = std::make_pair(0, 0); + int64_t Output; + if (!MIA->evaluateSimple(Point, Output, SP, FP)) + return SUPERPOSITION; + + if (!HasFramePointer) { + if (MIA->leaksStackAddress(Point, *this->BC.MRI, false)) { + HasFramePointer = true; + } + } + return static_cast(Output); + } + + return FPVal; + } + + std::pair computeNext(const MCInst &Point, + const std::pair &Cur) { + return std::make_pair(computeNextSP(Point, Cur.first, Cur.second), + computeNextFP(Point, Cur.first, Cur.second)); + } + + StringRef getAnnotationName() const { + return StringRef("StackPointerTracking"); + } + +public: + StackPointerTrackingBase(const BinaryContext &BC, BinaryFunction &BF) + : DataflowAnalysis>(BC, BF) {} + virtual ~StackPointerTrackingBase() {} + bool HasFramePointer{false}; + + static constexpr int SUPERPOSITION = std::numeric_limits::max(); + static constexpr int EMPTY = std::numeric_limits::min(); +}; + +class StackPointerTracking 
+ : public StackPointerTrackingBase { + friend class DataflowAnalysis>; + +public: + StackPointerTracking(const BinaryContext &BC, BinaryFunction &BF); + virtual ~StackPointerTracking() {} +}; + +} // end namespace bolt + +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const std::pair &Val); + +} // end namespace llvm + + +#endif From 7731128adcc3b17edbdde69a65d3254f85bcfe8b Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 16 May 2017 09:27:34 -0700 Subject: [PATCH 252/904] [BOLT] Rework debug info processing. Summary: Multiple improvements to debug info handling: * Add support for relocation mode. * Speed-up processing. * Reduce memory consumption. * Bug fixes. The high-level idea behind the new debug handling is that we don't save intermediate state for ranges and location lists. Instead we depend on function and basic block address transformations to update the info as a final post-processing step. For HHVM in non-relocation mode the peak memory went down from 55GB to 35GB. Processing time went from over 6 minutes to under 5 minutes. (cherry picked from commit b66ff24ec19f0285a2fd52f6184eb435043ed97a) --- bolt/BinaryBasicBlock.h | 30 +++- bolt/BinaryContext.cpp | 109 ++---------- bolt/BinaryContext.h | 25 +-- bolt/BinaryFunction.cpp | 153 +++++++++++++++-- bolt/BinaryFunction.h | 26 ++- bolt/DWARFRewriter.cpp | 360 +++++++++++++++++---------------------- bolt/DebugData.cpp | 224 +++++++++--------------- bolt/DebugData.h | 229 +++++-------------------- bolt/RewriteInstance.cpp | 49 ++++-- bolt/RewriteInstance.h | 59 +++---- 10 files changed, 534 insertions(+), 730 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 3ddfeeb7ef80..ad1227279217 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -75,7 +75,7 @@ class BinaryBasicBlock { /// [Begin, End) address range for this block in the output binary. std::pair OutputAddressRange{0, 0}; - /// Original range of the basic block in the function. 
+ /// Original offset range of the basic block in the function. std::pair InputRange{INVALID_OFFSET, INVALID_OFFSET}; /// Alignment requirements for the block. @@ -666,6 +666,11 @@ class BinaryBasicBlock { OutputAddressRange.second = Address; } + /// Gets the memory address range of this BB in the input binary. + std::pair getInputAddressRange() const { + return InputRange; + } + /// Gets the memory address range of this BB in the output binary. std::pair getOutputAddressRange() const { return OutputAddressRange; @@ -696,6 +701,22 @@ class BinaryBasicBlock { /// Validate successor invariants for this BB. bool validateSuccessorInvariants(); + /// Return offset of the basic block from the function start on input. + uint32_t getInputOffset() const { + return InputRange.first; + } + + /// Return offset from the function start to location immediately past + /// the end of the basic block. + uint32_t getEndOffset() const { + return InputRange.second; + } + + /// Return size of the basic block on input. + uint32_t getOriginalSize() const { + return InputRange.second - InputRange.first; + } + private: void adjustNumPseudos(const MCInst &Inst, int Sign); @@ -717,10 +738,15 @@ class BinaryBasicBlock { void clearLandingPads(); /// Return offset of the basic block from the function start. - uint64_t getOffset() const { + uint32_t getOffset() const { return InputRange.first; } + /// Set end offset of this basic block. + void setEndOffset(uint32_t Offset) { + InputRange.second = Offset; + } + /// Get the index of this basic block. 
unsigned getIndex() const { assert(isValid()); diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 3c4c80cdbbb9..55926e96f608 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -12,6 +12,7 @@ #include "BinaryContext.h" #include "BinaryFunction.h" #include "llvm/ADT/Twine.h" +#include "llvm/DebugInfo/DWARF/DWARFFormValue.h" #include "llvm/DebugInfo/DWARF/DWARFUnit.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" @@ -38,6 +39,15 @@ PrintDebugInfo("print-debug-info", BinaryContext::~BinaryContext() { } +MCObjectWriter *BinaryContext::createObjectWriter(raw_pwrite_stream &OS) { + if (!MAB) { + MAB = std::unique_ptr( + TheTarget->createMCAsmBackend(*MRI, TripleName, "")); + } + + return MAB->createObjectWriter(OS); +} + MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, Twine Prefix) { MCSymbol *Symbol{nullptr}; @@ -78,7 +88,6 @@ MCSymbol *BinaryContext::getGlobalSymbolAtAddress(uint64_t Address) const { void BinaryContext::foldFunction(BinaryFunction &ChildBF, BinaryFunction &ParentBF, std::map &BFs) { - // Copy name list. ParentBF.addNewNames(ChildBF.getNames()); @@ -120,71 +129,12 @@ void BinaryContext::printGlobalSymbols(raw_ostream& OS) const { namespace { -/// Returns a binary function that contains a given address in the input -/// binary, or nullptr if none does. -BinaryFunction *getBinaryFunctionContainingAddress( - uint64_t Address, - std::map &BinaryFunctions) { - auto It = BinaryFunctions.upper_bound(Address); - if (It != BinaryFunctions.begin()) { - --It; - if (It->first + It->second.getSize() > Address) { - return &It->second; - } - } - return nullptr; -} - -// Traverses the DIE tree in a recursive depth-first search and finds lexical -// blocks and instances of inlined subroutines, saving them in -// AddressRangesObjects. 
-void findAddressRangesObjects( - const DWARFCompileUnit *Unit, - const DWARFDebugInfoEntryMinimal *DIE, - std::map &Functions, - std::vector &AddressRangesObjects) { - auto Tag = DIE->getTag(); - if (Tag == dwarf::DW_TAG_lexical_block || - Tag == dwarf::DW_TAG_inlined_subroutine || - Tag == dwarf::DW_TAG_try_block || - Tag == dwarf::DW_TAG_catch_block) { - auto const &Ranges = DIE->getAddressRanges(Unit); - if (!Ranges.empty()) { - // We have to process all ranges, even for functions that we are not - // updating. The primary reason is that abbrev entries are shared - // and if we convert one DIE, it may affect the rest. Thus - // the conservative approach that does not involve expanding - // .debug_abbrev, is to switch all DIEs to use .debug_ranges, even if - // they have a simple [a,b) range. The secondary reason is that it allows - // us to get rid of the original portion of .debug_ranges to save - // space in the binary. - auto Function = getBinaryFunctionContainingAddress(Ranges.front().first, - Functions); - AddressRangesObjects.emplace_back(Unit, DIE); - auto &Object = AddressRangesObjects.back(); - for (const auto &Range : Ranges) { - if (Function && Function->isSimple()) { - Object.addAddressRange(*Function, Range.first, Range.second); - } else { - Object.addAbsoluteRange(Range.first, Range.second); - } - } - } - } - - // Recursively visit each child. - for (auto Child = DIE->getFirstChild(); Child; Child = Child->getSibling()) { - findAddressRangesObjects(Unit, Child, Functions, AddressRangesObjects); - } -} - /// Recursively finds DWARF DW_TAG_subprogram DIEs and match them with /// BinaryFunctions. Record DIEs for unknown subprograms (mostly functions that /// are never called and removed from the binary) in Unknown. 
void findSubprograms(DWARFCompileUnit *Unit, const DWARFDebugInfoEntryMinimal *DIE, - std::map &BinaryFunctions, - BinaryContext::DIECompileUnitVector &Unknown) { + std::map &BinaryFunctions) { if (DIE->isSubprogramDIE()) { // TODO: handle DW_AT_ranges. uint64_t LowPC, HighPC; @@ -193,7 +143,7 @@ void findSubprograms(DWARFCompileUnit *Unit, if (It != BinaryFunctions.end()) { It->second.addSubprogramDIE(Unit, DIE); } else { - Unknown.emplace_back(DIE, Unit); + // The function must have been optimized away by GC. } } else { const auto RangesVector = DIE->getAddressRanges(Unit); @@ -208,7 +158,7 @@ void findSubprograms(DWARFCompileUnit *Unit, for (auto ChildDIE = DIE->getFirstChild(); ChildDIE != nullptr && !ChildDIE->isNULL(); ChildDIE = ChildDIE->getSibling()) { - findSubprograms(Unit, ChildDIE, BinaryFunctions, Unknown); + findSubprograms(Unit, ChildDIE, BinaryFunctions); } } @@ -250,8 +200,7 @@ void BinaryContext::preprocessDebugInfo( // For each CU, iterate over its children DIEs and match subprogram DIEs to // BinaryFunctions. for (auto &CU : DwCtx->compile_units()) { - findSubprograms(CU.get(), CU->getUnitDIE(false), BinaryFunctions, - UnknownFunctions); + findSubprograms(CU.get(), CU->getUnitDIE(false), BinaryFunctions); } // Some functions may not have a corresponding subprogram DIE @@ -290,36 +239,6 @@ void BinaryContext::preprocessDebugInfo( } } -void BinaryContext::preprocessFunctionDebugInfo( - std::map &BinaryFunctions) { - // Iterate over DIE trees finding objects that contain address ranges. - for (const auto &CU : DwCtx->compile_units()) { - findAddressRangesObjects(CU.get(), CU->getUnitDIE(false), BinaryFunctions, - AddressRangesObjects); - } - - // Iterate over location lists and save them in LocationLists. 
- auto DebugLoc = DwCtx->getDebugLoc(); - for (const auto &DebugLocEntry : DebugLoc->getLocationLists()) { - if (DebugLocEntry.Entries.empty()) - continue; - const auto StartAddress = DebugLocEntry.Entries.front().Begin; - auto *Function = getBinaryFunctionContainingAddress(StartAddress, - BinaryFunctions); - if (!Function || !Function->isSimple()) - continue; - LocationLists.emplace_back(DebugLocEntry.Offset); - auto &LocationList = LocationLists.back(); - for (const auto &Location : DebugLocEntry.Entries) { - LocationList.addLocation( - &Location.Loc, - *Function, - Location.Begin, - Location.End); - } - } -} - void BinaryContext::printCFI(raw_ostream &OS, uint32_t Operation) { switch(Operation) { case MCCFIInstruction::OpSameValue: OS << "OpSameValue"; break; diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index a98c5c9f7cb6..5b4d3169beae 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -106,25 +106,9 @@ class BinaryContext { /// when a function has more than a single entry point. std::set InterproceduralReferences; - /// List of DWARF location lists in .debug_loc. - std::vector LocationLists; - /// Section relocations. std::map> SectionRelocations; - /// List of DWARF entries in .debug_info that have address ranges to be - /// updated. These include lexical blocks (DW_TAG_lexical_block) and concrete - /// instances of inlined subroutines (DW_TAG_inlined_subroutine). - std::vector AddressRangesObjects; - - using DIECompileUnitVector = - std::vector> ; - - /// List of subprogram DIEs that have addresses that don't match any - /// function, along with their CU. 
- DIECompileUnitVector UnknownFunctions; - std::unique_ptr Ctx; std::unique_ptr DwCtx; @@ -153,6 +137,8 @@ class BinaryContext { std::unique_ptr DisAsm; + std::unique_ptr MAB; + std::function ErrorCheck; const DataReader &DR; @@ -190,6 +176,8 @@ class BinaryContext { ~BinaryContext(); + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS); + /// Return a global symbol registered at a given \p Address. If no symbol /// exists, create one with unique name using \p Prefix. /// If there are multiple symbols registered at the \p Address, then @@ -251,11 +239,6 @@ class BinaryContext { void preprocessDebugInfo( std::map &BinaryFunctions); - /// Populate internal data structures with debug info that depends on - /// disassembled functions. - void preprocessFunctionDebugInfo( - std::map &BinaryFunctions); - /// Add a filename entry from SrcCUID to DestCUID. unsigned addDebugFilenameToUnit(const uint32_t DestCUID, const uint32_t SrcCUID, diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 9df58686f751..a5bd8e531460 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -239,20 +239,8 @@ BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) { CompareBasicBlockOffsets()); assert(I != BasicBlockOffsets.begin() && "first basic block not at offset 0"); --I; - return I->second; -} - -size_t -BinaryFunction::getBasicBlockOriginalSize(const BinaryBasicBlock *BB) const { - if (!hasCFG()) - return 0; - - auto Index = getIndex(BB); - if (Index + 1 == BasicBlocks.size()) { - return Size - BB->getOffset(); - } else { - return BasicBlocks[Index + 1]->getOffset() - BB->getOffset(); - } + auto *BB = I->second; + return (Offset < BB->getOffset() + BB->getOriginalSize()) ? BB : nullptr; } void BinaryFunction::markUnreachable() { @@ -1863,9 +1851,14 @@ bool BinaryFunction::buildCFG() { removeConditionalTailCalls(); // Set the basic block layout to the original order. 
+ PrevBB = nullptr; for (auto BB : BasicBlocks) { BasicBlocksLayout.emplace_back(BB); + if (PrevBB) + PrevBB->setEndOffset(BB->getOffset()); + PrevBB = BB; } + PrevBB->setEndOffset(getSize()); // Make any necessary adjustments for indirect branches. if (!postProcessIndirectBranches()) { @@ -3756,6 +3749,138 @@ void BinaryFunction::calculateLoopInfo() { } } +DWARFAddressRangesVector BinaryFunction::getOutputAddressRanges() const { + DWARFAddressRangesVector OutputRanges; + + OutputRanges.emplace_back(getOutputAddress(), + getOutputAddress() + getOutputSize()); + if (isSplit()) { + assert(isEmitted() && "split function should be emitted"); + OutputRanges.emplace_back(cold().getAddress(), + cold().getAddress() + cold().getImageSize()); + } + + return OutputRanges; +} + +DWARFAddressRangesVector BinaryFunction::translateInputToOutputRanges( + DWARFAddressRangesVector InputRanges) const { + // If the function wasn't changed - there's nothing to update. + if (!isEmitted() && !opts::Relocs) + return InputRanges; + + DWARFAddressRangesVector OutputRanges; + uint64_t PrevEndAddress = 0; + + for (const auto &Range : InputRanges) { + if (!containsAddress(Range.first)) { + DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected for " + << *this << " : [0x" << Twine::utohexstr(Range.first) + << ", 0x" << Twine::utohexstr(Range.second) << "]\n"); + PrevEndAddress = 0; + continue; + } + auto InputOffset = Range.first - getAddress(); + const auto InputEndOffset = Range.second - getAddress(); + do { + const auto *BB = getBasicBlockContainingOffset(InputOffset); + if (!BB) { + DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected for " + << *this << " : [0x" << Twine::utohexstr(Range.first) + << ", 0x" << Twine::utohexstr(Range.second) << "]\n"); + PrevEndAddress = 0; + break; + } + + // Skip the range if the block was deleted. 
+ if (const auto OutputStart = BB->getOutputAddressRange().first) { + const auto StartAddress = OutputStart + InputOffset - BB->getOffset(); + auto EndAddress = BB->getOutputAddressRange().second; + if (InputEndOffset < BB->getEndOffset()) + EndAddress = StartAddress + InputEndOffset - InputOffset; + + if (StartAddress == PrevEndAddress) { + OutputRanges.back().second = EndAddress; + } else { + OutputRanges.emplace_back(StartAddress, EndAddress); + } + PrevEndAddress = EndAddress; + } + + InputOffset = BB->getEndOffset(); + } while (InputOffset < InputEndOffset); + } + + return OutputRanges; +} + +DWARFDebugLoc::LocationList BinaryFunction::translateInputToOutputLocationList( + DWARFDebugLoc::LocationList &&InputLL, + uint64_t BaseAddress) const { + + // If the function wasn't changed - there's nothing to update. + if (!isEmitted() && !opts::Relocs) { + if (!BaseAddress) { + return InputLL; + } else { + auto OutputLL = std::move(InputLL); + for (auto &Entry : OutputLL.Entries) { + Entry.Begin += BaseAddress; + Entry.End += BaseAddress; + } + return OutputLL; + } + } + + DWARFDebugLoc::LocationList OutputLL; + + uint64_t PrevEndAddress = 0; + for (auto &Entry : InputLL.Entries) { + const auto Start = Entry.Begin + BaseAddress; + const auto End = Entry.End + BaseAddress; + if (!containsAddress(Start)) { + DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected for " + << *this << " : [0x" << Twine::utohexstr(Start) + << ", 0x" << Twine::utohexstr(End) << "]\n"); + PrevEndAddress = 0; + continue; + } + auto InputOffset = Start - getAddress(); + const auto InputEndOffset = End - getAddress(); + do { + const auto *BB = getBasicBlockContainingOffset(InputOffset); + if (!BB) { + DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected for " + << *this << " : [0x" << Twine::utohexstr(Start) + << ", 0x" << Twine::utohexstr(End) << "]\n"); + PrevEndAddress = 0; + break; + } + + // Skip the range if the block was deleted. 
+ if (const auto OutputStart = BB->getOutputAddressRange().first) { + const auto StartAddress = OutputStart + InputOffset - BB->getOffset(); + auto EndAddress = BB->getOutputAddressRange().second; + if (InputEndOffset < BB->getEndOffset()) + EndAddress = StartAddress + InputEndOffset - InputOffset; + + if (StartAddress == PrevEndAddress) { + OutputLL.Entries.back().End = EndAddress; + } else { + OutputLL.Entries.emplace_back( + DWARFDebugLoc::Entry{StartAddress, + EndAddress, + std::move(Entry.Loc)}); + } + PrevEndAddress = EndAddress; + } + InputOffset = BB->getEndOffset(); + } while (InputOffset < InputEndOffset); + } + + return OutputLL; +} + void BinaryFunction::printLoopInfo(raw_ostream &OS) const { OS << "Loop Info for Function \"" << *this << "\""; if (hasValidProfile()) { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 15a48d3ee0bc..4e65fbdf70e5 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -167,7 +167,7 @@ enum IndirectCallPromotionType : char { /// BinaryFunction is a representation of machine-level function. /// /// We use the term "Binary" as "Machine" was already taken. -class BinaryFunction : public AddressRangesOwner { +class BinaryFunction { public: enum class State : char { Empty = 0, /// Function body is empty. @@ -356,6 +356,11 @@ class BinaryFunction : public AddressRangesOwner { /// from the function start. BinaryBasicBlock *getBasicBlockContainingOffset(uint64_t Offset); + const BinaryBasicBlock *getBasicBlockContainingOffset(uint64_t Offset) const { + return const_cast(this) + ->getBasicBlockContainingOffset(Offset); + } + /// Return basic block that started at offset \p Offset. BinaryBasicBlock *getBasicBlockAtOffset(uint64_t Offset) { BinaryBasicBlock *BB = getBasicBlockContainingOffset(Offset); @@ -1559,11 +1564,6 @@ class BinaryFunction : public AddressRangesOwner { return *this; } - /// Sets the function's address ranges list offset in .debug_ranges. 
- void setAddressRangesOffset(uint32_t Offset) { - AddressRangesOffset = Offset; - } - /// Returns the offset of the function's address ranges in .debug_ranges. uint32_t getAddressRangesOffset() const { return AddressRangesOffset; } @@ -1792,9 +1792,6 @@ class BinaryFunction : public AddressRangesOwner { return UnitLineTable; } - /// Returns the size of the basic block in the original binary. - size_t getBasicBlockOriginalSize(const BinaryBasicBlock *BB) const; - /// Returns an estimate of the function's hot part after splitting. /// This is a very rough estimate, as with C++ exceptions there are /// blocks we don't move, and it makes no attempt at estimating the size @@ -1819,6 +1816,17 @@ class BinaryFunction : public AddressRangesOwner { return Estimate; } + /// Return output address ranges for a function. + DWARFAddressRangesVector getOutputAddressRanges() const; + + DWARFAddressRangesVector translateInputToOutputRanges( + DWARFAddressRangesVector InputRanges) const; + + /// \p BaseAddress to be applied to all addresses in \pInputLL. + DWARFDebugLoc::LocationList translateInputToOutputLocationList( + DWARFDebugLoc::LocationList &&InputLL, + uint64_t BaseAddress) const; + virtual ~BinaryFunction(); /// Info for fragmented functions. 
diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp index c3e1eca911e6..a785dfdd4bb8 100644 --- a/bolt/DWARFRewriter.cpp +++ b/bolt/DWARFRewriter.cpp @@ -49,6 +49,7 @@ namespace opts { extern cl::OptionCategory BoltCategory; extern cl::opt Verbosity; +extern cl::opt Relocs; static cl::opt KeepARanges("keep-aranges", @@ -63,78 +64,152 @@ void RewriteInstance::updateDebugInfo() { SectionPatchers[".debug_abbrev"] = llvm::make_unique(); SectionPatchers[".debug_info"] = llvm::make_unique(); - updateFunctionRanges(); + RangesSectionsWriter = llvm::make_unique(BC.get()); + LocationListWriter = llvm::make_unique(BC.get()); - updateAddressRangesObjects(); - - updateEmptyModuleRanges(); - - generateDebugRanges(); - - updateLocationLists(); + for (auto &CU : BC->DwCtx->compile_units()) { + updateUnitDebugInfo(CU.get(), + CU->getUnitDIE(false), + std::vector{}); + } - updateDWARFAddressRanges(); + finalizeDebugSections(); updateGdbIndexSection(); } -void RewriteInstance::updateEmptyModuleRanges() { - const auto &CUAddressRanges = RangesSectionsWriter.getCUAddressRanges(); - for (const auto &CU : BC->DwCtx->compile_units()) { - if (CUAddressRanges.find(CU->getOffset()) != CUAddressRanges.end()) - continue; - auto const &Ranges = CU->getUnitDIE(true)->getAddressRanges(CU.get()); - for (auto const &Range : Ranges) { - RangesSectionsWriter.addRange(CU->getOffset(), - Range.first, - Range.second - Range.first); +void RewriteInstance::updateUnitDebugInfo( + DWARFCompileUnit *Unit, + const DWARFDebugInfoEntryMinimal *DIE, + std::vector FunctionStack) { + + bool IsFunctionDef = false; + switch (DIE->getTag()) { + case dwarf::DW_TAG_compile_unit: + { + const auto ModuleRanges = DIE->getAddressRanges(Unit); + auto OutputRanges = translateModuleAddressRanges(ModuleRanges); + const auto RangesSectionOffset = + RangesSectionsWriter->addCURanges(Unit->getOffset(), + std::move(OutputRanges)); + updateDWARFObjectAddressRanges(Unit, DIE, RangesSectionOffset); } - } -} - -void 
RewriteInstance::updateDWARFAddressRanges() { - // Update DW_AT_ranges for all compilation units. - for (const auto &CU : BC->DwCtx->compile_units()) { - const auto CUID = CU->getOffset(); - const auto RSOI = RangesSectionsWriter.getRangesOffsetCUMap().find(CUID); - if (RSOI == RangesSectionsWriter.getRangesOffsetCUMap().end()) - continue; - updateDWARFObjectAddressRanges(RSOI->second, CU.get(), CU->getUnitDIE()); - } - - // Update address ranges of functions. - for (const auto &BFI : BinaryFunctions) { - const auto &Function = BFI.second; - for (const auto DIECompileUnitPair : Function.getSubprogramDIEs()) { - updateDWARFObjectAddressRanges( - Function.getAddressRangesOffset(), - DIECompileUnitPair.second, - DIECompileUnitPair.first); + break; + + case dwarf::DW_TAG_subprogram: + { + // The function cannot have multiple ranges on the input. + uint64_t LowPC, HighPC; + if (DIE->getLowAndHighPC(Unit, LowPC, HighPC)) { + IsFunctionDef = true; + const auto *Function = getBinaryFunctionAtAddress(LowPC); + if (Function && Function->isFolded()) { + Function = nullptr; + } + FunctionStack.push_back(Function); + auto RangesSectionOffset = + RangesSectionsWriter->getEmptyRangesOffset(); + if (Function) { + auto FunctionRanges = Function->getOutputAddressRanges(); + RangesSectionOffset = + RangesSectionsWriter->addRanges(Function, + std::move(FunctionRanges)); + } + updateDWARFObjectAddressRanges(Unit, DIE, RangesSectionOffset); + } + } + break; + + case dwarf::DW_TAG_lexical_block: + case dwarf::DW_TAG_inlined_subroutine: + case dwarf::DW_TAG_try_block: + case dwarf::DW_TAG_catch_block: + { + auto RangesSectionOffset = + RangesSectionsWriter->getEmptyRangesOffset(); + const BinaryFunction *Function = + FunctionStack.empty() ? 
nullptr : FunctionStack.back(); + if (Function) { + const auto Ranges = DIE->getAddressRanges(Unit); + auto OutputRanges = Function->translateInputToOutputRanges(Ranges); + DEBUG( + if (OutputRanges.empty() != Ranges.empty()) { + dbgs() << "BOLT-DEBUG: problem with DIE at 0x" + << Twine::utohexstr(DIE->getOffset()) << " in CU at 0x" + << Twine::utohexstr(Unit->getOffset()) << '\n'; + } + ); + RangesSectionOffset = + RangesSectionsWriter->addRanges(Function, std::move(OutputRanges)); + } + updateDWARFObjectAddressRanges(Unit, DIE, RangesSectionOffset); + } + break; + + default: + { + // Handle any tag that can have DW_AT_location attribute. + DWARFFormValue Value; + uint32_t AttrOffset; + if (DIE->getAttributeValue(Unit, dwarf::DW_AT_location, Value, + &AttrOffset)) { + if (Value.isFormClass(DWARFFormValue::FC_Constant) || + Value.isFormClass(DWARFFormValue::FC_SectionOffset)) { + auto LocListSectionOffset = LocationListWriter->getEmptyListOffset(); + const BinaryFunction *Function = + FunctionStack.empty() ? nullptr : FunctionStack.back(); + if (Function) { + // Limit parsing to a single list to save memory. + DWARFDebugLoc::LocationList LL; + LL.Offset = Value.isFormClass(DWARFFormValue::FC_Constant) ? 
+ Value.getAsUnsignedConstant().getValue() : + Value.getAsSectionOffset().getValue(); + + Unit->getContext().getOneDebugLocList(LL); + assert(!LL.Entries.empty() && "location list cannot be empty"); + + const auto OutputLL = Function + ->translateInputToOutputLocationList(std::move(LL), + Unit->getBaseAddress()); + DEBUG( + if (OutputLL.Entries.empty()) { + dbgs() << "BOLT-DEBUG: location list translated to an empty one " + "at 0x" + << Twine::utohexstr(DIE->getOffset()) << " in CU at 0x" + << Twine::utohexstr(Unit->getOffset()) << '\n'; + } + ); + + LocListSectionOffset = LocationListWriter->addList(OutputLL); + } + + auto DebugInfoPatcher = + static_cast( + SectionPatchers[".debug_info"].get()); + DebugInfoPatcher->addLE32Patch(AttrOffset, LocListSectionOffset); + } else { + assert((Value.isFormClass(DWARFFormValue::FC_Exprloc) || + Value.isFormClass(DWARFFormValue::FC_Block)) && + "unexpected DW_AT_location form"); + } + } } } - // Update address ranges of DIEs with addresses that don't match functions. - for (auto &DIECompileUnitPair : BC->UnknownFunctions) { - updateDWARFObjectAddressRanges( - RangesSectionsWriter.getEmptyRangesListOffset(), - DIECompileUnitPair.second, - DIECompileUnitPair.first); + // Recursively update each child. + for (auto Child = DIE->getFirstChild(); Child; Child = Child->getSibling()) { + updateUnitDebugInfo(Unit, Child, FunctionStack); } - // Update address ranges of DWARF block objects (lexical/try/catch blocks, - // inlined subroutine instances, etc). 
- for (const auto &Obj : BC->AddressRangesObjects) { - updateDWARFObjectAddressRanges( - Obj.getAddressRangesOffset(), - Obj.getCompileUnit(), - Obj.getDIE()); - } + if (IsFunctionDef) + FunctionStack.pop_back(); } + void RewriteInstance::updateDWARFObjectAddressRanges( - uint32_t DebugRangesOffset, const DWARFUnit *Unit, - const DWARFDebugInfoEntryMinimal *DIE) { + const DWARFDebugInfoEntryMinimal *DIE, + uint64_t DebugRangesOffset) { // Some objects don't have an associated DIE and cannot be updated (such as // compiler-generated functions). @@ -300,15 +375,6 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { } } -void RewriteInstance::updateAddressRangesObjects() { - for (auto &Obj : BC->AddressRangesObjects) { - for (const auto &Range : Obj.getAbsoluteAddressRanges()) { - RangesSectionsWriter.addRange(&Obj, Range.first, - Range.second - Range.first); - } - } -} - void RewriteInstance::updateLineTableOffsets() { const auto LineSection = BC->Ctx->getObjectFileInfo()->getDwarfLineSection(); @@ -367,159 +433,51 @@ void RewriteInstance::updateLineTableOffsets() { } } -void RewriteInstance::updateFunctionRanges() { - auto addDebugArangesEntry = [&](const BinaryFunction &Function, - uint64_t RangeBegin, - uint64_t RangeSize) { - // The function potentially has multiple associated CUs because of - // the identical code folding optimization. Update all of them with - // the range. - for (const auto DIECompileUnitPair : Function.getSubprogramDIEs()) { - const auto CU = DIECompileUnitPair.second; - if (CU->getOffset() != -1U) - RangesSectionsWriter.addRange(CU->getOffset(), RangeBegin, RangeSize); - } - }; - - for (auto &BFI : BinaryFunctions) { - auto &Function = BFI.second; - // If function doesn't have registered DIEs - there's nothting to update. - if (Function.getSubprogramDIEs().empty()) - continue; - // Use either new (image) or original size for the function range. - auto Size = Function.isSimple() ? 
Function.getImageSize() - : Function.getSize(); - addDebugArangesEntry(Function, - Function.getAddress(), - Size); - RangesSectionsWriter.addRange(&Function, Function.getAddress(), Size); - if (Function.isSimple() && Function.cold().getImageSize()) { - addDebugArangesEntry(Function, - Function.cold().getAddress(), - Function.cold().getImageSize()); - RangesSectionsWriter.addRange(&Function, - Function.cold().getAddress(), - Function.cold().getImageSize()); - } - } -} - -void RewriteInstance::generateDebugRanges() { - enum { RANGES, ARANGES }; - for (auto RT = RANGES + 0; RT <= ARANGES; ++RT) { - // Skip .debug_aranges if we are re-generating .gdb_index. - if (!opts::KeepARanges && GdbIndexSection.getObject() && RT == ARANGES) - continue; - - const char *SectionName = (RT == RANGES) ? ".debug_ranges" - : ".debug_aranges"; - SmallVector RangesBuffer; - raw_svector_ostream OS(RangesBuffer); +void RewriteInstance::finalizeDebugSections() { + // Skip .debug_aranges if we are re-generating .gdb_index. + if (opts::KeepARanges || !GdbIndexSection.getObject()) { + SmallVector ARangesBuffer; + raw_svector_ostream OS(ARangesBuffer); auto MAB = std::unique_ptr( BC->TheTarget->createMCAsmBackend(*BC->MRI, BC->TripleName, "")); auto Writer = std::unique_ptr(MAB->createObjectWriter(OS)); - if (RT == RANGES) { - RangesSectionsWriter.writeRangesSection(Writer.get()); - } else { - RangesSectionsWriter.writeArangesSection(Writer.get()); - } - const auto &DebugRangesContents = OS.str(); + RangesSectionsWriter->writeArangesSection(Writer.get()); + const auto &ARangesContents = OS.str(); // Freed by ExecutableFileMemoryManager. 
- uint8_t *SectionData = new uint8_t[DebugRangesContents.size()]; - memcpy(SectionData, DebugRangesContents.data(), DebugRangesContents.size()); - - EFMM->NoteSectionInfo[SectionName] = SectionInfo( + uint8_t *SectionData = new uint8_t[ARangesContents.size()]; + memcpy(SectionData, ARangesContents.data(), ARangesContents.size()); + EFMM->NoteSectionInfo[".debug_aranges"] = SectionInfo( reinterpret_cast(SectionData), - DebugRangesContents.size(), + ARangesContents.size(), /*Alignment=*/0, /*IsCode=*/false, /*IsReadOnly=*/true, /*IsLocal=*/false); } -} - -void RewriteInstance::updateLocationLists() { - // Write new contents to .debug_loc. - SmallVector DebugLocBuffer; - raw_svector_ostream OS(DebugLocBuffer); - - auto MAB = std::unique_ptr( - BC->TheTarget->createMCAsmBackend(*BC->MRI, BC->TripleName, "")); - auto Writer = std::unique_ptr(MAB->createObjectWriter(OS)); - - DebugLocWriter LocationListsWriter; - - for (const auto &Loc : BC->LocationLists) { - LocationListsWriter.write(Loc, Writer.get()); - } - const auto &DebugLocContents = OS.str(); - - // Free'd by ExecutableFileMemoryManager. 
- uint8_t *SectionData = new uint8_t[DebugLocContents.size()]; - memcpy(SectionData, DebugLocContents.data(), DebugLocContents.size()); + auto RangesSectionContents = RangesSectionsWriter->finalize(); + EFMM->NoteSectionInfo[".debug_ranges"] = SectionInfo( + reinterpret_cast(RangesSectionContents->data()), + RangesSectionContents->size(), + /*Alignment=*/1, + /*IsCode=*/false, + /*IsReadOnly=*/true, + /*IsLocal=*/false); + auto LocationListSectionContents = LocationListWriter->finalize(); + const auto SectionSize = LocationListSectionContents->size(); + uint8_t *SectionData = new uint8_t[SectionSize]; + memcpy(SectionData, LocationListSectionContents->data(), SectionSize); EFMM->NoteSectionInfo[".debug_loc"] = SectionInfo( reinterpret_cast(SectionData), - DebugLocContents.size(), - /*Alignment=*/0, + SectionSize, + /*Alignment=*/1, /*IsCode=*/false, /*IsReadOnly=*/true, /*IsLocal=*/false); - - // For each CU, update pointers into .debug_loc. - for (const auto &CU : BC->DwCtx->compile_units()) { - updateLocationListPointers( - CU.get(), - CU->getUnitDIE(false), - LocationListsWriter.getUpdatedLocationListOffsets()); - } -} - -void RewriteInstance::updateLocationListPointers( - const DWARFUnit *Unit, - const DWARFDebugInfoEntryMinimal *DIE, - const std::map &UpdatedOffsets) { - // Stop if we're in a non-simple function, which will not be rewritten. - auto Tag = DIE->getTag(); - if (Tag == dwarf::DW_TAG_subprogram) { - uint64_t LowPC = -1ULL, HighPC = -1ULL; - DIE->getLowAndHighPC(Unit, LowPC, HighPC); - if (LowPC != -1ULL) { - auto It = BinaryFunctions.find(LowPC); - if (It != BinaryFunctions.end() && !It->second.isSimple()) - return; - } - } - // If the DIE has a DW_AT_location attribute with a section offset, update it. 
- DWARFFormValue Value; - uint32_t AttrOffset; - if (DIE->getAttributeValue(Unit, dwarf::DW_AT_location, Value, &AttrOffset) && - (Value.isFormClass(DWARFFormValue::FC_Constant) || - Value.isFormClass(DWARFFormValue::FC_SectionOffset))) { - uint64_t DebugLocOffset = -1ULL; - if (Value.isFormClass(DWARFFormValue::FC_SectionOffset)) { - DebugLocOffset = Value.getAsSectionOffset().getValue(); - } else if (Value.isFormClass(DWARFFormValue::FC_Constant)) { // DWARF 3 - DebugLocOffset = Value.getAsUnsignedConstant().getValue(); - } - - auto It = UpdatedOffsets.find(DebugLocOffset); - if (It != UpdatedOffsets.end()) { - auto DebugInfoPatcher = - static_cast( - SectionPatchers[".debug_info"].get()); - DebugInfoPatcher->addLE32Patch(AttrOffset, It->second + DebugLocSize); - } - } - - // Recursively visit children. - for (auto Child = DIE->getFirstChild(); Child; Child = Child->getSibling()) { - updateLocationListPointers(Unit, Child, UpdatedOffsets); - } } void RewriteInstance::updateGdbIndexSection() { @@ -576,7 +534,7 @@ void RewriteInstance::updateGdbIndexSection() { // Calculate the size of the new address table. uint32_t NewAddressTableSize = 0; - for (const auto &CURangesPair : RangesSectionsWriter.getCUAddressRanges()) { + for (const auto &CURangesPair : RangesSectionsWriter->getCUAddressRanges()) { const auto &Ranges = CURangesPair.second; NewAddressTableSize += Ranges.size() * 20; } @@ -604,12 +562,12 @@ void RewriteInstance::updateGdbIndexSection() { Buffer += CUListSize; // Generate new address table. 
- for (const auto &CURangesPair : RangesSectionsWriter.getCUAddressRanges()) { + for (const auto &CURangesPair : RangesSectionsWriter->getCUAddressRanges()) { const auto CUIndex = OffsetToIndexMap[CURangesPair.first]; const auto &Ranges = CURangesPair.second; for (const auto &Range : Ranges) { write64le(Buffer, Range.first); - write64le(Buffer + 8, Range.first + Range.second); + write64le(Buffer + 8, Range.second); write32le(Buffer + 16, CUIndex); Buffer += 20; } diff --git a/bolt/DebugData.cpp b/bolt/DebugData.cpp index 2a18d207716b..9b9600656809 100644 --- a/bolt/DebugData.cpp +++ b/bolt/DebugData.cpp @@ -31,115 +31,6 @@ namespace bolt { const DebugLineTableRowRef DebugLineTableRowRef::NULL_ROW{0, 0}; -void BasicBlockOffsetRanges::addAddressRange(BinaryFunction &Function, - uint64_t BeginAddress, - uint64_t EndAddress, - const BinaryData *Data) { - if (Function.getState() != BinaryFunction::State::CFG) - return; - - const auto BBRange = Function.getBasicBlockRangeFromOffsetToEnd( - BeginAddress - Function.getAddress()); - - DEBUG(dbgs() << "adding range [0x" << Twine::utohexstr(BeginAddress) << ", 0x" - << Twine::utohexstr(EndAddress) << ") to function " << Function - << '\n'); - - if (BBRange.begin() == BBRange.end()) { - if (opts::Verbosity >= 1) { - errs() << "BOLT-WARNING: no basic blocks in function " - << Function << " intersect with debug range [0x" - << Twine::utohexstr(BeginAddress) << ", 0x" - << Twine::utohexstr(EndAddress) << ")\n"; - } - return; - } - - for (auto &BB : BBRange) { - const auto BBAddress = Function.getBasicBlockOriginalAddress(&BB); - - // Some ranges could be of the form [BBAddress, BBAddress). 
- if (BBAddress > EndAddress || - (BBAddress == EndAddress && EndAddress != BeginAddress)) - break; - - const auto InternalAddressRangeBegin = std::max(BBAddress, BeginAddress); - assert(BB.getFunction() == &Function && "mismatching functions\n"); - const auto InternalAddressRangeEnd = - std::min(BBAddress + Function.getBasicBlockOriginalSize(&BB), - EndAddress); - - assert(BB.isValid() && "attempting to record debug info for a deleted BB."); - - AddressRanges.emplace_back( - BBAddressRange{ - &BB, - static_cast(InternalAddressRangeBegin - BBAddress), - static_cast(InternalAddressRangeEnd - BBAddress), - Data}); - } -} - -std::vector -BasicBlockOffsetRanges::getAbsoluteAddressRanges() const { - std::vector AbsoluteRanges; - for (const auto &BBAddressRange : AddressRanges) { - if (!BBAddressRange.BasicBlock->isValid()) - continue; - auto BBOutputAddressRange = - BBAddressRange.BasicBlock->getOutputAddressRange(); - uint64_t NewRangeBegin = BBOutputAddressRange.first + - BBAddressRange.RangeBeginOffset; - // If the end offset pointed to the end of the basic block, then we set - // the new end range to cover the whole basic block as the BB's size - // might have increased. - auto BBFunction = BBAddressRange.BasicBlock->getFunction(); - uint64_t NewRangeEnd = - (BBAddressRange.RangeEndOffset == - BBFunction->getBasicBlockOriginalSize(BBAddressRange.BasicBlock)) - ? BBOutputAddressRange.second - : (BBOutputAddressRange.first + BBAddressRange.RangeEndOffset); - AbsoluteRanges.emplace_back(AbsoluteRange{NewRangeBegin, NewRangeEnd, - BBAddressRange.Data}); - } - if (AbsoluteRanges.empty()) { - return AbsoluteRanges; - } - // Merge adjacent ranges that have the same data. 
- std::sort(AbsoluteRanges.begin(), AbsoluteRanges.end(), - [](const AbsoluteRange &A, const AbsoluteRange &B) { - return A.Begin < B.Begin; - }); - decltype(AbsoluteRanges) MergedRanges; - - MergedRanges.emplace_back(AbsoluteRanges[0]); - for (unsigned I = 1, S = AbsoluteRanges.size(); I != S; ++I) { - // If this range complements the last one and they point to the same - // (possibly null) data, merge them instead of creating another one. - if (AbsoluteRanges[I].Begin == MergedRanges.back().End && - AbsoluteRanges[I].Data == MergedRanges.back().Data) { - MergedRanges.back().End = AbsoluteRanges[I].End; - } else { - MergedRanges.emplace_back(AbsoluteRanges[I]); - } - } - - return MergedRanges; -} - -void DebugRangesSectionsWriter::addRange(uint32_t CompileUnitOffset, - uint64_t Address, - uint64_t Size) { - CUAddressRanges[CompileUnitOffset].emplace_back(std::make_pair(Address, - Size)); -} - -void DebugRangesSectionsWriter::addRange(AddressRangesOwner *BF, - uint64_t Address, - uint64_t Size) { - ObjectAddressRanges[BF].emplace_back(std::make_pair(Address, Size)); -} - namespace { // Writes address ranges to Writer as pairs of 64-bit (address, size). @@ -147,14 +38,14 @@ namespace { // the form (begin address, range size), otherwise (begin address, end address). // Terminates the list by writing a pair of two zeroes. // Returns the number of written bytes. -uint32_t writeAddressRanges( +uint64_t writeAddressRanges( MCObjectWriter *Writer, - const std::vector> &AddressRanges, - bool RelativeRange) { - // Write entries. + const DWARFAddressRangesVector &AddressRanges, + const bool WriteRelativeRanges = false) { for (auto &Range : AddressRanges) { Writer->writeLE64(Range.first); - Writer->writeLE64((!RelativeRange) * Range.first + Range.second); + Writer->writeLE64(WriteRelativeRanges ? Range.second - Range.first + : Range.second); } // Finish with 0 entries. 
Writer->writeLE64(0); @@ -164,28 +55,57 @@ uint32_t writeAddressRanges( } // namespace -void DebugRangesSectionsWriter::writeRangesSection(MCObjectWriter *Writer) { - uint32_t SectionOffset = 0; - for (const auto &CUOffsetAddressRangesPair : CUAddressRanges) { - const auto CUOffset = CUOffsetAddressRangesPair.first; - RangesSectionOffsetCUMap[CUOffset] = SectionOffset; - const auto &AddressRanges = CUOffsetAddressRangesPair.second; - SectionOffset += writeAddressRanges(Writer, AddressRanges, false); - } +DebugRangesSectionsWriter::DebugRangesSectionsWriter(BinaryContext *BC) { + RangesBuffer = llvm::make_unique>(); + RangesStream = llvm::make_unique(*RangesBuffer); + Writer = + std::unique_ptr(BC->createObjectWriter(*RangesStream)); + + // Add an empty range as the first entry; + SectionOffset += writeAddressRanges(Writer.get(), DWARFAddressRangesVector{}); +} + +uint64_t DebugRangesSectionsWriter::addCURanges( + uint64_t CUOffset, + DWARFAddressRangesVector &&Ranges) { + const auto RangesOffset = addRanges(Ranges); + CUAddressRanges.emplace(CUOffset, std::move(Ranges)); - for (const auto &BFAddressRangesPair : ObjectAddressRanges) { - BFAddressRangesPair.first->setAddressRangesOffset(SectionOffset); - const auto &AddressRanges = BFAddressRangesPair.second; - SectionOffset += writeAddressRanges(Writer, AddressRanges, false); + return RangesOffset; +} + +uint64_t +DebugRangesSectionsWriter::addRanges(const BinaryFunction *Function, + DWARFAddressRangesVector &&Ranges) { + if (Ranges.empty()) + return getEmptyRangesOffset(); + + static const BinaryFunction *CachedFunction; + + if (Function == CachedFunction) { + const auto RI = CachedRanges.find(Ranges); + if (RI != CachedRanges.end()) + return RI->second; + } else { + CachedRanges.clear(); + CachedFunction = Function; } - // Write an empty address list to be used for objects with unknown address - // ranges. 
- EmptyRangesListOffset = SectionOffset; - SectionOffset += writeAddressRanges( - Writer, - std::vector>{}, - false); + const auto EntryOffset = addRanges(Ranges); + CachedRanges.emplace(std::move(Ranges), EntryOffset); + + return EntryOffset; +} + +uint64_t +DebugRangesSectionsWriter::addRanges(const DWARFAddressRangesVector &Ranges) { + if (Ranges.empty()) + return getEmptyRangesOffset(); + + const auto EntryOffset = SectionOffset; + SectionOffset += writeAddressRanges(Writer.get(), Ranges); + + return EntryOffset; } void @@ -228,28 +148,38 @@ DebugRangesSectionsWriter::writeArangesSection(MCObjectWriter *Writer) const { } } -void DebugLocWriter::write(const LocationList &LocList, - MCObjectWriter *Writer) { - // Reference: DWARF 4 specification section 7.7.3. - UpdatedOffsets[LocList.getOriginalOffset()] = SectionOffset; - auto AbsoluteRanges = LocList.getAbsoluteAddressRanges(); +DebugLocWriter::DebugLocWriter(BinaryContext *BC) { + LocBuffer = llvm::make_unique>(); + LocStream = llvm::make_unique(*LocBuffer); + Writer = + std::unique_ptr(BC->createObjectWriter(*LocStream)); + + // Add an empty list as the first entry; + Writer->writeLE64(0); + Writer->writeLE64(0); + SectionOffset += 2 * 8; +} + +// DWARF 4: 2.6.2 +uint64_t DebugLocWriter::addList(const DWARFDebugLoc::LocationList &LocList) { + if (LocList.Entries.empty()) + return getEmptyListOffset(); - for (const auto &Entry : LocList.getAbsoluteAddressRanges()) { + const auto EntryOffset = SectionOffset; + for (const auto &Entry : LocList.Entries) { Writer->writeLE64(Entry.Begin); Writer->writeLE64(Entry.End); - assert(Entry.Data && "Entry with null location expression."); - Writer->writeLE16(Entry.Data->size()); - - // Need to convert binary data from unsigned char to char. 
+ Writer->writeLE16(Entry.Loc.size()); Writer->writeBytes( - StringRef(reinterpret_cast(Entry.Data->data()), - Entry.Data->size())); - - SectionOffset += 2 * 8 + 2 + Entry.Data->size(); + StringRef(reinterpret_cast(Entry.Loc.data()), + Entry.Loc.size())); + SectionOffset += 2 * 8 + 2 + Entry.Loc.size(); } Writer->writeLE64(0); Writer->writeLE64(0); SectionOffset += 2 * 8; + + return EntryOffset; } void SimpleBinaryPatcher::addBinaryPatch(uint32_t Offset, diff --git a/bolt/DebugData.h b/bolt/DebugData.h index 7108d506aca7..c448ebed0b0c 100644 --- a/bolt/DebugData.h +++ b/bolt/DebugData.h @@ -18,6 +18,7 @@ #include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/Support/SMLoc.h" +#include "llvm/Support/raw_ostream.h" #include #include #include @@ -33,6 +34,7 @@ class MCObjectWriter; namespace bolt { +class BinaryContext; class BasicBlockTable; class BinaryBasicBlock; class BinaryFunction; @@ -76,235 +78,94 @@ struct DebugLineTableRowRef { } }; -/// Represents a list of address ranges where addresses are relative to the -/// beginning of basic blocks. Useful for converting address ranges in the input -/// binary to equivalent ranges after optimizations take place. -class BasicBlockOffsetRanges { -public: - typedef SmallVectorImpl BinaryData; - struct AbsoluteRange { - uint64_t Begin; - uint64_t End; - const BinaryData *Data; - }; - - /// Add range [BeginAddress, EndAddress) to the address ranges list. - /// \p Function is the function that contains the given address range. - void addAddressRange(BinaryFunction &Function, - uint64_t BeginAddress, - uint64_t EndAddress, - const BinaryData *Data = nullptr); - - /// Returns the list of absolute addresses calculated using the output address - /// of the basic blocks, i.e. the input ranges updated after basic block - /// addresses might have changed, together with the data associated to them. 
- std::vector getAbsoluteAddressRanges() const; - -private: - /// An address range inside one basic block. - struct BBAddressRange { - const BinaryBasicBlock *BasicBlock; - /// Beginning of the range counting from BB's start address. - uint16_t RangeBeginOffset; - /// (Exclusive) end of the range counting from BB's start address. - uint16_t RangeEndOffset; - /// Binary data associated with this range. - const BinaryData *Data; - - void print(raw_ostream &OS) const { - OS << " BasicBlock : " << BasicBlock->getName() << '\n'; - OS << " StartOffset: " << RangeBeginOffset << '\n'; - OS << " EndOffset: " << RangeEndOffset << '\n'; - } - }; - - std::vector AddressRanges; -}; - -/// Abstract interface for classes that represent objects that have -/// associated address ranges in .debug_ranges. These address ranges can -/// be serialized by DebugRangesSectionsWriter which notifies the object -/// of where in the section its address ranges list was written. -class AddressRangesOwner { -public: - virtual void setAddressRangesOffset(uint32_t Offset) = 0; - - virtual ~AddressRangesOwner() {} -}; - -/// Represents DWARF entities that have generic address ranges, maintaining -/// their address ranges to be updated on the output debugging information. -class AddressRangesDWARFObject : public AddressRangesOwner { -public: - AddressRangesDWARFObject(const DWARFCompileUnit *CU, - const DWARFDebugInfoEntryMinimal *DIE) - : CU(CU), DIE(DIE) { } - - /// Add range [BeginAddress, EndAddress) to this object. - void addAddressRange(BinaryFunction &Function, - uint64_t BeginAddress, - uint64_t EndAddress) { - BBOffsetRanges.addAddressRange(Function, BeginAddress, EndAddress); - } - - /// Add range that is guaranteed to not change. 
- void addAbsoluteRange(uint64_t BeginAddress, - uint64_t EndAddress) { - AbsoluteRanges.emplace_back(std::make_pair(BeginAddress, EndAddress)); - } - - std::vector> getAbsoluteAddressRanges() const { - auto AddressRangesWithData = BBOffsetRanges.getAbsoluteAddressRanges(); - std::vector> - AddressRanges(AddressRangesWithData.size()); - for (unsigned I = 0, S = AddressRanges.size(); I != S; ++I) { - AddressRanges[I] = std::make_pair(AddressRangesWithData[I].Begin, - AddressRangesWithData[I].End); - } - std::move(AbsoluteRanges.begin(), - AbsoluteRanges.end(), - std::back_inserter(AddressRanges)); - return AddressRanges; - } - - void setAddressRangesOffset(uint32_t Offset) { AddressRangesOffset = Offset; } - - uint32_t getAddressRangesOffset() const { return AddressRangesOffset; } - - const DWARFCompileUnit *getCompileUnit() const { return CU; } - const DWARFDebugInfoEntryMinimal *getDIE() const { return DIE; } - -private: - const DWARFCompileUnit *CU; - const DWARFDebugInfoEntryMinimal *DIE; - - BasicBlockOffsetRanges BBOffsetRanges; - - std::vector> AbsoluteRanges; - - /// Offset of the address ranges of this object in the output .debug_ranges. - uint32_t AddressRangesOffset{-1U}; -}; - - - -/// Represents DWARF location lists, maintaining their list of location -/// expressions and the address ranges in which they are valid to be updated in -/// the output debugging information. -class LocationList { -public: - LocationList(uint32_t Offset) : DebugLocOffset(Offset) { } - - /// Add a location expression that is valid in [BeginAddress, EndAddress) - /// within Function to location list. 
- void addLocation(const BasicBlockOffsetRanges::BinaryData *Expression, - BinaryFunction &Function, - uint64_t BeginAddress, - uint64_t EndAddress) { - BBOffsetRanges.addAddressRange(Function, BeginAddress, EndAddress, - Expression); - } - - std::vector - getAbsoluteAddressRanges() const { - return BBOffsetRanges.getAbsoluteAddressRanges(); - } - - uint32_t getOriginalOffset() const { return DebugLocOffset; } - -private: - BasicBlockOffsetRanges BBOffsetRanges; - - /// Offset of this location list in the input .debug_loc section. - uint32_t DebugLocOffset; -}; - /// Serializes the .debug_ranges and .debug_aranges DWARF sections. class DebugRangesSectionsWriter { public: - DebugRangesSectionsWriter() = default; + DebugRangesSectionsWriter(BinaryContext *BC); - /// Adds a range to the .debug_arange section. - void addRange(uint32_t CompileUnitOffset, uint64_t Address, uint64_t Size); + /// Add ranges for CU matching \p CUOffset and return offset into section. + uint64_t addCURanges(uint64_t CUOffset, DWARFAddressRangesVector &&Ranges); - /// Adds an address range that belongs to a given object. - /// When .debug_ranges is written, the offset of the range corresponding - /// to the function will be set using BF->setAddressRangesOffset(). - void addRange(AddressRangesOwner *ARO, uint64_t Address, uint64_t Size); + /// Add ranges with caching for \p Function. + uint64_t addRanges(const BinaryFunction *Function, + DWARFAddressRangesVector &&Ranges); - using RangesCUMapType = std::map; + /// Add ranges and return offset into section. + uint64_t addRanges(const DWARFAddressRangesVector &Ranges); /// Writes .debug_aranges with the added ranges to the MCObjectWriter. void writeArangesSection(MCObjectWriter *Writer) const; - /// Writes .debug_ranges with the added ranges to the MCObjectWriter. - void writeRangesSection(MCObjectWriter *Writer); - /// Resets the writer to a clear state. 
void reset() { CUAddressRanges.clear(); - ObjectAddressRanges.clear(); - RangesSectionOffsetCUMap.clear(); - } - - /// Return mapping of CUs to offsets in .debug_ranges. - const RangesCUMapType &getRangesOffsetCUMap() const { - return RangesSectionOffsetCUMap; } /// Returns an offset of an empty address ranges list that is always written /// to .debug_ranges - uint32_t getEmptyRangesListOffset() const { return EmptyRangesListOffset; } + uint64_t getEmptyRangesOffset() const { return EmptyRangesOffset; } /// Map DWARFCompileUnit index to ranges. - using CUAddressRangesType = - std::map>>; + using CUAddressRangesType = std::map; /// Return ranges for a given CU. const CUAddressRangesType &getCUAddressRanges() const { return CUAddressRanges; } + SmallVectorImpl *finalize() { + return RangesBuffer.release(); + } + private: + std::unique_ptr> RangesBuffer; + + std::unique_ptr RangesStream; + + std::unique_ptr Writer; + + /// Current offset in the section (updated as new entries are written). + /// Starts with 16 since the first 16 bytes are reserved for an empty range. + uint32_t SectionOffset{0}; + /// Map from compile unit offset to the list of address intervals that belong /// to that compile unit. Each interval is a pair /// (first address, interval size). CUAddressRangesType CUAddressRanges; - /// Map from BinaryFunction to the list of address intervals that belong - /// to that function, represented like CUAddressRanges. - std::map>> - ObjectAddressRanges; - /// Offset of an empty address ranges list. - uint32_t EmptyRangesListOffset; + static constexpr uint64_t EmptyRangesOffset{0}; - /// When writing data to .debug_ranges remember offset per CU. - RangesCUMapType RangesSectionOffsetCUMap; + /// Cache used for de-duplicating entries for the same function. + std::map CachedRanges; }; /// Serializes the .debug_loc DWARF section with LocationLists. class DebugLocWriter { public: - /// Writes the given location list to the writer. 
- void write(const LocationList &LocList, MCObjectWriter *Writer); + DebugLocWriter(BinaryContext *BC); + + uint64_t addList(const DWARFDebugLoc::LocationList &LocList); - using UpdatedOffsetMapType = std::map; + uint64_t getEmptyListOffset() const { return EmptyListOffset; } - /// Returns mapping from offsets in the input .debug_loc to offsets in the - /// output .debug_loc section with the corresponding updated location list - /// entry. - const UpdatedOffsetMapType &getUpdatedLocationListOffsets() const { - return UpdatedOffsets; + SmallVectorImpl *finalize() { + return LocBuffer.release(); } private: + std::unique_ptr> LocBuffer; + + std::unique_ptr LocStream; + + std::unique_ptr Writer; + + /// Offset of an empty location list. + static uint64_t const EmptyListOffset = 0; + /// Current offset in the section (updated as new entries are written). + /// Starts with 16 since the first 16 bytes are reserved for an empty range. uint32_t SectionOffset{0}; - - /// Map from input offsets to output offsets for location lists that were - /// updated, generated after write(). - UpdatedOffsetMapType UpdatedOffsets; }; /// Abstract interface for classes that apply modifications to a binary string. 
diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 7c04024a755c..9187d66bfea4 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -386,7 +386,7 @@ size_t padFunction(const BinaryFunction &Function) { } // namespace opts -constexpr const char *RewriteInstance::DebugSectionsToOverwrite[]; +constexpr const char *RewriteInstance::SectionsToOverwrite[]; const std::string RewriteInstance::OrgSecPrefix = ".bolt.org"; @@ -619,6 +619,7 @@ void RewriteInstance::reset() { EHFrame = nullptr; FailedAddresses.clear(); RangesSectionsWriter.reset(); + LocationListWriter.reset(); TotalScore = 0; } @@ -743,7 +744,6 @@ void RewriteInstance::run() { discoverFileObjects(); readDebugInfo(); disassembleFunctions(); - readFunctionDebugInfo(); runOptimizationPasses(); emitFunctions(); @@ -758,7 +758,6 @@ void RewriteInstance::run() { discoverFileObjects(); readDebugInfo(); disassembleFunctions(); - readFunctionDebugInfo(); runOptimizationPasses(); emitFunctions(); } @@ -790,7 +789,6 @@ void RewriteInstance::run() { FunctionIt->second.setSimple(false); } - readFunctionDebugInfo(); runOptimizationPasses(); emitFunctions(); } @@ -1623,13 +1621,6 @@ void RewriteInstance::readDebugInfo() { BC->preprocessDebugInfo(BinaryFunctions); } -void RewriteInstance::readFunctionDebugInfo() { - if (!opts::UpdateDebugSections) - return; - - BC->preprocessFunctionDebugInfo(BinaryFunctions); -} - void RewriteInstance::disassembleFunctions() { // Disassemble every function and build it's control flow graph. TotalScore = 0; @@ -1867,13 +1858,11 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, } Section->setHasInstructions(true); + BC.Ctx->addGenDwarfSection(Section); Streamer.SwitchSection(Section); - if (!opts::Relocs) - Streamer.setCodeSkew(EmitColdPart ? 
0 : Function.getAddress()); - if (opts::Relocs) { Streamer.EmitCodeAlignment(std::max((unsigned)opts::AlignFunctions, BinaryFunction::MinAlign), @@ -1881,6 +1870,7 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, BinaryFunction::MinAlign - 1)); } else { Streamer.EmitCodeAlignment(Function.getAlignment()); + Streamer.setCodeSkew(EmitColdPart ? 0 : Function.getAddress()); } MCContext &Context = Streamer.getContext(); @@ -2100,7 +2090,7 @@ void RewriteInstance::emitFunctions() { Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("__hot_end")); } - if (opts::UpdateDebugSections) + if (!opts::Relocs && opts::UpdateDebugSections) updateDebugLineInfoForNonSimpleFunctions(); emitDataSections(Streamer.get()); @@ -2365,8 +2355,11 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; - if (!Function.isEmitted()) + if (!Function.isEmitted()) { + Function.setOutputAddress(Function.getAddress()); + Function.setOutputSize(Function.getSize()); continue; + } if (opts::Relocs) { const auto BaseAddress = NewTextSectionStartAddress; @@ -2386,7 +2379,6 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { const auto ColdEndOffset = Layout.getSymbolOffset(*ColdEndSymbol); Function.cold().setAddress(BaseAddress + ColdStartOffset); Function.cold().setImageSize(ColdEndOffset - ColdStartOffset); - } } else { Function.setOutputAddress(Function.getAddress()); @@ -3475,7 +3467,7 @@ uint64_t RewriteInstance::getFileOffsetForAddress(uint64_t Address) const { } bool RewriteInstance::willOverwriteSection(StringRef SectionName) { - for (auto &OverwriteName : DebugSectionsToOverwrite) { + for (auto &OverwriteName : SectionsToOverwrite) { if (SectionName == OverwriteName) return true; } @@ -3512,3 +3504,24 @@ RewriteInstance::getBinaryFunctionAtAddress(uint64_t Address) const { return BC->getFunctionForSymbol(Symbol); } + +DWARFAddressRangesVector 
RewriteInstance::translateModuleAddressRanges( + const DWARFAddressRangesVector &InputRanges) const { + DWARFAddressRangesVector OutputRanges; + + for (const auto Range : InputRanges) { + auto BFI = BinaryFunctions.lower_bound(Range.first); + while (BFI != BinaryFunctions.end()) { + const auto &Function = BFI->second; + if (Function.getAddress() >= Range.second) + break; + const auto FunctionRanges = Function.getOutputAddressRanges(); + std::move(std::begin(FunctionRanges), + std::end(FunctionRanges), + std::back_inserter(OutputRanges)); + std::advance(BFI, 1); + } + } + + return OutputRanges; +} diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 1253cd9cec06..bef1e0b9642f 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -170,10 +170,6 @@ class RewriteInstance { /// Read information from debug sections. void readDebugInfo(); - /// Read information from debug sections that depends on disassembled - /// functions. - void readFunctionDebugInfo(); - /// Disassemble each function in the binary and associate it with a /// BinaryFunction object, preparing all information necessary for binary /// optimization. @@ -198,6 +194,13 @@ class RewriteInstance { /// Update debug information in the file for re-written code. void updateDebugInfo(); + /// Recursively update debug info for all DIEs in \p Unit. + /// If \p Function is not empty, it points to a function corresponding + /// to a parent DW_TAG_subprogram node of the current \p DIE. + void updateUnitDebugInfo(DWARFCompileUnit *Unit, + const DWARFDebugInfoEntryMinimal *DIE, + std::vector FunctionStack); + /// Map all sections to their final addresses. void mapFileSections(orc::ObjectLinkingLayer<>::ObjSetHandleT &ObjectsHandle); @@ -254,6 +257,10 @@ class RewriteInstance { const BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address) const; + /// Produce output address ranges based on input ranges for some module. 
+ DWARFAddressRangesVector translateModuleAddressRanges( + const DWARFAddressRangesVector &InputRanges) const; + private: /// Detect addresses and offsets available in the binary for allocating @@ -317,31 +324,8 @@ class RewriteInstance { /// and updates stmt_list for a corresponding compile unit. void updateLineTableOffsets(); - /// Adds an entry to be saved in the .debug_aranges/.debug_ranges section. - /// \p OriginalFunctionAddress function's address in the original binary, - /// used for compile unit lookup. - /// \p RangeBegin first address of the address range being added. - /// \p RangeSie size in bytes of the address range. - void addDebugRangesEntry(uint64_t OriginalFunctionAddress, - uint64_t RangeBegin, - uint64_t RangeSize); - - /// Update internal function ranges after functions have been written. - void updateFunctionRanges(); - - /// Update objects with address ranges after optimization. - void updateAddressRangesObjects(); - - /// If we've never mapped the unit, e.g. because there were no functions - /// marked in DWARF, update with the original ranges so that we can free up - /// the old part of .debug_ranges. - void updateEmptyModuleRanges(); - - /// Generate new contents for .debug_loc. - void updateLocationLists(); - /// Generate new contents for .debug_ranges and .debug_aranges section. - void generateDebugRanges(); + void finalizeDebugSections(); /// Patches the binary for DWARF address ranges (e.g. in functions and lexical /// blocks) to be updated. @@ -357,15 +341,9 @@ class RewriteInstance { /// new address ranges in the output binary. /// \p Unit Compile uniit the object belongs to. /// \p DIE is the object's DIE in the input binary. - void updateDWARFObjectAddressRanges(uint32_t DebugRangesOffset, - const DWARFUnit *Unit, - const DWARFDebugInfoEntryMinimal *DIE); - - /// Updates pointers in .debug_info to location lists in .debug_loc. 
- void updateLocationListPointers( - const DWARFUnit *Unit, - const DWARFDebugInfoEntryMinimal *DIE, - const std::map &UpdatedOffsets); + void updateDWARFObjectAddressRanges(const DWARFUnit *Unit, + const DWARFDebugInfoEntryMinimal *DIE, + uint64_t DebugRangesOffset); /// Return file offset corresponding to a given virtual address. uint64_t getFileOffsetFor(uint64_t Address) { @@ -392,10 +370,11 @@ class RewriteInstance { private: /// When updating debug info, these are the sections we overwrite. - static constexpr const char *DebugSectionsToOverwrite[] = { + static constexpr const char *SectionsToOverwrite[] = { ".shstrtab", ".debug_aranges", ".debug_line", + ".debug_loc", ".debug_ranges", ".gdb_index", }; @@ -452,7 +431,9 @@ class RewriteInstance { /// Stores and serializes information that will be put into the .debug_ranges /// and .debug_aranges DWARF sections. - DebugRangesSectionsWriter RangesSectionsWriter; + std::unique_ptr RangesSectionsWriter; + + std::unique_ptr LocationListWriter; /// Patchers used to apply simple changes to sections of the input binary. /// Maps section name -> patcher. From b4c3b555a4ca4fcd7131c9909a0e9bdf55b93d17 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 19 May 2017 14:45:46 -0700 Subject: [PATCH 253/904] Don't add useless uncond branch to fallthroughs when running SCTC. Summary: SCTC was sometimes adding unconditional branches to fallthrough blocks. This diff checks to see if the unconditional branch is really necessary, e.g. it's not to a fallthrough block. 
(cherry picked from commit e793be4f6af16f1e6d4ca2204472ac9943d7d07a) --- bolt/BinaryFunction.cpp | 3 +- bolt/Passes/BinaryPasses.cpp | 58 ++++++++++++++++++++++++++---------- 2 files changed, 45 insertions(+), 16 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index a5bd8e531460..d40b635eb636 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -149,7 +149,8 @@ namespace bolt { constexpr unsigned NoRegister = 0; constexpr const char *DynoStats::Desc[]; - +constexpr unsigned BinaryFunction::MinAlign; + namespace { /// Gets debug line information for the instruction located at the given diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index d425bc8eee57..0c989f9fef9f 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -569,6 +569,15 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, auto &MIA = BC.MIA; uint64_t NumLocalCTCCandidates = 0; uint64_t NumLocalCTCs = 0; + std::vector> + NeedsUncondBranch; + + // Will block be deleted by UCE? + auto isValid = [](const BinaryBasicBlock *BB) { + return (BB->pred_size() != 0 || + BB->isLandingPad() || + BB->isEntryPoint()); + }; for (auto *BB : BF.layout()) { // Locate BB with a single direct tail-call instruction. @@ -623,18 +632,7 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, // the target for the unconditional branch or add a unconditional // branch to the old target. This has to be done manually since // fixupBranches is not called after SCTC. - if (UncondBranch) { - MIA->replaceBranchTarget(*UncondBranch, - CondSucc->getLabel(), - BC.Ctx.get()); - } else { - MCInst Branch; - auto Result = MIA->createUncondBranch(Branch, - CondSucc->getLabel(), - BC.Ctx.get()); - assert(Result); - PredBB->addInstruction(Branch); - } + NeedsUncondBranch.emplace_back(std::make_tuple(BB, PredBB, CondSucc)); // Swap branch statistics after swapping the branch targets. 
auto BI = PredBB->branch_info_begin(); std::swap(*BI, *(BI + 1)); @@ -651,9 +649,39 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, } // Remove the block from CFG if all predecessors were removed. - BB->markValid(BB->pred_size() != 0 || - BB->isLandingPad() || - BB->isEntryPoint()); + BB->markValid(isValid(BB)); + } + + // Add unconditional branches at the end of BBs to new successors + // as long as the successor is not a fallthrough. + for (auto &Entry : NeedsUncondBranch) { + auto *BB = std::get<0>(Entry); + auto *PredBB = std::get<1>(Entry); + auto *CondSucc = std::get<2>(Entry); + + const MCSymbol *TBB = nullptr; + const MCSymbol *FBB = nullptr; + MCInst *CondBranch = nullptr; + MCInst *UncondBranch = nullptr; + PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); + + // Only add a new branch if the target is not the fall-through. + if (BF.getBasicBlockAfter(BB) != CondSucc || isValid(BB)) { + if (UncondBranch) { + MIA->replaceBranchTarget(*UncondBranch, + CondSucc->getLabel(), + BC.Ctx.get()); + } else { + MCInst Branch; + auto Result = MIA->createUncondBranch(Branch, + CondSucc->getLabel(), + BC.Ctx.get()); + assert(Result); + PredBB->addInstruction(Branch); + } + } else if (UncondBranch) { + PredBB->eraseInstruction(UncondBranch); + } } if (NumLocalCTCs > 0) { From c7e7ff6cf1633a4bb262c6527b91ece71caeaec5 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 1 May 2017 14:04:40 -0700 Subject: [PATCH 254/904] [BOLT] Optimize jump tables with hot entries Summary: This diff is similar to Bill's diff for optimizing jump tables (and is built on top of it), but it differs in the strategy used to optimize the jump table. The previous approach loads the target address from the jump table and compare it to check if it is a hot target. This accomplishes branch misprediction reduction by promote the indirect jmp to a (more predictable) direct jmp. 
load %r10, JMPTABLE cmp %r10, HOTTARGET je HOTTARGET ijmp [JMPTABLE + %index * scale] The idea in this diff is instead to make dcache better by avoiding the load of the jump table, leaving branch mispredictions as a secondary target. To do this we compare the index used in the indirect jmp and if it matches a known hot entry, it performs a direct jump to the target. cmp %index, HOTINDEX je CORRESPONDING_TARGET ijmp [JMPTABLE + %index * scale] The downside of this approach is that we may have multiple indices associated with a single target, but we only have profiling to show which targets are hot and we have no clue about which indices are hot. INDEX TARGET 0 4004f8 8 4004f8 10 4003d0 18 4004f8 Profiling data: TARGET COUNT 4004f8 10020 4003d0 17 In this example, we know 4004f8 is hot, but to make a direct call to it we need to check for indices 0, 8 and 18 -- 3 comparisons instead of 1. Therefore, once we know a target is hot, we must generate code to compare against all possible indices associated with this target because we don't know which index is the hot one (IF there's a hotter index). cmp %index, 0 je 4004f8 cmp %index, 8 je 4004f8 cmp %index, 18 je 4004f8 (... 
up to N comparisons as in --indirect-call-promotion-topn=N ) ijmp [JMPTABLE + %index * scale] (cherry picked from commit 2b75e1c97f1b1cf4c3b4e2ec0ee507f7dc428217) --- bolt/BinaryFunction.cpp | 11 +- bolt/BinaryFunction.h | 2 + bolt/Passes/BinaryPasses.h | 3 +- bolt/Passes/FrameAnalysis.cpp | 6 +- bolt/Passes/FrameAnalysis.h | 13 ++- bolt/Passes/IndirectCallPromotion.cpp | 148 +++++++++++++++++++------- bolt/Passes/IndirectCallPromotion.h | 6 +- 7 files changed, 142 insertions(+), 47 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index d40b635eb636..62b4474e28fc 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -771,6 +771,11 @@ BinaryFunction::analyzeIndirectBranch(MCInst &Instruction, &DispExpr)) return IndirectBranchType::UNKNOWN; + // Do not set annotate with index reg if address was precomputed earlier + // and reg may not be live at the jump site. + if (MemLocInstr != &Instruction) + IndexRegNum = 0; + if ((BaseRegNum != bolt::NoRegister && BaseRegNum != RIPRegister) || SegRegNum != bolt::NoRegister) return IndirectBranchType::UNKNOWN; @@ -800,7 +805,7 @@ BinaryFunction::analyzeIndirectBranch(MCInst &Instruction, // Check if there's already a jump table registered at this address. if (auto *JT = getJumpTableContainingAddress(ArrayStart)) { - auto JTOffset = ArrayStart - JT->Address; + auto JTOffset = ArrayStart - JT->Address; if (Type == IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE && JTOffset != 0) { // Adjust the size of this jump table and create a new one if necessary. 
// We cannot re-use the entries since the offsets are relative to the @@ -829,7 +834,7 @@ BinaryFunction::analyzeIndirectBranch(MCInst &Instruction, } BC.MIA->replaceMemOperandDisp(*MemLocInstr, LI->second, BC.Ctx.get()); - BC.MIA->setJumpTable(Instruction, ArrayStart); + BC.MIA->setJumpTable(BC.Ctx.get(), Instruction, ArrayStart, IndexRegNum); JTSites.emplace_back(Offset, ArrayStart); @@ -912,7 +917,7 @@ BinaryFunction::analyzeIndirectBranch(MCInst &Instruction, std::move(JTOffsetCandidates), {{0, JTStartLabel}}}); BC.MIA->replaceMemOperandDisp(*MemLocInstr, JTStartLabel, BC.Ctx.get()); - BC.MIA->setJumpTable(Instruction, ArrayStart); + BC.MIA->setJumpTable(BC.Ctx.get(), Instruction, ArrayStart, IndexRegNum); JTSites.emplace_back(Offset, ArrayStart); diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 4e65fbdf70e5..bc17055cc25d 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -565,6 +565,7 @@ class BinaryFunction { /// /// The jump table may include other jump tables that are referenced by /// a different label at a different offset in this jump table. +public: struct JumpTable { enum JumpTableType : char { JTT_NORMAL, @@ -634,6 +635,7 @@ class BinaryFunction { /// Print for debugging purposes. void print(raw_ostream &OS) const; }; +private: /// All compound jump tables for this function. /// -> diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index f7345419a995..2f97f8326a50 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -29,8 +29,9 @@ namespace bolt { /// An optimization/analysis pass that runs on functions. 
class BinaryFunctionPass { - const cl::opt &PrintPass; protected: + const cl::opt &PrintPass; + explicit BinaryFunctionPass(const cl::opt &PrintPass) : PrintPass(PrintPass) { } diff --git a/bolt/Passes/FrameAnalysis.cpp b/bolt/Passes/FrameAnalysis.cpp index a5aadf9522f9..1e38c7748229 100644 --- a/bolt/Passes/FrameAnalysis.cpp +++ b/bolt/Passes/FrameAnalysis.cpp @@ -404,7 +404,7 @@ void FrameAnalysis::buildClobberMap(const BinaryContext &BC) { Queue.pop(); BitVector RegsKilled = getFunctionClobberList(BC, Func); - bool Updated = computeArgsAccessed(BC, *Func); + bool Updated = ClobberAnalysisOnly ? false : computeArgsAccessed(BC, *Func); if (RegsKilledMap.find(Func) == RegsKilledMap.end()) { RegsKilledMap[Func] = std::move(RegsKilled); @@ -652,6 +652,10 @@ void FrameAnalysis::runOnFunctions(BinaryContext &BC, NamedRegionTimer T1("build clobber map", "FOP breakdown", true); buildClobberMap(BC); } + + if (ClobberAnalysisOnly) + return; + for (auto &I : BFs) { auto Count = I.second.getExecutionCount(); if (Count != BinaryFunction::COUNT_NO_PROFILE) diff --git a/bolt/Passes/FrameAnalysis.h b/bolt/Passes/FrameAnalysis.h index c2c2938d60b2..12a350572f16 100644 --- a/bolt/Passes/FrameAnalysis.h +++ b/bolt/Passes/FrameAnalysis.h @@ -92,7 +92,7 @@ raw_ostream &operator<<(raw_ostream &OS, /// Initialization: /// /// FrameAnalysis FA(PrintPass); -/// RA.runOnFunctions(BC, BFs, LargeFunctions); +/// FA.runOnFunctions(BC, BFs, LargeFunctions); /// /// Usage (fetching frame access information about a given instruction): /// @@ -155,6 +155,11 @@ class FrameAnalysis : public BinaryFunctionPass { uint64_t CountFunctionsFailedRestoreFI{0}; uint64_t CountDenominator{0}; + /// If this flag is set to true, the analysis will never run completely, + /// but will stop after callgraph and a clobber analysis for every function + /// has been computed. 
+ bool ClobberAnalysisOnly{false}; + /// Convenience functions for appending MCAnnotations to instructions with /// our specific data void addArgAccessesFor(const BinaryContext &BC, MCInst &Inst, @@ -203,8 +208,10 @@ class FrameAnalysis : public BinaryFunctionPass { bool restoreFrameIndex(const BinaryContext &BC, BinaryFunction &BF); public: - explicit FrameAnalysis(const cl::opt &PrintPass) - : BinaryFunctionPass(PrintPass) { + explicit FrameAnalysis(const cl::opt &PrintPass, + bool ClobberAnalysisOnly=false) + : BinaryFunctionPass(PrintPass), + ClobberAnalysisOnly(ClobberAnalysisOnly) { // Position 0 of the vector should be always associated with "assume access // everything". ArgAccessesVector.emplace_back(ArgAccesses(/*AssumeEverything*/ true)); diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/Passes/IndirectCallPromotion.cpp index 7fb92df815ec..7d7311347d6a 100644 --- a/bolt/Passes/IndirectCallPromotion.cpp +++ b/bolt/Passes/IndirectCallPromotion.cpp @@ -10,6 +10,7 @@ //===----------------------------------------------------------------------===// #include "IndirectCallPromotion.h" +#include "DataflowInfoManager.h" #include "llvm/Support/Options.h" #define DEBUG_TYPE "ICP" @@ -90,6 +91,12 @@ ICPOldCodeSequence( cl::Hidden, cl::cat(BoltOptCategory)); +static cl::opt ICPJumpTablesByTarget( + "icp-jump-tables-targets", + cl::desc( + "for jump tables, optimize indirect jmp targets instead of indices"), + cl::init(false), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); + } // namespace opts namespace llvm { @@ -125,19 +132,27 @@ IndirectCallPromotion::getCallTargets( std::vector Targets; if (const auto *JT = BF.getJumpTable(Inst)) { + // Don't support PIC jump tables for now + if (!opts::ICPJumpTablesByTarget && + JT->Type == BinaryFunction::JumpTable::JTT_PIC) + return Targets; const Location From(BF.getSymbol()); const auto Range = JT->getEntriesForAddress(BC.MIA->getJumpTable(Inst)); assert(JT->Counts.empty() || JT->Counts.size() >= 
Range.second); BinaryFunction::JumpInfo DefaultJI; const auto *JI = JT->Counts.empty() ? &DefaultJI : &JT->Counts[Range.first]; const size_t JIAdj = JT->Counts.empty() ? 0 : 1; + assert(JT->Type == BinaryFunction::JumpTable::JTT_PIC || + JT->EntrySize == BC.AsmInfo->getPointerSize()); for (size_t I = Range.first; I < Range.second; ++I, JI += JIAdj) { auto *Entry = JT->Entries[I]; assert(BF.getBasicBlockForLabel(Entry) || Entry == BF.getFunctionEndLabel() || Entry == BF.getFunctionColdEndLabel()); const Location To(Entry); - Callsite CS{From, To, JI->Mispreds, JI->Count, BranchHistories()}; + Callsite CS{ + From, To, JI->Mispreds, JI->Count, BranchHistories(), + I - Range.first}; Targets.emplace_back(CS); } @@ -154,9 +169,10 @@ IndirectCallPromotion::getCallTargets( return A.To.Addr < B.To.Addr; }); - // TODO: I'm going to leave this as is since it will be fixed in - // D5005620 and it ought to make merging easier if there are fewer - // changes. + // Targets may contain multiple entries to the same target, but using + // different indices. Their profile will report the same number of branches + // for different indices if the target is the same. That's because we don't + // profile the index value, but only the target via LBR. 
auto First = Targets.begin(); auto Last = Targets.end(); auto Result = First; @@ -164,19 +180,17 @@ IndirectCallPromotion::getCallTargets( auto &A = *Result; const auto &B = *First; if (A.To.IsSymbol && B.To.IsSymbol && A.To.Sym == B.To.Sym) { - A.Mispreds += B.Mispreds; - A.Branches += B.Branches; + A.JTIndex.insert(A.JTIndex.end(), B.JTIndex.begin(), B.JTIndex.end()); } else { *(++Result) = *First; } } ++Result; - DEBUG( - if (Targets.end() - Result > 0) { - dbgs() << "BOLT-INFO: ICP: " << (Targets.end() - Result) - << " duplicate targets removed\n"; - }); + DEBUG(if (Targets.end() - Result > 0) { + dbgs() << "BOLT-INFO: ICP: " << (Targets.end() - Result) + << " duplicate targets removed\n"; + }); Targets.erase(Result, Targets.end()); } else { @@ -241,9 +255,18 @@ IndirectCallPromotion::findCallTargetSymbols( ) const { std::vector> SymTargets; - for (size_t I = 0; I < N; ++I) { - assert(Targets[I].To.IsSymbol && "All ICP targets must be to known symbols"); - SymTargets.push_back(std::make_pair(Targets[I].To.Sym, 0)); + size_t TgtIdx = 0; + for (size_t I = 0; I < N; ++TgtIdx) { + assert(Targets[TgtIdx].To.IsSymbol && "All ICP targets must be to known symbols"); + if (Targets[TgtIdx].JTIndex.empty()) { + SymTargets.push_back(std::make_pair(Targets[TgtIdx].To.Sym, 0)); + ++I; + } else { + for (auto Idx : Targets[TgtIdx].JTIndex) { + SymTargets.push_back(std::make_pair(Targets[TgtIdx].To.Sym, Idx)); + ++I; + } + } } return SymTargets; @@ -357,11 +380,18 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( // New BinaryBranchInfo scaled to the execution count of the original BB. 
std::vector BBI; for (auto Itr = Targets.begin(); Itr != Targets.end(); ++Itr) { - BBI.push_back( - BinaryBranchInfo{ - uint64_t(TotalCount * ((double)Itr->Branches / TotalIndirectBranches)), - uint64_t(TotalMispreds * ((double)Itr->Mispreds / TotalIndirectMispreds)) - }); + const auto BranchPct = (double)Itr->Branches / TotalIndirectBranches; + const auto MispredPct = (double)Itr->Mispreds / TotalIndirectMispreds; + if (Itr->JTIndex.empty()) { + BBI.push_back(BinaryBranchInfo{uint64_t(TotalCount * BranchPct), + uint64_t(TotalMispreds * MispredPct)}); + continue; + } + for (size_t I = 0, E = Itr->JTIndex.size(); I != E; ++I) { + BBI.push_back( + BinaryBranchInfo{uint64_t(TotalCount * (BranchPct / E)), + uint64_t(TotalMispreds * (MispredPct / E))}); + } } auto BI = BBI.begin(); @@ -377,11 +407,22 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( moveSuccessors(IndCallBlock, NewBBs.back().get()); } + std::vector SymTargets; + for (size_t I = 0; I < Targets.size(); ++I) { + assert(Targets[I].To.IsSymbol); + if (Targets[I].JTIndex.empty()) + SymTargets.push_back(Targets[I].To.Sym); + else { + for (size_t Idx = 0, E = Targets[I].JTIndex.size(); Idx != E; ++Idx) { + SymTargets.push_back(Targets[I].To.Sym); + } + } + } + // Fix up successors and execution counts. 
updateCurrentBranchInfo(); if (IsJumpTable) { - assert(Targets[0].To.IsSymbol); - auto *Succ = Function.getBasicBlockForLabel(Targets[0].To.Sym); + auto *Succ = Function.getBasicBlockForLabel(SymTargets[0]); IndCallBlock->addSuccessor(Succ, BBI[0]); // cond branch } IndCallBlock->addSuccessor(NewBBs[0].get(), TotalCount); // fallthru branch @@ -392,8 +433,7 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( uint64_t ExecCount = BBI[I+1].Count; updateCurrentBranchInfo(); if (IsJumpTable) { - assert(Targets[I+1].To.IsSymbol); - auto *Succ = Function.getBasicBlockForLabel(Targets[I+1].To.Sym); + auto *Succ = Function.getBasicBlockForLabel(SymTargets[I+1]); NewBBs[I]->addSuccessor(Succ, BBI[I+1]); } NewBBs[I]->addSuccessor(NewBBs[I+1].get(), TotalCount); // fallthru @@ -411,7 +451,7 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( MergeBlock = NewBBs.back().get(); moveSuccessors(IndCallBlock, MergeBlock); - + // Fix up successors and execution counts. updateCurrentBranchInfo(); IndCallBlock->addSuccessor(NewBBs[1].get(), TotalCount); // cond branch @@ -495,7 +535,8 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB, uint64_t TotalMispredictsTopN = 0; size_t N = 0; - if (opts::IndirectCallPromotionUseMispredicts) { + if (opts::IndirectCallPromotionUseMispredicts && + (!IsJumpTable || opts::ICPJumpTablesByTarget)) { // Count total number of mispredictions for (at most) the top N targets. // We may choose a smaller N (TrialN vs. N) if the frequency threshold // is exceeded by fewer targets. @@ -531,7 +572,10 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB, // We may choose a smaller N (TrialN vs. N) if the frequency threshold // is exceeded by fewer targets. double Threshold = double(opts::IndirectCallPromotionThreshold); - for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) { + for (size_t I = 0; I < TrialN && Threshold > 0; ++I) { + if (N + (Targets[I].JTIndex.empty() ? 
1 : Targets[I].JTIndex.size()) > + TrialN) + break; const auto Frequency = (100.0 * Targets[I].Branches) / NumCalls; TotalCallsTopN += Targets[I].Branches; TotalMispredictsTopN += Targets[I].Mispreds; @@ -540,6 +584,7 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB, else TotalNumFrequentJmps += Targets[I].Branches; Threshold -= Frequency; + N += Targets[I].JTIndex.empty() ? 1 : Targets[I].JTIndex.size(); } // Compute the frequency of the top N call targets. If this frequency @@ -559,6 +604,11 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB, return 0; } + // Don't check misprediction frequency for jump tables -- we don't really + // care as long as we are saving loads from the jump table. + if (IsJumpTable && !opts::ICPJumpTablesByTarget) + return N; + // Compute the misprediction frequency of the top N call targets. If // this frequency is less than the threshold, we should skip ICP at // this callsite. @@ -629,6 +679,10 @@ void IndirectCallPromotion::runOnFunctions( if (opts::IndirectCallPromotion == ICP_NONE) return; + FrameAnalysis FA(PrintPass, /*ClobberAnalysisOnly=*/true); + if (opts::IndirectCallPromotion >= ICP_JUMP_TABLES) + FA.runOnFunctions(BC, BFs, LargeFunctions); + for (auto &BFIt : BFs) { auto &Function = BFIt.second; @@ -659,7 +713,10 @@ void IndirectCallPromotion::runOnFunctions( BBs.push_back(&BB); } } + if (BBs.empty()) + continue; + DataflowInfoManager Info(&FA, BC, Function); while (!BBs.empty()) { auto *BB = BBs.back(); BBs.pop_back(); @@ -677,7 +734,7 @@ void IndirectCallPromotion::runOnFunctions( const bool OptimizeJumpTables = (opts::IndirectCallPromotion == ICP_JUMP_TABLES || opts::IndirectCallPromotion == ICP_ALL); - + if (!((HasBranchData && OptimizeCalls) || (IsJumpTable && OptimizeJumpTables))) continue; @@ -701,6 +758,19 @@ void IndirectCallPromotion::runOnFunctions( else FuncTotalIndirectJmps += NumCalls; + // If FLAGS regs is alive after this jmp site, do not try + // promoting because 
we will clobber FLAGS. + if (IsJumpTable && (*Info.getLivenessAnalysis().getStateBefore( + Inst))[BC.MIA->getFlagsReg()]) { + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: ICP failed in " << Function << " @ " + << InstIdx << " in " << BB->getName() + << ", calls = " << NumCalls + << ", cannot clobber flags reg.\n"; + } + continue; + } + // Should this callsite be optimized? Return the number of targets // to use when promoting this call. A value of zero means to skip // this callsite. @@ -720,20 +790,21 @@ void IndirectCallPromotion::runOnFunctions( if (SymTargets.size() < N) { const auto LastTarget = SymTargets.size(); if (opts::Verbosity >= 1) { - outs() << "BOLT-INFO: ICP failed to find target symbol for " - << Targets[LastTarget].To.Sym->getName() << " in " - << Function << " @ " << InstIdx << " in " - << BB->getName() << ", calls = " << NumCalls << "\n"; + outs() << "BOLT-INFO: ICP failed in " << Function << " @ " + << InstIdx << " in " << BB->getName() + << ", calls = " << NumCalls + << ", ICP failed to find target symbol for " + << Targets[LastTarget].To.Sym->getName() << "\n"; } continue; } // Generate new promoted call code for this callsite. auto ICPcode = - BC.MIA->indirectCallPromotion(Inst, - SymTargets, - opts::ICPOldCodeSequence, - BC.Ctx.get()); + (IsJumpTable && !opts::ICPJumpTablesByTarget) + ? 
BC.MIA->jumpTablePromotion(Inst, SymTargets, BC.Ctx.get()) + : BC.MIA->indirectCallPromotion( + Inst, SymTargets, opts::ICPOldCodeSequence, BC.Ctx.get()); if (ICPcode.empty()) { if (opts::Verbosity >= 1) { @@ -793,6 +864,9 @@ void IndirectCallPromotion::runOnFunctions( TotalIndirectJmps += FuncTotalIndirectJmps; } + if (opts::IndirectCallPromotion >= ICP_JUMP_TABLES) + FA.cleanAnnotations(BC, BFs); + outs() << "BOLT-INFO: ICP total indirect callsites = " << TotalIndirectCallsites << "\n" @@ -814,12 +888,12 @@ void IndirectCallPromotion::runOnFunctions( << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) / std::max(TotalIndirectCallsites, 1ul)) << "%\n" - << "BOLT-INFO: ICP percentage of jump table calls that can be " + << "BOLT-INFO: ICP percentage of indirect branches that are " "optimized = " << format("%.1f", (100.0 * TotalNumFrequentJmps) / std::max(TotalIndirectJmps, 1ul)) << "%\n" - << "BOLT-INFO: ICP percentage of jump table calls that are optimized = " + << "BOLT-INFO: ICP percentage of jump table callsites that are optimized = " << format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) / std::max(TotalJumpTableCallsites, 1ul)) << "%\n"; diff --git a/bolt/Passes/IndirectCallPromotion.h b/bolt/Passes/IndirectCallPromotion.h index eb7e583d6ec1..43dc6183f00d 100644 --- a/bolt/Passes/IndirectCallPromotion.h +++ b/bolt/Passes/IndirectCallPromotion.h @@ -117,15 +117,17 @@ class IndirectCallPromotion : public BinaryFunctionPass { uint64_t Mispreds{0}; uint64_t Branches{0}; BranchHistories Histories; + // Indices in the jmp table (jt only) + std::vector JTIndex; bool isValid() const { return From.isValid() && To.isValid(); } Callsite(BinaryFunction &BF, const BranchInfo &BI); Callsite(const Location &From, const Location &To, uint64_t Mispreds, uint64_t Branches, - const BranchHistories &Histories) + const BranchHistories &Histories, uint64_t JTIndex) : From(From), To(To), Mispreds(Mispreds), Branches(Branches), - Histories(Histories) { } + 
Histories(Histories), JTIndex(1, JTIndex) { } }; std::unordered_set Modified; From 3fe5abc73029ce21c766920931761a384d028caa Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 24 May 2017 14:14:16 -0700 Subject: [PATCH 255/904] Add .bolt_info notes section containing BOLT revision and command line args. Summary: Optionally add a .bolt_info notes section containing BOLT revision and command line args. The new section is controlled by the -add-bolt-info flag which is on by default. (cherry picked from commit 52813470cfbeb7ee2408bfa0de8b919be5755ddb) --- bolt/CMakeLists.txt | 43 +++++++++++++++++++++++++++++++++++++ bolt/RewriteInstance.cpp | 46 +++++++++++++++++++++++++++++++++++++++- bolt/RewriteInstance.h | 10 ++++++++- bolt/llvm-bolt.cpp | 15 ++++++++++++- 4 files changed, 111 insertions(+), 3 deletions(-) diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 95b9e1fe4019..c8e2f3e3c45b 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -1,6 +1,49 @@ add_subdirectory(merge-fdata) add_subdirectory(Passes) +# Get the current git revision for BOLT. +function(get_version ofn) + find_program(git_executable NAMES git git.exe git.cmd) + if (git_executable) + execute_process(COMMAND ${git_executable} rev-parse HEAD + WORKING_DIRECTORY ${LLVM_MAIN_SRC_DIR} + TIMEOUT 5 + RESULT_VARIABLE git_result + OUTPUT_VARIABLE git_output) + if( git_result EQUAL 0 ) + string(STRIP "${git_output}" git_ref_id) + set(BOLT_REVISION "${git_ref_id}") + endif() + endif() + + # If we can't find a revision, set it to "". + if (NOT BOLT_REVISION) + set(BOLT_REVISION "") + endif() + + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn} + COMMAND echo '"${BOLT_REVISION}"' > ${CMAKE_CURRENT_BINARY_DIR}/${ofn} + COMMENT "Generating bogus ${ofn}..." 
+ ) + + set(VERSION_OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn} PARENT_SCOPE) + + # `make clean' must remove all those generated files: + set_property(DIRECTORY APPEND + PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${ofn}) + set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/${ofn} PROPERTIES + GENERATED 1) +endfunction() + +# Creates a public target for generating the revision file. +function(add_public_gen_version_target target) + add_custom_target(${target} DEPENDS ${VERSION_OUTPUT}) + set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} ${target} PARENT_SCOPE) +endfunction() + +get_version(BoltRevision.inc) +add_public_gen_version_target(GenBoltRevision) + set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} BOLTPasses diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 9187d66bfea4..01d117a7b493 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -51,6 +51,7 @@ #include "llvm/Support/TargetSelect.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include #include @@ -311,6 +312,13 @@ Verbosity("v", cl::ZeroOrMore, cl::cat(BoltCategory)); +static cl::opt +AddBoltInfo("add-bolt-info", + cl::desc("add BOLT version and command line argument information to " + "processed binaries"), + cl::init(true), + cl::cat(BoltCategory)); + // Check against lists of functions from options if we should // optimize the function with a given name. 
bool shouldProcess(const BinaryFunction &Function) { @@ -392,6 +400,12 @@ const std::string RewriteInstance::OrgSecPrefix = ".bolt.org"; const std::string RewriteInstance::BOLTSecPrefix = ".bolt"; +namespace llvm { +namespace bolt { +extern const char *BoltRevision; +} +} + static void report_error(StringRef Message, std::error_code EC) { assert(EC); errs() << "BOLT-ERROR: '" << Message << "': " << EC.message() << ".\n"; @@ -597,8 +611,12 @@ std::unique_ptr createBinaryContext( } // namespace RewriteInstance::RewriteInstance(ELFObjectFileBase *File, - const DataReader &DR) + const DataReader &DR, + const int Argc, + const char *const *Argv) : InputFile(File), + Argc(Argc), + Argv(Argv), BC(createBinaryContext("x86-64", "x86_64-unknown-linux", DR, std::unique_ptr( new DWARFContextInMemory(*InputFile, nullptr, true)))) { @@ -796,6 +814,8 @@ void RewriteInstance::run() { if (opts::UpdateDebugSections) updateDebugInfo(); + addBoltInfoSection(); + // Copy allocatable part of the input. std::error_code EC; Out = llvm::make_unique(opts::OutputFilename, EC, @@ -2765,6 +2785,30 @@ void RewriteInstance::finalizeSectionStringTable(ELFObjectFile *File) { EFMM->NoteSectionInfo[".shstrtab"].IsStrTab = true; } +void RewriteInstance::addBoltInfoSection() { + if (opts::AddBoltInfo) { + std::string Str; + raw_string_ostream OS(Str); + + OS << "BOLT revision: " << BoltRevision << ", " << "command line:"; + for (auto I = 0; I < Argc; ++I) { + OS << " " << Argv[I]; + } + + const auto BoltInfo = OS.str(); + const auto SectionSize = BoltInfo.size(); + uint8_t *SectionData = new uint8_t[SectionSize]; + memcpy(SectionData, BoltInfo.data(), SectionSize); + EFMM->NoteSectionInfo[".bolt_info"] = + SectionInfo(reinterpret_cast(SectionData), + SectionSize, + /*Alignment=*/1, + /*IsCode=*/false, + /*IsReadOnly=*/true, + /*IsLocal=*/false); + } +} + // Rewrite section header table inserting new entries as needed. 
The sections // header table size itself may affect the offsets of other sections, // so we are placing it at the end of the binary. diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index bef1e0b9642f..ccb48658766f 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -146,7 +146,8 @@ class ExecutableFileMemoryManager : public SectionMemoryManager { /// events. class RewriteInstance { public: - RewriteInstance(llvm::object::ELFObjectFileBase *File, const DataReader &DR); + RewriteInstance(llvm::object::ELFObjectFileBase *File, const DataReader &DR, + const int Argc, const char *const *Argv); ~RewriteInstance(); /// Reset all state except for split hints. Used to run a second pass with @@ -320,6 +321,9 @@ class RewriteInstance { /// Finalize memory image of section header string table. ELF_FUNCTION(finalizeSectionStringTable); + /// Add a notes section containing the BOLT revision and command line options. + void addBoltInfoSection(); + /// Computes output .debug_line line table offsets for each compile unit, /// and updates stmt_list for a corresponding compile unit. void updateLineTableOffsets(); @@ -388,6 +392,10 @@ class RewriteInstance { /// An instance of the input binary we are processing, externally owned. llvm::object::ELFObjectFileBase *InputFile; + /// Command line args used to process binary. + const int Argc; + const char *const *Argv; + std::unique_ptr BC; std::unique_ptr CFIRdWrt; diff --git a/bolt/llvm-bolt.cpp b/bolt/llvm-bolt.cpp index 5f75b2ff5817..f492dfadc679 100644 --- a/bolt/llvm-bolt.cpp +++ b/bolt/llvm-bolt.cpp @@ -69,6 +69,18 @@ static void report_error(StringRef Message, std::error_code EC) { exit(1); } +namespace llvm { +namespace bolt { +const char *BoltRevision = +#include "BoltRevision.inc" +; +} +} + +static void printBoltRevision() { + errs() << "BOLT revision " << BoltRevision << "\n"; +} + int main(int argc, char **argv) { // Print a stack trace if we signal out. 
sys::PrintStackTraceOnErrorSignal(); @@ -88,6 +100,7 @@ int main(int argc, char **argv) { cl::HideUnrelatedOptions(makeArrayRef(opts::BoltCategories)); // Register the target printer for --version. + cl::AddExtraVersionPrinter(printBoltRevision); cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion); cl::ParseCommandLineOptions(argc, argv, @@ -122,7 +135,7 @@ int main(int argc, char **argv) { Binary &Binary = *BinaryOrErr.get().getBinary(); if (auto *e = dyn_cast(&Binary)) { - RewriteInstance RI(e, *DR.get()); + RewriteInstance RI(e, *DR.get(), argc, argv); RI.run(); } else { report_error(opts::InputFilename, object_error::invalid_file_type); From b720a1405d8c90afeb510fd02158ffc040fa961a Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 24 May 2017 21:59:01 -0700 Subject: [PATCH 256/904] [BOLT] Fix C++ ABI function alignment. Summary: C++ functions have to be aligned at 2-bytes minimum on x86-64. (cherry picked from commit 2094f4d466497f278a362c04aa8893b6c00b91a0) --- bolt/RewriteInstance.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 01d117a7b493..7803017aec28 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1884,10 +1884,9 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, Streamer.SwitchSection(Section); if (opts::Relocs) { - Streamer.EmitCodeAlignment(std::max((unsigned)opts::AlignFunctions, - BinaryFunction::MinAlign), - std::max((unsigned)opts::AlignFunctionsMaxBytes, - BinaryFunction::MinAlign - 1)); + Streamer.EmitCodeAlignment(BinaryFunction::MinAlign); + Streamer.EmitCodeAlignment(opts::AlignFunctions, + opts::AlignFunctionsMaxBytes); } else { Streamer.EmitCodeAlignment(Function.getAlignment()); Streamer.setCodeSkew(EmitColdPart ? 
0 : Function.getAddress()); From 7c35b3140e997c4a090d418d0fd13513424bccc2 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 25 May 2017 10:29:38 -0700 Subject: [PATCH 257/904] [BOLT] Fix no-assertions build. (cherry picked from commit 3651fc7de0e6d48f815c80b9d2cfeda46669c4a4) --- bolt/BinaryBasicBlock.cpp | 5 +- bolt/BinaryFunction.cpp | 11 ++- bolt/Passes/BinaryPasses.cpp | 147 +++++++++++++++++---------------- bolt/Passes/FrameAnalysis.cpp | 29 ++++--- bolt/Passes/FrameOptimizer.cpp | 60 ++++++++------ bolt/Passes/Inliner.cpp | 1 + bolt/RewriteInstance.cpp | 3 + 7 files changed, 144 insertions(+), 112 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 40d55e91ffa4..e580995be8da 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -301,8 +301,9 @@ void BinaryBasicBlock::addLandingPad(BinaryBasicBlock *LPBlock) { void BinaryBasicBlock::clearLandingPads() { for (auto *LPBlock : LandingPads) { - auto count = LPBlock->Throwers.erase(this); - assert(count == 1 && "Possible duplicate entry in LandingPads"); + auto Count = LPBlock->Throwers.erase(this); + (void)Count; + assert(Count == 1 && "Possible duplicate entry in LandingPads"); } LandingPads.clear(); } diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 62b4474e28fc..b9e4039fca58 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1074,6 +1074,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { const auto Result = BC.MIA->replaceImmWithSymbol(Instruction, Relocation.Symbol, Relocation.Addend, BC.Ctx.get(), Value); + (void)Result; assert(Result && "cannot replace immediate with relocation"); // Make sure we replaced the correct immediate (instruction @@ -1240,6 +1241,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { case IndirectBranchType::POSSIBLE_TAIL_CALL: { auto Result = MIA->convertJmpToTailCall(Instruction); + (void)Result; assert(Result); if (BranchDataOrErr) { 
MIA->addAnnotation(Ctx.get(), Instruction, "IndirectBranchData", @@ -2351,7 +2353,6 @@ void BinaryFunction::annotateCFIState() { } bool BinaryFunction::fixCFIState() { - auto Sep = ""; DEBUG(dbgs() << "Trying to fix CFI states for each BB after reordering.\n"); DEBUG(dbgs() << "This is the list of CFI states for each BB of " << *this << ": "); @@ -2402,6 +2403,8 @@ bool BinaryFunction::fixCFIState() { int32_t State = 0; auto *FDEStartBB = BasicBlocksLayout[0]; bool SeenCold = false; + auto Sep = ""; + (void)Sep; for (auto *BB : BasicBlocksLayout) { const auto CFIStateAtExit = BB->getCFIStateAtExit(); @@ -3093,8 +3096,8 @@ void BinaryFunction::mergeProfileDataInto(BinaryFunction &BF) const { auto BIMergeI = BBMerge->branch_info_begin(); auto BII = BB->branch_info_begin(); for (const auto *BBSucc : BB->successors()) { - auto *BBMergeSucc = *BBMergeSI; - assert(getIndex(BBSucc) == BF.getIndex(BBMergeSucc)); + (void)BBSucc; + assert(getIndex(BBSucc) == BF.getIndex(*BBMergeSI)); // At this point no branch count should be set to COUNT_NO_PROFILE. 
assert(BII->Count != BinaryBasicBlock::COUNT_NO_PROFILE && @@ -3124,7 +3127,7 @@ void BinaryFunction::mergeProfileDataInto(BinaryFunction &BF) const { assert(BBMergeI == BF.end()); } -__attribute__((noinline)) BinaryFunction::BasicBlockOrderType BinaryFunction::dfs() const { +BinaryFunction::BasicBlockOrderType BinaryFunction::dfs() const { BasicBlockOrderType DFS; unsigned Index = 0; std::stack Stack; diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 0c989f9fef9f..330eb466d29f 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -676,6 +676,7 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, auto Result = MIA->createUncondBranch(Branch, CondSucc->getLabel(), BC.Ctx.get()); + (void)Result; assert(Result); PredBB->addInstruction(Branch); } @@ -1324,78 +1325,86 @@ void ReorderFunctions::reorder(std::vector &&Clusters, } } - if (opts::ReorderFunctions != BinaryFunction::RT_NONE && - (opts::Verbosity > 0 || - (DebugFlag && isCurrentDebugType("hfsort")))) { - uint64_t TotalSize = 0; - uint64_t CurPage = 0; - uint64_t Hotfuncs = 0; - double TotalDistance = 0; - double TotalCalls = 0; - double TotalCalls64B = 0; - double TotalCalls4KB = 0; - double TotalCalls2MB = 0; - dbgs() << "============== page 0 ==============\n"; - for (auto& Cluster : Clusters) { - dbgs() << - format("-------- density = %.3lf (%u / %u) --------\n", - (double) Cluster.Samples / Cluster.Size, - Cluster.Samples, Cluster.Size); - - for (auto FuncId : Cluster.Targets) { - if (Cg.Targets[FuncId].Samples > 0) { - Hotfuncs++; - - dbgs() << "BOLT-INFO: hot func " << *Funcs[FuncId] - << " (" << Cg.Targets[FuncId].Size << ")\n"; - - uint64_t Dist = 0; - uint64_t Calls = 0; - for (auto Dst : Cg.Targets[FuncId].Succs) { - auto& A = *Cg.Arcs.find(Arc(FuncId, Dst)); - auto D = - std::abs(FuncAddr[A.Dst] - (FuncAddr[FuncId] + A.AvgCallOffset)); - auto W = A.Weight; - Calls += W; - if (D < 64) TotalCalls64B += W; - if (D < 4096) 
TotalCalls4KB += W; - if (D < (2 << 20)) TotalCalls2MB += W; - Dist += A.Weight * D; - dbgs() << format("arc: %u [@%lu+%.1lf] -> %u [@%lu]: " - "weight = %.0lf, callDist = %f\n", - A.Src, FuncAddr[A.Src], A.AvgCallOffset, - A.Dst, FuncAddr[A.Dst], A.Weight, D); - } - TotalCalls += Calls; - TotalDistance += Dist; - dbgs() << format("start = %6u : avgCallDist = %lu : %s\n", - TotalSize, - Calls ? Dist / Calls : 0, - Funcs[FuncId]->getPrintName().c_str()); - TotalSize += Cg.Targets[FuncId].Size; - auto NewPage = TotalSize / PageSize; - if (NewPage != CurPage) { - CurPage = NewPage; - dbgs() << format("============== page %u ==============\n", CurPage); - } + if (opts::ReorderFunctions == BinaryFunction::RT_NONE) + return; + + if (opts::Verbosity == 0) { +#ifndef NDEBUG + if (!DebugFlag || !isCurrentDebugType("hfsort")) + return; +#else + return; +#endif + } + + TotalSize = 0; + uint64_t CurPage = 0; + uint64_t Hotfuncs = 0; + double TotalDistance = 0; + double TotalCalls = 0; + double TotalCalls64B = 0; + double TotalCalls4KB = 0; + double TotalCalls2MB = 0; + dbgs() << "============== page 0 ==============\n"; + for (auto& Cluster : Clusters) { + dbgs() << + format("-------- density = %.3lf (%u / %u) --------\n", + (double) Cluster.Samples / Cluster.Size, + Cluster.Samples, Cluster.Size); + + for (auto FuncId : Cluster.Targets) { + if (Cg.Targets[FuncId].Samples > 0) { + Hotfuncs++; + + dbgs() << "BOLT-INFO: hot func " << *Funcs[FuncId] + << " (" << Cg.Targets[FuncId].Size << ")\n"; + + uint64_t Dist = 0; + uint64_t Calls = 0; + for (auto Dst : Cg.Targets[FuncId].Succs) { + auto& A = *Cg.Arcs.find(Arc(FuncId, Dst)); + auto D = + std::abs(FuncAddr[A.Dst] - (FuncAddr[FuncId] + A.AvgCallOffset)); + auto W = A.Weight; + Calls += W; + if (D < 64) TotalCalls64B += W; + if (D < 4096) TotalCalls4KB += W; + if (D < (2 << 20)) TotalCalls2MB += W; + Dist += A.Weight * D; + dbgs() << format("arc: %u [@%lu+%.1lf] -> %u [@%lu]: " + "weight = %.0lf, callDist = %f\n", + A.Src, 
FuncAddr[A.Src], A.AvgCallOffset, + A.Dst, FuncAddr[A.Dst], A.Weight, D); + } + TotalCalls += Calls; + TotalDistance += Dist; + dbgs() << format("start = %6u : avgCallDist = %lu : %s\n", + TotalSize, + Calls ? Dist / Calls : 0, + Funcs[FuncId]->getPrintName().c_str()); + TotalSize += Cg.Targets[FuncId].Size; + auto NewPage = TotalSize / PageSize; + if (NewPage != CurPage) { + CurPage = NewPage; + dbgs() << format("============== page %u ==============\n", CurPage); } } } - dbgs() << format(" Number of hot functions: %u\n" - " Number of clusters: %lu\n", - Hotfuncs, Clusters.size()) - << format(" Final average call distance = %.1lf (%.0lf / %.0lf)\n", - TotalCalls ? TotalDistance / TotalCalls : 0, - TotalDistance, TotalCalls) - << format(" Total Calls = %.0lf\n", TotalCalls); - if (TotalCalls) { - dbgs() << format(" Total Calls within 64B = %.0lf (%.2lf%%)\n", - TotalCalls64B, 100 * TotalCalls64B / TotalCalls) - << format(" Total Calls within 4KB = %.0lf (%.2lf%%)\n", - TotalCalls4KB, 100 * TotalCalls4KB / TotalCalls) - << format(" Total Calls within 2MB = %.0lf (%.2lf%%)\n", - TotalCalls2MB, 100 * TotalCalls2MB / TotalCalls); - } + } + dbgs() << format(" Number of hot functions: %u\n" + " Number of clusters: %lu\n", + Hotfuncs, Clusters.size()) + << format(" Final average call distance = %.1lf (%.0lf / %.0lf)\n", + TotalCalls ? 
TotalDistance / TotalCalls : 0, + TotalDistance, TotalCalls) + << format(" Total Calls = %.0lf\n", TotalCalls); + if (TotalCalls) { + dbgs() << format(" Total Calls within 64B = %.0lf (%.2lf%%)\n", + TotalCalls64B, 100 * TotalCalls64B / TotalCalls) + << format(" Total Calls within 4KB = %.0lf (%.2lf%%)\n", + TotalCalls4KB, 100 * TotalCalls4KB / TotalCalls) + << format(" Total Calls within 2MB = %.0lf (%.2lf%%)\n", + TotalCalls2MB, 100 * TotalCalls2MB / TotalCalls); } } diff --git a/bolt/Passes/FrameAnalysis.cpp b/bolt/Passes/FrameAnalysis.cpp index 1e38c7748229..b964dacfd3c4 100644 --- a/bolt/Passes/FrameAnalysis.cpp +++ b/bolt/Passes/FrameAnalysis.cpp @@ -419,8 +419,14 @@ void FrameAnalysis::buildClobberMap(const BinaryContext &BC) { RegsKilledMap[Func] = std::move(RegsKilled); } - if (opts::Verbosity == 0 && (!DebugFlag || !isCurrentDebugType("fa"))) + if (opts::Verbosity == 0) { +#ifndef NDEBUG + if (!DebugFlag || !isCurrentDebugType("fa")) + return; +#else return; +#endif + } // This loop is for computing statistics only for (auto *Func : TopologicalCGOrder) { @@ -433,17 +439,16 @@ void FrameAnalysis::buildClobberMap(const BinaryContext &BC) { CountFunctionsAllClobber += Count; ++NumFunctionsAllClobber; } - if (!DebugFlag || !isCurrentDebugType("fa")) - continue; - // DEBUG only - dbgs() << "Killed regs set for func: " << Func->getPrintName() << "\n"; - const BitVector &RegsKilled = Iter->second; - int RegIdx = RegsKilled.find_first(); - while (RegIdx != -1) { - dbgs() << "\tREG" << RegIdx; - RegIdx = RegsKilled.find_next(RegIdx); - }; - dbgs() << "\n"; + DEBUG_WITH_TYPE("fa", + dbgs() << "Killed regs set for func: " << Func->getPrintName() << "\n"; + const BitVector &RegsKilled = Iter->second; + int RegIdx = RegsKilled.find_first(); + while (RegIdx != -1) { + dbgs() << "\tREG" << RegIdx; + RegIdx = RegsKilled.find_next(RegIdx); + }; + dbgs() << "\n"; + ); } } diff --git a/bolt/Passes/FrameOptimizer.cpp b/bolt/Passes/FrameOptimizer.cpp index 
2eb3fafc38eb..3fd5a83c9ac2 100644 --- a/bolt/Passes/FrameOptimizer.cpp +++ b/bolt/Passes/FrameOptimizer.cpp @@ -168,8 +168,14 @@ void FrameOptimizerPass::buildClobberMap(const BinaryContext &BC) { RegsKilledMap[Func] = std::move(RegsKilled); } - if (opts::Verbosity == 0 && (!DebugFlag || !isCurrentDebugType("fop"))) + if (opts::Verbosity == 0) { +#ifndef NDEBUG + if (!DebugFlag || !isCurrentDebugType("fop")) + return; +#else return; +#endif + } // This loop is for computing statistics only for (auto *Func : TopologicalCGOrder) { @@ -182,17 +188,16 @@ void FrameOptimizerPass::buildClobberMap(const BinaryContext &BC) { CountFunctionsAllClobber += Count; ++NumFunctionsAllClobber; } - if (!DebugFlag || !isCurrentDebugType("fop")) - continue; - // DEBUG only - dbgs() << "Killed regs set for func: " << Func->getPrintName() << "\n"; - const BitVector &RegsKilled = Iter->second; - int RegIdx = RegsKilled.find_first(); - while (RegIdx != -1) { - dbgs() << "\tREG" << RegIdx; - RegIdx = RegsKilled.find_next(RegIdx); - }; - dbgs() << "\n"; + DEBUG_WITH_TYPE("fop", + dbgs() << "Killed regs set for func: " << Func->getPrintName() << "\n"; + const BitVector &RegsKilled = Iter->second; + int RegIdx = RegsKilled.find_first(); + while (RegIdx != -1) { + dbgs() << "\tREG" << RegIdx; + RegIdx = RegsKilled.find_next(RegIdx); + }; + dbgs() << "\n"; + ); } } @@ -665,18 +670,17 @@ bool FrameOptimizerPass::restoreFrameIndex(const BinaryContext &BC, CfaOffset + StackOffset, Size, IsSimple}); } - if (!DebugFlag || !isCurrentDebugType("fop")) - continue; - // DEBUG only - dbgs() << "Frame index annotation added to:\n"; - BC.printInstruction(dbgs(), Inst, 0, &BF, true); - dbgs() << " FrameIndexEntry \n"; + DEBUG_WITH_TYPE("fop", + dbgs() << "Frame index annotation added to:\n"; + BC.printInstruction(dbgs(), Inst, 0, &BF, true); + dbgs() << " FrameIndexEntry \n"; + ); } } } @@ -816,8 +820,14 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC, outs() << "BOLT-INFO: FOP optimized " << 
NumRedundantLoads << " redundant load(s).\n"; - if (opts::Verbosity == 0 && (!DebugFlag || !isCurrentDebugType("fop"))) + if (opts::Verbosity == 0) { +#ifndef NDEBUG + if (!DebugFlag || !isCurrentDebugType("fop")) + return; +#else return; +#endif + } outs() << "BOLT-INFO: FOP changed " << NumLoadsChangedToReg << " load(s) to use a register instead of a stack access, and " diff --git a/bolt/Passes/Inliner.cpp b/bolt/Passes/Inliner.cpp index 3374e6d8bc8c..c27d42a5562d 100644 --- a/bolt/Passes/Inliner.cpp +++ b/bolt/Passes/Inliner.cpp @@ -249,6 +249,7 @@ InlineSmallFunctions::inlineCall( const bool Result = BC.MIA->analyzeBranch(Instruction, OldTargetLabel, OldFTLabel, CondBranch, UncondBranch); + (void)Result; assert(Result && "analyzeBranch failed on instruction guaranteed to be a branch"); assert(OldTargetLabel); diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 7803017aec28..73e8bce410c8 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2552,7 +2552,9 @@ void RewriteInstance::patchELFPHDRTable() { OS.seek(PHDRTableOffset); bool ModdedGnuStack = false; + (void)ModdedGnuStack; bool AddedSegment = false; + (void)AddedSegment; // Copy existing program headers with modifications. for (auto &Phdr : Obj->program_headers()) { @@ -3244,6 +3246,7 @@ void RewriteInstance::rewriteFile() { // Make sure output stream has enough reserved space, otherwise // pwrite() will fail. auto Offset = OS.seek(getFileOffsetForAddress(NextAvailableAddress)); + (void)Offset; assert(Offset == getFileOffsetForAddress(NextAvailableAddress) && "error resizing output file"); From dda0f20d6b67078359313f425022b15ec6a11f81 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 24 May 2017 18:40:29 -0700 Subject: [PATCH 258/904] Add option to generate function order file. Summary: Add -generate-function-order= option to write the computed function order to a file. We can read this order in later rather than recomputing each time we process a binary with BOLT. 
(cherry picked from commit 4cad6345cf3e07a7e08fe45915ec69d652c4c0a9) --- bolt/Passes/BinaryPasses.cpp | 58 ++++++++++++++++++++++++++++++++++-- bolt/RewriteInstance.cpp | 26 ++++++++-------- bolt/RewriteInstance.h | 3 ++ 3 files changed, 72 insertions(+), 15 deletions(-) diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 330eb466d29f..5b958b8e942b 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -77,6 +77,12 @@ FunctionOrderFile("function-order", "reordering"), cl::cat(BoltOptCategory)); +static cl::opt +GenerateFunctionOrderFile("generate-function-order", + cl::desc("file to dump the ordered list of functions to use for function " + "reordering"), + cl::cat(BoltOptCategory)); + static cl::opt ICFUseDFS("icf-dfs", cl::desc("use DFS ordering when using -icf option"), @@ -1516,7 +1522,7 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC, if (FuncAddrs.empty()) { errs() << "BOLT-WARNING: Reorder functions: can't find function for " - << Function << "\n"; + << Function << ".\n"; continue; } @@ -1527,11 +1533,13 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC, auto *BF = BC.getFunctionForSymbol(FuncSym); if (!BF) { errs() << "BOLT-WARNING: Reorder functions: can't find function for " - << Function << "\n"; + << Function << ".\n"; break; } if (!BF->hasValidIndex()) { BF->setIndex(Index++); + } else if (opts::Verbosity > 0) { + errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function << ".\n"; } } } @@ -1540,6 +1548,52 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC, } reorder(std::move(Clusters), BFs); + + if (!opts::GenerateFunctionOrderFile.empty()) { + std::ofstream FuncsFile(opts::GenerateFunctionOrderFile, std::ios::out); + if (!FuncsFile) { + errs() << "Ordered functions file \"" << opts::GenerateFunctionOrderFile + << "\" can't be opened.\n"; + exit(1); + } + + std::vector SortedFunctions(BFs.size()); + + std::transform(BFs.begin(), + BFs.end(), + 
SortedFunctions.begin(), + [](std::pair &BFI) { + return &BFI.second; + }); + + // Sort functions by index. + std::stable_sort( + SortedFunctions.begin(), + SortedFunctions.end(), + [](const BinaryFunction *A, const BinaryFunction *B) { + if (A->hasValidIndex() && B->hasValidIndex()) { + return A->getIndex() < B->getIndex(); + } else if (A->hasValidIndex() && !B->hasValidIndex()) { + return true; + } else if (!A->hasValidIndex() && B->hasValidIndex()) { + return false; + } else { + return A->getAddress() < B->getAddress(); + } + }); + + for (const auto *Func : SortedFunctions) { + if (!Func->hasValidIndex()) + break; + FuncsFile << Func->getSymbol()->getName().data() << "\n"; + } + FuncsFile.close(); + + outs() << "BOLT-INFO: dumped function order to \"" + << opts::GenerateFunctionOrderFile << "\"\n"; + + exit(0); + } } } // namespace bolt diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 73e8bce410c8..f0ae7c2da37a 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1854,11 +1854,9 @@ void RewriteInstance::runOptimizationPasses() { opts::PrintDynoStats || opts::DynoStatsAll); } -namespace { - // Helper function to emit the contents of a function via a MCStreamer object. -void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, - BinaryContext &BC, bool EmitColdPart) { +void RewriteInstance::emitFunction(MCStreamer &Streamer, BinaryFunction &Function, + bool EmitColdPart) { if (Function.getSize() == 0) return; @@ -1867,19 +1865,19 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, MCSection *Section; if (opts::Relocs) { - Section = BC.MOFI->getTextSection(); + Section = BC->MOFI->getTextSection(); } else { // Each fuction is emmitted into its own section. Section = - BC.Ctx->getELFSection(EmitColdPart ? Function.getColdCodeSectionName() - : Function.getCodeSectionName(), + BC->Ctx->getELFSection(EmitColdPart ? 
Function.getColdCodeSectionName() + : Function.getCodeSectionName(), ELF::SHT_PROGBITS, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); } Section->setHasInstructions(true); - BC.Ctx->addGenDwarfSection(Section); + BC->Ctx->addGenDwarfSection(Section); Streamer.SwitchSection(Section); @@ -1898,7 +1896,7 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, // Emit all names the function is known under. for (const auto &Name : Function.getNames()) { Twine EmitName = EmitColdPart ? Twine(Name).concat(".cold") : Name; - auto *EmitSymbol = BC.Ctx->getOrCreateSymbol(EmitName); + auto *EmitSymbol = BC->Ctx->getOrCreateSymbol(EmitName); Streamer.EmitSymbolAttribute(EmitSymbol, MCSA_ELF_TypeFunction); DEBUG(dbgs() << "emitting symbol " << EmitSymbol->getName() << " for function " << Function << '\n'); @@ -1915,7 +1913,7 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, auto *LSDASymbol = EmitColdPart ? Function.getColdLSDASymbol() : Function.getLSDASymbol(); if (LSDASymbol) { - Streamer.EmitCFILsda(LSDASymbol, BC.MOFI->getLSDAEncoding()); + Streamer.EmitCFILsda(LSDASymbol, BC->MOFI->getLSDAEncoding()); } else { Streamer.EmitCFILsda(0, dwarf::DW_EH_PE_omit); } @@ -1973,6 +1971,8 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, Function.setEmitted(); } +namespace { + template std::vector singletonSet(T t) { std::vector Vec; @@ -2080,7 +2080,7 @@ void RewriteInstance::emitFunctions() { for (auto *FPtr : SortedFunctions) { if (!FPtr->isSplit() || !FPtr->isSimple()) continue; - emitFunction(*Streamer, *FPtr, *BC.get(), /*EmitColdPart=*/true); + emitFunction(*Streamer, *FPtr, /*EmitColdPart=*/true); } } DEBUG(dbgs() << "BOLT-DEBUG: first cold function: " << Function << '\n'); @@ -2096,10 +2096,10 @@ void RewriteInstance::emitFunctions() { << Function << "\" : " << Function.getFunctionNumber() << '\n'); - emitFunction(*Streamer, Function, *BC.get(), /*EmitColdPart=*/false); + emitFunction(*Streamer, Function, /*EmitColdPart=*/false); if 
(!opts::Relocs && Function.isSplit()) - emitFunction(*Streamer, Function, *BC.get(), /*EmitColdPart=*/true); + emitFunction(*Streamer, Function, /*EmitColdPart=*/true); ++CurrentIndex; } diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index ccb48658766f..db12598bb354 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -263,6 +263,9 @@ class RewriteInstance { const DWARFAddressRangesVector &InputRanges) const; private: + /// Emit a single function. + void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, + bool EmitColdPart); /// Detect addresses and offsets available in the binary for allocating /// new sections. From f5116b7314a737c76a10b7fe1a83e7278268391c Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 24 May 2017 15:20:27 -0700 Subject: [PATCH 259/904] [BOLT] Emit sorted DWARF ranges and location lists. Summary: When producing address ranges and location lists for debug info add a post-processing step that sorts them and merges adjacent entries. Fix a memory allocation/free issue for .debug_ranges section. (cherry picked from commit 804e6eb00f9cef2d0f84f4408ffcc7fa1fee79c0) --- bolt/BinaryFunction.cpp | 104 ++++++++++++++++++++++++++++++---------- bolt/BinaryFunction.h | 11 +++-- bolt/DWARFRewriter.cpp | 14 +++--- 3 files changed, 95 insertions(+), 34 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index b9e4039fca58..90398f8ef86f 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -3773,14 +3773,15 @@ DWARFAddressRangesVector BinaryFunction::getOutputAddressRanges() const { } DWARFAddressRangesVector BinaryFunction::translateInputToOutputRanges( - DWARFAddressRangesVector InputRanges) const { - // If the function wasn't changed - there's nothing to update. + const DWARFAddressRangesVector &InputRanges) const { + // If the function hasn't changed return the same ranges. 
if (!isEmitted() && !opts::Relocs) return InputRanges; - DWARFAddressRangesVector OutputRanges; + // Even though we will merge ranges in a post-processing pass, we attempt to + // merge them in a main processing loop as it improves the processing time. uint64_t PrevEndAddress = 0; - + DWARFAddressRangesVector OutputRanges; for (const auto &Range : InputRanges) { if (!containsAddress(Range.first)) { DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected for " @@ -3790,10 +3791,16 @@ DWARFAddressRangesVector BinaryFunction::translateInputToOutputRanges( continue; } auto InputOffset = Range.first - getAddress(); - const auto InputEndOffset = Range.second - getAddress(); + const auto InputEndOffset = std::min(Range.second - getAddress(), getSize()); + + auto BBI = std::upper_bound(BasicBlockOffsets.begin(), + BasicBlockOffsets.end(), + BasicBlockOffset(InputOffset, nullptr), + CompareBasicBlockOffsets()); + --BBI; do { - const auto *BB = getBasicBlockContainingOffset(InputOffset); - if (!BB) { + const auto *BB = BBI->second; + if (InputOffset < BB->getOffset() || InputOffset >= BB->getEndOffset()) { DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected for " << *this << " : [0x" << Twine::utohexstr(Range.first) << ", 0x" << Twine::utohexstr(Range.second) << "]\n"); @@ -3809,24 +3816,40 @@ DWARFAddressRangesVector BinaryFunction::translateInputToOutputRanges( EndAddress = StartAddress + InputEndOffset - InputOffset; if (StartAddress == PrevEndAddress) { - OutputRanges.back().second = EndAddress; + OutputRanges.back().second = std::max(OutputRanges.back().second, + EndAddress); } else { - OutputRanges.emplace_back(StartAddress, EndAddress); + OutputRanges.emplace_back(StartAddress, + std::max(StartAddress, EndAddress)); } - PrevEndAddress = EndAddress; + PrevEndAddress = OutputRanges.back().second; } InputOffset = BB->getEndOffset(); + ++BBI; } while (InputOffset < InputEndOffset); } - return OutputRanges; + // Post-processing pass to sort and 
merge ranges. + std::sort(OutputRanges.begin(), OutputRanges.end()); + DWARFAddressRangesVector MergedRanges; + PrevEndAddress = 0; + for(const auto &Range : OutputRanges) { + if (Range.first <= PrevEndAddress) { + MergedRanges.back().second = std::max(MergedRanges.back().second, + Range.second); + } else { + MergedRanges.emplace_back(Range.first, Range.second); + } + PrevEndAddress = MergedRanges.back().second; + } + + return MergedRanges; } DWARFDebugLoc::LocationList BinaryFunction::translateInputToOutputLocationList( - DWARFDebugLoc::LocationList &&InputLL, + const DWARFDebugLoc::LocationList &InputLL, uint64_t BaseAddress) const { - // If the function wasn't changed - there's nothing to update. if (!isEmitted() && !opts::Relocs) { if (!BaseAddress) { @@ -3841,9 +3864,9 @@ DWARFDebugLoc::LocationList BinaryFunction::translateInputToOutputLocationList( } } - DWARFDebugLoc::LocationList OutputLL; - uint64_t PrevEndAddress = 0; + SmallVectorImpl *PrevLoc = nullptr; + DWARFDebugLoc::LocationList OutputLL; for (auto &Entry : InputLL.Entries) { const auto Start = Entry.Begin + BaseAddress; const auto End = Entry.End + BaseAddress; @@ -3851,14 +3874,18 @@ DWARFDebugLoc::LocationList BinaryFunction::translateInputToOutputLocationList( DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected for " << *this << " : [0x" << Twine::utohexstr(Start) << ", 0x" << Twine::utohexstr(End) << "]\n"); - PrevEndAddress = 0; continue; } auto InputOffset = Start - getAddress(); - const auto InputEndOffset = End - getAddress(); + const auto InputEndOffset = std::min(End - getAddress(), getSize()); + auto BBI = std::upper_bound(BasicBlockOffsets.begin(), + BasicBlockOffsets.end(), + BasicBlockOffset(InputOffset, nullptr), + CompareBasicBlockOffsets()); + --BBI; do { - const auto *BB = getBasicBlockContainingOffset(InputOffset); - if (!BB) { + const auto *BB = BBI->second; + if (InputOffset < BB->getOffset() || InputOffset >= BB->getEndOffset()) { DEBUG(dbgs() << "BOLT-DEBUG: 
invalid debug address range detected for " << *this << " : [0x" << Twine::utohexstr(Start) << ", 0x" << Twine::utohexstr(End) << "]\n"); @@ -3873,21 +3900,48 @@ DWARFDebugLoc::LocationList BinaryFunction::translateInputToOutputLocationList( if (InputEndOffset < BB->getEndOffset()) EndAddress = StartAddress + InputEndOffset - InputOffset; - if (StartAddress == PrevEndAddress) { - OutputLL.Entries.back().End = EndAddress; + if (StartAddress == PrevEndAddress && Entry.Loc == *PrevLoc) { + OutputLL.Entries.back().End = std::max(OutputLL.Entries.back().End, + EndAddress); } else { OutputLL.Entries.emplace_back( DWARFDebugLoc::Entry{StartAddress, - EndAddress, - std::move(Entry.Loc)}); + std::max(StartAddress, EndAddress), + Entry.Loc}); } - PrevEndAddress = EndAddress; + PrevEndAddress = OutputLL.Entries.back().End; + PrevLoc = &OutputLL.Entries.back().Loc; } + + ++BBI; InputOffset = BB->getEndOffset(); } while (InputOffset < InputEndOffset); } - return OutputLL; + // Sort and merge adjacent entries with identical location. 
+ std::stable_sort(OutputLL.Entries.begin(), OutputLL.Entries.end(), + [] (const DWARFDebugLoc::Entry &A, const DWARFDebugLoc::Entry &B) { + return A.Begin < B.Begin; + }); + DWARFDebugLoc::LocationList MergedLL; + PrevEndAddress = 0; + PrevLoc = nullptr; + for(const auto &Entry : OutputLL.Entries) { + if (Entry.Begin <= PrevEndAddress && *PrevLoc == Entry.Loc) { + MergedLL.Entries.back().End = std::max(Entry.End, + MergedLL.Entries.back().End);; + } else { + const auto Begin = std::max(Entry.Begin, PrevEndAddress); + const auto End = std::max(Begin, Entry.End); + MergedLL.Entries.emplace_back(DWARFDebugLoc::Entry{Begin, + End, + Entry.Loc}); + } + PrevEndAddress = MergedLL.Entries.back().End; + PrevLoc = &MergedLL.Entries.back().Loc; + } + + return MergedLL; } void BinaryFunction::printLoopInfo(raw_ostream &OS) const { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index bc17055cc25d..e5d2673fe9f0 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -1821,12 +1821,17 @@ class BinaryFunction { /// Return output address ranges for a function. DWARFAddressRangesVector getOutputAddressRanges() const; + /// Take address ranges corresponding to the input binary and translate + /// them to address ranges in the output binary. DWARFAddressRangesVector translateInputToOutputRanges( - DWARFAddressRangesVector InputRanges) const; + const DWARFAddressRangesVector &InputRanges) const; - /// \p BaseAddress to be applied to all addresses in \pInputLL. + /// Similar to translateInputToOutputRanges() but operates on location lists + /// and moves associated data to output location lists. + /// + /// \p BaseAddress is applied to all addresses in \pInputLL. 
DWARFDebugLoc::LocationList translateInputToOutputLocationList( - DWARFDebugLoc::LocationList &&InputLL, + const DWARFDebugLoc::LocationList &InputLL, uint64_t BaseAddress) const; virtual ~BinaryFunction(); diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp index a785dfdd4bb8..04f625a0e0b7 100644 --- a/bolt/DWARFRewriter.cpp +++ b/bolt/DWARFRewriter.cpp @@ -169,8 +169,7 @@ void RewriteInstance::updateUnitDebugInfo( assert(!LL.Entries.empty() && "location list cannot be empty"); const auto OutputLL = Function - ->translateInputToOutputLocationList(std::move(LL), - Unit->getBaseAddress()); + ->translateInputToOutputLocationList(LL, Unit->getBaseAddress()); DEBUG( if (OutputLL.Entries.empty()) { dbgs() << "BOLT-DEBUG: location list translated to an empty one " @@ -459,17 +458,20 @@ void RewriteInstance::finalizeDebugSections() { } auto RangesSectionContents = RangesSectionsWriter->finalize(); + auto SectionSize = RangesSectionContents->size(); + uint8_t *SectionData = new uint8_t[SectionSize]; + memcpy(SectionData, RangesSectionContents->data(), SectionSize); EFMM->NoteSectionInfo[".debug_ranges"] = SectionInfo( - reinterpret_cast(RangesSectionContents->data()), - RangesSectionContents->size(), + reinterpret_cast(SectionData), + SectionSize, /*Alignment=*/1, /*IsCode=*/false, /*IsReadOnly=*/true, /*IsLocal=*/false); auto LocationListSectionContents = LocationListWriter->finalize(); - const auto SectionSize = LocationListSectionContents->size(); - uint8_t *SectionData = new uint8_t[SectionSize]; + SectionSize = LocationListSectionContents->size(); + SectionData = new uint8_t[SectionSize]; memcpy(SectionData, LocationListSectionContents->data(), SectionSize); EFMM->NoteSectionInfo[".debug_loc"] = SectionInfo( reinterpret_cast(SectionData), From bfabc79d21194bf31ba1c910bcc6b857e5ac447d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 30 May 2017 19:06:22 -0700 Subject: [PATCH 260/904] [BOLT] Fix SCTC again. 
Summary: Respect hot/cold boundaries when using BinaryFunction::getBasicBlockAfter(). (cherry picked from commit 85b89a866d9c8f482d4729951c114d082e7adf28) --- bolt/BinaryFunction.h | 20 +++++++++++--------- bolt/Passes/BinaryPasses.cpp | 2 +- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index e5d2673fe9f0..a6724cb7365a 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -965,20 +965,22 @@ class BinaryFunction { /// Returns the basic block after the given basic block in the layout or /// nullptr the last basic block is given. - const BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *BB) const { + const BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *BB, + bool IgnoreSplits = true) const { for (auto I = layout_begin(), E = layout_end(); I != E; ++I) { - if (*I == BB && std::next(I) != E) - return *std::next(I); + auto Next = std::next(I); + if (*I == BB && Next != E) { + return (IgnoreSplits || (*I)->isCold() == (*Next)->isCold()) + ? *Next : nullptr; + } } return nullptr; } - BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *BB) { - for (auto I = layout_begin(), E = layout_end(); I != E; ++I) { - if (*I == BB && std::next(I) != E) - return *std::next(I); - } - return nullptr; + BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *BB, + bool IgnoreSplits = true) { + return + const_cast(this)->getBasicBlockAfter(BB, IgnoreSplits); } /// Retrieve the landing pad BB associated with invoke instruction \p Invoke diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 5b958b8e942b..0685b57efbee 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -672,7 +672,7 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); // Only add a new branch if the target is not the fall-through. 
- if (BF.getBasicBlockAfter(BB) != CondSucc || isValid(BB)) { + if (BF.getBasicBlockAfter(BB, false) != CondSucc || isValid(BB)) { if (UncondBranch) { MIA->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), From afb4523f3c5e6c0e792a2903956a721df0fb0723 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 31 May 2017 09:36:49 -0700 Subject: [PATCH 261/904] [BOLT] Update addresses for DW_TAG_GNU_call_site and DW_TAG_label. Summary: Some DWARF tags (such as GNU_call_site and label) reference instruction addresses in the input binary. When we update debug info we need to update these tags too with new addresses. Also fix base address used for calculation of output addresses in relocation mode. (cherry picked from commit 6cc3c88b814ef19801facd701fdd3b9d536277c6) --- bolt/BinaryFunction.cpp | 24 ++++++++++++++++++++++++ bolt/BinaryFunction.h | 7 +++++++ bolt/DWARFRewriter.cpp | 29 +++++++++++++++++++++++++---- bolt/RewriteInstance.cpp | 11 ++++++++--- 4 files changed, 64 insertions(+), 7 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 90398f8ef86f..6bf637006923 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -3772,6 +3772,30 @@ DWARFAddressRangesVector BinaryFunction::getOutputAddressRanges() const { return OutputRanges; } +uint64_t BinaryFunction::translateInputToOutputAddress(uint64_t Address) const { + // If the function hasn't changed return the same address. + if (!isEmitted() && !opts::Relocs) + return Address; + + if (Address < getAddress()) + return 0; + + // FIXME: #18950828 - we rely on relative offsets inside basic blocks to stay + // intact. Instead we can use pseudo instructions and/or annotations. + const auto Offset = Address - getAddress(); + const auto *BB = getBasicBlockContainingOffset(Offset); + if (!BB) { + // Special case for address immediately past the end of the function. 
+ if (Offset == getSize()) + return getOutputAddress() + getOutputSize(); + + return 0; + } + + return std::min(BB->getOutputAddressRange().first + Offset - BB->getOffset(), + BB->getOutputAddressRange().second); +} + DWARFAddressRangesVector BinaryFunction::translateInputToOutputRanges( const DWARFAddressRangesVector &InputRanges) const { // If the function hasn't changed return the same ranges. diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index a6724cb7365a..6552c97d4bbe 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -1823,6 +1823,13 @@ class BinaryFunction { /// Return output address ranges for a function. DWARFAddressRangesVector getOutputAddressRanges() const; + /// Given an address corresponding to an instruction in the input binary, + /// return an address of this instruction in output binary. + /// + /// Return 0 if no matching address could be found or the instruction was + /// removed. + uint64_t translateInputToOutputAddress(uint64_t Address) const; + /// Take address ranges corresponding to the input binary and translate /// them to address ranges in the output binary. DWARFAddressRangesVector translateInputToOutputRanges( diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp index 04f625a0e0b7..7d6f583985c2 100644 --- a/bolt/DWARFRewriter.cpp +++ b/bolt/DWARFRewriter.cpp @@ -151,13 +151,13 @@ void RewriteInstance::updateUnitDebugInfo( // Handle any tag that can have DW_AT_location attribute. DWARFFormValue Value; uint32_t AttrOffset; + const BinaryFunction *Function = + FunctionStack.empty() ? nullptr : FunctionStack.back(); if (DIE->getAttributeValue(Unit, dwarf::DW_AT_location, Value, &AttrOffset)) { if (Value.isFormClass(DWARFFormValue::FC_Constant) || Value.isFormClass(DWARFFormValue::FC_SectionOffset)) { auto LocListSectionOffset = LocationListWriter->getEmptyListOffset(); - const BinaryFunction *Function = - FunctionStack.empty() ? 
nullptr : FunctionStack.back(); if (Function) { // Limit parsing to a single list to save memory. DWARFDebugLoc::LocationList LL; @@ -172,8 +172,8 @@ void RewriteInstance::updateUnitDebugInfo( ->translateInputToOutputLocationList(LL, Unit->getBaseAddress()); DEBUG( if (OutputLL.Entries.empty()) { - dbgs() << "BOLT-DEBUG: location list translated to an empty one " - "at 0x" + dbgs() << "BOLT-DEBUG: location list translated to an empty " + "one at 0x" << Twine::utohexstr(DIE->getOffset()) << " in CU at 0x" << Twine::utohexstr(Unit->getOffset()) << '\n'; } @@ -191,6 +191,27 @@ void RewriteInstance::updateUnitDebugInfo( Value.isFormClass(DWARFFormValue::FC_Block)) && "unexpected DW_AT_location form"); } + } else if (DIE->getAttributeValue(Unit, dwarf::DW_AT_low_pc, Value, + &AttrOffset)) { + const auto Result = Value.getAsAddress(Unit); + if (Result.hasValue()) { + uint64_t NewAddress = 0; + if (Function) { + const auto Address = Result.getValue(); + NewAddress = Function->translateInputToOutputAddress(Address); + DEBUG(dbgs() << "BOLT-DEBUG: Fixing low_pc 0x" + << Twine::utohexstr(Address) + << " for DIE with tag " << DIE->getTag() + << " to 0x" << Twine::utohexstr(NewAddress) << '\n'); + } + auto DebugInfoPatcher = + static_cast( + SectionPatchers[".debug_info"].get()); + DebugInfoPatcher->addLE64Patch(AttrOffset, NewAddress); + } else if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: unexpected form value for attribute at 0x" + << Twine::utohexstr(AttrOffset); + } } } } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index f0ae7c2da37a..50d0d5edda90 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2410,7 +2410,7 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { continue; // Output ranges should match the input if the body hasn't changed. 
- if (!Function.isSimple()) + if (!Function.isSimple() && !opts::Relocs) continue; BinaryBasicBlock *PrevBB = nullptr; @@ -2418,8 +2418,13 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { BBI != BBE; ++BBI) { auto *BB = *BBI; assert(BB->getLabel()->isDefined(false) && "symbol should be defined"); - uint64_t BaseAddress = BB->isCold() ? Function.cold().getAddress() - : Function.getOutputAddress(); + uint64_t BaseAddress; + if (opts::Relocs) { + BaseAddress = NewTextSectionStartAddress; + } else { + BaseAddress = BB->isCold() ? Function.cold().getAddress() + : Function.getOutputAddress(); + } uint64_t Address = BaseAddress + Layout.getSymbolOffset(*BB->getLabel()); BB->setOutputStartAddress(Address); From c8af12fbeb2d3e283f7ad8721881e5fc066ca67d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 31 May 2017 14:23:37 -0700 Subject: [PATCH 262/904] [BOLT] Fix SCTC again again. Summary: I put the const_cast(this) on the wrong version of getBasicBlockAfter(). It's on the right one now. (cherry picked from commit cfc8ea8a23d6983158f960d63c16dd044221b9a5) --- bolt/BinaryFunction.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 6552c97d4bbe..eeb5c52ed389 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -967,6 +967,12 @@ class BinaryFunction { /// nullptr the last basic block is given. 
const BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *BB, bool IgnoreSplits = true) const { + return + const_cast(this)->getBasicBlockAfter(BB, IgnoreSplits); + } + + BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *BB, + bool IgnoreSplits = true) { for (auto I = layout_begin(), E = layout_end(); I != E; ++I) { auto Next = std::next(I); if (*I == BB && Next != E) { @@ -977,12 +983,6 @@ class BinaryFunction { return nullptr; } - BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *BB, - bool IgnoreSplits = true) { - return - const_cast(this)->getBasicBlockAfter(BB, IgnoreSplits); - } - /// Retrieve the landing pad BB associated with invoke instruction \p Invoke /// that is in \p BB. Return nullptr if none exists BinaryBasicBlock *getLandingPadBBFor(const BinaryBasicBlock &BB, From 9750051e706fd10a36169fdc0f41bfd7ed8ece6f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 26 May 2017 12:53:21 -0700 Subject: [PATCH 263/904] HFSort/call graph refactoring Summary: I've factored out the call graph code from dataflow and function reordering code and done a few small renames/cleanups. I've also moved the function reordering pass into a separate file because it was starting to get big. I've got more refactoring planned for hfsort/call graph but this is a start. 
(cherry picked from commit ffca8c45559c7000962a05be845164a84712f4dd) --- bolt/BinaryPassManager.cpp | 1 + bolt/Passes/BinaryPasses.cpp | 478 ------------------------------- bolt/Passes/BinaryPasses.h | 24 +- bolt/Passes/CMakeLists.txt | 3 + bolt/Passes/CallGraph.cpp | 262 +++++++++++++++++ bolt/Passes/CallGraph.h | 113 ++++++++ bolt/Passes/FrameAnalysis.cpp | 73 +---- bolt/Passes/FrameAnalysis.h | 20 +- bolt/Passes/FrameOptimizer.cpp | 73 +---- bolt/Passes/FrameOptimizer.h | 22 +- bolt/Passes/HFSort.cpp | 294 +++---------------- bolt/Passes/HFSort.h | 137 ++------- bolt/Passes/HFSortPlus.cpp | 54 ++-- bolt/Passes/PettisAndHansen.cpp | 206 +++++++++++++ bolt/Passes/ReorderFunctions.cpp | 406 ++++++++++++++++++++++++++ bolt/Passes/ReorderFunctions.h | 43 +++ 16 files changed, 1138 insertions(+), 1071 deletions(-) create mode 100644 bolt/Passes/CallGraph.cpp create mode 100644 bolt/Passes/CallGraph.h create mode 100644 bolt/Passes/PettisAndHansen.cpp create mode 100644 bolt/Passes/ReorderFunctions.cpp create mode 100644 bolt/Passes/ReorderFunctions.h diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index ceb5160af23c..0dac7f0b1b04 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -13,6 +13,7 @@ #include "Passes/FrameOptimizer.h" #include "Passes/IndirectCallPromotion.h" #include "Passes/Inliner.h" +#include "Passes/ReorderFunctions.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" #include diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 0685b57efbee..d04b64e3955c 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -10,11 +10,8 @@ //===----------------------------------------------------------------------===// #include "BinaryPasses.h" -#include "HFSort.h" #include "llvm/Support/Options.h" -#include - #define DEBUG_TYPE "bolt" using namespace llvm; @@ -52,11 +49,9 @@ namespace opts { extern cl::OptionCategory BoltOptCategory; extern cl::opt 
Verbosity; -extern cl::opt RandomSeed; extern cl::opt Relocs; extern cl::opt SplitFunctions; extern bool shouldProcess(const bolt::BinaryFunction &Function); -extern size_t padFunction(const bolt::BinaryFunction &Function); enum DynoStatsSortOrder : char { Ascending, @@ -71,18 +66,6 @@ DynoStatsSortOrderOpt("print-sorted-by-order", cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory)); -static cl::opt -FunctionOrderFile("function-order", - cl::desc("file containing an ordered list of functions to use for function " - "reordering"), - cl::cat(BoltOptCategory)); - -static cl::opt -GenerateFunctionOrderFile("generate-function-order", - cl::desc("file to dump the ordered list of functions to use for function " - "reordering"), - cl::cat(BoltOptCategory)); - static cl::opt ICFUseDFS("icf-dfs", cl::desc("use DFS ordering when using -icf option"), @@ -143,41 +126,6 @@ ReorderBlocks("reorder-blocks", cl::ZeroOrMore, cl::cat(BoltOptCategory)); -cl::opt -ReorderFunctions("reorder-functions", - cl::desc("reorder and cluster functions (works only with relocations)"), - cl::init(bolt::BinaryFunction::RT_NONE), - cl::values(clEnumValN(bolt::BinaryFunction::RT_NONE, - "none", - "do not reorder functions"), - clEnumValN(bolt::BinaryFunction::RT_EXEC_COUNT, - "exec-count", - "order by execution count"), - clEnumValN(bolt::BinaryFunction::RT_HFSORT, - "hfsort", - "use hfsort algorithm"), - clEnumValN(bolt::BinaryFunction::RT_HFSORT_PLUS, - "hfsort+", - "use hfsort+ algorithm"), - clEnumValN(bolt::BinaryFunction::RT_PETTIS_HANSEN, - "pettis-hansen", - "use Pettis-Hansen algorithm"), - clEnumValN(bolt::BinaryFunction::RT_RANDOM, - "random", - "reorder functions randomly"), - clEnumValN(bolt::BinaryFunction::RT_USER, - "user", - "use function order specified by -function-order"), - clEnumValEnd), - cl::cat(BoltOptCategory)); - -static cl::opt -ReorderFunctionsUseHotSize("reorder-functions-use-hot-size", - cl::desc("use a function's hot size when doing clustering"), - 
cl::init(true), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - enum SctcModes : char { SctcAlways, SctcPreserveDirection, @@ -200,13 +148,6 @@ SctcMode("sctc-mode", cl::ZeroOrMore, cl::cat(BoltOptCategory)); -static cl::opt -UseEdgeCounts("use-edge-counts", - cl::desc("use edge count data when doing clustering"), - cl::init(true), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - } // namespace opts namespace llvm { @@ -1177,424 +1118,5 @@ void StripRepRet::runOnFunctions( } } -void ReorderFunctions::buildCallGraph(BinaryContext &BC, - std::map &BFs) { - // Add call graph nodes. - auto lookupNode = [&](BinaryFunction *Function) { - auto It = FuncToTargetId.find(Function); - if (It == FuncToTargetId.end()) { - // It's ok to use the hot size here when the function is split. This is - // because emitFunctions will emit the hot part first in the order that is - // computed by ReorderFunctions. The cold part will be emitted with the - // rest of the cold functions and code. - const auto Size = opts::ReorderFunctionsUseHotSize && Function->isSplit() - ? Function->estimateHotSize() - : Function->estimateSize(); - const auto Id = Cg.addTarget(Size); - assert(size_t(Id) == Funcs.size()); - Funcs.push_back(Function); - FuncToTargetId[Function] = Id; - // NOTE: for functions without a profile, we set the number of samples - // to zero. This will keep these functions from appearing in the hot - // section. This is a little weird because we wouldn't be trying to - // create a node for a function unless it was the target of a call from - // a hot block. The alternative would be to set the count to one or - // accumulate the number of calls from the callsite into the function - // samples. Results from perfomance testing seem to favor the zero - // count though, so I'm leaving it this way for now. - Cg.Targets[Id].Samples = Function->hasProfile() ? 
Function->getExecutionCount() : 0; - assert(Funcs[Id] == Function); - return Id; - } else { - return It->second; - } - }; - - // Add call graph edges. - uint64_t NotProcessed = 0; - uint64_t TotalCalls = 0; - for (auto &It : BFs) { - auto *Function = &It.second; - - if(!shouldOptimize(*Function) || !Function->hasProfile()) { - continue; - } - - auto BranchDataOrErr = BC.DR.getFuncBranchData(Function->getNames()); - const auto SrcId = lookupNode(Function); - uint64_t Offset = Function->getAddress(); - - auto recordCall = [&](const MCSymbol *DestSymbol, const uint64_t Count) { - if (auto *DstFunc = BC.getFunctionForSymbol(DestSymbol)) { - const auto DstId = lookupNode(DstFunc); - auto &A = Cg.incArcWeight(SrcId, DstId, Count); - if (!opts::UseEdgeCounts) { - A.AvgCallOffset += (Offset - DstFunc->getAddress()); - } - DEBUG(dbgs() << "BOLT-DEBUG: Reorder functions: call " << *Function - << " -> " << *DstFunc << " @ " << Offset << "\n"); - return true; - } - return false; - }; - - for (auto *BB : Function->layout()) { - // Don't count calls from cold blocks - if (BB->isCold()) - continue; - - for (auto &Inst : *BB) { - // Find call instructions and extract target symbols from each one. - if (BC.MIA->isCall(Inst)) { - ++TotalCalls; - if (const auto *DstSym = BC.MIA->getTargetSymbol(Inst)) { - // For direct calls, just use the BB execution count. - assert(BB->hasProfile()); - const auto Count = opts::UseEdgeCounts ? BB->getExecutionCount() : 1; - if (!recordCall(DstSym, Count)) - ++NotProcessed; - } else if (BC.MIA->hasAnnotation(Inst, "EdgeCountData")) { - // For indirect calls and jump tables, use branch data. - assert(BranchDataOrErr); - const FuncBranchData &BranchData = BranchDataOrErr.get(); - const auto DataOffset = - BC.MIA->getAnnotationAs(Inst, "EdgeCountData"); - - for (const auto &BI : BranchData.getBranchRange(DataOffset)) { - // Count each target as a separate call. 
- ++TotalCalls; - - if (!BI.To.IsSymbol) { - ++NotProcessed; - continue; - } - - auto Itr = BC.GlobalSymbols.find(BI.To.Name); - if (Itr == BC.GlobalSymbols.end()) { - ++NotProcessed; - continue; - } - - const auto *DstSym = - BC.getOrCreateGlobalSymbol(Itr->second, "FUNCat"); - - if (!recordCall(DstSym, opts::UseEdgeCounts ? BI.Branches : 1)) - ++NotProcessed; - } - } - } - - if (!opts::UseEdgeCounts) { - Offset += BC.computeCodeSize(&Inst, &Inst + 1); - } - } - } - } - outs() << "BOLT-WARNING: ReorderFunctions: " << NotProcessed - << " callsites not processed out of " << TotalCalls << "\n"; - - // Normalize arc weights. - if (!opts::UseEdgeCounts) { - for (TargetId FuncId = 0; FuncId < Cg.Targets.size(); ++FuncId) { - auto& Func = Cg.Targets[FuncId]; - for (auto Caller : Func.Preds) { - auto& A = *Cg.Arcs.find(Arc(Caller, FuncId)); - A.NormalizedWeight = A.Weight / Func.Samples; - A.AvgCallOffset /= A.Weight; - assert(A.AvgCallOffset < Cg.Targets[Caller].Size); - } - } - } else { - for (TargetId FuncId = 0; FuncId < Cg.Targets.size(); ++FuncId) { - auto &Func = Cg.Targets[FuncId]; - for (auto Caller : Func.Preds) { - auto& A = *Cg.Arcs.find(Arc(Caller, FuncId)); - A.NormalizedWeight = A.Weight / Func.Samples; - } - } - } -} - -void ReorderFunctions::reorder(std::vector &&Clusters, - std::map &BFs) { - std::vector FuncAddr(Cg.Targets.size()); // Just for computing stats - uint64_t TotalSize = 0; - uint32_t Index = 0; - - // Set order of hot functions based on clusters. 
- for (const auto& Cluster : Clusters) { - for (const auto FuncId : Cluster.Targets) { - assert(Cg.Targets[FuncId].Samples > 0); - Funcs[FuncId]->setIndex(Index++); - FuncAddr[FuncId] = TotalSize; - TotalSize += Cg.Targets[FuncId].Size; - } - } - - if (opts::ReorderFunctions == BinaryFunction::RT_NONE) - return; - - if (opts::Verbosity == 0) { -#ifndef NDEBUG - if (!DebugFlag || !isCurrentDebugType("hfsort")) - return; -#else - return; -#endif - } - - TotalSize = 0; - uint64_t CurPage = 0; - uint64_t Hotfuncs = 0; - double TotalDistance = 0; - double TotalCalls = 0; - double TotalCalls64B = 0; - double TotalCalls4KB = 0; - double TotalCalls2MB = 0; - dbgs() << "============== page 0 ==============\n"; - for (auto& Cluster : Clusters) { - dbgs() << - format("-------- density = %.3lf (%u / %u) --------\n", - (double) Cluster.Samples / Cluster.Size, - Cluster.Samples, Cluster.Size); - - for (auto FuncId : Cluster.Targets) { - if (Cg.Targets[FuncId].Samples > 0) { - Hotfuncs++; - - dbgs() << "BOLT-INFO: hot func " << *Funcs[FuncId] - << " (" << Cg.Targets[FuncId].Size << ")\n"; - - uint64_t Dist = 0; - uint64_t Calls = 0; - for (auto Dst : Cg.Targets[FuncId].Succs) { - auto& A = *Cg.Arcs.find(Arc(FuncId, Dst)); - auto D = - std::abs(FuncAddr[A.Dst] - (FuncAddr[FuncId] + A.AvgCallOffset)); - auto W = A.Weight; - Calls += W; - if (D < 64) TotalCalls64B += W; - if (D < 4096) TotalCalls4KB += W; - if (D < (2 << 20)) TotalCalls2MB += W; - Dist += A.Weight * D; - dbgs() << format("arc: %u [@%lu+%.1lf] -> %u [@%lu]: " - "weight = %.0lf, callDist = %f\n", - A.Src, FuncAddr[A.Src], A.AvgCallOffset, - A.Dst, FuncAddr[A.Dst], A.Weight, D); - } - TotalCalls += Calls; - TotalDistance += Dist; - dbgs() << format("start = %6u : avgCallDist = %lu : %s\n", - TotalSize, - Calls ? 
Dist / Calls : 0, - Funcs[FuncId]->getPrintName().c_str()); - TotalSize += Cg.Targets[FuncId].Size; - auto NewPage = TotalSize / PageSize; - if (NewPage != CurPage) { - CurPage = NewPage; - dbgs() << format("============== page %u ==============\n", CurPage); - } - } - } - } - dbgs() << format(" Number of hot functions: %u\n" - " Number of clusters: %lu\n", - Hotfuncs, Clusters.size()) - << format(" Final average call distance = %.1lf (%.0lf / %.0lf)\n", - TotalCalls ? TotalDistance / TotalCalls : 0, - TotalDistance, TotalCalls) - << format(" Total Calls = %.0lf\n", TotalCalls); - if (TotalCalls) { - dbgs() << format(" Total Calls within 64B = %.0lf (%.2lf%%)\n", - TotalCalls64B, 100 * TotalCalls64B / TotalCalls) - << format(" Total Calls within 4KB = %.0lf (%.2lf%%)\n", - TotalCalls4KB, 100 * TotalCalls4KB / TotalCalls) - << format(" Total Calls within 2MB = %.0lf (%.2lf%%)\n", - TotalCalls2MB, 100 * TotalCalls2MB / TotalCalls); - } -} - -namespace { - -std::vector readFunctionOrderFile() { - std::vector FunctionNames; - std::ifstream FuncsFile(opts::FunctionOrderFile, std::ios::in); - if (!FuncsFile) { - errs() << "Ordered functions file \"" << opts::FunctionOrderFile - << "\" can't be opened.\n"; - exit(1); - } - std::string FuncName; - while (std::getline(FuncsFile, FuncName)) { - FunctionNames.push_back(FuncName); - } - return FunctionNames; -} - -} - -void ReorderFunctions::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) { - if (!opts::Relocs && opts::ReorderFunctions != BinaryFunction::RT_NONE) { - errs() << "BOLT-ERROR: Function reordering only works when " - << "relocs are enabled.\n"; - exit(1); - } - - if (opts::ReorderFunctions != BinaryFunction::RT_NONE && - opts::ReorderFunctions != BinaryFunction::RT_EXEC_COUNT && - opts::ReorderFunctions != BinaryFunction::RT_USER) { - buildCallGraph(BC, BFs); - } - - std::vector Clusters; - - switch(opts::ReorderFunctions) { - case BinaryFunction::RT_NONE: - break; - case 
BinaryFunction::RT_EXEC_COUNT: - { - std::vector SortedFunctions(BFs.size()); - uint32_t Index = 0; - std::transform(BFs.begin(), - BFs.end(), - SortedFunctions.begin(), - [](std::pair &BFI) { - return &BFI.second; - }); - std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(), - [&](const BinaryFunction *A, const BinaryFunction *B) { - if (!opts::shouldProcess(*A)) - return false; - const auto PadA = opts::padFunction(*A); - const auto PadB = opts::padFunction(*B); - if (!PadA || !PadB) { - if (PadA) - return true; - if (PadB) - return false; - } - return !A->hasProfile() && - (B->hasProfile() || - (A->getExecutionCount() > B->getExecutionCount())); - }); - for (auto *BF : SortedFunctions) { - if (BF->hasProfile()) - BF->setIndex(Index++); - } - } - break; - case BinaryFunction::RT_HFSORT: - Clusters = clusterize(Cg); - break; - case BinaryFunction::RT_HFSORT_PLUS: - Clusters = hfsortPlus(Cg); - break; - case BinaryFunction::RT_PETTIS_HANSEN: - Clusters = pettisAndHansen(Cg); - break; - case BinaryFunction::RT_RANDOM: - std::srand(opts::RandomSeed); - Clusters = randomClusters(Cg); - break; - case BinaryFunction::RT_USER: - { - uint32_t Index = 0; - for (const auto &Function : readFunctionOrderFile()) { - std::vector FuncAddrs; - - auto Itr = BC.GlobalSymbols.find(Function); - if (Itr == BC.GlobalSymbols.end()) { - uint32_t LocalID = 1; - while(1) { - // If we can't find the main symbol name, look for alternates. 
- Itr = BC.GlobalSymbols.find(Function + "/" + std::to_string(LocalID)); - if (Itr != BC.GlobalSymbols.end()) - FuncAddrs.push_back(Itr->second); - else - break; - LocalID++; - } - } else { - FuncAddrs.push_back(Itr->second); - } - - if (FuncAddrs.empty()) { - errs() << "BOLT-WARNING: Reorder functions: can't find function for " - << Function << ".\n"; - continue; - } - - for (const auto FuncAddr : FuncAddrs) { - const auto *FuncSym = BC.getOrCreateGlobalSymbol(FuncAddr, "FUNCat"); - assert(FuncSym); - - auto *BF = BC.getFunctionForSymbol(FuncSym); - if (!BF) { - errs() << "BOLT-WARNING: Reorder functions: can't find function for " - << Function << ".\n"; - break; - } - if (!BF->hasValidIndex()) { - BF->setIndex(Index++); - } else if (opts::Verbosity > 0) { - errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function << ".\n"; - } - } - } - } - break; - } - - reorder(std::move(Clusters), BFs); - - if (!opts::GenerateFunctionOrderFile.empty()) { - std::ofstream FuncsFile(opts::GenerateFunctionOrderFile, std::ios::out); - if (!FuncsFile) { - errs() << "Ordered functions file \"" << opts::GenerateFunctionOrderFile - << "\" can't be opened.\n"; - exit(1); - } - - std::vector SortedFunctions(BFs.size()); - - std::transform(BFs.begin(), - BFs.end(), - SortedFunctions.begin(), - [](std::pair &BFI) { - return &BFI.second; - }); - - // Sort functions by index. 
- std::stable_sort( - SortedFunctions.begin(), - SortedFunctions.end(), - [](const BinaryFunction *A, const BinaryFunction *B) { - if (A->hasValidIndex() && B->hasValidIndex()) { - return A->getIndex() < B->getIndex(); - } else if (A->hasValidIndex() && !B->hasValidIndex()) { - return true; - } else if (!A->hasValidIndex() && B->hasValidIndex()) { - return false; - } else { - return A->getAddress() < B->getAddress(); - } - }); - - for (const auto *Func : SortedFunctions) { - if (!Func->hasValidIndex()) - break; - FuncsFile << Func->getSymbol()->getName().data() << "\n"; - } - FuncsFile.close(); - - outs() << "BOLT-INFO: dumped function order to \"" - << opts::GenerateFunctionOrderFile << "\"\n"; - - exit(0); - } -} - } // namespace bolt } // namespace llvm diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index 2f97f8326a50..2a25ec656ebd 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -18,6 +18,7 @@ #include "BinaryFunction.h" #include "HFSort.h" #include "llvm/Support/CommandLine.h" + #include #include #include @@ -358,29 +359,6 @@ class StripRepRet : public BinaryFunctionPass { std::set &LargeFunctions) override; }; -/// Modify function order for streaming based on hotness. 
-class ReorderFunctions : public BinaryFunctionPass { - static constexpr uint32_t PageSize = 2 << 20; - std::vector Funcs; - std::unordered_map FuncToTargetId; - TargetGraph Cg; - - void buildCallGraph(BinaryContext &BC, - std::map &BFs); - void reorder(std::vector &&Clusters, - std::map &BFs); - public: - explicit ReorderFunctions(const cl::opt &PrintPass) - : BinaryFunctionPass(PrintPass) { } - - const char *getName() const override { - return "reorder-functions"; - } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; -}; - } // namespace bolt } // namespace llvm diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt index b764de69c4c5..82c2a788e059 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/Passes/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_library(LLVMBOLTPasses BinaryPasses.cpp + CallGraph.cpp DataflowAnalysis.cpp DataflowInfoManager.cpp FrameAnalysis.cpp @@ -9,7 +10,9 @@ add_llvm_library(LLVMBOLTPasses IndirectCallPromotion.cpp Inliner.cpp LivenessAnalysis.cpp + PettisAndHansen.cpp ReorderAlgorithm.cpp + ReorderFunctions.cpp StackPointerTracking.cpp ) diff --git a/bolt/Passes/CallGraph.cpp b/bolt/Passes/CallGraph.cpp new file mode 100644 index 000000000000..50757270c547 --- /dev/null +++ b/bolt/Passes/CallGraph.cpp @@ -0,0 +1,262 @@ +//===--- Passes/CallGraph.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "CallGraph.h" +#include "BinaryFunction.h" +#include "BinaryContext.h" + +#define DEBUG_TYPE "callgraph" + +#if defined(__x86_64__) && !defined(_MSC_VER) +# if (!defined USE_SSECRC) +# define USE_SSECRC +# endif +#else +# undef USE_SSECRC +#endif + +namespace { + +inline size_t hash_int64_fallback(int64_t key) { + // "64 bit Mix Functions", from Thomas Wang's "Integer Hash Function." + // http://www.concentric.net/~ttwang/tech/inthash.htm + key = (~key) + (key << 21); // key = (key << 21) - key - 1; + key = key ^ ((unsigned long long)key >> 24); + key = (key + (key << 3)) + (key << 8); // key * 265 + key = key ^ ((unsigned long long)key >> 14); + key = (key + (key << 2)) + (key << 4); // key * 21 + key = key ^ ((unsigned long long)key >> 28); + return static_cast(static_cast(key)); +} + +inline size_t hash_int64(int64_t k) { +#if defined(USE_SSECRC) && defined(__SSE4_2__) + size_t h = 0; + __asm("crc32q %1, %0\n" : "+r"(h) : "rm"(k)); + return h; +#else + return hash_int64_fallback(k); +#endif +} + +inline size_t hash_int64_pair(int64_t k1, int64_t k2) { +#if defined(USE_SSECRC) && defined(__SSE4_2__) + // crc32 is commutative, so we need to perturb k1 so that (k1, k2) hashes + // differently from (k2, k1). 
+ k1 += k1; + __asm("crc32q %1, %0\n" : "+r" (k1) : "rm"(k2)); + return k1; +#else + return (hash_int64(k1) << 1) ^ hash_int64(k2); +#endif +} + +} + +namespace llvm { +namespace bolt { + +int64_t CallGraph::Arc::Hash::operator()(const Arc &Arc) const { +#ifdef USE_STD_HASH + std::hash Hasher; + return hashCombine(Hasher(Arc.Src), Arc.Dst); +#else + return hash_int64_pair(int64_t(Arc.Src), int64_t(Arc.Dst)); +#endif +} + +CallGraph buildCallGraph(BinaryContext &BC, + std::map &BFs, + std::function Filter, + bool IncludeColdCalls, + bool UseFunctionHotSize, + bool UseEdgeCounts) { + CallGraph Cg; + + // Add call graph nodes. + auto lookupNode = [&](BinaryFunction *Function) { + auto It = Cg.FuncToNodeId.find(Function); + if (It == Cg.FuncToNodeId.end()) { + // It's ok to use the hot size here when the function is split. This is + // because emitFunctions will emit the hot part first in the order that is + // computed by ReorderFunctions. The cold part will be emitted with the + // rest of the cold functions and code. + const auto Size = UseFunctionHotSize && Function->isSplit() + ? Function->estimateHotSize() + : Function->estimateSize(); + const auto Id = Cg.addNode(Size); + assert(size_t(Id) == Cg.Funcs.size()); + Cg.Funcs.push_back(Function); + Cg.FuncToNodeId[Function] = Id; + // NOTE: for functions without a profile, we set the number of samples + // to zero. This will keep these functions from appearing in the hot + // section. This is a little weird because we wouldn't be trying to + // create a node for a function unless it was the target of a call from + // a hot block. The alternative would be to set the count to one or + // accumulate the number of calls from the callsite into the function + // samples. Results from perfomance testing seem to favor the zero + // count though, so I'm leaving it this way for now. + Cg.Nodes[Id].Samples = Function->hasProfile() ? 
Function->getExecutionCount() : 0; + assert(Cg.Funcs[Id] == Function); + return Id; + } else { + return It->second; + } + }; + + // Add call graph edges. + uint64_t NotProcessed = 0; + uint64_t TotalCalls = 0; + for (auto &It : BFs) { + auto *Function = &It.second; + + if(Filter(*Function)) { + continue; + } + + auto BranchDataOrErr = BC.DR.getFuncBranchData(Function->getNames()); + const auto SrcId = lookupNode(Function); + uint64_t Offset = Function->getAddress(); + + auto recordCall = [&](const MCSymbol *DestSymbol, const uint64_t Count) { + if (auto *DstFunc = BC.getFunctionForSymbol(DestSymbol)) { + const auto DstId = lookupNode(DstFunc); + auto &A = Cg.incArcWeight(SrcId, DstId, Count); + if (!UseEdgeCounts) { + A.AvgCallOffset += (Offset - DstFunc->getAddress()); + } + DEBUG(dbgs() << "BOLT-DEBUG: buildCallGraph: call " << *Function + << " -> " << *DstFunc << " @ " << Offset << "\n"); + return true; + } + return false; + }; + + for (auto *BB : Function->layout()) { + // Don't count calls from cold blocks + if (BB->isCold() && !IncludeColdCalls) + continue; + + for (auto &Inst : *BB) { + // Find call instructions and extract target symbols from each one. + if (!BC.MIA->isCall(Inst)) + continue; + + ++TotalCalls; + if (const auto *DstSym = BC.MIA->getTargetSymbol(Inst)) { + // For direct calls, just use the BB execution count. + const auto Count = UseEdgeCounts && BB->hasProfile() + ? BB->getExecutionCount() : 1; + if (!recordCall(DstSym, Count)) + ++NotProcessed; + } else if (BC.MIA->hasAnnotation(Inst, "EdgeCountData")) { + // For indirect calls and jump tables, use branch data. + if(!BranchDataOrErr) { + ++NotProcessed; + continue; + } + const FuncBranchData &BranchData = BranchDataOrErr.get(); + const auto DataOffset = + BC.MIA->getAnnotationAs(Inst, "EdgeCountData"); + + for (const auto &BI : BranchData.getBranchRange(DataOffset)) { + // Count each target as a separate call. 
+ ++TotalCalls; + + if (!BI.To.IsSymbol) { + ++NotProcessed; + continue; + } + + auto Itr = BC.GlobalSymbols.find(BI.To.Name); + if (Itr == BC.GlobalSymbols.end()) { + ++NotProcessed; + continue; + } + + const auto *DstSym = + BC.getOrCreateGlobalSymbol(Itr->second, "FUNCat"); + + if (!recordCall(DstSym, UseEdgeCounts ? BI.Branches : 1)) + ++NotProcessed; + } + } + + if (!UseEdgeCounts) { + Offset += BC.computeCodeSize(&Inst, &Inst + 1); + } + } + } + } + + outs() << "BOLT-WARNING: buildCallGraph: " << NotProcessed + << " callsites not processed out of " << TotalCalls << "\n"; + + return Cg; +} + +CallGraph::NodeId CallGraph::addNode(uint32_t Size, uint32_t Samples) { + auto Id = Nodes.size(); + Nodes.emplace_back(Size, Samples); + return Id; +} + +const CallGraph::Arc &CallGraph::incArcWeight(NodeId Src, NodeId Dst, double W) { + auto Res = Arcs.emplace(Src, Dst, W); + if (!Res.second) { + Res.first->Weight += W; + return *Res.first; + } + Nodes[Src].Succs.push_back(Dst); + Nodes[Dst].Preds.push_back(Src); + return *Res.first; +} + +std::deque CallGraph::buildTraversalOrder() { + std::deque TopologicalOrder; + enum NodeStatus { NEW, VISITING, VISITED }; + std::vector NodeStatus(Funcs.size()); + std::stack Worklist; + + for (auto *Func : Funcs) { + const auto Id = FuncToNodeId.at(Func); + Worklist.push(Id); + NodeStatus[Id] = NEW; + } + + while (!Worklist.empty()) { + const auto FuncId = Worklist.top(); + Worklist.pop(); + + if (NodeStatus[FuncId] == VISITED) + continue; + + if (NodeStatus[FuncId] == VISITING) { + TopologicalOrder.push_back(Funcs[FuncId]); + NodeStatus[FuncId] = VISITED; + continue; + } + + assert(NodeStatus[FuncId] == NEW); + NodeStatus[FuncId] = VISITING; + Worklist.push(FuncId); + for (const auto Callee : Nodes[FuncId].Succs) { + if (NodeStatus[Callee] == VISITING || NodeStatus[Callee] == VISITED) + continue; + Worklist.push(Callee); + } + } + + return TopologicalOrder; +} + +} +} diff --git a/bolt/Passes/CallGraph.h b/bolt/Passes/CallGraph.h 
new file mode 100644 index 000000000000..df984bc2b7c2 --- /dev/null +++ b/bolt/Passes/CallGraph.h @@ -0,0 +1,113 @@ +//===--- Passes/CallGraph.h -----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_CALLGRAPH_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_CALLGRAPH_H + +#include +#include +#include +#include +#include +#include +#include + +namespace llvm { +namespace bolt { + +class BinaryFunction; +class BinaryContext; + +// TODO: find better place for this +inline int64_t hashCombine(const int64_t Seed, const int64_t Val) { + std::hash Hasher; + return Seed ^ (Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2)); +} + +/// A call graph class. 
+class CallGraph { +public: + using NodeId = size_t; + static constexpr NodeId InvalidId = -1; + + class Arc { + public: + struct Hash { + int64_t operator()(const Arc &Arc) const; + }; + + Arc(NodeId S, NodeId D, double W = 0) + : Src(S) + , Dst(D) + , Weight(W) + {} + Arc(const Arc&) = delete; + + friend bool operator==(const Arc &Lhs, const Arc &Rhs) { + return Lhs.Src == Rhs.Src && Lhs.Dst == Rhs.Dst; + } + + const NodeId Src; + const NodeId Dst; + mutable double Weight; + mutable double NormalizedWeight{0}; + mutable double AvgCallOffset{0}; + }; + + class Node { + public: + explicit Node(uint32_t Size, uint32_t Samples = 0) + : Size(Size), Samples(Samples) + {} + + uint32_t Size; + uint32_t Samples; + + // preds and succs contain no duplicate elements and self arcs are not allowed + std::vector Preds; + std::vector Succs; + }; + + NodeId addNode(uint32_t Size, uint32_t Samples = 0); + const Arc &incArcWeight(NodeId Src, NodeId Dst, double W = 1.0); + + /// Compute a DFS traversal of the call graph. + std::deque buildTraversalOrder(); + + std::vector Nodes; + std::unordered_set Arcs; + std::vector Funcs; + std::unordered_map FuncToNodeId; +}; + +inline bool NoFilter(const BinaryFunction &) { return false; } + +/// Builds a call graph from the map of BinaryFunctions provided in BFs. +/// The arguments control how the graph is constructed. +/// Filter is called on each function, any function that it returns true for +/// is omitted from the graph. +/// If IncludeColdCalls is true, then calls from cold BBs are considered for the +/// graph, otherwise they are ignored. +/// UseFunctionHotSize controls whether the hot size of a function is used when +/// filling in the Size attribute of new Nodes. +/// UseEdgeCounts is used to control if the AvgCallOffset attribute on Arcs is +/// computed using the offsets of call instructions. 
+CallGraph buildCallGraph(BinaryContext &BC, + std::map &BFs, + std::function Filter = NoFilter, + bool IncludeColdCalls = true, + bool UseFunctionHotSize = false, + bool UseEdgeCounts = false); + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/Passes/FrameAnalysis.cpp b/bolt/Passes/FrameAnalysis.cpp index b964dacfd3c4..e3132bb6ea69 100644 --- a/bolt/Passes/FrameAnalysis.cpp +++ b/bolt/Passes/FrameAnalysis.cpp @@ -275,71 +275,6 @@ FrameAnalysis::getFIEFor(const BinaryContext &BC, const MCInst &Inst) const { return make_error_code(errc::result_out_of_range); } -void FrameAnalysis::buildCallGraph(BinaryContext &BC, - std::map &BFs) { - for (auto &I : BFs) { - BinaryFunction &Caller = I.second; - - Functions.emplace(&Caller); - - for (BinaryBasicBlock &BB : Caller) { - for (MCInst &Inst : BB) { - if (!BC.MIA->isCall(Inst)) - continue; - - auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst); - if (!TargetSymbol) { - // This is an indirect call, we cannot record a target. - continue; - } - - auto *Function = BC.getFunctionForSymbol(TargetSymbol); - if (!Function) { - // Call to a function without a BinaryFunction object. 
- continue; - } - // Create a new edge in the call graph - CallGraphEdges[&Caller].emplace_back(Function); - ReverseCallGraphEdges[Function].emplace_back(&Caller); - } - } - } -} - -void FrameAnalysis::buildCGTraversalOrder() { - enum NodeStatus { NEW, VISITING, VISITED }; - std::unordered_map NodeStatus; - std::stack Worklist; - - for (auto *Func : Functions) { - Worklist.push(Func); - NodeStatus[Func] = NEW; - } - - while (!Worklist.empty()) { - auto *Func = Worklist.top(); - Worklist.pop(); - - if (NodeStatus[Func] == VISITED) - continue; - - if (NodeStatus[Func] == VISITING) { - TopologicalCGOrder.push_back(Func); - NodeStatus[Func] = VISITED; - continue; - } - - assert(NodeStatus[Func] == NEW); - NodeStatus[Func] = VISITING; - Worklist.push(Func); - for (auto *Callee : CallGraphEdges[Func]) { - if (NodeStatus[Callee] == VISITING || NodeStatus[Callee] == VISITED) - continue; - Worklist.push(Callee); - } - } -} - void FrameAnalysis::getInstClobberList(const BinaryContext &BC, const MCInst &Inst, BitVector &KillSet) const { @@ -412,8 +347,8 @@ void FrameAnalysis::buildClobberMap(const BinaryContext &BC) { } if (RegsKilledMap[Func] != RegsKilled || Updated) { - for (auto Caller : ReverseCallGraphEdges[Func]) { - Queue.push(Caller); + for (auto Caller : Cg.Nodes[Cg.FuncToNodeId.at(Func)].Preds) { + Queue.push(Cg.Funcs[Caller]); } } RegsKilledMap[Func] = std::move(RegsKilled); @@ -647,11 +582,11 @@ void FrameAnalysis::runOnFunctions(BinaryContext &BC, std::set &) { { NamedRegionTimer T1("Callgraph construction", "FOP breakdown", true); - buildCallGraph(BC, BFs); + Cg = buildCallGraph(BC, BFs); } { NamedRegionTimer T1("build cg traversal order", "FOP breakdown", true); - buildCGTraversalOrder(); + TopologicalCGOrder = Cg.buildTraversalOrder(); } { NamedRegionTimer T1("build clobber map", "FOP breakdown", true); diff --git a/bolt/Passes/FrameAnalysis.h b/bolt/Passes/FrameAnalysis.h index 12a350572f16..d9f6f939f977 100644 --- a/bolt/Passes/FrameAnalysis.h +++ 
b/bolt/Passes/FrameAnalysis.h @@ -13,6 +13,7 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEANALYSIS_H #include "BinaryPasses.h" +#include "CallGraph.h" #include "StackPointerTracking.h" namespace llvm { @@ -112,14 +113,8 @@ raw_ostream &operator<<(raw_ostream &OS, /// class FrameAnalysis : public BinaryFunctionPass { /// Call graph info - /// The set of functions analyzed by our call graph - std::set Functions; - /// Model the "function calls function" edges - std::map> - CallGraphEdges; - /// Model the "function called by function" edges - std::map> - ReverseCallGraphEdges; + CallGraph Cg; + /// DFS or reverse post-ordering of the call graph nodes to allow us to /// traverse the call graph bottom-up std::deque TopologicalCGOrder; @@ -169,15 +164,6 @@ class FrameAnalysis : public BinaryFunctionPass { void addFIEFor(const BinaryContext &BC, MCInst &Inst, const FrameIndexEntry &FIE); - /// Perform the initial step of populating CallGraphEdges and - /// ReverseCallGraphEdges for all functions in BFs. - void buildCallGraph(BinaryContext &BC, - std::map &BFs); - - /// Compute a DFS traversal of the call graph in Functions, CallGraphEdges - /// and ReverseCallGraphEdges and stores it in TopologicalCGOrder. - void buildCGTraversalOrder(); - /// Compute the set of registers \p Func may write to during its execution, /// starting at the point when it is called up until when it returns. 
Returns /// a BitVector the size of the target number of registers, representing the diff --git a/bolt/Passes/FrameOptimizer.cpp b/bolt/Passes/FrameOptimizer.cpp index 3fd5a83c9ac2..975bac260c45 100644 --- a/bolt/Passes/FrameOptimizer.cpp +++ b/bolt/Passes/FrameOptimizer.cpp @@ -24,71 +24,6 @@ extern cl::opt Verbosity; namespace llvm { namespace bolt { -void FrameOptimizerPass::buildCallGraph( - const BinaryContext &BC, std::map &BFs) { - for (auto &I : BFs) { - BinaryFunction &Caller = I.second; - - Functions.emplace(&Caller); - - for (BinaryBasicBlock &BB : Caller) { - for (MCInst &Inst : BB) { - if (!BC.MIA->isCall(Inst)) - continue; - - const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst); - if (!TargetSymbol) { - // This is an indirect call, we cannot record a target. - continue; - } - - const auto *Function = BC.getFunctionForSymbol(TargetSymbol); - if (!Function) { - // Call to a function without a BinaryFunction object. - continue; - } - // Create a new edge in the call graph - CallGraphEdges[&Caller].emplace_back(Function); - ReverseCallGraphEdges[Function].emplace_back(&Caller); - } - } - } -} - -void FrameOptimizerPass::buildCGTraversalOrder() { - enum NodeStatus { NEW, VISITING, VISITED }; - std::unordered_map NodeStatus; - std::stack Worklist; - - for (auto *Func : Functions) { - Worklist.push(Func); - NodeStatus[Func] = NEW; - } - - while (!Worklist.empty()) { - const auto *Func = Worklist.top(); - Worklist.pop(); - - if (NodeStatus[Func] == VISITED) - continue; - - if (NodeStatus[Func] == VISITING) { - TopologicalCGOrder.push_back(Func); - NodeStatus[Func] = VISITED; - continue; - } - - assert(NodeStatus[Func] == NEW); - NodeStatus[Func] = VISITING; - Worklist.push(Func); - for (const auto *Callee : CallGraphEdges[Func]) { - if (NodeStatus[Callee] == VISITING || NodeStatus[Callee] == VISITED) - continue; - Worklist.push(Callee); - } - } -} - void FrameOptimizerPass::getInstClobberList(const BinaryContext &BC, const MCInst &Inst, BitVector 
&KillSet) const { @@ -161,8 +96,8 @@ void FrameOptimizerPass::buildClobberMap(const BinaryContext &BC) { } if (RegsKilledMap[Func] != RegsKilled) { - for (auto Caller : ReverseCallGraphEdges[Func]) { - Queue.push(Caller); + for (auto Caller : Cg.Nodes[Cg.FuncToNodeId.at(Func)].Preds) { + Queue.push(Cg.Funcs[Caller]); } } RegsKilledMap[Func] = std::move(RegsKilled); @@ -794,8 +729,8 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC, uint64_t CountFunctionsNotOptimized{0}; uint64_t CountFunctionsFailedRestoreFI{0}; uint64_t CountDenominator{0}; - buildCallGraph(BC, BFs); - buildCGTraversalOrder(); + Cg = buildCallGraph(BC, BFs); + TopologicalCGOrder = Cg.buildTraversalOrder(); buildClobberMap(BC); for (auto &I : BFs) { auto Count = I.second.getExecutionCount(); diff --git a/bolt/Passes/FrameOptimizer.h b/bolt/Passes/FrameOptimizer.h index fbe63ea251f5..6f6fdc1b680c 100644 --- a/bolt/Passes/FrameOptimizer.h +++ b/bolt/Passes/FrameOptimizer.h @@ -13,6 +13,7 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEOPTIMIZER_H #include "BinaryPasses.h" +#include "CallGraph.h" namespace llvm { namespace bolt { @@ -75,17 +76,11 @@ class FrameOptimizerPass : public BinaryFunctionPass { uint64_t CountFunctionsAllClobber{0}; /// Call graph info - /// The set of functions analyzed by our call graph - std::set Functions; - /// Model the "function calls function" edges - std::map> - CallGraphEdges; - /// Model the "function called by function" edges - std::map> - ReverseCallGraphEdges; + CallGraph Cg; + /// DFS or reverse post-ordering of the call graph nodes to allow us to /// traverse the call graph bottom-up - std::deque TopologicalCGOrder; + std::deque TopologicalCGOrder; /// Map functions to the set of registers they may overwrite starting at when /// it is called until it returns to the caller. 
@@ -126,15 +121,6 @@ class FrameOptimizerPass : public BinaryFunctionPass { void getInstClobberList(const BinaryContext &BC, const MCInst &Inst, BitVector &KillSet) const; private: - /// Perform the initial step of populating CallGraphEdges and - /// ReverseCallGraphEdges for all functions in BFs. - void buildCallGraph(const BinaryContext &BC, - std::map &BFs); - - /// Compute a DFS traversal of the call graph in Functions, CallGraphEdges - /// and ReverseCallGraphEdges and stores it in TopologicalCGOrder. - void buildCGTraversalOrder(); - /// Compute the set of registers \p Func may write to during its execution, /// starting at the point when it is called up until when it returns. Returns /// a BitVector the size of the target number of registers, representing the diff --git a/bolt/Passes/HFSort.cpp b/bolt/Passes/HFSort.cpp index 9cab6f6f21dd..7f40ba8a3320 100644 --- a/bolt/Passes/HFSort.cpp +++ b/bolt/Passes/HFSort.cpp @@ -40,6 +40,10 @@ namespace llvm { namespace bolt { +using NodeId = CallGraph::NodeId; +using Arc = CallGraph::Arc; +using Node = CallGraph::Node; + namespace { // The number of pages to reserve for the functions with highest @@ -55,32 +59,11 @@ constexpr double MinArcProbability = 0.1; // willing to degrade it's density by merging a callee. constexpr int CallerDegradeFactor = 8; -// Maximum size of a cluster, in bytes. 
-constexpr uint32_t MaxClusterSize = 1 << 20; - -constexpr uint32_t PageSize = 2 << 20; - -} -//////////////////////////////////////////////////////////////////////////////// - -TargetId TargetGraph::addTarget(uint32_t Size, uint32_t Samples) { - auto Id = Targets.size(); - Targets.emplace_back(Size, Samples); - return Id; } -const Arc &TargetGraph::incArcWeight(TargetId Src, TargetId Dst, double W) { - auto Res = Arcs.emplace(Src, Dst, W); - if (!Res.second) { - Res.first->Weight += W; - return *Res.first; - } - Targets[Src].Succs.push_back(Dst); - Targets[Dst].Preds.push_back(Src); - return *Res.first; -} +//////////////////////////////////////////////////////////////////////////////// -Cluster::Cluster(TargetId Id, const TargetNode &Func) { +Cluster::Cluster(NodeId Id, const Node &Func) { Targets.push_back(Id); Size = Func.Size; Samples = Func.Samples; @@ -103,53 +86,47 @@ std::string Cluster::toString() const { } namespace { -//////////////////////////////////////////////////////////////////////////////// -bool compareClustersDensity(const Cluster &C1, const Cluster &C2) { - return C1.density() > C2.density(); -} - -//////////////////////////////////////////////////////////////////////////////// - -void freezeClusters(const TargetGraph &Cg, std::vector &Clusters) { +void freezeClusters(const CallGraph &Cg, std::vector &Clusters) { uint32_t TotalSize = 0; std::sort(Clusters.begin(), Clusters.end(), compareClustersDensity); for (auto &C : Clusters) { uint32_t NewSize = TotalSize + C.Size; - if (NewSize > FrozenPages * PageSize) break; + if (NewSize > FrozenPages * HugePageSize) break; C.Frozen = true; TotalSize = NewSize; auto Fid = C.Targets[0]; DEBUG(dbgs() << format("freezing cluster for func %d, size = %u, samples = %u)\n", - Fid, Cg.Targets[Fid].Size, Cg.Targets[Fid].Samples);); + Fid, Cg.Nodes[Fid].Size, Cg.Nodes[Fid].Samples);); } } -void mergeInto(Cluster &Into, Cluster&& Other, const double Aw = 0) { - Into.Targets.insert(Into.Targets.end(), - 
Other.Targets.begin(), - Other.Targets.end()); - Into.Size += Other.Size; - Into.Samples += Other.Samples; +} + +void Cluster::merge(Cluster&& Other, const double Aw) { + Targets.insert(Targets.end(), + Other.Targets.begin(), + Other.Targets.end()); + Size += Other.Size; + Samples += Other.Samples; Other.Size = 0; Other.Samples = 0; Other.Targets.clear(); } -} -std::vector clusterize(const TargetGraph &Cg) { - std::vector SortedFuncs; +std::vector clusterize(const CallGraph &Cg) { + std::vector SortedFuncs; - // indexed by TargetId, keeps it's current cluster - std::vector FuncCluster(Cg.Targets.size(), nullptr); + // indexed by NodeId, keeps it's current cluster + std::vector FuncCluster(Cg.Nodes.size(), nullptr); std::vector Clusters; - Clusters.reserve(Cg.Targets.size()); + Clusters.reserve(Cg.Nodes.size()); - for (TargetId F = 0; F < Cg.Targets.size(); F++) { - if (Cg.Targets[F].Samples == 0) continue; - Clusters.emplace_back(F, Cg.Targets[F]); + for (NodeId F = 0; F < Cg.Nodes.size(); F++) { + if (Cg.Nodes[F].Samples == 0) continue; + Clusters.emplace_back(F, Cg.Nodes[F]); SortedFuncs.push_back(F); } @@ -164,9 +141,9 @@ std::vector clusterize(const TargetGraph &Cg) { std::sort( SortedFuncs.begin(), SortedFuncs.end(), - [&] (const TargetId F1, const TargetId F2) { - const auto &Func1 = Cg.Targets[F1]; - const auto &Func2 = Cg.Targets[F2]; + [&] (const NodeId F1, const NodeId F2) { + const auto &Func1 = Cg.Nodes[F1]; + const auto &Func2 = Cg.Nodes[F2]; return (uint64_t)Func1.Samples * Func2.Size > // TODO: is this correct? (uint64_t)Func2.Samples * Func1.Size; @@ -180,12 +157,12 @@ std::vector clusterize(const TargetGraph &Cg) { if (Cluster->Frozen) continue; // Find best predecessor. 
- TargetId BestPred = InvalidId; + NodeId BestPred = CallGraph::InvalidId; double BestProb = 0; - for (const auto Src : Cg.Targets[Fid].Preds) { + for (const auto Src : Cg.Nodes[Fid].Preds) { auto &A = *Cg.Arcs.find(Arc(Src, Fid)); - if (BestPred == InvalidId || A.NormalizedWeight > BestProb) { + if (BestPred == CallGraph::InvalidId || A.NormalizedWeight > BestProb) { BestPred = A.Src; BestProb = A.NormalizedWeight; } @@ -196,7 +173,7 @@ std::vector clusterize(const TargetGraph &Cg) { // caller is too low. if (BestProb < MinArcProbability) continue; - assert(BestPred != InvalidId); + assert(BestPred != CallGraph::InvalidId); auto PredCluster = FuncCluster[BestPred]; @@ -223,13 +200,13 @@ std::vector clusterize(const TargetGraph &Cg) { DEBUG(dbgs() << format("merging %s -> %s: %u\n", PredCluster->toString().c_str(), Cluster->toString().c_str(), - Cg.Targets[Fid].Samples);); + Cg.Nodes[Fid].Samples);); for (auto F : Cluster->Targets) { FuncCluster[F] = PredCluster; } - mergeInto(*PredCluster, std::move(*Cluster)); + PredCluster->merge(std::move(*Cluster)); } // Return the set of Clusters that are left, which are the ones that @@ -250,203 +227,14 @@ std::vector clusterize(const TargetGraph &Cg) { return SortedClusters; } -//////////////////////////////////////////////////////////////////////////////// - -namespace { -class ClusterArc { -public: - ClusterArc(Cluster *Ca, Cluster *Cb, double W = 0) - : C1(std::min(Ca, Cb)) - , C2(std::max(Ca, Cb)) - , Weight(W) - {} - - friend bool operator==(const ClusterArc &Lhs, const ClusterArc &Rhs) { - return Lhs.C1 == Rhs.C1 && Lhs.C2 == Rhs.C2; - } - - Cluster *const C1; - Cluster *const C2; - mutable double Weight; -}; - -class ClusterArcHash { -public: - int64_t operator()(const ClusterArc &Arc) const { - std::hash Hasher; - return hashCombine(Hasher(int64_t(Arc.C1)), int64_t(Arc.C2)); - } -}; - -using ClusterArcSet = std::unordered_set; - -void orderFuncs(const TargetGraph &Cg, Cluster *C1, Cluster *C2) { - TargetId C1head = 
C1->Targets.front(); - TargetId C1tail = C1->Targets.back(); - TargetId C2head = C2->Targets.front(); - TargetId C2tail = C2->Targets.back(); - - double C1headC2head = 0; - double C1headC2tail = 0; - double C1tailC2head = 0; - double C1tailC2tail = 0; - - for (const auto &Arc : Cg.Arcs) { - if ((Arc.Src == C1head && Arc.Dst == C2head) || - (Arc.Dst == C1head && Arc.Src == C2head)) { - C1headC2head += Arc.Weight; - } else if ((Arc.Src == C1head && Arc.Dst == C2tail) || - (Arc.Dst == C1head && Arc.Src == C2tail)) { - C1headC2tail += Arc.Weight; - } else if ((Arc.Src == C1tail && Arc.Dst == C2head) || - (Arc.Dst == C1tail && Arc.Src == C2head)) { - C1tailC2head += Arc.Weight; - } else if ((Arc.Src == C1tail && Arc.Dst == C2tail) || - (Arc.Dst == C1tail && Arc.Src == C2tail)) { - C1tailC2tail += Arc.Weight; - } - } - - const double Max = std::max(std::max(C1headC2head, C1headC2tail), - std::max(C1tailC2head, C1tailC2tail)); - - if (C1headC2head == Max) { - // flip C1 - std::reverse(C1->Targets.begin(), C1->Targets.end()); - } else if (C1headC2tail == Max) { - // flip C1 C2 - std::reverse(C1->Targets.begin(), C1->Targets.end()); - std::reverse(C2->Targets.begin(), C2->Targets.end()); - } else if (C1tailC2tail == Max) { - // flip C2 - std::reverse(C2->Targets.begin(), C2->Targets.end()); - } -} -} - -std::vector pettisAndHansen(const TargetGraph &Cg) { - // indexed by TargetId, keeps its current cluster - std::vector FuncCluster(Cg.Targets.size(), nullptr); - std::vector Clusters; - std::vector Funcs; - - Clusters.reserve(Cg.Targets.size()); - - for (TargetId F = 0; F < Cg.Targets.size(); F++) { - if (Cg.Targets[F].Samples == 0) continue; - Clusters.emplace_back(F, Cg.Targets[F]); - FuncCluster[F] = &Clusters.back(); - Funcs.push_back(F); - } - - ClusterArcSet Carcs; - - auto insertOrInc = [&](Cluster *C1, Cluster *C2, double Weight) { - auto Res = Carcs.emplace(C1, C2, Weight); - if (!Res.second) { - Res.first->Weight += Weight; - } - }; - - // Create a std::vector of 
cluster arcs - - for (auto &Arc : Cg.Arcs) { - if (Arc.Weight == 0) continue; - - auto const S = FuncCluster[Arc.Src]; - auto const D = FuncCluster[Arc.Dst]; - - // ignore if s or d is nullptr - - if (S == nullptr || D == nullptr) continue; - - // ignore self-edges - - if (S == D) continue; - - insertOrInc(S, D, Arc.Weight); - } - - // Find an arc with max weight and merge its nodes - - while (!Carcs.empty()) { - auto Maxpos = std::max_element( - Carcs.begin(), - Carcs.end(), - [&] (const ClusterArc &Carc1, const ClusterArc &Carc2) { - return Carc1.Weight < Carc2.Weight; - } - ); - - auto Max = *Maxpos; - Carcs.erase(Maxpos); - - auto const C1 = Max.C1; - auto const C2 = Max.C2; - - if (C1->Size + C2->Size > MaxClusterSize) continue; - - if (C1->Frozen || C2->Frozen) continue; - - // order functions and merge cluster - - orderFuncs(Cg, C1, C2); - - DEBUG(dbgs() << format("merging %s -> %s: %.1f\n", C2->toString().c_str(), - C1->toString().c_str(), Max.Weight);); - - // update carcs: merge C1arcs to C2arcs - - std::unordered_map C2arcs; - for (auto &Carc : Carcs) { - if (Carc.C1 == C2) C2arcs.emplace(Carc, Carc.C2); - if (Carc.C2 == C2) C2arcs.emplace(Carc, Carc.C1); - } - - for (auto It : C2arcs) { - auto const C = It.second; - auto const C2arc = It.first; - - insertOrInc(C, C1, C2arc.Weight); - Carcs.erase(C2arc); - } - - // update FuncCluster - - for (auto F : C2->Targets) { - FuncCluster[F] = C1; - } - mergeInto(*C1, std::move(*C2), Max.Weight); - } - - // Return the set of Clusters that are left, which are the ones that - // didn't get merged. 
- - std::set LiveClusters; - std::vector OutClusters; - - for (auto Fid : Funcs) { - LiveClusters.insert(FuncCluster[Fid]); - } - for (auto C : LiveClusters) { - OutClusters.push_back(std::move(*C)); - } - - std::sort(OutClusters.begin(), - OutClusters.end(), - compareClustersDensity); - - return OutClusters; -} - -std::vector randomClusters(const TargetGraph &Cg) { - std::vector FuncIds(Cg.Targets.size(), 0); +std::vector randomClusters(const CallGraph &Cg) { + std::vector FuncIds(Cg.Nodes.size(), 0); std::vector Clusters; - Clusters.reserve(Cg.Targets.size()); + Clusters.reserve(Cg.Nodes.size()); - for (TargetId F = 0; F < Cg.Targets.size(); F++) { - if (Cg.Targets[F].Samples == 0) continue; - Clusters.emplace_back(F, Cg.Targets[F]); + for (NodeId F = 0; F < Cg.Nodes.size(); F++) { + if (Cg.Nodes[F].Samples == 0) continue; + Clusters.emplace_back(F, Cg.Nodes[F]); } std::sort(Clusters.begin(), @@ -477,7 +265,7 @@ std::vector randomClusters(const TargetGraph &Cg) { if (MergeIdx == Clusters.size()) { ++Idx; } else { - mergeInto(Clusters[Idx], std::move(Clusters[MergeIdx])); + Clusters[Idx].merge(std::move(Clusters[MergeIdx])); Clusters.erase(Clusters.begin() + MergeIdx); } } diff --git a/bolt/Passes/HFSort.h b/bolt/Passes/HFSort.h index acf4e87d2b84..7bcd974d7515 100644 --- a/bolt/Passes/HFSort.h +++ b/bolt/Passes/HFSort.h @@ -37,157 +37,60 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_HFSORT_H #define LLVM_TOOLS_LLVM_BOLT_HFSORT_H +#include "CallGraph.h" + #include -#include #include -#include - -#if defined(__x86_64__) && !defined(_MSC_VER) -# if (!defined USE_SSECRC) -# define USE_SSECRC -# endif -#else -# undef USE_SSECRC -#endif namespace llvm { namespace bolt { -using TargetId = size_t; -constexpr TargetId InvalidId = -1; - -class Arc { -public: - Arc(TargetId S, TargetId D, double W = 0) - : Src(S) - , Dst(D) - , Weight(W) - {} - Arc(const Arc&) = delete; - - friend bool operator==(const Arc &Lhs, const Arc &Rhs) { - return Lhs.Src == Rhs.Src && Lhs.Dst == Rhs.Dst; - } - - 
const TargetId Src; - const TargetId Dst; - mutable double Weight; - mutable double NormalizedWeight{0}; - mutable double AvgCallOffset{0}; -}; - -namespace { - -inline int64_t hashCombine(const int64_t Seed, const int64_t Val) { - std::hash Hasher; - return Seed ^ (Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2)); -} - -inline size_t hash_int64_fallback(int64_t key) { - // "64 bit Mix Functions", from Thomas Wang's "Integer Hash Function." - // http://www.concentric.net/~ttwang/tech/inthash.htm - key = (~key) + (key << 21); // key = (key << 21) - key - 1; - key = key ^ ((unsigned long long)key >> 24); - key = (key + (key << 3)) + (key << 8); // key * 265 - key = key ^ ((unsigned long long)key >> 14); - key = (key + (key << 2)) + (key << 4); // key * 21 - key = key ^ ((unsigned long long)key >> 28); - return static_cast(static_cast(key)); -} - -inline size_t hash_int64(int64_t k) { -#if defined(USE_SSECRC) && defined(__SSE4_2__) - size_t h = 0; - __asm("crc32q %1, %0\n" : "+r"(h) : "rm"(k)); - return h; -#else - return hash_int64_fallback(k); -#endif -} - -inline size_t hash_int64_pair(int64_t k1, int64_t k2) { -#if defined(USE_SSECRC) && defined(__SSE4_2__) - // crc32 is commutative, so we need to perturb k1 so that (k1, k2) hashes - // differently from (k2, k1). 
- k1 += k1; - __asm("crc32q %1, %0\n" : "+r" (k1) : "rm"(k2)); - return k1; -#else - return (hash_int64(k1) << 1) ^ hash_int64(k2); -#endif -} - -} - -class ArcHash { -public: - int64_t operator()(const Arc &Arc) const { -#ifdef USE_STD_HASH - std::hash Hasher; - return hashCombine(Hasher(Arc.Src), Arc.Dst); -#else - return hash_int64_pair(int64_t(Arc.Src), int64_t(Arc.Dst)); -#endif - } -}; - -class TargetNode { -public: - explicit TargetNode(uint32_t Size, uint32_t Samples = 0) - : Size(Size), Samples(Samples) - {} - - uint32_t Size; - uint32_t Samples; - - // preds and succs contain no duplicate elements and self arcs are not allowed - std::vector Preds; - std::vector Succs; -}; - -class TargetGraph { -public: - TargetId addTarget(uint32_t Size, uint32_t Samples = 0); - const Arc &incArcWeight(TargetId Src, TargetId Dst, double W = 1.0); - - std::vector Targets; - std::unordered_set Arcs; -}; - class Cluster { public: - Cluster(TargetId Id, const TargetNode &F); + Cluster(CallGraph::NodeId Id, const CallGraph::Node &F); std::string toString() const; double density() const { return (double)Samples / Size; } - std::vector Targets; + void merge(Cluster &&Other, const double Aw = 0); + + std::vector Targets; uint32_t Samples; uint32_t Size; bool Frozen; // not a candidate for merging }; +// Maximum size of a cluster, in bytes. +constexpr uint32_t MaxClusterSize = 1 << 20; + +// Size of a huge page in bytes. +constexpr uint32_t HugePageSize = 2 << 20; + +inline bool compareClustersDensity(const Cluster &C1, const Cluster &C2) { + return C1.density() > C2.density(); +} + /* * Cluster functions in order to minimize call distance. */ -std::vector clusterize(const TargetGraph &Cg); +std::vector clusterize(const CallGraph &Cg); /* * Optimize function placement for iTLB cache and i-cache. */ -std::vector hfsortPlus(const TargetGraph &Cg); +std::vector hfsortPlus(const CallGraph &Cg); /* * Pettis-Hansen code layout algorithm * reference: K. Pettis and R. C. 
Hansen, "Profile Guided Code Positioning", * PLDI '90 */ -std::vector pettisAndHansen(const TargetGraph &Cg); +std::vector pettisAndHansen(const CallGraph &Cg); /* Group functions into clusters randomly. */ -std::vector randomClusters(const TargetGraph &Cg); +std::vector randomClusters(const CallGraph &Cg); } } diff --git a/bolt/Passes/HFSortPlus.cpp b/bolt/Passes/HFSortPlus.cpp index 523841210a56..bb925f88da50 100644 --- a/bolt/Passes/HFSortPlus.cpp +++ b/bolt/Passes/HFSortPlus.cpp @@ -43,6 +43,10 @@ namespace llvm { namespace bolt { +using NodeId = CallGraph::NodeId; +using Arc = CallGraph::Arc; +using Node = CallGraph::Node; + namespace { // The size of a cache page @@ -117,7 +121,7 @@ class PrecomputedResults { // A wrapper for algorthm-wide variables struct AlgoState { // the call graph - const TargetGraph *Cg; + const CallGraph *Cg; // the total number of samples in the graph double TotalSamples; // target_id => cluster @@ -126,10 +130,6 @@ struct AlgoState { std::vector Addr; }; -bool compareClustersDensity(const Cluster &C1, const Cluster &C2) { - return C1.density() > C2.density(); -} - } /* @@ -199,7 +199,7 @@ double expectedCacheHitRatio(const AlgoState &State, sortByDensity(Clusters); // generate function addresses with an alignment - std::vector Addr(State.Cg->Targets.size(), InvalidAddr); + std::vector Addr(State.Cg->Nodes.size(), InvalidAddr); size_t CurAddr = 0; // 'hotness' of the pages std::vector PageSamples; @@ -207,11 +207,11 @@ double expectedCacheHitRatio(const AlgoState &State, for (auto TargetId : Cluster->Targets) { if (CurAddr & 0xf) CurAddr = (CurAddr & ~0xf) + 16; Addr[TargetId] = CurAddr; - CurAddr += State.Cg->Targets[TargetId].Size; + CurAddr += State.Cg->Nodes[TargetId].Size; // update page weight size_t Page = Addr[TargetId] / PageSize; while (PageSamples.size() <= Page) PageSamples.push_back(0.0); - PageSamples[Page] += State.Cg->Targets[TargetId].Samples; + PageSamples[Page] += State.Cg->Nodes[TargetId].Samples; } } @@ -220,12 
+220,12 @@ double expectedCacheHitRatio(const AlgoState &State, for (auto Cluster : Clusters) { for (auto TargetId : Cluster->Targets) { size_t Page = Addr[TargetId] / PageSize; - double Samples = State.Cg->Targets[TargetId].Samples; + double Samples = State.Cg->Nodes[TargetId].Samples; // probability that the page is not present in the cache double MissProb = missProbability(State, PageSamples[Page]); - for (auto Pred : State.Cg->Targets[TargetId].Preds) { - if (State.Cg->Targets[Pred].Samples == 0) continue; + for (auto Pred : State.Cg->Nodes[TargetId].Preds) { + if (State.Cg->Nodes[Pred].Samples == 0) continue; auto A = State.Cg->Arcs.find(Arc(Pred, TargetId)); // the source page @@ -252,13 +252,13 @@ std::unordered_set adjacentClusters(const AlgoState &State, Cluster *C) { std::unordered_set Result; for (auto TargetId : C->Targets) { - for (auto Succ : State.Cg->Targets[TargetId].Succs) { + for (auto Succ : State.Cg->Nodes[TargetId].Succs) { auto SuccCluster = State.FuncCluster[Succ]; if (SuccCluster != nullptr && SuccCluster != C) { Result.insert(SuccCluster); } } - for (auto Pred : State.Cg->Targets[TargetId].Preds) { + for (auto Pred : State.Cg->Nodes[TargetId].Preds) { auto PredCluster = State.FuncCluster[Pred]; if (PredCluster != nullptr && PredCluster != C) { Result.insert(PredCluster); @@ -286,7 +286,7 @@ double expectedCalls(int64_t SrcAddr, int64_t DstAddr, double EdgeWeight) { double shortCalls(const AlgoState &State, Cluster *Cluster) { double Calls = 0; for (auto TargetId : Cluster->Targets) { - for (auto Succ : State.Cg->Targets[TargetId].Succs) { + for (auto Succ : State.Cg->Nodes[TargetId].Succs) { if (State.FuncCluster[Succ] == Cluster) { auto A = State.Cg->Arcs.find(Arc(TargetId, Succ)); @@ -310,7 +310,7 @@ double shortCalls(const AlgoState &State, Cluster *ClusterSucc) { double Calls = 0; for (auto TargetId : ClusterPred->Targets) { - for (auto Succ : State.Cg->Targets[TargetId].Succs) { + for (auto Succ : State.Cg->Nodes[TargetId].Succs) { if 
(State.FuncCluster[Succ] == ClusterSucc) { auto A = State.Cg->Arcs.find(Arc(TargetId, Succ)); @@ -323,7 +323,7 @@ double shortCalls(const AlgoState &State, } for (auto TargetId : ClusterPred->Targets) { - for (auto Pred : State.Cg->Targets[TargetId].Preds) { + for (auto Pred : State.Cg->Nodes[TargetId].Preds) { if (State.FuncCluster[Pred] == ClusterSucc) { auto A = State.Cg->Arcs.find(Arc(Pred, TargetId)); @@ -389,7 +389,7 @@ void mergeInto(AlgoState &State, Cluster *Into, Cluster *Other) { for (auto TargetId : Into->Targets) { State.FuncCluster[TargetId] = Into; State.Addr[TargetId] = CurAddr; - CurAddr += State.Cg->Targets[TargetId].Size; + CurAddr += State.Cg->Nodes[TargetId].Size; } Other->Size = 0; @@ -400,29 +400,29 @@ void mergeInto(AlgoState &State, Cluster *Into, Cluster *Other) { /* * HFSortPlus - layout of hot functions with iTLB cache optimization */ -std::vector hfsortPlus(const TargetGraph &Cg) { +std::vector hfsortPlus(const CallGraph &Cg) { // create a cluster for every function std::vector AllClusters; - AllClusters.reserve(Cg.Targets.size()); - for (TargetId F = 0; F < Cg.Targets.size(); F++) { - AllClusters.emplace_back(F, Cg.Targets[F]); + AllClusters.reserve(Cg.Nodes.size()); + for (NodeId F = 0; F < Cg.Nodes.size(); F++) { + AllClusters.emplace_back(F, Cg.Nodes[F]); } // initialize objects used by the algorithm std::vector Clusters; - Clusters.reserve(Cg.Targets.size()); + Clusters.reserve(Cg.Nodes.size()); AlgoState State; State.Cg = &Cg; State.TotalSamples = 0; - State.FuncCluster = std::vector(Cg.Targets.size(), nullptr); - State.Addr = std::vector(Cg.Targets.size(), InvalidAddr); - for (TargetId F = 0; F < Cg.Targets.size(); F++) { - if (Cg.Targets[F].Samples == 0) continue; + State.FuncCluster = std::vector(Cg.Nodes.size(), nullptr); + State.Addr = std::vector(Cg.Nodes.size(), InvalidAddr); + for (NodeId F = 0; F < Cg.Nodes.size(); F++) { + if (Cg.Nodes[F].Samples == 0) continue; Clusters.push_back(&AllClusters[F]); State.FuncCluster[F] = 
&AllClusters[F]; State.Addr[F] = 0; - State.TotalSamples += Cg.Targets[F].Samples; + State.TotalSamples += Cg.Nodes[F].Samples; } DEBUG(dbgs() << "Starting hfsort+ for " << Clusters.size() << " clusters\n" diff --git a/bolt/Passes/PettisAndHansen.cpp b/bolt/Passes/PettisAndHansen.cpp new file mode 100644 index 000000000000..d8d828726a04 --- /dev/null +++ b/bolt/Passes/PettisAndHansen.cpp @@ -0,0 +1,206 @@ +#include "HFSort.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +#undef DEBUG_TYPE +#define DEBUG_TYPE "hfsort" + +namespace llvm { +namespace bolt { + +using NodeId = CallGraph::NodeId; +using Arc = CallGraph::Arc; +using Node = CallGraph::Node; + +namespace { +class ClusterArc { +public: + ClusterArc(Cluster *Ca, Cluster *Cb, double W = 0) + : C1(std::min(Ca, Cb)) + , C2(std::max(Ca, Cb)) + , Weight(W) + {} + + friend bool operator==(const ClusterArc &Lhs, const ClusterArc &Rhs) { + return Lhs.C1 == Rhs.C1 && Lhs.C2 == Rhs.C2; + } + + Cluster *const C1; + Cluster *const C2; + mutable double Weight; +}; + +class ClusterArcHash { +public: + int64_t operator()(const ClusterArc &Arc) const { + std::hash Hasher; + return hashCombine(Hasher(int64_t(Arc.C1)), int64_t(Arc.C2)); + } +}; + +using ClusterArcSet = std::unordered_set; + +void orderFuncs(const CallGraph &Cg, Cluster *C1, Cluster *C2) { + auto C1head = C1->Targets.front(); + auto C1tail = C1->Targets.back(); + auto C2head = C2->Targets.front(); + auto C2tail = C2->Targets.back(); + + double C1headC2head = 0; + double C1headC2tail = 0; + double C1tailC2head = 0; + double C1tailC2tail = 0; + + for (const auto &Arc : Cg.Arcs) { + if ((Arc.Src == C1head && Arc.Dst == C2head) || + (Arc.Dst == C1head && Arc.Src == C2head)) { + C1headC2head += Arc.Weight; + } else if ((Arc.Src == C1head && Arc.Dst == C2tail) || + (Arc.Dst == C1head && Arc.Src == C2tail)) { + C1headC2tail += Arc.Weight; + } else if ((Arc.Src == C1tail && Arc.Dst == 
C2head) || + (Arc.Dst == C1tail && Arc.Src == C2head)) { + C1tailC2head += Arc.Weight; + } else if ((Arc.Src == C1tail && Arc.Dst == C2tail) || + (Arc.Dst == C1tail && Arc.Src == C2tail)) { + C1tailC2tail += Arc.Weight; + } + } + + const double Max = std::max(std::max(C1headC2head, C1headC2tail), + std::max(C1tailC2head, C1tailC2tail)); + + if (C1headC2head == Max) { + // flip C1 + std::reverse(C1->Targets.begin(), C1->Targets.end()); + } else if (C1headC2tail == Max) { + // flip C1 C2 + std::reverse(C1->Targets.begin(), C1->Targets.end()); + std::reverse(C2->Targets.begin(), C2->Targets.end()); + } else if (C1tailC2tail == Max) { + // flip C2 + std::reverse(C2->Targets.begin(), C2->Targets.end()); + } +} +} + +std::vector pettisAndHansen(const CallGraph &Cg) { + // indexed by NodeId, keeps its current cluster + std::vector FuncCluster(Cg.Nodes.size(), nullptr); + std::vector Clusters; + std::vector Funcs; + + Clusters.reserve(Cg.Nodes.size()); + + for (NodeId F = 0; F < Cg.Nodes.size(); F++) { + if (Cg.Nodes[F].Samples == 0) continue; + Clusters.emplace_back(F, Cg.Nodes[F]); + FuncCluster[F] = &Clusters.back(); + Funcs.push_back(F); + } + + ClusterArcSet Carcs; + + auto insertOrInc = [&](Cluster *C1, Cluster *C2, double Weight) { + auto Res = Carcs.emplace(C1, C2, Weight); + if (!Res.second) { + Res.first->Weight += Weight; + } + }; + + // Create a std::vector of cluster arcs + + for (auto &Arc : Cg.Arcs) { + if (Arc.Weight == 0) continue; + + auto const S = FuncCluster[Arc.Src]; + auto const D = FuncCluster[Arc.Dst]; + + // ignore if s or d is nullptr + + if (S == nullptr || D == nullptr) continue; + + // ignore self-edges + + if (S == D) continue; + + insertOrInc(S, D, Arc.Weight); + } + + // Find an arc with max weight and merge its nodes + + while (!Carcs.empty()) { + auto Maxpos = std::max_element( + Carcs.begin(), + Carcs.end(), + [&] (const ClusterArc &Carc1, const ClusterArc &Carc2) { + return Carc1.Weight < Carc2.Weight; + } + ); + + auto Max = *Maxpos; + 
Carcs.erase(Maxpos); + + auto const C1 = Max.C1; + auto const C2 = Max.C2; + + if (C1->Size + C2->Size > MaxClusterSize) continue; + + if (C1->Frozen || C2->Frozen) continue; + + // order functions and merge cluster + + orderFuncs(Cg, C1, C2); + + DEBUG(dbgs() << format("merging %s -> %s: %.1f\n", C2->toString().c_str(), + C1->toString().c_str(), Max.Weight);); + + // update carcs: merge C1arcs to C2arcs + + std::unordered_map C2arcs; + for (auto &Carc : Carcs) { + if (Carc.C1 == C2) C2arcs.emplace(Carc, Carc.C2); + if (Carc.C2 == C2) C2arcs.emplace(Carc, Carc.C1); + } + + for (auto It : C2arcs) { + auto const C = It.second; + auto const C2arc = It.first; + + insertOrInc(C, C1, C2arc.Weight); + Carcs.erase(C2arc); + } + + // update FuncCluster + + for (auto F : C2->Targets) { + FuncCluster[F] = C1; + } + C1->merge(std::move(*C2), Max.Weight); + } + + // Return the set of Clusters that are left, which are the ones that + // didn't get merged. + + std::set LiveClusters; + std::vector OutClusters; + + for (auto Fid : Funcs) { + LiveClusters.insert(FuncCluster[Fid]); + } + for (auto C : LiveClusters) { + OutClusters.push_back(std::move(*C)); + } + + std::sort(OutClusters.begin(), + OutClusters.end(), + compareClustersDensity); + + return OutClusters; +} + +} +} diff --git a/bolt/Passes/ReorderFunctions.cpp b/bolt/Passes/ReorderFunctions.cpp new file mode 100644 index 000000000000..09320bdab7cd --- /dev/null +++ b/bolt/Passes/ReorderFunctions.cpp @@ -0,0 +1,406 @@ +//===--- ReorderFunctions.cpp - Function reordering pass ------------ -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "ReorderFunctions.h" +#include "llvm/Support/Options.h" +#include + +#define DEBUG_TYPE "hfsort" + +using namespace llvm; + +namespace opts { + +extern cl::OptionCategory BoltOptCategory; +extern cl::opt Verbosity; +extern cl::opt Relocs; +extern cl::opt RandomSeed; + +extern bool shouldProcess(const bolt::BinaryFunction &Function); +extern size_t padFunction(const bolt::BinaryFunction &Function); + +cl::opt +ReorderFunctions("reorder-functions", + cl::desc("reorder and cluster functions (works only with relocations)"), + cl::init(bolt::BinaryFunction::RT_NONE), + cl::values(clEnumValN(bolt::BinaryFunction::RT_NONE, + "none", + "do not reorder functions"), + clEnumValN(bolt::BinaryFunction::RT_EXEC_COUNT, + "exec-count", + "order by execution count"), + clEnumValN(bolt::BinaryFunction::RT_HFSORT, + "hfsort", + "use hfsort algorithm"), + clEnumValN(bolt::BinaryFunction::RT_HFSORT_PLUS, + "hfsort+", + "use hfsort+ algorithm"), + clEnumValN(bolt::BinaryFunction::RT_PETTIS_HANSEN, + "pettis-hansen", + "use Pettis-Hansen algorithm"), + clEnumValN(bolt::BinaryFunction::RT_RANDOM, + "random", + "reorder functions randomly"), + clEnumValN(bolt::BinaryFunction::RT_USER, + "user", + "use function order specified by -function-order"), + clEnumValEnd), + cl::cat(BoltOptCategory)); + +static cl::opt +ReorderFunctionsUseHotSize("reorder-functions-use-hot-size", + cl::desc("use a function's hot size when doing clustering"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +FunctionOrderFile("function-order", + cl::desc("file containing an ordered list of functions to use for function " + "reordering"), + cl::cat(BoltOptCategory)); + +static cl::opt +GenerateFunctionOrderFile("generate-function-order", + cl::desc("file to dump the ordered list of functions to use for 
function " + "reordering"), + cl::cat(BoltOptCategory)); + +static cl::opt +UseEdgeCounts("use-edge-counts", + cl::desc("use edge count data when doing clustering"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +} // namespace opts + +namespace llvm { +namespace bolt { + +using NodeId = CallGraph::NodeId; +using Arc = CallGraph::Arc; +using Node = CallGraph::Node; + +void ReorderFunctions::normalizeArcWeights() { + // Normalize arc weights. + if (!opts::UseEdgeCounts) { + for (NodeId FuncId = 0; FuncId < Cg.Nodes.size(); ++FuncId) { + auto& Func = Cg.Nodes[FuncId]; + for (auto Caller : Func.Preds) { + auto& A = *Cg.Arcs.find(Arc(Caller, FuncId)); + A.NormalizedWeight = A.Weight / Func.Samples; + A.AvgCallOffset /= A.Weight; + assert(A.AvgCallOffset < Cg.Nodes[Caller].Size); + } + } + } else { + for (NodeId FuncId = 0; FuncId < Cg.Nodes.size(); ++FuncId) { + auto &Func = Cg.Nodes[FuncId]; + for (auto Caller : Func.Preds) { + auto& A = *Cg.Arcs.find(Arc(Caller, FuncId)); + A.NormalizedWeight = A.Weight / Func.Samples; + } + } + } +} + +void ReorderFunctions::reorder(std::vector &&Clusters, + std::map &BFs) { + std::vector FuncAddr(Cg.Nodes.size()); // Just for computing stats + uint64_t TotalSize = 0; + uint32_t Index = 0; + + // Set order of hot functions based on clusters. 
+ for (const auto& Cluster : Clusters) { + for (const auto FuncId : Cluster.Targets) { + assert(Cg.Nodes[FuncId].Samples > 0); + Cg.Funcs[FuncId]->setIndex(Index++); + FuncAddr[FuncId] = TotalSize; + TotalSize += Cg.Nodes[FuncId].Size; + } + } + + if (opts::ReorderFunctions == BinaryFunction::RT_NONE) + return; + + if (opts::Verbosity == 0) { +#ifndef NDEBUG + if (!DebugFlag || !isCurrentDebugType("hfsort")) + return; +#else + return; +#endif + } + + TotalSize = 0; + uint64_t CurPage = 0; + uint64_t Hotfuncs = 0; + double TotalDistance = 0; + double TotalCalls = 0; + double TotalCalls64B = 0; + double TotalCalls4KB = 0; + double TotalCalls2MB = 0; + dbgs() << "============== page 0 ==============\n"; + for (auto& Cluster : Clusters) { + dbgs() << + format("-------- density = %.3lf (%u / %u) --------\n", + (double) Cluster.Samples / Cluster.Size, + Cluster.Samples, Cluster.Size); + + for (auto FuncId : Cluster.Targets) { + if (Cg.Nodes[FuncId].Samples > 0) { + Hotfuncs++; + + dbgs() << "BOLT-INFO: hot func " << *Cg.Funcs[FuncId] + << " (" << Cg.Nodes[FuncId].Size << ")\n"; + + uint64_t Dist = 0; + uint64_t Calls = 0; + for (auto Dst : Cg.Nodes[FuncId].Succs) { + auto& A = *Cg.Arcs.find(Arc(FuncId, Dst)); + auto D = + std::abs(FuncAddr[A.Dst] - (FuncAddr[FuncId] + A.AvgCallOffset)); + auto W = A.Weight; + Calls += W; + if (D < 64) TotalCalls64B += W; + if (D < 4096) TotalCalls4KB += W; + if (D < (2 << 20)) TotalCalls2MB += W; + Dist += A.Weight * D; + dbgs() << format("arc: %u [@%lu+%.1lf] -> %u [@%lu]: " + "weight = %.0lf, callDist = %f\n", + A.Src, FuncAddr[A.Src], A.AvgCallOffset, + A.Dst, FuncAddr[A.Dst], A.Weight, D); + } + TotalCalls += Calls; + TotalDistance += Dist; + dbgs() << format("start = %6u : avgCallDist = %lu : %s\n", + TotalSize, + Calls ? 
Dist / Calls : 0, + Cg.Funcs[FuncId]->getPrintName().c_str()); + TotalSize += Cg.Nodes[FuncId].Size; + auto NewPage = TotalSize / HugePageSize; + if (NewPage != CurPage) { + CurPage = NewPage; + dbgs() << format("============== page %u ==============\n", CurPage); + } + } + } + } + dbgs() << format(" Number of hot functions: %u\n" + " Number of clusters: %lu\n", + Hotfuncs, Clusters.size()) + << format(" Final average call distance = %.1lf (%.0lf / %.0lf)\n", + TotalCalls ? TotalDistance / TotalCalls : 0, + TotalDistance, TotalCalls) + << format(" Total Calls = %.0lf\n", TotalCalls); + if (TotalCalls) { + dbgs() << format(" Total Calls within 64B = %.0lf (%.2lf%%)\n", + TotalCalls64B, 100 * TotalCalls64B / TotalCalls) + << format(" Total Calls within 4KB = %.0lf (%.2lf%%)\n", + TotalCalls4KB, 100 * TotalCalls4KB / TotalCalls) + << format(" Total Calls within 2MB = %.0lf (%.2lf%%)\n", + TotalCalls2MB, 100 * TotalCalls2MB / TotalCalls); + } +} + +namespace { + +std::vector readFunctionOrderFile() { + std::vector FunctionNames; + std::ifstream FuncsFile(opts::FunctionOrderFile, std::ios::in); + if (!FuncsFile) { + errs() << "Ordered functions file \"" << opts::FunctionOrderFile + << "\" can't be opened.\n"; + exit(1); + } + std::string FuncName; + while (std::getline(FuncsFile, FuncName)) { + FunctionNames.push_back(FuncName); + } + return FunctionNames; +} + +} + +void ReorderFunctions::runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) { + if (!opts::Relocs && opts::ReorderFunctions != BinaryFunction::RT_NONE) { + errs() << "BOLT-ERROR: Function reordering only works when " + << "relocs are enabled.\n"; + exit(1); + } + + if (opts::ReorderFunctions != BinaryFunction::RT_NONE && + opts::ReorderFunctions != BinaryFunction::RT_EXEC_COUNT && + opts::ReorderFunctions != BinaryFunction::RT_USER) { + Cg = buildCallGraph(BC, + BFs, + [this](const BinaryFunction &BF) { + return !shouldOptimize(BF) || !BF.hasProfile(); + }, + false, // 
IncludeColdCalls + opts::ReorderFunctionsUseHotSize, + opts::UseEdgeCounts); + normalizeArcWeights(); + } + + std::vector Clusters; + + switch(opts::ReorderFunctions) { + case BinaryFunction::RT_NONE: + break; + case BinaryFunction::RT_EXEC_COUNT: + { + std::vector SortedFunctions(BFs.size()); + uint32_t Index = 0; + std::transform(BFs.begin(), + BFs.end(), + SortedFunctions.begin(), + [](std::pair &BFI) { + return &BFI.second; + }); + std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(), + [&](const BinaryFunction *A, const BinaryFunction *B) { + if (!opts::shouldProcess(*A)) + return false; + const auto PadA = opts::padFunction(*A); + const auto PadB = opts::padFunction(*B); + if (!PadA || !PadB) { + if (PadA) + return true; + if (PadB) + return false; + } + return !A->hasProfile() && + (B->hasProfile() || + (A->getExecutionCount() > B->getExecutionCount())); + }); + for (auto *BF : SortedFunctions) { + if (BF->hasProfile()) + BF->setIndex(Index++); + } + } + break; + case BinaryFunction::RT_HFSORT: + Clusters = clusterize(Cg); + break; + case BinaryFunction::RT_HFSORT_PLUS: + Clusters = hfsortPlus(Cg); + break; + case BinaryFunction::RT_PETTIS_HANSEN: + Clusters = pettisAndHansen(Cg); + break; + case BinaryFunction::RT_RANDOM: + std::srand(opts::RandomSeed); + Clusters = randomClusters(Cg); + break; + case BinaryFunction::RT_USER: + { + uint32_t Index = 0; + for (const auto &Function : readFunctionOrderFile()) { + std::vector FuncAddrs; + + auto Itr = BC.GlobalSymbols.find(Function); + if (Itr == BC.GlobalSymbols.end()) { + uint32_t LocalID = 1; + while(1) { + // If we can't find the main symbol name, look for alternates. 
+ Itr = BC.GlobalSymbols.find(Function + "/" + std::to_string(LocalID)); + if (Itr != BC.GlobalSymbols.end()) + FuncAddrs.push_back(Itr->second); + else + break; + LocalID++; + } + } else { + FuncAddrs.push_back(Itr->second); + } + + if (FuncAddrs.empty()) { + errs() << "BOLT-WARNING: Reorder functions: can't find function for " + << Function << ".\n"; + continue; + } + + for (const auto FuncAddr : FuncAddrs) { + const auto *FuncSym = BC.getOrCreateGlobalSymbol(FuncAddr, "FUNCat"); + assert(FuncSym); + + auto *BF = BC.getFunctionForSymbol(FuncSym); + if (!BF) { + errs() << "BOLT-WARNING: Reorder functions: can't find function for " + << Function << ".\n"; + break; + } + if (!BF->hasValidIndex()) { + BF->setIndex(Index++); + } else if (opts::Verbosity > 0) { + errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function << ".\n"; + } + } + } + } + break; + } + + reorder(std::move(Clusters), BFs); + + if (!opts::GenerateFunctionOrderFile.empty()) { + std::ofstream FuncsFile(opts::GenerateFunctionOrderFile, std::ios::out); + if (!FuncsFile) { + errs() << "Ordered functions file \"" << opts::GenerateFunctionOrderFile + << "\" can't be opened.\n"; + exit(1); + } + + std::vector SortedFunctions(BFs.size()); + + std::transform(BFs.begin(), + BFs.end(), + SortedFunctions.begin(), + [](std::pair &BFI) { + return &BFI.second; + }); + + // Sort functions by index. 
+ std::stable_sort( + SortedFunctions.begin(), + SortedFunctions.end(), + [](const BinaryFunction *A, const BinaryFunction *B) { + if (A->hasValidIndex() && B->hasValidIndex()) { + return A->getIndex() < B->getIndex(); + } else if (A->hasValidIndex() && !B->hasValidIndex()) { + return true; + } else if (!A->hasValidIndex() && B->hasValidIndex()) { + return false; + } else { + return A->getAddress() < B->getAddress(); + } + }); + + for (const auto *Func : SortedFunctions) { + if (!Func->hasValidIndex()) + break; + FuncsFile << Func->getSymbol()->getName().data() << "\n"; + } + FuncsFile.close(); + + outs() << "BOLT-INFO: dumped function order to \"" + << opts::GenerateFunctionOrderFile << "\"\n"; + + exit(0); + } +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/Passes/ReorderFunctions.h b/bolt/Passes/ReorderFunctions.h new file mode 100644 index 000000000000..d90bdaabfb5b --- /dev/null +++ b/bolt/Passes/ReorderFunctions.h @@ -0,0 +1,43 @@ +//===--- ReorderFunctions.h - Function reordering pass --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_FNCTIONS_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_FNCTIONS_H + +#include "BinaryPasses.h" +#include "HFSort.h" + +namespace llvm { +namespace bolt { + +/// Modify function order for streaming based on hotness. 
+class ReorderFunctions : public BinaryFunctionPass { + CallGraph Cg; + + void normalizeArcWeights(); + void reorder(std::vector &&Clusters, + std::map &BFs); + public: + explicit ReorderFunctions(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { } + + const char *getName() const override { + return "reorder-functions"; + } + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + +} // namespace bolt +} // namespace llvm + +#endif From 17ff810755e8ca748a962ab52ed774e750dd3b07 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 1 Jun 2017 12:30:52 -0700 Subject: [PATCH 264/904] [BOLT] Do not assert on an empty location list. Summary: Clang generates an empty debug location list, which doesn't make sense, but we probably shouldn't assert on it and instead issue a warning in verbosity mode. There is only a single empty location list in the whole llvm binary. (cherry picked from commit 148dba65f4698a4d90cc53586d57b5e1dd0d0023) --- bolt/DWARFRewriter.cpp | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp index 7d6f583985c2..b94c2c63b987 100644 --- a/bolt/DWARFRewriter.cpp +++ b/bolt/DWARFRewriter.cpp @@ -166,20 +166,22 @@ void RewriteInstance::updateUnitDebugInfo( Value.getAsSectionOffset().getValue(); Unit->getContext().getOneDebugLocList(LL); - assert(!LL.Entries.empty() && "location list cannot be empty"); - - const auto OutputLL = Function - ->translateInputToOutputLocationList(LL, Unit->getBaseAddress()); - DEBUG( - if (OutputLL.Entries.empty()) { + if (LL.Entries.empty()) { + errs() << "BOLT-WARNING: empty location list detected at 0x" + << Twine::utohexstr(LL.Offset) << " for DIE at 0x" + << Twine::utohexstr(DIE->getOffset()) << " in CU at 0x" + << Twine::utohexstr(Unit->getOffset()) << '\n'; + } else { + const auto OutputLL = Function-> + translateInputToOutputLocationList(LL, Unit->getBaseAddress()); + 
DEBUG(if (OutputLL.Entries.empty()) { dbgs() << "BOLT-DEBUG: location list translated to an empty " "one at 0x" << Twine::utohexstr(DIE->getOffset()) << " in CU at 0x" << Twine::utohexstr(Unit->getOffset()) << '\n'; - } - ); - - LocListSectionOffset = LocationListWriter->addList(OutputLL); + }); + LocListSectionOffset = LocationListWriter->addList(OutputLL); + } } auto DebugInfoPatcher = @@ -416,7 +418,7 @@ void RewriteInstance::updateLineTableOffsets() { continue; auto *CU = BC->DwCtx->getCompileUnitForOffset(CUOffset); - assert(CU && "expected non-null CU"); + assert(CU && "no CU found at offset"); auto LTOffset = BC->DwCtx->getAttrFieldOffsetForUnit(CU, dwarf::DW_AT_stmt_list); if (!LTOffset) From b0d2c2f1b0893fe72ac0cbc81ee72e38ba3a138b Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 26 May 2017 15:46:46 -0700 Subject: [PATCH 265/904] [BOLT] More CG refactoring Summary: Do some additional refactoring of the CallGraph class. Add a BinaryFunctionCallGraph class that has the BOLT specific bits. This is in preparation to moving the generic CallGraph class into a library that both BOLT and HHVM can use. Make data members of CallGraph private and add the appropriate accessor methods. 
(cherry picked from commit ea00aaa13f8b0434cb7d808dd83c45fd0904f51c) --- bolt/Passes/BinaryFunctionCallGraph.cpp | 195 ++++++++++++++++++++++++ bolt/Passes/BinaryFunctionCallGraph.h | 80 ++++++++++ bolt/Passes/CMakeLists.txt | 1 + bolt/Passes/CallGraph.cpp | 191 +++-------------------- bolt/Passes/CallGraph.h | 139 ++++++++++++----- bolt/Passes/FrameAnalysis.cpp | 4 +- bolt/Passes/FrameAnalysis.h | 4 +- bolt/Passes/FrameOptimizer.cpp | 4 +- bolt/Passes/FrameOptimizer.h | 4 +- bolt/Passes/HFSort.cpp | 111 ++++++++------ bolt/Passes/HFSort.h | 24 ++- bolt/Passes/HFSortPlus.cpp | 131 +++++++--------- bolt/Passes/PettisAndHansen.cpp | 68 ++++----- bolt/Passes/ReorderFunctions.cpp | 130 ++++++++-------- bolt/Passes/ReorderFunctions.h | 5 +- bolt/RewriteInstance.cpp | 4 +- 16 files changed, 656 insertions(+), 439 deletions(-) create mode 100644 bolt/Passes/BinaryFunctionCallGraph.cpp create mode 100644 bolt/Passes/BinaryFunctionCallGraph.h diff --git a/bolt/Passes/BinaryFunctionCallGraph.cpp b/bolt/Passes/BinaryFunctionCallGraph.cpp new file mode 100644 index 000000000000..16ea4bc376dc --- /dev/null +++ b/bolt/Passes/BinaryFunctionCallGraph.cpp @@ -0,0 +1,195 @@ +//===--- Passes/BinaryFunctionCallGraph.cpp -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "BinaryFunctionCallGraph.h" +#include "BinaryFunction.h" +#include "BinaryContext.h" + +#define DEBUG_TYPE "callgraph" + +namespace llvm { +namespace bolt { + +CallGraph::NodeId BinaryFunctionCallGraph::addNode(BinaryFunction *BF, + uint32_t Size, + uint64_t Samples) { + auto Id = CallGraph::addNode(Size, Samples); + assert(size_t(Id) == Funcs.size()); + Funcs.push_back(BF); + FuncToNodeId[BF] = Id; + assert(Funcs[Id] == BF); + return Id; +} + +std::deque BinaryFunctionCallGraph::buildTraversalOrder() { + std::deque TopologicalOrder; + enum NodeStatus { NEW, VISITING, VISITED }; + std::vector NodeStatus(Funcs.size()); + std::stack Worklist; + + for (auto *Func : Funcs) { + const auto Id = FuncToNodeId.at(Func); + Worklist.push(Id); + NodeStatus[Id] = NEW; + } + + while (!Worklist.empty()) { + const auto FuncId = Worklist.top(); + Worklist.pop(); + + if (NodeStatus[FuncId] == VISITED) + continue; + + if (NodeStatus[FuncId] == VISITING) { + TopologicalOrder.push_back(Funcs[FuncId]); + NodeStatus[FuncId] = VISITED; + continue; + } + + assert(NodeStatus[FuncId] == NEW); + NodeStatus[FuncId] = VISITING; + Worklist.push(FuncId); + for (const auto Callee : successors(FuncId)) { + if (NodeStatus[Callee] == VISITING || NodeStatus[Callee] == VISITED) + continue; + Worklist.push(Callee); + } + } + + return TopologicalOrder; +} + +BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, + std::map &BFs, + CgFilterFunction Filter, + bool IncludeColdCalls, + bool UseFunctionHotSize, + bool UseEdgeCounts) { + BinaryFunctionCallGraph Cg; + + // Add call graph nodes. + auto lookupNode = [&](BinaryFunction *Function) { + const auto Id = Cg.maybeGetNodeId(Function); + if (Id == CallGraph::InvalidId) { + // It's ok to use the hot size here when the function is split. 
This is + // because emitFunctions will emit the hot part first in the order that is + // computed by ReorderFunctions. The cold part will be emitted with the + // rest of the cold functions and code. + const auto Size = UseFunctionHotSize && Function->isSplit() + ? Function->estimateHotSize() + : Function->estimateSize(); + // NOTE: for functions without a profile, we set the number of samples + // to zero. This will keep these functions from appearing in the hot + // section. This is a little weird because we wouldn't be trying to + // create a node for a function unless it was the target of a call from + // a hot block. The alternative would be to set the count to one or + // accumulate the number of calls from the callsite into the function + // samples. Results from perfomance testing seem to favor the zero + // count though, so I'm leaving it this way for now. + const auto Samples = + Function->hasProfile() ? Function->getExecutionCount() : 0; + return Cg.addNode(Function, Size, Samples); + } else { + return Id; + } + }; + + // Add call graph edges. + uint64_t NotProcessed = 0; + uint64_t TotalCalls = 0; + for (auto &It : BFs) { + auto *Function = &It.second; + + if(Filter(*Function)) { + continue; + } + + auto BranchDataOrErr = BC.DR.getFuncBranchData(Function->getNames()); + const auto SrcId = lookupNode(Function); + uint64_t Offset = Function->getAddress(); + + auto recordCall = [&](const MCSymbol *DestSymbol, const uint64_t Count) { + if (auto *DstFunc = BC.getFunctionForSymbol(DestSymbol)) { + const auto DstId = lookupNode(DstFunc); + const auto AvgDelta = !UseEdgeCounts ? 
Offset - DstFunc->getAddress() : 0; + Cg.incArcWeight(SrcId, DstId, Count, AvgDelta); + DEBUG(dbgs() << "BOLT-DEBUG: buildCallGraph: call " << *Function + << " -> " << *DstFunc << " @ " << Offset << "\n"); + return true; + } + return false; + }; + + for (auto *BB : Function->layout()) { + // Don't count calls from cold blocks + if (BB->isCold() && !IncludeColdCalls) + continue; + + for (auto &Inst : *BB) { + // Find call instructions and extract target symbols from each one. + if (!BC.MIA->isCall(Inst)) + continue; + + ++TotalCalls; + if (const auto *DstSym = BC.MIA->getTargetSymbol(Inst)) { + // For direct calls, just use the BB execution count. + const auto Count = UseEdgeCounts && BB->hasProfile() + ? BB->getExecutionCount() : 1; + if (!recordCall(DstSym, Count)) + ++NotProcessed; + } else if (BC.MIA->hasAnnotation(Inst, "EdgeCountData")) { + // For indirect calls and jump tables, use branch data. + if (!BranchDataOrErr) { + ++NotProcessed; + continue; + } + const FuncBranchData &BranchData = BranchDataOrErr.get(); + const auto DataOffset = + BC.MIA->getAnnotationAs(Inst, "EdgeCountData"); + + for (const auto &BI : BranchData.getBranchRange(DataOffset)) { + // Count each target as a separate call. + ++TotalCalls; + + if (!BI.To.IsSymbol) { + ++NotProcessed; + continue; + } + + auto Itr = BC.GlobalSymbols.find(BI.To.Name); + if (Itr == BC.GlobalSymbols.end()) { + ++NotProcessed; + continue; + } + + const auto *DstSym = + BC.getOrCreateGlobalSymbol(Itr->second, "FUNCat"); + + if (!recordCall(DstSym, UseEdgeCounts ? 
BI.Branches : 1)) + ++NotProcessed; + } + } + + if (!UseEdgeCounts) { + Offset += BC.computeCodeSize(&Inst, &Inst + 1); + } + } + } + } + + outs() << "BOLT-WARNING: buildCallGraph: " << NotProcessed + << " callsites not processed out of " << TotalCalls << "\n"; + + return Cg; +} + +} +} diff --git a/bolt/Passes/BinaryFunctionCallGraph.h b/bolt/Passes/BinaryFunctionCallGraph.h new file mode 100644 index 000000000000..abb03f9a1d2b --- /dev/null +++ b/bolt/Passes/BinaryFunctionCallGraph.h @@ -0,0 +1,80 @@ +//===--- Passes/CallGraph.h -----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_BINARY_FUNCTION_CALLGRAPH_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_BINARY_FUNCTION_CALLGRAPH_H + +#include "CallGraph.h" + +#include +#include +#include +#include + +namespace llvm { +namespace bolt { + +class BinaryFunction; +class BinaryContext; + +class BinaryFunctionCallGraph : public CallGraph { +public: + NodeId maybeGetNodeId(const BinaryFunction *BF) const { + auto Itr = FuncToNodeId.find(BF); + return Itr != FuncToNodeId.end() ? Itr->second : InvalidId; + } + NodeId getNodeId(const BinaryFunction *BF) const { + auto Itr = FuncToNodeId.find(BF); + assert(Itr != FuncToNodeId.end()); + return Itr->second; + } + BinaryFunction *nodeIdToFunc(NodeId Id) { + assert(Id < Funcs.size()); + return Funcs[Id]; + } + const BinaryFunction *nodeIdToFunc(NodeId Id) const { + assert(Id < Funcs.size()); + return Funcs[Id]; + } + NodeId addNode(BinaryFunction *BF, uint32_t Size, uint64_t Samples = 0); + + /// Compute a DFS traversal of the call graph. 
+ std::deque buildTraversalOrder(); + +private: + std::unordered_map FuncToNodeId; + std::vector Funcs; +}; + +using CgFilterFunction = std::function; +inline bool NoFilter(const BinaryFunction &) { return false; } + +/// Builds a call graph from the map of BinaryFunctions provided in BFs. +/// The arguments control how the graph is constructed. +/// Filter is called on each function, any function that it returns true for +/// is omitted from the graph. +/// If IncludeColdCalls is true, then calls from cold BBs are considered for the +/// graph, otherwise they are ignored. +/// UseFunctionHotSize controls whether the hot size of a function is used when +/// filling in the Size attribute of new Nodes. +/// UseEdgeCounts is used to control if the AvgCallOffset attribute on Arcs is +/// computed using the offsets of call instructions. +BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, + std::map &BFs, + CgFilterFunction Filter = NoFilter, + bool IncludeColdCalls = true, + bool UseFunctionHotSize = false, + bool UseEdgeCounts = false); + +} +} + +#endif diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt index 82c2a788e059..3fc9336f29c6 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/Passes/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_library(LLVMBOLTPasses BinaryPasses.cpp + BinaryFunctionCallGraph.cpp CallGraph.cpp DataflowAnalysis.cpp DataflowInfoManager.cpp diff --git a/bolt/Passes/CallGraph.cpp b/bolt/Passes/CallGraph.cpp index 50757270c547..14def27b6adf 100644 --- a/bolt/Passes/CallGraph.cpp +++ b/bolt/Passes/CallGraph.cpp @@ -67,195 +67,52 @@ namespace bolt { int64_t CallGraph::Arc::Hash::operator()(const Arc &Arc) const { #ifdef USE_STD_HASH std::hash Hasher; - return hashCombine(Hasher(Arc.Src), Arc.Dst); + return hashCombine(Hasher(Arc.src()), Arc.dst()); #else - return hash_int64_pair(int64_t(Arc.Src), int64_t(Arc.Dst)); + return hash_int64_pair(int64_t(Arc.src()), int64_t(Arc.dst())); #endif } -CallGraph buildCallGraph(BinaryContext 
&BC, - std::map &BFs, - std::function Filter, - bool IncludeColdCalls, - bool UseFunctionHotSize, - bool UseEdgeCounts) { - CallGraph Cg; - - // Add call graph nodes. - auto lookupNode = [&](BinaryFunction *Function) { - auto It = Cg.FuncToNodeId.find(Function); - if (It == Cg.FuncToNodeId.end()) { - // It's ok to use the hot size here when the function is split. This is - // because emitFunctions will emit the hot part first in the order that is - // computed by ReorderFunctions. The cold part will be emitted with the - // rest of the cold functions and code. - const auto Size = UseFunctionHotSize && Function->isSplit() - ? Function->estimateHotSize() - : Function->estimateSize(); - const auto Id = Cg.addNode(Size); - assert(size_t(Id) == Cg.Funcs.size()); - Cg.Funcs.push_back(Function); - Cg.FuncToNodeId[Function] = Id; - // NOTE: for functions without a profile, we set the number of samples - // to zero. This will keep these functions from appearing in the hot - // section. This is a little weird because we wouldn't be trying to - // create a node for a function unless it was the target of a call from - // a hot block. The alternative would be to set the count to one or - // accumulate the number of calls from the callsite into the function - // samples. Results from perfomance testing seem to favor the zero - // count though, so I'm leaving it this way for now. - Cg.Nodes[Id].Samples = Function->hasProfile() ? Function->getExecutionCount() : 0; - assert(Cg.Funcs[Id] == Function); - return Id; - } else { - return It->second; - } - }; - - // Add call graph edges. 
- uint64_t NotProcessed = 0; - uint64_t TotalCalls = 0; - for (auto &It : BFs) { - auto *Function = &It.second; - - if(Filter(*Function)) { - continue; - } - - auto BranchDataOrErr = BC.DR.getFuncBranchData(Function->getNames()); - const auto SrcId = lookupNode(Function); - uint64_t Offset = Function->getAddress(); - - auto recordCall = [&](const MCSymbol *DestSymbol, const uint64_t Count) { - if (auto *DstFunc = BC.getFunctionForSymbol(DestSymbol)) { - const auto DstId = lookupNode(DstFunc); - auto &A = Cg.incArcWeight(SrcId, DstId, Count); - if (!UseEdgeCounts) { - A.AvgCallOffset += (Offset - DstFunc->getAddress()); - } - DEBUG(dbgs() << "BOLT-DEBUG: buildCallGraph: call " << *Function - << " -> " << *DstFunc << " @ " << Offset << "\n"); - return true; - } - return false; - }; - - for (auto *BB : Function->layout()) { - // Don't count calls from cold blocks - if (BB->isCold() && !IncludeColdCalls) - continue; - - for (auto &Inst : *BB) { - // Find call instructions and extract target symbols from each one. - if (!BC.MIA->isCall(Inst)) - continue; - - ++TotalCalls; - if (const auto *DstSym = BC.MIA->getTargetSymbol(Inst)) { - // For direct calls, just use the BB execution count. - const auto Count = UseEdgeCounts && BB->hasProfile() - ? BB->getExecutionCount() : 1; - if (!recordCall(DstSym, Count)) - ++NotProcessed; - } else if (BC.MIA->hasAnnotation(Inst, "EdgeCountData")) { - // For indirect calls and jump tables, use branch data. - if(!BranchDataOrErr) { - ++NotProcessed; - continue; - } - const FuncBranchData &BranchData = BranchDataOrErr.get(); - const auto DataOffset = - BC.MIA->getAnnotationAs(Inst, "EdgeCountData"); - - for (const auto &BI : BranchData.getBranchRange(DataOffset)) { - // Count each target as a separate call. 
- ++TotalCalls; - - if (!BI.To.IsSymbol) { - ++NotProcessed; - continue; - } - - auto Itr = BC.GlobalSymbols.find(BI.To.Name); - if (Itr == BC.GlobalSymbols.end()) { - ++NotProcessed; - continue; - } - - const auto *DstSym = - BC.getOrCreateGlobalSymbol(Itr->second, "FUNCat"); - - if (!recordCall(DstSym, UseEdgeCounts ? BI.Branches : 1)) - ++NotProcessed; - } - } - - if (!UseEdgeCounts) { - Offset += BC.computeCodeSize(&Inst, &Inst + 1); - } - } - } - } - - outs() << "BOLT-WARNING: buildCallGraph: " << NotProcessed - << " callsites not processed out of " << TotalCalls << "\n"; - - return Cg; -} - -CallGraph::NodeId CallGraph::addNode(uint32_t Size, uint32_t Samples) { +CallGraph::NodeId CallGraph::addNode(uint32_t Size, uint64_t Samples) { auto Id = Nodes.size(); Nodes.emplace_back(Size, Samples); return Id; } -const CallGraph::Arc &CallGraph::incArcWeight(NodeId Src, NodeId Dst, double W) { +const CallGraph::Arc &CallGraph::incArcWeight(NodeId Src, NodeId Dst, double W, + double Offset) { auto Res = Arcs.emplace(Src, Dst, W); if (!Res.second) { Res.first->Weight += W; return *Res.first; } + Res.first->AvgCallOffset += Offset; Nodes[Src].Succs.push_back(Dst); Nodes[Dst].Preds.push_back(Src); return *Res.first; } -std::deque CallGraph::buildTraversalOrder() { - std::deque TopologicalOrder; - enum NodeStatus { NEW, VISITING, VISITED }; - std::vector NodeStatus(Funcs.size()); - std::stack Worklist; - - for (auto *Func : Funcs) { - const auto Id = FuncToNodeId.at(Func); - Worklist.push(Id); - NodeStatus[Id] = NEW; - } - - while (!Worklist.empty()) { - const auto FuncId = Worklist.top(); - Worklist.pop(); - - if (NodeStatus[FuncId] == VISITED) - continue; - - if (NodeStatus[FuncId] == VISITING) { - TopologicalOrder.push_back(Funcs[FuncId]); - NodeStatus[FuncId] = VISITED; - continue; +void CallGraph::normalizeArcWeights(bool UseEdgeCounts) { + // Normalize arc weights. 
+ if (!UseEdgeCounts) { + for (NodeId FuncId = 0; FuncId < numNodes(); ++FuncId) { + auto& Func = getNode(FuncId); + for (auto Caller : Func.predecessors()) { + auto Arc = findArc(Caller, FuncId); + Arc->NormalizedWeight = Arc->weight() / Func.samples(); + Arc->AvgCallOffset /= Arc->weight(); + assert(Arc->AvgCallOffset < size(Caller)); + } } - - assert(NodeStatus[FuncId] == NEW); - NodeStatus[FuncId] = VISITING; - Worklist.push(FuncId); - for (const auto Callee : Nodes[FuncId].Succs) { - if (NodeStatus[Callee] == VISITING || NodeStatus[Callee] == VISITED) - continue; - Worklist.push(Callee); + } else { + for (NodeId FuncId = 0; FuncId < numNodes(); ++FuncId) { + auto &Func = getNode(FuncId); + for (auto Caller : Func.predecessors()) { + auto Arc = findArc(Caller, FuncId); + Arc->NormalizedWeight = Arc->weight() / Func.samples(); + } } } - - return TopologicalOrder; } } diff --git a/bolt/Passes/CallGraph.h b/bolt/Passes/CallGraph.h index df984bc2b7c2..64960bf3d76d 100644 --- a/bolt/Passes/CallGraph.h +++ b/bolt/Passes/CallGraph.h @@ -12,20 +12,14 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_CALLGRAPH_H #define LLVM_TOOLS_LLVM_BOLT_PASSES_CALLGRAPH_H +#include #include #include -#include #include -#include -#include -#include namespace llvm { namespace bolt { -class BinaryFunction; -class BinaryContext; - // TODO: find better place for this inline int64_t hashCombine(const int64_t Seed, const int64_t Val) { std::hash Hasher; @@ -55,6 +49,14 @@ class CallGraph { return Lhs.Src == Rhs.Src && Lhs.Dst == Rhs.Dst; } + NodeId src() const { return Src; } + NodeId dst() const { return Dst; } + double weight() const { return Weight; } + double avgCallOffset() const { return AvgCallOffset; } + double normalizedWeight() const { return NormalizedWeight; } + + private: + friend class CallGraph; const NodeId Src; const NodeId Dst; mutable double Weight; @@ -62,50 +64,115 @@ class CallGraph { mutable double AvgCallOffset{0}; }; + using ArcsType = std::unordered_set; + using ArcIterator 
= ArcsType::iterator; + using ArcConstIterator = ArcsType::const_iterator; + class Node { public: - explicit Node(uint32_t Size, uint32_t Samples = 0) + explicit Node(uint32_t Size, uint64_t Samples = 0) : Size(Size), Samples(Samples) {} + uint32_t size() const { return Size; } + uint64_t samples() const { return Samples; } + + const std::vector &successors() const { + return Succs; + } + const std::vector &predecessors() const { + return Preds; + } + + private: + friend class CallGraph; uint32_t Size; - uint32_t Samples; + uint64_t Samples; // preds and succs contain no duplicate elements and self arcs are not allowed std::vector Preds; std::vector Succs; }; - NodeId addNode(uint32_t Size, uint32_t Samples = 0); - const Arc &incArcWeight(NodeId Src, NodeId Dst, double W = 1.0); - - /// Compute a DFS traversal of the call graph. - std::deque buildTraversalOrder(); - + size_t numNodes() const { + return Nodes.size(); + } + const Node &getNode(const NodeId Id) const { + assert(Id < Nodes.size()); + return Nodes[Id]; + } + uint32_t size(const NodeId Id) const { + assert(Id < Nodes.size()); + return Nodes[Id].Size; + } + uint64_t samples(const NodeId Id) const { + assert(Id < Nodes.size()); + return Nodes[Id].Samples; + } + const std::vector &successors(const NodeId Id) const { + assert(Id < Nodes.size()); + return Nodes[Id].Succs; + } + const std::vector &predecessors(const NodeId Id) const { + assert(Id < Nodes.size()); + return Nodes[Id].Preds; + } + NodeId addNode(uint32_t Size, uint64_t Samples = 0); + const Arc &incArcWeight(NodeId Src, NodeId Dst, double W = 1.0, + double Offset = 0.0); + ArcIterator findArc(NodeId Src, NodeId Dst) { + return Arcs.find(Arc(Src, Dst)); + } + ArcConstIterator findArc(NodeId Src, NodeId Dst) const { + return Arcs.find(Arc(Src, Dst)); + } + const ArcsType &getArcs() const { + return Arcs; + } + + void normalizeArcWeights(bool UseEdgeCounts); + + template + void printDot(char* fileName, L getLabel) const; +private: std::vector Nodes; 
- std::unordered_set Arcs; - std::vector Funcs; - std::unordered_map FuncToNodeId; + ArcsType Arcs; }; -inline bool NoFilter(const BinaryFunction &) { return false; } - -/// Builds a call graph from the map of BinaryFunctions provided in BFs. -/// The arguments control how the graph is constructed. -/// Filter is called on each function, any function that it returns true for -/// is omitted from the graph. -/// If IncludeColdCalls is true, then calls from cold BBs are considered for the -/// graph, otherwise they are ignored. -/// UseFunctionHotSize controls whether the hot size of a function is used when -/// filling in the Size attribute of new Nodes. -/// UseEdgeCounts is used to control if the AvgCallOffset attribute on Arcs is -/// computed using the offsets of call instructions. -CallGraph buildCallGraph(BinaryContext &BC, - std::map &BFs, - std::function Filter = NoFilter, - bool IncludeColdCalls = true, - bool UseFunctionHotSize = false, - bool UseEdgeCounts = false); +template +void CallGraph::printDot(char* FileName, L GetLabel) const { + FILE* File = fopen(FileName, "wt"); + if (!File) return; + + fprintf(File, "digraph g {\n"); + for (NodeId F = 0; F < Nodes.size(); F++) { + if (Nodes[F].samples() == 0) continue; + fprintf( + File, + "f%lu [label=\"%s\\nsamples=%u\\nsize=%u\"];\n", + F, + GetLabel(F), + Nodes[F].samples(), + Nodes[F].size()); + } + for (NodeId F = 0; F < Nodes.size(); F++) { + if (Nodes[F].samples() == 0) continue; + for (auto Dst : Nodes[F].successors()) { + auto Arc = findArc(F, Dst); + fprintf( + File, + "f%lu -> f%u [label=\"normWgt=%.3lf,weight=%.0lf,callOffset=%.1lf\"];" + "\n", + F, + Dst, + Arc->normalizedWeight(), + Arc->weight(), + Arc->avgCallOffset()); + } + } + fprintf(File, "}\n"); + fclose(File); +} } // namespace bolt } // namespace llvm diff --git a/bolt/Passes/FrameAnalysis.cpp b/bolt/Passes/FrameAnalysis.cpp index e3132bb6ea69..7c0110926381 100644 --- a/bolt/Passes/FrameAnalysis.cpp +++ b/bolt/Passes/FrameAnalysis.cpp 
@@ -347,8 +347,8 @@ void FrameAnalysis::buildClobberMap(const BinaryContext &BC) { } if (RegsKilledMap[Func] != RegsKilled || Updated) { - for (auto Caller : Cg.Nodes[Cg.FuncToNodeId.at(Func)].Preds) { - Queue.push(Cg.Funcs[Caller]); + for (auto Caller : Cg.predecessors(Cg.getNodeId(Func))) { + Queue.push(Cg.nodeIdToFunc(Caller)); } } RegsKilledMap[Func] = std::move(RegsKilled); diff --git a/bolt/Passes/FrameAnalysis.h b/bolt/Passes/FrameAnalysis.h index d9f6f939f977..b182d84bcb78 100644 --- a/bolt/Passes/FrameAnalysis.h +++ b/bolt/Passes/FrameAnalysis.h @@ -13,7 +13,7 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEANALYSIS_H #include "BinaryPasses.h" -#include "CallGraph.h" +#include "BinaryFunctionCallGraph.h" #include "StackPointerTracking.h" namespace llvm { @@ -113,7 +113,7 @@ raw_ostream &operator<<(raw_ostream &OS, /// class FrameAnalysis : public BinaryFunctionPass { /// Call graph info - CallGraph Cg; + BinaryFunctionCallGraph Cg; /// DFS or reverse post-ordering of the call graph nodes to allow us to /// traverse the call graph bottom-up diff --git a/bolt/Passes/FrameOptimizer.cpp b/bolt/Passes/FrameOptimizer.cpp index 975bac260c45..7f02b840ba64 100644 --- a/bolt/Passes/FrameOptimizer.cpp +++ b/bolt/Passes/FrameOptimizer.cpp @@ -96,8 +96,8 @@ void FrameOptimizerPass::buildClobberMap(const BinaryContext &BC) { } if (RegsKilledMap[Func] != RegsKilled) { - for (auto Caller : Cg.Nodes[Cg.FuncToNodeId.at(Func)].Preds) { - Queue.push(Cg.Funcs[Caller]); + for (auto Caller : Cg.predecessors(Cg.getNodeId(Func))) { + Queue.push(Cg.nodeIdToFunc(Caller)); } } RegsKilledMap[Func] = std::move(RegsKilled); diff --git a/bolt/Passes/FrameOptimizer.h b/bolt/Passes/FrameOptimizer.h index 6f6fdc1b680c..e3423ad19f42 100644 --- a/bolt/Passes/FrameOptimizer.h +++ b/bolt/Passes/FrameOptimizer.h @@ -13,7 +13,7 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEOPTIMIZER_H #include "BinaryPasses.h" -#include "CallGraph.h" +#include "BinaryFunctionCallGraph.h" namespace llvm { namespace bolt 
{ @@ -76,7 +76,7 @@ class FrameOptimizerPass : public BinaryFunctionPass { uint64_t CountFunctionsAllClobber{0}; /// Call graph info - CallGraph Cg; + BinaryFunctionCallGraph Cg; /// DFS or reverse post-ordering of the call graph nodes to allow us to /// traverse the call graph bottom-up diff --git a/bolt/Passes/HFSort.cpp b/bolt/Passes/HFSort.cpp index 7f40ba8a3320..dd2a364d2ff7 100644 --- a/bolt/Passes/HFSort.cpp +++ b/bolt/Passes/HFSort.cpp @@ -30,13 +30,17 @@ #include "HFSort.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Format.h" +#include "llvm/Support/Options.h" #include "llvm/Support/raw_ostream.h" -#include -#include +#include #undef DEBUG_TYPE #define DEBUG_TYPE "hfsort" +namespace opts { +extern llvm::cl::opt Verbosity; +} + namespace llvm { namespace bolt { @@ -65,10 +69,10 @@ constexpr int CallerDegradeFactor = 8; Cluster::Cluster(NodeId Id, const Node &Func) { Targets.push_back(Id); - Size = Func.Size; - Samples = Func.Samples; + Size = Func.size(); + Samples = Func.samples(); + Density = (double)Samples / Size; Frozen = false; - DEBUG(dbgs() << "new Cluster: " << toString() << "\n"); } std::string Cluster::toString() const { @@ -91,25 +95,31 @@ void freezeClusters(const CallGraph &Cg, std::vector &Clusters) { uint32_t TotalSize = 0; std::sort(Clusters.begin(), Clusters.end(), compareClustersDensity); for (auto &C : Clusters) { - uint32_t NewSize = TotalSize + C.Size; + uint32_t NewSize = TotalSize + C.size(); if (NewSize > FrozenPages * HugePageSize) break; - C.Frozen = true; + C.freeze(); TotalSize = NewSize; - auto Fid = C.Targets[0]; - DEBUG(dbgs() << - format("freezing cluster for func %d, size = %u, samples = %u)\n", - Fid, Cg.Nodes[Fid].Size, Cg.Nodes[Fid].Samples);); + DEBUG( + auto Fid = C.target(0); + dbgs() << + format("freezing cluster for func %d, size = %u, samples = %lu)\n", + Fid, Cg.size(Fid), Cg.samples(Fid));); } } } +void Cluster::reverseTargets() { + std::reverse(Targets.begin(), Targets.end()); +} + void 
Cluster::merge(Cluster&& Other, const double Aw) { Targets.insert(Targets.end(), Other.Targets.begin(), Other.Targets.end()); Size += Other.Size; Samples += Other.Samples; + Density = (double)Samples / Size; Other.Size = 0; Other.Samples = 0; @@ -120,13 +130,13 @@ std::vector clusterize(const CallGraph &Cg) { std::vector SortedFuncs; // indexed by NodeId, keeps it's current cluster - std::vector FuncCluster(Cg.Nodes.size(), nullptr); + std::vector FuncCluster(Cg.numNodes(), nullptr); std::vector Clusters; - Clusters.reserve(Cg.Nodes.size()); + Clusters.reserve(Cg.numNodes()); - for (NodeId F = 0; F < Cg.Nodes.size(); F++) { - if (Cg.Nodes[F].Samples == 0) continue; - Clusters.emplace_back(F, Cg.Nodes[F]); + for (NodeId F = 0; F < Cg.numNodes(); F++) { + if (Cg.samples(F) == 0) continue; + Clusters.emplace_back(F, Cg.getNode(F)); SortedFuncs.push_back(F); } @@ -135,18 +145,18 @@ std::vector clusterize(const CallGraph &Cg) { // The size and order of Clusters is fixed until we reshuffle it immediately // before returning. for (auto &Cluster : Clusters) { - FuncCluster[Cluster.Targets.front()] = &Cluster; + FuncCluster[Cluster.targets().front()] = &Cluster; } std::sort( SortedFuncs.begin(), SortedFuncs.end(), [&] (const NodeId F1, const NodeId F2) { - const auto &Func1 = Cg.Nodes[F1]; - const auto &Func2 = Cg.Nodes[F2]; + const auto &Func1 = Cg.getNode(F1); + const auto &Func2 = Cg.getNode(F2); return - (uint64_t)Func1.Samples * Func2.Size > // TODO: is this correct? - (uint64_t)Func2.Samples * Func1.Size; + Func1.samples() * Func2.size() > // TODO: is this correct? + Func2.samples() * Func1.size(); } ); @@ -154,17 +164,17 @@ std::vector clusterize(const CallGraph &Cg) { // one containing its most likely predecessor. for (const auto Fid : SortedFuncs) { auto Cluster = FuncCluster[Fid]; - if (Cluster->Frozen) continue; + if (Cluster->frozen()) continue; // Find best predecessor. 
NodeId BestPred = CallGraph::InvalidId; double BestProb = 0; - for (const auto Src : Cg.Nodes[Fid].Preds) { - auto &A = *Cg.Arcs.find(Arc(Src, Fid)); - if (BestPred == CallGraph::InvalidId || A.NormalizedWeight > BestProb) { - BestPred = A.Src; - BestProb = A.NormalizedWeight; + for (const auto Src : Cg.predecessors(Fid)) { + const auto &Arc = *Cg.findArc(Src, Fid); + if (BestPred == CallGraph::InvalidId || Arc.normalizedWeight() > BestProb) { + BestPred = Arc.src(); + BestProb = Arc.normalizedWeight(); } } @@ -180,29 +190,32 @@ std::vector clusterize(const CallGraph &Cg) { // Skip if no predCluster (predecessor w/ no samples), or if same // as cluster, of it's frozen. if (PredCluster == nullptr || PredCluster == Cluster || - PredCluster->Frozen) { + PredCluster->frozen()) { continue; } // Skip if merged cluster would be bigger than the threshold. - if (Cluster->Size + PredCluster->Size > MaxClusterSize) continue; + if (Cluster->size() + PredCluster->size() > MaxClusterSize) continue; // Check if the merge is good for the caller. // Don't merge if the caller's density is significantly better // than the density resulting from the merge. 
const double NewDensity = - ((double)PredCluster->Samples + Cluster->Samples) / - (PredCluster->Size + Cluster->Size); + ((double)PredCluster->samples() + Cluster->samples()) / + (PredCluster->size() + Cluster->size()); if (PredCluster->density() > NewDensity * CallerDegradeFactor) { continue; } - DEBUG(dbgs() << format("merging %s -> %s: %u\n", - PredCluster->toString().c_str(), - Cluster->toString().c_str(), - Cg.Nodes[Fid].Samples);); + DEBUG( + if (opts::Verbosity > 1) { + dbgs() << format("merging %s -> %s: %u\n", + PredCluster->toString().c_str(), + Cluster->toString().c_str(), + Cg.samples(Fid)); + }); - for (auto F : Cluster->Targets) { + for (auto F : Cluster->targets()) { FuncCluster[F] = PredCluster; } @@ -212,12 +225,16 @@ std::vector clusterize(const CallGraph &Cg) { // Return the set of Clusters that are left, which are the ones that // didn't get merged (so their first func is its original func). std::vector SortedClusters; + std::unordered_set Visited; for (const auto Func : SortedFuncs) { auto Cluster = FuncCluster[Func]; - if (!Cluster || Cluster->Targets.empty()) continue; - if (Cluster->Targets[0] != Func) continue; + if (!Cluster || + Visited.count(Cluster) == 1 || + Cluster->target(0) != Func) { + continue; + } SortedClusters.emplace_back(std::move(*Cluster)); - Cluster->Targets.clear(); + Visited.insert(Cluster); } std::sort(SortedClusters.begin(), @@ -228,32 +245,32 @@ std::vector clusterize(const CallGraph &Cg) { } std::vector randomClusters(const CallGraph &Cg) { - std::vector FuncIds(Cg.Nodes.size(), 0); + std::vector FuncIds(Cg.numNodes(), 0); std::vector Clusters; - Clusters.reserve(Cg.Nodes.size()); + Clusters.reserve(Cg.numNodes()); - for (NodeId F = 0; F < Cg.Nodes.size(); F++) { - if (Cg.Nodes[F].Samples == 0) continue; - Clusters.emplace_back(F, Cg.Nodes[F]); + for (NodeId F = 0; F < Cg.numNodes(); F++) { + if (Cg.samples(F) == 0) continue; + Clusters.emplace_back(F, Cg.getNode(F)); } std::sort(Clusters.begin(), Clusters.end(), 
[](const Cluster &A, const Cluster &B) { - return A.Size < B.Size; + return A.size() < B.size(); }); auto pickMergeCluster = [&Clusters](const size_t Idx) { size_t MaxIdx = Idx + 1; while (MaxIdx < Clusters.size() && - Clusters[Idx].Size + Clusters[MaxIdx].Size <= MaxClusterSize) { + Clusters[Idx].size() + Clusters[MaxIdx].size() <= MaxClusterSize) { ++MaxIdx; } if (MaxIdx - Idx > 1) { size_t MergeIdx = (std::rand() % (MaxIdx - Idx - 1)) + Idx + 1; - assert(Clusters[MergeIdx].Size + Clusters[Idx].Size <= MaxClusterSize); + assert(Clusters[MergeIdx].size() + Clusters[Idx].size() <= MaxClusterSize); return MergeIdx; } return Clusters.size(); diff --git a/bolt/Passes/HFSort.h b/bolt/Passes/HFSort.h index 7bcd974d7515..31b2b94205f3 100644 --- a/bolt/Passes/HFSort.h +++ b/bolt/Passes/HFSort.h @@ -50,15 +50,27 @@ class Cluster { Cluster(CallGraph::NodeId Id, const CallGraph::Node &F); std::string toString() const; - double density() const { - return (double)Samples / Size; - } - + double density() const { return Density; } + uint64_t samples() const { return Samples; } + uint32_t size() const { return Size; } + bool frozen() const { return Frozen; } + void freeze() { Frozen = true; } void merge(Cluster &&Other, const double Aw = 0); - + size_t numTargets() const { + return Targets.size(); + } + const std::vector &targets() const { + return Targets; + } + CallGraph::NodeId target(size_t N) const { + return Targets[N]; + } + void reverseTargets(); +private: std::vector Targets; - uint32_t Samples; + uint64_t Samples; uint32_t Size; + double Density; bool Frozen; // not a candidate for merging }; diff --git a/bolt/Passes/HFSortPlus.cpp b/bolt/Passes/HFSortPlus.cpp index bb925f88da50..063fb7be4dcc 100644 --- a/bolt/Passes/HFSortPlus.cpp +++ b/bolt/Passes/HFSortPlus.cpp @@ -1,4 +1,4 @@ -//===--- HFSort.cpp - Cluster functions by hotness ------------------------===// +//===--- HFSortPlus.cpp - Cluster functions by hotness --------------------===// // // The LLVM Compiler 
Infrastructure // @@ -144,9 +144,9 @@ void sortByDensity(std::vector &Clusters) { const double D2 = C2->density(); // making sure the sorting is deterministic if (D1 != D2) return D1 > D2; - if (C1->Size != C2->Size) return C1->Size < C2->Size; - if (C1->Samples != C2->Samples) return C1->Samples > C2->Samples; - return C1->Targets[0] < C2->Targets[0]; + if (C1->size() != C2->size()) return C1->size() < C2->size(); + if (C1->samples() != C2->samples()) return C1->samples() > C2->samples(); + return C1->target(0) < C2->target(0); } ); } @@ -155,8 +155,8 @@ void sortByDensity(std::vector &Clusters) { * Density of a cluster formed by merging a given pair of clusters */ double density(Cluster *ClusterPred, Cluster *ClusterSucc) { - const double CombinedSamples = ClusterPred->Samples + ClusterSucc->Samples; - const double CombinedSize = ClusterPred->Size + ClusterSucc->Size; + const double CombinedSamples = ClusterPred->samples() + ClusterSucc->samples(); + const double CombinedSize = ClusterPred->size() + ClusterSucc->size(); return CombinedSamples / CombinedSize; } @@ -199,42 +199,42 @@ double expectedCacheHitRatio(const AlgoState &State, sortByDensity(Clusters); // generate function addresses with an alignment - std::vector Addr(State.Cg->Nodes.size(), InvalidAddr); + std::vector Addr(State.Cg->numNodes(), InvalidAddr); size_t CurAddr = 0; // 'hotness' of the pages std::vector PageSamples; for (auto Cluster : Clusters) { - for (auto TargetId : Cluster->Targets) { + for (auto TargetId : Cluster->targets()) { if (CurAddr & 0xf) CurAddr = (CurAddr & ~0xf) + 16; Addr[TargetId] = CurAddr; - CurAddr += State.Cg->Nodes[TargetId].Size; + CurAddr += State.Cg->size(TargetId); // update page weight size_t Page = Addr[TargetId] / PageSize; while (PageSamples.size() <= Page) PageSamples.push_back(0.0); - PageSamples[Page] += State.Cg->Nodes[TargetId].Samples; + PageSamples[Page] += State.Cg->samples(TargetId); } } // computing expected number of misses for every function double 
Misses = 0; for (auto Cluster : Clusters) { - for (auto TargetId : Cluster->Targets) { + for (auto TargetId : Cluster->targets()) { size_t Page = Addr[TargetId] / PageSize; - double Samples = State.Cg->Nodes[TargetId].Samples; + double Samples = State.Cg->samples(TargetId); // probability that the page is not present in the cache double MissProb = missProbability(State, PageSamples[Page]); - for (auto Pred : State.Cg->Nodes[TargetId].Preds) { - if (State.Cg->Nodes[Pred].Samples == 0) continue; - auto A = State.Cg->Arcs.find(Arc(Pred, TargetId)); + for (auto Pred : State.Cg->predecessors(TargetId)) { + if (State.Cg->samples(Pred) == 0) continue; + const auto &Arc = *State.Cg->findArc(Pred, TargetId); // the source page - size_t SrcPage = (Addr[Pred] + (size_t)A->AvgCallOffset) / PageSize; + size_t SrcPage = (Addr[Pred] + (size_t)Arc.avgCallOffset()) / PageSize; if (Page != SrcPage) { // this is a miss - Misses += A->Weight * MissProb; + Misses += Arc.weight() * MissProb; } - Samples -= A->Weight; + Samples -= Arc.weight(); } // the remaining samples come from the jitted code @@ -251,14 +251,14 @@ double expectedCacheHitRatio(const AlgoState &State, std::unordered_set adjacentClusters(const AlgoState &State, Cluster *C) { std::unordered_set Result; - for (auto TargetId : C->Targets) { - for (auto Succ : State.Cg->Nodes[TargetId].Succs) { + for (auto TargetId : C->targets()) { + for (auto Succ : State.Cg->successors(TargetId)) { auto SuccCluster = State.FuncCluster[Succ]; if (SuccCluster != nullptr && SuccCluster != C) { Result.insert(SuccCluster); } } - for (auto Pred : State.Cg->Nodes[TargetId].Preds) { + for (auto Pred : State.Cg->predecessors(TargetId)) { auto PredCluster = State.FuncCluster[Pred]; if (PredCluster != nullptr && PredCluster != C) { Result.insert(PredCluster); @@ -285,15 +285,15 @@ double expectedCalls(int64_t SrcAddr, int64_t DstAddr, double EdgeWeight) { */ double shortCalls(const AlgoState &State, Cluster *Cluster) { double Calls = 0; - for (auto 
TargetId : Cluster->Targets) { - for (auto Succ : State.Cg->Nodes[TargetId].Succs) { + for (auto TargetId : Cluster->targets()) { + for (auto Succ : State.Cg->successors(TargetId)) { if (State.FuncCluster[Succ] == Cluster) { - auto A = State.Cg->Arcs.find(Arc(TargetId, Succ)); + const auto &Arc = *State.Cg->findArc(TargetId, Succ); - auto SrcAddr = State.Addr[TargetId] + A->AvgCallOffset; + auto SrcAddr = State.Addr[TargetId] + Arc.avgCallOffset(); auto DstAddr = State.Addr[Succ]; - Calls += expectedCalls(SrcAddr, DstAddr, A->Weight); + Calls += expectedCalls(SrcAddr, DstAddr, Arc.weight()); } } } @@ -309,29 +309,29 @@ double shortCalls(const AlgoState &State, Cluster *ClusterPred, Cluster *ClusterSucc) { double Calls = 0; - for (auto TargetId : ClusterPred->Targets) { - for (auto Succ : State.Cg->Nodes[TargetId].Succs) { + for (auto TargetId : ClusterPred->targets()) { + for (auto Succ : State.Cg->successors(TargetId)) { if (State.FuncCluster[Succ] == ClusterSucc) { - auto A = State.Cg->Arcs.find(Arc(TargetId, Succ)); + const auto &Arc = *State.Cg->findArc(TargetId, Succ); - auto SrcAddr = State.Addr[TargetId] + A->AvgCallOffset; - auto DstAddr = State.Addr[Succ] + ClusterPred->Size; + auto SrcAddr = State.Addr[TargetId] + Arc.avgCallOffset(); + auto DstAddr = State.Addr[Succ] + ClusterPred->size(); - Calls += expectedCalls(SrcAddr, DstAddr, A->Weight); + Calls += expectedCalls(SrcAddr, DstAddr, Arc.weight()); } } } - for (auto TargetId : ClusterPred->Targets) { - for (auto Pred : State.Cg->Nodes[TargetId].Preds) { + for (auto TargetId : ClusterPred->targets()) { + for (auto Pred : State.Cg->predecessors(TargetId)) { if (State.FuncCluster[Pred] == ClusterSucc) { - auto A = State.Cg->Arcs.find(Arc(Pred, TargetId)); + const auto &Arc = *State.Cg->findArc(Pred, TargetId); - auto SrcAddr = State.Addr[Pred] + A->AvgCallOffset + - ClusterPred->Size; + auto SrcAddr = State.Addr[Pred] + Arc.avgCallOffset() + + ClusterPred->size(); auto DstAddr = State.Addr[TargetId]; - 
Calls += expectedCalls(SrcAddr, DstAddr, A->Weight); + Calls += expectedCalls(SrcAddr, DstAddr, Arc.weight()); } } } @@ -355,12 +355,12 @@ double mergeGain(const AlgoState &State, Cluster *ClusterPred, Cluster *ClusterSucc) { // cache misses on the first cluster - double LongCallsPred = ClusterPred->Samples - shortCalls(State, ClusterPred); + double LongCallsPred = ClusterPred->samples() - shortCalls(State, ClusterPred); double ProbPred = missProbability(State, ClusterPred->density() * PageSize); double ExpectedMissesPred = LongCallsPred * ProbPred; // cache misses on the second cluster - double LongCallsSucc = ClusterSucc->Samples - shortCalls(State, ClusterSucc); + double LongCallsSucc = ClusterSucc->samples() - shortCalls(State, ClusterSucc); double ProbSucc = missProbability(State, ClusterSucc->density() * PageSize); double ExpectedMissesSucc = LongCallsSucc * ProbSucc; @@ -373,28 +373,7 @@ double mergeGain(const AlgoState &State, double Gain = ExpectedMissesPred + ExpectedMissesSucc - MissesNew; // scaling the result to increase the importance of merging short clusters - return Gain / (ClusterPred->Size + ClusterSucc->Size); -} - - /* - * Merge two clusters - */ -void mergeInto(AlgoState &State, Cluster *Into, Cluster *Other) { - auto &Targets = Other->Targets; - Into->Targets.insert(Into->Targets.end(), Targets.begin(), Targets.end()); - Into->Size += Other->Size; - Into->Samples += Other->Samples; - - size_t CurAddr = 0; - for (auto TargetId : Into->Targets) { - State.FuncCluster[TargetId] = Into; - State.Addr[TargetId] = CurAddr; - CurAddr += State.Cg->Nodes[TargetId].Size; - } - - Other->Size = 0; - Other->Samples = 0; - Other->Targets.clear(); + return Gain / (ClusterPred->size() + ClusterSucc->size()); } /* @@ -403,26 +382,26 @@ void mergeInto(AlgoState &State, Cluster *Into, Cluster *Other) { std::vector hfsortPlus(const CallGraph &Cg) { // create a cluster for every function std::vector AllClusters; - AllClusters.reserve(Cg.Nodes.size()); - for (NodeId 
F = 0; F < Cg.Nodes.size(); F++) { - AllClusters.emplace_back(F, Cg.Nodes[F]); + AllClusters.reserve(Cg.numNodes()); + for (NodeId F = 0; F < Cg.numNodes(); F++) { + AllClusters.emplace_back(F, Cg.getNode(F)); } // initialize objects used by the algorithm std::vector Clusters; - Clusters.reserve(Cg.Nodes.size()); + Clusters.reserve(Cg.numNodes()); AlgoState State; State.Cg = &Cg; State.TotalSamples = 0; - State.FuncCluster = std::vector(Cg.Nodes.size(), nullptr); - State.Addr = std::vector(Cg.Nodes.size(), InvalidAddr); - for (NodeId F = 0; F < Cg.Nodes.size(); F++) { - if (Cg.Nodes[F].Samples == 0) continue; + State.FuncCluster = std::vector(Cg.numNodes(), nullptr); + State.Addr = std::vector(Cg.numNodes(), InvalidAddr); + for (NodeId F = 0; F < Cg.numNodes(); F++) { + if (Cg.samples(F) == 0) continue; Clusters.push_back(&AllClusters[F]); State.FuncCluster[F] = &AllClusters[F]; State.Addr[F] = 0; - State.TotalSamples += Cg.Nodes[F].Samples; + State.TotalSamples += Cg.samples(F); } DEBUG(dbgs() << "Starting hfsort+ for " << Clusters.size() << " clusters\n" @@ -482,7 +461,15 @@ std::vector hfsortPlus(const CallGraph &Cg) { Cache.invalidate(BestClusterSucc); // merge the best pair of clusters - mergeInto(State, BestClusterPred, BestClusterSucc); + BestClusterPred->merge(std::move(*BestClusterSucc)); + + size_t CurAddr = 0; + for (auto TargetId : BestClusterPred->targets()) { + State.FuncCluster[TargetId] = BestClusterPred; + State.Addr[TargetId] = CurAddr; + CurAddr += State.Cg->size(TargetId); + } + // remove BestClusterSucc from the list of active clusters auto Iter = std::remove(Clusters.begin(), Clusters.end(), BestClusterSucc); Clusters.erase(Iter, Clusters.end()); diff --git a/bolt/Passes/PettisAndHansen.cpp b/bolt/Passes/PettisAndHansen.cpp index d8d828726a04..90b0e2fc25da 100644 --- a/bolt/Passes/PettisAndHansen.cpp +++ b/bolt/Passes/PettisAndHansen.cpp @@ -44,29 +44,29 @@ class ClusterArcHash { using ClusterArcSet = std::unordered_set; void orderFuncs(const 
CallGraph &Cg, Cluster *C1, Cluster *C2) { - auto C1head = C1->Targets.front(); - auto C1tail = C1->Targets.back(); - auto C2head = C2->Targets.front(); - auto C2tail = C2->Targets.back(); + auto C1head = C1->targets().front(); + auto C1tail = C1->targets().back(); + auto C2head = C2->targets().front(); + auto C2tail = C2->targets().back(); double C1headC2head = 0; double C1headC2tail = 0; double C1tailC2head = 0; double C1tailC2tail = 0; - for (const auto &Arc : Cg.Arcs) { - if ((Arc.Src == C1head && Arc.Dst == C2head) || - (Arc.Dst == C1head && Arc.Src == C2head)) { - C1headC2head += Arc.Weight; - } else if ((Arc.Src == C1head && Arc.Dst == C2tail) || - (Arc.Dst == C1head && Arc.Src == C2tail)) { - C1headC2tail += Arc.Weight; - } else if ((Arc.Src == C1tail && Arc.Dst == C2head) || - (Arc.Dst == C1tail && Arc.Src == C2head)) { - C1tailC2head += Arc.Weight; - } else if ((Arc.Src == C1tail && Arc.Dst == C2tail) || - (Arc.Dst == C1tail && Arc.Src == C2tail)) { - C1tailC2tail += Arc.Weight; + for (const auto &Arc : Cg.getArcs()) { + if ((Arc.src() == C1head && Arc.dst() == C2head) || + (Arc.dst() == C1head && Arc.src() == C2head)) { + C1headC2head += Arc.weight(); + } else if ((Arc.src() == C1head && Arc.dst() == C2tail) || + (Arc.dst() == C1head && Arc.src() == C2tail)) { + C1headC2tail += Arc.weight(); + } else if ((Arc.src() == C1tail && Arc.dst() == C2head) || + (Arc.dst() == C1tail && Arc.src() == C2head)) { + C1tailC2head += Arc.weight(); + } else if ((Arc.src() == C1tail && Arc.dst() == C2tail) || + (Arc.dst() == C1tail && Arc.src() == C2tail)) { + C1tailC2tail += Arc.weight(); } } @@ -75,29 +75,29 @@ void orderFuncs(const CallGraph &Cg, Cluster *C1, Cluster *C2) { if (C1headC2head == Max) { // flip C1 - std::reverse(C1->Targets.begin(), C1->Targets.end()); + C1->reverseTargets(); } else if (C1headC2tail == Max) { // flip C1 C2 - std::reverse(C1->Targets.begin(), C1->Targets.end()); - std::reverse(C2->Targets.begin(), C2->Targets.end()); + 
C1->reverseTargets(); + C2->reverseTargets(); } else if (C1tailC2tail == Max) { // flip C2 - std::reverse(C2->Targets.begin(), C2->Targets.end()); + C2->reverseTargets(); } } } std::vector pettisAndHansen(const CallGraph &Cg) { // indexed by NodeId, keeps its current cluster - std::vector FuncCluster(Cg.Nodes.size(), nullptr); + std::vector FuncCluster(Cg.numNodes(), nullptr); std::vector Clusters; std::vector Funcs; - Clusters.reserve(Cg.Nodes.size()); + Clusters.reserve(Cg.numNodes()); - for (NodeId F = 0; F < Cg.Nodes.size(); F++) { - if (Cg.Nodes[F].Samples == 0) continue; - Clusters.emplace_back(F, Cg.Nodes[F]); + for (NodeId F = 0; F < Cg.numNodes(); F++) { + if (Cg.samples(F) == 0) continue; + Clusters.emplace_back(F, Cg.getNode(F)); FuncCluster[F] = &Clusters.back(); Funcs.push_back(F); } @@ -113,11 +113,11 @@ std::vector pettisAndHansen(const CallGraph &Cg) { // Create a std::vector of cluster arcs - for (auto &Arc : Cg.Arcs) { - if (Arc.Weight == 0) continue; + for (auto &Arc : Cg.getArcs()) { + if (Arc.weight() == 0) continue; - auto const S = FuncCluster[Arc.Src]; - auto const D = FuncCluster[Arc.Dst]; + auto const S = FuncCluster[Arc.src()]; + auto const D = FuncCluster[Arc.dst()]; // ignore if s or d is nullptr @@ -127,7 +127,7 @@ std::vector pettisAndHansen(const CallGraph &Cg) { if (S == D) continue; - insertOrInc(S, D, Arc.Weight); + insertOrInc(S, D, Arc.weight()); } // Find an arc with max weight and merge its nodes @@ -147,9 +147,9 @@ std::vector pettisAndHansen(const CallGraph &Cg) { auto const C1 = Max.C1; auto const C2 = Max.C2; - if (C1->Size + C2->Size > MaxClusterSize) continue; + if (C1->size() + C2->size() > MaxClusterSize) continue; - if (C1->Frozen || C2->Frozen) continue; + if (C1->frozen() || C2->frozen()) continue; // order functions and merge cluster @@ -176,7 +176,7 @@ std::vector pettisAndHansen(const CallGraph &Cg) { // update FuncCluster - for (auto F : C2->Targets) { + for (auto F : C2->targets()) { FuncCluster[F] = C1; } 
C1->merge(std::move(*C2), Max.Weight); diff --git a/bolt/Passes/ReorderFunctions.cpp b/bolt/Passes/ReorderFunctions.cpp index 09320bdab7cd..72cb18c31e6f 100644 --- a/bolt/Passes/ReorderFunctions.cpp +++ b/bolt/Passes/ReorderFunctions.cpp @@ -10,6 +10,7 @@ //===----------------------------------------------------------------------===// #include "ReorderFunctions.h" +#include "HFSort.h" #include "llvm/Support/Options.h" #include @@ -90,42 +91,19 @@ using NodeId = CallGraph::NodeId; using Arc = CallGraph::Arc; using Node = CallGraph::Node; -void ReorderFunctions::normalizeArcWeights() { - // Normalize arc weights. - if (!opts::UseEdgeCounts) { - for (NodeId FuncId = 0; FuncId < Cg.Nodes.size(); ++FuncId) { - auto& Func = Cg.Nodes[FuncId]; - for (auto Caller : Func.Preds) { - auto& A = *Cg.Arcs.find(Arc(Caller, FuncId)); - A.NormalizedWeight = A.Weight / Func.Samples; - A.AvgCallOffset /= A.Weight; - assert(A.AvgCallOffset < Cg.Nodes[Caller].Size); - } - } - } else { - for (NodeId FuncId = 0; FuncId < Cg.Nodes.size(); ++FuncId) { - auto &Func = Cg.Nodes[FuncId]; - for (auto Caller : Func.Preds) { - auto& A = *Cg.Arcs.find(Arc(Caller, FuncId)); - A.NormalizedWeight = A.Weight / Func.Samples; - } - } - } -} - void ReorderFunctions::reorder(std::vector &&Clusters, std::map &BFs) { - std::vector FuncAddr(Cg.Nodes.size()); // Just for computing stats + std::vector FuncAddr(Cg.numNodes()); // Just for computing stats uint64_t TotalSize = 0; uint32_t Index = 0; // Set order of hot functions based on clusters. 
for (const auto& Cluster : Clusters) { - for (const auto FuncId : Cluster.Targets) { - assert(Cg.Nodes[FuncId].Samples > 0); - Cg.Funcs[FuncId]->setIndex(Index++); + for (const auto FuncId : Cluster.targets()) { + assert(Cg.samples(FuncId) > 0); + Cg.nodeIdToFunc(FuncId)->setIndex(Index++); FuncAddr[FuncId] = TotalSize; - TotalSize += Cg.Nodes[FuncId].Size; + TotalSize += Cg.size(FuncId); } } @@ -141,6 +119,11 @@ void ReorderFunctions::reorder(std::vector &&Clusters, #endif } + bool PrintDetailed = opts::Verbosity > 1; +#ifndef NDEBUG + PrintDetailed |= + (DebugFlag && isCurrentDebugType("hfsort") && opts::Verbosity > 0); +#endif TotalSize = 0; uint64_t CurPage = 0; uint64_t Hotfuncs = 0; @@ -149,65 +132,84 @@ void ReorderFunctions::reorder(std::vector &&Clusters, double TotalCalls64B = 0; double TotalCalls4KB = 0; double TotalCalls2MB = 0; - dbgs() << "============== page 0 ==============\n"; + if (PrintDetailed) { + outs() << "BOLT-INFO: Function reordering page layout\n" + << "BOLT-INFO: ============== page 0 ==============\n"; + } for (auto& Cluster : Clusters) { - dbgs() << - format("-------- density = %.3lf (%u / %u) --------\n", - (double) Cluster.Samples / Cluster.Size, - Cluster.Samples, Cluster.Size); + if (PrintDetailed) { + outs() << + format("BOLT-INFO: -------- density = %.3lf (%u / %u) --------\n", + Cluster.density(), Cluster.samples(), Cluster.size()); + } - for (auto FuncId : Cluster.Targets) { - if (Cg.Nodes[FuncId].Samples > 0) { + for (auto FuncId : Cluster.targets()) { + if (Cg.samples(FuncId) > 0) { Hotfuncs++; - dbgs() << "BOLT-INFO: hot func " << *Cg.Funcs[FuncId] - << " (" << Cg.Nodes[FuncId].Size << ")\n"; + if (PrintDetailed) { + outs() << "BOLT-INFO: hot func " << *Cg.nodeIdToFunc(FuncId) + << " (" << Cg.size(FuncId) << ")\n"; + } uint64_t Dist = 0; uint64_t Calls = 0; - for (auto Dst : Cg.Nodes[FuncId].Succs) { - auto& A = *Cg.Arcs.find(Arc(FuncId, Dst)); - auto D = - std::abs(FuncAddr[A.Dst] - (FuncAddr[FuncId] + A.AvgCallOffset)); - 
auto W = A.Weight; + for (auto Dst : Cg.successors(FuncId)) { + const auto& Arc = *Cg.findArc(FuncId, Dst); + const auto D = std::abs(FuncAddr[Arc.dst()] - + (FuncAddr[FuncId] + Arc.avgCallOffset())); + const auto W = Arc.weight(); Calls += W; if (D < 64) TotalCalls64B += W; if (D < 4096) TotalCalls4KB += W; if (D < (2 << 20)) TotalCalls2MB += W; - Dist += A.Weight * D; - dbgs() << format("arc: %u [@%lu+%.1lf] -> %u [@%lu]: " - "weight = %.0lf, callDist = %f\n", - A.Src, FuncAddr[A.Src], A.AvgCallOffset, - A.Dst, FuncAddr[A.Dst], A.Weight, D); + Dist += Arc.weight() * D; + if (PrintDetailed) { + outs() << format("BOLT-INFO: arc: %u [@%lu+%.1lf] -> %u [@%lu]: " + "weight = %.0lf, callDist = %f\n", + Arc.src(), + FuncAddr[Arc.src()], + Arc.avgCallOffset(), + Arc.dst(), + FuncAddr[Arc.dst()], + Arc.weight(), D); + } } TotalCalls += Calls; TotalDistance += Dist; - dbgs() << format("start = %6u : avgCallDist = %lu : %s\n", - TotalSize, - Calls ? Dist / Calls : 0, - Cg.Funcs[FuncId]->getPrintName().c_str()); - TotalSize += Cg.Nodes[FuncId].Size; - auto NewPage = TotalSize / HugePageSize; - if (NewPage != CurPage) { - CurPage = NewPage; - dbgs() << format("============== page %u ==============\n", CurPage); + TotalSize += Cg.size(FuncId); + + if (PrintDetailed) { + outs() << format("BOLT-INFO: start = %6u : avgCallDist = %lu : %s\n", + TotalSize, + Calls ? 
Dist / Calls : 0, + Cg.nodeIdToFunc(FuncId)->getPrintName().c_str()); + const auto NewPage = TotalSize / HugePageSize; + if (NewPage != CurPage) { + CurPage = NewPage; + outs() << + format("BOLT-INFO: ============== page %u ==============\n", + CurPage); + } } } } } - dbgs() << format(" Number of hot functions: %u\n" - " Number of clusters: %lu\n", + outs() << "BOLT-INFO: Function reordering stats\n" + << format("BOLT-INFO: Number of hot functions: %u\n" + "BOLT-INFO: Number of clusters: %lu\n", Hotfuncs, Clusters.size()) - << format(" Final average call distance = %.1lf (%.0lf / %.0lf)\n", + << format("BOLT-INFO: Final average call distance = %.1lf " + "(%.0lf / %.0lf)\n", TotalCalls ? TotalDistance / TotalCalls : 0, TotalDistance, TotalCalls) - << format(" Total Calls = %.0lf\n", TotalCalls); + << format("BOLT-INFO: Total Calls = %.0lf\n", TotalCalls); if (TotalCalls) { - dbgs() << format(" Total Calls within 64B = %.0lf (%.2lf%%)\n", + outs() << format("BOLT-INFO: Total Calls within 64B = %.0lf (%.2lf%%)\n", TotalCalls64B, 100 * TotalCalls64B / TotalCalls) - << format(" Total Calls within 4KB = %.0lf (%.2lf%%)\n", + << format("BOLT-INFO: Total Calls within 4KB = %.0lf (%.2lf%%)\n", TotalCalls4KB, 100 * TotalCalls4KB / TotalCalls) - << format(" Total Calls within 2MB = %.0lf (%.2lf%%)\n", + << format("BOLT-INFO: Total Calls within 2MB = %.0lf (%.2lf%%)\n", TotalCalls2MB, 100 * TotalCalls2MB / TotalCalls); } } @@ -251,7 +253,7 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC, false, // IncludeColdCalls opts::ReorderFunctionsUseHotSize, opts::UseEdgeCounts); - normalizeArcWeights(); + Cg.normalizeArcWeights(opts::UseEdgeCounts); } std::vector Clusters; diff --git a/bolt/Passes/ReorderFunctions.h b/bolt/Passes/ReorderFunctions.h index d90bdaabfb5b..57f804ae2290 100644 --- a/bolt/Passes/ReorderFunctions.h +++ b/bolt/Passes/ReorderFunctions.h @@ -13,16 +13,15 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_FNCTIONS_H #include "BinaryPasses.h" -#include 
"HFSort.h" +#include "BinaryFunctionCallGraph.h" namespace llvm { namespace bolt { /// Modify function order for streaming based on hotness. class ReorderFunctions : public BinaryFunctionPass { - CallGraph Cg; + BinaryFunctionCallGraph Cg; - void normalizeArcWeights(); void reorder(std::vector &&Clusters, std::map &BFs); public: diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 50d0d5edda90..a44b6ff83da1 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1534,14 +1534,14 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { Rel.getType() != ELF::R_X86_64_GOTTPOFF && Rel.getType() != ELF::R_X86_64_GOTPCREL) { if (!IsPCRelative) { - if (opts::Verbosity > 1 && + if (opts::Verbosity > 2 && ExtractedValue != Address) { errs() << "BOLT-WARNING: mismatch ExtractedValue = 0x" << Twine::utohexstr(ExtractedValue) << '\n'; } Address = ExtractedValue; } else { - if (opts::Verbosity > 1 && + if (opts::Verbosity > 2 && ExtractedValue != Address - Rel.getOffset() + Addend) { errs() << "BOLT-WARNING: PC-relative mismatch ExtractedValue = 0x" << Twine::utohexstr(ExtractedValue) << '\n'; From 1bacb33bb1a432ce4816d55dbf3e3ce9078fa26d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 26 May 2017 17:42:39 -0700 Subject: [PATCH 266/904] [BOLT] Make hfsort+ deterministic and add test case Summary: Make hfsort+ algorithm deterministic. We only had a test for hfsort. Since hfsort+ is going to be the default, I've added a test for that too. 
(cherry picked from commit 749e309d704f6944e18388ddab57a0879575effb) --- bolt/Passes/HFSort.cpp | 9 ++++----- bolt/Passes/HFSort.h | 17 +++++++++++++---- bolt/Passes/HFSortPlus.cpp | 25 +++++++++++++++++-------- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/bolt/Passes/HFSort.cpp b/bolt/Passes/HFSort.cpp index dd2a364d2ff7..cb93191dc7a5 100644 --- a/bolt/Passes/HFSort.cpp +++ b/bolt/Passes/HFSort.cpp @@ -67,12 +67,11 @@ constexpr int CallerDegradeFactor = 8; //////////////////////////////////////////////////////////////////////////////// -Cluster::Cluster(NodeId Id, const Node &Func) { +Cluster::Cluster(NodeId Id, const Node &Func) +: Samples(Func.samples()), + Size(Func.size()), + Density((double)Samples / Size) { Targets.push_back(Id); - Size = Func.size(); - Samples = Func.samples(); - Density = (double)Samples / Size; - Frozen = false; } std::string Cluster::toString() const { diff --git a/bolt/Passes/HFSort.h b/bolt/Passes/HFSort.h index 31b2b94205f3..7f32a99cfee7 100644 --- a/bolt/Passes/HFSort.h +++ b/bolt/Passes/HFSort.h @@ -66,12 +66,21 @@ class Cluster { return Targets[N]; } void reverseTargets(); + void setId(uint32_t NewId) { + assert(Id == -1u); + Id = NewId; + } + uint32_t id() const { + assert(Id != -1u); + return Id; + } private: + uint32_t Id{-1u}; std::vector Targets; - uint64_t Samples; - uint32_t Size; - double Density; - bool Frozen; // not a candidate for merging + uint64_t Samples{0}; + uint32_t Size{0}; + double Density{0.0}; + bool Frozen{false}; // not a candidate for merging }; // Maximum size of a cluster, in bytes. diff --git a/bolt/Passes/HFSortPlus.cpp b/bolt/Passes/HFSortPlus.cpp index 063fb7be4dcc..761a807413b1 100644 --- a/bolt/Passes/HFSortPlus.cpp +++ b/bolt/Passes/HFSortPlus.cpp @@ -128,6 +128,8 @@ struct AlgoState { std::vector FuncCluster; // current address of the function from the beginning of its cluster std::vector Addr; + // maximum cluster id. 
+ size_t MaxClusterId; }; } @@ -136,7 +138,7 @@ struct AlgoState { * Sorting clusters by their density in decreasing order */ void sortByDensity(std::vector &Clusters) { - std::sort( + std::stable_sort( Clusters.begin(), Clusters.end(), [&] (const Cluster *C1, const Cluster *C2) { @@ -248,23 +250,29 @@ double expectedCacheHitRatio(const AlgoState &State, /* * Get adjacent clusters (the ones that share an arc) with the given one */ -std::unordered_set adjacentClusters(const AlgoState &State, - Cluster *C) { - std::unordered_set Result; +std::vector adjacentClusters(const AlgoState &State, Cluster *C) { + std::vector Result; + Result.reserve(State.MaxClusterId); for (auto TargetId : C->targets()) { for (auto Succ : State.Cg->successors(TargetId)) { auto SuccCluster = State.FuncCluster[Succ]; if (SuccCluster != nullptr && SuccCluster != C) { - Result.insert(SuccCluster); + Result.push_back(SuccCluster); } } for (auto Pred : State.Cg->predecessors(TargetId)) { auto PredCluster = State.FuncCluster[Pred]; if (PredCluster != nullptr && PredCluster != C) { - Result.insert(PredCluster); + Result.push_back(PredCluster); } } } + std::sort(Result.begin(), Result.end(), + [](const Cluster *A, const Cluster *B) { + return A->id() < B->id(); + }); + auto Last = std::unique(Result.begin(), Result.end()); + Result.erase(Last, Result.end()); return Result; } @@ -385,6 +393,7 @@ std::vector hfsortPlus(const CallGraph &Cg) { AllClusters.reserve(Cg.numNodes()); for (NodeId F = 0; F < Cg.numNodes(); F++) { AllClusters.emplace_back(F, Cg.getNode(F)); + AllClusters.back().setId(F); } // initialize objects used by the algorithm @@ -395,9 +404,9 @@ std::vector hfsortPlus(const CallGraph &Cg) { State.TotalSamples = 0; State.FuncCluster = std::vector(Cg.numNodes(), nullptr); State.Addr = std::vector(Cg.numNodes(), InvalidAddr); + State.MaxClusterId = AllClusters.back().id(); for (NodeId F = 0; F < Cg.numNodes(); F++) { if (Cg.samples(F) == 0) continue; - Clusters.push_back(&AllClusters[F]); 
State.FuncCluster[F] = &AllClusters[F]; State.Addr[F] = 0; @@ -487,7 +496,7 @@ std::vector hfsortPlus(const CallGraph &Cg) { Result.emplace_back(std::move(*Cluster)); } - std::sort(Result.begin(), Result.end(), compareClustersDensity); + assert(std::is_sorted(Result.begin(), Result.end(), compareClustersDensity)); return Result; } From 739f210b915c59faac0f0e3c57d4ac5259d6f128 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 2 Jun 2017 18:41:31 -0700 Subject: [PATCH 267/904] [BOLT] Fix misc issues in relocation mode. Summary: Fix issues discovered while testing LTO mode with bfd linker: * Correctly update absolute function references from code with addend. * Support .got.plt section generated by bfd linker. * Support quirks of .tbss section. * Don't ignore functions if the size in FDE doesn't match the size in the symbol table. Instead keep processing using the maximum indicated size. (cherry picked from commit 07bb827286c1fb4f37455af249b944b6de1ad781) --- bolt/RewriteInstance.cpp | 57 +++++++++++++++++++++++++--------------- bolt/RewriteInstance.h | 3 +++ 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index a44b6ff83da1..7ef342a5124d 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1075,16 +1075,9 @@ void RewriteInstance::discoverFileObjects() { } else if (FDE.getAddressRange() != SymbolSize) { if (SymbolSize) { // Function addresses match but sizes differ. - errs() << "BOLT-ERROR: sizes differ for function " << UniqueName + errs() << "BOLT-WARNING: sizes differ for function " << UniqueName << ". FDE : " << FDE.getAddressRange() - << "; symbol table : " << SymbolSize << ". Skipping.\n"; - - // Create maximum size non-simple function. - IsSimple = false; - } - if (opts::Verbosity >= 1) { - outs() << "BOLT-INFO: adjusting size of function " << UniqueName - << " using FDE data.\n"; + << "; symbol table : " << SymbolSize << ". 
Using max size.\n"; } SymbolSize = std::max(SymbolSize, FDE.getAddressRange()); } @@ -1135,6 +1128,9 @@ void RewriteInstance::discoverFileObjects() { if (SectionName == ".plt") { // Set the size to 0 to prevent PLT from being disassembled. createBinaryFunction("__BOLT_PLT_PSEUDO" , *Section, Address, 0, false); + } else if (SectionName == ".plt.got") { + createBinaryFunction("__BOLT_PLT_GOT_PSEUDO" , *Section, Address, 0, + false); } else { std::string FunctionName = "__BOLT_FDE_FUNCat" + Twine::utohexstr(Address).str(); @@ -1149,6 +1145,17 @@ void RewriteInstance::discoverFileObjects() { } } + if (PLTGOTSection.getObject()) { + // Check if we need to create a function for .plt.got. Some linkers + // (depending on the version) would mark it with FDE while others wouldn't. + if (!getBinaryFunctionContainingAddress(PLTGOTSection.getAddress(), true)) { + DEBUG(dbgs() << "BOLT-DEBUG: creating .plt.got pseudo function at 0x" + << Twine::utohexstr(PLTGOTSection.getAddress()) << '\n'); + createBinaryFunction("__BOLT_PLT_GOT_PSEUDO" , PLTGOTSection, + PLTGOTSection.getAddress(), 0, false); + } + } + if (!SeenFileName && BC->DR.hasLocalsWithFileName() && !opts::AllowStripped) { errs() << "BOLT-ERROR: input binary does not have local file symbols " "but profile data includes function names with embedded file " @@ -1341,11 +1348,16 @@ void RewriteInstance::readSpecialSections() { HasTextRelocations = true; } else if (SectionName == ".gdb_index") { GdbIndexSection = Section; + } else if (SectionName == ".plt.got") { + PLTGOTSection = Section; } // Ignore zero-size allocatable sections as they present no interest to us. + // Note that .tbss is marked as having a positive size while in reality it + // is not taking any allocatable space. 
if ((ELFSectionRef(Section).getFlags() & ELF::SHF_ALLOC) && - Section.getSize() > 0) { + Section.getSize() > 0 && + SectionName != ".tbss") { BC->AllocatableSections.emplace(std::make_pair(Section.getAddress(), Section)); } @@ -1497,6 +1509,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { ForceRelocation = true; } + bool IsAbsoluteCodeRefWithAddend = false; if (!IsPCRelative && Addend != 0 && IsFromCode && !SymbolIsSection) { auto RefSection = BC->getSectionForAddress(SymbolAddress); if (RefSection && RefSection->isText()) { @@ -1511,10 +1524,10 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { << "; type name = " << TypeName << '\n'; assert(ExtractedValue == SymbolAddress + Addend && "value mismatch"); + Address = SymbolAddress; + IsAbsoluteCodeRefWithAddend = true; } - } - - if (Addend < 0 && IsPCRelative) { + } else if (Addend < 0 && IsPCRelative) { Address -= Addend; } else { Addend = 0; @@ -1534,12 +1547,14 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { Rel.getType() != ELF::R_X86_64_GOTTPOFF && Rel.getType() != ELF::R_X86_64_GOTPCREL) { if (!IsPCRelative) { - if (opts::Verbosity > 2 && - ExtractedValue != Address) { - errs() << "BOLT-WARNING: mismatch ExtractedValue = 0x" - << Twine::utohexstr(ExtractedValue) << '\n'; + if (!IsAbsoluteCodeRefWithAddend) { + if (opts::Verbosity > 2 && + ExtractedValue != Address) { + errs() << "BOLT-WARNING: mismatch ExtractedValue = 0x" + << Twine::utohexstr(ExtractedValue) << '\n'; + } + Address = ExtractedValue; } - Address = ExtractedValue; } else { if (opts::Verbosity > 2 && ExtractedValue != Address - Rel.getOffset() + Addend) { @@ -1554,7 +1569,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { if (IsFromCode) { ContainingBF = getBinaryFunctionContainingAddress(Rel.getOffset()); assert(ContainingBF && "cannot find function for address in code"); - DEBUG(dbgs() << "BOLT-DEBUG: relocation belongs to " << ContainingBF + DEBUG(dbgs() << 
"BOLT-DEBUG: relocation belongs to " << *ContainingBF << '\n'); } @@ -1621,7 +1636,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { if (IsFromCode) { if (ReferencedBF || ForceRelocation) { ContainingBF->addRelocation(Rel.getOffset(), ReferencedSymbol, - Rel.getType(), Addend, Address); + Rel.getType(), Addend, ExtractedValue); } else { DEBUG(dbgs() << "BOLT-DEBUG: ignoring relocation from code to data\n"); } @@ -1729,7 +1744,7 @@ void RewriteInstance::disassembleFunctions() { // PLT requires special handling and could be ignored in this context. StringRef SectionName; Section->getName(SectionName); - if (SectionName == ".plt") + if (SectionName == ".plt" || SectionName == ".plt.got") continue; if (opts::Relocs) { diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index db12598bb354..f09c6ebb93a7 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -467,6 +467,9 @@ class RewriteInstance { /// .gdb_index section. SectionRef GdbIndexSection; + /// .plt.got section. + SectionRef PLTGOTSection; + uint64_t NewSymTabOffset{0}; /// Keep track of functions we fail to write in the binary. We need to avoid From 4e650abec9043650a479c97f57bd1df4202366fc Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 1 May 2017 16:52:54 -0700 Subject: [PATCH 268/904] [BOLT] Add shrink wrapping pass Summary: Add an implementation for shrink wrapping, a frame optimization that moves callee-saved register spills from hot prologues to cold successors. 
(cherry picked from commit 0f6c17710d04a2971ca224a37fb88dbc25b56565) --- bolt/BinaryBasicBlock.cpp | 37 +- bolt/BinaryBasicBlock.h | 43 +- bolt/BinaryContext.cpp | 71 +- bolt/BinaryContext.h | 19 +- bolt/BinaryFunction.cpp | 71 +- bolt/BinaryFunction.h | 20 + bolt/BinaryPassManager.cpp | 18 +- bolt/Passes/AllocCombiner.cpp | 116 ++ bolt/Passes/AllocCombiner.h | 48 + bolt/Passes/BinaryPasses.cpp | 4 +- bolt/Passes/BinaryPasses.h | 6 + bolt/Passes/CMakeLists.txt | 5 + bolt/Passes/DataflowAnalysis.h | 24 +- bolt/Passes/DataflowInfoManager.cpp | 68 +- bolt/Passes/DataflowInfoManager.h | 14 + bolt/Passes/DominatorAnalysis.h | 24 +- bolt/Passes/FrameAnalysis.cpp | 35 +- bolt/Passes/FrameOptimizer.cpp | 811 ++-------- bolt/Passes/FrameOptimizer.h | 125 +- bolt/Passes/LivenessAnalysis.h | 13 + bolt/Passes/ReachingDefOrUse.h | 6 + bolt/Passes/ReachingInsns.h | 8 + bolt/Passes/ShrinkWrapping.cpp | 1785 +++++++++++++++++++++ bolt/Passes/ShrinkWrapping.h | 477 ++++++ bolt/Passes/StackAllocationAnalysis.cpp | 153 ++ bolt/Passes/StackAllocationAnalysis.h | 68 + bolt/Passes/StackAvailableExpressions.cpp | 132 ++ bolt/Passes/StackAvailableExpressions.h | 58 + bolt/Passes/StackPointerTracking.h | 6 + bolt/Passes/StackReachingUses.cpp | 112 ++ bolt/Passes/StackReachingUses.h | 71 + bolt/RewriteInstance.cpp | 3 + 32 files changed, 3608 insertions(+), 843 deletions(-) create mode 100644 bolt/Passes/AllocCombiner.cpp create mode 100644 bolt/Passes/AllocCombiner.h create mode 100644 bolt/Passes/ShrinkWrapping.cpp create mode 100644 bolt/Passes/ShrinkWrapping.h create mode 100644 bolt/Passes/StackAllocationAnalysis.cpp create mode 100644 bolt/Passes/StackAllocationAnalysis.h create mode 100644 bolt/Passes/StackAvailableExpressions.cpp create mode 100644 bolt/Passes/StackAvailableExpressions.h create mode 100644 bolt/Passes/StackReachingUses.cpp create mode 100644 bolt/Passes/StackReachingUses.h diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 
e580995be8da..8a56beba1835 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -148,8 +148,9 @@ BinaryBasicBlock *BinaryBasicBlock::getLandingPad(const MCSymbol *Label) const { } int32_t BinaryBasicBlock::getCFIStateAtInstr(const MCInst *Instr) const { - assert(getFunction()->getState() == BinaryFunction::State::CFG && - "can only calculate CFI state when function is in active CFG state"); + assert( + getFunction()->getState() >= BinaryFunction::State::CFG && + "can only calculate CFI state when function is in or past the CFG state"); const auto &FDEProgram = getFunction()->getFDEProgram(); @@ -316,6 +317,38 @@ bool BinaryBasicBlock::analyzeBranch(const MCSymbol *&TBB, return MIA->analyzeBranch(Instructions, TBB, FBB, CondBranch, UncondBranch); } +MCInst *BinaryBasicBlock::getTerminatorBefore(MCInst *Pos) { + auto &BC = Function->getBinaryContext(); + auto Itr = rbegin(); + bool Check = Pos ? false : true; + MCInst *FirstTerminator{nullptr}; + while (Itr != rend()) { + if (!Check) { + if (&*Itr == Pos) + Check = true; + ++Itr; + continue; + } + if (BC.MIA->isTerminator(*Itr)) + FirstTerminator = &*Itr; + ++Itr; + } + return FirstTerminator; +} + +bool BinaryBasicBlock::hasTerminatorAfter(MCInst *Pos) { + auto &BC = Function->getBinaryContext(); + auto Itr = rbegin(); + while (Itr != rend()) { + if (&*Itr == Pos) + return false; + if (BC.MIA->isTerminator(*Itr)) + return true; + ++Itr; + } + return false; +} + bool BinaryBasicBlock::swapConditionalSuccessors() { if (succ_size() != 2) return false; diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index ad1227279217..151ac321484c 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -617,20 +617,26 @@ class BinaryBasicBlock { return Instructions.erase(II); } + /// Retrieve iterator for \p Inst or return end iterator if instruction is not + /// from this basic block. 
+ decltype(Instructions)::iterator findInstruction(const MCInst *Inst) { + if (Instructions.empty()) + return Instructions.end(); + size_t Index = Inst - &Instructions[0]; + return Index >= Instructions.size() ? Instructions.end() + : Instructions.begin() + Index; + } + /// Replace an instruction with a sequence of instructions. Returns true /// if the instruction to be replaced was found and replaced. template bool replaceInstruction(const MCInst *Inst, Itr Begin, Itr End) { - auto I = Instructions.end(); - auto B = Instructions.begin(); - while (I > B) { - --I; - if (&*I == Inst) { - adjustNumPseudos(*Inst, -1); - Instructions.insert(Instructions.erase(I), Begin, End); - adjustNumPseudos(Begin, End, 1); - return true; - } + auto I = findInstruction(Inst); + if (I != Instructions.end()) { + adjustNumPseudos(*Inst, -1); + Instructions.insert(Instructions.erase(I), Begin, End); + adjustNumPseudos(Begin, End, 1); + return true; } return false; } @@ -640,6 +646,23 @@ class BinaryBasicBlock { return replaceInstruction(Inst, Replacement.begin(), Replacement.end()); } + /// Insert \p NewInst before \p At, which must be an existing instruction in + /// this BB. Return a pointer to the newly inserted instruction. + iterator insertInstruction(iterator At, MCInst &&NewInst) { + adjustNumPseudos(NewInst, 1); + return Instructions.emplace(At, std::move(NewInst)); + } + + /// Helper to retrieve any terminators in \p BB before \p Pos. This is used + /// to skip CFI instructions and to retrieve the first terminator instruction + /// in basic blocks with two terminators (conditional jump and unconditional + /// jump). + MCInst *getTerminatorBefore(MCInst *Pos); + + /// Used to identify whether an instruction is before a terminator and whether + /// moving it to the end of the BB would render it dead code. + bool hasTerminatorAfter(MCInst *Pos); + /// Split apart the instructions in this basic block starting at Inst. 
/// The instructions following Inst are removed and returned in a vector. std::vector splitInstructions(const MCInst *Inst) { diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 55926e96f608..26f29ab6f71a 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -239,24 +239,57 @@ void BinaryContext::preprocessDebugInfo( } } -void BinaryContext::printCFI(raw_ostream &OS, uint32_t Operation) { - switch(Operation) { - case MCCFIInstruction::OpSameValue: OS << "OpSameValue"; break; - case MCCFIInstruction::OpRememberState: OS << "OpRememberState"; break; - case MCCFIInstruction::OpRestoreState: OS << "OpRestoreState"; break; - case MCCFIInstruction::OpOffset: OS << "OpOffset"; break; - case MCCFIInstruction::OpDefCfaRegister: OS << "OpDefCfaRegister"; break; - case MCCFIInstruction::OpDefCfaOffset: OS << "OpDefCfaOffset"; break; - case MCCFIInstruction::OpDefCfa: OS << "OpDefCfa"; break; - case MCCFIInstruction::OpRelOffset: OS << "OpRelOffset"; break; - case MCCFIInstruction::OpAdjustCfaOffset: OS << "OfAdjustCfaOffset"; break; - case MCCFIInstruction::OpEscape: OS << "OpEscape"; break; - case MCCFIInstruction::OpRestore: OS << "OpRestore"; break; - case MCCFIInstruction::OpUndefined: OS << "OpUndefined"; break; - case MCCFIInstruction::OpRegister: OS << "OpRegister"; break; - case MCCFIInstruction::OpWindowSave: OS << "OpWindowSave"; break; - case MCCFIInstruction::OpGnuArgsSize: OS << "OpGnuArgsSize"; break; - default: OS << "Op#" << Operation; break; +void BinaryContext::printCFI(raw_ostream &OS, const MCCFIInstruction &Inst) { + uint32_t Operation = Inst.getOperation(); + switch (Operation) { + case MCCFIInstruction::OpSameValue: + OS << "OpSameValue Reg" << Inst.getRegister(); + break; + case MCCFIInstruction::OpRememberState: + OS << "OpRememberState"; + break; + case MCCFIInstruction::OpRestoreState: + OS << "OpRestoreState"; + break; + case MCCFIInstruction::OpOffset: + OS << "OpOffset Reg" << Inst.getRegister() << " " << 
Inst.getOffset(); + break; + case MCCFIInstruction::OpDefCfaRegister: + OS << "OpDefCfaRegister Reg" << Inst.getRegister(); + break; + case MCCFIInstruction::OpDefCfaOffset: + OS << "OpDefCfaOffset " << Inst.getOffset(); + break; + case MCCFIInstruction::OpDefCfa: + OS << "OpDefCfa Reg" << Inst.getRegister() << " " << Inst.getOffset(); + break; + case MCCFIInstruction::OpRelOffset: + OS << "OpRelOffset"; + break; + case MCCFIInstruction::OpAdjustCfaOffset: + OS << "OfAdjustCfaOffset"; + break; + case MCCFIInstruction::OpEscape: + OS << "OpEscape"; + break; + case MCCFIInstruction::OpRestore: + OS << "OpRestore"; + break; + case MCCFIInstruction::OpUndefined: + OS << "OpUndefined"; + break; + case MCCFIInstruction::OpRegister: + OS << "OpRegister"; + break; + case MCCFIInstruction::OpWindowSave: + OS << "OpWindowSave"; + break; + case MCCFIInstruction::OpGnuArgsSize: + OS << "OpGnuArgsSize"; + break; + default: + OS << "Op#" << Operation; + break; } } @@ -274,7 +307,7 @@ void BinaryContext::printInstruction(raw_ostream &OS, uint32_t Offset = Instruction.getOperand(0).getImm(); OS << "\t!CFI\t$" << Offset << "\t; "; if (Function) - printCFI(OS, Function->getCFIFor(Instruction)->getOperation()); + printCFI(OS, *Function->getCFIFor(Instruction)); OS << "\n"; return; } diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 5b4d3169beae..fcc54e358cfa 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -143,6 +143,12 @@ class BinaryContext { const DataReader &DR; + /// Sum of execution count of all functions + uint64_t SumExecutionCount{0}; + + /// Number of functions with profile information + uint64_t NumProfiledFuncs{0}; + BinaryContext(std::unique_ptr Ctx, std::unique_ptr DwCtx, std::unique_ptr TheTriple, @@ -262,8 +268,19 @@ class BinaryContext { return Size; } + /// Return a function execution count threshold for determining whether the + /// the function is 'hot'. 
Consider it hot if count is above the average exec + /// count of profiled functions. + uint64_t getHotThreshold() const { + static uint64_t Threshold{0}; + if (Threshold == 0) { + Threshold = NumProfiledFuncs ? SumExecutionCount / NumProfiledFuncs : 1; + } + return Threshold; + } + /// Print the string name for a CFI operation. - static void printCFI(raw_ostream &OS, uint32_t Operation); + static void printCFI(raw_ostream &OS, const MCCFIInstruction &Inst); /// Print a single MCInst in native format. If Function is non-null, /// the instruction will be annotated with CFI and possibly DWARF line table diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 6bf637006923..410dc317b880 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -150,7 +150,7 @@ constexpr unsigned NoRegister = 0; constexpr const char *DynoStats::Desc[]; constexpr unsigned BinaryFunction::MinAlign; - + namespace { /// Gets debug line information for the instruction located at the given @@ -535,8 +535,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, for (auto &Elmt : OffsetToCFI) { OS << format(" %08x:\t", Elmt.first); assert(Elmt.second < FrameInstructions.size() && "Incorrect CFI offset"); - BinaryContext::printCFI(OS, - FrameInstructions[Elmt.second].getOperation()); + BinaryContext::printCFI(OS, FrameInstructions[Elmt.second]); OS << "\n"; } } else { @@ -544,7 +543,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, for (uint32_t I = 0, E = FrameInstructions.size(); I != E; ++I) { const MCCFIInstruction &CFI = FrameInstructions[I]; OS << format(" %d:\t", I); - BinaryContext::printCFI(OS, CFI.getOperation()); + BinaryContext::printCFI(OS, CFI); OS << "\n"; } } @@ -3442,6 +3441,54 @@ void BinaryFunction::updateLayout(LayoutType Type, updateLayoutIndices(); } +bool BinaryFunction::replaceJumpTableEntryIn(BinaryBasicBlock *BB, + BinaryBasicBlock *OldDest, + BinaryBasicBlock *NewDest) { + auto *Instr = 
BB->getLastNonPseudoInstr(); + if (!Instr || !BC.MIA->isIndirectBranch(*Instr)) + return false; + auto JTAddress = BC.MIA->getJumpTable(*Instr); + assert(JTAddress && "Invalid jump table address"); + auto *JT = getJumpTableContainingAddress(JTAddress); + assert(JT && "No jump table structure for this indirect branch"); + bool Patched = JT->replaceDestination(JTAddress, OldDest->getLabel(), + NewDest->getLabel()); + assert(Patched && "Invalid entry to be replaced in jump table"); + return true; +} + +BinaryBasicBlock *BinaryFunction::splitEdge(BinaryBasicBlock *From, + BinaryBasicBlock *To) { + // Create intermediate BB + MCSymbol *Tmp = BC.Ctx->createTempSymbol("SplitEdge", true); + auto NewBB = createBasicBlock(0, Tmp); + auto NewBBPtr = NewBB.get(); + + // Update "From" BB + auto I = From->succ_begin(); + auto BI = From->branch_info_begin(); + for (; I != From->succ_end(); ++I) { + if (*I == To) + break; + ++BI; + } + assert(I != From->succ_end() && "Invalid CFG edge in splitEdge!"); + uint64_t OrigCount{BI->Count}; + uint64_t OrigMispreds{BI->MispredictedCount}; + replaceJumpTableEntryIn(From, To, NewBBPtr); + From->replaceSuccessor(To, NewBBPtr, OrigCount, OrigMispreds); + + NewBB->addSuccessor(To, OrigCount, OrigMispreds); + NewBB->setExecutionCount(OrigCount); + NewBB->setIsCold(From->isCold()); + + // Update CFI and BB layout with new intermediate BB + std::vector> NewBBs; + NewBBs.emplace_back(std::move(NewBB)); + insertBasicBlocks(From, std::move(NewBBs), true, true); + return NewBBPtr; +} + bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol, uint64_t SymbolSize) const { // Some symbols are tolerated inside function bodies, others are not. 
@@ -3578,6 +3625,22 @@ BinaryFunction::JumpTable::getEntriesForAddress(const uint64_t Addr) const { return std::make_pair(StartIndex, EndIndex); } +bool BinaryFunction::JumpTable::replaceDestination(uint64_t JTAddress, + const MCSymbol *OldDest, + MCSymbol *NewDest) { + bool Patched{false}; + const auto Range = getEntriesForAddress(JTAddress); + for (auto I = &Entries[Range.first], E = &Entries[Range.second]; + I != E; ++I) { + auto &Entry = *I; + if (Entry == OldDest) { + Patched = true; + Entry = NewDest; + } + } + return Patched; +} + void BinaryFunction::JumpTable::updateOriginal(BinaryContext &BC) { // In non-relocation mode we have to emit jump tables in local sections. // This way we only overwrite them when a corresponding function is diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index eeb5c52ed389..affbd39ade44 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -624,6 +624,11 @@ class BinaryFunction { /// Total number of times this jump table was used. uint64_t Count{0}; + /// Change all entries of the jump table in \p JTAddress pointing to + /// \p OldDest to \p NewDest. Return false if unsuccessful. + bool replaceDestination(uint64_t JTAddress, const MCSymbol *OldDest, + MCSymbol *NewDest); + /// Update jump table at its original location. void updateOriginal(BinaryContext &BC); @@ -1368,6 +1373,21 @@ class BinaryFunction { /// new blocks into the CFG. This must be called after updateLayout. void updateCFIState(BinaryBasicBlock *Start, const unsigned NumNewBlocks); + /// Change \p OrigDest to \p NewDest in the jump table used at the end of + /// \p BB. Returns false if \p OrigDest couldn't be find as a valid target + /// and no replacement took place. + bool replaceJumpTableEntryIn(BinaryBasicBlock *BB, + BinaryBasicBlock *OldDest, + BinaryBasicBlock *NewDest); + + /// Split the CFG edge by inserting an intermediate basic block. + /// Returns a pointer to this new intermediate basic block. 
BB "From" will be + /// updated to jump to the intermediate block, which in turn will have an + /// unconditional branch to BB "To". + /// User needs to manually call fixBranches(). This function only creates the + /// correct CFG edges. + BinaryBasicBlock *splitEdge(BinaryBasicBlock *From, BinaryBasicBlock *To); + /// Determine direction of the branch based on the current layout. /// Callee is responsible of updating basic block indices prior to using /// this function (e.g. by calling BinaryFunction::updateLayoutIndices()). diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 0dac7f0b1b04..eda6c575b3bc 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -10,6 +10,7 @@ //===----------------------------------------------------------------------===// #include "BinaryPassManager.h" +#include "Passes/AllocCombiner.h" #include "Passes/FrameOptimizer.h" #include "Passes/IndirectCallPromotion.h" #include "Passes/Inliner.h" @@ -62,12 +63,6 @@ OptimizeBodylessFunctions("optimize-bodyless-functions", cl::ZeroOrMore, cl::cat(BoltOptCategory)); -static cl::opt -OptimizeFrameAccesses("frame-opt", - cl::desc("optimize stack frame accesses"), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - static cl::opt Peepholes("peepholes", cl::desc("run peephole optimizations"), @@ -331,9 +326,6 @@ void BinaryFunctionPassManager::runAllPasses( // fix branches consistency internally. Manager.registerPass(llvm::make_unique(PrintAfterBranchFixup)); - Manager.registerPass(llvm::make_unique(PrintFOP), - OptimizeFrameAccesses); - // This pass should come close to last since it uses the estimated hot // size of a function to determine the order. It should definitely // also happen after any changes to the call graph are made, e.g. inlining. 
@@ -356,6 +348,14 @@ void BinaryFunctionPassManager::runAllPasses( // This pass should always run last.* Manager.registerPass(llvm::make_unique(PrintFinalized)); + // FrameOptimizer has an implicit dependency on FinalizeFunctions. + // FrameOptimizer move values around and needs to update CFIs. To do this, it + // must read CFI, interpret it and rewrite it, so CFIs need to be correctly + // placed according to the final layout. + Manager.registerPass(llvm::make_unique(PrintFOP)); + + Manager.registerPass(llvm::make_unique(PrintFOP)); + // *except for this pass. This pass turns tail calls into jumps which // makes them invisible to function reordering. Manager.registerPass( diff --git a/bolt/Passes/AllocCombiner.cpp b/bolt/Passes/AllocCombiner.cpp new file mode 100644 index 000000000000..6d9c82732012 --- /dev/null +++ b/bolt/Passes/AllocCombiner.cpp @@ -0,0 +1,116 @@ +#include "AllocCombiner.h" + +#define DEBUG_TYPE "alloccombiner" + +using namespace llvm; + +namespace opts { +extern bool shouldProcess(const bolt::BinaryFunction &Function); + +extern cl::opt FrameOptimization; + +} // end namespace opts + +namespace llvm { +namespace bolt { + +namespace { + +bool getStackAdjustmentSize(const BinaryContext &BC, const MCInst &Inst, + int64_t &Adjustment) { + return BC.MIA->evaluateSimple(Inst, Adjustment, + std::make_pair(BC.MIA->getStackPointer(), 0LL), + std::make_pair(0, 0LL)); +} + +bool isIndifferentToSP(const MCInst &Inst, const BinaryContext &BC) { + if (BC.MIA->isCFI(Inst)) + return true; + + const auto II = BC.MII->get(Inst.getOpcode()); + if (BC.MIA->isTerminator(Inst) || + II.hasImplicitDefOfPhysReg(BC.MIA->getStackPointer(), BC.MRI.get()) || + II.hasImplicitUseOfPhysReg(BC.MIA->getStackPointer())) + return false; + + for (int I = 0, E = Inst.getNumOperands(); I != E; ++I) { + const auto &Operand = Inst.getOperand(I); + if (Operand.isReg() && Operand.getReg() == BC.MIA->getStackPointer()) { + return false; + } + } + return true; +} + +bool 
shouldProc(BinaryFunction &Function) { + return Function.isSimple() && Function.hasCFG() && + opts::shouldProcess(Function) && (Function.getSize() > 0); +} + +void runForAllWeCare(std::map &BFs, + std::function Task) { + for (auto &It : BFs) { + auto &Function = It.second; + if (shouldProc(Function)) + Task(Function); + } +} + +} // end anonymous namespace + +void AllocCombinerPass::combineAdjustments(BinaryContext &BC, + BinaryFunction &BF) { + for (auto &BB : BF) { + MCInst *Prev = nullptr; + for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { + auto &Inst = *I; + if (isIndifferentToSP(Inst, BC)) + continue; // Skip updating Prev + + int64_t Adjustment{0LL}; + if (!Prev || !BC.MIA->isStackAdjustment(Inst) || + !BC.MIA->isStackAdjustment(*Prev) || + !getStackAdjustmentSize(BC, *Prev, Adjustment)) { + Prev = &Inst; + continue; + } + + DEBUG({ + dbgs() << "At \"" << BF.getPrintName() << "\", combining: \n"; + Inst.dump(); + Prev->dump(); + dbgs() << "Adjustment: " << Adjustment << "\n"; + }); + + if (BC.MIA->isSUB(Inst)) + Adjustment = -Adjustment; + + BC.MIA->addToImm(Inst, Adjustment, BC.Ctx.get()); + + DEBUG({ + dbgs() << "After adjustment:\n"; + Inst.dump(); + }); + + BB.eraseInstruction(Prev); + ++NumCombined; + Prev = &Inst; + } + } +} + +void AllocCombinerPass::runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) { + if (opts::FrameOptimization == FOP_NONE) + return; + + runForAllWeCare( + BFs, [&](BinaryFunction &Function) { combineAdjustments(BC, Function); }); + + outs() << "BOLT-INFO: Allocation combiner: " << NumCoalesced + << " empty spaces coalesced.\n"; +} + +} // end namespace bolt +} // end namespace llvm diff --git a/bolt/Passes/AllocCombiner.h b/bolt/Passes/AllocCombiner.h new file mode 100644 index 000000000000..1be39974be3c --- /dev/null +++ b/bolt/Passes/AllocCombiner.h @@ -0,0 +1,48 @@ +//===--- Passes/AllocCombiner.h -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// 
+// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEDEFRAG_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEDEFRAG_H + +#include "BinaryPasses.h" +#include "DataflowInfoManager.h" + +namespace llvm { +namespace bolt { + +class AllocCombinerPass : public BinaryFunctionPass { + /// Stats aggregating variables + uint64_t NumCombined{0}; + uint64_t NumCoalesced{0}; + + void combineAdjustments(BinaryContext &BC, BinaryFunction &BF); + void coalesceEmptySpace(BinaryContext &BC, BinaryFunction &BF, + DataflowInfoManager &Info, FrameAnalysis &FA); + +public: + explicit AllocCombinerPass(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) {} + + const char *getName() const override { + return "alloc-combiner"; + } + + /// Pass entry point + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + +} // namespace bolt +} // namespace llvm + + +#endif diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index d04b64e3955c..0d832cc90b5f 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -584,9 +584,11 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, auto BI = PredBB->branch_info_begin(); std::swap(*BI, *(BI + 1)); } else { - // Change destination of the unconditional branch. + // Change destination of the conditional branch. MIA->replaceBranchTarget(*CondBranch, CalleeSymbol, BC.Ctx.get()); } + // Annotate it, so "isCall" returns true for this jcc + MIA->addAnnotation(BC.Ctx.get(), *CondBranch, "IsCTC", true); // Remove the unused successor which may be eliminated later // if there are no other users. 
diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index 2a25ec656ebd..d80876ca79b4 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -359,6 +359,12 @@ class StripRepRet : public BinaryFunctionPass { std::set &LargeFunctions) override; }; +enum FrameOptimizationType : char { + FOP_NONE, /// Don't perform FOP. + FOP_HOT, /// Perform FOP on hot functions. + FOP_ALL /// Perform FOP on all functions. +}; + } // namespace bolt } // namespace llvm diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt index 3fc9336f29c6..7d9714893c45 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/Passes/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_library(LLVMBOLTPasses + AllocCombiner.cpp BinaryPasses.cpp BinaryFunctionCallGraph.cpp CallGraph.cpp @@ -14,7 +15,11 @@ add_llvm_library(LLVMBOLTPasses PettisAndHansen.cpp ReorderAlgorithm.cpp ReorderFunctions.cpp + ShrinkWrapping.cpp + StackAllocationAnalysis.cpp + StackAvailableExpressions.cpp StackPointerTracking.cpp + StackReachingUses.cpp ) include_directories( ${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt ) diff --git a/bolt/Passes/DataflowAnalysis.h b/bolt/Passes/DataflowAnalysis.h index 1252be07eaa2..e28d2b085c29 100644 --- a/bolt/Passes/DataflowAnalysis.h +++ b/bolt/Passes/DataflowAnalysis.h @@ -265,12 +265,13 @@ class DataflowAnalysis { return getStateAt(*Point.getInst()); } + /// Relies on a ptr map to fetch the previous instruction and then retrieve + /// state. WARNING: Watch out for invalidated pointers. Do not use this + /// function if you invalidated pointers after the analysis has been completed ErrorOr getStateBefore(const MCInst &Point) { return getStateAt(PrevPoint[&Point]); } - /// Return the in set (out set) of a given program point if the direction of - /// the dataflow is forward (backward). 
ErrorOrgetStateBefore(ProgramPoint Point) { if (Point.isBB()) return getStateAt(*Point.getBB()); @@ -491,6 +492,25 @@ class InstrsDataflowAnalysis /// Maps expressions defs (MCInsts) to its index in the Expressions vector std::unordered_map ExprToIdx; + /// Return whether \p Expr is in the state set at \p Point + bool count(ProgramPoint Point, const MCInst &Expr) const { + auto IdxIter = ExprToIdx.find(&Expr); + assert (IdxIter != ExprToIdx.end() && "Invalid Expr"); + return (*this->getStateAt(Point))[IdxIter->second]; + } + + bool count(const MCInst &Point, const MCInst &Expr) const { + auto IdxIter = ExprToIdx.find(&Expr); + assert (IdxIter != ExprToIdx.end() && "Invalid Expr"); + return (*this->getStateAt(Point))[IdxIter->second]; + } + + /// Return whether \p Expr is in the state set at the instr of index + /// \p PointIdx + bool count(unsigned PointIdx, const MCInst &Expr) const { + return count(*Expressions[PointIdx], Expr); + } + InstrsDataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF) : DataflowAnalysis(BC, BF) {} virtual ~InstrsDataflowAnalysis() {} diff --git a/bolt/Passes/DataflowInfoManager.cpp b/bolt/Passes/DataflowInfoManager.cpp index 0c4cdbe99e06..e280c1554b3d 100644 --- a/bolt/Passes/DataflowInfoManager.cpp +++ b/bolt/Passes/DataflowInfoManager.cpp @@ -20,10 +20,7 @@ ReachingDefOrUse &DataflowInfoManager::getReachingDefs() { return *RD; assert(FA && "FrameAnalysis required"); RD.reset(new ReachingDefOrUse(*FA, BC, BF)); - { - NamedRegionTimer T1("RD", "Dataflow", true); - RD->run(); - } + RD->run(); return *RD; } @@ -36,10 +33,7 @@ ReachingDefOrUse &DataflowInfoManager::getReachingUses() { return *RU; assert(FA && "FrameAnalysis required"); RU.reset(new ReachingDefOrUse(*FA, BC, BF)); - { - NamedRegionTimer T1("RU", "Dataflow", true); - RU->run(); - } + RU->run(); return *RU; } @@ -52,10 +46,7 @@ LivenessAnalysis &DataflowInfoManager::getLivenessAnalysis() { return *LA; assert(FA && "FrameAnalysis required"); LA.reset(new 
LivenessAnalysis(*FA, BC, BF)); - { - NamedRegionTimer T1("LA", "Dataflow", true); - LA->run(); - } + LA->run(); return *LA; } @@ -63,14 +54,24 @@ void DataflowInfoManager::invalidateLivenessAnalysis() { LA.reset(nullptr); } +StackReachingUses &DataflowInfoManager::getStackReachingUses() { + if (SRU) + return *SRU; + assert(FA && "FrameAnalysis required"); + SRU.reset(new StackReachingUses(*FA, BC, BF)); + SRU->run(); + return *SRU; +} + +void DataflowInfoManager::invalidateStackReachingUses() { + SRU.reset(nullptr); +} + DominatorAnalysis &DataflowInfoManager::getDominatorAnalysis() { if (DA) return *DA; DA.reset(new DominatorAnalysis(BC, BF)); - { - NamedRegionTimer T1("DA", "Dataflow", true); - DA->run(); - } + DA->run(); return *DA; } @@ -82,10 +83,7 @@ DominatorAnalysis &DataflowInfoManager::getPostDominatorAnalysis() { if (PDA) return *PDA; PDA.reset(new DominatorAnalysis(BC, BF)); - { - NamedRegionTimer T1("PDA", "Dataflow", true); - PDA->run(); - } + PDA->run(); return *PDA; } @@ -97,14 +95,12 @@ StackPointerTracking &DataflowInfoManager::getStackPointerTracking() { if (SPT) return *SPT; SPT.reset(new StackPointerTracking(BC, BF)); - { - NamedRegionTimer T1("SPT", "Dataflow", true); - SPT->run(); - } + SPT->run(); return *SPT; } void DataflowInfoManager::invalidateStackPointerTracking() { + invalidateStackAllocationAnalysis(); SPT.reset(nullptr); } @@ -112,10 +108,7 @@ ReachingInsns &DataflowInfoManager::getReachingInsns() { if (RI) return *RI; RI.reset(new ReachingInsns(BC, BF)); - { - NamedRegionTimer T1("RI", "Dataflow", true); - RI->run(); - } + RI->run(); return *RI; } @@ -127,10 +120,7 @@ ReachingInsns &DataflowInfoManager::getReachingInsnsBackwards() { if (RIB) return *RIB; RIB.reset(new ReachingInsns(BC, BF)); - { - NamedRegionTimer T1("RIB", "Dataflow", true); - RIB->run(); - } + RIB->run(); return *RIB; } @@ -138,6 +128,18 @@ void DataflowInfoManager::invalidateReachingInsnsBackwards() { RIB.reset(nullptr); } +StackAllocationAnalysis 
&DataflowInfoManager::getStackAllocationAnalysis() { + if (SAA) + return *SAA; + SAA.reset(new StackAllocationAnalysis(BC, BF, getStackPointerTracking())); + SAA->run(); + return *SAA; +} + +void DataflowInfoManager::invalidateStackAllocationAnalysis() { + SAA.reset(nullptr); +} + std::unordered_map & DataflowInfoManager::getInsnToBBMap() { if (InsnToBB) @@ -158,11 +160,13 @@ void DataflowInfoManager::invalidateAll() { invalidateReachingDefs(); invalidateReachingUses(); invalidateLivenessAnalysis(); + invalidateStackReachingUses(); invalidateDominatorAnalysis(); invalidatePostDominatorAnalysis(); invalidateStackPointerTracking(); invalidateReachingInsns(); invalidateReachingInsnsBackwards(); + invalidateStackAllocationAnalysis(); invalidateInsnToBBMap(); } diff --git a/bolt/Passes/DataflowInfoManager.h b/bolt/Passes/DataflowInfoManager.h index a9ef9f7d897d..34a6b64bef15 100644 --- a/bolt/Passes/DataflowInfoManager.h +++ b/bolt/Passes/DataflowInfoManager.h @@ -14,10 +14,12 @@ #include "FrameAnalysis.h" #include "ReachingDefOrUse.h" +#include "StackReachingUses.h" #include "DominatorAnalysis.h" #include "StackPointerTracking.h" #include "ReachingInsns.h" #include "LivenessAnalysis.h" +#include "StackAllocationAnalysis.h" namespace llvm { namespace bolt { @@ -33,11 +35,13 @@ class DataflowInfoManager { std::unique_ptr> RD; std::unique_ptr> RU; std::unique_ptr LA; + std::unique_ptr SRU; std::unique_ptr> DA; std::unique_ptr> PDA; std::unique_ptr SPT; std::unique_ptr> RI; std::unique_ptr> RIB; + std::unique_ptr SAA; std::unique_ptr> InsnToBB; @@ -45,12 +49,20 @@ class DataflowInfoManager { DataflowInfoManager(const FrameAnalysis *FA, const BinaryContext &BC, BinaryFunction &BF) : FA(FA), BC(BC), BF(BF) {}; + /// Helper function to fetch the parent BB associated with a program point + /// If PP is a BB itself, then return itself (cast to a BinaryBasicBlock) + BinaryBasicBlock *getParentBB(ProgramPoint PP) { + return PP.isBB() ? 
PP.getBB() : getInsnToBBMap()[PP.getInst()]; + } + ReachingDefOrUse &getReachingDefs(); void invalidateReachingDefs(); ReachingDefOrUse &getReachingUses(); void invalidateReachingUses(); LivenessAnalysis &getLivenessAnalysis(); void invalidateLivenessAnalysis(); + StackReachingUses &getStackReachingUses(); + void invalidateStackReachingUses(); DominatorAnalysis &getDominatorAnalysis(); void invalidateDominatorAnalysis(); DominatorAnalysis &getPostDominatorAnalysis(); @@ -61,6 +73,8 @@ class DataflowInfoManager { void invalidateReachingInsns(); ReachingInsns &getReachingInsnsBackwards(); void invalidateReachingInsnsBackwards(); + StackAllocationAnalysis &getStackAllocationAnalysis(); + void invalidateStackAllocationAnalysis(); std::unordered_map &getInsnToBBMap(); void invalidateInsnToBBMap(); void invalidateAll(); diff --git a/bolt/Passes/DominatorAnalysis.h b/bolt/Passes/DominatorAnalysis.h index 87eef5f7662f..4abc508e78f0 100644 --- a/bolt/Passes/DominatorAnalysis.h +++ b/bolt/Passes/DominatorAnalysis.h @@ -13,6 +13,7 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_DOMINATORANALYSIS_H #include "DataflowAnalysis.h" +#include "llvm/Support/Timer.h" namespace llvm { namespace bolt { @@ -60,13 +61,21 @@ class DominatorAnalysis return Result; } - bool doesADominatesB(const MCInst &A, const MCInst &B) { - return (*this->getStateAt(B))[this->ExprToIdx[&A]]; + bool doesADominateB(const MCInst &A, unsigned BIdx) { + return this->count(BIdx, A); } - bool doesADominatesB(ProgramPoint A, const MCInst &B) { + bool doesADominateB(const MCInst &A, const MCInst &B) { + return this->count(B, A); + } + + bool doesADominateB(const MCInst &A, ProgramPoint B) { + return this->count(B, A); + } + + bool doesADominateB(ProgramPoint A, const MCInst &B) { if (A.isInst()) - return doesADominatesB(*A.getInst(), B); + return doesADominateB(*A.getInst(), B); // This analysis keep track of which instructions dominates another // instruction, it doesn't keep track of BBs. 
So we need a non-empty @@ -79,7 +88,7 @@ class DominatorAnalysis BB = *BB->succ_begin(); } const MCInst &InstA = *BB->begin(); - return doesADominatesB(InstA, B); + return doesADominateB(InstA, B); } void doForAllDominators(const MCInst &Inst, @@ -89,6 +98,11 @@ class DominatorAnalysis } } + void run() { + NamedRegionTimer T1("DA", "Dataflow", true); + InstrsDataflowAnalysis, Backward>::run(); + } + private: void preflight() { // Populate our universe of tracked expressions with all instructions diff --git a/bolt/Passes/FrameAnalysis.cpp b/bolt/Passes/FrameAnalysis.cpp index 7c0110926381..38d770ad679a 100644 --- a/bolt/Passes/FrameAnalysis.cpp +++ b/bolt/Passes/FrameAnalysis.cpp @@ -215,6 +215,12 @@ class FrameAccessAnalysis { void FrameAnalysis::addArgAccessesFor(const BinaryContext &BC, MCInst &Inst, ArgAccesses &&AA) { + if (auto OldAA = getArgAccessesFor(BC, Inst)) { + if (OldAA->AssumeEverything) + return; + *OldAA = std::move(AA); + return; + } if (AA.AssumeEverything) { // Index 0 in ArgAccessesVector represents an "assumeeverything" entry BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "ArgAccessEntry", 0U); @@ -222,7 +228,7 @@ void FrameAnalysis::addArgAccessesFor(const BinaryContext &BC, MCInst &Inst, } BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "ArgAccessEntry", (unsigned)ArgAccessesVector.size()); - ArgAccessesVector.emplace_back(AA); + ArgAccessesVector.emplace_back(std::move(AA)); } void FrameAnalysis::addArgInStackAccessFor(const BinaryContext &BC, @@ -329,29 +335,39 @@ BitVector FrameAnalysis::getFunctionClobberList(const BinaryContext &BC, void FrameAnalysis::buildClobberMap(const BinaryContext &BC) { std::queue Queue; + std::set InQueue; for (auto *Func : TopologicalCGOrder) { Queue.push(Func); + InQueue.insert(Func); } while (!Queue.empty()) { auto *Func = Queue.front(); Queue.pop(); + InQueue.erase(Func); BitVector RegsKilled = getFunctionClobberList(BC, Func); - bool Updated = ClobberAnalysisOnly ? 
false : computeArgsAccessed(BC, *Func); + bool ArgsUpdated = ClobberAnalysisOnly ? false : computeArgsAccessed(BC, *Func); + bool RegsUpdated = false; if (RegsKilledMap.find(Func) == RegsKilledMap.end()) { RegsKilledMap[Func] = std::move(RegsKilled); - continue; + } else { + RegsUpdated = RegsKilledMap[Func] != RegsKilled; + if (RegsUpdated) + RegsKilledMap[Func] = std::move(RegsKilled); } - if (RegsKilledMap[Func] != RegsKilled || Updated) { + if (RegsUpdated || ArgsUpdated) { for (auto Caller : Cg.predecessors(Cg.getNodeId(Func))) { - Queue.push(Cg.nodeIdToFunc(Caller)); + BinaryFunction *CallerFunc = Cg.nodeIdToFunc(Caller); + if (!InQueue.count(CallerFunc)) { + InQueue.insert(CallerFunc); + Queue.push(CallerFunc); + } } } - RegsKilledMap[Func] = std::move(RegsKilled); } if (opts::Verbosity == 0) { @@ -453,10 +469,11 @@ bool FrameAnalysis::updateArgsTouchedFor(const BinaryContext &BC, break; } DEBUG(dbgs() << "Added arg in stack access annotation " - << CurOffset + Elem.first << "\n"); + << CurOffset + Elem.first << "\n"); addArgInStackAccessFor( - BC, Inst, ArgInStackAccess{/*StackOffset=*/CurOffset + Elem.first, - /*Size=*/Elem.second}); + BC, Inst, + ArgInStackAccess{/*StackOffset=*/CurOffset + Elem.first, + /*Size=*/Elem.second}); } return Changed; } diff --git a/bolt/Passes/FrameOptimizer.cpp b/bolt/Passes/FrameOptimizer.cpp index 7f02b840ba64..4662cf87515b 100644 --- a/bolt/Passes/FrameOptimizer.cpp +++ b/bolt/Passes/FrameOptimizer.cpp @@ -10,6 +10,11 @@ //===----------------------------------------------------------------------===// #include "FrameOptimizer.h" +#include "FrameAnalysis.h" +#include "ShrinkWrapping.h" +#include "StackAvailableExpressions.h" +#include "StackReachingUses.h" +#include "llvm/Support/Timer.h" #include #include @@ -19,616 +24,34 @@ using namespace llvm; namespace opts { extern cl::opt Verbosity; -} - -namespace llvm { -namespace bolt { - -void FrameOptimizerPass::getInstClobberList(const BinaryContext &BC, - const MCInst &Inst, - 
BitVector &KillSet) const { - if (!BC.MIA->isCall(Inst)) { - BC.MIA->getClobberedRegs(Inst, KillSet, *BC.MRI); - return; - } - - const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst); - // If indirect call, kill set should have all elements - if (TargetSymbol == nullptr) { - KillSet.set(0, KillSet.size()); - return; - } - - const auto *Function = BC.getFunctionForSymbol(TargetSymbol); - if (Function == nullptr) { - // Call to a function without a BinaryFunction object. - // This should be a call to a PLT entry, and since it is a trampoline to - // a DSO, we can't really know the code in advance. Conservatively assume - // everything is clobbered. - KillSet.set(0, KillSet.size()); - return; - } - auto BV = RegsKilledMap.find(Function); - if (BV != RegsKilledMap.end()) { - KillSet |= BV->second; - return; - } - // Ignore calls to function whose clobber list wasn't yet calculated. This - // instruction will be evaluated again once we have info for the callee. - return; -} - -BitVector -FrameOptimizerPass::getFunctionClobberList(const BinaryContext &BC, - const BinaryFunction *Func) { - BitVector RegsKilled = BitVector(BC.MRI->getNumRegs(), false); - - if (!Func->isSimple() || !shouldOptimize(*Func)) { - RegsKilled.set(0, RegsKilled.size()); - return RegsKilled; - } - - for (const auto &BB : *Func) { - for (const auto &Inst : BB) { - getInstClobberList(BC, Inst, RegsKilled); - } - } - - return RegsKilled; -} - -void FrameOptimizerPass::buildClobberMap(const BinaryContext &BC) { - std::queue Queue; - - for (auto *Func : TopologicalCGOrder) { - Queue.push(Func); - } - - while (!Queue.empty()) { - auto *Func = Queue.front(); - Queue.pop(); - - BitVector RegsKilled = getFunctionClobberList(BC, Func); - - if (RegsKilledMap.find(Func) == RegsKilledMap.end()) { - RegsKilledMap[Func] = std::move(RegsKilled); - continue; - } - - if (RegsKilledMap[Func] != RegsKilled) { - for (auto Caller : Cg.predecessors(Cg.getNodeId(Func))) { - Queue.push(Cg.nodeIdToFunc(Caller)); - } - } 
- RegsKilledMap[Func] = std::move(RegsKilled); - } - - if (opts::Verbosity == 0) { -#ifndef NDEBUG - if (!DebugFlag || !isCurrentDebugType("fop")) - return; -#else - return; -#endif - } - - // This loop is for computing statistics only - for (auto *Func : TopologicalCGOrder) { - auto Iter = RegsKilledMap.find(Func); - assert(Iter != RegsKilledMap.end() && - "Failed to compute all clobbers list"); - if (Iter->second.all()) { - auto Count = Func->getExecutionCount(); - if (Count != BinaryFunction::COUNT_NO_PROFILE) - CountFunctionsAllClobber += Count; - ++NumFunctionsAllClobber; - } - DEBUG_WITH_TYPE("fop", - dbgs() << "Killed regs set for func: " << Func->getPrintName() << "\n"; - const BitVector &RegsKilled = Iter->second; - int RegIdx = RegsKilled.find_first(); - while (RegIdx != -1) { - dbgs() << "\tREG" << RegIdx; - RegIdx = RegsKilled.find_next(RegIdx); - }; - dbgs() << "\n"; - ); - } -} - -namespace { - -template -class ForwardDataflow { -protected: - /// Reference to the function being analysed - const BinaryFunction &Func; - - /// Tracks the set of available exprs at the end of each MCInst in this - /// function - std::unordered_map StateAtPoint; - /// Tracks the set of available exprs at basic block start - std::unordered_map StateAtBBEntry; - - virtual void preflight() = 0; - - virtual StateTy getStartingStateAtBB(const BinaryBasicBlock &BB) = 0; - - virtual StateTy getStartingStateAtPoint(const MCInst &Point) = 0; - - virtual void doConfluence(StateTy &StateOut, const StateTy &StateIn) = 0; - - virtual StateTy computeNext(const MCInst &Point, const StateTy &Cur) = 0; - -public: - ForwardDataflow(const BinaryFunction &BF) : Func(BF) {} - virtual ~ForwardDataflow() {} - - ErrorOrgetStateAt(const BinaryBasicBlock &BB) const { - auto Iter = StateAtBBEntry.find(&BB); - if (Iter == StateAtBBEntry.end()) - return make_error_code(errc::result_out_of_range); - return Iter->second; - } - - ErrorOrgetStateAt(const MCInst &Point) const { - auto Iter = 
StateAtPoint.find(&Point); - if (Iter == StateAtPoint.end()) - return make_error_code(errc::result_out_of_range); - return Iter->second; - } - - void run() { - preflight(); - - // Initialize state for all points of the function - for (auto &BB : Func) { - StateAtBBEntry[&BB] = getStartingStateAtBB(BB); - for (auto &Inst : BB) { - StateAtPoint[&Inst] = getStartingStateAtPoint(Inst); - } - } - assert(Func.begin() != Func.end() && "Unexpected empty function"); - - std::queue Worklist; - // TODO: Pushing this in a DFS ordering will greatly speed up the dataflow - // performance. - for (auto &BB : Func) { - Worklist.push(&BB); - } - - // Main dataflow loop - while (!Worklist.empty()) { - auto *BB = Worklist.front(); - Worklist.pop(); - - DEBUG(dbgs() << "\tNow at BB " << BB->getName() << "\n"); - - // Calculate state at the entry of first instruction in BB - StateTy &StateAtEntry = StateAtBBEntry[BB]; - for (auto I = BB->pred_begin(), E = BB->pred_end(); I != E; ++I) { - auto Last = (*I)->rbegin(); - if (Last != (*I)->rend()) { - doConfluence(StateAtEntry, StateAtPoint[&*Last]); - } else { - doConfluence(StateAtEntry, StateAtBBEntry[*I]); - } - } - // Skip empty - if (BB->begin() == BB->end()) - continue; - - // Propagate information from first instruction down to the last one - bool Changed = false; - StateTy *PrevState = &StateAtEntry; - const MCInst *LAST = &*BB->rbegin(); - for (auto &Inst : *BB) { - DEBUG(dbgs() << "\t\tNow at "); - DEBUG(Inst.dump()); - - StateTy CurState = computeNext(Inst, *PrevState); - - if (StateAtPoint[&Inst] != CurState) { - StateAtPoint[&Inst] = CurState; - if (&Inst == LAST) - Changed = true; - } - PrevState = &StateAtPoint[&Inst]; - } - - if (Changed) { - for (auto I = BB->succ_begin(), E = BB->succ_end(); I != E; ++I) { - Worklist.push(*I); - } - } - } - } -}; - -class StackAvailableExpressions : public ForwardDataflow { -public: - StackAvailableExpressions(const FrameOptimizerPass &FOP, - const BinaryContext &BC, const BinaryFunction 
&BF) - : ForwardDataflow(BF), FOP(FOP), FrameIndexMap(FOP.FrameIndexMap), - BC(BC) {} - virtual ~StackAvailableExpressions() {} - - /// Define an iterator for navigating the expressions calculated by the - /// dataflow at each program point - class ExprIterator - : public std::iterator { - public: - ExprIterator &operator++() { - assert(Idx != -1 && "Iterator already at the end"); - Idx = BV->find_next(Idx); - return *this; - } - ExprIterator operator++(int) { - assert(Idx != -1 && "Iterator already at the end"); - ExprIterator Ret = *this; - ++(*this); - return Ret; - } - bool operator==(ExprIterator Other) const { return Idx == Other.Idx; } - bool operator!=(ExprIterator Other) const { return Idx != Other.Idx; } - const MCInst *operator*() { - assert(Idx != -1 && "Invalid access to end iterator"); - return Expressions[Idx]; - } - ExprIterator(const BitVector *BV, const std::vector &Exprs) - : BV(BV), Expressions(Exprs) { - Idx = BV->find_first(); - } - ExprIterator(const BitVector *BV, const std::vector &Exprs, - int Idx) - : BV(BV), Expressions(Exprs), Idx(Idx) {} - - private: - const BitVector *BV; - const std::vector &Expressions; - public: - int Idx; - }; - ExprIterator expr_begin(const BitVector &BV) const { - return ExprIterator(&BV, Expressions); - } - ExprIterator expr_begin(const MCInst &Point) const { - auto Iter = StateAtPoint.find(&Point); - if (Iter == StateAtPoint.end()) - return expr_end(); - return ExprIterator(&Iter->second, Expressions); - } - ExprIterator expr_begin(const BinaryBasicBlock &BB) const { - auto Iter = StateAtBBEntry.find(&BB); - if (Iter == StateAtBBEntry.end()) - return expr_end(); - return ExprIterator(&Iter->second, Expressions); - } - ExprIterator expr_end() const { - return ExprIterator(nullptr, Expressions, -1); - } - -private: - /// Reference to the result of stack frame analysis - const FrameOptimizerPass &FOP; - const FrameOptimizerPass::FrameIndexMapTy &FrameIndexMap; - const BinaryContext &BC; - - /// Used to size the 
set of expressions/definitions being tracked by the - /// dataflow analysis - uint64_t NumInstrs{0}; - /// We put every MCInst we want to track (which one representing an - /// expression/def) into a vector because we need to associate them with - /// small numbers. They will be tracked via BitVectors throughout the - /// dataflow analysis. - std::vector Expressions; - /// Maps expressions defs (MCInsts) to its index in the Expressions vector - std::unordered_map ExprToIdx; - - void preflight() override { - DEBUG(dbgs() << "Starting StackAvailableExpressions on \"" - << Func.getPrintName() << "\"\n"); - - // Populate our universe of tracked expressions. We are interested in - // tracking available stores to frame position at any given point of the - // program. - for (auto &BB : Func) { - for (auto &Inst : BB) { - auto FIEIter = FrameIndexMap.find(&Inst); - if (FIEIter == FrameIndexMap.end()) - continue; - const auto &FIE = FIEIter->second; - if (FIE.IsLoad == false && FIE.IsSimple == true) { - Expressions.push_back(&Inst); - ExprToIdx[&Inst] = NumInstrs++; - } - } - } - } - - BitVector getStartingStateAtBB(const BinaryBasicBlock &BB) override { - // Entry points start with empty set (Function entry and landing pads). - // All others start with the full set. - if (BB.pred_size() == 0) - return BitVector(NumInstrs, false); - return BitVector(NumInstrs, true); - } - - BitVector getStartingStateAtPoint(const MCInst &Point) override { - return BitVector(NumInstrs, true); - } - - void doConfluence(BitVector &StateOut, const BitVector &StateIn) override { - StateOut &= StateIn; - } - - /// Define the function computing the kill set -- whether expression Y, a - /// tracked expression, will be considered to be dead after executing X. 
- bool doesXKillsY(const MCInst *X, const MCInst *Y) { - // if both are stores, and both store to the same stack location, return - // true - auto FIEIterX = FrameIndexMap.find(X); - auto FIEIterY = FrameIndexMap.find(Y); - if (FIEIterX != FrameIndexMap.end() && FIEIterY != FrameIndexMap.end()) { - const FrameOptimizerPass::FrameIndexEntry &FIEX = FIEIterX->second; - const FrameOptimizerPass::FrameIndexEntry &FIEY = FIEIterY->second;; - if (FIEX.IsLoad == 0 && FIEY.IsLoad == 0 && - FIEX.StackOffset + FIEX.Size > FIEY.StackOffset && - FIEX.StackOffset < FIEY.StackOffset + FIEY.Size) - return true; - } - // getClobberedRegs for X and Y. If they intersect, return true - BitVector XClobbers = BitVector(BC.MRI->getNumRegs(), false); - BitVector YClobbers = BitVector(BC.MRI->getNumRegs(), false); - FOP.getInstClobberList(BC, *X, XClobbers); - // If Y is a store to stack, its clobber list is its source reg. This is - // different than the rest because we want to check if the store source - // reaches its corresponding load untouched. 
- if (FIEIterY != FrameIndexMap.end() && FIEIterY->second.IsLoad == 0 && - FIEIterY->second.IsStoreFromReg) { - YClobbers.set(FIEIterY->second.RegOrImm); - } else { - FOP.getInstClobberList(BC, *Y, YClobbers); - } - XClobbers &= YClobbers; - return XClobbers.any(); - } - - BitVector computeNext(const MCInst &Point, const BitVector &Cur) override { - BitVector Next = Cur; - // Kill - for (auto I = expr_begin(Next), E = expr_end(); I != E; ++I) { - assert(*I != nullptr && "Lost pointers"); - DEBUG(dbgs() << "\t\t\tDoes it kill "); - DEBUG((*I)->dump()); - if (doesXKillsY(&Point, *I)) { - DEBUG(dbgs() << "\t\t\t\tYes\n"); - Next.reset(I.Idx); - } - }; - // Gen - auto FIEIter = FrameIndexMap.find(&Point); - if (FIEIter != FrameIndexMap.end() && - FIEIter->second.IsLoad == false && - FIEIter->second.IsSimple == true) - Next.set(ExprToIdx[&Point]); - return Next; - } -}; - -class StackPointerTracking : public ForwardDataflow { - const BinaryContext &BC; - - void preflight() override { - DEBUG(dbgs() << "Starting StackPointerTracking on \"" - << Func.getPrintName() << "\"\n"); - } - - int getStartingStateAtBB(const BinaryBasicBlock &BB) override { - // Entry BB start with offset 8 from CFA. - // All others start with EMPTY (meaning we don't know anything). 
- if (BB.isEntryPoint()) - return -8; - return EMPTY; - } - - int getStartingStateAtPoint(const MCInst &Point) override { - return EMPTY; - } - - void doConfluence(int &StateOut, const int &StateIn) override { - if (StateOut == EMPTY) { - StateOut = StateIn; - return; - } - if (StateIn == EMPTY || StateIn == StateOut) - return; - - // We can't agree on a specific value from this point on - StateOut = SUPERPOSITION; - } - - int computeNext(const MCInst &Point, const int &Cur) override { - const auto &MIA = BC.MIA; - - if (Cur == EMPTY || Cur == SUPERPOSITION) - return Cur; - - if (int Sz = MIA->getPushSize(Point)) - return Cur - Sz; - - if (int Sz = MIA->getPopSize(Point)) - return Cur + Sz; - - if (BC.MII->get(Point.getOpcode()) - .hasDefOfPhysReg(Point, MIA->getStackPointer(), *BC.MRI)) { - int64_t Offset = Cur; - if (!MIA->evaluateSimple(Point, Offset, std::make_pair(0, 0), - std::make_pair(0, 0))) - return SUPERPOSITION; - - return static_cast(Offset); - } - - return Cur; - } -public: - StackPointerTracking(const BinaryContext &BC, const BinaryFunction &BF) - : ForwardDataflow(BF), BC(BC) {} - virtual ~StackPointerTracking() {} - - static constexpr int SUPERPOSITION = std::numeric_limits::max(); - static constexpr int EMPTY = std::numeric_limits::min(); -}; +extern cl::OptionCategory BoltOptCategory; -} // anonymous namespace +using namespace bolt; -bool FrameOptimizerPass::restoreFrameIndex(const BinaryContext &BC, - const BinaryFunction &BF) { - StackPointerTracking SPT(BC, BF); +cl::opt +FrameOptimization("frame-opt", + cl::init(FOP_NONE), + cl::desc("optimize stack frame accesses"), + cl::values( + clEnumValN(FOP_NONE, "none", "do not perform frame optimization"), + clEnumValN(FOP_HOT, "hot", "perform FOP on hot functions"), + clEnumValN(FOP_ALL, "all", "perform FOP on all functions"), + clEnumValEnd), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); - SPT.run(); +} // namespace opts - // Vars used for storing useful CFI info to give us a hint about how the 
stack - // is used in this function - int64_t CfaOffset{-8}; - uint16_t CfaReg{7}; - bool CfaRegLocked{false}; - uint16_t CfaRegLockedVal{0}; - std::stack> CFIStack; - - DEBUG(dbgs() << "Restoring frame indices for \"" << BF.getPrintName() - << "\"\n"); - - // TODO: Implement SP tracking and improve this analysis - for (auto &BB : BF) { - DEBUG(dbgs() <<"\tNow at BB " << BB.getName() << "\n"); - - const MCInst *Prev = nullptr; - for (const auto &Inst : BB) { - int SPOffset = (Prev ? *SPT.getStateAt(*Prev) : *SPT.getStateAt(BB)); - DEBUG({ - dbgs() << "\t\tNow at "; - Inst.dump(); - dbgs() << "\t\t\tSP offset is " << SPOffset << "\n"; - }); - Prev = &Inst; - // Use CFI information to keep track of which register is being used to - // access the frame - if (BC.MIA->isCFI(Inst)) { - const auto *CFI = BF.getCFIFor(Inst); - switch (CFI->getOperation()) { - case MCCFIInstruction::OpDefCfa: - CfaOffset = CFI->getOffset(); - // Fall-through - case MCCFIInstruction::OpDefCfaRegister: - CfaReg = CFI->getRegister(); - break; - case MCCFIInstruction::OpDefCfaOffset: - CfaOffset = CFI->getOffset(); - break; - case MCCFIInstruction::OpRememberState: - CFIStack.push(std::make_pair(CfaOffset, CfaReg)); - break; - case MCCFIInstruction::OpRestoreState: { - assert(!CFIStack.empty() && "Corrupt CFI stack"); - auto &Elem = CFIStack.top(); - CFIStack.pop(); - CfaOffset = Elem.first; - CfaReg = Elem.second; - break; - } - case MCCFIInstruction::OpAdjustCfaOffset: - llvm_unreachable("Unhandled AdjustCfaOffset"); - break; - default: - break; - } - continue; - } - - if (BC.MIA->leaksStackAddress(Inst, *BC.MRI, false)) { - DEBUG(dbgs() << "Leaked stack address, giving up on this function.\n"); - DEBUG(dbgs() << "Blame insn: "); - DEBUG(Inst.dump()); - return false; - } - - bool IsLoad = false; - bool IsStore = false; - bool IsStoreFromReg = false; - bool IsSimple = false; - int32_t SrcImm{0}; - MCPhysReg Reg{0}; - MCPhysReg StackPtrReg{0}; - int64_t StackOffset{0}; - uint8_t Size{0}; - bool 
IsIndexed = false; - if (BC.MIA->isStackAccess(Inst, IsLoad, IsStore, IsStoreFromReg, Reg, - SrcImm, StackPtrReg, StackOffset, Size, - IsSimple, IsIndexed)) { - assert(Size != 0); - if (CfaRegLocked && CfaRegLockedVal != CfaReg) { - DEBUG(dbgs() << "CFA reg changed, giving up on this function.\n"); - return false; - } - if (StackPtrReg != BC.MRI->getLLVMRegNum(CfaReg, /*isEH=*/false)) { - if (StackPtrReg != BC.MIA->getStackPointer() || - SPOffset == SPT.EMPTY || SPOffset == SPT.SUPERPOSITION) { - DEBUG(dbgs() - << "Found stack access with reg different than cfa reg.\n"); - DEBUG(dbgs() << "\tCurrent CFA reg: " << CfaReg - << "\n\tStack access reg: " << StackPtrReg << "\n"); - DEBUG(dbgs() << "Blame insn: "); - DEBUG(Inst.dump()); - return false; - } - DEBUG(dbgs() << "Adding access via SP while CFA reg is another one\n"); - if (IsStoreFromReg || IsLoad) - SrcImm = Reg; - // Ignore accesses to the previous stack frame - if (SPOffset + StackOffset >= 0) - continue; - FrameIndexMap.emplace( - &Inst, FrameIndexEntry{IsLoad, IsStoreFromReg, SrcImm, - SPOffset + StackOffset, Size, IsSimple}); - } else { - CfaRegLocked = true; - CfaRegLockedVal = CfaReg; - if (IsStoreFromReg || IsLoad) - SrcImm = Reg; - // Ignore accesses to the previous stack frame - if (CfaOffset + StackOffset >= 0) - continue; - FrameIndexMap.emplace( - &Inst, FrameIndexEntry{IsLoad, IsStoreFromReg, SrcImm, - CfaOffset + StackOffset, Size, IsSimple}); - } - - DEBUG_WITH_TYPE("fop", - dbgs() << "Frame index annotation added to:\n"; - BC.printInstruction(dbgs(), Inst, 0, &BF, true); - dbgs() << " FrameIndexEntry \n"; - ); - } - } - } - return true; -} - -void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC, - BinaryFunction &BF) { - StackAvailableExpressions SAE(*this, BC, BF); +namespace llvm { +namespace bolt { +void FrameOptimizerPass::removeUnnecessaryLoads(const FrameAnalysis &FA, + const BinaryContext &BC, + BinaryFunction &BF) { + StackAvailableExpressions SAE(FA, BC, BF); 
SAE.run(); - DEBUG(dbgs() << "Performing frame optimization\n"); + DEBUG(dbgs() << "Performing unnecessary loads removal\n"); std::deque> ToErase; bool Changed = false; const auto ExprEnd = SAE.expr_end(); @@ -648,16 +71,16 @@ void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC, // if Inst is a load from stack and the current available expressions show // this value is available in a register or immediate, replace this load // with move from register or from immediate. - const auto Iter = FrameIndexMap.find(&Inst); - if (Iter == FrameIndexMap.end()) { + auto FIEX = FA.getFIEFor(BC, Inst); + if (!FIEX) { Prev = &Inst; continue; } - const FrameIndexEntry &FIEX = Iter->second; // FIXME: Change to remove IsSimple == 0. We're being conservative here, // but once replaceMemOperandWithReg is ready, we should feed it with all // sorts of complex instructions. - if (FIEX.IsLoad == 0 || FIEX.IsSimple == 0) { + if (FIEX->IsLoad == false || FIEX->IsSimple == false || + FIEX->StackOffset >= 0) { Prev = &Inst; continue; } @@ -665,13 +88,14 @@ void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC, for (auto I = Prev ? 
SAE.expr_begin(*Prev) : SAE.expr_begin(BB); I != ExprEnd; ++I) { const MCInst *AvailableInst = *I; - const auto Iter = FrameIndexMap.find(AvailableInst); - if (Iter == FrameIndexMap.end()) + auto FIEY = FA.getFIEFor(BC, *AvailableInst); + if (!FIEY) continue; - - const FrameIndexEntry &FIEY = Iter->second; - assert(FIEY.IsLoad == 0 && FIEY.IsSimple != 0); - if (FIEX.StackOffset != FIEY.StackOffset || FIEX.Size != FIEY.Size) + assert(FIEY->IsStore && FIEY->IsSimple); + if (FIEX->StackOffset != FIEY->StackOffset || FIEX->Size != FIEY->Size) + continue; + // TODO: Change push/pops to stack adjustment instruction + if (BC.MIA->isPop(Inst)) continue; ++NumRedundantLoads; @@ -682,12 +106,13 @@ void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC, DEBUG(AvailableInst->dump()); DEBUG(dbgs() << "@BB: " << BB.getName() << "\n"); // Replace load - if (FIEY.IsStoreFromReg) { - if (!BC.MIA->replaceMemOperandWithReg(Inst, FIEY.RegOrImm)) { + if (FIEY->IsStoreFromReg) { + if (!BC.MIA->replaceMemOperandWithReg(Inst, FIEY->RegOrImm)) { DEBUG(dbgs() << "FAILED to change operand to a reg\n"); break; } ++NumLoadsChangedToReg; + BC.MIA->removeAnnotation(Inst, "FrameAccessEntry"); DEBUG(dbgs() << "Changed operand to a reg\n"); if (BC.MIA->isRedundantMove(Inst)) { ++NumLoadsDeleted; @@ -697,12 +122,13 @@ void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC, } } else { char Buf[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - support::ulittle64_t::ref(Buf + 0) = FIEY.RegOrImm; + support::ulittle64_t::ref(Buf + 0) = FIEY->RegOrImm; DEBUG(dbgs() << "Changing operand to an imm... 
"); if (!BC.MIA->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) { DEBUG(dbgs() << "FAILED\n"); } else { ++NumLoadsChangedToImm; + BC.MIA->removeAnnotation(Inst, "FrameAccessEntry"); DEBUG(dbgs() << "Ok\n"); } } @@ -716,71 +142,130 @@ void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC, if (Changed) { DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n"); } + // TODO: Implement an interface of eraseInstruction that works out the + // complete list of elements to remove. for (auto I : ToErase) { I.first->eraseInstruction(I.second); } } +void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA, + const BinaryContext &BC, + BinaryFunction &BF) { + StackReachingUses SRU(FA, BC, BF); + SRU.run(); + + DEBUG(dbgs() << "Performing unused stores removal\n"); + std::vector> ToErase; + bool Changed = false; + for (auto &BB : BF) { + DEBUG(dbgs() <<"\tNow at BB " << BB.getName() << "\n"); + const MCInst *Prev = nullptr; + for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { + auto &Inst = *I; + DEBUG({ + dbgs() << "\t\tNow at "; + Inst.dump(); + for (auto I = Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB); + I != SRU.expr_end(); ++I) { + dbgs() << "\t\t\tReached by: "; + (*I)->dump(); + } + }); + auto FIEX = FA.getFIEFor(BC, Inst); + if (!FIEX) { + Prev = &Inst; + continue; + } + if (FIEX->IsLoad || !FIEX->IsSimple || FIEX->StackOffset >= 0) { + Prev = &Inst; + continue; + } + + if (SRU.isStoreUsed(*FIEX, + Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB))) { + Prev = &Inst; + continue; + } + // TODO: Change push/pops to stack adjustment instruction + if (BC.MIA->isPush(Inst)) + continue; + + ++NumRedundantStores; + Changed = true; + DEBUG(dbgs() << "Unused store instruction: "); + DEBUG(Inst.dump()); + DEBUG(dbgs() << "@BB: " << BB.getName() << "\n"); + // Delete it! 
+ ToErase.push_back(std::make_pair(&BB, &Inst)); + Prev = &Inst; + } + } + + for (auto I : ToErase) { + I.first->eraseInstruction(I.second); + } + if (Changed) { + DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n"); + } +} + void FrameOptimizerPass::runOnFunctions(BinaryContext &BC, std::map &BFs, - std::set &) { - uint64_t NumFunctionsNotOptimized{0}; - uint64_t NumFunctionsFailedRestoreFI{0}; - uint64_t CountFunctionsNotOptimized{0}; - uint64_t CountFunctionsFailedRestoreFI{0}; - uint64_t CountDenominator{0}; - Cg = buildCallGraph(BC, BFs); - TopologicalCGOrder = Cg.buildTraversalOrder(); - buildClobberMap(BC); + std::set &LargeFunctions) { + if (opts::FrameOptimization == FOP_NONE) + return; + + // Run FrameAnalysis pass + FrameAnalysis FA(PrintPass); + FA.runOnFunctions(BC, BFs, LargeFunctions); + + // Our main loop: perform caller-saved register optimizations, then + // callee-saved register optimizations (shrink wrapping). for (auto &I : BFs) { - auto Count = I.second.getExecutionCount(); - if (Count != BinaryFunction::COUNT_NO_PROFILE) - CountDenominator += Count; - if (!shouldOptimize(I.second)) { - ++NumFunctionsNotOptimized; - if (Count != BinaryFunction::COUNT_NO_PROFILE) - CountFunctionsNotOptimized += Count; + if (!FA.hasFrameInfo(I.second)) continue; + // Restrict pass execution if user asked to only run on hot functions + if (opts::FrameOptimization == FOP_HOT) { + if (I.second.getKnownExecutionCount() < BC.getHotThreshold()) + continue; + DEBUG(dbgs() << "Considering " << I.second.getPrintName() + << " for frame optimizations because its execution count ( " + << I.second.getKnownExecutionCount() + << " ) exceeds our hotness threshold ( " + << BC.getHotThreshold() << " )\n"); + } + { + NamedRegionTimer T1("remove loads", "FOP breakdown", true); + removeUnnecessaryLoads(FA, BC, I.second); } - if (!restoreFrameIndex(BC, I.second)) { - ++NumFunctionsFailedRestoreFI; - auto Count = I.second.getExecutionCount(); - if (Count != 
BinaryFunction::COUNT_NO_PROFILE) - CountFunctionsFailedRestoreFI += Count; + { + NamedRegionTimer T1("remove stores", "FOP breakdown", true); + removeUnusedStores(FA, BC, I.second); + } + // Don't even start shrink wrapping if no profiling info is available + if (I.second.getKnownExecutionCount() == 0) continue; + { + NamedRegionTimer T1("move spills", "FOP breakdown", true); + DataflowInfoManager Info(&FA, BC, I.second); + ShrinkWrapping SW(FA, BC, I.second, Info); + SW.perform(); } - removeUnnecessarySpills(BC, I.second); } - outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads - << " redundant load(s).\n"; - - if (opts::Verbosity == 0) { -#ifndef NDEBUG - if (!DebugFlag || !isCurrentDebugType("fop")) - return; -#else - return; -#endif - } + FA.cleanAnnotations(BC, BFs); + outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads + << " redundant load(s) and " << NumRedundantStores + << " unused store(s)\n"; outs() << "BOLT-INFO: FOP changed " << NumLoadsChangedToReg << " load(s) to use a register instead of a stack access, and " << NumLoadsChangedToImm << " to use an immediate.\n" - << "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s).\n" - << "BOLT-INFO: FOP: Number of functions conservatively treated as " - "clobbering all registers: " - << NumFunctionsAllClobber - << format(" (%.1lf%% dyn cov)\n", - (100.0 * CountFunctionsAllClobber / CountDenominator)) - << "BOLT-INFO: FOP: " << NumFunctionsNotOptimized << " function(s) " - << format("(%.1lf%% dyn cov)", - (100.0 * CountFunctionsNotOptimized / CountDenominator)) - << " were not optimized.\n" - << "BOLT-INFO: FOP: " << NumFunctionsFailedRestoreFI << " function(s) " - << format("(%.1lf%% dyn cov)", - (100.0 * CountFunctionsFailedRestoreFI / CountDenominator)) - << " could not have its frame indices restored.\n"; + << "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s) and " + << NumRedundantStores << " store(s).\n"; + FA.printStats(); + ShrinkWrapping::printStats(); } } // namespace bolt 
diff --git a/bolt/Passes/FrameOptimizer.h b/bolt/Passes/FrameOptimizer.h index e3423ad19f42..4ba8e1c2bb56 100644 --- a/bolt/Passes/FrameOptimizer.h +++ b/bolt/Passes/FrameOptimizer.h @@ -13,31 +13,40 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEOPTIMIZER_H #include "BinaryPasses.h" -#include "BinaryFunctionCallGraph.h" +#include "FrameAnalysis.h" namespace llvm { namespace bolt { -/// FrameOptimizerPass strives for removing unnecessary stack frame accesses. -/// For example, caller-saved registers may be conservatively pushed to the -/// stack because the callee may write to these registers. But if we can prove -/// the callee will never touch these registers, we can remove this spill. +/// FrameOptimizerPass strives for removing or moving stack frame accesses to +/// less frequently executed basic blocks, reducing the pressure on icache +/// usage as well as dynamic instruction count. /// -/// This optimization analyzes the call graph and first compute the set of +/// This is accomplished by analyzing both caller-saved register spills and +/// callee-saved register spills. This class handles the former while delegating +/// the latter to the class ShrinkWrapping. We discuss caller-saved register +/// spills optimization below. +/// +/// Caller-saved registers must be conservatively pushed to the stack because +/// the callee may write to these registers. If we can prove the callee will +/// never touch these registers, we can remove this spill. +/// +/// This optimization analyzes the call graph and first computes the set of /// registers that may get overwritten when executing a function (this includes /// the set of registers touched by all functions this function may call during -/// its execution). +/// its execution) -- see the FrameAnalysis class for implementation details. /// -/// The second step is to perform an alias analysis to disambiguate which stack -/// position is being accessed by each load/store instruction, and annotate -/// these instructions. 
+/// The second step is to perform an analysis to disambiguate which stack +/// position is being accessed by each load/store instruction -- see the +/// FrameAnalysis class. /// /// The third step performs a forward dataflow analysis, using intersection as /// the confluence operator, to propagate information about available -/// stack definitions at each point of the program. This definition shows -/// an equivalence between the value in a stack position and the value of a -/// register or immediate. To have those preserved, both register and the value -/// in the stack position cannot be touched by another instruction. +/// stack definitions at each point of the program. See the +/// StackAvailableExpressions class. This definition shows an equivalence +/// between the value in a stack position and the value of a register or +/// immediate. To have those preserved, both register and the value in the stack +/// position cannot be touched by another instruction. /// These definitions we are tracking occur in the form: /// /// stack def: MEM[FRAME - 0x5c] <= RAX @@ -62,86 +71,29 @@ namespace bolt { /// In this example, since the store source register is the same as the load /// destination register, this creates a redundant MOV that can be deleted. /// +/// Finally, another analysis propagates information about which instructions +/// are using (loading from) a stack position -- see StackReachingUses. If a +/// store sees no use of the value it is storing, it is eliminated. +/// class FrameOptimizerPass : public BinaryFunctionPass { /// Stats aggregating variables uint64_t NumRedundantLoads{0}; + uint64_t NumRedundantStores{0}; uint64_t NumLoadsChangedToReg{0}; uint64_t NumLoadsChangedToImm{0}; uint64_t NumLoadsDeleted{0}; - /// Number of functions we conservatively marked as clobbering the entire set - /// of registers because we couldn't fully understand it. 
- uint64_t NumFunctionsAllClobber{0}; - /// Execution count of those functions to give us an idea of their dynamic - /// coverage - uint64_t CountFunctionsAllClobber{0}; - - /// Call graph info - BinaryFunctionCallGraph Cg; - /// DFS or reverse post-ordering of the call graph nodes to allow us to - /// traverse the call graph bottom-up - std::deque TopologicalCGOrder; + /// Perform a dataflow analysis in \p BF to reveal unnecessary reloads from + /// the frame. Use the analysis to convert memory loads to register moves or + /// immediate loads. Delete redundant register moves. + void removeUnnecessaryLoads(const FrameAnalysis &FA, + const BinaryContext &BC, + BinaryFunction &BF); - /// Map functions to the set of registers they may overwrite starting at when - /// it is called until it returns to the caller. - std::map RegsKilledMap; - -public: - /// Alias analysis information attached to each instruction that accesses a - /// frame position. This is called a "frame index" by LLVM Target libs when - /// it is building a MachineFunction frame, and we use the same name here - /// because we are essentially doing the job of frame reconstruction. - struct FrameIndexEntry { - /// If this is false, this instruction is necessarily a store - bool IsLoad; - /// If a store, this controls whether the store uses a register os an imm - /// as the source value. - bool IsStoreFromReg; - /// If load, this holds the destination register. If store, this holds - /// either the source register or source immediate. - int32_t RegOrImm; - - /// StackOffset and Size are the two aspects that identify this frame access - /// for the purposes of alias analysis. - int64_t StackOffset; - uint8_t Size; - - /// If this is false, we will never atempt to remove or optimize this - /// instruction. We just use it to keep track of stores we don't fully - /// understand but we know it may write to a frame position. 
- bool IsSimple; - }; - typedef std::unordered_map - FrameIndexMapTy; - FrameIndexMapTy FrameIndexMap; - - /// Compute the set of registers \p Inst may write to, marking them in - /// \p KillSet. If this is a call, try to get the set of registers the call - /// target will write to. - void getInstClobberList(const BinaryContext &BC, const MCInst &Inst, - BitVector &KillSet) const; -private: - /// Compute the set of registers \p Func may write to during its execution, - /// starting at the point when it is called up until when it returns. Returns - /// a BitVector the size of the target number of registers, representing the - /// set of clobbered registers. - BitVector getFunctionClobberList(const BinaryContext &BC, - const BinaryFunction *Func); - - /// Perform the step of building the set of registers clobbered by each - /// function execution, populating RegsKilledMap. - void buildClobberMap(const BinaryContext &BC); - - /// Alias analysis to disambiguate which frame position is accessed by each - /// instruction in function \p BF. Populates FrameIndexMap. - bool restoreFrameIndex(const BinaryContext &BC, const BinaryFunction &BF); - - /// Uses RegsKilledMap and FrameIndexMap to perform a dataflow analysis in - /// \p BF to reveal unnecessary reloads from the frame. Use the analysis - /// to convert memory loads to register moves or immediate loads. Delete - /// redundant register moves. - void removeUnnecessarySpills(const BinaryContext &BC, - BinaryFunction &BF); + /// Use information from stack frame usage to delete unused stores. 
+ void removeUnusedStores(const FrameAnalysis &FA, + const BinaryContext &BC, + BinaryFunction &BF); public: explicit FrameOptimizerPass(const cl::opt &PrintPass) @@ -158,6 +110,7 @@ class FrameOptimizerPass : public BinaryFunctionPass { }; } // namespace bolt + } // namespace llvm diff --git a/bolt/Passes/LivenessAnalysis.h b/bolt/Passes/LivenessAnalysis.h index f95a9ef12503..ed9e0f00a1e2 100644 --- a/bolt/Passes/LivenessAnalysis.h +++ b/bolt/Passes/LivenessAnalysis.h @@ -14,6 +14,7 @@ #include "DataflowAnalysis.h" #include "FrameAnalysis.h" +#include "llvm/Support/Timer.h" namespace llvm { namespace bolt { @@ -29,6 +30,18 @@ class LivenessAnalysis NumRegs(BC.MRI->getNumRegs()) {} virtual ~LivenessAnalysis(); + bool isAlive(ProgramPoint PP, MCPhysReg Reg) const { + BitVector BV = (*this->getStateAt(PP)); + const BitVector &RegAliases = BC.MIA->getAliases(Reg, *BC.MRI); + BV &= RegAliases; + return BV.any(); + } + + void run() { + NamedRegionTimer T1("LA", "Dataflow", true); + DataflowAnalysis::run(); + } + protected: /// Reference to the result of stack frame analysis const FrameAnalysis &FA; diff --git a/bolt/Passes/ReachingDefOrUse.h b/bolt/Passes/ReachingDefOrUse.h index ca67389b281a..9b5f8695b3f1 100644 --- a/bolt/Passes/ReachingDefOrUse.h +++ b/bolt/Passes/ReachingDefOrUse.h @@ -13,6 +13,7 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_REACHINGDEFORUSE_H #include "DataflowAnalysis.h" +#include "llvm/Support/Timer.h" namespace llvm { namespace bolt { @@ -50,6 +51,11 @@ class ReachingDefOrUse return (*this->getStateAt(B))[this->ExprToIdx[&A]]; } + void run() { + NamedRegionTimer T1("RD", "Dataflow", true); + InstrsDataflowAnalysis, !Def>::run(); + } + protected: /// Reference to the result of stack frame analysis const FrameAnalysis &FA; diff --git a/bolt/Passes/ReachingInsns.h b/bolt/Passes/ReachingInsns.h index 4bcdb3d843dd..ce6cd8ccaa08 100644 --- a/bolt/Passes/ReachingInsns.h +++ b/bolt/Passes/ReachingInsns.h @@ -12,6 +12,9 @@ #ifndef 
LLVM_TOOLS_LLVM_BOLT_PASSES_REACHINGINSNS_H #define LLVM_TOOLS_LLVM_BOLT_PASSES_REACHINGINSNS_H +#include "DataflowAnalysis.h" +#include "llvm/Support/Timer.h" + namespace llvm { namespace bolt { @@ -37,6 +40,11 @@ class ReachingInsns return isInLoop(*BB); } + void run() { + NamedRegionTimer T1("RI", "Dataflow", true); + InstrsDataflowAnalysis, Backward>::run(); + } + protected: std::unordered_map InsnToBB; diff --git a/bolt/Passes/ShrinkWrapping.cpp b/bolt/Passes/ShrinkWrapping.cpp new file mode 100644 index 000000000000..dcc5b5758c60 --- /dev/null +++ b/bolt/Passes/ShrinkWrapping.cpp @@ -0,0 +1,1785 @@ +//===--- Passes/ShrinkWrapping.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "ShrinkWrapping.h" +#include + +#define DEBUG_TYPE "shrinkwrapping" + +using namespace llvm; + +namespace opts { + +extern cl::OptionCategory BoltOptCategory; + +static cl::opt ShrinkWrappingThreshold( + "shrink-wrapping-threshold", + cl::desc("Percentage of prologue execution count to use as threshold when" + " evaluating whether a block is cold enough to be profitable to" + " move eligible spills there"), + cl::init(40), cl::ZeroOrMore, cl::cat(BoltOptCategory)); +} + +namespace llvm { +namespace bolt { + +void CalleeSavedAnalysis::analyzeSaves() { + ReachingDefOrUse &RD = Info.getReachingDefs(); + StackReachingUses &SRU = Info.getStackReachingUses(); + auto &InsnToBB = Info.getInsnToBBMap(); + + DEBUG(dbgs() << "Checking spill locations\n"); + for (auto &BB : BF) { + DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n"); + const MCInst *Prev = nullptr; + for (auto &Inst : BB) { + if (auto FIE = FA.getFIEFor(BC, Inst)) { + if (!FIE->IsStore 
|| !FIE->IsSimple || !FIE->IsStoreFromReg || + FIE->StackOffset >= 0) { + Prev = &Inst; + continue; + } + + if (RD.isReachedBy(FIE->RegOrImm, + Prev ? RD.expr_begin(*Prev) : RD.expr_begin(BB))) { + Prev = &Inst; + continue; + } + + // If this stack position is accessed in another function, we are + // probably dealing with a parameter passed in a stack -- do not mess + // with it + if (SRU.isStoreUsed(*FIE, + Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB)), + /*IncludeLocalAccesses=*/false) { + Prev = &Inst; + continue; + } + + CalleeSaved.set(FIE->RegOrImm); + if (SaveFIEByReg[FIE->RegOrImm] == nullptr) + SaveFIEByReg[FIE->RegOrImm] = &*FIE; + SavingCost[FIE->RegOrImm] += InsnToBB[&Inst]->getKnownExecutionCount(); + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, getSaveTag(), FIE->RegOrImm); + OffsetsByReg[FIE->RegOrImm] = FIE->StackOffset; + DEBUG(dbgs() << "Logging new candidate for Callee-Saved Reg: " + << FIE->RegOrImm << "\n"); + } + Prev = &Inst; + } + } +} + +void CalleeSavedAnalysis::analyzeRestores() { + ReachingDefOrUse &RU = Info.getReachingUses(); + + // Now compute all restores of these callee-saved regs + for (auto &BB : BF) { + const MCInst *Prev = nullptr; + for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { + auto &Inst = *I; + if (auto FIE = FA.getFIEFor(BC, Inst)) { + if (!FIE->IsLoad || !FIE->IsSimple || !CalleeSaved[FIE->RegOrImm] || + FIE->StackOffset >= 0) { + Prev = &Inst; + continue; + } + + // If this reg is used locally after a restore, then we are probably + // not dealing with a callee-saved reg. Except if this use is by + // another store, but we don't cover this case yet. + if (RU.isReachedBy(FIE->RegOrImm, + Prev ? 
RU.expr_begin(*Prev) : RU.expr_begin(BB))) { + Prev = &Inst; + continue; + } + // If stack offsets between saves/store don't agree with each other, + // we don't completely understand what's happening here + if (FIE->StackOffset != OffsetsByReg[FIE->RegOrImm]) { + CalleeSaved.reset(FIE->RegOrImm); + DEBUG(dbgs() << "Dismissing Callee-Saved Reg because we found a " + "mismatching restore: " + << FIE->RegOrImm << "\n"); + Prev = &Inst; + continue; + } + + DEBUG(dbgs() << "Adding matching restore for: " << FIE->RegOrImm + << "\n"); + if (LoadFIEByReg[FIE->RegOrImm] == nullptr) + LoadFIEByReg[FIE->RegOrImm] = &*FIE; + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, getRestoreTag(), + FIE->RegOrImm); + HasRestores.set(FIE->RegOrImm); + } + Prev = &Inst; + } + } +} + +std::vector CalleeSavedAnalysis::getSavesByReg(uint16_t Reg) { + std::vector Results; + for (auto &BB : BF) { + for (auto &Inst : BB) { + if (getSavedReg(Inst) == Reg) + Results.push_back(&Inst); + } + } + return Results; +} + +std::vector CalleeSavedAnalysis::getRestoresByReg(uint16_t Reg) { + std::vector Results; + for (auto &BB : BF) { + for (auto &Inst : BB) { + if (getRestoredReg(Inst) == Reg) + Results.push_back(&Inst); + } + } + return Results; +} + +CalleeSavedAnalysis::~CalleeSavedAnalysis() { + for (auto &BB : BF) { + for (auto &Inst : BB) { + BC.MIA->removeAnnotation(Inst, getSaveTag()); + BC.MIA->removeAnnotation(Inst, getRestoreTag()); + } + } +} + +void StackLayoutModifier::blacklistRegion(int64_t Offset, int64_t Size) { + if (BlacklistedRegions[Offset] < Size) { + BlacklistedRegions[Offset] = Size; + } +} + +bool StackLayoutModifier::isRegionBlacklisted(int64_t Offset, int64_t Size) { + for (auto Elem : BlacklistedRegions) { + if (Offset + Size > Elem.first && Offset < Elem.first + Elem.second) + return true; + } + return false; +} + +bool StackLayoutModifier::blacklistAllInConflictWith(int64_t Offset, + int64_t Size) { + bool HasConflict = false; + for (auto Iter = AvailableRegions.begin(); Iter 
!= AvailableRegions.end();) { + auto &Elem = *Iter; + if (Offset + Size > Elem.first && Offset < Elem.first + Elem.second && + (Offset != Elem.first || Size != Elem.second)) { + Iter = AvailableRegions.erase(Iter); + HasConflict = true; + continue; + } + ++Iter; + } + if (HasConflict) { + blacklistRegion(Offset, Size); + return true; + } + return false; +} + +void StackLayoutModifier::checkFramePointerInitialization(MCInst &Point) { + auto &SPT = Info.getStackPointerTracking(); + if (!BC.MII->get(Point.getOpcode()) + .hasDefOfPhysReg(Point, BC.MIA->getFramePointer(), *BC.MRI)) + return; + + int SPVal, FPVal; + std::tie(SPVal, FPVal) = *SPT.getStateBefore(Point); + std::pair FP; + + if (FPVal != SPT.EMPTY && FPVal != SPT.SUPERPOSITION) + FP = std::make_pair(BC.MIA->getFramePointer(), FPVal); + else + FP = std::make_pair(0, 0); + std::pair SP; + + if (SPVal != SPT.EMPTY && SPVal != SPT.SUPERPOSITION) + SP = std::make_pair(BC.MIA->getStackPointer(), SPVal); + else + SP = std::make_pair(0, 0); + + int64_t Output; + if (!BC.MIA->evaluateSimple(Point, Output, SP, FP)) + return; + + // Not your regular frame pointer initialization... bail + if (Output != SPVal) + blacklistRegion(0, 0); +} + +void StackLayoutModifier::classifyStackAccesses() { + // Understand when stack slots are being used non-locally + auto &SRU = Info.getStackReachingUses(); + + for (auto &BB : BF) { + const MCInst *Prev = nullptr; + for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { + auto &Inst = *I; + checkFramePointerInitialization(Inst); + auto FIEX = FA.getFIEFor(BC, Inst); + if (!FIEX) { + Prev = &Inst; + continue; + } + if (!FIEX->IsSimple || (FIEX->IsStore && !FIEX->IsStoreFromReg)) { + blacklistRegion(FIEX->StackOffset, FIEX->Size); + Prev = &Inst; + continue; + } + // If this stack position is accessed in another function, we are + // probably dealing with a parameter passed in a stack -- do not mess + // with it + if (SRU.isStoreUsed(*FIEX, + Prev ? 
SRU.expr_begin(*Prev) : SRU.expr_begin(BB), + /*IncludeLocalAccesses=*/false)) { + blacklistRegion(FIEX->StackOffset, FIEX->Size); + Prev = &Inst; + continue; + } + // Now we have a clear stack slot access. Check if its blacklisted or if + // it conflicts with another chunk. + if (isRegionBlacklisted(FIEX->StackOffset, FIEX->Size) || + blacklistAllInConflictWith(FIEX->StackOffset, FIEX->Size)) { + Prev = &Inst; + continue; + } + // We are free to go. Add it as available stack slot which we know how + // to move it. + AvailableRegions[FIEX->StackOffset] = FIEX->Size; + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, getSlotTagName(), + FIEX->StackOffset); + RegionToRegMap[FIEX->StackOffset].insert(FIEX->RegOrImm); + RegToRegionMap[FIEX->RegOrImm].insert(FIEX->StackOffset); + DEBUG(dbgs() << "Adding region " << FIEX->StackOffset << " size " + << (int)FIEX->Size << "\n"); + } + } +} + +void StackLayoutModifier::classifyCFIs() { + std::stack> CFIStack; + int64_t CfaOffset{-8}; + uint16_t CfaReg{7}; + + auto recordAccess = [&](MCInst *Inst, int64_t Offset) { + const uint16_t Reg = BC.MRI->getLLVMRegNum(CfaReg, /*isEH=*/false); + if (Reg == BC.MIA->getStackPointer() || Reg == BC.MIA->getFramePointer()) { + BC.MIA->addAnnotation(BC.Ctx.get(), *Inst, getSlotTagName(), Offset); + DEBUG(dbgs() << "Recording CFI " << Offset << "\n"); + } else { + IsSimple = false; + return; + } + }; + + for (auto &BB : BF.layout()) { + for (auto &Inst : *BB) { + if (!BC.MIA->isCFI(Inst)) + continue; + auto *CFI = BF.getCFIFor(Inst); + switch (CFI->getOperation()) { + case MCCFIInstruction::OpDefCfa: + CfaOffset = CFI->getOffset(); + recordAccess(&Inst, CfaOffset); + // Fall-through + case MCCFIInstruction::OpDefCfaRegister: + CfaReg = CFI->getRegister(); + break; + case MCCFIInstruction::OpDefCfaOffset: + CfaOffset = CFI->getOffset(); + recordAccess(&Inst, CfaOffset); + break; + case MCCFIInstruction::OpOffset: + recordAccess(&Inst, CFI->getOffset()); + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, 
getOffsetCFIRegTagName(), + BC.MRI->getLLVMRegNum(CFI->getRegister(), + /*isEH=*/false)); + break; + case MCCFIInstruction::OpSameValue: + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, getOffsetCFIRegTagName(), + BC.MRI->getLLVMRegNum(CFI->getRegister(), + /*isEH=*/false)); + break; + case MCCFIInstruction::OpRememberState: + CFIStack.push(std::make_pair(CfaOffset, CfaReg)); + break; + case MCCFIInstruction::OpRestoreState: { + assert(!CFIStack.empty() && "Corrupt CFI stack"); + auto &Elem = CFIStack.top(); + CFIStack.pop(); + CfaOffset = Elem.first; + CfaReg = Elem.second; + break; + } + case MCCFIInstruction::OpRelOffset: + case MCCFIInstruction::OpAdjustCfaOffset: + llvm_unreachable("Unhandled AdjustCfaOffset"); + break; + default: + break; + } + } + } +} + +void StackLayoutModifier::scheduleChange( + MCInst &Inst, StackLayoutModifier::WorklistItem Item) { + auto &WList = BC.MIA->getOrCreateAnnotationAs>( + BC.Ctx.get(), Inst, getTodoTagName()); + WList.push_back(Item); +} + +bool StackLayoutModifier::canCollapseRegion(MCInst *DeletedPush) { + if (!IsSimple || !BC.MIA->isPush(*DeletedPush)) + return false; + + auto FIE = FA.getFIEFor(BC, *DeletedPush); + if (!FIE) + return false; + + return canCollapseRegion(FIE->StackOffset); +} + +bool StackLayoutModifier::canCollapseRegion(int64_t RegionAddr) { + if (!IsInitialized) + initialize(); + if (!IsSimple) + return false; + + if (CollapsedRegions.count(RegionAddr)) + return true; + + // Check if it is possible to readjust all accesses below RegionAddr + if (!BlacklistedRegions.empty()) + return false; + + return true; +} + +bool StackLayoutModifier::collapseRegion(MCInst *DeletedPush) { + auto FIE = FA.getFIEFor(BC, *DeletedPush); + if (!FIE) + return false; + int64_t RegionAddr = FIE->StackOffset; + int64_t RegionSz = FIE->Size; + return collapseRegion(DeletedPush, RegionAddr, RegionSz); +} + +bool StackLayoutModifier::collapseRegion(MCInst *Alloc, int64_t RegionAddr, + int64_t RegionSz) { + if 
(!canCollapseRegion(RegionAddr)) + return false; + + assert(IsInitialized); + auto &SAA = Info.getStackAllocationAnalysis(); + + for (auto &BB : BF) { + for (auto &Inst : BB) { + if (!BC.MIA->hasAnnotation(Inst, getSlotTagName())) + continue; + auto Slot = + BC.MIA->getAnnotationAs( + Inst, getSlotTagName()); + if (!AvailableRegions.count(Slot)) + continue; + // We need to ensure this access is affected by the deleted push + if (!(*SAA.getStateBefore(Inst))[SAA.ExprToIdx[Alloc]]) + continue; + + if (BC.MIA->isCFI(Inst)) { + if (Slot > RegionAddr) + continue; + scheduleChange(Inst, WorklistItem(WorklistItem::AdjustCFI, RegionSz)); + continue; + } + + if (Slot == RegionAddr) { + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "AccessesDeletedPos", 0U); + continue; + } + if (BC.MIA->isPush(Inst) || BC.MIA->isPop(Inst)) { + continue; + } + + auto FIE = FA.getFIEFor(BC, Inst); + assert(FIE); + if (FIE->StackPtrReg == BC.MIA->getStackPointer() && Slot < RegionAddr) + continue; + + if (FIE->StackPtrReg == BC.MIA->getFramePointer() && Slot > RegionAddr) + continue; + + scheduleChange( + Inst, WorklistItem(WorklistItem::AdjustLoadStoreOffset, RegionSz)); + } + } + + CollapsedRegions.insert(RegionAddr); + return true; +} + +void StackLayoutModifier::setOffsetForCollapsedAccesses(int64_t NewOffset) { + for (auto &BB : BF) { + for (auto &Inst : BB) { + if (!BC.MIA->hasAnnotation(Inst, "AccessesDeletedPos")) + continue; + BC.MIA->removeAnnotation(Inst, "AccessesDeletedPos"); + scheduleChange( + Inst, WorklistItem(WorklistItem::AdjustLoadStoreOffset, NewOffset)); + } + } +} + +bool StackLayoutModifier::canInsertRegion(ProgramPoint P) { + if (!IsInitialized) + initialize(); + if (!IsSimple) + return false; + + auto &SPT = Info.getStackPointerTracking(); + int64_t RegionAddr = SPT.getStateBefore(P)->first; + if (RegionAddr == SPT.SUPERPOSITION || RegionAddr == SPT.EMPTY) + return false; + + if (InsertedRegions.count(RegionAddr)) + return true; + + // Check if we are going to screw up 
stack accesses at call sites that + // pass parameters via stack + if (!BlacklistedRegions.empty()) + return false; + + return true; +} + +bool StackLayoutModifier::insertRegion(ProgramPoint P, int64_t RegionSz) { + if (!canInsertRegion(P)) + return false; + + assert(IsInitialized); + auto &SPT = Info.getStackPointerTracking(); + // This RegionAddr is slightly different from the one seen in collapseRegion + // This is the value of SP before the allocation the user wants to make. + int64_t RegionAddr = SPT.getStateBefore(P)->first; + if (RegionAddr == SPT.SUPERPOSITION || RegionAddr == SPT.EMPTY) + return false; + + auto &DA = Info.getDominatorAnalysis(); + + for (auto &BB : BF) { + for (auto &Inst : BB) { + if (!BC.MIA->hasAnnotation(Inst, getSlotTagName())) + continue; + auto Slot = + BC.MIA->getAnnotationAs( + Inst, getSlotTagName()); + if (!AvailableRegions.count(Slot)) + continue; + + if (!(DA.doesADominateB(P, Inst))) + continue; + + if (BC.MIA->isCFI(Inst)) { + if (Slot >= RegionAddr) + continue; + scheduleChange(Inst, WorklistItem(WorklistItem::AdjustCFI, -RegionSz)); + continue; + } + + auto FIE = FA.getFIEFor(BC, Inst); + assert(FIE); + if (FIE->StackPtrReg == BC.MIA->getStackPointer() && Slot < RegionAddr) + continue; + if (FIE->StackPtrReg == BC.MIA->getFramePointer() && Slot >= RegionAddr) + continue; + if (BC.MIA->isPush(Inst) || BC.MIA->isPop(Inst)) + continue; + scheduleChange( + Inst, WorklistItem(WorklistItem::AdjustLoadStoreOffset, -RegionSz)); + } + } + + InsertedRegions.insert(RegionAddr); + return true; +} + +void StackLayoutModifier::performChanges() { + std::set ModifiedCFIIndices; + for (auto &BB : BF) { + for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { + auto &Inst = *I; + if (BC.MIA->hasAnnotation(Inst, "AccessesDeletedPos")) { + assert(BC.MIA->isPop(Inst) || BC.MIA->isPush(Inst)); + BC.MIA->removeAnnotation(Inst, "AccessesDeletedPos"); + } + if (!BC.MIA->hasAnnotation(Inst, getTodoTagName())) + continue; + auto &WList = 
BC.MIA->getAnnotationAs>( + Inst, getTodoTagName()); + int64_t Adjustment = 0; + WorklistItem::ActionType AdjustmentType = WorklistItem::None; + for (auto &WI : WList) { + if (WI.Action == WorklistItem::None) + continue; + assert(WI.Action == WorklistItem::AdjustLoadStoreOffset || + WI.Action == WorklistItem::AdjustCFI); + assert((AdjustmentType == WorklistItem::None || + AdjustmentType == WI.Action) && + "Conflicting actions requested at the same program point"); + AdjustmentType = WI.Action; + Adjustment += WI.OffsetUpdate; + } + if (!Adjustment) + continue; + if (AdjustmentType != WorklistItem::AdjustLoadStoreOffset) { + assert(BC.MIA->isCFI(Inst)); + uint32_t CFINum = Inst.getOperand(0).getImm(); + if (ModifiedCFIIndices.count(CFINum)) + continue; + ModifiedCFIIndices.insert(CFINum); + MCCFIInstruction *CFI = BF.getCFIFor(Inst); + DEBUG(dbgs() << "Changing CFI offset from " << CFI->getOffset() + << " to " << (CFI->getOffset() + Adjustment) << "\n"); + CFI->setOffset(CFI->getOffset() + Adjustment); + continue; + } + int32_t SrcImm{0}; + MCPhysReg Reg{0}; + MCPhysReg StackPtrReg{0}; + int64_t StackOffset{0}; + bool IsIndexed{false}; + bool IsLoad{false}; + bool IsStore{false}; + bool IsSimple{false}; + bool IsStoreFromReg{false}; + uint8_t Size{0}; + bool Success{false}; + Success = BC.MIA->isStackAccess(Inst, IsLoad, IsStore, IsStoreFromReg, + Reg, SrcImm, StackPtrReg, StackOffset, + Size, IsSimple, IsIndexed); + assert(Success && IsSimple && !IsIndexed && (!IsStore || IsStoreFromReg)); + if (StackPtrReg != BC.MIA->getFramePointer()) + Adjustment = -Adjustment; + if (IsLoad) + Success = BC.MIA->createRestoreFromStack( + Inst, StackPtrReg, StackOffset + Adjustment, Reg, Size); + else if (IsStore) + Success = BC.MIA->createSaveToStack( + Inst, StackPtrReg, StackOffset + Adjustment, Reg, Size); + DEBUG({ + dbgs() << "Adjusted instruction: "; + Inst.dump(); + }); + assert(Success); + } + } +} + +void StackLayoutModifier::initialize() { + classifyStackAccesses(); + 
classifyCFIs();
  IsInitialized = true;
}

// Pass-wide counters reported by printStats().
uint64_t ShrinkWrapping::SpillsMovedRegularMode = 0;
uint64_t ShrinkWrapping::SpillsMovedPushPopMode = 0;

using BBIterTy = BinaryBasicBlock::iterator;

// Record, for each callee-saved register, the dataflow expression indices of
// instructions that touch it outside of its own save/restore instructions.
// NOTE(review): several std::vector element types in this file appear to have
// been stripped in transfer (empty angle brackets) — confirm against upstream.
void ShrinkWrapping::classifyCSRUses() {
  auto &DA = Info.getDominatorAnalysis();
  auto &SPT = Info.getStackPointerTracking();
  UsesByReg = std::vector(BC.MRI->getNumRegs(),
                          BitVector(DA.NumInstrs, false));

  const BitVector &FPAliases =
      BC.MIA->getAliases(BC.MIA->getFramePointer(), *BC.MRI);
  for (auto &BB : BF) {
    for (auto &Inst : BB) {
      if (BC.MIA->isCFI(Inst))
        continue;
      auto BV = BitVector(BC.MRI->getNumRegs(), false);
      BC.MIA->getTouchedRegs(Inst, BV, *BC.MRI);
      BV &= CSA.CalleeSaved;
      for (int I = BV.find_first(); I != -1; I = BV.find_next(I)) {
        if (I == 0)
          continue;
        // The save/restore of the register itself does not count as a use.
        if (CSA.getSavedReg(Inst) != I && CSA.getRestoredReg(Inst) != I)
          UsesByReg[I].set(DA.ExprToIdx[&Inst]);
      }
      // When the function has a frame pointer, treat calls as uses of every
      // callee-saved register aliasing FP.
      if (!SPT.HasFramePointer || !BC.MIA->isCall(Inst))
        continue;
      BV = CSA.CalleeSaved;
      BV &= FPAliases;
      for (int I = BV.find_first(); I > 0; I = BV.find_next(I)) {
        UsesByReg[I].set(DA.ExprToIdx[&Inst]);
      }
    }
  }
}

// Remove from CSA.CalleeSaved the registers we will not try to shrink wrap:
// registers used to pass parameters, registers with no recorded uses, and
// registers whose restore point was never identified.
void ShrinkWrapping::pruneUnwantedCSRs() {
  BitVector ParamRegs = BC.MIA->getRegsUsedAsParams(*BC.MRI);
  for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) {
    if (!CSA.CalleeSaved[I])
      continue;
    if (ParamRegs[I]) {
      CSA.CalleeSaved.reset(I);
      continue;
    }
    if (UsesByReg[I].empty()) {
      DEBUG(dbgs()
            << "Dismissing Callee-Saved Reg because we found no uses of it:"
            << I << "\n");
      CSA.CalleeSaved.reset(I);
      continue;
    }
    if (!CSA.HasRestores[I]) {
      DEBUG(dbgs() << "Dismissing Callee-Saved Reg because it does not have "
                      "restores:"
                   << I << "\n");
      CSA.CalleeSaved.reset(I);
    }
  }
}

// For each CSR, collect candidate save positions: the first instruction of any
// non-loop BB that dominates every recorded use of the register. Then pick the
// candidate with the lowest profiled execution count as BestSavePos.
void ShrinkWrapping::computeSaveLocations() {
  SavePos = std::vector>(BC.MRI->getNumRegs());
  auto &RI = Info.getReachingInsnsBackwards();
  auto &DA = Info.getDominatorAnalysis();

  DEBUG(dbgs() << "Checking save/restore possibilities\n");
  for (auto &BB : BF) {
    DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");

    MCInst *First = BB.begin() != BB.end() ? &*BB.begin() : nullptr;
    if (!First)
      continue;

    // Use reaching instructions to detect if we are inside a loop - if we
    // are, do not consider this BB as valid placement for saves.
    if (RI.isInLoop(BB))
      continue;

    for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) {
      if (!CSA.CalleeSaved[I])
        continue;

      auto BBDominatedUses = BitVector(DA.NumInstrs, false);
      for (auto J = UsesByReg[I].find_first(); J > 0;
           J = UsesByReg[I].find_next(J)) {
        if (DA.doesADominateB(*First, J))
          BBDominatedUses.set(J);
      }
      DEBUG(dbgs() << "\t\tBB " << BB.getName() << " dominates "
                   << BBDominatedUses.count() << " uses for reg " << I
                   << ". Total uses for reg is " << UsesByReg[I].count()
                   << "\n");
      BBDominatedUses &= UsesByReg[I];
      // Only a BB dominating *all* uses is a legal save position.
      if (BBDominatedUses == UsesByReg[I]) {
        DEBUG(dbgs() << "\t\t\tAdded " << BB.getName() << " as a save pos for "
                     << I << "\n");
        SavePos[I].insert(First);
        DEBUG({
          dbgs() << "Dominated uses are:\n";
          for (auto J = UsesByReg[I].find_first(); J > 0;
               J = UsesByReg[I].find_next(J)) {
            dbgs() << "Idx " << J << ": ";
            DA.Expressions[J]->dump();
          }
        });
      }
    }
  }

  BestSaveCount = std::vector(BC.MRI->getNumRegs(),
                              std::numeric_limits::max());
  BestSavePos = std::vector(BC.MRI->getNumRegs(), nullptr);
  auto &InsnToBB = Info.getInsnToBBMap();
  for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) {
    if (!CSA.CalleeSaved[I])
      continue;

    // Coldest candidate wins; positions without profile data are ignored.
    for (auto *Pos : SavePos[I]) {
      auto *BB = InsnToBB[Pos];
      uint64_t Count = BB->getExecutionCount();
      if (Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
          Count < BestSaveCount[I]) {
        BestSavePos[I] = Pos;
        BestSaveCount[I] = Count;
      }
    }
  }
}

// Build DomOrder: a total order over registers consistent with the dominance
// relation between their best save positions (ties broken by register number).
void ShrinkWrapping::computeDomOrder() {
  std::vector Order;
  for (MCPhysReg I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) {
    Order.push_back(I);
  }

  auto &DA =
Info.getDominatorAnalysis();
  auto &InsnToBB = Info.getInsnToBBMap();
  // Registers whose save position dominates another's come first; registers
  // with no best save position sort last.
  std::sort(Order.begin(), Order.end(), [&](const MCPhysReg &A,
                                            const MCPhysReg &B) {
    auto *BBA = BestSavePos[A] ? InsnToBB[BestSavePos[A]] : nullptr;
    auto *BBB = BestSavePos[B] ? InsnToBB[BestSavePos[B]] : nullptr;
    if (BBA == BBB)
      return A < B;
    if (!BBA && BBB)
      return false;
    if (BBA && !BBB)
      return true;
    if (DA.doesADominateB(*BestSavePos[A], *BestSavePos[B]))
      return true;
    if (DA.doesADominateB(*BestSavePos[B], *BestSavePos[A]))
      return false;
    return A < B;
  });

  for (MCPhysReg I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) {
    DomOrder[Order[I]] = I;
  }
}

// Decide whether moving CSR's save to its best position is profitable: the
// best position's count must beat the current saving cost by the configured
// threshold. On success, reports the position and accumulates the estimated
// frequency win into TotalEstimatedWin.
bool ShrinkWrapping::isBestSavePosCold(unsigned CSR, MCInst *&BestPosSave,
                                       uint64_t &TotalEstimatedWin) {
  const uint64_t CurSavingCost = CSA.SavingCost[CSR];
  if (!CSA.CalleeSaved[CSR])
    return false;

  uint64_t BestCount = BestSaveCount[CSR];
  BestPosSave = BestSavePos[CSR];
  bool ShouldMove{false};
  if (BestCount != std::numeric_limits::max() &&
      BestCount < (opts::ShrinkWrappingThreshold / 100.0) * CurSavingCost) {
    DEBUG({
      auto &InsnToBB = Info.getInsnToBBMap();
      dbgs() << "Better position for saves found in func " << BF.getPrintName()
             << " count << " << BF.getKnownExecutionCount() << "\n";
      dbgs() << "Reg: " << CSR
             << "; New BB: " << InsnToBB[BestPosSave]->getName()
             << " Freq reduction: " << (CurSavingCost - BestCount) << "\n";
    });
    TotalEstimatedWin += CurSavingCost - BestCount;
    ShouldMove = true;
  }

  if (!ShouldMove)
    return false;
  if (!BestPosSave) {
    DEBUG({
      dbgs() << "Dropping opportunity because we don't know where to put "
                "stores -- total est. freq reduc: "
             << TotalEstimatedWin << "\n";
    });
    return false;
  }
  return true;
}

/// Auxiliar function used to create basic blocks for critical edges and update
/// the dominance frontier with these new locations
void ShrinkWrapping::splitFrontierCritEdges(
    BinaryFunction *Func, SmallVector &Frontier,
    const SmallVector &IsCritEdge,
    const SmallVector &From,
    const SmallVector, 4> &To) {
  DEBUG(dbgs() << "splitFrontierCritEdges: Now handling func "
               << BF.getPrintName() << "\n");
  for (size_t I = 0; I < Frontier.size(); ++I) {
    if (!IsCritEdge[I])
      continue;
    if (To[I].empty())
      continue;
    auto FromBB = From[I];
    DEBUG(dbgs() << " - Now handling FrontierBB " << FromBB->getName() << "\n");
    for (auto DestinationBB : To[I]) {
      DEBUG(dbgs() << " - Dest : " << DestinationBB->getName() << "\n");
      auto *NewBB = Func->splitEdge(FromBB, DestinationBB);
      // Insert dummy instruction so this BB is never empty (we need this for
      // PredictiveStackPointerTracking to work, since it annotates instructions
      // and not BBs).
      if (NewBB->empty()) {
        MCInst NewInst;
        BC.MIA->createNoop(NewInst);
        NewBB->addInstruction(std::move(NewInst));
        // The noop is a placeholder only; schedule it for deletion.
        scheduleChange(&*NewBB->begin(), WorklistItem(WorklistItem::Erase, 0));
      }

      // Update frontier
      Frontier[I] = ProgramPoint::getLastPointAt(*NewBB);
    }
  }
}

// Compute where restores must be placed if the save moves to BestPosSave:
// the dominance frontier of the save position, splitting critical edges as
// needed. Returns an empty vector when placement is not possible.
SmallVector
ShrinkWrapping::doRestorePlacement(MCInst *BestPosSave, unsigned CSR,
                                   uint64_t TotalEstimatedWin) {
  SmallVector Frontier;
  SmallVector IsCritEdge;
  bool CannotPlace{false};
  auto &DA = Info.getDominatorAnalysis();

  SmallVector CritEdgesFrom;
  SmallVector, 4> CritEdgesTo;
  // In case of a critical edge, we need to create extra BBs to host restores
  // into edges transitioning to the dominance frontier, otherwise we pull these
  // restores to inside the dominated area.
Frontier = DA.getDominanceFrontierFor(*BestPosSave);
  for (auto &PP : Frontier) {
    bool HasCritEdges{false};
    // A terminator at the frontier that itself uses the CSR blocks placement.
    if (PP.isInst() && BC.MIA->isTerminator(*PP.getInst()) &&
        doesInstUsesCSR(*PP.getInst(), CSR)) {
      CannotPlace = true;
    }
    BinaryBasicBlock *FrontierBB = Info.getParentBB(PP);
    CritEdgesFrom.emplace_back(FrontierBB);
    CritEdgesTo.emplace_back(0);
    auto &Dests = CritEdgesTo.back();
    bool MayNeedLPSplitting{false};
    // Check for invoke instructions at the dominance frontier, which indicates
    // the landing pad is not dominated.
    if (PP.isInst() && BC.MIA->isInvoke(*PP.getInst()))
      MayNeedLPSplitting = true;
    doForAllSuccs(*FrontierBB, [&](ProgramPoint P) {
      if (!DA.doesADominateB(*BestPosSave, P)) {
        Dests.emplace_back(Info.getParentBB(P));
        return;
      }
      HasCritEdges = true;
    });
    // This confirms LP splitting is necessary to continue. Bail.
    if (MayNeedLPSplitting && Dests.empty()) {
      DEBUG(dbgs() << "Bailing on restore placement to avoid LP splitting\n");
      Frontier.clear();
      return Frontier;
    }
    IsCritEdge.push_back(HasCritEdges);
  }
  if (std::accumulate(IsCritEdge.begin(), IsCritEdge.end(), 0)) {
    DEBUG({
      dbgs() << "Now detected critical edges in the following frontier:\n";
      for (auto &PP : Frontier) {
        if (PP.isBB())
          dbgs() << " BB: " << PP.getBB()->getName() << "\n";
        else {
          dbgs() << " Inst: ";
          PP.getInst()->dump();
        }
      }
    });
    splitFrontierCritEdges(&BF, Frontier, IsCritEdge, CritEdgesFrom,
                           CritEdgesTo);
    // BitVectors that represent all insns of the function are invalid now
    // since we changed BBs/Insts. Re-run steps that depend on pointers being
    // valid
    Info.invalidateAll();
    classifyCSRUses();
  }
  if (CannotPlace) {
    DEBUG({
      dbgs() << "Dropping opportunity because restore placement failed"
                " -- total est. freq reduc: "
             << TotalEstimatedWin << "\n";
    });
    Frontier.clear();
    return Frontier;
  }
  return Frontier;
}

// Check whether it is legal to use push/pop instructions (instead of plain
// loads/stores) when moving the save of CSR to BestPosSave.
bool ShrinkWrapping::validatePushPopsMode(unsigned CSR, MCInst *BestPosSave,
                                          int64_t SaveOffset) {
  if (FA.requiresAlignment(BF)) {
    DEBUG({
      dbgs() << "Reg " << CSR << " is not using push/pops due to function "
                "alignment requirements.\n";
    });
    return false;
  }
  for (MCInst *Save : CSA.getSavesByReg(CSR)) {
    if (!SLM.canCollapseRegion(Save)) {
      DEBUG(dbgs() << "Reg " << CSR << " cannot collapse region.\n");
      return false;
    }
  }

  auto &SPT = Info.getStackPointerTracking();
  // Abort if we are inserting a push into an entry BB (offset -8) and this
  // func sets up a frame pointer.
  if (!SLM.canInsertRegion(BestPosSave) ||
      SaveOffset == SPT.SUPERPOSITION || SaveOffset == SPT.EMPTY ||
      (SaveOffset == -8 && SPT.HasFramePointer)) {
    DEBUG({
      dbgs() << "Reg " << CSR << " cannot insert region or we are "
                "trying to insert a push into entry bb.\n";
    });
    return false;
  }
  return true;
}

// Adjust restore points so each pop lands where the stack pointer matches
// SaveOffset and the CSR has no live uses past that point. Returns the fixed
// points, or an empty vector to signal fallback to load/store mode.
SmallVector ShrinkWrapping::fixPopsPlacements(
    const SmallVector &RestorePoints, int64_t SaveOffset,
    unsigned CSR) {
  SmallVector FixedRestorePoints = RestorePoints;
  // Moving pop locations to the correct sp offset
  auto &RI = Info.getReachingInsnsBackwards();
  auto &SPT = Info.getStackPointerTracking();
  for (auto &PP : FixedRestorePoints) {
    auto *BB = Info.getParentBB(PP);
    auto Found = false;
    if (SPT.getStateAt(ProgramPoint::getLastPointAt(*BB))->first ==
        SaveOffset) {
      BitVector BV = *RI.getStateAt(ProgramPoint::getLastPointAt(*BB));
      BV &= UsesByReg[CSR];
      if (!BV.any()) {
        Found = true;
        PP = BB;
        continue;
      }
    }
    // Walk the BB backwards looking for a point with the right SP offset
    // and no reaching uses of the CSR.
    for (auto RIt = BB->rbegin(), End = BB->rend(); RIt != End; ++RIt) {
      if (SPT.getStateBefore(*RIt)->first == SaveOffset) {
        BitVector BV = *RI.getStateAt(*RIt);
        BV &= UsesByReg[CSR];
        if (!BV.any()) {
          Found = true;
          PP = &*RIt;
          break;
        }
      }
    }
    if (!Found) {
      DEBUG({
        dbgs() <<
"Could not find restore insertion point for " << CSR
               << ", falling back to load/store mode\n";
      });
      FixedRestorePoints.clear();
      return FixedRestorePoints;
    }
  }
  return FixedRestorePoints;
}

// Schedule removal (or conversion to SP adjustments) of the original
// save/restore instructions for CSR, along with their associated CFIs.
// Remembers deleted push/pop CFI indices so they can be rebuilt at the new
// locations.
void ShrinkWrapping::scheduleOldSaveRestoresRemoval(unsigned CSR,
                                                    bool UsePushPops) {

  for (auto &BB : BF.layout()) {
    std::vector CFIs;
    // Iterate in reverse so CFIs following a save/restore are seen first.
    for (auto I = BB->rbegin(), E = BB->rend(); I != E; ++I) {
      auto &Inst = *I;
      if (BC.MIA->isCFI(Inst)) {
        // Delete all offset CFIs related to this CSR
        if (SLM.getOffsetCFIReg(Inst) == CSR) {
          HasDeletedOffsetCFIs[CSR] = true;
          scheduleChange(&Inst, WorklistItem(WorklistItem::Erase, CSR));
          continue;
        }
        CFIs.push_back(&Inst);
        continue;
      }

      auto SavedReg = CSA.getSavedReg(Inst);
      auto RestoredReg = CSA.getRestoredReg(Inst);
      if (SavedReg != CSR && RestoredReg != CSR) {
        CFIs.clear();
        continue;
      }

      scheduleChange(&Inst, WorklistItem(UsePushPops
                                             ? WorklistItem::Erase
                                             : WorklistItem::ChangeToAdjustment,
                                         CSR));

      // Delete associated CFIs
      const bool RecordDeletedPushCFIs =
          SavedReg == CSR && DeletedPushCFIs[CSR].empty();
      const bool RecordDeletedPopCFIs =
          RestoredReg == CSR && DeletedPopCFIs[CSR].empty();
      for (MCInst *CFI : CFIs) {
        auto *MCCFI = BF.getCFIFor(*CFI);
        // Do not touch these...
        if (MCCFI->getOperation() == MCCFIInstruction::OpRestoreState ||
            MCCFI->getOperation() == MCCFIInstruction::OpRememberState)
          continue;
        scheduleChange(CFI, WorklistItem(WorklistItem::Erase, CSR));
        if (RecordDeletedPushCFIs) {
          // Do not record this to be replayed later because we are going to
          // rebuild it.
          if (MCCFI->getOperation() == MCCFIInstruction::OpDefCfaOffset)
            continue;
          DeletedPushCFIs[CSR].push_back(CFI->getOperand(0).getImm());
        }
        if (RecordDeletedPopCFIs) {
          if (MCCFI->getOperation() == MCCFIInstruction::OpDefCfaOffset)
            continue;
          DeletedPopCFIs[CSR].push_back(CFI->getOperand(0).getImm());
        }
      }
      CFIs.clear();
    }
  }
}

// True when Inst touches CSR for a purpose other than saving/restoring it.
bool ShrinkWrapping::doesInstUsesCSR(const MCInst &Inst, uint16_t CSR) {
  if (BC.MIA->isCFI(Inst) || CSA.getSavedReg(Inst) == CSR ||
      CSA.getRestoredReg(Inst) == CSR)
    return false;
  BitVector BV = BitVector(BC.MRI->getNumRegs(), false);
  BC.MIA->getTouchedRegs(Inst, BV, *BC.MRI);
  return BV[CSR];
}

// Schedule the insertion of the new save at BestPosSave and of restores at
// every point in RestorePoints, using push/pop or load/store form.
void ShrinkWrapping::scheduleSaveRestoreInsertions(
    unsigned CSR, MCInst *BestPosSave,
    SmallVector &RestorePoints, bool UsePushPops) {
  auto &InsnToBB = Info.getInsnToBBMap();
  auto FIESave = CSA.SaveFIEByReg[CSR];
  auto FIELoad = CSA.LoadFIEByReg[CSR];
  assert(FIESave && FIELoad && "Invalid CSR");

  DEBUG({
    dbgs() << "Scheduling save insertion at: ";
    BestPosSave->dump();
  });

  scheduleChange(BestPosSave, UsePushPops ? WorklistItem::InsertPushOrPop
                                          : WorklistItem::InsertLoadOrStore,
                 *FIESave, CSR);

  for (auto &PP : RestorePoints) {
    BinaryBasicBlock *FrontierBB = Info.getParentBB(PP);
    DEBUG({
      dbgs() << "Scheduling restore insertion at: ";
      if (PP.isInst())
        PP.getInst()->dump();
      else {
        dbgs() << PP.getBB()->getName() << "\n";
      }
    });
    // Never insert after a terminator; hoist the restore before it.
    MCInst *Term =
        FrontierBB->getTerminatorBefore(PP.isInst() ? PP.getInst() : nullptr);
    if (Term)
      PP = Term;
    if (PP.isInst() && doesInstUsesCSR(*PP.getInst(), CSR)) {
      assert(!InsnToBB[PP.getInst()]->hasTerminatorAfter(PP.getInst()) &&
             "cannot move to end of bb");
      scheduleChange(InsnToBB[PP.getInst()],
                     UsePushPops ? WorklistItem::InsertPushOrPop
                                 : WorklistItem::InsertLoadOrStore,
                     *FIELoad, CSR);
      continue;
    }
    scheduleChange(PP, UsePushPops ?
WorklistItem::InsertPushOrPop
                       : WorklistItem::InsertLoadOrStore,
                   *FIELoad, CSR);
  }
}

// Top-level driver for one function: for each profitable CSR, schedule removal
// of old spills and insertion of new ones, preferring push/pop mode when every
// CSR in the function supports it.
void ShrinkWrapping::moveSaveRestores() {
  bool DisablePushPopMode{false};
  bool UsedPushPopMode{false};

  for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) {
    MCInst *BestPosSave{nullptr};
    uint64_t TotalEstimatedWin{0};
    if (!isBestSavePosCold(I, BestPosSave, TotalEstimatedWin))
      continue;
    SmallVector RestorePoints =
        doRestorePlacement(BestPosSave, I, TotalEstimatedWin);
    if (RestorePoints.empty())
      continue;

    auto FIESave = CSA.SaveFIEByReg[I];
    auto FIELoad = CSA.LoadFIEByReg[I];
    assert(FIESave && FIELoad);
    auto &SPT = Info.getStackPointerTracking();
    auto SaveOffset = SPT.getStateBefore(*BestPosSave)->first;
    auto SaveSize = FIESave->Size;

    // Operation mode: if true, will insert push/pops instead of loads/restores
    bool UsePushPops = validatePushPopsMode(I, BestPosSave, SaveOffset);

    if (UsePushPops) {
      auto FixedRestorePoints = fixPopsPlacements(RestorePoints, SaveOffset, I);
      if (FixedRestorePoints.empty())
        UsePushPops = false;
      else
        RestorePoints = FixedRestorePoints;
    }

    // Disable push-pop mode for all CSRs in this function
    if (!UsePushPops)
      DisablePushPopMode = true;
    else
      UsedPushPopMode = true;

    scheduleOldSaveRestoresRemoval(I, UsePushPops);
    scheduleSaveRestoreInsertions(I, BestPosSave, RestorePoints, UsePushPops);

    // Schedule modifications to stack-accessing instructions via
    // StackLayoutModifier
    if (UsePushPops) {
      for (MCInst *Save : CSA.getSavesByReg(I)) {
        SLM.collapseRegion(Save);
      }
      SLM.insertRegion(BestPosSave, SaveSize);
    }

    // Stats collection
    if (UsePushPops)
      ++SpillsMovedPushPopMode;
    else
      ++SpillsMovedRegularMode;
  }

  // Revert push-pop mode if it failed for a single CSR
  if (DisablePushPopMode && UsedPushPopMode) {
    for (auto &BB : BF) {
      auto WRI = Todo.find(&BB);
      if (WRI != Todo.end()) {
        auto &TodoList = WRI->second;
        for (auto &Item : TodoList) {
          if (Item.Action == WorklistItem::InsertPushOrPop)
            Item.Action = WorklistItem::InsertLoadOrStore;
        }
      }
      for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) {
        auto &Inst = *I;
        auto TodoList = BC.MIA->tryGetAnnotationAs>(
            Inst, getAnnotationName());
        if (!TodoList)
          continue;
        bool isCFI = BC.MIA->isCFI(Inst);
        for (auto &Item : *TodoList) {
          if (Item.Action == WorklistItem::InsertPushOrPop)
            Item.Action = WorklistItem::InsertLoadOrStore;
          // Erasing a push/pop is only valid in push-pop mode; downgrade to
          // an SP adjustment (CFIs scheduled for erase stay erased).
          if (!isCFI && Item.Action == WorklistItem::Erase)
            Item.Action = WorklistItem::ChangeToAdjustment;
        }
      }
    }
  }
}

namespace {

// A special StackPointerTracking that compensates for our future plans
// in removing/adding insn.
class PredictiveStackPointerTracking
    : public StackPointerTrackingBase {
  friend class DataflowAnalysis>;
  decltype(ShrinkWrapping::Todo) &TodoMap;
  DataflowInfoManager &Info;

protected:
  // Apply the net SP effect of the scheduled worklist items at Point to the
  // tracked (SP, FP) state Res.
  void compNextAux(const MCInst &Point,
                   const std::vector &TodoItems,
                   std::pair &Res) {
    for (const auto &Item : TodoItems) {
      if (Item.Action == ShrinkWrapping::WorklistItem::Erase &&
          BC.MIA->isPush(Point)) {
        Res.first += BC.MIA->getPushSize(Point);
        continue;
      }
      if (Item.Action == ShrinkWrapping::WorklistItem::Erase &&
          BC.MIA->isPop(Point)) {
        Res.first -= BC.MIA->getPopSize(Point);
        continue;
      }
      if (Item.Action == ShrinkWrapping::WorklistItem::InsertPushOrPop &&
          Item.FIEToInsert.IsStore) {
        Res.first -= Item.FIEToInsert.Size;
        continue;
      }
      if (Item.Action == ShrinkWrapping::WorklistItem::InsertPushOrPop &&
          Item.FIEToInsert.IsLoad) {
        Res.first += Item.FIEToInsert.Size;
        continue;
      }
    }
  }

  // Transfer function: base SP tracking, then compensation for scheduled
  // insertions/deletions annotated on the instruction and on the BB end.
  std::pair computeNext(const MCInst &Point,
                        const std::pair &Cur) {
    std::pair Res =
        StackPointerTrackingBase::computeNext(
            Point, Cur);
    if (Res.first == StackPointerTracking::SUPERPOSITION ||
        Res.first == StackPointerTracking::EMPTY)
      return Res;
    auto TodoItems =
        BC.MIA->tryGetAnnotationAs>(
            Point,
ShrinkWrapping::getAnnotationName());
    if (TodoItems)
      compNextAux(Point, *TodoItems, Res);
    auto &InsnToBBMap = Info.getInsnToBBMap();
    // Also account for items scheduled at the end of this BB, but only when
    // Point is the BB's last instruction.
    if (&*InsnToBBMap[&Point]->rbegin() != &Point)
      return Res;
    auto WRI = TodoMap.find(InsnToBBMap[&Point]);
    if (WRI == TodoMap.end())
      return Res;
    compNextAux(Point, WRI->second, Res);
    return Res;
  }

  StringRef getAnnotationName() const {
    return StringRef("PredictiveStackPointerTracking");
  }

public:
  PredictiveStackPointerTracking(const BinaryContext &BC, BinaryFunction &BF,
                                 decltype(ShrinkWrapping::Todo) &TodoMap,
                                 DataflowInfoManager &Info)
      : StackPointerTrackingBase(BC, BF),
        TodoMap(TodoMap), Info(Info) {}

  void run() {
    NamedRegionTimer T1("PSPT", "Dataflow", true);
    StackPointerTrackingBase::run();
  }
};

} // end anonymous namespace

// Re-emit CFI for CSR after its save/restore moved: find the (new) save
// instruction, then walk the layout emitting offset/restore CFIs at every
// transition into or out of the region dominated by the save.
void ShrinkWrapping::insertUpdatedCFI(unsigned CSR, int SPValPush,
                                      int SPValPop) {
  MCInst *SavePoint{nullptr};
  // Locate the store of CSR to the stack — the relocated save.
  for (auto &BB : BF) {
    for (auto InstIter = BB.rbegin(), EndIter = BB.rend(); InstIter != EndIter;
         ++InstIter) {
      int32_t SrcImm{0};
      MCPhysReg Reg{0};
      MCPhysReg StackPtrReg{0};
      int64_t StackOffset{0};
      bool IsIndexed{false};
      bool IsLoad{false};
      bool IsStore{false};
      bool IsSimple{false};
      bool IsStoreFromReg{false};
      uint8_t Size{0};
      if (!BC.MIA->isStackAccess(*InstIter, IsLoad, IsStore, IsStoreFromReg,
                                 Reg, SrcImm, StackPtrReg, StackOffset, Size,
                                 IsSimple, IsIndexed))
        continue;
      if (Reg != CSR || !IsStore)
        continue;
      SavePoint = &*InstIter;
      break;
    }
    if (SavePoint)
      break;
  }
  assert(SavePoint);
  DEBUG({
    dbgs() << "Now using as save point for reg " << CSR << " :";
    SavePoint->dump();
  });
  bool PrevAffectedZone{false};
  BinaryBasicBlock *PrevBB{nullptr};
  auto &DA = Info.getDominatorAnalysis();
  for (auto BB : BF.layout()) {
    if (BB->size() == 0)
      continue;
    const bool InAffectedZoneAtEnd = DA.count(*BB->rbegin(), *SavePoint);
    const bool InAffectedZoneAtBegin =
        (*DA.getStateBefore(*BB->begin()))[DA.ExprToIdx[SavePoint]];
    bool InAffectedZone = InAffectedZoneAtBegin;
    for (auto InstIter = BB->begin(); InstIter != BB->end(); ++InstIter) {
      const bool CurZone = DA.count(*InstIter, *SavePoint);
      if (InAffectedZone != CurZone) {
        auto InsertionIter = InstIter;
        ++InsertionIter;
        InAffectedZone = CurZone;
        if (InAffectedZone) {
          InstIter = --insertCFIsForPushOrPop(*BB, InsertionIter, CSR, true, 0,
                                              SPValPop);
        } else {
          InstIter = --insertCFIsForPushOrPop(*BB, InsertionIter, CSR, false, 0,
                                              SPValPush);
        }
      }
    }
    // Layout-order discontinuity: patch the previous BB's end if the zone
    // changes across the fall-through boundary.
    if (InAffectedZoneAtBegin != PrevAffectedZone) {
      if (InAffectedZoneAtBegin) {
        insertCFIsForPushOrPop(*PrevBB, PrevBB->end(), CSR, true, 0, SPValPush);
      } else {
        insertCFIsForPushOrPop(*PrevBB, PrevBB->end(), CSR, false, 0, SPValPop);
      }
    }
    PrevAffectedZone = InAffectedZoneAtEnd;
    PrevBB = BB;
  }
}

// Drop all old OpDefCfaOffset CFIs and re-emit them from the (updated) stack
// pointer tracking state, following layout order.
void ShrinkWrapping::rebuildCFIForSP() {
  // Tag old SP-offset CFIs for deletion; actual removal happens at the end
  // so iteration stays valid.
  for (auto &BB : BF) {
    for (auto &Inst : BB) {
      if (!BC.MIA->isCFI(Inst))
        continue;
      auto *CFI = BF.getCFIFor(Inst);
      if (CFI->getOperation() == MCCFIInstruction::OpDefCfaOffset)
        BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "DeleteMe", 0U);
    }
  }

  int PrevSPVal{-8};
  BinaryBasicBlock *PrevBB{nullptr};
  auto &SPT = Info.getStackPointerTracking();
  for (auto BB : BF.layout()) {
    if (BB->size() == 0)
      continue;
    const int SPValAtEnd = SPT.getStateAt(*BB->rbegin())->first;
    const int SPValAtBegin = SPT.getStateBefore(*BB->begin())->first;
    int SPVal = SPValAtBegin;
    for (auto Iter = BB->begin(); Iter != BB->end(); ++Iter) {
      const int CurVal = SPT.getStateAt(*Iter)->first;
      if (SPVal != CurVal) {
        auto InsertionIter = Iter;
        ++InsertionIter;
        Iter = BF.addCFIInstruction(
            BB, InsertionIter,
            MCCFIInstruction::createDefCfaOffset(nullptr, SPVal));
        SPVal = CurVal;
      }
    }
    if (SPValAtBegin != PrevSPVal) {
      BF.addCFIInstruction(
          PrevBB, PrevBB->end(),
          MCCFIInstruction::createDefCfaOffset(nullptr, SPValAtBegin));
    }
PrevSPVal = SPValAtEnd;
    PrevBB = BB;
  }

  // Now physically remove the CFIs tagged above.
  for (auto &BB : BF)
    for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I)
      if (BC.MIA->hasAnnotation(*I, "DeleteMe"))
        BB.eraseInstruction(&*I);
}

// Build a load/store (optionally converted to push/pop) accessing FIE's slot.
// Addresses via SP when its value is known at this point, otherwise via FP;
// aborts if the target does not support the primitive.
MCInst ShrinkWrapping::createStackAccess(int SPVal, int FPVal,
                                         const FrameIndexEntry &FIE,
                                         bool CreatePushOrPop) {
  MCInst NewInst;
  if (SPVal != StackPointerTracking::SUPERPOSITION &&
      SPVal != StackPointerTracking::EMPTY) {
    if (FIE.IsLoad) {
      if (!BC.MIA->createRestoreFromStack(NewInst, BC.MIA->getStackPointer(),
                                          FIE.StackOffset - SPVal, FIE.RegOrImm,
                                          FIE.Size)) {
        errs() << "createRestoreFromStack: not supported on this platform\n";
        abort();
      }
    } else {
      if (!BC.MIA->createSaveToStack(NewInst, BC.MIA->getStackPointer(),
                                     FIE.StackOffset - SPVal, FIE.RegOrImm,
                                     FIE.Size)) {
        errs() << "createSaveToStack: not supported on this platform\n";
        abort();
      }
    }
    if (CreatePushOrPop)
      BC.MIA->changeToPushOrPop(NewInst);
    return NewInst;
  }
  // SP is unknown here — fall back to frame-pointer addressing.
  assert(FPVal != StackPointerTracking::SUPERPOSITION &&
         FPVal != StackPointerTracking::EMPTY);

  if (FIE.IsLoad) {
    if (!BC.MIA->createRestoreFromStack(NewInst, BC.MIA->getFramePointer(),
                                        FIE.StackOffset - FPVal, FIE.RegOrImm,
                                        FIE.Size)) {
      errs() << "createRestoreFromStack: not supported on this platform\n";
      abort();
    }
  } else {
    if (!BC.MIA->createSaveToStack(NewInst, BC.MIA->getFramePointer(),
                                   FIE.StackOffset - FPVal, FIE.RegOrImm,
                                   FIE.Size)) {
      errs() << "createSaveToStack: not supported on this platform\n";
      abort();
    }
  }
  return NewInst;
}

// Set a CFA-defining CFI's offset to NewOffset, at most once per CFI.
void ShrinkWrapping::updateCFIInstOffset(MCInst &Inst, int64_t NewOffset) {
  auto *CFI = BF.getCFIFor(Inst);
  if (UpdatedCFIs.count(CFI))
    return;

  switch (CFI->getOperation()) {
  case MCCFIInstruction::OpDefCfa:
  case MCCFIInstruction::OpDefCfaRegister:
  case MCCFIInstruction::OpDefCfaOffset:
    CFI->setOffset(NewOffset);
    break;
  case MCCFIInstruction::OpOffset:
  default:
    break;
  }

  UpdatedCFIs.insert(CFI);
}
// Replay the CFIs recorded when the old push (or pop) of Reg was deleted,
// re-inserting them at Pos with offsets rewritten to NewOffset. Returns the
// iterator past the inserted CFIs.
BBIterTy ShrinkWrapping::insertCFIsForPushOrPop(BinaryBasicBlock &BB,
                                                BBIterTy Pos, unsigned Reg,
                                                bool isPush, int Sz,
                                                int64_t NewOffset) {
  if (isPush) {
    for (uint32_t Idx : DeletedPushCFIs[Reg]) {
      Pos = BF.addCFIPseudo(&BB, Pos, Idx);
      updateCFIInstOffset(*Pos++, NewOffset);
    }
    if (HasDeletedOffsetCFIs[Reg]) {
      Pos = ++BF.addCFIInstruction(
          &BB, Pos,
          MCCFIInstruction::createOffset(
              nullptr, BC.MRI->getDwarfRegNum(Reg, false), NewOffset));
    }
  } else {
    for (uint32_t Idx : DeletedPopCFIs[Reg]) {
      Pos = BF.addCFIPseudo(&BB, Pos, Idx);
      updateCFIInstOffset(*Pos++, NewOffset);
    }
    if (HasDeletedOffsetCFIs[Reg]) {
      Pos = ++BF.addCFIInstruction(
          &BB, Pos,
          MCCFIInstruction::createSameValue(
              nullptr, BC.MRI->getDwarfRegNum(Reg, false)));
    }
  }
  return Pos;
}

// Materialize one scheduled save/restore at InsertionPoint (or at the end of
// CurBB) and record the push/pop offset so CFI can later be rebuilt for it.
BBIterTy ShrinkWrapping::processInsertion(BBIterTy InsertionPoint,
                                          BinaryBasicBlock *CurBB,
                                          const WorklistItem &Item,
                                          int64_t SPVal, int64_t FPVal) {
  // Trigger CFI reconstruction for this CSR if necessary - writing to
  // PushOffsetByReg/PopOffsetByReg *will* trigger CFI update
  if ((Item.FIEToInsert.IsStore &&
       !DeletedPushCFIs[Item.AffectedReg].empty()) ||
      (Item.FIEToInsert.IsLoad && !DeletedPopCFIs[Item.AffectedReg].empty()) ||
      HasDeletedOffsetCFIs[Item.AffectedReg]) {
    if (Item.Action == WorklistItem::InsertPushOrPop) {
      if (Item.FIEToInsert.IsStore)
        PushOffsetByReg[Item.AffectedReg] = SPVal - Item.FIEToInsert.Size;
      else
        PopOffsetByReg[Item.AffectedReg] = SPVal;
    } else {
      if (Item.FIEToInsert.IsStore)
        PushOffsetByReg[Item.AffectedReg] = Item.FIEToInsert.StackOffset;
      else
        PopOffsetByReg[Item.AffectedReg] = Item.FIEToInsert.StackOffset;
    }
  }

  DEBUG({
    dbgs() << "Creating stack access with SPVal = " << SPVal
           << "; stack offset = " << Item.FIEToInsert.StackOffset
           << " Is push = " << (Item.Action == WorklistItem::InsertPushOrPop)
           << "\n";
  });
  MCInst NewInst =
      createStackAccess(SPVal, FPVal, Item.FIEToInsert,
                        Item.Action == WorklistItem::InsertPushOrPop);
  if (InsertionPoint != CurBB->end()) {
    DEBUG({
      dbgs() << "Adding before Inst: ";
      InsertionPoint->dump();
      dbgs() << "the following inst: ";
      NewInst.dump();
    });
    return ++CurBB->insertInstruction(InsertionPoint, std::move(NewInst));
  }
  CurBB->addInstruction(std::move(NewInst));
  DEBUG(dbgs() << "Adding to BB!\n");
  return CurBB->end();
}

// Apply every insertion item in TodoList at InsertionPoint, keeping SPVal in
// sync and ordering pops by the dominance order of their registers.
BBIterTy ShrinkWrapping::processInsertionsList(
    BBIterTy InsertionPoint, BinaryBasicBlock *CurBB,
    std::vector &TodoList, int64_t SPVal, int64_t FPVal) {
  bool HasInsertions{false};
  for (auto &Item : TodoList) {
    if (Item.Action == WorklistItem::Erase ||
        Item.Action == WorklistItem::ChangeToAdjustment)
      continue;
    HasInsertions = true;
    break;
  }

  if (!HasInsertions)
    return InsertionPoint;

  assert(((SPVal != StackPointerTracking::SUPERPOSITION &&
           SPVal != StackPointerTracking::EMPTY) ||
          (FPVal != StackPointerTracking::SUPERPOSITION &&
           FPVal != StackPointerTracking::EMPTY)) &&
         "Cannot insert if we have no idea of the stack state here");

  // Revert the effect of PSPT for this location, we want SP Value before
  // insertions
  if (InsertionPoint == CurBB->end()) {
    for (auto &Item : TodoList) {
      if (Item.Action != WorklistItem::InsertPushOrPop)
        continue;
      if (Item.FIEToInsert.IsStore)
        SPVal += Item.FIEToInsert.Size;
      if (Item.FIEToInsert.IsLoad)
        SPVal -= Item.FIEToInsert.Size;
    }
  }

  // Reorder POPs to obey the correct dominance relation between them
  std::stable_sort(TodoList.begin(), TodoList.end(), [&](const WorklistItem &A,
                                                         const WorklistItem
                                                             &B) {
    if ((A.Action != WorklistItem::InsertPushOrPop || !A.FIEToInsert.IsLoad) &&
        (B.Action != WorklistItem::InsertPushOrPop || !B.FIEToInsert.IsLoad))
      return false;
    if ((A.Action != WorklistItem::InsertPushOrPop || !A.FIEToInsert.IsLoad))
      return false;
    if ((B.Action != WorklistItem::InsertPushOrPop || !B.FIEToInsert.IsLoad))
      return true;
    return DomOrder[B.AffectedReg]
< DomOrder[A.AffectedReg];
  });

  // Process insertions
  for (auto &Item : TodoList) {
    if (Item.Action == WorklistItem::Erase ||
        Item.Action == WorklistItem::ChangeToAdjustment)
      continue;

    InsertionPoint =
        processInsertion(InsertionPoint, CurBB, Item, SPVal, FPVal);
    // Track the SP effect of the push/pop we just emitted.
    if (Item.Action == WorklistItem::InsertPushOrPop &&
        Item.FIEToInsert.IsStore) {
      SPVal -= Item.FIEToInsert.Size;
    }
    if (Item.Action == WorklistItem::InsertPushOrPop &&
        Item.FIEToInsert.IsLoad) {
      SPVal += Item.FIEToInsert.Size;
    }
  }
  return InsertionPoint;
}

// Walk all annotated program points and materialize scheduled insertions,
// using PredictiveStackPointerTracking for the post-change SP/FP state.
// Returns true if anything was inserted.
bool ShrinkWrapping::processInsertions() {
  PredictiveStackPointerTracking PSPT(BC, BF, Todo, Info);
  PSPT.run();

  bool Changes{false};
  for (auto &BB : BF) {
    // Process insertions before some inst.
    for (auto I = BB.begin(); I != BB.end(); ++I) {
      auto &Inst = *I;
      auto TodoList = BC.MIA->tryGetAnnotationAs>(
          Inst, getAnnotationName());
      if (!TodoList)
        continue;
      Changes = true;
      auto List = *TodoList;
      DEBUG({
        dbgs() << "Now processing insertions in " << BB.getName()
               << " before inst: ";
        Inst.dump();
      });
      auto Iter = I;
      auto SPTState =
          *PSPT.getStateAt(Iter == BB.begin() ? (ProgramPoint)&BB : &*(--Iter));
      I = processInsertionsList(I, &BB, List, SPTState.first, SPTState.second);
    }
    // Process insertions at the end of bb
    auto WRI = Todo.find(&BB);
    if (WRI != Todo.end()) {
      auto SPTState = *PSPT.getStateAt(*BB.rbegin());
      processInsertionsList(BB.end(), &BB, WRI->second, SPTState.first,
                            SPTState.second);
      Changes = true;
    }
  }
  return Changes;
}

// Execute scheduled Erase/ChangeToAdjustment items: converts old push/pops to
// explicit SP increments/decrements (preserving flags when they are live) or
// erases the instruction outright.
void ShrinkWrapping::processDeletions() {
  auto &LA = Info.getLivenessAnalysis();
  for (auto &BB : BF) {
    for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) {
      auto &Inst = *I;
      auto TodoList = BC.MIA->tryGetAnnotationAs>(
          Inst, getAnnotationName());
      if (!TodoList)
        continue;
      // Process all deletions
      for (auto &Item : *TodoList) {
        if (Item.Action != WorklistItem::Erase &&
            Item.Action != WorklistItem::ChangeToAdjustment)
          continue;

        if (Item.Action == WorklistItem::ChangeToAdjustment) {
          // Is flag reg alive across this func?
          bool DontClobberFlags = LA.isAlive(&Inst, BC.MIA->getFlagsReg());
          if (auto Sz = BC.MIA->getPushSize(Inst)) {
            BC.MIA->createStackPointerIncrement(Inst, Sz, DontClobberFlags);
            continue;
          }
          if (auto Sz = BC.MIA->getPopSize(Inst)) {
            BC.MIA->createStackPointerDecrement(Inst, Sz, DontClobberFlags);
            continue;
          }
        }

        DEBUG({
          dbgs() << "Erasing: ";
          Inst.dump();
        });
        BB.eraseInstruction(&Inst);
        break;
      }
    }
  }
}

// Rebuild CFI after all code changes: SP-offset CFIs first (when no frame
// pointer), then per-register CFI for each CSR whose spill actually moved.
void ShrinkWrapping::rebuildCFI() {
  const bool FP = Info.getStackPointerTracking().HasFramePointer;
  Info.invalidateAll();
  if (!FP) {
    rebuildCFIForSP();
    Info.invalidateAll();
  }
  for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) {
    // Zero means no push/pop was recorded for this register.
    if (PushOffsetByReg[I] == 0 || PopOffsetByReg[I] == 0)
      continue;
    const int64_t SPValPush = PushOffsetByReg[I];
    const int64_t SPValPop = PopOffsetByReg[I];
    insertUpdatedCFI(I, SPValPush, SPValPop);
    Info.invalidateAll();
  }
}

// Entry point: run the full shrink-wrapping pipeline on this function.
void ShrinkWrapping::perform() {
  HasDeletedOffsetCFIs = std::vector(BC.MRI->getNumRegs(), false);
  PushOffsetByReg = std::vector(BC.MRI->getNumRegs(), 0LL);
  PopOffsetByReg = std::vector(BC.MRI->getNumRegs(), 0LL);
  DomOrder = std::vector(BC.MRI->getNumRegs(), 0);

  SLM.initialize();
  CSA.compute();
  classifyCSRUses();
  pruneUnwantedCSRs();
  computeSaveLocations();
  computeDomOrder();
  moveSaveRestores();
  DEBUG({
    dbgs() << "Func before shrink-wrapping: \n";
    BF.dump();
  });
  SLM.performChanges();
  // Early exit if processInsertions doesn't detect any todo items
  if (!processInsertions())
    return;
  processDeletions();
  rebuildCFI();
  // We may have split edges, creating BBs that need correct branching
  BF.fixBranches();
  DEBUG({
    dbgs() << "Func after shrink-wrapping: \n";
    BF.dump();
  });
}

// Print the pass-wide spill-movement statistics accumulated across functions.
void ShrinkWrapping::printStats() {
  outs() << "BOLT-INFO: Shrink wrapping moved " << SpillsMovedRegularMode
         << " spills inserting load/stores and " << SpillsMovedPushPopMode
         << " spills inserting push/pops\n";
}

// Operators necessary as a result of using MCAnnotation
raw_ostream &operator<<(raw_ostream &OS,
                        const std::vector &Vec) {
  OS << "SWTodo[";
  auto Sep = "";
  for (const auto &Item : Vec) {
    OS << Sep;
    switch (Item.Action) {
    case ShrinkWrapping::WorklistItem::Erase:
      OS << "Erase";
      break;
    case ShrinkWrapping::WorklistItem::ChangeToAdjustment:
      OS << "ChangeToAdjustment";
      break;
    case ShrinkWrapping::WorklistItem::InsertLoadOrStore:
      OS << "InsertLoadOrStore";
      break;
    case ShrinkWrapping::WorklistItem::InsertPushOrPop:
      OS << "InsertPushOrPop";
      break;
    }
    Sep = ", ";
  }
  OS << "]";
  return OS;
}

raw_ostream &
operator<<(raw_ostream &OS,
           const std::vector &Vec) {
  OS << "SLMTodo[";
  auto Sep = "";
  for (const auto &Item : Vec) {
    OS << Sep;
    switch (Item.Action) {
    case StackLayoutModifier::WorklistItem::None:
      OS << "None";
      break;
    case StackLayoutModifier::WorklistItem::AdjustLoadStoreOffset:
      OS << "AdjustLoadStoreOffset";
      break;
    case
StackLayoutModifier::WorklistItem::AdjustCFI: + OS << "AdjustCFI"; + break; + } + Sep = ", "; + } + OS << "]"; + return OS; +} + +bool operator==(const ShrinkWrapping::WorklistItem &A, + const ShrinkWrapping::WorklistItem &B) { + return (A.Action == B.Action && A.AffectedReg == B.AffectedReg && + A.Adjustment == B.Adjustment && + A.FIEToInsert.IsLoad == B.FIEToInsert.IsLoad && + A.FIEToInsert.IsStore == B.FIEToInsert.IsStore && + A.FIEToInsert.RegOrImm == B.FIEToInsert.RegOrImm && + A.FIEToInsert.Size == B.FIEToInsert.Size && + A.FIEToInsert.IsSimple == B.FIEToInsert.IsSimple && + A.FIEToInsert.StackOffset == B.FIEToInsert.StackOffset); +} + +bool operator==(const StackLayoutModifier::WorklistItem &A, + const StackLayoutModifier::WorklistItem &B) { + return (A.Action == B.Action && A.OffsetUpdate == B.OffsetUpdate); +} + +} // end namespace bolt +} // end namespace llvm diff --git a/bolt/Passes/ShrinkWrapping.h b/bolt/Passes/ShrinkWrapping.h new file mode 100644 index 000000000000..7c28dea5ba47 --- /dev/null +++ b/bolt/Passes/ShrinkWrapping.h @@ -0,0 +1,477 @@ +//===--- Passes/ShrinkWrapping.h ------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_SHRINKWRAPPING_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_SHRINKWRAPPING_H + +#include "BinaryPasses.h" +#include "FrameAnalysis.h" +#include "DataflowInfoManager.h" + +namespace llvm { +namespace bolt { + +/// Encapsulates logic required to analyze a binary function and detect which +/// registers are being saved as callee-saved, where are these saves and where +/// are the points where their original value are being restored. 
+class CalleeSavedAnalysis { + const FrameAnalysis &FA; + const BinaryContext &BC; + BinaryFunction &BF; + DataflowInfoManager &Info; + + /// Compute all stores of callee-saved regs. Those are the ones that stores a + /// register whose definition is not local. + void analyzeSaves(); + + /// Similar to analyzeSaves, tries to determine all instructions that recover + /// the original value of the callee-saved register before exiting the + /// function. + void analyzeRestores(); + + /// Returns the identifying string used to annotate instructions with metadata + /// for this analysis. These are deleted in the destructor. + static StringRef getSaveTag() { + return StringRef("CSA-SavedReg"); + } + static StringRef getRestoreTag() { + return StringRef("CSA-RestoredReg"); + } + +public: + BitVector CalleeSaved; + std::vector OffsetsByReg; + BitVector HasRestores; + std::vector SavingCost; + std::vector SaveFIEByReg; + std::vector LoadFIEByReg; + + CalleeSavedAnalysis(const FrameAnalysis &FA, const BinaryContext &BC, + BinaryFunction &BF, DataflowInfoManager &Info) + : FA(FA), BC(BC), BF(BF), Info(Info), + CalleeSaved(BC.MRI->getNumRegs(), false), + OffsetsByReg(BC.MRI->getNumRegs(), 0LL), + HasRestores(BC.MRI->getNumRegs(), false), + SavingCost(BC.MRI->getNumRegs(), 0ULL), + SaveFIEByReg(BC.MRI->getNumRegs(), nullptr), + LoadFIEByReg(BC.MRI->getNumRegs(), nullptr) {} + + ~CalleeSavedAnalysis(); + + void compute() { + analyzeSaves(); + analyzeRestores(); + } + + /// Retrieves the value of the callee-saved register that is saved by this + /// instruction or 0 if this is not a CSR save instruction. + uint16_t getSavedReg(const MCInst &Inst) { + auto Val = BC.MIA->tryGetAnnotationAs( + Inst, getSaveTag()); + if (Val) + return *Val; + return 0; + } + + /// Retrieves the value of the callee-saved register that is restored by this + /// instruction or 0 if this is not a CSR restore instruction. 
+ uint16_t getRestoredReg(const MCInst &Inst) { + auto Val = BC.MIA->tryGetAnnotationAs( + Inst, getRestoreTag()); + if (Val) + return *Val; + return 0; + } + + /// Routines to compute all saves/restores for a Reg (needs to traverse all + /// instructions). + std::vector getSavesByReg(uint16_t Reg); + std::vector getRestoresByReg(uint16_t Reg); +}; + +/// Identifies in a given binary function all stack regions being used and allow +/// us to edit the layout, removing or inserting new regions. When the layout is +/// modified, all affected stack-accessing instructions are updated. +class StackLayoutModifier { + const FrameAnalysis &FA; + const BinaryContext &BC; + BinaryFunction &BF; + DataflowInfoManager &Info; + + // Keep track of stack slots we know how to safely move + std::map AvailableRegions; + + DenseSet CollapsedRegions; + DenseSet InsertedRegions; + + // A map of chunks of stack memory we don't really know what's happening there + // and we need to leave it untouched. + std::map BlacklistedRegions; + + // Maps stack slots to the regs that are saved to them + DenseMap> RegionToRegMap; + DenseMap> RegToRegionMap; + + // If we can't understand how to move stack slots, IsSimple will be false + bool IsSimple{true}; + + bool IsInitialized{false}; + +public: + // Keep a worklist of operations to perform on the function to perform + // the requested layout modifications via collapseRegion()/insertRegion(). + struct WorklistItem { + enum ActionType : uint8_t { + None = 0, + AdjustLoadStoreOffset, + AdjustCFI, + } Action; + + int64_t OffsetUpdate{0}; + WorklistItem() : Action(None) {} + WorklistItem(ActionType Action) : Action(Action) {} + WorklistItem(ActionType Action, int OffsetUpdate) + : Action(Action), OffsetUpdate(OffsetUpdate) {} + }; +private: + + /// Mark the stack region identified by \p Offset and \p Size to be a + /// no-touch zone, whose accesses cannot be relocated to another region. 
+ void blacklistRegion(int64_t Offset, int64_t Size); + + /// Check if this region overlaps with blacklisted addresses + bool isRegionBlacklisted(int64_t Offset, int64_t Size); + + /// Check if the region identified by \p Offset and \p Size has any conflicts + /// with available regions so far. If it has, blacklist all involved regions + /// and return true. + bool blacklistAllInConflictWith(int64_t Offset, int64_t Size); + + /// If \p Point is identified as frame pointer initialization (defining the + /// value of FP with SP), check for non-standard initialization that precludes + /// us from changing the stack layout. If positive, update blacklisted + /// regions. + void checkFramePointerInitialization(MCInst &Point); + + /// Make sense of each stack offset we can freely change + void classifyStackAccesses(); + void classifyCFIs(); + + /// Used to keep track of modifications to the function that will later be + /// performed by performChanges(); + void scheduleChange(MCInst &Inst, WorklistItem Item); + static StringRef getTodoTagName() { + return StringRef("SLM-TodoTag"); + } + static StringRef getSlotTagName() { + return StringRef("SLM-SlotTag"); + } + static StringRef getOffsetCFIRegTagName() { + return StringRef("SLM-OffsetCFIReg"); + } + +public: + StackLayoutModifier(const FrameAnalysis &FA, const BinaryContext &BC, + BinaryFunction &BF, DataflowInfoManager &Info) + : FA(FA), BC(BC), BF(BF), Info(Info) {} + + ~StackLayoutModifier() { + for (auto &BB : BF) { + for (auto &Inst : BB) { + BC.MIA->removeAnnotation(Inst, getTodoTagName()); + BC.MIA->removeAnnotation(Inst, getSlotTagName()); + BC.MIA->removeAnnotation(Inst, getOffsetCFIRegTagName()); + } + } + } + + /// Retrieves the register recorded in this instruction's offset-CFI + /// annotation, or 0 if no such annotation is attached. 
+ uint16_t getOffsetCFIReg(const MCInst &Inst) { + auto Val = + BC.MIA->tryGetAnnotationAs(Inst, getOffsetCFIRegTagName()); + if (Val) + return *Val; + return 0; + } + + /// Check if it is possible to delete the push instruction \p DeletedPush. + /// This involves collapsing the region accessed by this push and updating all + /// other instructions that access affected memory regions. Return true if we + /// can update this. + bool canCollapseRegion(int64_t RegionAddr); + bool canCollapseRegion(MCInst *DeletedPush); + + /// Notify the layout manager that \p DeletedPush was deleted and that it + /// needs to update other affected stack-accessing instructions. + bool collapseRegion(MCInst *Alloc, int64_t RegionAddr, int64_t RegionSize); + bool collapseRegion(MCInst *DeletedPush); + + /// Set the new stack address difference for load/store instructions that + /// referenced a stack location that was deleted via collapseRegion. + void setOffsetForCollapsedAccesses(int64_t NewOffset); + + /// Check if it is possible to insert a push instruction at point \p P. + /// This involves inserting a new region in the stack, possibly affecting + /// instructions that access the frame. Return true if we can update them all. + bool canInsertRegion(ProgramPoint P); + + /// Notify the layout manager that a new push instruction has been inserted + /// at point \p P and that it will need to update relevant instructions. + bool insertRegion(ProgramPoint P, int64_t RegionSz); + + /// Perform all changes scheduled by collapseRegion()/insertRegion() + void performChanges(); + + /// Perform initial assessment of the function trying to understand its stack + /// accesses. + void initialize(); +}; + +/// Implements a pass to optimize callee-saved register spills. These spills +/// typically happen at function prologue/epilogue. When these are hot basic +/// blocks, this pass will try to move these spills to cold blocks whenever +/// possible. 
+class ShrinkWrapping { + const FrameAnalysis &FA; + const BinaryContext &BC; + BinaryFunction &BF; + DataflowInfoManager &Info; + StackLayoutModifier SLM; + /// For each CSR, store a vector of all CFI indexes deleted as a consequence + /// of moving this Callee-Saved Reg + DenseMap> DeletedPushCFIs; + DenseMap> DeletedPopCFIs; + std::vector HasDeletedOffsetCFIs; + SmallPtrSet UpdatedCFIs; + std::vector UsesByReg; + std::vector PushOffsetByReg; + std::vector PopOffsetByReg; + std::vector DomOrder; + CalleeSavedAnalysis CSA; + std::vector> SavePos; + std::vector BestSaveCount; + std::vector BestSavePos; + + /// Pass stats + static uint64_t SpillsMovedRegularMode; + static uint64_t SpillsMovedPushPopMode; + + /// Allow our custom worklist-sensitive analysis + /// PredictiveStackPointerTracking to access WorklistItem +public: + struct WorklistItem { + enum ActionType : uint8_t { + Erase = 0, + ChangeToAdjustment, + InsertLoadOrStore, + InsertPushOrPop + } Action; + FrameIndexEntry FIEToInsert; + unsigned AffectedReg; + int Adjustment{0}; + WorklistItem(ActionType Action, unsigned AffectedReg) + : Action(Action), FIEToInsert(), AffectedReg(AffectedReg) {} + WorklistItem(ActionType Action, unsigned AffectedReg, int Adjustment) + : Action(Action), FIEToInsert(), AffectedReg(AffectedReg), + Adjustment(Adjustment) {} + WorklistItem(ActionType Action, const FrameIndexEntry &FIE, + unsigned AffectedReg) + : Action(Action), FIEToInsert(FIE), AffectedReg(AffectedReg) {} + }; + + /// Insertion todo items scheduled to happen at the end of BBs. Since we + /// can't annotate BBs we maintain this bookkeeping here. 
+ DenseMap> Todo; + + /// Annotation name used to tag instructions with removal or insertion actions + static StringRef getAnnotationName() { + return StringRef("ShrinkWrap-Todo"); + } +private: + using BBIterTy = BinaryBasicBlock::iterator; + + /// Calculate all possible uses/defs of these callee-saved regs + void classifyCSRUses(); + + // Ensure we don't work on cases where there are no uses of the callee-saved + // register. These unnecessary spills should have been removed by previous + // passes. + void pruneUnwantedCSRs(); + + // Map regs to their possible save possibilities (at start of these BBs) + void computeSaveLocations(); + + /// Look into the best save location found for saving callee-saved reg + /// \p CSR and evaluates whether we would benefit by moving the spill to this + /// new save location. Returns true in case it is profitable to perform the + /// move. + bool validateBestSavePos(unsigned CSR, MCInst *&BestPosSave, + uint64_t &TotalEstimatedWin); + + /// Populate the Todo map with worklistitems to change the function + template + void scheduleChange(ProgramPoint PP, T&& ...Item) { + if (PP.isInst()) { + auto &WList = BC.MIA->getOrCreateAnnotationAs>( + BC.Ctx.get(), *PP.getInst(), getAnnotationName()); + WList.emplace_back(std::forward(Item)...); + return; + } + // Avoid inserting on BBs with no instructions because we have a dataflow + // analysis that depends on insertions happening before real instructions + // (PredictiveStackPointerTracking) + BinaryBasicBlock *BB = PP.getBB(); + if (BB->size() != 0) { + Todo[BB].emplace_back(std::forward(Item)...); + return; + } + while (BB->size() == 0) { + assert (BB->succ_size() == 1); + BB = *BB->succ_begin(); + } + auto &WList = BC.MIA->getOrCreateAnnotationAs>( + BC.Ctx.get(), *BB->begin(), getAnnotationName()); + WList.emplace_back(std::forward(Item)...); + } + + /// Determine the POP ordering according to which CSR save is the dominator. 
+ void computeDomOrder(); + + /// Check that the best possible location for a spill save (as determined by + /// computeSaveLocations) is cold enough to be worth moving the save to it. + /// \p CSR is the callee-saved register number, \p BestPosSave returns the + /// pointer to the cold location in case the function returns true, while + /// \p TotalEstimatedWin contains the estimated dynamic instruction count + /// reduction after moving. + bool isBestSavePosCold(unsigned CSR, MCInst *&BestPosSave, + uint64_t &TotalEstimatedWin); + + /// Auxiliary function used to create basic blocks for critical edges and + /// update the dominance frontier with these new locations + void splitFrontierCritEdges( + BinaryFunction *Func, SmallVector &Frontier, + const SmallVector &IsCritEdge, + const SmallVector &From, + const SmallVector, 4> &To); + + /// After the best save location for a spill has been established in + /// \p BestPosSave for reg \p CSR, compute adequate locations to restore + /// the spilled value. This will be at the dominance frontier. + /// Returns an empty vector if we failed. In case of success, set + /// \p UsePushPops to true if we can operate in the push/pops mode. + SmallVector doRestorePlacement(MCInst *BestPosSave, + unsigned CSR, + uint64_t TotalEstimatedWin); + + /// Checks whether using push and pops (instead of the longer load-store + /// counterparts) is correct for reg \p CSR + bool validatePushPopsMode(unsigned CSR, MCInst *BestPosSave, + int64_t SaveOffset); + + /// Adjust restore locations to the correct SP offset if we are using POPs + /// instead of random-access load instructions. 
+ SmallVector + fixPopsPlacements(const SmallVector &RestorePoints, + int64_t SaveOffset, unsigned CSR); + + /// When moving spills, mark all old spill locations to be deleted + void scheduleOldSaveRestoresRemoval(unsigned CSR, bool UsePushPops); + /// Return true if \p Inst uses reg \p CSR + bool doesInstUsesCSR(const MCInst &Inst, uint16_t CSR); + /// When moving spills, mark all new spill locations for insertion + void + scheduleSaveRestoreInsertions(unsigned CSR, MCInst *BestPosSave, + SmallVector &RestorePoints, + bool UsePushPops); + + /// Coordinate the replacement of callee-saved spills from their original + /// place (at prologue and epilogues) to colder basic blocks as determined + /// by computeSaveLocations(). + void moveSaveRestores(); + + /// After the spill locations for reg \p CSR has been moved and all affected + /// CFI has been removed, insert new updated CFI information for these + /// locations. + void insertUpdatedCFI(unsigned CSR, int SPValPush, int SPValPop); + + /// In case the function anchors the CFA reg as SP and we inserted pushes/pops + /// insert def_cfa_offsets at appropriate places (and delete old + /// def_cfa_offsets) + void rebuildCFIForSP(); + + /// Rebuild all CFI for affected Callee-Saved Registers. + void rebuildCFI(); + + /// Create a load-store instruction (depending on the contents of \p FIE). + /// If \p CreatePushOrPop is true, create a push/pop instead. Current SP/FP + /// values, as determined by StackPointerTracking, should be informed via + /// \p SPVal and \p FPVal in order to emit the correct offset from SP/FP. + MCInst createStackAccess(int SPVal, int FPVal, const FrameIndexEntry &FIE, + bool CreatePushOrPop); + + /// Update the CFI referenced by \p Inst with \p NewOffset, if the CFI has + /// an offset. + void updateCFIInstOffset(MCInst &Inst, int64_t NewOffset); + + /// Insert any CFI that should be attached to a register spill save/restore. 
+ BBIterTy insertCFIsForPushOrPop(BinaryBasicBlock &BB, BBIterTy Pos, + unsigned Reg, bool isPush, int Sz, + int64_t NewOffset); + + /// Auxiliary function to processInsertionsList, adding a new instruction + /// before \p InsertionPoint as requested by \p Item. Return an updated + /// InsertionPoint for other instructions that need to be inserted at the same + /// original location, since this insertion may have invalidated the previous + /// location. + BBIterTy processInsertion(BBIterTy InsertionPoint, BinaryBasicBlock *CurBB, + const WorklistItem &Item, int64_t SPVal, + int64_t FPVal); + + /// Auxiliary function to processInsertions(), helping perform all the + /// insertion tasks in the todo list associated with a single insertion point. + /// Return true if at least one insertion was performed. + BBIterTy processInsertionsList(BBIterTy InsertionPoint, + BinaryBasicBlock *CurBB, + std::vector &TodoList, + int64_t SPVal, int64_t FPVal); + + /// Apply all insertion todo tasks regarding insertion of new stores/loads or + /// push/pops at annotated points. Return false if the entire function had + /// no todo tasks annotation and this pass has nothing to do. 
+ bool processInsertions(); + + /// Apply all deletion todo tasks (or tasks to change a push/pop to a memory + /// access no-op) + void processDeletions(); + +public: + ShrinkWrapping(const FrameAnalysis &FA, const BinaryContext &BC, + BinaryFunction &BF, DataflowInfoManager &Info) + : FA(FA), BC(BC), BF(BF), Info(Info), SLM(FA, BC, BF, Info), + CSA(FA, BC, BF, Info) {} + + ~ShrinkWrapping() { + for (auto &BB : BF) { + for (auto &Inst : BB) { + BC.MIA->removeAnnotation(Inst, getAnnotationName()); + } + } + } + + void perform(); + + static void printStats(); +}; + +} // end namespace bolt +} // end namespace llvm + +#endif diff --git a/bolt/Passes/StackAllocationAnalysis.cpp b/bolt/Passes/StackAllocationAnalysis.cpp new file mode 100644 index 000000000000..89f2d2a1c254 --- /dev/null +++ b/bolt/Passes/StackAllocationAnalysis.cpp @@ -0,0 +1,153 @@ +//===--- Passes/StackAllocationAnalysis.cpp -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "StackAllocationAnalysis.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "saa" + +namespace llvm { +namespace bolt { + +void StackAllocationAnalysis::preflight() { + DEBUG(dbgs() << "Starting StackAllocationAnalysis on \"" + << Func.getPrintName() << "\"\n"); + + for (auto &BB : this->Func) { + for (auto &Inst : BB) { + MCPhysReg From, To; + if (!BC.MIA->isPush(Inst) && (!BC.MIA->isRegToRegMove(Inst, From, To) || + To != BC.MIA->getStackPointer() || + From != BC.MIA->getFramePointer()) && + !BC.MII->get(Inst.getOpcode()) + .hasDefOfPhysReg(Inst, BC.MIA->getStackPointer(), *BC.MRI)) + continue; + this->Expressions.push_back(&Inst); + this->ExprToIdx[&Inst] = this->NumInstrs++; + } + } +} + +BitVector +StackAllocationAnalysis::getStartingStateAtBB(const BinaryBasicBlock &BB) { + return BitVector(this->NumInstrs, false); +} + +BitVector +StackAllocationAnalysis::getStartingStateAtPoint(const MCInst &Point) { + return BitVector(this->NumInstrs, false); +} + +void StackAllocationAnalysis::doConfluence(BitVector &StateOut, + const BitVector &StateIn) { + StateOut |= StateIn; +} + +BitVector StackAllocationAnalysis::doKill(const MCInst &Point, + const BitVector &StateIn, + int DeallocSize) { + int64_t SPOffset = SPT.getStateAt(Point)->first; + BitVector Next = StateIn; + if (SPOffset == SPT.SUPERPOSITION || SPOffset == SPT.EMPTY) + return Next; + for (auto I = this->expr_begin(Next), E = this->expr_end(); I != E; ++I) { + const MCInst *Instr = *I; + int64_t InstrOffset = SPT.getStateAt(*Instr)->first; + if (InstrOffset == SPT.SUPERPOSITION || InstrOffset == SPT.EMPTY) + continue; + if (InstrOffset < SPOffset) { + Next.reset(I.getBitVectorIndex()); + DEBUG({ + dbgs() << "SAA FYI: Killed: "; + Instr->dump(); + dbgs() << "by: "; + Point.dump(); + dbgs() << " (more info: Killed instr 
offset = " << InstrOffset + << ". SPOffset = " << SPOffset + << "; DeallocSize= " << DeallocSize << "\n"; + }); + } + } + return Next; +} + +void StackAllocationAnalysis::doConfluenceWithLP(BitVector &StateOut, + const BitVector &StateIn, + const MCInst &Invoke) { + BitVector NewIn = StateIn; + for (const auto &Operand : Invoke) { + if (Operand.isGnuArgsSize()) { + auto ArgsSize = Operand.getGnuArgsSize(); + NewIn = doKill(Invoke, NewIn, ArgsSize); + } + } + StateOut |= NewIn; +} + +BitVector StackAllocationAnalysis::computeNext(const MCInst &Point, + const BitVector &Cur) { + const auto &MIA = BC.MIA; + BitVector Next = Cur; + if (int Sz = MIA->getPopSize(Point)) { + Next = doKill(Point, Next, Sz); + return Next; + } + if (MIA->isPush(Point)) { + Next.set(this->ExprToIdx[&Point]); + return Next; + } + + MCPhysReg From, To; + int64_t SPOffset, FPOffset; + std::tie(SPOffset, FPOffset) = *SPT.getStateBefore(Point); + if (MIA->isRegToRegMove(Point, From, To) && To == MIA->getStackPointer() && + From == MIA->getFramePointer()) { + if (MIA->isLeave(Point)) + FPOffset += 8; + if (SPOffset < FPOffset) { + Next = doKill(Point, Next, FPOffset - SPOffset); + return Next; + } + if (SPOffset > FPOffset) { + Next.set(this->ExprToIdx[&Point]); + return Next; + } + } + if (BC.MII->get(Point.getOpcode()) + .hasDefOfPhysReg(Point, MIA->getStackPointer(), *BC.MRI)) { + std::pair SP; + if (SPOffset != SPT.EMPTY && SPOffset != SPT.SUPERPOSITION) + SP = std::make_pair(MIA->getStackPointer(), SPOffset); + else + SP = std::make_pair(0, 0); + std::pair FP; + if (FPOffset != SPT.EMPTY && FPOffset != SPT.SUPERPOSITION) + FP = std::make_pair(MIA->getFramePointer(), FPOffset); + else + FP = std::make_pair(0, 0); + int64_t Output; + if (!MIA->evaluateSimple(Point, Output, SP, FP)) + return Next; + + if (SPOffset < Output) { + Next = doKill(Point, Next, Output - SPOffset); + return Next; + } + if (SPOffset > Output) { + Next.set(this->ExprToIdx[&Point]); + return Next; + } + } + return Next; +} 
+ +} // end namespace bolt +} // end namespace llvm diff --git a/bolt/Passes/StackAllocationAnalysis.h b/bolt/Passes/StackAllocationAnalysis.h new file mode 100644 index 000000000000..64fba984fed2 --- /dev/null +++ b/bolt/Passes/StackAllocationAnalysis.h @@ -0,0 +1,68 @@ +//===--- Passes/StackAllocationAnalysis.h ---------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_STACKALLOCATIONANALYSIS_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_STACKALLOCATIONANALYSIS_H + +#include "DataflowAnalysis.h" +#include "StackPointerTracking.h" +#include "llvm/Support/Timer.h" + +namespace llvm { +namespace bolt { + +/// Perform a dataflow analysis to track the value of SP as an offset relative +/// to the CFA. 
+class StackAllocationAnalysis + : public InstrsDataflowAnalysis { + friend class DataflowAnalysis; + + StackPointerTracking &SPT; + +public: + StackAllocationAnalysis(const BinaryContext &BC, BinaryFunction &BF, + StackPointerTracking &SPT) + : InstrsDataflowAnalysis(BC, BF), + SPT(SPT) {} + virtual ~StackAllocationAnalysis() {} + + void run() { + NamedRegionTimer T1("SAA", "Dataflow", true); + InstrsDataflowAnalysis::run(); + } + +protected: + void preflight(); + + BitVector getStartingStateAtBB(const BinaryBasicBlock &BB); + + BitVector getStartingStateAtPoint(const MCInst &Point); + + void doConfluence(BitVector &StateOut, const BitVector &StateIn); + + BitVector doKill(const MCInst &Point, const BitVector &StateIn, + int DeallocSize); + + void doConfluenceWithLP(BitVector &StateOut, const BitVector &StateIn, + const MCInst &Invoke); + + BitVector computeNext(const MCInst &Point, const BitVector &Cur); + + StringRef getAnnotationName() const { + return StringRef("StackAllocationAnalysis"); + } +}; + +} // end namespace bolt +} // end namespace llvm + +#endif diff --git a/bolt/Passes/StackAvailableExpressions.cpp b/bolt/Passes/StackAvailableExpressions.cpp new file mode 100644 index 000000000000..d0a5f5b1c12a --- /dev/null +++ b/bolt/Passes/StackAvailableExpressions.cpp @@ -0,0 +1,132 @@ +//===--- Passes/StackAvailableExpressions.cpp -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "StackAvailableExpressions.h" +#include "FrameAnalysis.h" + +#define DEBUG_TYPE "sae" + +namespace llvm { +namespace bolt { + +StackAvailableExpressions::StackAvailableExpressions(const FrameAnalysis &FA, + const BinaryContext &BC, + BinaryFunction &BF) + : InstrsDataflowAnalysis(BC, BF), FA(FA) {} + +void StackAvailableExpressions::preflight() { + DEBUG(dbgs() << "Starting StackAvailableExpressions on \"" + << Func.getPrintName() << "\"\n"); + + // Populate our universe of tracked expressions. We are interested in + // tracking available stores to frame position at any given point of the + // program. + for (auto &BB : Func) { + for (auto &Inst : BB) { + auto FIE = FA.getFIEFor(BC, Inst); + if (!FIE) + continue; + if (FIE->IsStore == true && FIE->IsSimple == true) { + Expressions.push_back(&Inst); + ExprToIdx[&Inst] = NumInstrs++; + } + } + } +} + +BitVector +StackAvailableExpressions::getStartingStateAtBB(const BinaryBasicBlock &BB) { + // Entry points start with empty set + // All others start with the full set. 
+ if (BB.pred_size() == 0 && BB.throw_size() == 0) + return BitVector(NumInstrs, false); + return BitVector(NumInstrs, true); +} + +BitVector +StackAvailableExpressions::getStartingStateAtPoint(const MCInst &Point) { + return BitVector(NumInstrs, true); +} + +void StackAvailableExpressions::doConfluence(BitVector &StateOut, + const BitVector &StateIn) { + StateOut &= StateIn; +} + +namespace { + +bool isLoadRedundant(const FrameIndexEntry &LoadFIE, + const FrameIndexEntry &StoreFIE) { + if (LoadFIE.IsLoad == false || LoadFIE.IsSimple == false) { + return false; + } + if (LoadFIE.StackOffset == StoreFIE.StackOffset && + LoadFIE.Size == StoreFIE.Size) { + return true; + } + + return false; +} +} + +bool StackAvailableExpressions::doesXKillsY(const MCInst *X, const MCInst *Y) { + // if both are stores, and both store to the same stack location, return + // true + auto FIEX = FA.getFIEFor(BC, *X); + auto FIEY = FA.getFIEFor(BC, *Y); + if (FIEX && FIEY) { + if (isLoadRedundant(*FIEX, *FIEY)) + return false; + if (FIEX->IsStore == true && FIEY->IsStore == true && + FIEX->StackOffset + FIEX->Size > FIEY->StackOffset && + FIEX->StackOffset < FIEY->StackOffset + FIEY->Size) + return true; + } + // getClobberedRegs for X and Y. If they intersect, return true + BitVector XClobbers = BitVector(BC.MRI->getNumRegs(), false); + BitVector YClobbers = BitVector(BC.MRI->getNumRegs(), false); + FA.getInstClobberList(BC, *X, XClobbers); + // If Y is a store to stack, its clobber list is its source reg. This is + // different than the rest because we want to check if the store source + // reaches its corresponding load untouched. 
+ if (FIEY && FIEY->IsStore == true && FIEY->IsStoreFromReg) { + YClobbers.set(FIEY->RegOrImm); + } else { + FA.getInstClobberList(BC, *Y, YClobbers); + } + XClobbers &= YClobbers; + return XClobbers.any(); +} + +BitVector StackAvailableExpressions::computeNext(const MCInst &Point, + const BitVector &Cur) { + BitVector Next = Cur; + // Kill + for (auto I = expr_begin(Next), E = expr_end(); I != E; ++I) { + assert(*I != nullptr && "Lost pointers"); + DEBUG(dbgs() << "\t\t\tDoes it kill "); + DEBUG((*I)->dump()); + if (doesXKillsY(&Point, *I)) { + DEBUG(dbgs() << "\t\t\t\tKilling "); + DEBUG((*I)->dump()); + Next.reset(I.getBitVectorIndex()); + } + } + // Gen + if (auto FIE = FA.getFIEFor(BC, Point)) { + if (FIE->IsStore == true && FIE->IsSimple == true) + Next.set(ExprToIdx[&Point]); + } + return Next; +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/Passes/StackAvailableExpressions.h b/bolt/Passes/StackAvailableExpressions.h new file mode 100644 index 000000000000..6ec3234ff6ad --- /dev/null +++ b/bolt/Passes/StackAvailableExpressions.h @@ -0,0 +1,58 @@ +//===--- Passes/StackAvailableExpressions.h -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_STACKAVAILABLEEXPRESSIONS_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_STACKAVAILABLEEXPRESSIONS_H + +#include "DataflowAnalysis.h" +#include "llvm/Support/Timer.h" + +namespace llvm { +namespace bolt { + +class FrameAnalysis; + +class StackAvailableExpressions + : public InstrsDataflowAnalysis { + friend class DataflowAnalysis; + +public: + StackAvailableExpressions(const FrameAnalysis &FA, + const BinaryContext &BC, BinaryFunction &BF); + virtual ~StackAvailableExpressions() {} + + void run() { + NamedRegionTimer T1("SAE", "Dataflow", true); + InstrsDataflowAnalysis::run(); + } + +protected: + /// Reference to the result of stack frame analysis + const FrameAnalysis &FA; + + void preflight(); + BitVector getStartingStateAtBB(const BinaryBasicBlock &BB); + BitVector getStartingStateAtPoint(const MCInst &Point); + void doConfluence(BitVector &StateOut, const BitVector &StateIn); + /// Define the function computing the kill set -- whether expression Y, a + /// tracked expression, will be considered to be dead after executing X. 
+ bool doesXKillsY(const MCInst *X, const MCInst *Y); + BitVector computeNext(const MCInst &Point, const BitVector &Cur); + + StringRef getAnnotationName() const { + return StringRef("StackAvailableExpressions"); + } +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/Passes/StackPointerTracking.h b/bolt/Passes/StackPointerTracking.h index 7f02e766dfc9..99e4818c2395 100644 --- a/bolt/Passes/StackPointerTracking.h +++ b/bolt/Passes/StackPointerTracking.h @@ -13,6 +13,7 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_STACKPOINTERTRACKING_H #include "DataflowAnalysis.h" +#include "llvm/Support/Timer.h" namespace llvm { namespace bolt { @@ -190,6 +191,11 @@ class StackPointerTracking public: StackPointerTracking(const BinaryContext &BC, BinaryFunction &BF); virtual ~StackPointerTracking() {} + + void run() { + NamedRegionTimer T1("SPT", "Dataflow", true); + StackPointerTrackingBase::run(); + } }; } // end namespace bolt diff --git a/bolt/Passes/StackReachingUses.cpp b/bolt/Passes/StackReachingUses.cpp new file mode 100644 index 000000000000..68e76b1438ff --- /dev/null +++ b/bolt/Passes/StackReachingUses.cpp @@ -0,0 +1,112 @@ +//===--- Passes/StackReachingUses.cpp -------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +#include "StackReachingUses.h" +#include "FrameAnalysis.h" + +#define DEBUG_TYPE "sru" + +namespace llvm { +namespace bolt { + +bool StackReachingUses::isStoreUsed(const FrameIndexEntry &StoreFIE, + ExprIterator Candidates, + bool IncludeLocalAccesses) const { + for (auto I = Candidates; I != expr_end(); ++I) { + const MCInst *ReachingInst = *I; + if (IncludeLocalAccesses) { + if (auto FIEY = FA.getFIEFor(BC, *ReachingInst)) { + assert(FIEY->IsLoad == 1); + if (StoreFIE.StackOffset + StoreFIE.Size > FIEY->StackOffset && + StoreFIE.StackOffset < FIEY->StackOffset + FIEY->Size) { + return true; + } + } + } + auto Args = FA.getArgAccessesFor(BC, *ReachingInst); + if (!Args) + continue; + if (Args->AssumeEverything) { + return true; + } + for (auto FIEY : Args->Set) { + if (StoreFIE.StackOffset + StoreFIE.Size > FIEY.StackOffset && + StoreFIE.StackOffset < FIEY.StackOffset + FIEY.Size) { + return true; + } + } + } + return false; +} + +void StackReachingUses::preflight() { + DEBUG(dbgs() << "Starting StackReachingUses on \"" << Func.getPrintName() + << "\"\n"); + + // Populate our universe of tracked expressions. We are interested in + // tracking reaching loads from frame position at any given point of the + // program. 
+ for (auto &BB : Func) { + for (auto &Inst : BB) { + if (auto FIE = FA.getFIEFor(BC, Inst)) { + if (FIE->IsLoad == true) { + Expressions.push_back(&Inst); + ExprToIdx[&Inst] = NumInstrs++; + continue; + } + } + auto AA = FA.getArgAccessesFor(BC, Inst); + if (AA && (!AA->Set.empty() || AA->AssumeEverything)) { + Expressions.push_back(&Inst); + ExprToIdx[&Inst] = NumInstrs++; + } + } + } +} + +bool StackReachingUses::doesXKillsY(const MCInst *X, const MCInst *Y) { + // if X is a store to the same stack location and the bytes fetched is a + // superset of those bytes affected by the load in Y, return true + auto FIEX = FA.getFIEFor(BC, *X); + auto FIEY = FA.getFIEFor(BC, *Y); + if (FIEX && FIEY) { + if (FIEX->IsStore == true && FIEY->IsLoad == true && + FIEX->StackOffset <= FIEY->StackOffset && + FIEX->StackOffset + FIEX->Size >= FIEY->StackOffset + FIEY->Size) + return true; + } + return false; +} + +BitVector StackReachingUses::computeNext(const MCInst &Point, + const BitVector &Cur) { + BitVector Next = Cur; + // Kill + for (auto I = expr_begin(Next), E = expr_end(); I != E; ++I) { + assert(*I != nullptr && "Lost pointers"); + if (doesXKillsY(&Point, *I)) { + DEBUG(dbgs() << "\t\t\tKilling "); + DEBUG((*I)->dump()); + Next.reset(I.getBitVectorIndex()); + } + }; + // Gen + if (auto FIE = FA.getFIEFor(BC, Point)) { + if (FIE->IsLoad == true) + Next.set(ExprToIdx[&Point]); + } + auto AA = FA.getArgAccessesFor(BC, Point); + if (AA && (!AA->Set.empty() || AA->AssumeEverything)) + Next.set(ExprToIdx[&Point]); + return Next; +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/Passes/StackReachingUses.h b/bolt/Passes/StackReachingUses.h new file mode 100644 index 000000000000..7ea7094ef6bd --- /dev/null +++ b/bolt/Passes/StackReachingUses.h @@ -0,0 +1,71 @@ +//===--- Passes/StackReachingUses.h ---------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// 
License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_STACKREACHINGUSES_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_STACKREACHINGUSES_H + +#include "DataflowAnalysis.h" +#include "llvm/Support/Timer.h" + +namespace llvm { +namespace bolt { + +class FrameAnalysis; +struct FrameIndexEntry; + +class StackReachingUses + : public InstrsDataflowAnalysis { + friend class DataflowAnalysis; + +public: + StackReachingUses(const FrameAnalysis &FA, const BinaryContext &BC, + BinaryFunction &BF) + : InstrsDataflowAnalysis(BC, BF), FA(FA) {} + virtual ~StackReachingUses() {} + + bool isStoreUsed(const FrameIndexEntry &StoreFIE, ExprIterator Candidates, + bool IncludeLocalAccesses = true) const; + + void run() { + NamedRegionTimer T1("SRU", "Dataflow", true); + InstrsDataflowAnalysis::run(); + } + +protected: + // Reference to the result of stack frame analysis + const FrameAnalysis &FA; + + void preflight(); + + BitVector getStartingStateAtBB(const BinaryBasicBlock &BB) { + return BitVector(NumInstrs, false); + } + + BitVector getStartingStateAtPoint(const MCInst &Point) { + return BitVector(NumInstrs, false); + } + + void doConfluence(BitVector &StateOut, const BitVector &StateIn) { + StateOut |= StateIn; + } + + // Define the function computing the kill set -- whether expression Y, a + // tracked expression, will be considered to be dead after executing X. 
+ bool doesXKillsY(const MCInst *X, const MCInst *Y); + BitVector computeNext(const MCInst &Point, const BitVector &Cur); + + StringRef getAnnotationName() const { return StringRef("StackReachingUses"); } +}; + +} // end namespace bolt +} // end namespace llvm + +#endif diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 7ef342a5124d..1ee3aa7a75a0 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1659,6 +1659,7 @@ void RewriteInstance::readDebugInfo() { void RewriteInstance::disassembleFunctions() { // Disassemble every function and build it's control flow graph. TotalScore = 0; + BC->SumExecutionCount = 0; for (auto &BFI : BinaryFunctions) { BinaryFunction &Function = BFI.second; @@ -1803,6 +1804,7 @@ void RewriteInstance::disassembleFunctions() { } TotalScore += Function.getFunctionScore(); + BC->SumExecutionCount += Function.getKnownExecutionCount(); } // Iterate over all functions @@ -1821,6 +1823,7 @@ void RewriteInstance::disassembleFunctions() { else ++NumStaleProfileFunctions; } + BC->NumProfiledFuncs = ProfiledFunctions.size(); const auto NumAllProfiledFunctions = ProfiledFunctions.size() + NumStaleProfileFunctions; From 2356b10537ff4f6a3cdd0f1fdffe57a1b1e18b92 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 2 Jun 2017 16:57:22 -0700 Subject: [PATCH 269/904] Split FrameAnalysis and improve LivenessAnalysis Summary: Split FrameAnalysis into FrameAnalysis and RegAnalysis, since some optimizations only require register information about functions, not frame information. Refactor callgraph walking code into the CallGraphWalker class, allowing any analysis that depend on the call graph to easily traverse it via a visitor pattern. Also fix LivenessAnalysis, which was broken because it was not considering registers read into callees and incorporating this into caller. 
(cherry picked from commit 69087a345cb0391005d88c4fb42553b18a410668) --- bolt/Passes/BinaryFunctionCallGraph.cpp | 3 + bolt/Passes/CMakeLists.txt | 2 + bolt/Passes/CallGraphWalker.cpp | 46 +++++ bolt/Passes/CallGraphWalker.h | 67 +++++++ bolt/Passes/DataflowInfoManager.cpp | 12 +- bolt/Passes/DataflowInfoManager.h | 15 +- bolt/Passes/FrameAnalysis.cpp | 213 ++++------------------ bolt/Passes/FrameAnalysis.h | 93 +++------- bolt/Passes/FrameOptimizer.cpp | 23 ++- bolt/Passes/FrameOptimizer.h | 4 +- bolt/Passes/IndirectCallPromotion.cpp | 14 +- bolt/Passes/LivenessAnalysis.h | 51 +++++- bolt/Passes/ReachingDefOrUse.h | 15 +- bolt/Passes/RegAnalysis.cpp | 207 +++++++++++++++++++++ bolt/Passes/RegAnalysis.h | 82 +++++++++ bolt/Passes/ShrinkWrapping.cpp | 14 +- bolt/Passes/StackAvailableExpressions.cpp | 17 +- bolt/Passes/StackAvailableExpressions.h | 5 +- bolt/Passes/StackReachingUses.cpp | 16 +- 19 files changed, 583 insertions(+), 316 deletions(-) create mode 100644 bolt/Passes/CallGraphWalker.cpp create mode 100644 bolt/Passes/CallGraphWalker.h create mode 100644 bolt/Passes/RegAnalysis.cpp create mode 100644 bolt/Passes/RegAnalysis.h diff --git a/bolt/Passes/BinaryFunctionCallGraph.cpp b/bolt/Passes/BinaryFunctionCallGraph.cpp index 16ea4bc376dc..5d29cb64d9fd 100644 --- a/bolt/Passes/BinaryFunctionCallGraph.cpp +++ b/bolt/Passes/BinaryFunctionCallGraph.cpp @@ -12,6 +12,7 @@ #include "BinaryFunctionCallGraph.h" #include "BinaryFunction.h" #include "BinaryContext.h" +#include "llvm/Support/Timer.h" #define DEBUG_TYPE "callgraph" @@ -30,6 +31,7 @@ CallGraph::NodeId BinaryFunctionCallGraph::addNode(BinaryFunction *BF, } std::deque BinaryFunctionCallGraph::buildTraversalOrder() { + NamedRegionTimer T1("Build cg traversal order", "CG breakdown", true); std::deque TopologicalOrder; enum NodeStatus { NEW, VISITING, VISITED }; std::vector NodeStatus(Funcs.size()); @@ -73,6 +75,7 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, bool IncludeColdCalls, bool 
UseFunctionHotSize, bool UseEdgeCounts) { + NamedRegionTimer T1("Callgraph construction", "CG breakdown", true); BinaryFunctionCallGraph Cg; // Add call graph nodes. diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt index 7d9714893c45..b3114c2a05e6 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/Passes/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_library(LLVMBOLTPasses BinaryPasses.cpp BinaryFunctionCallGraph.cpp CallGraph.cpp + CallGraphWalker.cpp DataflowAnalysis.cpp DataflowInfoManager.cpp FrameAnalysis.cpp @@ -13,6 +14,7 @@ add_llvm_library(LLVMBOLTPasses Inliner.cpp LivenessAnalysis.cpp PettisAndHansen.cpp + RegAnalysis.cpp ReorderAlgorithm.cpp ReorderFunctions.cpp ShrinkWrapping.cpp diff --git a/bolt/Passes/CallGraphWalker.cpp b/bolt/Passes/CallGraphWalker.cpp new file mode 100644 index 000000000000..00f9d75a8dcd --- /dev/null +++ b/bolt/Passes/CallGraphWalker.cpp @@ -0,0 +1,46 @@ +#include "CallGraphWalker.h" +#include "llvm/Support/Timer.h" + +namespace llvm { +namespace bolt { + +void CallGraphWalker::traverseCG() { + NamedRegionTimer T1("CG Traversal", "CG breakdown", true); + std::queue Queue; + std::set InQueue; + + for (auto *Func : TopologicalCGOrder) { + Queue.push(Func); + InQueue.insert(Func); + } + + while (!Queue.empty()) { + auto *Func = Queue.front(); + Queue.pop(); + InQueue.erase(Func); + + bool Changed{false}; + for (auto Visitor : Visitors) { + bool CurVisit = Visitor(Func); + Changed = Changed || CurVisit; + } + + if (Changed) { + for (auto CallerID : CG.predecessors(CG.getNodeId(Func))) { + BinaryFunction *CallerFunc = CG.nodeIdToFunc(CallerID); + if (InQueue.count(CallerFunc)) + continue; + Queue.push(CallerFunc); + InQueue.insert(CallerFunc); + } + } + } +} + +void CallGraphWalker::walk() { + TopologicalCGOrder = CG.buildTraversalOrder(); + traverseCG(); +} + +} +} diff --git a/bolt/Passes/CallGraphWalker.h b/bolt/Passes/CallGraphWalker.h new file mode 100644 index 000000000000..195e536fd07d --- /dev/null +++ 
b/bolt/Passes/CallGraphWalker.h @@ -0,0 +1,67 @@ +//===--- Passes/CallGraphWalker.h -----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_CALLGRAPHWALKER_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_CALLGRAPHWALKER_H + +#include "BinaryContext.h" +#include "BinaryFunction.h" +#include "BinaryFunctionCallGraph.h" +#include +#include +#include +#include + +namespace llvm { +namespace bolt { + +/// Perform a bottom-up walk of the call graph with the intent of computing +/// a property that depends on callees. In the event of a CG cycles, this will +/// re-visit functions until their observed property converges. +class CallGraphWalker { + BinaryContext &BC; + std::map &BFs; + BinaryFunctionCallGraph &CG; + + /// DFS or reverse post-ordering of the call graph nodes to allow us to + /// traverse the call graph bottom-up + std::deque TopologicalCGOrder; + + /// Stores all visitor functions to call when traversing the call graph + typedef std::function CallbackTy; + std::vector Visitors; + + /// Do the bottom-up traversal + void traverseCG(); + +public: + /// Initialize core context references but don't do anything yet + CallGraphWalker(BinaryContext &BC, std::map &BFs, + BinaryFunctionCallGraph &CG) + : BC(BC), BFs(BFs), CG(CG) {} + + /// Register a new callback function to be called for each function when + /// traversing the call graph bottom-up. Function should return true iff + /// whatever information it is keeping track of has changed. Function must + /// converge with time, ie, it must eventually return false, otherwise the + /// call graph walk will never finish. 
+ void registerVisitor(CallbackTy Callback) { + Visitors.emplace_back(Callback); + } + + /// Build the call graph, establish a traversal order and traverse it. + void walk(); +}; + +} +} + +#endif diff --git a/bolt/Passes/DataflowInfoManager.cpp b/bolt/Passes/DataflowInfoManager.cpp index e280c1554b3d..c9a1e416db1a 100644 --- a/bolt/Passes/DataflowInfoManager.cpp +++ b/bolt/Passes/DataflowInfoManager.cpp @@ -18,8 +18,8 @@ namespace bolt { ReachingDefOrUse &DataflowInfoManager::getReachingDefs() { if (RD) return *RD; - assert(FA && "FrameAnalysis required"); - RD.reset(new ReachingDefOrUse(*FA, BC, BF)); + assert(RA && "RegAnalysis required"); + RD.reset(new ReachingDefOrUse(*RA, BC, BF)); RD->run(); return *RD; } @@ -31,8 +31,8 @@ void DataflowInfoManager::invalidateReachingDefs() { ReachingDefOrUse &DataflowInfoManager::getReachingUses() { if (RU) return *RU; - assert(FA && "FrameAnalysis required"); - RU.reset(new ReachingDefOrUse(*FA, BC, BF)); + assert(RA && "RegAnalysis required"); + RU.reset(new ReachingDefOrUse(*RA, BC, BF)); RU->run(); return *RU; } @@ -44,8 +44,8 @@ void DataflowInfoManager::invalidateReachingUses() { LivenessAnalysis &DataflowInfoManager::getLivenessAnalysis() { if (LA) return *LA; - assert(FA && "FrameAnalysis required"); - LA.reset(new LivenessAnalysis(*FA, BC, BF)); + assert(RA && "RegAnalysis required"); + LA.reset(new LivenessAnalysis(*RA, BC, BF)); LA->run(); return *LA; } diff --git a/bolt/Passes/DataflowInfoManager.h b/bolt/Passes/DataflowInfoManager.h index 34a6b64bef15..c527650d1d74 100644 --- a/bolt/Passes/DataflowInfoManager.h +++ b/bolt/Passes/DataflowInfoManager.h @@ -12,14 +12,15 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_DATAFLOWINFOMANAGER_H #define LLVM_TOOLS_LLVM_BOLT_PASSES_DATAFLOWINFOMANAGER_H +#include "DominatorAnalysis.h" #include "FrameAnalysis.h" +#include "LivenessAnalysis.h" #include "ReachingDefOrUse.h" -#include "StackReachingUses.h" -#include "DominatorAnalysis.h" -#include "StackPointerTracking.h" #include 
"ReachingInsns.h" -#include "LivenessAnalysis.h" +#include "RegAnalysis.h" #include "StackAllocationAnalysis.h" +#include "StackPointerTracking.h" +#include "StackReachingUses.h" namespace llvm { namespace bolt { @@ -29,6 +30,7 @@ namespace bolt { /// recompute it. Also provide an interface for data invalidation when the /// analysis is outdated after a transform pass modified the function. class DataflowInfoManager { + const RegAnalysis *RA; const FrameAnalysis *FA; const BinaryContext &BC; BinaryFunction &BF; @@ -46,8 +48,9 @@ class DataflowInfoManager { InsnToBB; public: - DataflowInfoManager(const FrameAnalysis *FA, const BinaryContext &BC, - BinaryFunction &BF) : FA(FA), BC(BC), BF(BF) {}; + DataflowInfoManager(const BinaryContext &BC, BinaryFunction &BF, + const RegAnalysis *RA, const FrameAnalysis *FA) + : RA(RA), FA(FA), BC(BC), BF(BF){}; /// Helper function to fetch the parent BB associated with a program point /// If PP is a BB itself, then return itself (cast to a BinaryBasicBlock) diff --git a/bolt/Passes/FrameAnalysis.cpp b/bolt/Passes/FrameAnalysis.cpp index 38d770ad679a..3cd2ce883b59 100644 --- a/bolt/Passes/FrameAnalysis.cpp +++ b/bolt/Passes/FrameAnalysis.cpp @@ -9,6 +9,7 @@ // //===----------------------------------------------------------------------===// #include "FrameAnalysis.h" +#include "CallGraphWalker.h" #include #define DEBUG_TYPE "fa" @@ -213,9 +214,8 @@ class FrameAccessAnalysis { } // end anonymous namespace -void FrameAnalysis::addArgAccessesFor(const BinaryContext &BC, MCInst &Inst, - ArgAccesses &&AA) { - if (auto OldAA = getArgAccessesFor(BC, Inst)) { +void FrameAnalysis::addArgAccessesFor(MCInst &Inst, ArgAccesses &&AA) { + if (auto OldAA = getArgAccessesFor(Inst)) { if (OldAA->AssumeEverything) return; *OldAA = std::move(AA); @@ -231,13 +231,12 @@ void FrameAnalysis::addArgAccessesFor(const BinaryContext &BC, MCInst &Inst, ArgAccessesVector.emplace_back(std::move(AA)); } -void FrameAnalysis::addArgInStackAccessFor(const 
BinaryContext &BC, - MCInst &Inst, +void FrameAnalysis::addArgInStackAccessFor(MCInst &Inst, const ArgInStackAccess &Arg) { - auto AA = getArgAccessesFor(BC, Inst); + auto AA = getArgAccessesFor(Inst); if (!AA) { - addArgAccessesFor(BC, Inst, ArgAccesses(false)); - AA = getArgAccessesFor(BC, Inst); + addArgAccessesFor(Inst, ArgAccesses(false)); + AA = getArgAccessesFor(Inst); assert(AA && "Object setup failed"); } auto &Set = AA->Set; @@ -245,15 +244,13 @@ void FrameAnalysis::addArgInStackAccessFor(const BinaryContext &BC, Set.emplace(Arg); } -void FrameAnalysis::addFIEFor(const BinaryContext &BC, MCInst &Inst, - const FrameIndexEntry &FIE) { +void FrameAnalysis::addFIEFor(MCInst &Inst, const FrameIndexEntry &FIE) { BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "FrameAccessEntry", (unsigned)FIEVector.size()); FIEVector.emplace_back(FIE); } -ErrorOr -FrameAnalysis::getArgAccessesFor(const BinaryContext &BC, const MCInst &Inst) { +ErrorOr FrameAnalysis::getArgAccessesFor(const MCInst &Inst) { if (auto Idx = BC.MIA->tryGetAnnotationAs(Inst, "ArgAccessEntry")) { assert(ArgAccessesVector.size() > *Idx && "Out of bounds"); return ArgAccessesVector[*Idx]; @@ -262,8 +259,7 @@ FrameAnalysis::getArgAccessesFor(const BinaryContext &BC, const MCInst &Inst) { } ErrorOr -FrameAnalysis::getArgAccessesFor(const BinaryContext &BC, - const MCInst &Inst) const { +FrameAnalysis::getArgAccessesFor(const MCInst &Inst) const { if (auto Idx = BC.MIA->tryGetAnnotationAs(Inst, "ArgAccessEntry")) { assert(ArgAccessesVector.size() > *Idx && "Out of bounds"); return ArgAccessesVector[*Idx]; @@ -272,7 +268,7 @@ FrameAnalysis::getArgAccessesFor(const BinaryContext &BC, } ErrorOr -FrameAnalysis::getFIEFor(const BinaryContext &BC, const MCInst &Inst) const { +FrameAnalysis::getFIEFor(const MCInst &Inst) const { if (auto Idx = BC.MIA->tryGetAnnotationAs(Inst, "FrameAccessEntry")) { assert(FIEVector.size() > *Idx && "Out of bounds"); @@ -281,130 +277,17 @@ FrameAnalysis::getFIEFor(const BinaryContext 
&BC, const MCInst &Inst) const { return make_error_code(errc::result_out_of_range); } -void FrameAnalysis::getInstClobberList(const BinaryContext &BC, - const MCInst &Inst, - BitVector &KillSet) const { - if (!BC.MIA->isCall(Inst)) { - BC.MIA->getClobberedRegs(Inst, KillSet, *BC.MRI); - return; - } - - const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst); - // If indirect call, kill set should have all elements - if (TargetSymbol == nullptr) { - KillSet.set(0, KillSet.size()); - return; - } - - const auto *Function = BC.getFunctionForSymbol(TargetSymbol); - if (Function == nullptr) { - // Call to a function without a BinaryFunction object. - // This should be a call to a PLT entry, and since it is a trampoline to - // a DSO, we can't really know the code in advance. Conservatively assume - // everything is clobbered. - KillSet.set(0, KillSet.size()); - return; - } - auto BV = RegsKilledMap.find(Function); - if (BV != RegsKilledMap.end()) { - KillSet |= BV->second; - return; - } - // Ignore calls to function whose clobber list wasn't yet calculated. This - // instruction will be evaluated again once we have info for the callee. 
- return; -} - -BitVector FrameAnalysis::getFunctionClobberList(const BinaryContext &BC, - const BinaryFunction *Func) { - BitVector RegsKilled = BitVector(BC.MRI->getNumRegs(), false); - - if (!Func->isSimple() || !Func->hasCFG()) { - RegsKilled.set(0, RegsKilled.size()); - return RegsKilled; - } - - for (const auto &BB : *Func) { - for (const auto &Inst : BB) { - getInstClobberList(BC, Inst, RegsKilled); - } - } - - return RegsKilled; -} +void FrameAnalysis::traverseCG(BinaryFunctionCallGraph &CG) { + CallGraphWalker CGWalker(BC, BFs, CG); -void FrameAnalysis::buildClobberMap(const BinaryContext &BC) { - std::queue Queue; - std::set InQueue; + CGWalker.registerVisitor([&](BinaryFunction *Func) -> bool { + return computeArgsAccessed(*Func); + }); - for (auto *Func : TopologicalCGOrder) { - Queue.push(Func); - InQueue.insert(Func); - } - - while (!Queue.empty()) { - auto *Func = Queue.front(); - Queue.pop(); - InQueue.erase(Func); - - BitVector RegsKilled = getFunctionClobberList(BC, Func); - bool ArgsUpdated = ClobberAnalysisOnly ? 
false : computeArgsAccessed(BC, *Func); - bool RegsUpdated = false; - - if (RegsKilledMap.find(Func) == RegsKilledMap.end()) { - RegsKilledMap[Func] = std::move(RegsKilled); - } else { - RegsUpdated = RegsKilledMap[Func] != RegsKilled; - if (RegsUpdated) - RegsKilledMap[Func] = std::move(RegsKilled); - } - - if (RegsUpdated || ArgsUpdated) { - for (auto Caller : Cg.predecessors(Cg.getNodeId(Func))) { - BinaryFunction *CallerFunc = Cg.nodeIdToFunc(Caller); - if (!InQueue.count(CallerFunc)) { - InQueue.insert(CallerFunc); - Queue.push(CallerFunc); - } - } - } - } - - if (opts::Verbosity == 0) { -#ifndef NDEBUG - if (!DebugFlag || !isCurrentDebugType("fa")) - return; -#else - return; -#endif - } - - // This loop is for computing statistics only - for (auto *Func : TopologicalCGOrder) { - auto Iter = RegsKilledMap.find(Func); - assert(Iter != RegsKilledMap.end() && - "Failed to compute all clobbers list"); - if (Iter->second.all()) { - auto Count = Func->getExecutionCount(); - if (Count != BinaryFunction::COUNT_NO_PROFILE) - CountFunctionsAllClobber += Count; - ++NumFunctionsAllClobber; - } - DEBUG_WITH_TYPE("fa", - dbgs() << "Killed regs set for func: " << Func->getPrintName() << "\n"; - const BitVector &RegsKilled = Iter->second; - int RegIdx = RegsKilled.find_first(); - while (RegIdx != -1) { - dbgs() << "\tREG" << RegIdx; - RegIdx = RegsKilled.find_next(RegIdx); - }; - dbgs() << "\n"; - ); - } + CGWalker.walk(); } -bool FrameAnalysis::updateArgsTouchedFor(const BinaryContext &BC, - const BinaryFunction &BF, MCInst &Inst, +bool FrameAnalysis::updateArgsTouchedFor(const BinaryFunction &BF, MCInst &Inst, int CurOffset) { if (!BC.MIA->isCall(Inst)) return false; @@ -413,7 +296,7 @@ bool FrameAnalysis::updateArgsTouchedFor(const BinaryContext &BC, const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst); // If indirect call, we conservatively assume it accesses all stack positions if (TargetSymbol == nullptr) { - addArgAccessesFor(BC, Inst, 
ArgAccesses(/*AssumeEverything=*/true)); + addArgAccessesFor(Inst, ArgAccesses(/*AssumeEverything=*/true)); bool Updated{false}; if (!FunctionsRequireAlignment.count(&BF)) { Updated = true; @@ -426,7 +309,7 @@ bool FrameAnalysis::updateArgsTouchedFor(const BinaryContext &BC, // Call to a function without a BinaryFunction object. Conservatively assume // it accesses all stack positions if (Function == nullptr) { - addArgAccessesFor(BC, Inst, ArgAccesses(/*AssumeEverything=*/true)); + addArgAccessesFor(Inst, ArgAccesses(/*AssumeEverything=*/true)); bool Updated{false}; if (!FunctionsRequireAlignment.count(&BF)) { Updated = true; @@ -459,27 +342,25 @@ bool FrameAnalysis::updateArgsTouchedFor(const BinaryContext &BC, if (CurOffset == StackPointerTracking::EMPTY || CurOffset == StackPointerTracking::SUPERPOSITION) { - addArgAccessesFor(BC, Inst, ArgAccesses(/*AssumeEverything=*/true)); + addArgAccessesFor(Inst, ArgAccesses(/*AssumeEverything=*/true)); return Changed; } for (auto Elem : Iter->second) { if (Elem.first == -1) { - addArgAccessesFor(BC, Inst, ArgAccesses(/*AssumeEverything=*/true)); + addArgAccessesFor(Inst, ArgAccesses(/*AssumeEverything=*/true)); break; } DEBUG(dbgs() << "Added arg in stack access annotation " << CurOffset + Elem.first << "\n"); addArgInStackAccessFor( - BC, Inst, - ArgInStackAccess{/*StackOffset=*/CurOffset + Elem.first, - /*Size=*/Elem.second}); + Inst, ArgInStackAccess{/*StackOffset=*/CurOffset + Elem.first, + /*Size=*/Elem.second}); } return Changed; } -bool FrameAnalysis::computeArgsAccessed(const BinaryContext &BC, - BinaryFunction &BF) { +bool FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) { if (!BF.isSimple() || !BF.hasCFG()) { DEBUG(dbgs() << "Treating " << BF.getPrintName() << " conservatively.\n"); bool Updated = false; @@ -505,7 +386,7 @@ bool FrameAnalysis::computeArgsAccessed(const BinaryContext &BC, // Check for calls -- attach stack accessing info to them regarding their // target - if (updateArgsTouchedFor(BC, BF, 
Inst, FAA.getSPOffset())) + if (updateArgsTouchedFor(BF, Inst, FAA.getSPOffset())) UpdatedArgsTouched = true; // Check for stack accesses that affect callers @@ -548,8 +429,7 @@ bool FrameAnalysis::computeArgsAccessed(const BinaryContext &BC, return UpdatedArgsTouched || UpdatedAlignedStatus; } -bool FrameAnalysis::restoreFrameIndex(const BinaryContext &BC, - BinaryFunction &BF) { +bool FrameAnalysis::restoreFrameIndex(BinaryFunction &BF) { FrameAccessAnalysis FAA(BC, BF); DEBUG(dbgs() << "Restoring frame indices for \"" << BF.getPrintName() @@ -572,7 +452,7 @@ bool FrameAnalysis::restoreFrameIndex(const BinaryContext &BC, const FrameIndexEntry &FIE = FAA.getFIE(); - addFIEFor(BC, Inst, FIE); + addFIEFor(Inst, FIE); DEBUG({ dbgs() << "Frame index annotation " << FIE << " added to:\n"; BC.printInstruction(dbgs(), Inst, 0, &BF, true); @@ -582,8 +462,7 @@ bool FrameAnalysis::restoreFrameIndex(const BinaryContext &BC, return true; } -void FrameAnalysis::cleanAnnotations(const BinaryContext &BC, - std::map &BFs) { +void FrameAnalysis::cleanAnnotations() { for (auto &I : BFs) { for (auto &BB : I.second) { for (auto &Inst : BB) { @@ -594,24 +473,15 @@ void FrameAnalysis::cleanAnnotations(const BinaryContext &BC, } } -void FrameAnalysis::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &) { - { - NamedRegionTimer T1("Callgraph construction", "FOP breakdown", true); - Cg = buildCallGraph(BC, BFs); - } - { - NamedRegionTimer T1("build cg traversal order", "FOP breakdown", true); - TopologicalCGOrder = Cg.buildTraversalOrder(); - } - { - NamedRegionTimer T1("build clobber map", "FOP breakdown", true); - buildClobberMap(BC); - } +FrameAnalysis::FrameAnalysis(BinaryContext &BC, + std::map &BFs, + BinaryFunctionCallGraph &CG) + : BC(BC), BFs(BFs) { + // Position 0 of the vector should be always associated with "assume access + // everything". 
+ ArgAccessesVector.emplace_back(ArgAccesses(/*AssumeEverything*/ true)); - if (ClobberAnalysisOnly) - return; + traverseCG(CG); for (auto &I : BFs) { auto Count = I.second.getExecutionCount(); @@ -630,7 +500,7 @@ void FrameAnalysis::runOnFunctions(BinaryContext &BC, { NamedRegionTimer T1("restore frame index", "FOP breakdown", true); - if (!restoreFrameIndex(BC, I.second)) { + if (!restoreFrameIndex(I.second)) { ++NumFunctionsFailedRestoreFI; auto Count = I.second.getExecutionCount(); if (Count != BinaryFunction::COUNT_NO_PROFILE) @@ -643,12 +513,7 @@ void FrameAnalysis::runOnFunctions(BinaryContext &BC, } void FrameAnalysis::printStats() { - outs() << "BOLT-INFO FRAME ANALYSIS: Number of functions conservatively " - "treated as clobbering all registers: " - << NumFunctionsAllClobber - << format(" (%.1lf%% dyn cov)\n", - (100.0 * CountFunctionsAllClobber / CountDenominator)) - << "BOLT-INFO FRAME ANALYSIS: " << NumFunctionsNotOptimized + outs() << "BOLT-INFO FRAME ANALYSIS: " << NumFunctionsNotOptimized << " function(s) " << format("(%.1lf%% dyn cov)", (100.0 * CountFunctionsNotOptimized / CountDenominator)) diff --git a/bolt/Passes/FrameAnalysis.h b/bolt/Passes/FrameAnalysis.h index b182d84bcb78..69c188c2e2e3 100644 --- a/bolt/Passes/FrameAnalysis.h +++ b/bolt/Passes/FrameAnalysis.h @@ -12,8 +12,8 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEANALYSIS_H #define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEANALYSIS_H -#include "BinaryPasses.h" #include "BinaryFunctionCallGraph.h" +#include "BinaryPasses.h" #include "StackPointerTracking.h" namespace llvm { @@ -111,17 +111,9 @@ raw_ostream &operator<<(raw_ostream &OS, /// ... 
callee may access any position of our current stack frame /// } /// -class FrameAnalysis : public BinaryFunctionPass { - /// Call graph info - BinaryFunctionCallGraph Cg; - - /// DFS or reverse post-ordering of the call graph nodes to allow us to - /// traverse the call graph bottom-up - std::deque TopologicalCGOrder; - - /// Map functions to the set of registers they may overwrite starting at when - /// it is called until it returns to the caller. - std::map RegsKilledMap; +class FrameAnalysis { + BinaryContext &BC; + std::map &BFs; /// Map functions to the set of tuples representing /// accesses to stack positions that belongs to caller @@ -142,70 +134,44 @@ class FrameAnalysis : public BinaryFunctionPass { std::vector FIEVector; /// Analysis stats counters - uint64_t NumFunctionsAllClobber{0}; - uint64_t CountFunctionsAllClobber{0}; uint64_t NumFunctionsNotOptimized{0}; uint64_t NumFunctionsFailedRestoreFI{0}; uint64_t CountFunctionsNotOptimized{0}; uint64_t CountFunctionsFailedRestoreFI{0}; uint64_t CountDenominator{0}; - /// If this flag is set to true, the analysis will never run completely, - /// but will stop after callgraph and a clobber analysis for every function - /// has been computed. - bool ClobberAnalysisOnly{false}; - /// Convenience functions for appending MCAnnotations to instructions with /// our specific data - void addArgAccessesFor(const BinaryContext &BC, MCInst &Inst, - ArgAccesses &&AA); - void addArgInStackAccessFor(const BinaryContext &BC, MCInst &Inst, - const ArgInStackAccess &Arg); - void addFIEFor(const BinaryContext &BC, MCInst &Inst, - const FrameIndexEntry &FIE); - - /// Compute the set of registers \p Func may write to during its execution, - /// starting at the point when it is called up until when it returns. Returns - /// a BitVector the size of the target number of registers, representing the - /// set of clobbered registers. 
- BitVector getFunctionClobberList(const BinaryContext &BC, - const BinaryFunction *Func); + void addArgAccessesFor(MCInst &Inst, ArgAccesses &&AA); + void addArgInStackAccessFor(MCInst &Inst, const ArgInStackAccess &Arg); + void addFIEFor(MCInst &Inst, const FrameIndexEntry &FIE); /// Perform the step of building the set of registers clobbered by each - /// function execution, populating RegsKilledMap. - void buildClobberMap(const BinaryContext &BC); + /// function execution, populating RegsKilledMap and RegsGenMap. + void traverseCG(BinaryFunctionCallGraph &CG); /// Analyzes an instruction and if it is a call, checks the called function /// to record which args in stack are accessed, if any. Returns true if /// the args data associated with this instruction were updated. - bool updateArgsTouchedFor(const BinaryContext &BC, const BinaryFunction &BF, - MCInst &Inst, int CurOffset); + bool updateArgsTouchedFor(const BinaryFunction &BF, MCInst &Inst, + int CurOffset); /// Performs a pass over \p BF to check for accesses to arguments in stack, /// flagging those as accessing the caller stack frame. All functions called /// by \p BF must have been previously analyzed. Returns true if updated /// args data about this function. - bool computeArgsAccessed(const BinaryContext &BC, BinaryFunction &BF); + bool computeArgsAccessed(BinaryFunction &BF); /// Alias analysis to disambiguate which frame position is accessed by each /// instruction in function \p BF. Add MCAnnotation to /// instructions that access a frame position. Return false if it failed /// to analyze and this information can't be safely determined for \p BF. 
- bool restoreFrameIndex(const BinaryContext &BC, BinaryFunction &BF); + bool restoreFrameIndex(BinaryFunction &BF); public: - explicit FrameAnalysis(const cl::opt &PrintPass, - bool ClobberAnalysisOnly=false) - : BinaryFunctionPass(PrintPass), - ClobberAnalysisOnly(ClobberAnalysisOnly) { - // Position 0 of the vector should be always associated with "assume access - // everything". - ArgAccessesVector.emplace_back(ArgAccesses(/*AssumeEverything*/ true)); - } - - const char *getName() const override { - return "frame-analysis"; - } + explicit FrameAnalysis(BinaryContext &BC, + std::map &BFs, + BinaryFunctionCallGraph &CG); /// Return true if we could fully analyze \p Func bool hasFrameInfo(const BinaryFunction &Func) const { @@ -217,30 +183,19 @@ class FrameAnalysis : public BinaryFunctionPass { return FunctionsRequireAlignment.count(&Func); } - /// Compute the set of registers \p Inst may write to, marking them in - /// \p KillSet. If this is a call, try to get the set of registers the call - /// target will write to. 
- void getInstClobberList(const BinaryContext &BC, const MCInst &Inst, - BitVector &KillSet) const; - /// Functions for retrieving our specific MCAnnotation data from instructions - ErrorOr getArgAccessesFor(const BinaryContext &BC, - const MCInst &Inst); + ErrorOr getArgAccessesFor(const MCInst &Inst); - ErrorOr getArgAccessesFor(const BinaryContext &BC, - const MCInst &Inst) const; + ErrorOr getArgAccessesFor(const MCInst &Inst) const; - ErrorOr getFIEFor(const BinaryContext &BC, - const MCInst &Inst) const; - - /// Pass entry point - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + ErrorOr getFIEFor(const MCInst &Inst) const; /// Remove all MCAnnotations attached by this pass - void cleanAnnotations(const BinaryContext &BC, - std::map &BFs); + void cleanAnnotations(); + + ~FrameAnalysis() { + cleanAnnotations(); + } /// Print to standard output statistics about the analysis performed by this diff --git a/bolt/Passes/FrameOptimizer.cpp b/bolt/Passes/FrameOptimizer.cpp index 4662cf87515b..094e668f419e 100644 --- a/bolt/Passes/FrameOptimizer.cpp +++ b/bolt/Passes/FrameOptimizer.cpp @@ -10,7 +10,6 @@ //===----------------------------------------------------------------------===// #include "FrameOptimizer.h" -#include "FrameAnalysis.h" #include "ShrinkWrapping.h" #include "StackAvailableExpressions.h" #include "StackReachingUses.h" @@ -45,10 +44,11 @@ FrameOptimization("frame-opt", namespace llvm { namespace bolt { -void FrameOptimizerPass::removeUnnecessaryLoads(const FrameAnalysis &FA, +void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA, + const FrameAnalysis &FA, const BinaryContext &BC, BinaryFunction &BF) { - StackAvailableExpressions SAE(FA, BC, BF); + StackAvailableExpressions SAE(RA, FA, BC, BF); SAE.run(); DEBUG(dbgs() << "Performing unnecessary loads removal\n"); @@ -71,7 +71,7 @@ void FrameOptimizerPass::removeUnnecessaryLoads(const FrameAnalysis &FA, // if Inst is a load from stack and the 
current available expressions show // this value is available in a register or immediate, replace this load // with move from register or from immediate. - auto FIEX = FA.getFIEFor(BC, Inst); + auto FIEX = FA.getFIEFor(Inst); if (!FIEX) { Prev = &Inst; continue; @@ -88,7 +88,7 @@ void FrameOptimizerPass::removeUnnecessaryLoads(const FrameAnalysis &FA, for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB); I != ExprEnd; ++I) { const MCInst *AvailableInst = *I; - auto FIEY = FA.getFIEFor(BC, *AvailableInst); + auto FIEY = FA.getFIEFor(*AvailableInst); if (!FIEY) continue; assert(FIEY->IsStore && FIEY->IsSimple); @@ -172,7 +172,7 @@ void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA, (*I)->dump(); } }); - auto FIEX = FA.getFIEFor(BC, Inst); + auto FIEX = FA.getFIEFor(Inst); if (!FIEX) { Prev = &Inst; continue; @@ -217,8 +217,9 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC, return; // Run FrameAnalysis pass - FrameAnalysis FA(PrintPass); - FA.runOnFunctions(BC, BFs, LargeFunctions); + BinaryFunctionCallGraph CG = buildCallGraph(BC, BFs); + FrameAnalysis FA(BC, BFs, CG); + RegAnalysis RA(BC, BFs, CG); // Our main loop: perform caller-saved register optimizations, then // callee-saved register optimizations (shrink wrapping). 
@@ -237,7 +238,7 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC, } { NamedRegionTimer T1("remove loads", "FOP breakdown", true); - removeUnnecessaryLoads(FA, BC, I.second); + removeUnnecessaryLoads(RA, FA, BC, I.second); } { NamedRegionTimer T1("remove stores", "FOP breakdown", true); @@ -248,14 +249,12 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC, continue; { NamedRegionTimer T1("move spills", "FOP breakdown", true); - DataflowInfoManager Info(&FA, BC, I.second); + DataflowInfoManager Info(BC, I.second, &RA, &FA); ShrinkWrapping SW(FA, BC, I.second, Info); SW.perform(); } } - FA.cleanAnnotations(BC, BFs); - outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads << " redundant load(s) and " << NumRedundantStores << " unused store(s)\n"; diff --git a/bolt/Passes/FrameOptimizer.h b/bolt/Passes/FrameOptimizer.h index 4ba8e1c2bb56..3c6e3bee168a 100644 --- a/bolt/Passes/FrameOptimizer.h +++ b/bolt/Passes/FrameOptimizer.h @@ -14,6 +14,7 @@ #include "BinaryPasses.h" #include "FrameAnalysis.h" +#include "RegAnalysis.h" namespace llvm { namespace bolt { @@ -86,7 +87,8 @@ class FrameOptimizerPass : public BinaryFunctionPass { /// Perform a dataflow analysis in \p BF to reveal unnecessary reloads from /// the frame. Use the analysis to convert memory loads to register moves or /// immediate loads. Delete redundant register moves. 
- void removeUnnecessaryLoads(const FrameAnalysis &FA, + void removeUnnecessaryLoads(const RegAnalysis &RA, + const FrameAnalysis &FA, const BinaryContext &BC, BinaryFunction &BF); diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/Passes/IndirectCallPromotion.cpp index 7d7311347d6a..b2e54906db1b 100644 --- a/bolt/Passes/IndirectCallPromotion.cpp +++ b/bolt/Passes/IndirectCallPromotion.cpp @@ -679,9 +679,12 @@ void IndirectCallPromotion::runOnFunctions( if (opts::IndirectCallPromotion == ICP_NONE) return; - FrameAnalysis FA(PrintPass, /*ClobberAnalysisOnly=*/true); - if (opts::IndirectCallPromotion >= ICP_JUMP_TABLES) - FA.runOnFunctions(BC, BFs, LargeFunctions); + std::unique_ptr RA; + std::unique_ptr CG; + if (opts::IndirectCallPromotion >= ICP_JUMP_TABLES) { + CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs))); + RA.reset(new RegAnalysis(BC, BFs, *CG)); + } for (auto &BFIt : BFs) { auto &Function = BFIt.second; @@ -716,7 +719,7 @@ void IndirectCallPromotion::runOnFunctions( if (BBs.empty()) continue; - DataflowInfoManager Info(&FA, BC, Function); + DataflowInfoManager Info(BC, Function, RA.get(), nullptr); while (!BBs.empty()) { auto *BB = BBs.back(); BBs.pop_back(); @@ -864,9 +867,6 @@ void IndirectCallPromotion::runOnFunctions( TotalIndirectJmps += FuncTotalIndirectJmps; } - if (opts::IndirectCallPromotion >= ICP_JUMP_TABLES) - FA.cleanAnnotations(BC, BFs); - outs() << "BOLT-INFO: ICP total indirect callsites = " << TotalIndirectCallsites << "\n" diff --git a/bolt/Passes/LivenessAnalysis.h b/bolt/Passes/LivenessAnalysis.h index ed9e0f00a1e2..739f49150f4d 100644 --- a/bolt/Passes/LivenessAnalysis.h +++ b/bolt/Passes/LivenessAnalysis.h @@ -13,9 +13,14 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_LIVENESSANALYSIS_H #include "DataflowAnalysis.h" -#include "FrameAnalysis.h" +#include "RegAnalysis.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Timer.h" +namespace opts { +extern llvm::cl::opt AssumeABI; +} + namespace llvm { namespace 
bolt { @@ -24,9 +29,9 @@ class LivenessAnalysis friend class DataflowAnalysis; public: - LivenessAnalysis(const FrameAnalysis &FA, const BinaryContext &BC, + LivenessAnalysis(const RegAnalysis &RA, const BinaryContext &BC, BinaryFunction &BF) - : DataflowAnalysis(BC, BF), FA(FA), + : DataflowAnalysis(BC, BF), RA(RA), NumRegs(BC.MRI->getNumRegs()) {} virtual ~LivenessAnalysis(); @@ -42,9 +47,21 @@ class LivenessAnalysis DataflowAnalysis::run(); } + // Return a usable general-purpose reg after point P. Return 0 if no reg is + // available. + MCPhysReg scavengeRegAfter(ProgramPoint P) { + BitVector BV = *this->getStateAt(P); + BV.flip(); + BitVector GPRegs(NumRegs, false); + this->BC.MIA->getGPRegs(GPRegs, *this->BC.MRI); + BV &= GPRegs; + int Reg = BV.find_first(); + return Reg != -1 ? Reg : 0; + } + protected: - /// Reference to the result of stack frame analysis - const FrameAnalysis &FA; + /// Reference to the result of reg analysis + const RegAnalysis &RA; const uint16_t NumRegs; void preflight() {} @@ -63,18 +80,34 @@ class LivenessAnalysis BitVector computeNext(const MCInst &Point, const BitVector &Cur) { BitVector Next = Cur; + bool IsCall = this->BC.MIA->isCall(Point); // Kill auto Written = BitVector(NumRegs, false); - if (this->BC.MIA->isCall(Point)) - FA.getInstClobberList(this->BC, Point, Written); - else + if (!IsCall) { this->BC.MIA->getWrittenRegs(Point, Written, *this->BC.MRI); + } else { + RA.getInstClobberList(Point, Written); + // When clobber list is conservative, it is clobbering all/most registers, + // a conservative estimate because it knows nothing about this call. + // For our purposes, assume it kills no registers/callee-saved regs + // because we don't really know what's going on. 
+ if (RA.isConservative(Written)) { + Written.reset(); + BC.MIA->getCalleeSavedRegs(Written, *this->BC.MRI); + } + } Written.flip(); Next &= Written; // Gen if (!this->BC.MIA->isCFI(Point)) { auto Used = BitVector(NumRegs, false); - this->BC.MIA->getUsedRegs(Point, Used, *this->BC.MRI); + RA.getInstUsedRegsList(Point, Used, /*GetClobbers*/false); + if (IsCall && + (!BC.MIA->isTailCall(Point) || !BC.MIA->isConditionalBranch(Point))) { + // Never gen FLAGS from a non-conditional call... this is overly + // conservative + Used.reset(BC.MIA->getFlagsReg()); + } Next |= Used; } return Next; diff --git a/bolt/Passes/ReachingDefOrUse.h b/bolt/Passes/ReachingDefOrUse.h index 9b5f8695b3f1..8d11ec0d9c5c 100644 --- a/bolt/Passes/ReachingDefOrUse.h +++ b/bolt/Passes/ReachingDefOrUse.h @@ -13,6 +13,7 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_REACHINGDEFORUSE_H #include "DataflowAnalysis.h" +#include "RegAnalysis.h" #include "llvm/Support/Timer.h" namespace llvm { @@ -28,16 +29,16 @@ class ReachingDefOrUse friend class DataflowAnalysis, BitVector, !Def>; public: - ReachingDefOrUse(const FrameAnalysis &FA, const BinaryContext &BC, + ReachingDefOrUse(const RegAnalysis &RA, const BinaryContext &BC, BinaryFunction &BF) - : InstrsDataflowAnalysis, !Def>(BC, BF), FA(FA) {} + : InstrsDataflowAnalysis, !Def>(BC, BF), RA(RA) {} virtual ~ReachingDefOrUse() {} bool isReachedBy(MCPhysReg Reg, ExprIterator Candidates) { for (auto I = Candidates; I != this->expr_end(); ++I) { auto BV = BitVector(this->BC.MRI->getNumRegs(), false); if (Def) { - FA.getInstClobberList(this->BC, **I, BV); + RA.getInstClobberList(**I, BV); } else { this->BC.MIA->getTouchedRegs(**I, BV, *this->BC.MRI); } @@ -57,8 +58,8 @@ class ReachingDefOrUse } protected: - /// Reference to the result of stack frame analysis - const FrameAnalysis &FA; + /// Reference to the result of reg analysis + const RegAnalysis &RA; void preflight() { // Populate our universe of tracked expressions with all instructions @@ -89,11 +90,11 @@ 
class ReachingDefOrUse // getClobberedRegs for X and Y. If they intersect, return true auto XClobbers = BitVector(this->BC.MRI->getNumRegs(), false); auto YClobbers = BitVector(this->BC.MRI->getNumRegs(), false); - FA.getInstClobberList(this->BC, *X, XClobbers); + RA.getInstClobberList(*X, XClobbers); // In defs, write after write -> kills first write // In uses, write after access (read or write) -> kills access if (Def) - FA.getInstClobberList(this->BC, *Y, YClobbers); + RA.getInstClobberList(*Y, YClobbers); else this->BC.MIA->getTouchedRegs(*Y, YClobbers, *this->BC.MRI); // X kills Y if it clobbers Y completely -- this is a conservative approach. diff --git a/bolt/Passes/RegAnalysis.cpp b/bolt/Passes/RegAnalysis.cpp new file mode 100644 index 000000000000..b17ada273daf --- /dev/null +++ b/bolt/Passes/RegAnalysis.cpp @@ -0,0 +1,207 @@ +#include "RegAnalysis.h" +#include "CallGraphWalker.h" +#include "llvm/Support/CommandLine.h" + +#define DEBUG_TYPE "ra" + +using namespace llvm; + +namespace opts { +extern cl::opt Verbosity; +extern cl::OptionCategory BoltOptCategory; + +cl::opt AssumeABI( + "assume-abi", + cl::desc("assume the ABI is never violated"), + cl::ZeroOrMore, + cl::init(false), + cl::cat(BoltOptCategory)); +} + +namespace llvm { +namespace bolt { + +RegAnalysis::RegAnalysis(BinaryContext &BC, + std::map &BFs, + BinaryFunctionCallGraph &CG) + : BC(BC) { + CallGraphWalker CGWalker(BC, BFs, CG); + + CGWalker.registerVisitor([&](BinaryFunction *Func) -> bool { + BitVector RegsKilled = getFunctionClobberList(Func); + bool Updated = RegsKilledMap.find(Func) == RegsKilledMap.end() || + RegsKilledMap[Func] != RegsKilled; + if (Updated) + RegsKilledMap[Func] = std::move(RegsKilled); + return Updated; + }); + + CGWalker.registerVisitor([&](BinaryFunction *Func) -> bool { + BitVector RegsGen = getFunctionUsedRegsList(Func); + bool Updated = RegsGenMap.find(Func) == RegsGenMap.end() || + RegsGenMap[Func] != RegsGen; + if (Updated) + RegsGenMap[Func] = 
std::move(RegsGen); + return Updated; + }); + + CGWalker.walk(); + + if (opts::Verbosity == 0) { +#ifndef NDEBUG + if (!DebugFlag || !isCurrentDebugType(DEBUG_TYPE)) + return; +#else + return; +#endif + } + + // This loop is for computing statistics only + for (auto &MapEntry : BFs) { + auto *Func = &MapEntry.second; + auto Iter = RegsKilledMap.find(Func); + assert(Iter != RegsKilledMap.end() && + "Failed to compute all clobbers list"); + if (Iter->second.all()) { + auto Count = Func->getExecutionCount(); + if (Count != BinaryFunction::COUNT_NO_PROFILE) + CountFunctionsAllClobber += Count; + ++NumFunctionsAllClobber; + } + DEBUG_WITH_TYPE("fa", + dbgs() << "Killed regs set for func: " << Func->getPrintName() << "\n"; + const BitVector &RegsKilled = Iter->second; + int RegIdx = RegsKilled.find_first(); + while (RegIdx != -1) { + dbgs() << "\tREG" << RegIdx; + RegIdx = RegsKilled.find_next(RegIdx); + }; + dbgs() << "\nUsed regs set for func: " << Func->getPrintName() << "\n"; + const BitVector &RegsUsed = RegsGenMap.find(Func)->second; + RegIdx = RegsUsed.find_first(); + while (RegIdx != -1) { + dbgs() << "\tREG" << RegIdx; + RegIdx = RegsUsed.find_next(RegIdx); + }; + dbgs() << "\n"; + ); + } +} + +void RegAnalysis::beConservative(BitVector &Result) const { + if (!opts::AssumeABI) { + Result.set(); + } else { + BitVector BV(BC.MRI->getNumRegs(), false); + BC.MIA->getCalleeSavedRegs(BV, *BC.MRI); + BV.flip(); + Result |= BV; + } +} + +bool RegAnalysis::isConservative(BitVector &Vec) const { + if (!opts::AssumeABI) { + return Vec.all(); + } else { + BitVector BV(BC.MRI->getNumRegs(), false); + BC.MIA->getCalleeSavedRegs(BV, *BC.MRI); + BV |= Vec; + return BV.all(); + } +} + +void RegAnalysis::getInstUsedRegsList(const MCInst &Inst, BitVector &RegSet, + bool GetClobbers) const { + if (!BC.MIA->isCall(Inst)) { + if (GetClobbers) + BC.MIA->getClobberedRegs(Inst, RegSet, *BC.MRI); + else + BC.MIA->getUsedRegs(Inst, RegSet, *BC.MRI); + return; + } + + const auto 
*TargetSymbol = BC.MIA->getTargetSymbol(Inst); + // If indirect call, we know nothing + if (TargetSymbol == nullptr) { + beConservative(RegSet); + return; + } + + const auto *Function = BC.getFunctionForSymbol(TargetSymbol); + if (Function == nullptr) { + // Call to a function without a BinaryFunction object. + // This should be a call to a PLT entry, and since it is a trampoline to + // a DSO, we can't really know the code in advance. + beConservative(RegSet); + return; + } + if (GetClobbers) { + auto BV = RegsKilledMap.find(Function); + if (BV != RegsKilledMap.end()) { + RegSet |= BV->second; + return; + } + // Ignore calls to function whose clobber list wasn't yet calculated. This + // instruction will be evaluated again once we have info for the callee. + return; + } + auto BV = RegsGenMap.find(Function); + if (BV != RegsGenMap.end()) { + RegSet |= BV->second; + return; + } +} + +void RegAnalysis::getInstClobberList(const MCInst &Inst, + BitVector &KillSet) const { + return getInstUsedRegsList(Inst, KillSet, /*GetClobbers*/ true); +} + +BitVector RegAnalysis::getFunctionUsedRegsList(const BinaryFunction *Func) { + BitVector UsedRegs = BitVector(BC.MRI->getNumRegs(), false); + + if (!Func->isSimple() || !Func->hasCFG()) { + beConservative(UsedRegs); + return UsedRegs; + } + + for (const auto &BB : *Func) { + for (const auto &Inst : BB) { + getInstUsedRegsList(Inst, UsedRegs, /*GetClobbers*/false); + if (UsedRegs.all()) + return UsedRegs; + } + } + + return UsedRegs; +} + +BitVector RegAnalysis::getFunctionClobberList(const BinaryFunction *Func) { + BitVector RegsKilled = BitVector(BC.MRI->getNumRegs(), false); + + if (!Func->isSimple() || !Func->hasCFG()) { + beConservative(RegsKilled); + return RegsKilled; + } + + for (const auto &BB : *Func) { + for (const auto &Inst : BB) { + getInstClobberList(Inst, RegsKilled); + if (RegsKilled.all()) + return RegsKilled; + } + } + + return RegsKilled; +} + +void RegAnalysis::printStats() { + outs() << "BOLT-INFO REG 
ANALYSIS: Number of functions conservatively " + "treated as clobbering all registers: " + << NumFunctionsAllClobber + << format(" (%.1lf%% dyn cov)\n", + (100.0 * CountFunctionsAllClobber / CountDenominator)); +} + +} +} diff --git a/bolt/Passes/RegAnalysis.h b/bolt/Passes/RegAnalysis.h new file mode 100644 index 000000000000..dd802bcfb5f3 --- /dev/null +++ b/bolt/Passes/RegAnalysis.h @@ -0,0 +1,82 @@ +//===--- Passes/RegAnalysis.h ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REGANALYSIS_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_REGANALYSIS_H + +#include "BinaryContext.h" +#include "BinaryFunction.h" +#include "BinaryFunctionCallGraph.h" +#include "llvm/ADT/BitVector.h" +#include + +namespace llvm { +namespace bolt { + +/// Determine the set of registers read or clobbered for each instruction +/// in a BinaryFunction. If the instruction is a call, this analysis rely on +/// a call graph traversal to accurately extract the set of registers touched +/// after the call returns. +class RegAnalysis { + BinaryContext &BC; + + /// Map functions to the set of registers they may overwrite starting at when + /// it is called until it returns to the caller. + std::map RegsKilledMap; + + /// Similar concept above but for registers that are read in that function. + std::map RegsGenMap; + + /// Analysis stats counters + uint64_t NumFunctionsAllClobber{0}; + uint64_t CountFunctionsAllClobber{0}; + uint64_t CountDenominator{0}; + + /// Helper function used to get the set of clobbered/used regs whenever + /// we know nothing about the function. 
+ void beConservative(BitVector &Result) const; + + /// Compute the set of registers \p Func may read from during its execution. + BitVector getFunctionUsedRegsList(const BinaryFunction *Func); + + /// Compute the set of registers \p Func may write to during its execution, + /// starting at the point when it is called up until when it returns. Returns + /// a BitVector the size of the target number of registers, representing the + /// set of clobbered registers. + BitVector getFunctionClobberList(const BinaryFunction *Func); + +public: + RegAnalysis(BinaryContext &BC, std::map &BFs, + BinaryFunctionCallGraph &CG); + + /// Compute the set of registers \p Inst may read from, marking them in + /// \p RegSet. If GetClobbers is true, the set set the instr may write to. + /// Use the callgraph to fill out this info for calls. + void getInstUsedRegsList(const MCInst &Inst, BitVector &RegSet, + bool GetClobbers) const; + + /// Compute the set of registers \p Inst may write to, marking them in + /// \p KillSet. If this is a call, try to get the set of registers the call + /// target will write to. + void getInstClobberList(const MCInst &Inst, BitVector &KillSet) const; + + /// Return true iff Vec has a conservative estimation of used/clobbered regs, + /// expressing no specific knowledge of reg usage. 
+ bool isConservative(BitVector &Vec) const; + + /// Print stats about the quality of our analysis + void printStats(); +}; + +} +} + +#endif diff --git a/bolt/Passes/ShrinkWrapping.cpp b/bolt/Passes/ShrinkWrapping.cpp index dcc5b5758c60..58570fb036b3 100644 --- a/bolt/Passes/ShrinkWrapping.cpp +++ b/bolt/Passes/ShrinkWrapping.cpp @@ -41,7 +41,7 @@ void CalleeSavedAnalysis::analyzeSaves() { DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n"); const MCInst *Prev = nullptr; for (auto &Inst : BB) { - if (auto FIE = FA.getFIEFor(BC, Inst)) { + if (auto FIE = FA.getFIEFor(Inst)) { if (!FIE->IsStore || !FIE->IsSimple || !FIE->IsStoreFromReg || FIE->StackOffset >= 0) { Prev = &Inst; @@ -86,7 +86,7 @@ void CalleeSavedAnalysis::analyzeRestores() { const MCInst *Prev = nullptr; for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { auto &Inst = *I; - if (auto FIE = FA.getFIEFor(BC, Inst)) { + if (auto FIE = FA.getFIEFor(Inst)) { if (!FIE->IsLoad || !FIE->IsSimple || !CalleeSaved[FIE->RegOrImm] || FIE->StackOffset >= 0) { Prev = &Inst; @@ -229,7 +229,7 @@ void StackLayoutModifier::classifyStackAccesses() { for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { auto &Inst = *I; checkFramePointerInitialization(Inst); - auto FIEX = FA.getFIEFor(BC, Inst); + auto FIEX = FA.getFIEFor(Inst); if (!FIEX) { Prev = &Inst; continue; @@ -346,7 +346,7 @@ bool StackLayoutModifier::canCollapseRegion(MCInst *DeletedPush) { if (!IsSimple || !BC.MIA->isPush(*DeletedPush)) return false; - auto FIE = FA.getFIEFor(BC, *DeletedPush); + auto FIE = FA.getFIEFor(*DeletedPush); if (!FIE) return false; @@ -370,7 +370,7 @@ bool StackLayoutModifier::canCollapseRegion(int64_t RegionAddr) { } bool StackLayoutModifier::collapseRegion(MCInst *DeletedPush) { - auto FIE = FA.getFIEFor(BC, *DeletedPush); + auto FIE = FA.getFIEFor(*DeletedPush); if (!FIE) return false; int64_t RegionAddr = FIE->StackOffset; @@ -414,7 +414,7 @@ bool StackLayoutModifier::collapseRegion(MCInst *Alloc, int64_t RegionAddr, 
continue; } - auto FIE = FA.getFIEFor(BC, Inst); + auto FIE = FA.getFIEFor(Inst); assert(FIE); if (FIE->StackPtrReg == BC.MIA->getStackPointer() && Slot < RegionAddr) continue; @@ -499,7 +499,7 @@ bool StackLayoutModifier::insertRegion(ProgramPoint P, int64_t RegionSz) { continue; } - auto FIE = FA.getFIEFor(BC, Inst); + auto FIE = FA.getFIEFor(Inst); assert(FIE); if (FIE->StackPtrReg == BC.MIA->getStackPointer() && Slot < RegionAddr) continue; diff --git a/bolt/Passes/StackAvailableExpressions.cpp b/bolt/Passes/StackAvailableExpressions.cpp index d0a5f5b1c12a..a2169d2992ce 100644 --- a/bolt/Passes/StackAvailableExpressions.cpp +++ b/bolt/Passes/StackAvailableExpressions.cpp @@ -17,10 +17,11 @@ namespace llvm { namespace bolt { -StackAvailableExpressions::StackAvailableExpressions(const FrameAnalysis &FA, +StackAvailableExpressions::StackAvailableExpressions(const RegAnalysis &RA, + const FrameAnalysis &FA, const BinaryContext &BC, BinaryFunction &BF) - : InstrsDataflowAnalysis(BC, BF), FA(FA) {} + : InstrsDataflowAnalysis(BC, BF), RA(RA), FA(FA) {} void StackAvailableExpressions::preflight() { DEBUG(dbgs() << "Starting StackAvailableExpressions on \"" @@ -31,7 +32,7 @@ void StackAvailableExpressions::preflight() { // program. 
for (auto &BB : Func) { for (auto &Inst : BB) { - auto FIE = FA.getFIEFor(BC, Inst); + auto FIE = FA.getFIEFor(Inst); if (!FIE) continue; if (FIE->IsStore == true && FIE->IsSimple == true) { @@ -80,8 +81,8 @@ bool isLoadRedundant(const FrameIndexEntry &LoadFIE, bool StackAvailableExpressions::doesXKillsY(const MCInst *X, const MCInst *Y) { // if both are stores, and both store to the same stack location, return // true - auto FIEX = FA.getFIEFor(BC, *X); - auto FIEY = FA.getFIEFor(BC, *Y); + auto FIEX = FA.getFIEFor(*X); + auto FIEY = FA.getFIEFor(*Y); if (FIEX && FIEY) { if (isLoadRedundant(*FIEX, *FIEY)) return false; @@ -93,14 +94,14 @@ bool StackAvailableExpressions::doesXKillsY(const MCInst *X, const MCInst *Y) { // getClobberedRegs for X and Y. If they intersect, return true BitVector XClobbers = BitVector(BC.MRI->getNumRegs(), false); BitVector YClobbers = BitVector(BC.MRI->getNumRegs(), false); - FA.getInstClobberList(BC, *X, XClobbers); + RA.getInstClobberList(*X, XClobbers); // If Y is a store to stack, its clobber list is its source reg. This is // different than the rest because we want to check if the store source // reaches its corresponding load untouched. 
if (FIEY && FIEY->IsStore == true && FIEY->IsStoreFromReg) { YClobbers.set(FIEY->RegOrImm); } else { - FA.getInstClobberList(BC, *Y, YClobbers); + RA.getInstClobberList(*Y, YClobbers); } XClobbers &= YClobbers; return XClobbers.any(); @@ -121,7 +122,7 @@ BitVector StackAvailableExpressions::computeNext(const MCInst &Point, } } // Gen - if (auto FIE = FA.getFIEFor(BC, Point)) { + if (auto FIE = FA.getFIEFor(Point)) { if (FIE->IsStore == true && FIE->IsSimple == true) Next.set(ExprToIdx[&Point]); } diff --git a/bolt/Passes/StackAvailableExpressions.h b/bolt/Passes/StackAvailableExpressions.h index 6ec3234ff6ad..d96f49d3886b 100644 --- a/bolt/Passes/StackAvailableExpressions.h +++ b/bolt/Passes/StackAvailableExpressions.h @@ -13,6 +13,7 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_STACKAVAILABLEEXPRESSIONS_H #include "DataflowAnalysis.h" +#include "RegAnalysis.h" #include "llvm/Support/Timer.h" namespace llvm { @@ -25,7 +26,7 @@ class StackAvailableExpressions friend class DataflowAnalysis; public: - StackAvailableExpressions(const FrameAnalysis &FA, + StackAvailableExpressions(const RegAnalysis &RA, const FrameAnalysis &FA, const BinaryContext &BC, BinaryFunction &BF); virtual ~StackAvailableExpressions() {} @@ -35,7 +36,7 @@ class StackAvailableExpressions } protected: - /// Reference to the result of stack frame analysis + const RegAnalysis &RA; const FrameAnalysis &FA; void preflight(); diff --git a/bolt/Passes/StackReachingUses.cpp b/bolt/Passes/StackReachingUses.cpp index 68e76b1438ff..a7a91e92b06a 100644 --- a/bolt/Passes/StackReachingUses.cpp +++ b/bolt/Passes/StackReachingUses.cpp @@ -22,7 +22,7 @@ bool StackReachingUses::isStoreUsed(const FrameIndexEntry &StoreFIE, for (auto I = Candidates; I != expr_end(); ++I) { const MCInst *ReachingInst = *I; if (IncludeLocalAccesses) { - if (auto FIEY = FA.getFIEFor(BC, *ReachingInst)) { + if (auto FIEY = FA.getFIEFor(*ReachingInst)) { assert(FIEY->IsLoad == 1); if (StoreFIE.StackOffset + StoreFIE.Size > FIEY->StackOffset && 
StoreFIE.StackOffset < FIEY->StackOffset + FIEY->Size) { @@ -30,7 +30,7 @@ bool StackReachingUses::isStoreUsed(const FrameIndexEntry &StoreFIE, } } } - auto Args = FA.getArgAccessesFor(BC, *ReachingInst); + auto Args = FA.getArgAccessesFor(*ReachingInst); if (!Args) continue; if (Args->AssumeEverything) { @@ -55,14 +55,14 @@ void StackReachingUses::preflight() { // program. for (auto &BB : Func) { for (auto &Inst : BB) { - if (auto FIE = FA.getFIEFor(BC, Inst)) { + if (auto FIE = FA.getFIEFor(Inst)) { if (FIE->IsLoad == true) { Expressions.push_back(&Inst); ExprToIdx[&Inst] = NumInstrs++; continue; } } - auto AA = FA.getArgAccessesFor(BC, Inst); + auto AA = FA.getArgAccessesFor(Inst); if (AA && (!AA->Set.empty() || AA->AssumeEverything)) { Expressions.push_back(&Inst); ExprToIdx[&Inst] = NumInstrs++; @@ -74,8 +74,8 @@ void StackReachingUses::preflight() { bool StackReachingUses::doesXKillsY(const MCInst *X, const MCInst *Y) { // if X is a store to the same stack location and the bytes fetched is a // superset of those bytes affected by the load in Y, return true - auto FIEX = FA.getFIEFor(BC, *X); - auto FIEY = FA.getFIEFor(BC, *Y); + auto FIEX = FA.getFIEFor(*X); + auto FIEY = FA.getFIEFor(*Y); if (FIEX && FIEY) { if (FIEX->IsStore == true && FIEY->IsLoad == true && FIEX->StackOffset <= FIEY->StackOffset && @@ -98,11 +98,11 @@ BitVector StackReachingUses::computeNext(const MCInst &Point, } }; // Gen - if (auto FIE = FA.getFIEFor(BC, Point)) { + if (auto FIE = FA.getFIEFor(Point)) { if (FIE->IsLoad == true) Next.set(ExprToIdx[&Point]); } - auto AA = FA.getArgAccessesFor(BC, Point); + auto AA = FA.getArgAccessesFor(Point); if (AA && (!AA->Set.empty() || AA->AssumeEverything)) Next.set(ExprToIdx[&Point]); return Next; From a90239e18001a82d86223db33d81cc776ce4162c Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 7 Jun 2017 20:06:29 -0700 Subject: [PATCH 270/904] [BOLT] Fix ELF inter-section references Summary: Since we are stripping non-allocatable 
relocation sections from the binary and adding new sections it changes section indices in the binary. Some sections refer to other sections by their index which is stored in sh_link or sh_info field. Hence we need to update these field. In the past update of indices was done ad-hoc and as we started adding more complex updates to section header table the update mechanism became broken in some cases. As a result, we were putting wrong indices into sh_link/sh_info. The broken case was discovered while investigating a problem with a stripped BOLTed binary. In BOLTed binary .rela.plt was incorrectly pointing to one of the debug sections and strip command removed the debug section together with .rela section that was referencing it. The new update mechanism computes complete old to new section index mapping and updates sh_link/sh_info fields based on the mapping before writing section header entries into an output file. (cherry picked from commit 830f01c638dfcd4b1075d6180148b60f42a9d9ce) --- bolt/RewriteInstance.cpp | 102 ++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 60 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 1ee3aa7a75a0..60fed04e0854 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2855,20 +2855,18 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { auto *Obj = File->getELFFile(); auto &OS = Out->os(); - auto SHTOffset = OS.tell(); - uint64_t CurrentSectionIndex = 0; - NewSectionIndex.resize(Obj->getNumSections()); + std::vector SectionsToWrite; - SHTOffset = appendPadding(OS, SHTOffset, sizeof(Elf_Shdr)); + NewSectionIndex.resize(Obj->getNumSections()); // Copy over entries for original allocatable sections with minor // modifications (e.g. name). for (auto &Section : Obj->sections()) { // Always ignore this section. 
if (Section.sh_type == ELF::SHT_NULL) { - OS.write(reinterpret_cast(&Section), sizeof(Section)); - NewSectionIndex[0] = CurrentSectionIndex++; + NewSectionIndex[0] = SectionsToWrite.size(); + SectionsToWrite.emplace_back(Section); continue; } @@ -2892,18 +2890,12 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { NewSection.sh_name = SHStrTab.getOffset(*SectionName); } - if (Section.sh_addr <= NewTextSectionStartAddress && - Section.sh_addr + Section.sh_size > NewTextSectionStartAddress) { - NewTextSectionIndex = CurrentSectionIndex; - } - - OS.write(reinterpret_cast(&NewSection), sizeof(NewSection)); NewSectionIndex[std::distance(Obj->section_begin(), &Section)] = - CurrentSectionIndex++; + SectionsToWrite.size(); + SectionsToWrite.emplace_back(NewSection); } // Create entries for new allocatable sections. - std::vector SectionsToRewrite; for (auto &SMII : EFMM->SectionMapInfo) { const auto &SectionName = SMII.first; const auto &SI = SMII.second; @@ -2927,25 +2919,9 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { NewSection.sh_link = 0; NewSection.sh_info = 0; NewSection.sh_addralign = SI.Alignment; - SectionsToRewrite.emplace_back(NewSection); - } - - // Write section header entries for new allocatable sections in offset order. 
- std::stable_sort(SectionsToRewrite.begin(), SectionsToRewrite.end(), - [] (Elf_Shdr A, Elf_Shdr B) { - return A.sh_offset < B.sh_offset; - }); - for (auto &SI : SectionsToRewrite) { - if (SI.sh_addr <= NewTextSectionStartAddress && - SI.sh_addr + SI.sh_size > NewTextSectionStartAddress) { - NewTextSectionIndex = CurrentSectionIndex; - } - OS.write(reinterpret_cast(&SI), - sizeof(SI)); - ++CurrentSectionIndex; + SectionsToWrite.emplace_back(NewSection); } - int64_t SectionCountDelta = SectionsToRewrite.size(); uint64_t LastFileOffset = 0; // Copy over entries for non-allocatable sections performing necessary @@ -2955,10 +2931,9 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { continue; if (Section.sh_flags & ELF::SHF_ALLOC) continue; - if (Section.sh_type == ELF::SHT_RELA) { - --SectionCountDelta; + // Strip non-allocatable relocation sections. + if (Section.sh_type == ELF::SHT_RELA) continue; - } ErrorOr SectionName = Obj->getSectionName(&Section); check_error(SectionName.getError(), "cannot get section name"); @@ -2973,25 +2948,14 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { NewSection.sh_size = SI.Size; NewSection.sh_name = SHStrTab.getOffset(*SectionName); - // Adjust sh_link for sections that use it. - if (Section.sh_link) - NewSection.sh_link = Section.sh_link + SectionCountDelta; - - // Adjust sh_info for relocation sections. - if (Section.sh_type == ELF::SHT_REL || Section.sh_type == ELF::SHT_RELA) { - if (Section.sh_info) - NewSection.sh_info = Section.sh_info + SectionCountDelta; - } - - OS.write(reinterpret_cast(&NewSection), sizeof(NewSection)); NewSectionIndex[std::distance(Obj->section_begin(), &Section)] = - CurrentSectionIndex++; + SectionsToWrite.size(); + SectionsToWrite.emplace_back(NewSection); LastFileOffset = SI.FileOffset; } // Create entries for new non-allocatable sections. 
- SectionsToRewrite.clear(); for (auto &SII : EFMM->NoteSectionInfo) { const auto &SectionName = SII.first; const auto &SI = SII.second; @@ -3012,20 +2976,41 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { NewSection.sh_link = 0; NewSection.sh_info = 0; NewSection.sh_addralign = SI.Alignment ? SI.Alignment : 1; - SectionsToRewrite.emplace_back(NewSection); + SectionsToWrite.emplace_back(NewSection); } - // Write section header entries for new non-allocatable sections. - std::stable_sort(SectionsToRewrite.begin(), SectionsToRewrite.end(), + // Sort sections by their offset prior to writing. Only newly created sections + // were unsorted, hence this wouldn't ruin indices in NewSectionIndex. + std::stable_sort(SectionsToWrite.begin(), SectionsToWrite.end(), [] (Elf_Shdr A, Elf_Shdr B) { return A.sh_offset < B.sh_offset; }); - for (auto &SI : SectionsToRewrite) { - OS.write(reinterpret_cast(&SI), sizeof(SI)); - ++CurrentSectionIndex; + + DEBUG( + dbgs() << "BOLT-DEBUG: old to new section index mapping:\n"; + for (uint64_t I = 0; I < NewSectionIndex.size(); ++I) { + dbgs() << " " << I << " -> " << NewSectionIndex[I] << '\n'; + } + ); + + // Align starting address for section header table. + auto SHTOffset = OS.tell(); + SHTOffset = appendPadding(OS, SHTOffset, sizeof(Elf_Shdr)); + + // Write all section header entries while patching section references. 
+ for (uint64_t Index = 0; Index < SectionsToWrite.size(); ++Index) { + auto &Section = SectionsToWrite[Index]; + if (Section.sh_addr <= NewTextSectionStartAddress && + Section.sh_addr + Section.sh_size > NewTextSectionStartAddress) { + NewTextSectionIndex = Index; + } + Section.sh_link = NewSectionIndex[Section.sh_link]; + if (Section.sh_type == ELF::SHT_REL || Section.sh_type == ELF::SHT_RELA) { + if (Section.sh_info) + Section.sh_info = NewSectionIndex[Section.sh_info]; + } + OS.write(reinterpret_cast(&Section), sizeof(Section)); } - const auto AllocSectionCountDelta = SectionCountDelta; - SectionCountDelta += SectionsToRewrite.size(); // Fix ELF header. auto NewEhdr = *Obj->getHeader(); @@ -3037,12 +3022,9 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { NewEhdr.e_phoff = PHDRTableOffset; NewEhdr.e_phnum = Phnum; NewEhdr.e_shoff = SHTOffset; - NewEhdr.e_shnum = NewEhdr.e_shnum + SectionCountDelta; - NewEhdr.e_shstrndx = NewEhdr.e_shstrndx + AllocSectionCountDelta; + NewEhdr.e_shnum = SectionsToWrite.size(); + NewEhdr.e_shstrndx = NewSectionIndex[NewEhdr.e_shstrndx]; OS.pwrite(reinterpret_cast(&NewEhdr), sizeof(NewEhdr), 0); - - assert(NewEhdr.e_shnum == CurrentSectionIndex && - "internal calculation error"); } template From 6764e1c67a74cb024298ff12d1a9b44fb3ba6cd2 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 7 Jun 2017 18:31:06 -0700 Subject: [PATCH 271/904] [BOLT] Fix hfsort+ crash when no perf data is present. Summary: hfsort+ was trying to access the back() of an empty vector when no perf data is present. Just add a guard around that code. 
(cherry picked from commit 71121845c894961c4e7ae10957c8d1521451614f) --- bolt/Passes/HFSortPlus.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bolt/Passes/HFSortPlus.cpp b/bolt/Passes/HFSortPlus.cpp index 761a807413b1..8e0d58dba03a 100644 --- a/bolt/Passes/HFSortPlus.cpp +++ b/bolt/Passes/HFSortPlus.cpp @@ -129,7 +129,7 @@ struct AlgoState { // current address of the function from the beginning of its cluster std::vector Addr; // maximum cluster id. - size_t MaxClusterId; + size_t MaxClusterId{0}; }; } @@ -403,8 +403,10 @@ std::vector hfsortPlus(const CallGraph &Cg) { State.Cg = &Cg; State.TotalSamples = 0; State.FuncCluster = std::vector(Cg.numNodes(), nullptr); - State.Addr = std::vector(Cg.numNodes(), InvalidAddr); - State.MaxClusterId = AllClusters.back().id(); + State.Addr = std::vector(Cg.numNodes(), InvalidAddr); + if (!AllClusters.empty()) { + State.MaxClusterId = AllClusters.back().id(); + } for (NodeId F = 0; F < Cg.numNodes(); F++) { if (Cg.samples(F) == 0) continue; Clusters.push_back(&AllClusters[F]); From 73cf0f5b07af1f0cd54e711d712563db2cef3ecd Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Thu, 8 Jun 2017 13:46:17 -0700 Subject: [PATCH 272/904] [BOLT] Only print stats when requested Summary: Make LLVM timers only output numbers when the -time-opts option is used. 
(cherry picked from commit ec242b30604d1dc9a92ccb50276097a282e88dc5) --- bolt/BinaryPassManager.cpp | 2 +- bolt/Passes/BinaryFunctionCallGraph.cpp | 10 ++++++++-- bolt/Passes/CallGraphWalker.cpp | 7 ++++++- bolt/Passes/CallGraphWalker.h | 6 +----- bolt/Passes/DominatorAnalysis.h | 7 ++++++- bolt/Passes/FrameAnalysis.cpp | 8 +++++--- bolt/Passes/FrameOptimizer.cpp | 7 ++++--- bolt/Passes/LivenessAnalysis.h | 3 ++- bolt/Passes/ReachingDefOrUse.h | 7 ++++++- bolt/Passes/ReachingInsns.h | 7 ++++++- bolt/Passes/RegAnalysis.cpp | 2 +- bolt/Passes/ShrinkWrapping.cpp | 3 ++- bolt/Passes/StackAllocationAnalysis.h | 7 ++++++- bolt/Passes/StackAvailableExpressions.h | 7 ++++++- bolt/Passes/StackPointerTracking.h | 7 ++++++- bolt/Passes/StackReachingUses.h | 7 ++++++- 16 files changed, 72 insertions(+), 25 deletions(-) diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index eda6c575b3bc..846d12f9ad53 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -185,7 +185,7 @@ StripRepRet("strip-rep-ret", cl::ZeroOrMore, cl::cat(BoltOptCategory)); -static llvm::cl::opt +llvm::cl::opt TimeOpts("time-opts", cl::desc("print time spent in each optimization"), cl::init(false), diff --git a/bolt/Passes/BinaryFunctionCallGraph.cpp b/bolt/Passes/BinaryFunctionCallGraph.cpp index 5d29cb64d9fd..d9e608cb1ca3 100644 --- a/bolt/Passes/BinaryFunctionCallGraph.cpp +++ b/bolt/Passes/BinaryFunctionCallGraph.cpp @@ -12,10 +12,15 @@ #include "BinaryFunctionCallGraph.h" #include "BinaryFunction.h" #include "BinaryContext.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Timer.h" #define DEBUG_TYPE "callgraph" +namespace opts { +extern llvm::cl::opt TimeOpts; +} + namespace llvm { namespace bolt { @@ -31,7 +36,8 @@ CallGraph::NodeId BinaryFunctionCallGraph::addNode(BinaryFunction *BF, } std::deque BinaryFunctionCallGraph::buildTraversalOrder() { - NamedRegionTimer T1("Build cg traversal order", "CG breakdown", true); + NamedRegionTimer T1("Build cg 
traversal order", "CG breakdown", + opts::TimeOpts); std::deque TopologicalOrder; enum NodeStatus { NEW, VISITING, VISITED }; std::vector NodeStatus(Funcs.size()); @@ -75,7 +81,7 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, bool IncludeColdCalls, bool UseFunctionHotSize, bool UseEdgeCounts) { - NamedRegionTimer T1("Callgraph construction", "CG breakdown", true); + NamedRegionTimer T1("Callgraph construction", "CG breakdown", opts::TimeOpts); BinaryFunctionCallGraph Cg; // Add call graph nodes. diff --git a/bolt/Passes/CallGraphWalker.cpp b/bolt/Passes/CallGraphWalker.cpp index 00f9d75a8dcd..720dc6c9d9a4 100644 --- a/bolt/Passes/CallGraphWalker.cpp +++ b/bolt/Passes/CallGraphWalker.cpp @@ -1,11 +1,16 @@ #include "CallGraphWalker.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Timer.h" +namespace opts { +extern llvm::cl::opt TimeOpts; +} + namespace llvm { namespace bolt { void CallGraphWalker::traverseCG() { - NamedRegionTimer T1("CG Traversal", "CG breakdown", true); + NamedRegionTimer T1("CG Traversal", "CG breakdown", opts::TimeOpts); std::queue Queue; std::set InQueue; diff --git a/bolt/Passes/CallGraphWalker.h b/bolt/Passes/CallGraphWalker.h index 195e536fd07d..b1b550058cac 100644 --- a/bolt/Passes/CallGraphWalker.h +++ b/bolt/Passes/CallGraphWalker.h @@ -27,8 +27,6 @@ namespace bolt { /// a property that depends on callees. In the event of a CG cycles, this will /// re-visit functions until their observed property converges. 
class CallGraphWalker { - BinaryContext &BC; - std::map &BFs; BinaryFunctionCallGraph &CG; /// DFS or reverse post-ordering of the call graph nodes to allow us to @@ -44,9 +42,7 @@ class CallGraphWalker { public: /// Initialize core context references but don't do anything yet - CallGraphWalker(BinaryContext &BC, std::map &BFs, - BinaryFunctionCallGraph &CG) - : BC(BC), BFs(BFs), CG(CG) {} + CallGraphWalker(BinaryFunctionCallGraph &CG) : CG(CG) {} /// Register a new callback function to be called for each function when /// traversing the call graph bottom-up. Function should return true iff diff --git a/bolt/Passes/DominatorAnalysis.h b/bolt/Passes/DominatorAnalysis.h index 4abc508e78f0..5586dac3043b 100644 --- a/bolt/Passes/DominatorAnalysis.h +++ b/bolt/Passes/DominatorAnalysis.h @@ -13,8 +13,13 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_DOMINATORANALYSIS_H #include "DataflowAnalysis.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Timer.h" +namespace opts { +extern llvm::cl::opt TimeOpts; +} + namespace llvm { namespace bolt { @@ -99,7 +104,7 @@ class DominatorAnalysis } void run() { - NamedRegionTimer T1("DA", "Dataflow", true); + NamedRegionTimer T1("DA", "Dataflow", opts::TimeOpts); InstrsDataflowAnalysis, Backward>::run(); } diff --git a/bolt/Passes/FrameAnalysis.cpp b/bolt/Passes/FrameAnalysis.cpp index 3cd2ce883b59..a0fbff46b057 100644 --- a/bolt/Passes/FrameAnalysis.cpp +++ b/bolt/Passes/FrameAnalysis.cpp @@ -17,6 +17,7 @@ using namespace llvm; namespace opts { +extern cl::opt TimeOpts; extern cl::opt Verbosity; extern bool shouldProcess(const bolt::BinaryFunction &Function); @@ -149,7 +150,7 @@ class FrameAccessAnalysis { FrameAccessAnalysis(const BinaryContext &BC, BinaryFunction &BF) : SPT(BC, BF), BC(BC), BF(BF) { { - NamedRegionTimer T1("SPT", "Dataflow", true); + NamedRegionTimer T1("SPT", "Dataflow", opts::TimeOpts); SPT.run(); } } @@ -278,7 +279,7 @@ FrameAnalysis::getFIEFor(const MCInst &Inst) const { } void 
FrameAnalysis::traverseCG(BinaryFunctionCallGraph &CG) { - CallGraphWalker CGWalker(BC, BFs, CG); + CallGraphWalker CGWalker(CG); CGWalker.registerVisitor([&](BinaryFunction *Func) -> bool { return computeArgsAccessed(*Func); @@ -499,7 +500,8 @@ FrameAnalysis::FrameAnalysis(BinaryContext &BC, } { - NamedRegionTimer T1("restore frame index", "FOP breakdown", true); + NamedRegionTimer T1("restore frame index", "FOP breakdown", + opts::TimeOpts); if (!restoreFrameIndex(I.second)) { ++NumFunctionsFailedRestoreFI; auto Count = I.second.getExecutionCount(); diff --git a/bolt/Passes/FrameOptimizer.cpp b/bolt/Passes/FrameOptimizer.cpp index 094e668f419e..38d3968950e0 100644 --- a/bolt/Passes/FrameOptimizer.cpp +++ b/bolt/Passes/FrameOptimizer.cpp @@ -23,6 +23,7 @@ using namespace llvm; namespace opts { extern cl::opt Verbosity; +extern cl::opt TimeOpts; extern cl::OptionCategory BoltOptCategory; using namespace bolt; @@ -237,18 +238,18 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC, << BC.getHotThreshold() << " )\n"); } { - NamedRegionTimer T1("remove loads", "FOP breakdown", true); + NamedRegionTimer T1("remove loads", "FOP breakdown", opts::TimeOpts); removeUnnecessaryLoads(RA, FA, BC, I.second); } { - NamedRegionTimer T1("remove stores", "FOP breakdown", true); + NamedRegionTimer T1("remove stores", "FOP breakdown", opts::TimeOpts); removeUnusedStores(FA, BC, I.second); } // Don't even start shrink wrapping if no profiling info is available if (I.second.getKnownExecutionCount() == 0) continue; { - NamedRegionTimer T1("move spills", "FOP breakdown", true); + NamedRegionTimer T1("move spills", "FOP breakdown", opts::TimeOpts); DataflowInfoManager Info(BC, I.second, &RA, &FA); ShrinkWrapping SW(FA, BC, I.second, Info); SW.perform(); diff --git a/bolt/Passes/LivenessAnalysis.h b/bolt/Passes/LivenessAnalysis.h index 739f49150f4d..2fde42863392 100644 --- a/bolt/Passes/LivenessAnalysis.h +++ b/bolt/Passes/LivenessAnalysis.h @@ -19,6 +19,7 @@ namespace opts { 
extern llvm::cl::opt AssumeABI; +extern llvm::cl::opt TimeOpts; } namespace llvm { @@ -43,7 +44,7 @@ class LivenessAnalysis } void run() { - NamedRegionTimer T1("LA", "Dataflow", true); + NamedRegionTimer T1("LA", "Dataflow", opts::TimeOpts); DataflowAnalysis::run(); } diff --git a/bolt/Passes/ReachingDefOrUse.h b/bolt/Passes/ReachingDefOrUse.h index 8d11ec0d9c5c..f241bff62f66 100644 --- a/bolt/Passes/ReachingDefOrUse.h +++ b/bolt/Passes/ReachingDefOrUse.h @@ -14,8 +14,13 @@ #include "DataflowAnalysis.h" #include "RegAnalysis.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Timer.h" +namespace opts { +extern llvm::cl::opt TimeOpts; +} + namespace llvm { namespace bolt { @@ -53,7 +58,7 @@ class ReachingDefOrUse } void run() { - NamedRegionTimer T1("RD", "Dataflow", true); + NamedRegionTimer T1("RD", "Dataflow", opts::TimeOpts); InstrsDataflowAnalysis, !Def>::run(); } diff --git a/bolt/Passes/ReachingInsns.h b/bolt/Passes/ReachingInsns.h index ce6cd8ccaa08..047c6e3154ce 100644 --- a/bolt/Passes/ReachingInsns.h +++ b/bolt/Passes/ReachingInsns.h @@ -13,8 +13,13 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_REACHINGINSNS_H #include "DataflowAnalysis.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Timer.h" +namespace opts { +extern llvm::cl::opt TimeOpts; +} + namespace llvm { namespace bolt { @@ -41,7 +46,7 @@ class ReachingInsns } void run() { - NamedRegionTimer T1("RI", "Dataflow", true); + NamedRegionTimer T1("RI", "Dataflow", opts::TimeOpts); InstrsDataflowAnalysis, Backward>::run(); } diff --git a/bolt/Passes/RegAnalysis.cpp b/bolt/Passes/RegAnalysis.cpp index b17ada273daf..570e09358132 100644 --- a/bolt/Passes/RegAnalysis.cpp +++ b/bolt/Passes/RegAnalysis.cpp @@ -25,7 +25,7 @@ RegAnalysis::RegAnalysis(BinaryContext &BC, std::map &BFs, BinaryFunctionCallGraph &CG) : BC(BC) { - CallGraphWalker CGWalker(BC, BFs, CG); + CallGraphWalker CGWalker(CG); CGWalker.registerVisitor([&](BinaryFunction *Func) -> bool { BitVector RegsKilled = 
getFunctionClobberList(Func); diff --git a/bolt/Passes/ShrinkWrapping.cpp b/bolt/Passes/ShrinkWrapping.cpp index 58570fb036b3..47636c1c6019 100644 --- a/bolt/Passes/ShrinkWrapping.cpp +++ b/bolt/Passes/ShrinkWrapping.cpp @@ -18,6 +18,7 @@ using namespace llvm; namespace opts { +extern cl::opt TimeOpts; extern cl::OptionCategory BoltOptCategory; static cl::opt ShrinkWrappingThreshold( @@ -1251,7 +1252,7 @@ class PredictiveStackPointerTracking TodoMap(TodoMap), Info(Info) {} void run() { - NamedRegionTimer T1("PSPT", "Dataflow", true); + NamedRegionTimer T1("PSPT", "Dataflow", opts::TimeOpts); StackPointerTrackingBase::run(); } }; diff --git a/bolt/Passes/StackAllocationAnalysis.h b/bolt/Passes/StackAllocationAnalysis.h index 64fba984fed2..22291448763d 100644 --- a/bolt/Passes/StackAllocationAnalysis.h +++ b/bolt/Passes/StackAllocationAnalysis.h @@ -14,8 +14,13 @@ #include "DataflowAnalysis.h" #include "StackPointerTracking.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Timer.h" +namespace opts { +extern llvm::cl::opt TimeOpts; +} + namespace llvm { namespace bolt { @@ -36,7 +41,7 @@ class StackAllocationAnalysis virtual ~StackAllocationAnalysis() {} void run() { - NamedRegionTimer T1("SAA", "Dataflow", true); + NamedRegionTimer T1("SAA", "Dataflow", opts::TimeOpts); InstrsDataflowAnalysis::run(); } diff --git a/bolt/Passes/StackAvailableExpressions.h b/bolt/Passes/StackAvailableExpressions.h index d96f49d3886b..8291ce52100a 100644 --- a/bolt/Passes/StackAvailableExpressions.h +++ b/bolt/Passes/StackAvailableExpressions.h @@ -14,8 +14,13 @@ #include "DataflowAnalysis.h" #include "RegAnalysis.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Timer.h" +namespace opts { +extern llvm::cl::opt TimeOpts; +} + namespace llvm { namespace bolt { @@ -31,7 +36,7 @@ class StackAvailableExpressions virtual ~StackAvailableExpressions() {} void run() { - NamedRegionTimer T1("SAE", "Dataflow", true); + NamedRegionTimer T1("SAE", "Dataflow", 
opts::TimeOpts); InstrsDataflowAnalysis::run(); } diff --git a/bolt/Passes/StackPointerTracking.h b/bolt/Passes/StackPointerTracking.h index 99e4818c2395..cfcf237a9380 100644 --- a/bolt/Passes/StackPointerTracking.h +++ b/bolt/Passes/StackPointerTracking.h @@ -13,8 +13,13 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_STACKPOINTERTRACKING_H #include "DataflowAnalysis.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Timer.h" +namespace opts { +extern llvm::cl::opt TimeOpts; +} + namespace llvm { namespace bolt { @@ -193,7 +198,7 @@ class StackPointerTracking virtual ~StackPointerTracking() {} void run() { - NamedRegionTimer T1("SPT", "Dataflow", true); + NamedRegionTimer T1("SPT", "Dataflow", opts::TimeOpts); StackPointerTrackingBase::run(); } }; diff --git a/bolt/Passes/StackReachingUses.h b/bolt/Passes/StackReachingUses.h index 7ea7094ef6bd..84a46754c748 100644 --- a/bolt/Passes/StackReachingUses.h +++ b/bolt/Passes/StackReachingUses.h @@ -13,8 +13,13 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_STACKREACHINGUSES_H #include "DataflowAnalysis.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Timer.h" +namespace opts { +extern llvm::cl::opt TimeOpts; +} + namespace llvm { namespace bolt { @@ -35,7 +40,7 @@ class StackReachingUses bool IncludeLocalAccesses = true) const; void run() { - NamedRegionTimer T1("SRU", "Dataflow", true); + NamedRegionTimer T1("SRU", "Dataflow", opts::TimeOpts); InstrsDataflowAnalysis::run(); } From a9738b7de5094cfd447c54e33bd198e02a761144 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 7 Jun 2017 14:20:39 -0700 Subject: [PATCH 273/904] Fix dynostats for conditional tail calls Summary: Don't treat conditional tail calls as branches for dynostats. Count taken conditional tail calls as calls. Change SCTC to report dynamic numbers after it is done. 
(cherry picked from commit 2af49f6765c8e93e449436718910c2502eb35c3f) --- bolt/BinaryFunction.cpp | 49 ++++++++++++++++-------------------- bolt/Passes/BinaryPasses.cpp | 35 ++++++++++++++++++++++---- bolt/Passes/BinaryPasses.h | 2 ++ 3 files changed, 54 insertions(+), 32 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 410dc317b880..fe397319a92e 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -2083,7 +2083,7 @@ void BinaryFunction::inferFallThroughCounts() { auto BranchDataOrErr = BC.DR.getFuncBranchData(getNames()); - // Compute preliminary execution time for each basic block + // Compute preliminary execution count for each basic block for (auto CurBB : BasicBlocks) { CurBB->ExecutionCount = 0; } @@ -4118,9 +4118,17 @@ DynoStats BinaryFunction::getDynoStats() const { } if (!BC.MIA->isCall(Instr)) continue; - Stats[DynoStats::FUNCTION_CALLS] += BBExecutionCount; + uint64_t CallFreq = BBExecutionCount; + if (BC.MIA->isCTC(Instr)) { + CallFreq = 0; + if (auto FreqOrErr = + BC.MIA->tryGetAnnotationAs(Instr, "CTCTakenFreq")) { + CallFreq = *FreqOrErr; + } + } + Stats[DynoStats::FUNCTION_CALLS] += CallFreq; if (BC.MIA->getMemoryOperandNo(Instr) != -1) { - Stats[DynoStats::INDIRECT_CALLS] += BBExecutionCount; + Stats[DynoStats::INDIRECT_CALLS] += CallFreq; } else if (const auto *CallSymbol = BC.MIA->getTargetSymbol(Instr)) { if (BC.getFunctionForSymbol(CallSymbol)) continue; @@ -4133,7 +4141,7 @@ DynoStats BinaryFunction::getDynoStats() const { StringRef SectionName; Section->getName(SectionName); if (SectionName == ".plt") { - Stats[DynoStats::PLT_CALLS] += BBExecutionCount; + Stats[DynoStats::PLT_CALLS] += CallFreq; } } } @@ -4175,36 +4183,23 @@ DynoStats BinaryFunction::getDynoStats() const { continue; } - // Conditional branch that could be followed by an unconditional branch. 
- uint64_t TakenCount; - uint64_t NonTakenCount; - bool IsForwardBranch; - if (BB->succ_size() == 2) { - TakenCount = BB->getBranchInfo(true).Count; - NonTakenCount = BB->getBranchInfo(false).Count; - IsForwardBranch = isForwardBranch(BB, BB->getConditionalSuccessor(true)); - } else { - // SCTC breaks the CFG invariant so we have to make some affordances - // here if we want dyno stats after running it. - TakenCount = BB->branch_info_begin()->Count; - if (TakenCount != COUNT_NO_PROFILE) - NonTakenCount = BBExecutionCount - TakenCount; - else - NonTakenCount = 0; - - // If succ_size == 0 then we are branching to a function - // rather than a BB label. - IsForwardBranch = BB->succ_size() == 0 - ? isForwardCall(BC.MIA->getTargetSymbol(*CondBranch)) - : isForwardBranch(BB, BB->getFallthrough()); + // CTCs + if (BC.MIA->isCTC(*CondBranch)) { + if (BB->branch_info_begin() != BB->branch_info_end()) + Stats[DynoStats::UNCOND_BRANCHES] += BB->branch_info_begin()->Count; + continue; } + // Conditional branch that could be followed by an unconditional branch. 
+ uint64_t TakenCount = BB->getBranchInfo(true).Count; if (TakenCount == COUNT_NO_PROFILE) TakenCount = 0; + + uint64_t NonTakenCount = BB->getBranchInfo(false).Count; if (NonTakenCount == COUNT_NO_PROFILE) NonTakenCount = 0; - if (IsForwardBranch) { + if (isForwardBranch(BB, BB->getConditionalSuccessor(true))) { Stats[DynoStats::FORWARD_COND_BRANCHES] += BBExecutionCount; Stats[DynoStats::FORWARD_COND_BRANCHES_TAKEN] += TakenCount; } else { diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 0d832cc90b5f..5ff491935d2a 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -516,6 +516,8 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, auto &MIA = BC.MIA; uint64_t NumLocalCTCCandidates = 0; uint64_t NumLocalCTCs = 0; + uint64_t LocalCTCTakenCount = 0; + uint64_t LocalCTCExecCount = 0; std::vector> NeedsUncondBranch; @@ -587,14 +589,29 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, // Change destination of the conditional branch. MIA->replaceBranchTarget(*CondBranch, CalleeSymbol, BC.Ctx.get()); } + const uint64_t CTCTakenFreq = PredBB->getBranchInfo(true).Count == + BinaryBasicBlock::COUNT_NO_PROFILE + ? 0 + : PredBB->getBranchInfo(true).Count; // Annotate it, so "isCall" returns true for this jcc MIA->addAnnotation(BC.Ctx.get(), *CondBranch, "IsCTC", true); + // Add info about the conditional tail call frequency, otherwise this + // info will be lost when we delete the associated BranchInfo entry + BC.MIA->addAnnotation(BC.Ctx.get(), *CondBranch, "CTCTakenFreq", + CTCTakenFreq); // Remove the unused successor which may be eliminated later // if there are no other users. 
PredBB->removeSuccessor(BB); + // Update BB execution count + if (BB->getKnownExecutionCount() > 0) { + assert(CTCTakenFreq <= BB->getKnownExecutionCount()); + BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq); + } ++NumLocalCTCs; + LocalCTCTakenCount += CTCTakenFreq; + LocalCTCExecCount += PredBB->getKnownExecutionCount(); } // Remove the block from CFG if all predecessors were removed. @@ -643,11 +660,16 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, } DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs - << " conditional tail calls from a total of " << NumLocalCTCCandidates - << " candidates in function " << BF << "\n";); + << " conditional tail calls from a total of " + << NumLocalCTCCandidates << " candidates in function " << BF + << ". CTCs execution count for this function is " + << LocalCTCExecCount << " and CTC taken count is " + << LocalCTCTakenCount << "\n";); NumTailCallsPatched += NumLocalCTCs; NumCandidateTailCalls += NumLocalCTCCandidates; + CTCExecCount += LocalCTCExecCount; + CTCTakenCount += LocalCTCTakenCount; return NumLocalCTCs > 0; } @@ -672,10 +694,13 @@ void SimplifyConditionalTailCalls::runOnFunctions( outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched << " tail calls (" << NumOrigForwardBranches << " forward)" << " tail calls (" << NumOrigBackwardBranches << " backward)" - << " from a total of " << NumCandidateTailCalls - << " while removing " << NumDoubleJumps << " double jumps" + << " from a total of " << NumCandidateTailCalls << " while removing " + << NumDoubleJumps << " double jumps" << " and removing " << DeletedBlocks << " basic blocks" - << " totalling " << DeletedBytes << " bytes of code.\n"; + << " totalling " << DeletedBytes + << " bytes of code. 
CTCs total execution count is " << CTCExecCount + << " and the number of times CTCs are taken is " << CTCTakenCount + << ".\n"; } void Peepholes::shortenInstructions(BinaryContext &BC, diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index d80876ca79b4..a3e08f25c501 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -198,6 +198,8 @@ class FinalizeFunctions : public BinaryFunctionPass { class SimplifyConditionalTailCalls : public BinaryFunctionPass { uint64_t NumCandidateTailCalls{0}; uint64_t NumTailCallsPatched{0}; + uint64_t CTCExecCount{0}; + uint64_t CTCTakenCount{0}; uint64_t NumOrigForwardBranches{0}; uint64_t NumOrigBackwardBranches{0}; uint64_t NumDoubleJumps{0}; From 0f375040c377140af1d1cae09175682a1cb0544d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 6 Jun 2017 17:43:45 -0700 Subject: [PATCH 274/904] [BOLT] Fix hfsort+ caching mechanism Summary: There's good news and bad news. The good news is that this fixes the caching mechanism used by hfsort+ so that we always get the correct end results, i.e. the order is the same whether the cache is enabled or not. The bad news is that it takes about the same amount of time as the original to run. (~6min) The good news is that I can make some improvements on this implementation which I'll put up in another diff. The problem with the old caching mechanism is that it was caching values that were dependent on adjacent sets of clusters. It only invalidated the clusters being merged and none of other clusters that might have been affected. This version computes the adjacency information up front and updates it after every merge, rather than recomputing it for each iteration. It uses the adjacency data to properly invalidate any cached values. 
(cherry picked from commit 933b2d88e079afbfe17273157a31d7c86e033fdc) --- bolt/Passes/BinaryFunctionCallGraph.cpp | 22 +- bolt/Passes/CallGraph.cpp | 2 - bolt/Passes/CallGraph.h | 4 + bolt/Passes/HFSort.cpp | 18 +- bolt/Passes/HFSort.h | 8 +- bolt/Passes/HFSortPlus.cpp | 438 ++++++++++++++++-------- bolt/Passes/PettisAndHansen.cpp | 3 +- 7 files changed, 335 insertions(+), 160 deletions(-) diff --git a/bolt/Passes/BinaryFunctionCallGraph.cpp b/bolt/Passes/BinaryFunctionCallGraph.cpp index d9e608cb1ca3..9cf9f123bf83 100644 --- a/bolt/Passes/BinaryFunctionCallGraph.cpp +++ b/bolt/Passes/BinaryFunctionCallGraph.cpp @@ -12,13 +12,14 @@ #include "BinaryFunctionCallGraph.h" #include "BinaryFunction.h" #include "BinaryContext.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Options.h" #include "llvm/Support/Timer.h" #define DEBUG_TYPE "callgraph" namespace opts { extern llvm::cl::opt TimeOpts; +extern llvm::cl::opt Verbosity; } namespace llvm { @@ -130,8 +131,11 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, const auto DstId = lookupNode(DstFunc); const auto AvgDelta = !UseEdgeCounts ? 
Offset - DstFunc->getAddress() : 0; Cg.incArcWeight(SrcId, DstId, Count, AvgDelta); - DEBUG(dbgs() << "BOLT-DEBUG: buildCallGraph: call " << *Function - << " -> " << *DstFunc << " @ " << Offset << "\n"); + DEBUG( + if (opts::Verbosity > 1) { + dbgs() << "BOLT-DEBUG: buildCallGraph: call " << *Function + << " -> " << *DstFunc << " @ " << Offset << "\n"; + }); return true; } return false; @@ -194,8 +198,16 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, } } - outs() << "BOLT-WARNING: buildCallGraph: " << NotProcessed - << " callsites not processed out of " << TotalCalls << "\n"; +#ifndef NDEBUG + bool PrintInfo = DebugFlag && isCurrentDebugType("callgraph"); +#else + bool PrintInfo = false; +#endif + if (PrintInfo || opts::Verbosity > 0) { + outs() << format("BOLT-INFO: buildCallGraph: %u nodes, density = %.6lf, " + "%u callsites not processed out of %u.\n", + Cg.numNodes(), Cg.density(), NotProcessed, TotalCalls); + } return Cg; } diff --git a/bolt/Passes/CallGraph.cpp b/bolt/Passes/CallGraph.cpp index 14def27b6adf..70544fe6da45 100644 --- a/bolt/Passes/CallGraph.cpp +++ b/bolt/Passes/CallGraph.cpp @@ -10,8 +10,6 @@ //===----------------------------------------------------------------------===// #include "CallGraph.h" -#include "BinaryFunction.h" -#include "BinaryContext.h" #define DEBUG_TYPE "callgraph" diff --git a/bolt/Passes/CallGraph.h b/bolt/Passes/CallGraph.h index 64960bf3d76d..8c5d0fa99890 100644 --- a/bolt/Passes/CallGraph.h +++ b/bolt/Passes/CallGraph.h @@ -130,6 +130,10 @@ class CallGraph { return Arcs; } + double density() const { + return double(Arcs.size()) / (Nodes.size()*Nodes.size()); + } + void normalizeArcWeights(bool UseEdgeCounts); template diff --git a/bolt/Passes/HFSort.cpp b/bolt/Passes/HFSort.cpp index cb93191dc7a5..193ac30f40f4 100644 --- a/bolt/Passes/HFSort.cpp +++ b/bolt/Passes/HFSort.cpp @@ -112,17 +112,22 @@ void Cluster::reverseTargets() { std::reverse(Targets.begin(), Targets.end()); } -void Cluster::merge(Cluster&& 
Other, const double Aw) { +void Cluster::merge(const Cluster& Other, const double Aw) { Targets.insert(Targets.end(), Other.Targets.begin(), Other.Targets.end()); Size += Other.Size; Samples += Other.Samples; Density = (double)Samples / Size; +} - Other.Size = 0; - Other.Samples = 0; - Other.Targets.clear(); +void Cluster::clear() { + Id = -1u; + Size = 0; + Samples = 0; + Density = 0.0; + Targets.clear(); + Frozen = false; } std::vector clusterize(const CallGraph &Cg) { @@ -218,7 +223,8 @@ std::vector clusterize(const CallGraph &Cg) { FuncCluster[F] = PredCluster; } - PredCluster->merge(std::move(*Cluster)); + PredCluster->merge(*Cluster); + Cluster->clear(); } // Return the set of Clusters that are left, which are the ones that @@ -281,7 +287,7 @@ std::vector randomClusters(const CallGraph &Cg) { if (MergeIdx == Clusters.size()) { ++Idx; } else { - Clusters[Idx].merge(std::move(Clusters[MergeIdx])); + Clusters[Idx].merge(Clusters[MergeIdx]); Clusters.erase(Clusters.begin() + MergeIdx); } } diff --git a/bolt/Passes/HFSort.h b/bolt/Passes/HFSort.h index 7f32a99cfee7..0cd5d66a99bd 100644 --- a/bolt/Passes/HFSort.h +++ b/bolt/Passes/HFSort.h @@ -55,7 +55,8 @@ class Cluster { uint32_t size() const { return Size; } bool frozen() const { return Frozen; } void freeze() { Frozen = true; } - void merge(Cluster &&Other, const double Aw = 0); + void merge(const Cluster &Other, const double Aw = 0); + void clear(); size_t numTargets() const { return Targets.size(); } @@ -66,12 +67,13 @@ class Cluster { return Targets[N]; } void reverseTargets(); + bool hasId() const { return Id != -1u; } void setId(uint32_t NewId) { - assert(Id == -1u); + assert(!hasId()); Id = NewId; } uint32_t id() const { - assert(Id != -1u); + assert(hasId()); return Id; } private: diff --git a/bolt/Passes/HFSortPlus.cpp b/bolt/Passes/HFSortPlus.cpp index 8e0d58dba03a..cebcf8946564 100644 --- a/bolt/Passes/HFSortPlus.cpp +++ b/bolt/Passes/HFSortPlus.cpp @@ -28,18 +28,53 @@ */ #include "HFSort.h" +#include 
"llvm/ADT/BitVector.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/Format.h" +#include "llvm/Support/Options.h" +#include "llvm/Support/raw_ostream.h" #include #include #include -#include "llvm/Support/Debug.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" #undef DEBUG_TYPE #define DEBUG_TYPE "hfsort" +namespace opts { + +extern llvm::cl::OptionCategory BoltOptCategory; +extern llvm::cl::opt Verbosity; + +static llvm::cl::opt +UseGainCache("hfsort+-use-cache", + llvm::cl::desc("Use a cache for mergeGain results when computing hfsort+."), + llvm::cl::ZeroOrMore, + llvm::cl::init(true), + llvm::cl::Hidden, + llvm::cl::cat(BoltOptCategory)); + +static llvm::cl::opt +UseShortCallCache("hfsort+-use-short-call-cache", + llvm::cl::desc("Use a cache for shortCall results when computing hfsort+."), + llvm::cl::ZeroOrMore, + llvm::cl::init(true), + llvm::cl::Hidden, + llvm::cl::cat(BoltOptCategory)); + +const char* cacheKindString() { + if (opts::UseGainCache && opts::UseShortCallCache) + return "gain + short call cache"; + else if (opts::UseGainCache) + return "gain cache"; + else if (opts::UseShortCallCache) + return "short call cache"; + else + return "no cache"; +} + +} + namespace llvm { namespace bolt { @@ -60,66 +95,136 @@ constexpr uint32_t ITLBEntries = 16; constexpr size_t InvalidAddr = -1; -template -class HashPair { +// This class maintains adjacency information for all Clusters being +// processed. It is used to invalidate cache entries when merging +// Clusters and for visiting all neighbors of any given Cluster. 
+class AdjacencyMatrix { public: - size_t operator()(const std::pair &P) const { - size_t Seed(0); - Seed = hashCombine(Seed, (int64_t)P.first); - Seed = hashCombine(Seed, (int64_t)P.second); - return Seed; + AdjacencyMatrix(const CallGraph &Cg, + std::vector &Clusters, + const std::vector &FuncCluster) + : Clusters(Clusters), + Bits(Cg.numNodes(), BitVector(Cg.numNodes(), false)) { + initialize(Cg, FuncCluster); + } + + template + void forallAdjacent(const Cluster *C, F Func) const { + const_cast(this)->forallAdjacent(C, Func); + } + + template + void forallAdjacent(const Cluster *C, F Func) { + for (auto I = Bits[C->id()].find_first(); I != -1; I = Bits[C->id()].find_next(I)) { + Func(Clusters[I]); + } + } + + void merge(const Cluster *A, const Cluster *B) { + Bits[A->id()] |= Bits[B->id()]; + Bits[A->id()][A->id()] = false; + Bits[A->id()][B->id()] = false; + Bits[B->id()][A->id()] = false; + for (auto I = Bits[B->id()].find_first(); I != -1; I = Bits[B->id()].find_next(I)) { + Bits[I][A->id()] = true; + Bits[I][B->id()] = false; + } + } + + void dump(const Cluster *A) const { + outs() << "Cluster " << A->id() << ":"; + forallAdjacent(A, + [this,A](const Cluster *B) { + outs() << " " << B->id(); + }); + } + + void dump() const { + for (auto *A : Clusters) { + if (!A) continue; + dump(A); + outs() << "\n"; + } + } +private: + void set(const Cluster *A, const Cluster *B, bool Value) { + assert(A != B); + Bits[A->id()][B->id()] = Value; + Bits[B->id()][A->id()] = Value; + } + + void initialize(const CallGraph &Cg, const std::vector &FuncCluster) { + for (auto *A : Clusters) { + for (auto TargetId : A->targets()) { + for (auto Succ : Cg.successors(TargetId)) { + auto *B = FuncCluster[Succ]; + if (!B || B == A) continue; + set(A, B, true); + } + for (auto Pred : Cg.predecessors(TargetId)) { + auto *B = FuncCluster[Pred]; + if (!B || B == A) continue; + set(A, B, true); + } + } + } } + + std::vector Clusters; + std::vector Bits; }; // A cache of precomputed results 
for a pair of clusters class PrecomputedResults { public: - PrecomputedResults() {} + explicit PrecomputedResults(size_t Size) + : Size(Size), + Cache(new double[Size*Size]), + Valid(Size * Size, false) { + memset(Cache, 0, sizeof(double)*Size*Size); + } + ~PrecomputedResults() { + delete[] Cache; + } - bool contains(Cluster *First, Cluster *Second) const { - if (InvalidKeys.count(First) || InvalidKeys.count(Second)) { - return false; - } - const auto Key = std::make_pair(First, Second); - return Cache.find(Key) != Cache.end(); + bool contains(const Cluster *First, const Cluster *Second) const { + return Valid[index(First, Second)]; } - double get(Cluster *First, Cluster *Second) const { + double get(const Cluster *First, const Cluster *Second) const { assert(contains(First, Second)); - const auto Key = std::make_pair(First, Second); // TODO: use min/max? - return Cache.find(Key)->second; + return Cache[index(First, Second)]; } - void set(Cluster *First, Cluster *Second, double Value) { - const auto Key = std::make_pair(First, Second); - Cache[Key] = Value; - validate(First); - validate(Second); + void set(const Cluster *First, const Cluster *Second, double Value) { + const auto Index = index(First, Second); + Cache[Index] = Value; + Valid[Index] = true; } - void validate(Cluster *C) { - auto Itr = InvalidKeys.find(C); - if (Itr != InvalidKeys.end()) - InvalidKeys.erase(Itr); + void invalidate(const AdjacencyMatrix &Adjacent, const Cluster *C) { + invalidate(C); + Adjacent.forallAdjacent(C, [&](const Cluster *A) { invalidate(A); }); } - - void validateAll() { - InvalidKeys.clear(); + private: + void invalidate(const Cluster *C) { + Valid.reset(C->id() * Size, (C->id() + 1) * Size); } - void invalidate(Cluster *Cluster) { - InvalidKeys.insert(Cluster); + size_t index(const Cluster *First, const Cluster *Second) const { + return (First->id() * Size) + Second->id(); } - private: - std::unordered_map, - double, - HashPair> Cache; - std::unordered_set InvalidKeys; + 
size_t Size; + double *Cache; + BitVector Valid; }; -// A wrapper for algorthm-wide variables +// A wrapper for algorithm-wide variables struct AlgoState { + explicit AlgoState(size_t Size) + : Cache(Size), ShortCallPairCache(Size) { } + // the call graph const CallGraph *Cg; // the total number of samples in the graph @@ -130,42 +235,72 @@ struct AlgoState { std::vector Addr; // maximum cluster id. size_t MaxClusterId{0}; + // A cache that keeps precomputed values of mergeGain for pairs of clusters; + // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs + // containing both x and y (and recompute them on the next iteration) + PrecomputedResults Cache; + // Cache for shortCalls for a single cluster. + std::unordered_map ShortCallCache; + // Cache for shortCalls for a pair of Clusters + PrecomputedResults ShortCallPairCache; }; } -/* - * Sorting clusters by their density in decreasing order - */ -void sortByDensity(std::vector &Clusters) { - std::stable_sort( - Clusters.begin(), - Clusters.end(), - [&] (const Cluster *C1, const Cluster *C2) { - const double D1 = C1->density(); - const double D2 = C2->density(); - // making sure the sorting is deterministic - if (D1 != D2) return D1 > D2; - if (C1->size() != C2->size()) return C1->size() < C2->size(); - if (C1->samples() != C2->samples()) return C1->samples() > C2->samples(); - return C1->target(0) < C2->target(0); - } - ); -} - /* * Density of a cluster formed by merging a given pair of clusters */ -double density(Cluster *ClusterPred, Cluster *ClusterSucc) { +double density(const Cluster *ClusterPred, const Cluster *ClusterSucc) { const double CombinedSamples = ClusterPred->samples() + ClusterSucc->samples(); const double CombinedSize = ClusterPred->size() + ClusterSucc->size(); return CombinedSamples / CombinedSize; } +/* + * Deterministically compare clusters by their density in decreasing order. 
+ */ +bool compareClusters(const Cluster *C1, const Cluster *C2) { + const double D1 = C1->density(); + const double D2 = C2->density(); + // making sure the sorting is deterministic + if (D1 != D2) return D1 > D2; + if (C1->size() != C2->size()) return C1->size() < C2->size(); + if (C1->samples() != C2->samples()) return C1->samples() > C2->samples(); + return C1->target(0) < C2->target(0); +} + +/* + * Deterministically compare pairs of clusters by their density + * in decreasing order. + */ +bool compareClusterPairs(const Cluster *A1, const Cluster *B1, + const Cluster *A2, const Cluster *B2) { + const auto D1 = density(A1, B1); + const auto D2 = density(A2, B2); + if (D1 != D2) return D1 > D2; + const auto Size1 = A1->size() + B1->size(); + const auto Size2 = A2->size() + B2->size(); + if (Size1 != Size2) return Size1 < Size2; + const auto Samples1 = A1->samples() + B1->samples(); + const auto Samples2 = A2->samples() + B2->samples(); + if (Samples1 != Samples2) return Samples1 > Samples2; + return A1->target(0) < A2->target(0); +} + +/* + * Sorting clusters by their density in decreasing order + */ +template +std::vector sortByDensity(const C &Clusters_) { + std::vector Clusters(Clusters_.begin(), Clusters_.end()); + std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters); + return Clusters; +} + /* * The probability that a page with a given weight is not present in the cache. * - * Assume that the hot function are called in a random order; then the + * Assume that the hot functions are called in a random order; then the * probability of a TLB page being accessed after a function call is * p=pageSamples/totalSamples. The probability that the page is not accessed * is (1-p), and the probability that it is not in the cache (i.e. not accessed @@ -194,11 +329,10 @@ double missProbability(const AlgoState &State, double PageSamples) { * page. 
The following procedure detects short and long calls, and estimates * the expected number of cache misses for the long ones. */ -double expectedCacheHitRatio(const AlgoState &State, - const std::vector &Clusters_) { - // copy and sort by density - std::vector Clusters(Clusters_); - sortByDensity(Clusters); +template +double expectedCacheHitRatio(const AlgoState &State, const C &Clusters_) { + // sort by density + std::vector Clusters(sortByDensity(Clusters_)); // generate function addresses with an alignment std::vector Addr(State.Cg->numNodes(), InvalidAddr); @@ -247,35 +381,6 @@ double expectedCacheHitRatio(const AlgoState &State, return 100.0 * (1.0 - Misses / State.TotalSamples); } -/* - * Get adjacent clusters (the ones that share an arc) with the given one - */ -std::vector adjacentClusters(const AlgoState &State, Cluster *C) { - std::vector Result; - Result.reserve(State.MaxClusterId); - for (auto TargetId : C->targets()) { - for (auto Succ : State.Cg->successors(TargetId)) { - auto SuccCluster = State.FuncCluster[Succ]; - if (SuccCluster != nullptr && SuccCluster != C) { - Result.push_back(SuccCluster); - } - } - for (auto Pred : State.Cg->predecessors(TargetId)) { - auto PredCluster = State.FuncCluster[Pred]; - if (PredCluster != nullptr && PredCluster != C) { - Result.push_back(PredCluster); - } - } - } - std::sort(Result.begin(), Result.end(), - [](const Cluster *A, const Cluster *B) { - return A->id() < B->id(); - }); - auto Last = std::unique(Result.begin(), Result.end()); - Result.erase(Last, Result.end()); - return Result; -} - /* * The expected number of calls for an edge withing the same TLB page */ @@ -291,7 +396,13 @@ double expectedCalls(int64_t SrcAddr, int64_t DstAddr, double EdgeWeight) { * The expected number of calls within a given cluster with both endpoints on * the same TLB cache page */ -double shortCalls(const AlgoState &State, Cluster *Cluster) { +double shortCalls(AlgoState &State, const Cluster *Cluster) { + if 
(opts::UseShortCallCache) { + auto Itr = State.ShortCallCache.find(Cluster); + if (Itr != State.ShortCallCache.end()) + return Itr->second; + } + double Calls = 0; for (auto TargetId : Cluster->targets()) { for (auto Succ : State.Cg->successors(TargetId)) { @@ -306,6 +417,10 @@ double shortCalls(const AlgoState &State, Cluster *Cluster) { } } + if (opts::UseShortCallCache) { + State.ShortCallCache[Cluster] = Calls; + } + return Calls; } @@ -313,9 +428,14 @@ double shortCalls(const AlgoState &State, Cluster *Cluster) { * The number of calls between the two clusters with both endpoints on * the same TLB page, assuming that a given pair of clusters gets merged */ -double shortCalls(const AlgoState &State, - Cluster *ClusterPred, - Cluster *ClusterSucc) { +double shortCalls(AlgoState &State, + const Cluster *ClusterPred, + const Cluster *ClusterSucc) { + if (opts::UseShortCallCache && + State.ShortCallPairCache.contains(ClusterPred, ClusterSucc)) { + return State.ShortCallPairCache.get(ClusterPred, ClusterSucc); + } + double Calls = 0; for (auto TargetId : ClusterPred->targets()) { for (auto Succ : State.Cg->successors(TargetId)) { @@ -344,6 +464,10 @@ double shortCalls(const AlgoState &State, } } + if (opts::UseShortCallCache) { + State.ShortCallPairCache.set(ClusterPred, ClusterSucc, Calls); + } + return Calls; } @@ -359,9 +483,13 @@ double shortCalls(const AlgoState &State, * increse the chance of merging short clusters, which is helpful for * the i-cache performance. 
*/ -double mergeGain(const AlgoState &State, - Cluster *ClusterPred, - Cluster *ClusterSucc) { +double mergeGain(AlgoState &State, + const Cluster *ClusterPred, + const Cluster *ClusterSucc) { + if (opts::UseGainCache && State.Cache.contains(ClusterPred, ClusterSucc)) { + return State.Cache.get(ClusterPred, ClusterSucc); + } + // cache misses on the first cluster double LongCallsPred = ClusterPred->samples() - shortCalls(State, ClusterPred); double ProbPred = missProbability(State, ClusterPred->density() * PageSize); @@ -381,7 +509,20 @@ double mergeGain(const AlgoState &State, double Gain = ExpectedMissesPred + ExpectedMissesSucc - MissesNew; // scaling the result to increase the importance of merging short clusters - return Gain / (ClusterPred->size() + ClusterSucc->size()); + Gain /= (ClusterPred->size() + ClusterSucc->size()); + + if (opts::UseGainCache) { + State.Cache.set(ClusterPred, ClusterSucc, Gain); + } + + return Gain; +} + +template +void maybeErase(C &Container, const V& Value) { + auto Itr = Container.find(Value); + if (Itr != Container.end()) + Container.erase(Itr); } /* @@ -393,37 +534,35 @@ std::vector hfsortPlus(const CallGraph &Cg) { AllClusters.reserve(Cg.numNodes()); for (NodeId F = 0; F < Cg.numNodes(); F++) { AllClusters.emplace_back(F, Cg.getNode(F)); - AllClusters.back().setId(F); } // initialize objects used by the algorithm std::vector Clusters; Clusters.reserve(Cg.numNodes()); - AlgoState State; + AlgoState State(AllClusters.size()); // TODO: should use final Clusters.size() State.Cg = &Cg; State.TotalSamples = 0; State.FuncCluster = std::vector(Cg.numNodes(), nullptr); - State.Addr = std::vector(Cg.numNodes(), InvalidAddr); - if (!AllClusters.empty()) { - State.MaxClusterId = AllClusters.back().id(); - } + State.Addr = std::vector(Cg.numNodes(), InvalidAddr); + uint32_t Id = 0; for (NodeId F = 0; F < Cg.numNodes(); F++) { if (Cg.samples(F) == 0) continue; Clusters.push_back(&AllClusters[F]); + Clusters.back()->setId(Id); 
State.FuncCluster[F] = &AllClusters[F]; State.Addr[F] = 0; State.TotalSamples += Cg.samples(F); + ++Id; } + State.MaxClusterId = Id; - DEBUG(dbgs() << "Starting hfsort+ for " << Clusters.size() << " clusters\n" + AdjacencyMatrix Adjacent(Cg, Clusters, State.FuncCluster); + + DEBUG(dbgs() << "Starting hfsort+ w/" << opts::cacheKindString() << " for " + << Clusters.size() << " clusters\n" << format("Initial expected iTLB cache hit ratio: %.4lf\n", expectedCacheHitRatio(State, Clusters))); - // the cache keeps precomputed values of mergeGain for pairs of clusters; - // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs - // containing both x and y (and recompute them on the next iteration) - PrecomputedResults Cache; - int Steps = 0; // merge pairs of clusters while there is an improvement while (Clusters.size() > 1) { @@ -435,44 +574,46 @@ std::vector hfsortPlus(const CallGraph &Cg) { expectedCacheHitRatio(State, Clusters)); } ); - Steps++; + ++Steps; Cluster *BestClusterPred = nullptr; Cluster *BestClusterSucc = nullptr; double BestGain = -1; for (auto ClusterPred : Clusters) { // get candidates for merging with the current cluster - auto CandidateClusters = adjacentClusters(State, ClusterPred); - - // find the best candidate - for (auto ClusterSucc : CandidateClusters) { - // get a cost of merging two clusters - if (!Cache.contains(ClusterPred, ClusterSucc)) { - double Value = mergeGain(State, ClusterPred, ClusterSucc); - Cache.set(ClusterPred, ClusterSucc, Value); - assert(Cache.contains(ClusterPred, ClusterSucc)); - } - - double Gain = Cache.get(ClusterPred, ClusterSucc); - // breaking ties by density to make the hottest clusters be merged first - if (Gain > BestGain || (std::abs(Gain - BestGain) < 1e-8 && - density(ClusterPred, ClusterSucc) > - density(BestClusterPred, BestClusterSucc))) { - BestGain = Gain; - BestClusterPred = ClusterPred; - BestClusterSucc = ClusterSucc; + Adjacent.forallAdjacent( + ClusterPred, + // find the best 
candidate + [&](Cluster *ClusterSucc) { + assert(ClusterPred != ClusterSucc); + // get a cost of merging two clusters + const double Gain = mergeGain(State, ClusterPred, ClusterSucc); + + // breaking ties by density to make the hottest clusters be merged first + if (Gain > BestGain || (std::abs(Gain - BestGain) < 1e-8 && + compareClusterPairs(ClusterPred, + ClusterSucc, + BestClusterPred, + BestClusterSucc))) { + BestGain = Gain; + BestClusterPred = ClusterPred; + BestClusterSucc = ClusterSucc; + } } - } + ); } - Cache.validateAll(); if (BestGain <= 0.0) break; - Cache.invalidate(BestClusterPred); - Cache.invalidate(BestClusterSucc); - // merge the best pair of clusters - BestClusterPred->merge(std::move(*BestClusterSucc)); + DEBUG( + if (opts::Verbosity > 0) { + dbgs() << "Merging cluster " << BestClusterSucc->id() + << " into cluster " << BestClusterPred->id() << "\n"; + }); + + Adjacent.merge(BestClusterPred, BestClusterSucc); + BestClusterPred->merge(*BestClusterSucc); size_t CurAddr = 0; for (auto TargetId : BestClusterPred->targets()) { @@ -481,6 +622,18 @@ std::vector hfsortPlus(const CallGraph &Cg) { CurAddr += State.Cg->size(TargetId); } + if (opts::UseShortCallCache) { + maybeErase(State.ShortCallCache, BestClusterPred); + Adjacent.forallAdjacent(BestClusterPred, + [&State](const Cluster *C) { + maybeErase(State.ShortCallCache, C); + }); + State.ShortCallPairCache.invalidate(Adjacent, BestClusterPred); + } + if (opts::UseGainCache) { + State.Cache.invalidate(Adjacent, BestClusterPred); + } + // remove BestClusterSucc from the list of active clusters auto Iter = std::remove(Clusters.begin(), Clusters.end(), BestClusterSucc); Clusters.erase(Iter, Clusters.end()); @@ -492,9 +645,8 @@ std::vector hfsortPlus(const CallGraph &Cg) { // Return the set of clusters that are left, which are the ones that // didn't get merged (so their first func is its original func). 
- sortByDensity(Clusters); std::vector Result; - for (auto Cluster : Clusters) { + for (auto Cluster : sortByDensity(Clusters)) { Result.emplace_back(std::move(*Cluster)); } diff --git a/bolt/Passes/PettisAndHansen.cpp b/bolt/Passes/PettisAndHansen.cpp index 90b0e2fc25da..18e9568d81be 100644 --- a/bolt/Passes/PettisAndHansen.cpp +++ b/bolt/Passes/PettisAndHansen.cpp @@ -179,7 +179,8 @@ std::vector pettisAndHansen(const CallGraph &Cg) { for (auto F : C2->targets()) { FuncCluster[F] = C1; } - C1->merge(std::move(*C2), Max.Weight); + C1->merge(*C2, Max.Weight); + C2->clear(); } // Return the set of Clusters that are left, which are the ones that From 80cb56b6f34aa98b51bd90bf67213ab76c2debe6 Mon Sep 17 00:00:00 2001 From: Bohan Ren Date: Mon, 22 May 2017 11:04:01 -0700 Subject: [PATCH 275/904] [BOLT] Expand BOLT report for basic block ordering Summary: Add a new positional option onto bolt: "-print-function-statistics=" which prints information about block ordering for requested number of functions. 
(cherry picked from commit 485af27a9acb4a717679fd48bb2fb9e0f1dbcbe0) --- bolt/BinaryFunction.cpp | 46 ++++++++++++++++++++++++++ bolt/BinaryFunction.h | 13 ++++++++ bolt/Passes/BinaryPasses.cpp | 64 +++++++++++++++++++++++++++++++----- 3 files changed, 114 insertions(+), 9 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index fe397319a92e..7de9ada88771 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -53,6 +53,7 @@ extern cl::opt Relocs; extern cl::opt UpdateDebugSections; extern cl::opt IndirectCallPromotion; extern cl::opt Verbosity; +extern cl::opt PrintFuncStat; static cl::opt AggressiveSplitting("split-all-cold", @@ -2538,6 +2539,8 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool MinBranchClusters, } Algo->reorderBasicBlocks(*this, NewLayout); + if (opts::PrintFuncStat > 0) + BasicBlocksPreviousLayout = BasicBlocksLayout; BasicBlocksLayout.clear(); BasicBlocksLayout.swap(NewLayout); @@ -2545,6 +2548,49 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool MinBranchClusters, splitFunction(); } +uint64_t BinaryFunction::getInstructionCount() const { + uint64_t Count = 0; + for (auto &Block : BasicBlocksLayout) { + Count += Block->getNumNonPseudos(); + } + return Count; +} + +bool BinaryFunction::hasLayoutChanged() const { + assert(opts::PrintFuncStat > 0 && "PrintFuncStat flag is not on"); + return BasicBlocksPreviousLayout != BasicBlocksLayout; +} + +uint64_t BinaryFunction::getEditDistance() const { + assert(opts::PrintFuncStat > 0 && "PrintFuncStat flag is not on"); + const auto LayoutSize = BasicBlocksPreviousLayout.size(); + if (LayoutSize < 2) { + return 0; + } + + std::vector> ChangeMatrix( + LayoutSize + 1, std::vector(LayoutSize + 1)); + + for (uint64_t I = 0; I <= LayoutSize; ++I) { + ChangeMatrix[I][0] = I; + ChangeMatrix[0][I] = I; + } + + for (uint64_t I = 1; I <= LayoutSize; ++I) { + for (uint64_t J = 1; J <= LayoutSize; ++J) { + if (BasicBlocksPreviousLayout[I] != 
BasicBlocksLayout[J]) { + ChangeMatrix[I][J] = + std::min(std::min(ChangeMatrix[I - 1][J], ChangeMatrix[I][J - 1]), + ChangeMatrix[I - 1][J - 1]) + 1; + } else { + ChangeMatrix[I][J] = ChangeMatrix[I - 1][J - 1]; + } + } + } + + return ChangeMatrix[LayoutSize][LayoutSize]; +} + void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { int64_t CurrentGnuArgsSize = 0; for (auto BB : layout()) { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index affbd39ade44..531f1e3107d0 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -694,6 +694,8 @@ class BinaryFunction { BasicBlockListType BasicBlocks; BasicBlockListType DeletedBasicBlocks; BasicBlockOrderType BasicBlocksLayout; + /// Previous layout replaced by modifyLayout + BasicBlockOrderType BasicBlocksPreviousLayout; /// BasicBlockOffsets are used during CFG construction to map from code /// offsets to BinaryBasicBlocks. Any modifications made to the CFG @@ -1711,6 +1713,17 @@ class BinaryFunction { /// and size. uint64_t getFunctionScore(); + /// Return true if the layout has been changed by basic block reordering, + /// false otherwise. + bool hasLayoutChanged() const; + + /// Get the edit distance of the new layout with respect to the previous + /// layout after basic block reordering. + uint64_t getEditDistance() const; + + /// Get the number of instructions within this function. 
+ uint64_t getInstructionCount() const; + const CFIInstrMapType &getFDEProgram() const { return FrameInstructions; } diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 5ff491935d2a..95fcef30fc31 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -97,6 +97,13 @@ PrintSortedBy("print-sorted-by", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +cl::opt +PrintFuncStat("print-function-statistics", + cl::desc("print statistics about basic block ordering"), + cl::init(0), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + static cl::opt ReorderBlocks("reorder-blocks", cl::desc("change layout of basic blocks in a function"), @@ -293,25 +300,64 @@ bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const { } void ReorderBasicBlocks::runOnFunctions( - BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) { + BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) { if (opts::ReorderBlocks == BinaryFunction::LT_NONE) return; + uint64_t ModifiedFuncCount = 0; for (auto &It : BFs) { auto &Function = It.second; - if (!shouldOptimize(Function)) - continue; + if (!shouldOptimize(Function)) + continue; const bool ShouldSplit = - (opts::SplitFunctions == BinaryFunction::ST_ALL) || - (opts::SplitFunctions == BinaryFunction::ST_EH && - Function.hasEHRanges()) || - (LargeFunctions.find(It.first) != LargeFunctions.end()); + (opts::SplitFunctions == BinaryFunction::ST_ALL) || + (opts::SplitFunctions == BinaryFunction::ST_EH && + Function.hasEHRanges()) || + (LargeFunctions.find(It.first) != LargeFunctions.end()); Function.modifyLayout(opts::ReorderBlocks, opts::MinBranchClusters, ShouldSplit); + + if (opts::PrintFuncStat > 0 && Function.hasLayoutChanged()) { + ++ModifiedFuncCount; + } + } + + if (opts::PrintFuncStat > 0) { + raw_ostream &OS = outs(); + // Copy all the values into vector in order to sort them + std::map ScoreMap; + for (auto It = BFs.begin(); It != BFs.end(); ++It) { + 
ScoreMap.insert(std::pair( + It->second.getFunctionScore(), It->second)); + } + + OS << "\nBOLT-INFO: Printing Function Statistics:\n\n"; + OS << " There are " << BFs.size() << " functions in total. \n"; + OS << " Number of functions being modified: " << ModifiedFuncCount + << "\n"; + OS << " User asks for detailed information on top " + << opts::PrintFuncStat << " functions. (Ranked by function score)" + << "\n\n"; + uint64_t I = 0; + for (std::map::reverse_iterator + Rit = ScoreMap.rbegin(); + Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) { + auto &Function = Rit->second; + + OS << " Information for function of top: " << (I + 1) << ": \n"; + OS << " Function Score is: " << Function.getFunctionScore() + << "\n"; + OS << " There are " << Function.size() + << " number of blocks in this function.\n"; + OS << " There are " << Function.getInstructionCount() + << " number of instructions in this function.\n"; + OS << " The edit distance for this function is: " + << Function.getEditDistance() << "\n\n"; + } } } From d5b7c68852cc7f1c33988988abfe8d08e661365b Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 9 Jun 2017 15:52:50 -0700 Subject: [PATCH 276/904] [BOLT] Fix SCTC execution count assertion Summary: SCTC is currently asserting (my fault :-) when running in combination with hot jump table entries optimization. This optimization sets the frequency for edges connecting basic blocks it creates and jump table targets based on the execution count of the original BB containing the indirect jump. This is OK as an estimation, but it breaks our assumption that the sum of the frequency of preds edges equals to our BB frequency. This happens because the frequency of the BB is rarely equal to its outgoing edges frequency. SCTC, in turn, was updating the execution count for BBs with tail calls by subtracting the frequency count of predecessor edges. 
Because hot jump table entries optimization broke the BB exec count = sum(preds freq) invariant, SCTC was asserting. To trigger this, the input program must have a jump table where each entry contains a tail call. This happens in the HHVM binary for func _ZN4HPHP11collections5issetEPNS_10ObjectDataEPKNS_10TypedValueE. (cherry picked from commit 2e6443e71300da683cdbe10bfd97216f81697457) --- bolt/Passes/BinaryPasses.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 95fcef30fc31..3b77ba91ebf7 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -650,9 +650,10 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, // if there are no other users. PredBB->removeSuccessor(BB); // Update BB execution count - if (BB->getKnownExecutionCount() > 0) { - assert(CTCTakenFreq <= BB->getKnownExecutionCount()); + if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount()) { BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq); + } else if (CTCTakenFreq > BB->getKnownExecutionCount()) { + BB->setExecutionCount(0); } ++NumLocalCTCs; From 91b5b7edcdb4f34e33d26873a4a996df87ee1fcf Mon Sep 17 00:00:00 2001 From: Bohan Ren Date: Wed, 7 Jun 2017 20:25:30 -0700 Subject: [PATCH 277/904] Normalize Clusters Twice Summary: This one will normalize cluster twice, leaving edges connecting two basic block untouched (cherry picked from commit da95dc8e0d741d7f4806e5067ad34a63a7b95b7f) --- bolt/BinaryBasicBlock.cpp | 4 ++++ bolt/BinaryBasicBlock.h | 3 +++ bolt/Passes/ReorderAlgorithm.cpp | 11 ++++++++--- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 8a56beba1835..49eebb8ac5ac 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -434,5 +434,9 @@ void BinaryBasicBlock::dump() const { outs() << "\n"; } +uint64_t BinaryBasicBlock::estimateSize() const { + 
return Function->getBinaryContext().computeCodeSize(begin(), end()); +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 151ac321484c..db610f7b313d 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -740,6 +740,9 @@ class BinaryBasicBlock { return InputRange.second - InputRange.first; } + /// Returns an estimate of size of basic block during run time. + uint64_t estimateSize() const; + private: void adjustNumPseudos(const MCInst &Inst, int Sign); diff --git a/bolt/Passes/ReorderAlgorithm.cpp b/bolt/Passes/ReorderAlgorithm.cpp index bb976acb5e41..b475b6f58bc9 100644 --- a/bolt/Passes/ReorderAlgorithm.cpp +++ b/bolt/Passes/ReorderAlgorithm.cpp @@ -69,11 +69,16 @@ void ClusterAlgorithm::computeClusterAverageFrequency() { AvgFreq.resize(Clusters.size(), 0.0); for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { double Freq = 0.0; + uint64_t ClusterSize = 0; for (auto BB : Clusters[I]) { - if (BB->getNumNonPseudos() > 0) - Freq += ((double) BB->getExecutionCount()) / BB->getNumNonPseudos(); + if (BB->getNumNonPseudos() > 0) { + Freq += BB->getExecutionCount(); + // Estimate the size of a block in bytes at run time + // NOTE: This might be inaccurate + ClusterSize += BB->estimateSize(); + } } - AvgFreq[I] = Freq; + AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize; } } From d5a00017561b9d9f3a8cd708985c100f9fc6cd6c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 8 Jun 2017 10:55:28 -0700 Subject: [PATCH 278/904] [BOLT] More HFSort+ refactoring Summary: Move most of hfsort+ into a class so the state can more easily be shared. 
(cherry picked from commit c8dfba27af142007dd7c6105b860afd0750cbe0b) --- bolt/Passes/CallGraph.h | 29 +- bolt/Passes/HFSort.h | 4 +- bolt/Passes/HFSortPlus.cpp | 726 ++++++++++++++++--------------- bolt/Passes/PettisAndHansen.cpp | 4 +- bolt/Passes/ReorderFunctions.cpp | 18 +- 5 files changed, 429 insertions(+), 352 deletions(-) diff --git a/bolt/Passes/CallGraph.h b/bolt/Passes/CallGraph.h index 8c5d0fa99890..83837e55b67f 100644 --- a/bolt/Passes/CallGraph.h +++ b/bolt/Passes/CallGraph.h @@ -32,6 +32,22 @@ class CallGraph { using NodeId = size_t; static constexpr NodeId InvalidId = -1; + template + class iterator_range { + T Begin; + T End; + + public: + template + iterator_range(Container &&c) : Begin(c.begin()), End(c.end()) {} + iterator_range(T Begin, T End) + : Begin(std::move(Begin)), + End(std::move(End)) {} + + T begin() const { return Begin; } + T end() const { return End; } + }; + class Arc { public: struct Hash { @@ -57,9 +73,9 @@ class CallGraph { private: friend class CallGraph; - const NodeId Src; - const NodeId Dst; - mutable double Weight; + NodeId Src{InvalidId}; + NodeId Dst{InvalidId}; + mutable double Weight{0}; mutable double NormalizedWeight{0}; mutable double AvgCallOffset{0}; }; @@ -126,8 +142,11 @@ class CallGraph { ArcConstIterator findArc(NodeId Src, NodeId Dst) const { return Arcs.find(Arc(Src, Dst)); } - const ArcsType &getArcs() const { - return Arcs; + iterator_range arcs() const { + return iterator_range(Arcs.begin(), Arcs.end()); + } + iterator_range::const_iterator> nodes() const { + return iterator_range::const_iterator>(Nodes.begin(), Nodes.end()); } double density() const { diff --git a/bolt/Passes/HFSort.h b/bolt/Passes/HFSort.h index 0cd5d66a99bd..9d7c447357c3 100644 --- a/bolt/Passes/HFSort.h +++ b/bolt/Passes/HFSort.h @@ -103,7 +103,9 @@ std::vector clusterize(const CallGraph &Cg); /* * Optimize function placement for iTLB cache and i-cache. 
*/ -std::vector hfsortPlus(const CallGraph &Cg); +std::vector hfsortPlus(const CallGraph &Cg, + bool UseGainCache = true, + bool UseShortCallCache = true); /* * Pettis-Hansen code layout algorithm diff --git a/bolt/Passes/HFSortPlus.cpp b/bolt/Passes/HFSortPlus.cpp index cebcf8946564..ad041b7e7fbe 100644 --- a/bolt/Passes/HFSortPlus.cpp +++ b/bolt/Passes/HFSortPlus.cpp @@ -42,37 +42,7 @@ #define DEBUG_TYPE "hfsort" namespace opts { - -extern llvm::cl::OptionCategory BoltOptCategory; extern llvm::cl::opt Verbosity; - -static llvm::cl::opt -UseGainCache("hfsort+-use-cache", - llvm::cl::desc("Use a cache for mergeGain results when computing hfsort+."), - llvm::cl::ZeroOrMore, - llvm::cl::init(true), - llvm::cl::Hidden, - llvm::cl::cat(BoltOptCategory)); - -static llvm::cl::opt -UseShortCallCache("hfsort+-use-short-call-cache", - llvm::cl::desc("Use a cache for shortCall results when computing hfsort+."), - llvm::cl::ZeroOrMore, - llvm::cl::init(true), - llvm::cl::Hidden, - llvm::cl::cat(BoltOptCategory)); - -const char* cacheKindString() { - if (opts::UseGainCache && opts::UseShortCallCache) - return "gain + short call cache"; - else if (opts::UseGainCache) - return "gain cache"; - else if (opts::UseShortCallCache) - return "short call cache"; - else - return "no cache"; -} - } namespace llvm { @@ -95,6 +65,17 @@ constexpr uint32_t ITLBEntries = 16; constexpr size_t InvalidAddr = -1; +const char* cacheKindString(bool UseGainCache, bool UseShortCallCache) { + if (UseGainCache && UseShortCallCache) + return "gain + short call cache"; + else if (UseGainCache) + return "gain cache"; + else if (UseShortCallCache) + return "short call cache"; + else + return "no cache"; +} + // This class maintains adjacency information for all Clusters being // processed. It is used to invalidate cache entries when merging // Clusters and for visiting all neighbors of any given Cluster. @@ -120,6 +101,8 @@ class AdjacencyMatrix { } } + // Merge adjacency info from cluster B into cluster A. 
Info for cluster B is left + // in an undefined state. void merge(const Cluster *A, const Cluster *B) { Bits[A->id()] |= Bits[B->id()]; Bits[A->id()][A->id()] = false; @@ -220,31 +203,14 @@ class PrecomputedResults { BitVector Valid; }; -// A wrapper for algorithm-wide variables -struct AlgoState { - explicit AlgoState(size_t Size) - : Cache(Size), ShortCallPairCache(Size) { } - - // the call graph - const CallGraph *Cg; - // the total number of samples in the graph - double TotalSamples; - // target_id => cluster - std::vector FuncCluster; - // current address of the function from the beginning of its cluster - std::vector Addr; - // maximum cluster id. - size_t MaxClusterId{0}; - // A cache that keeps precomputed values of mergeGain for pairs of clusters; - // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs - // containing both x and y (and recompute them on the next iteration) - PrecomputedResults Cache; - // Cache for shortCalls for a single cluster. - std::unordered_map ShortCallCache; - // Cache for shortCalls for a pair of Clusters - PrecomputedResults ShortCallPairCache; -}; - +/* + * Erase an element from a container if it is present. Otherwise, do nothing. + */ +template +void maybeErase(C &Container, const V& Value) { + auto Itr = Container.find(Value); + if (Itr != Container.end()) + Container.erase(Itr); } /* @@ -306,353 +272,427 @@ std::vector sortByDensity(const C &Clusters_) { * is (1-p), and the probability that it is not in the cache (i.e. 
not accessed * during the last kITLBEntries function calls) is (1-p)^kITLBEntries */ -double missProbability(const AlgoState &State, double PageSamples) { - double P = PageSamples / State.TotalSamples; - double X = ITLBEntries; - // avoiding precision issues for small values - if (P < 0.0001) return (1.0 - X * P + X * (X - 1.0) * P * P / 2.0); - return pow(1.0 - P, X); +double expectedCalls(int64_t SrcAddr, int64_t DstAddr, double EdgeWeight) { + const auto Dist = std::abs(SrcAddr - DstAddr); + if (Dist > PageSize) { + return 0; + } + return (double(PageSize - Dist) / PageSize) * EdgeWeight; } /* - * Expected hit ratio of the iTLB cache under the given order of clusters - * - * Given an ordering of hot functions (and hence, their assignment to the - * iTLB pages), we can divide all functions calls into two categories: - * - 'short' ones that have a caller-callee distance less than a page; - * - 'long' ones where the distance exceeds a page. - * The short calls are likely to result in a iTLB cache hit. For the long ones, - * the hit/miss result depends on the 'hotness' of the page (i.e., how often - * the page is accessed). Assuming that functions are sent to the iTLB cache - * in a random order, the probability that a page is present in the cache is - * proportional to the number of samples corresponding to the functions on the - * page. The following procedure detects short and long calls, and estimates - * the expected number of cache misses for the long ones. 
+ * HFSortPlus - layout of hot functions with iTLB cache optimization */ -template -double expectedCacheHitRatio(const AlgoState &State, const C &Clusters_) { - // sort by density - std::vector Clusters(sortByDensity(Clusters_)); - - // generate function addresses with an alignment - std::vector Addr(State.Cg->numNodes(), InvalidAddr); - size_t CurAddr = 0; - // 'hotness' of the pages - std::vector PageSamples; - for (auto Cluster : Clusters) { - for (auto TargetId : Cluster->targets()) { - if (CurAddr & 0xf) CurAddr = (CurAddr & ~0xf) + 16; - Addr[TargetId] = CurAddr; - CurAddr += State.Cg->size(TargetId); - // update page weight - size_t Page = Addr[TargetId] / PageSize; - while (PageSamples.size() <= Page) PageSamples.push_back(0.0); - PageSamples[Page] += State.Cg->samples(TargetId); +class HFSortPlus { +public: + /* + * The probability that a page with a given weight is not present in the cache. + * + * Assume that the hot functions are called in a random order; then the + * probability of a TLB page being accessed after a function call is + * p=pageSamples/totalSamples. The probability that the page is not accessed + * is (1-p), and the probability that it is not in the cache (i.e. not accessed + * during the last kITLBEntries function calls) is (1-p)^kITLBEntries + */ + double missProbability(double PageSamples) const { + double P = PageSamples / TotalSamples; + double X = ITLBEntries; + // avoiding precision issues for small values + if (P < 0.0001) return (1.0 - X * P + X * (X - 1.0) * P * P / 2.0); + return pow(1.0 - P, X); + } + + /* + * Expected hit ratio of the iTLB cache under the given order of clusters + * + * Given an ordering of hot functions (and hence, their assignment to the + * iTLB pages), we can divide all functions calls into two categories: + * - 'short' ones that have a caller-callee distance less than a page; + * - 'long' ones where the distance exceeds a page. + * The short calls are likely to result in a iTLB cache hit. 
For the long ones, + * the hit/miss result depends on the 'hotness' of the page (i.e., how often + * the page is accessed). Assuming that functions are sent to the iTLB cache + * in a random order, the probability that a page is present in the cache is + * proportional to the number of samples corresponding to the functions on the + * page. The following procedure detects short and long calls, and estimates + * the expected number of cache misses for the long ones. + */ + template + double expectedCacheHitRatio(const C &Clusters_) const { + // sort by density + std::vector Clusters(sortByDensity(Clusters_)); + + // generate function addresses with an alignment + std::vector Addr(Cg.numNodes(), InvalidAddr); + size_t CurAddr = 0; + // 'hotness' of the pages + std::vector PageSamples; + for (auto Cluster : Clusters) { + for (auto TargetId : Cluster->targets()) { + if (CurAddr & 0xf) CurAddr = (CurAddr & ~0xf) + 16; + Addr[TargetId] = CurAddr; + CurAddr += Cg.size(TargetId); + // update page weight + size_t Page = Addr[TargetId] / PageSize; + while (PageSamples.size() <= Page) PageSamples.push_back(0.0); + PageSamples[Page] += Cg.samples(TargetId); + } } - } - // computing expected number of misses for every function - double Misses = 0; - for (auto Cluster : Clusters) { - for (auto TargetId : Cluster->targets()) { - size_t Page = Addr[TargetId] / PageSize; - double Samples = State.Cg->samples(TargetId); - // probability that the page is not present in the cache - double MissProb = missProbability(State, PageSamples[Page]); - - for (auto Pred : State.Cg->predecessors(TargetId)) { - if (State.Cg->samples(Pred) == 0) continue; - const auto &Arc = *State.Cg->findArc(Pred, TargetId); - - // the source page - size_t SrcPage = (Addr[Pred] + (size_t)Arc.avgCallOffset()) / PageSize; - if (Page != SrcPage) { - // this is a miss - Misses += Arc.weight() * MissProb; + // computing expected number of misses for every function + double Misses = 0; + for (auto Cluster : Clusters) { 
+ for (auto TargetId : Cluster->targets()) { + size_t Page = Addr[TargetId] / PageSize; + double Samples = Cg.samples(TargetId); + // probability that the page is not present in the cache + double MissProb = missProbability(PageSamples[Page]); + + for (auto Pred : Cg.predecessors(TargetId)) { + if (Cg.samples(Pred) == 0) continue; + const auto &Arc = *Cg.findArc(Pred, TargetId); + + // the source page + size_t SrcPage = (Addr[Pred] + (size_t)Arc.avgCallOffset()) / PageSize; + if (Page != SrcPage) { + // this is a miss + Misses += Arc.weight() * MissProb; + } + Samples -= Arc.weight(); } - Samples -= Arc.weight(); - } - // the remaining samples come from the jitted code - Misses += Samples * MissProb; + // the remaining samples come from the jitted code + Misses += Samples * MissProb; + } } - } - return 100.0 * (1.0 - Misses / State.TotalSamples); -} - -/* - * The expected number of calls for an edge withing the same TLB page - */ -double expectedCalls(int64_t SrcAddr, int64_t DstAddr, double EdgeWeight) { - auto Dist = std::abs(SrcAddr - DstAddr); - if (Dist > PageSize) { - return 0; + return 100.0 * (1.0 - Misses / TotalSamples); } - return (double(PageSize - Dist) / PageSize) * EdgeWeight; -} -/* - * The expected number of calls within a given cluster with both endpoints on - * the same TLB cache page - */ -double shortCalls(AlgoState &State, const Cluster *Cluster) { - if (opts::UseShortCallCache) { - auto Itr = State.ShortCallCache.find(Cluster); - if (Itr != State.ShortCallCache.end()) - return Itr->second; - } + /* + * The expected number of calls within a given cluster with both endpoints on + * the same TLB cache page + */ + double shortCalls(const Cluster *Cluster) const { + if (UseShortCallCache) { + auto Itr = ShortCallCache.find(Cluster); + if (Itr != ShortCallCache.end()) + return Itr->second; + } - double Calls = 0; - for (auto TargetId : Cluster->targets()) { - for (auto Succ : State.Cg->successors(TargetId)) { - if (State.FuncCluster[Succ] == 
Cluster) { - const auto &Arc = *State.Cg->findArc(TargetId, Succ); + double Calls = 0; + for (auto TargetId : Cluster->targets()) { + for (auto Succ : Cg.successors(TargetId)) { + if (FuncCluster[Succ] == Cluster) { + const auto &Arc = *Cg.findArc(TargetId, Succ); - auto SrcAddr = State.Addr[TargetId] + Arc.avgCallOffset(); - auto DstAddr = State.Addr[Succ]; + auto SrcAddr = Addr[TargetId] + Arc.avgCallOffset(); + auto DstAddr = Addr[Succ]; - Calls += expectedCalls(SrcAddr, DstAddr, Arc.weight()); + Calls += expectedCalls(SrcAddr, DstAddr, Arc.weight()); + } } } - } - - if (opts::UseShortCallCache) { - State.ShortCallCache[Cluster] = Calls; - } - return Calls; -} + if (UseShortCallCache) { + ShortCallCache[Cluster] = Calls; + } -/* - * The number of calls between the two clusters with both endpoints on - * the same TLB page, assuming that a given pair of clusters gets merged - */ -double shortCalls(AlgoState &State, - const Cluster *ClusterPred, - const Cluster *ClusterSucc) { - if (opts::UseShortCallCache && - State.ShortCallPairCache.contains(ClusterPred, ClusterSucc)) { - return State.ShortCallPairCache.get(ClusterPred, ClusterSucc); + return Calls; } - double Calls = 0; - for (auto TargetId : ClusterPred->targets()) { - for (auto Succ : State.Cg->successors(TargetId)) { - if (State.FuncCluster[Succ] == ClusterSucc) { - const auto &Arc = *State.Cg->findArc(TargetId, Succ); + /* + * The number of calls between the two clusters with both endpoints on + * the same TLB page, assuming that a given pair of clusters gets merged + */ + double shortCalls(const Cluster *ClusterPred, + const Cluster *ClusterSucc) const { + if (UseShortCallCache && + ShortCallPairCache.contains(ClusterPred, ClusterSucc)) { + return ShortCallPairCache.get(ClusterPred, ClusterSucc); + } + + double Calls = 0; + for (auto TargetId : ClusterPred->targets()) { + for (auto Succ : Cg.successors(TargetId)) { + if (FuncCluster[Succ] == ClusterSucc) { + const auto &Arc = *Cg.findArc(TargetId, Succ); - 
auto SrcAddr = State.Addr[TargetId] + Arc.avgCallOffset(); - auto DstAddr = State.Addr[Succ] + ClusterPred->size(); + auto SrcAddr = Addr[TargetId] + Arc.avgCallOffset(); + auto DstAddr = Addr[Succ] + ClusterPred->size(); - Calls += expectedCalls(SrcAddr, DstAddr, Arc.weight()); + Calls += expectedCalls(SrcAddr, DstAddr, Arc.weight()); + } } } - } - for (auto TargetId : ClusterPred->targets()) { - for (auto Pred : State.Cg->predecessors(TargetId)) { - if (State.FuncCluster[Pred] == ClusterSucc) { - const auto &Arc = *State.Cg->findArc(Pred, TargetId); + for (auto TargetId : ClusterPred->targets()) { + for (auto Pred : Cg.predecessors(TargetId)) { + if (FuncCluster[Pred] == ClusterSucc) { + const auto &Arc = *Cg.findArc(Pred, TargetId); - auto SrcAddr = State.Addr[Pred] + Arc.avgCallOffset() + - ClusterPred->size(); - auto DstAddr = State.Addr[TargetId]; + auto SrcAddr = Addr[Pred] + Arc.avgCallOffset() + + ClusterPred->size(); + auto DstAddr = Addr[TargetId]; - Calls += expectedCalls(SrcAddr, DstAddr, Arc.weight()); + Calls += expectedCalls(SrcAddr, DstAddr, Arc.weight()); + } } } - } - if (opts::UseShortCallCache) { - State.ShortCallPairCache.set(ClusterPred, ClusterSucc, Calls); - } + if (UseShortCallCache) { + ShortCallPairCache.set(ClusterPred, ClusterSucc, Calls); + } - return Calls; -} + return Calls; + } + + /* + * The gain of merging two clusters. + * + * We assume that the final clusters are sorted by their density, and hence + * every cluster is likely to be adjacent with clusters of the same density. + * Thus, the 'hotness' of every cluster can be estimated by density*pageSize, + * which is used to compute the probability of cache misses for long calls + * of a given cluster. + * The result is also scaled by the size of the resulting cluster in order to + * increse the chance of merging short clusters, which is helpful for + * the i-cache performance. 
+ */ + double mergeGain(const Cluster *ClusterPred, + const Cluster *ClusterSucc) const { + if (UseGainCache && Cache.contains(ClusterPred, ClusterSucc)) { + return Cache.get(ClusterPred, ClusterSucc); + } -/* - * The gain of merging two clusters. - * - * We assume that the final clusters are sorted by their density, and hence - * every cluster is likely to be adjacent with clusters of the same density. - * Thus, the 'hotness' of every cluster can be estimated by density*pageSize, - * which is used to compute the probability of cache misses for long calls - * of a given cluster. - * The result is also scaled by the size of the resulting cluster in order to - * increse the chance of merging short clusters, which is helpful for - * the i-cache performance. - */ -double mergeGain(AlgoState &State, - const Cluster *ClusterPred, - const Cluster *ClusterSucc) { - if (opts::UseGainCache && State.Cache.contains(ClusterPred, ClusterSucc)) { - return State.Cache.get(ClusterPred, ClusterSucc); - } + // cache misses on the first cluster + double LongCallsPred = ClusterPred->samples() - shortCalls(ClusterPred); + double ProbPred = missProbability(ClusterPred->density() * PageSize); + double ExpectedMissesPred = LongCallsPred * ProbPred; + + // cache misses on the second cluster + double LongCallsSucc = ClusterSucc->samples() - shortCalls(ClusterSucc); + double ProbSucc = missProbability(ClusterSucc->density() * PageSize); + double ExpectedMissesSucc = LongCallsSucc * ProbSucc; + + // cache misses on the merged cluster + double LongCallsNew = LongCallsPred + LongCallsSucc - + shortCalls(ClusterPred, ClusterSucc); + double NewDensity = density(ClusterPred, ClusterSucc); + double ProbNew = missProbability(NewDensity * PageSize); + double MissesNew = LongCallsNew * ProbNew; + + double Gain = ExpectedMissesPred + ExpectedMissesSucc - MissesNew; + // scaling the result to increase the importance of merging short clusters + Gain /= (ClusterPred->size() + ClusterSucc->size()); + + if 
(UseGainCache) { + Cache.set(ClusterPred, ClusterSucc, Gain); + } - // cache misses on the first cluster - double LongCallsPred = ClusterPred->samples() - shortCalls(State, ClusterPred); - double ProbPred = missProbability(State, ClusterPred->density() * PageSize); - double ExpectedMissesPred = LongCallsPred * ProbPred; - - // cache misses on the second cluster - double LongCallsSucc = ClusterSucc->samples() - shortCalls(State, ClusterSucc); - double ProbSucc = missProbability(State, ClusterSucc->density() * PageSize); - double ExpectedMissesSucc = LongCallsSucc * ProbSucc; - - // cache misses on the merged cluster - double LongCallsNew = LongCallsPred + LongCallsSucc - - shortCalls(State, ClusterPred, ClusterSucc); - double NewDensity = density(ClusterPred, ClusterSucc); - double ProbNew = missProbability(State, NewDensity * PageSize); - double MissesNew = LongCallsNew * ProbNew; - - double Gain = ExpectedMissesPred + ExpectedMissesSucc - MissesNew; - // scaling the result to increase the importance of merging short clusters - Gain /= (ClusterPred->size() + ClusterSucc->size()); - - if (opts::UseGainCache) { - State.Cache.set(ClusterPred, ClusterSucc, Gain); - } + return Gain; + } + + /* + * Run hfsort+ algorithm and return ordered set of function clusters. 
+ */ + std::vector run() { + DEBUG(dbgs() << "Starting hfsort+ w/" + << cacheKindString(UseGainCache, UseShortCallCache) + << " for " << Clusters.size() << " clusters\n" + << format("Initial expected iTLB cache hit ratio: %.4lf\n", + expectedCacheHitRatio(Clusters))); + + int Steps = 0; + // merge pairs of clusters while there is an improvement + while (Clusters.size() > 1) { + DEBUG( + if (Steps % 500 == 0) { + dbgs() << format("step = %d clusters = %lu expected_hit_rate = %.4lf\n", + Steps, Clusters.size(), + expectedCacheHitRatio(Clusters)); + }); + ++Steps; + + Cluster *BestClusterPred = nullptr; + Cluster *BestClusterSucc = nullptr; + double BestGain = -1; + for (auto ClusterPred : Clusters) { + // get candidates for merging with the current cluster + Adjacent.forallAdjacent( + ClusterPred, + // find the best candidate + [&](Cluster *ClusterSucc) { + assert(ClusterPred != ClusterSucc); + // get a cost of merging two clusters + const double Gain = mergeGain(ClusterPred, ClusterSucc); + + // breaking ties by density to make the hottest clusters be merged first + if (Gain > BestGain || (std::abs(Gain - BestGain) < 1e-8 && + compareClusterPairs(ClusterPred, + ClusterSucc, + BestClusterPred, + BestClusterSucc))) { + BestGain = Gain; + BestClusterPred = ClusterPred; + BestClusterSucc = ClusterSucc; + } + }); + } - return Gain; -} + if (BestGain <= 0.0) break; -template -void maybeErase(C &Container, const V& Value) { - auto Itr = Container.find(Value); - if (Itr != Container.end()) - Container.erase(Itr); -} + // merge the best pair of clusters + mergeClusters(BestClusterPred, BestClusterSucc); -/* - * HFSortPlus - layout of hot functions with iTLB cache optimization - */ -std::vector hfsortPlus(const CallGraph &Cg) { - // create a cluster for every function - std::vector AllClusters; - AllClusters.reserve(Cg.numNodes()); - for (NodeId F = 0; F < Cg.numNodes(); F++) { - AllClusters.emplace_back(F, Cg.getNode(F)); - } + // remove BestClusterSucc from the list of 
active clusters + auto Iter = std::remove(Clusters.begin(), Clusters.end(), BestClusterSucc); + Clusters.erase(Iter, Clusters.end()); + } - // initialize objects used by the algorithm - std::vector Clusters; - Clusters.reserve(Cg.numNodes()); - AlgoState State(AllClusters.size()); // TODO: should use final Clusters.size() - State.Cg = &Cg; - State.TotalSamples = 0; - State.FuncCluster = std::vector(Cg.numNodes(), nullptr); - State.Addr = std::vector(Cg.numNodes(), InvalidAddr); - uint32_t Id = 0; - for (NodeId F = 0; F < Cg.numNodes(); F++) { - if (Cg.samples(F) == 0) continue; - Clusters.push_back(&AllClusters[F]); - Clusters.back()->setId(Id); - State.FuncCluster[F] = &AllClusters[F]; - State.Addr[F] = 0; - State.TotalSamples += Cg.samples(F); - ++Id; - } - State.MaxClusterId = Id; + DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size() << " clusters\n" + << format("Final expected iTLB cache hit ratio: %.4lf\n", + expectedCacheHitRatio(Clusters))); - AdjacencyMatrix Adjacent(Cg, Clusters, State.FuncCluster); + // Return the set of clusters that are left, which are the ones that + // didn't get merged (so their first func is its original func). 
+ std::vector Result; + for (auto Cluster : sortByDensity(Clusters)) { + Result.emplace_back(std::move(*Cluster)); + } - DEBUG(dbgs() << "Starting hfsort+ w/" << opts::cacheKindString() << " for " - << Clusters.size() << " clusters\n" - << format("Initial expected iTLB cache hit ratio: %.4lf\n", - expectedCacheHitRatio(State, Clusters))); + assert(std::is_sorted(Result.begin(), Result.end(), compareClustersDensity)); - int Steps = 0; - // merge pairs of clusters while there is an improvement - while (Clusters.size() > 1) { - DEBUG( - if (Steps % 500 == 0) { - dbgs() << format("step = %d clusters = %lu expected_hit_rate = %.4lf\n", - Steps, - Clusters.size(), - expectedCacheHitRatio(State, Clusters)); - } - ); - ++Steps; - - Cluster *BestClusterPred = nullptr; - Cluster *BestClusterSucc = nullptr; - double BestGain = -1; - for (auto ClusterPred : Clusters) { - // get candidates for merging with the current cluster - Adjacent.forallAdjacent( - ClusterPred, - // find the best candidate - [&](Cluster *ClusterSucc) { - assert(ClusterPred != ClusterSucc); - // get a cost of merging two clusters - const double Gain = mergeGain(State, ClusterPred, ClusterSucc); - - // breaking ties by density to make the hottest clusters be merged first - if (Gain > BestGain || (std::abs(Gain - BestGain) < 1e-8 && - compareClusterPairs(ClusterPred, - ClusterSucc, - BestClusterPred, - BestClusterSucc))) { - BestGain = Gain; - BestClusterPred = ClusterPred; - BestClusterSucc = ClusterSucc; - } - } - ); + return Result; + } + + HFSortPlus(const CallGraph &Cg, + bool UseGainCache, + bool UseShortCallCache) + : Cg(Cg), + FuncCluster(Cg.numNodes(), nullptr), + Addr(Cg.numNodes(), InvalidAddr), + TotalSamples(0.0), + Clusters(initializeClusters()), + Adjacent(Cg, Clusters, FuncCluster), + UseGainCache(UseGainCache), + UseShortCallCache(UseShortCallCache), + Cache(Clusters.size()), + ShortCallPairCache(Clusters.size()) { + } +private: + // Initialize the set of active clusters, function id to 
cluster mapping, + // total number of samples and function addresses. + std::vector initializeClusters() { + std::vector Clusters; + + Clusters.reserve(Cg.numNodes()); + AllClusters.reserve(Cg.numNodes()); + + for (NodeId F = 0; F < Cg.numNodes(); F++) { + AllClusters.emplace_back(F, Cg.getNode(F)); + if (Cg.samples(F) == 0) continue; + Clusters.emplace_back(&AllClusters[F]); + Clusters.back()->setId(Clusters.size() - 1); + FuncCluster[F] = &AllClusters[F]; + Addr[F] = 0; + TotalSamples += Cg.samples(F); } - if (BestGain <= 0.0) break; + return Clusters; + } - // merge the best pair of clusters + /* + * Merge cluster From into cluster Into. + */ + void mergeClusters(Cluster *Into, Cluster *From) { DEBUG( if (opts::Verbosity > 0) { - dbgs() << "Merging cluster " << BestClusterSucc->id() - << " into cluster " << BestClusterPred->id() << "\n"; + dbgs() << "Merging cluster " << From->id() + << " into cluster " << Into->id() << "\n"; }); - Adjacent.merge(BestClusterPred, BestClusterSucc); - BestClusterPred->merge(*BestClusterSucc); + // The adjacency merge must happen before the Cluster::merge since that + // clobbers the contents of From. + Adjacent.merge(Into, From); + + Into->merge(*From); + // Update the clusters and addresses for functions merged from From. size_t CurAddr = 0; - for (auto TargetId : BestClusterPred->targets()) { - State.FuncCluster[TargetId] = BestClusterPred; - State.Addr[TargetId] = CurAddr; - CurAddr += State.Cg->size(TargetId); + for (auto TargetId : Into->targets()) { + FuncCluster[TargetId] = Into; + Addr[TargetId] = CurAddr; + CurAddr += Cg.size(TargetId); } - if (opts::UseShortCallCache) { - maybeErase(State.ShortCallCache, BestClusterPred); - Adjacent.forallAdjacent(BestClusterPred, - [&State](const Cluster *C) { - maybeErase(State.ShortCallCache, C); - }); - State.ShortCallPairCache.invalidate(Adjacent, BestClusterPred); + invalidateCaches(Into); + } + + /* + * Invalidate all cache entries associated with cluster C and its neighbors. 
+ */ + void invalidateCaches(const Cluster *C) { + if (UseShortCallCache) { + maybeErase(ShortCallCache, C); + Adjacent.forallAdjacent(C, + [this](const Cluster *A) { + maybeErase(ShortCallCache, A); + }); + ShortCallPairCache.invalidate(Adjacent, C); } - if (opts::UseGainCache) { - State.Cache.invalidate(Adjacent, BestClusterPred); + if (UseGainCache) { + Cache.invalidate(Adjacent, C); } - - // remove BestClusterSucc from the list of active clusters - auto Iter = std::remove(Clusters.begin(), Clusters.end(), BestClusterSucc); - Clusters.erase(Iter, Clusters.end()); } - DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size() << " clusters\n" - << format("Final expected iTLB cache hit ratio: %.4lf\n", - expectedCacheHitRatio(State, Clusters))); + // the call graph + const CallGraph &Cg; - // Return the set of clusters that are left, which are the ones that - // didn't get merged (so their first func is its original func). - std::vector Result; - for (auto Cluster : sortByDensity(Clusters)) { - Result.emplace_back(std::move(*Cluster)); - } + // All clusters. + std::vector AllClusters; + + // target_id => cluster + std::vector FuncCluster; + + // current address of the function from the beginning of its cluster + std::vector Addr; + + // the total number of samples in the graph + double TotalSamples; + + // All clusters with non-zero number of samples. This vector gets + // udpated at runtime when clusters are merged. + std::vector Clusters; + + // Cluster adjacency matrix. + AdjacencyMatrix Adjacent; + + // Use cache for mergeGain results. + bool UseGainCache; + + // Use caches for shortCalls results. 
+ bool UseShortCallCache; - assert(std::is_sorted(Result.begin(), Result.end(), compareClustersDensity)); + // A cache that keeps precomputed values of mergeGain for pairs of clusters; + // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs + // containing both x and y and all clusters adjacent to x and y (and recompute + // them on the next iteration). + mutable PrecomputedResults Cache; + + // Cache for shortCalls for a single cluster. + mutable std::unordered_map ShortCallCache; + + // Cache for shortCalls for a pair of Clusters + mutable PrecomputedResults ShortCallPairCache; +}; + +} - return Result; +std::vector hfsortPlus(const CallGraph &Cg, + bool UseGainCache, + bool UseShortCallCache) { + return HFSortPlus(Cg, UseGainCache, UseShortCallCache).run(); } }} diff --git a/bolt/Passes/PettisAndHansen.cpp b/bolt/Passes/PettisAndHansen.cpp index 18e9568d81be..6cd1cbd1eecd 100644 --- a/bolt/Passes/PettisAndHansen.cpp +++ b/bolt/Passes/PettisAndHansen.cpp @@ -54,7 +54,7 @@ void orderFuncs(const CallGraph &Cg, Cluster *C1, Cluster *C2) { double C1tailC2head = 0; double C1tailC2tail = 0; - for (const auto &Arc : Cg.getArcs()) { + for (const auto &Arc : Cg.arcs()) { if ((Arc.src() == C1head && Arc.dst() == C2head) || (Arc.dst() == C1head && Arc.src() == C2head)) { C1headC2head += Arc.weight(); @@ -113,7 +113,7 @@ std::vector pettisAndHansen(const CallGraph &Cg) { // Create a std::vector of cluster arcs - for (auto &Arc : Cg.getArcs()) { + for (auto &Arc : Cg.arcs()) { if (Arc.weight() == 0) continue; auto const S = FuncCluster[Arc.src()]; diff --git a/bolt/Passes/ReorderFunctions.cpp b/bolt/Passes/ReorderFunctions.cpp index 72cb18c31e6f..a7dd44c036e2 100644 --- a/bolt/Passes/ReorderFunctions.cpp +++ b/bolt/Passes/ReorderFunctions.cpp @@ -82,6 +82,22 @@ UseEdgeCounts("use-edge-counts", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +static llvm::cl::opt +UseGainCache("hfsort+-use-cache", + llvm::cl::desc("Use a cache for mergeGain results when 
computing hfsort+."), + llvm::cl::ZeroOrMore, + llvm::cl::init(true), + llvm::cl::Hidden, + llvm::cl::cat(BoltOptCategory)); + +static llvm::cl::opt +UseShortCallCache("hfsort+-use-short-call-cache", + llvm::cl::desc("Use a cache for shortCall results when computing hfsort+."), + llvm::cl::ZeroOrMore, + llvm::cl::init(true), + llvm::cl::Hidden, + llvm::cl::cat(BoltOptCategory)); + } // namespace opts namespace llvm { @@ -297,7 +313,7 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC, Clusters = clusterize(Cg); break; case BinaryFunction::RT_HFSORT_PLUS: - Clusters = hfsortPlus(Cg); + Clusters = hfsortPlus(Cg, opts::UseGainCache, opts::UseShortCallCache); break; case BinaryFunction::RT_PETTIS_HANSEN: Clusters = pettisAndHansen(Cg); From 1b8b257e27e4486f5d0903d8f412b648f4750433 Mon Sep 17 00:00:00 2001 From: Yue Zhao Date: Fri, 16 Jun 2017 17:06:13 -0700 Subject: [PATCH 279/904] BinaryFunction.h: Clarify commet for getSize(), add getNumNonPseudos() Summary: Minor fix and add new function (cherry picked from commit cd9395649bf688fdec3dbff00d24bb0f2f2796b4) --- bolt/BinaryFunction.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 531f1e3107d0..e393aa1ac1af 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -1093,7 +1093,7 @@ class BinaryFunction { return FileOffset; } - /// Return (original) size of the function. + /// Return (original) byte size of the function. uint64_t getSize() const { return Size; } @@ -1103,6 +1103,15 @@ class BinaryFunction { return MaxSize; } + /// Return the number of emitted instructions for this function. + uint32_t getNumNonPseudos() const { + uint32_t N = 0; + for (auto &BB : layout()) { + N += BB->getNumNonPseudos(); + } + return N; + } + /// Return MC symbol associated with the function. /// All references to the function should use this symbol. 
MCSymbol *getSymbol() { From 6fe2474fcc75a14e0fb2a4792e7087bd4ba103fb Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 16 Jun 2017 15:02:26 -0700 Subject: [PATCH 280/904] [BOLT] Bail frame analysis on PUSHes escaping vars Summary: Some PUSH instructions may contain memory addresses pushed to the stack. If this memory address is from an object in the stack, cancel further frame analysis for this function since it may be escaping a variable. This fixes a bug with deleting used stores (in frameopt) in hhvm trunk. (cherry picked from commit 2e39210bd277ec38752feca8a1df51ec7fcc0e02) --- bolt/Passes/FrameAnalysis.cpp | 19 ++++++++++++++++++- bolt/Passes/FrameOptimizer.cpp | 2 ++ bolt/Passes/RegAnalysis.cpp | 2 +- bolt/Passes/StackPointerTracking.h | 2 +- 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/bolt/Passes/FrameAnalysis.cpp b/bolt/Passes/FrameAnalysis.cpp index a0fbff46b057..38362e21cc34 100644 --- a/bolt/Passes/FrameAnalysis.cpp +++ b/bolt/Passes/FrameAnalysis.cpp @@ -202,7 +202,7 @@ class FrameAccessAnalysis { return true; } - if (BC.MIA->leaksStackAddress(Inst, *BC.MRI, SPT.HasFramePointer)) { + if (BC.MIA->escapesVariable(Inst, *BC.MRI, SPT.HasFramePointer)) { DEBUG(dbgs() << "Leaked stack address, giving up on this function.\n"); DEBUG(dbgs() << "Blame insn: "); DEBUG(Inst.dump()); @@ -286,6 +286,21 @@ void FrameAnalysis::traverseCG(BinaryFunctionCallGraph &CG) { }); CGWalker.walk(); + + DEBUG_WITH_TYPE("ra", + for (auto &MapEntry : ArgsTouchedMap) { + const auto *Func = MapEntry.first; + const auto &Set = MapEntry.second; + dbgs() << "Args accessed for " << Func->getPrintName() << ": "; + if (!Set.empty() && Set.count(std::make_pair(-1, 0))) { + dbgs() << "assume everything"; + } else { + for (auto &Entry : Set) { + dbgs() << "[" << Entry.first << ", " << (int)Entry.second << "] "; + } + } + dbgs() << "\n"; + }); } bool FrameAnalysis::updateArgsTouchedFor(const BinaryFunction &BF, MCInst &Inst, @@ -373,6 +388,8 @@ bool 
FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) { return Updated; } + DEBUG(dbgs() << "Now computing args accessed for: " << BF.getPrintName() + << "\n"); bool UpdatedArgsTouched = false; FrameAccessAnalysis FAA(BC, BF); diff --git a/bolt/Passes/FrameOptimizer.cpp b/bolt/Passes/FrameOptimizer.cpp index 38d3968950e0..196dfe749951 100644 --- a/bolt/Passes/FrameOptimizer.cpp +++ b/bolt/Passes/FrameOptimizer.cpp @@ -197,6 +197,8 @@ void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA, DEBUG(dbgs() << "Unused store instruction: "); DEBUG(Inst.dump()); DEBUG(dbgs() << "@BB: " << BB.getName() << "\n"); + DEBUG(dbgs() << "FIE offset = " << FIEX->StackOffset + << " size = " << (int)FIEX->Size << "\n"); // Delete it! ToErase.push_back(std::make_pair(&BB, &Inst)); Prev = &Inst; diff --git a/bolt/Passes/RegAnalysis.cpp b/bolt/Passes/RegAnalysis.cpp index 570e09358132..d9ab2d625f1c 100644 --- a/bolt/Passes/RegAnalysis.cpp +++ b/bolt/Passes/RegAnalysis.cpp @@ -68,7 +68,7 @@ RegAnalysis::RegAnalysis(BinaryContext &BC, CountFunctionsAllClobber += Count; ++NumFunctionsAllClobber; } - DEBUG_WITH_TYPE("fa", + DEBUG_WITH_TYPE("ra", dbgs() << "Killed regs set for func: " << Func->getPrintName() << "\n"; const BitVector &RegsKilled = Iter->second; int RegIdx = RegsKilled.find_first(); diff --git a/bolt/Passes/StackPointerTracking.h b/bolt/Passes/StackPointerTracking.h index cfcf237a9380..ce0cd26bbc5e 100644 --- a/bolt/Passes/StackPointerTracking.h +++ b/bolt/Passes/StackPointerTracking.h @@ -159,7 +159,7 @@ class StackPointerTrackingBase return SUPERPOSITION; if (!HasFramePointer) { - if (MIA->leaksStackAddress(Point, *this->BC.MRI, false)) { + if (MIA->escapesVariable(Point, *this->BC.MRI, false)) { HasFramePointer = true; } } From 00885c6b57b8bd44f8a7af120280f7d360d2c84d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 9 Jun 2017 13:17:36 -0700 Subject: [PATCH 281/904] [BOLT] Make function reordering more robust with stale data. 
Summary: Rewrote the guts of buildCallGraph. There are two new options to control how the CG is created. UsePerfData controls whether we use the perf data directly to construct the CG for functions with a stale profile. IgnoreRecursiveCalls omits recursive calls from the CG since they might be skewing results unfairly for heavily recursive functions. I've changed the way BinaryFunction::estimateHotSize() works. If the function is marked as split, I count the size of all the non-cold blocks. This gives a different but more accurate answer than the old method. I've improved and updated the CG build stats with extra information. (cherry picked from commit 1eaf57db6cf61e273f491adf12cd30cec5db5e6f) --- bolt/BinaryContext.cpp | 6 + bolt/BinaryContext.h | 4 + bolt/BinaryFunction.h | 16 ++- bolt/Passes/BinaryFunctionCallGraph.cpp | 162 ++++++++++++++++-------- bolt/Passes/BinaryFunctionCallGraph.h | 5 +- bolt/Passes/ReorderFunctions.cpp | 37 +++++- 6 files changed, 173 insertions(+), 57 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 26f29ab6f71a..42b1b6e84f56 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -85,6 +85,12 @@ MCSymbol *BinaryContext::getGlobalSymbolAtAddress(uint64_t Address) const { return Symbol; } +MCSymbol *BinaryContext::getGlobalSymbolByName(const std::string &Name) const { + auto Itr = GlobalSymbols.find(Name); + return Itr == GlobalSymbols.end() + ? nullptr : getGlobalSymbolAtAddress(Itr->second); +} + void BinaryContext::foldFunction(BinaryFunction &ChildBF, BinaryFunction &ParentBF, std::map &BFs) { diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index fcc54e358cfa..427b94582b5d 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -194,6 +194,10 @@ class BinaryContext { /// global symbol was registered at the location. 
MCSymbol *getGlobalSymbolAtAddress(uint64_t Address) const; + /// Return MCSymbol for the given \p Name or nullptr if no + /// global symbol with that name exists. + MCSymbol *getGlobalSymbolByName(const std::string &Name) const; + /// Print the global symbol table. void printGlobalSymbols(raw_ostream& OS) const; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index e393aa1ac1af..ecd146078598 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -1844,11 +1844,19 @@ class BinaryFunction { /// of the added/removed branch instructions. /// Note that this size is optimistic and the actual size may increase /// after relaxation. - size_t estimateHotSize() const { + size_t estimateHotSize(const bool UseSplitSize = true) const { size_t Estimate = 0; - for (const auto *BB : BasicBlocksLayout) { - if (BB->getKnownExecutionCount() != 0) { - Estimate += BC.computeCodeSize(BB->begin(), BB->end()); + if (UseSplitSize && IsSplit) { + for (const auto *BB : BasicBlocksLayout) { + if (!BB->isCold()) { + Estimate += BC.computeCodeSize(BB->begin(), BB->end()); + } + } + } else { + for (const auto *BB : BasicBlocksLayout) { + if (BB->getKnownExecutionCount() != 0) { + Estimate += BC.computeCodeSize(BB->begin(), BB->end()); + } } } return Estimate; diff --git a/bolt/Passes/BinaryFunctionCallGraph.cpp b/bolt/Passes/BinaryFunctionCallGraph.cpp index 9cf9f123bf83..df152e2d99b3 100644 --- a/bolt/Passes/BinaryFunctionCallGraph.cpp +++ b/bolt/Passes/BinaryFunctionCallGraph.cpp @@ -79,11 +79,15 @@ std::deque BinaryFunctionCallGraph::buildTraversalOrder() { BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, std::map &BFs, CgFilterFunction Filter, + bool CgFromPerfData, bool IncludeColdCalls, bool UseFunctionHotSize, - bool UseEdgeCounts) { + bool UseSplitHotSize, + bool UseEdgeCounts, + bool IgnoreRecursiveCalls) { NamedRegionTimer T1("Callgraph construction", "CG breakdown", opts::TimeOpts); BinaryFunctionCallGraph Cg; + static constexpr auto COUNT_NO_PROFILE = 
BinaryBasicBlock::COUNT_NO_PROFILE; // Add call graph nodes. auto lookupNode = [&](BinaryFunction *Function) { @@ -94,7 +98,7 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, // computed by ReorderFunctions. The cold part will be emitted with the // rest of the cold functions and code. const auto Size = UseFunctionHotSize && Function->isSplit() - ? Function->estimateHotSize() + ? Function->estimateHotSize(UseSplitHotSize) : Function->estimateSize(); // NOTE: for functions without a profile, we set the number of samples // to zero. This will keep these functions from appearing in the hot @@ -114,7 +118,10 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, // Add call graph edges. uint64_t NotProcessed = 0; - uint64_t TotalCalls = 0; + uint64_t TotalCallsites = 0; + uint64_t NoProfileCallsites = 0; + uint64_t NumFallbacks = 0; + uint64_t RecursiveCallsites = 0; for (auto &It : BFs) { auto *Function = &It.second; @@ -125,12 +132,24 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, auto BranchDataOrErr = BC.DR.getFuncBranchData(Function->getNames()); const auto SrcId = lookupNode(Function); uint64_t Offset = Function->getAddress(); + uint64_t LastInstSize = 0; auto recordCall = [&](const MCSymbol *DestSymbol, const uint64_t Count) { - if (auto *DstFunc = BC.getFunctionForSymbol(DestSymbol)) { + if (auto *DstFunc = + DestSymbol ? BC.getFunctionForSymbol(DestSymbol) : nullptr) { + if (DstFunc == Function) { + DEBUG(dbgs() << "BOLT-INFO: recursive call detected in " + << *DstFunc << "\n"); + ++RecursiveCallsites; + if (IgnoreRecursiveCalls) + return false; + } const auto DstId = lookupNode(DstFunc); - const auto AvgDelta = !UseEdgeCounts ? Offset - DstFunc->getAddress() : 0; - Cg.incArcWeight(SrcId, DstId, Count, AvgDelta); + const auto AvgDelta = UseEdgeCounts ? 0 : Offset - DstFunc->getAddress(); + const bool IsValidCount = Count != COUNT_NO_PROFILE; + const auto AdjCount = UseEdgeCounts && IsValidCount ? 
Count : 1; + if (!IsValidCount) ++NoProfileCallsites; + Cg.incArcWeight(SrcId, DstId, AdjCount, AvgDelta); DEBUG( if (opts::Verbosity > 1) { dbgs() << "BOLT-DEBUG: buildCallGraph: call " << *Function @@ -141,59 +160,96 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, return false; }; - for (auto *BB : Function->layout()) { - // Don't count calls from cold blocks - if (BB->isCold() && !IncludeColdCalls) - continue; + auto getCallInfoFromBranchData = [&](const BranchInfo &BI, bool IsStale) { + MCSymbol *DstSym = nullptr; + uint64_t Count; + if (BI.To.IsSymbol && (DstSym = BC.getGlobalSymbolByName(BI.To.Name))) { + Count = BI.Branches; + } else { + Count = COUNT_NO_PROFILE; + } + // If we are using the perf data for a stale function we need to filter + // out data which comes from branches. We'll assume that the To offset + // is non-zero for branches. + if (IsStale && BI.To.Offset != 0 && + (!DstSym || Function == BC.getFunctionForSymbol(DstSym))) { + DstSym = nullptr; + Count = COUNT_NO_PROFILE; + } + return std::make_pair(DstSym, Count); + }; + + // Get pairs of (symbol, count) for each target at this callsite. + // If the call is to an unknown function the symbol will be nullptr. + // If there is no profiling data the count will be COUNT_NO_PROFILE. + auto getCallInfo = [&](const BinaryBasicBlock *BB, const MCInst &Inst) { + std::vector> Counts; + const auto *DstSym = BC.MIA->getTargetSymbol(Inst); - for (auto &Inst : *BB) { - // Find call instructions and extract target symbols from each one. - if (!BC.MIA->isCall(Inst)) + // If this is an indirect call use perf data directly. 
+ if (!DstSym && BranchDataOrErr && + BC.MIA->hasAnnotation(Inst, "EdgeCountData")) { + const auto DataOffset = + BC.MIA->getAnnotationAs(Inst, "EdgeCountData"); + for (const auto &BI : BranchDataOrErr->getBranchRange(DataOffset)) { + Counts.push_back(getCallInfoFromBranchData(BI, false)); + } + } else { + const auto Count = BB->getExecutionCount(); + Counts.push_back(std::make_pair(DstSym, Count)); + } + + return Counts; + }; + + // If the function has an invalid profile, try to use the perf data + // directly (if requested). If there is no perf data for this function, + // fall back to the CFG walker which attempts to handle missing data. + if (!Function->hasValidProfile() && CgFromPerfData && BranchDataOrErr) { + DEBUG(dbgs() << "BOLT-DEBUG: buildCallGraph: Falling back to perf data" + << " for " << *Function << "\n"); + ++NumFallbacks; + for (const auto &BI : BranchDataOrErr->Data) { + Offset = Function->getAddress() + BI.From.Offset; + const auto CI = getCallInfoFromBranchData(BI, true); + if (!CI.first && CI.second == COUNT_NO_PROFILE) // probably a branch + continue; + ++TotalCallsites; + if (!recordCall(CI.first, CI.second)) { + ++NotProcessed; + } + } + } else { + for (auto *BB : Function->layout()) { + // Don't count calls from cold blocks unless requested. + if (BB->isCold() && !IncludeColdCalls) continue; - ++TotalCalls; - if (const auto *DstSym = BC.MIA->getTargetSymbol(Inst)) { - // For direct calls, just use the BB execution count. - const auto Count = UseEdgeCounts && BB->hasProfile() - ? BB->getExecutionCount() : 1; - if (!recordCall(DstSym, Count)) - ++NotProcessed; - } else if (BC.MIA->hasAnnotation(Inst, "EdgeCountData")) { - // For indirect calls and jump tables, use branch data. 
- if (!BranchDataOrErr) { - ++NotProcessed; - continue; + for (auto &Inst : *BB) { + if (!UseEdgeCounts) { + Offset += LastInstSize; + LastInstSize = BC.computeCodeSize(&Inst, &Inst + 1); } - const FuncBranchData &BranchData = BranchDataOrErr.get(); - const auto DataOffset = - BC.MIA->getAnnotationAs(Inst, "EdgeCountData"); - for (const auto &BI : BranchData.getBranchRange(DataOffset)) { - // Count each target as a separate call. - ++TotalCalls; + // Find call instructions and extract target symbols from each one. + if (!BC.MIA->isCall(Inst)) + continue; - if (!BI.To.IsSymbol) { - ++NotProcessed; - continue; - } + const auto CallInfo = getCallInfo(BB, Inst); - auto Itr = BC.GlobalSymbols.find(BI.To.Name); - if (Itr == BC.GlobalSymbols.end()) { - ++NotProcessed; - continue; - } - - const auto *DstSym = - BC.getOrCreateGlobalSymbol(Itr->second, "FUNCat"); + if (CallInfo.empty()) { + ++TotalCallsites; + ++NotProcessed; + continue; + } - if (!recordCall(DstSym, UseEdgeCounts ? BI.Branches : 1)) + for (const auto &CI : CallInfo) { + ++TotalCallsites; + if (!recordCall(CI.first, CI.second)) { ++NotProcessed; + } } } - - if (!UseEdgeCounts) { - Offset += BC.computeCodeSize(&Inst, &Inst + 1); - } } } } @@ -204,9 +260,13 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, bool PrintInfo = false; #endif if (PrintInfo || opts::Verbosity > 0) { - outs() << format("BOLT-INFO: buildCallGraph: %u nodes, density = %.6lf, " - "%u callsites not processed out of %u.\n", - Cg.numNodes(), Cg.density(), NotProcessed, TotalCalls); + outs() << format("BOLT-INFO: buildCallGraph: %u nodes, %u callsites " + "(%u recursive), density = %.6lf, %u callsites not " + "processed, %u callsites with invalid profile, " + "used perf data for %u stale functions.\n", + Cg.numNodes(), TotalCallsites, RecursiveCallsites, + Cg.density(), NotProcessed, NoProfileCallsites, + NumFallbacks); } return Cg; diff --git a/bolt/Passes/BinaryFunctionCallGraph.h b/bolt/Passes/BinaryFunctionCallGraph.h index 
abb03f9a1d2b..513bb0ef5415 100644 --- a/bolt/Passes/BinaryFunctionCallGraph.h +++ b/bolt/Passes/BinaryFunctionCallGraph.h @@ -70,9 +70,12 @@ inline bool NoFilter(const BinaryFunction &) { return false; } BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, std::map &BFs, CgFilterFunction Filter = NoFilter, + bool CgFromPerfData = false, bool IncludeColdCalls = true, bool UseFunctionHotSize = false, - bool UseEdgeCounts = false); + bool UseSplitHotSize = false, + bool UseEdgeCounts = false, + bool IgnoreRecursiveCalls = false); } } diff --git a/bolt/Passes/ReorderFunctions.cpp b/bolt/Passes/ReorderFunctions.cpp index a7dd44c036e2..5f5ed717c73b 100644 --- a/bolt/Passes/ReorderFunctions.cpp +++ b/bolt/Passes/ReorderFunctions.cpp @@ -82,6 +82,29 @@ UseEdgeCounts("use-edge-counts", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +static cl::opt +CgFromPerfData("cg-from-perf-data", + cl::desc("use perf data directly when constructing the call graph" + " for stale functions"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +CgIgnoreRecursiveCalls("cg-ignore-recursive-calls", + cl::desc("ignore recursive calls when constructing the call graph"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +CgUseSplitHotSize("cg-use-split-hot-size", + cl::desc("use hot/cold data on basic blocks to determine hot sizes for " + "call graph functions"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + static llvm::cl::opt UseGainCache("hfsort+-use-cache", llvm::cl::desc("Use a cache for mergeGain results when computing hfsort+."), @@ -171,10 +194,19 @@ void ReorderFunctions::reorder(std::vector &&Clusters, uint64_t Dist = 0; uint64_t Calls = 0; for (auto Dst : Cg.successors(FuncId)) { + if (FuncId == Dst) // ignore recursive calls in stats + continue; const auto& Arc = *Cg.findArc(FuncId, Dst); const auto D = std::abs(FuncAddr[Arc.dst()] - (FuncAddr[FuncId] + Arc.avgCallOffset())); const auto W = 
Arc.weight(); + if (D < 64 && PrintDetailed && opts::Verbosity > 2) { + outs() << "BOLT-INFO: short (" << D << "B) call:\n" + << "BOLT-INFO: Src: " << *Cg.nodeIdToFunc(FuncId) << "\n" + << "BOLT-INFO: Dst: " << *Cg.nodeIdToFunc(Dst) << "\n" + << "BOLT-INFO: Weight = " << W << "\n" + << "BOLT-INFO: AvgOffset = " << Arc.avgCallOffset() << "\n"; + } Calls += W; if (D < 64) TotalCalls64B += W; if (D < 4096) TotalCalls4KB += W; @@ -266,9 +298,12 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC, [this](const BinaryFunction &BF) { return !shouldOptimize(BF) || !BF.hasProfile(); }, + opts::CgFromPerfData, false, // IncludeColdCalls opts::ReorderFunctionsUseHotSize, - opts::UseEdgeCounts); + opts::CgUseSplitHotSize, + opts::UseEdgeCounts, + opts::CgIgnoreRecursiveCalls); Cg.normalizeArcWeights(opts::UseEdgeCounts); } From 4a829275dc1581cc7dd5eec091695101b45f6d88 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 16 Jun 2017 20:04:43 -0700 Subject: [PATCH 282/904] [BOLT] Set local symbols in relocation mode to zero Summary: Strobelight is getting confused by local symbols that we do not update in relocation mode. These symbols were preserved by the linker in relocation mode in order to support emitting relocations against local labels, but they are unused. Issue a quick fix to this by detecting such symbols and setting their value to zero. This patch also fixes an issue with the symbol table that was assigning the wrong section index to symbols associated with the .text section. 
(cherry picked from commit 271fa2687ae678938e012c5ef264b6d92149aadf) --- bolt/RewriteInstance.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 60fed04e0854..545bcffb17b3 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -3000,7 +3000,8 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { // Write all section header entries while patching section references. for (uint64_t Index = 0; Index < SectionsToWrite.size(); ++Index) { auto &Section = SectionsToWrite[Index]; - if (Section.sh_addr <= NewTextSectionStartAddress && + if (Section.sh_flags & ELF::SHF_ALLOC && + Section.sh_addr <= NewTextSectionStartAddress && Section.sh_addr + Section.sh_size > NewTextSectionStartAddress) { NewTextSectionIndex = Index; } @@ -3050,6 +3051,22 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { if (NewSymbol.st_shndx < ELF::SHN_LORESERVE) { NewSymbol.st_shndx = NewSectionIndex[NewSymbol.st_shndx]; } + // Set to zero local syms in the text section that we didn't update + // and were preserved by the linker to support relocations against + // .text (t15274167). + if (opts::Relocs && NewSymbol.getType() == ELF::STT_NOTYPE && + NewSymbol.getBinding() == ELF::STB_LOCAL && + NewSymbol.st_size == 0) { + if (auto SecOrErr = + File->getELFFile()->getSection(NewSymbol.st_shndx)) { + auto Section = *SecOrErr; + if (Section->sh_type == ELF::SHT_PROGBITS && + Section->sh_flags & ELF::SHF_ALLOC && + Section->sh_flags & ELF::SHF_EXECINSTR) { + NewSymbol.st_value = 0; + } + } + } } if (opts::HotText) { From 2f32db6be00952fac0c78b913ca5ae5d6fb6c97a Mon Sep 17 00:00:00 2001 From: Bohan Ren Date: Tue, 13 Jun 2017 16:29:39 -0700 Subject: [PATCH 283/904] [BOLT] Call Distance Metric Summary: Designed a new metric, which shows 93.46% correlation with Cache Miss and 86% correlation with CPU Time. 
Definition: One can get all the traversal path for each function. And for each traversal, we will define a distance. The distance represents how far two connected basic blocks are. Therefore, for each traversal, I will go through the basic blocks one by one, until the end of the traversal and sum up the distance for the neighboring basic blocks. Distance between two connected basic blocks is the distance of the centers of two blocks in the binary file. (cherry picked from commit 01d8f5bd15ec99d8b467768fafdc8e8b39060f6d) --- bolt/CMakeLists.txt | 1 + bolt/CalcCacheMetrics.cpp | 166 ++++++++++++++++++++++++++++++++++++++ bolt/CalcCacheMetrics.h | 27 +++++++ bolt/RewriteInstance.cpp | 20 +++++ 4 files changed, 214 insertions(+) create mode 100644 bolt/CalcCacheMetrics.cpp create mode 100644 bolt/CalcCacheMetrics.h diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index c8e2f3e3c45b..0ddfb353a599 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -64,6 +64,7 @@ add_llvm_tool(llvm-bolt BinaryContext.cpp BinaryFunction.cpp BinaryPassManager.cpp + CalcCacheMetrics.cpp DataReader.cpp DebugData.cpp DWARFRewriter.cpp diff --git a/bolt/CalcCacheMetrics.cpp b/bolt/CalcCacheMetrics.cpp new file mode 100644 index 000000000000..b165483c0ea5 --- /dev/null +++ b/bolt/CalcCacheMetrics.cpp @@ -0,0 +1,166 @@ +//===------ CalcCacheMetrics.cpp - Calculate metrics of cache lines -------===// +// +// Functions to show metrics of cache lines +// +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + + +#include "BinaryBasicBlock.h" +#include "BinaryContext.h" +#include "BinaryFunction.h" +#include "BinaryPassManager.h" +#include "CalcCacheMetrics.h" +#include "Exceptions.h" +#include "RewriteInstance.h" +#include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCSectionELF.h" +#include + +using namespace llvm; +using 
namespace object; +using namespace bolt; +using Traversal = std::vector; + +namespace opts { + +extern cl::OptionCategory BoltOptCategory; + +} // namespace opts + + +namespace { + +/// Initialize and return a position map for binary basic blocks. +std::unordered_map +getPositionMap(const BinaryFunction &Function) { + std::unordered_map DistMap; + double CurrAddress = 0; + for (auto *BB : Function.layout()) { + uint64_t Size = BB->estimateSize(); + DistMap[BB] = CurrAddress + (double)Size / 2; + CurrAddress += Size; + } + return DistMap; +} + +/// Initialize and return a vector of traversals for a given function and its +/// entry point +std::vector getTraversals(const BinaryFunction &Function, + BinaryBasicBlock *BB) { + std::vector AllTraversals; + std::stack> Stack; + Stack.push(std::make_pair(BB, Traversal())); + std::unordered_set BBSet; + + while (!Stack.empty()) { + BinaryBasicBlock *CurrentBB = Stack.top().first; + Traversal PrevTraversal(Stack.top().second); + Stack.pop(); + + // Add current basic block into consideration + BBSet.insert(CurrentBB); + PrevTraversal.push_back(CurrentBB); + + if (CurrentBB->succ_empty()) { + AllTraversals.push_back(PrevTraversal); + continue; + } + + uint64_t SuccTotalCount = 0; + // Calculate total edges count of successors + for (auto BI = CurrentBB->branch_info_begin(); + BI != CurrentBB->branch_info_end(); ++BI) { + if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) { + SuccTotalCount += BI->Count; + } + } + if (SuccTotalCount == 0) { + AllTraversals.push_back(PrevTraversal); + continue; + } + + auto BI = CurrentBB->branch_info_begin(); + for (auto *SuccBB : CurrentBB->successors()) { + if (BBSet.find(SuccBB) == BBSet.end() && BI->Count != 0 && + BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) { + Stack.push(std::make_pair(SuccBB, PrevTraversal)); + } + ++BI; + } + } + + return AllTraversals; +} + +/// Given a traversal, return the sum of block distances along this traversal. 
+double +getTraversalLength(std::unordered_map &DistMap, + Traversal const &Path) { + if (Path.size() <= 1) { + return 0.0; + } + + double Length = 0.0; + BinaryBasicBlock *PrevBB = Path.front(); + for (auto BBI = std::next(Path.begin()); BBI != Path.end(); ++BBI) { + Length += std::abs(DistMap[*BBI] - DistMap[PrevBB]); + PrevBB = *BBI; + } + + return Length; +} + +/// Helper function of calcGraphDistance to go through the call traversals of +/// certain function and to calculate and record the length of each +/// traversal. +void graphDistHelper(std::vector &AllTraversals, + const BinaryFunction &Function, + std::unordered_map &TraversalMap, + uint64_t &TraversalCount) { + auto DistMap = getPositionMap(Function); + + for (auto const &Path : AllTraversals) { + TraversalMap[++TraversalCount] = getTraversalLength(DistMap, Path); + } +} +} + +void CalcCacheMetrics::calcGraphDistance( + const std::map &BinaryFunctions) { + + double TotalFuncValue = 0; + uint64_t FuncCount = 0; + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + + std::unordered_map TraversalMap; + uint64_t TraversalCount = 0; + for (auto *BB : Function.layout()) { + if (BB->isEntryPoint()) { + auto AllTraversals = getTraversals(Function, BB); + graphDistHelper(AllTraversals, Function, TraversalMap, TraversalCount); + } + } + + double TotalValue = 0; + for (auto const &Entry : TraversalMap) { + TotalValue += Entry.second; + } + + double AverageValue = + TraversalMap.empty() ? 
0 : (TotalValue * 1.0 / TraversalMap.size()); + TotalFuncValue += AverageValue; + ++FuncCount; + } + + outs() << format(" Sum of averages of traversal distance for all " + "functions is: %.2f\n", + TotalFuncValue) + << format(" There are %u functions in total\n", FuncCount) + << format(" On average, every traversal is %.2f long\n\n", + TotalFuncValue / FuncCount); +} diff --git a/bolt/CalcCacheMetrics.h b/bolt/CalcCacheMetrics.h new file mode 100644 index 000000000000..07ca4551e28f --- /dev/null +++ b/bolt/CalcCacheMetrics.h @@ -0,0 +1,27 @@ +//===- CalcCacheMetrics.h - Interface for metrics printing of cache lines --===// +// +// Functions to show metrics of cache lines +// +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_CALCCACHEMETRICS_H +#define LLVM_CALCCACHEMETRICS_H + +#include "BinaryFunction.h" +#include + +using namespace llvm; +using namespace object; +using namespace bolt; + +namespace CalcCacheMetrics { +/// Calculate average number of call distance for every graph traversal. 
+void calcGraphDistance( + const std::map &BinaryFunctions); +} + +#endif //LLVM_CALCCACHEMETRICS_H diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 545bcffb17b3..04285f0d78dc 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -14,6 +14,7 @@ #include "BinaryContext.h" #include "BinaryFunction.h" #include "BinaryPassManager.h" +#include "CalcCacheMetrics.h" #include "DataReader.h" #include "Exceptions.h" #include "RewriteInstance.h" @@ -73,6 +74,13 @@ extern cl::OptionCategory BoltOptCategory; extern cl::opt JumpTables; extern cl::opt ReorderFunctions; +static cl::opt +CalcCacheMetrics("calc-cache-metrics", + cl::desc("calculate metrics of cache lines"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + static cl::opt OutputFilename("o", cl::desc(""), @@ -811,6 +819,12 @@ void RewriteInstance::run() { emitFunctions(); } + if (opts::CalcCacheMetrics) { + outs() << "\nBOLT-INFO: After Optimization Call Graph Statistics: Call " + "Distance \n\n"; + CalcCacheMetrics::calcGraphDistance(BinaryFunctions); + } + if (opts::UpdateDebugSections) updateDebugInfo(); @@ -1858,6 +1872,12 @@ void RewriteInstance::disassembleFunctions() { } } } + + if (opts::CalcCacheMetrics) { + outs() << "\nBOLT-INFO: Before Optimization Call Graph Statistics: Call " + "Distance \n\n"; + CalcCacheMetrics::calcGraphDistance(BinaryFunctions); + } } void RewriteInstance::runOptimizationPasses() { From 20fe8710ec7549287a771ff76e01fb5bf5292468 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Thu, 22 Jun 2017 16:34:01 -0700 Subject: [PATCH 284/904] [BOLT] Fix shrink-wrapping bugs Summary: Make shrink-wrapping more stable. Changes: * Correctly detect landing pads at the dominance frontier, bailing on such cases because we are not prepared to split LPs that are target of a critical edge. 
* Disable FOP's store removal by default - this is experimental and shouldn't go to prod because removing a store that we failed to detect is actually necessary is disastrous. This pass currently doesn't have a great impact on the number of stores reduced, so it is not a problem. Most stores reduced are due to shrink wrapping anyway. * Fix stack access identification - correctly estimate memory length of weird instructions, bail if we don't know. * Make rules for shrink-wrapping more strict: cancel shrink wrapping on a number of cases when we are not 100% sure that we are dealing with a regular callee-saved register. * Add basic block folding to SW. Sometimes when splitting critical edges we create a lot of redundant BBs with the same instructions, same successor but different predecessor. Fold all identical BBs created by splitting critical edges. * Change defaults: now the threshold used to determine when to perform SW is more conservative, to be sure we are moving a spill to a colder area. This effort, along with BB folding, helps us to avoid hurting icache performance by indiscriminately increasing code size. (cherry picked from commit bdf80b5af12fedce0edd37e14c62b3468da65070) --- bolt/BinaryContext.h | 3 +- bolt/Passes/FrameAnalysis.cpp | 13 +- bolt/Passes/FrameOptimizer.cpp | 11 +- bolt/Passes/ShrinkWrapping.cpp | 212 +++++++++++++++++++++++------- bolt/Passes/ShrinkWrapping.h | 4 + bolt/Passes/StackReachingUses.cpp | 16 +++ bolt/Passes/StackReachingUses.h | 11 ++ 7 files changed, 218 insertions(+), 52 deletions(-) diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 427b94582b5d..55bfded435c9 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -278,7 +278,8 @@ class BinaryContext { uint64_t getHotThreshold() const { static uint64_t Threshold{0}; if (Threshold == 0) { - Threshold = NumProfiledFuncs ? SumExecutionCount / NumProfiledFuncs : 1; + Threshold = + NumProfiledFuncs ? 
SumExecutionCount / (2 * NumProfiledFuncs) : 1; } return Threshold; } diff --git a/bolt/Passes/FrameAnalysis.cpp b/bolt/Passes/FrameAnalysis.cpp index 38362e21cc34..348888e8cfde 100644 --- a/bolt/Passes/FrameAnalysis.cpp +++ b/bolt/Passes/FrameAnalysis.cpp @@ -106,14 +106,17 @@ class FrameAccessAnalysis { MCPhysReg Reg{0}; int64_t StackOffset{0}; bool IsIndexed{false}; - if (!BC.MIA->isStackAccess( - Inst, FIE.IsLoad, FIE.IsStore, FIE.IsStoreFromReg, Reg, SrcImm, - FIE.StackPtrReg, StackOffset, FIE.Size, FIE.IsSimple, IsIndexed)) { + if (!BC.MIA->isStackAccess(*BC.MRI, Inst, FIE.IsLoad, FIE.IsStore, + FIE.IsStoreFromReg, Reg, SrcImm, FIE.StackPtrReg, + StackOffset, FIE.Size, FIE.IsSimple, + IsIndexed)) { return true; } - if (IsIndexed) { - DEBUG(dbgs() << "Giving up on indexed memory access in the frame\n"); + if (IsIndexed || FIE.Size == 0) { + DEBUG(dbgs() << "Giving up on indexed memory access/unknown size\n"); + DEBUG(dbgs() << "Blame insn: "); + DEBUG(Inst.dump()); return false; } diff --git a/bolt/Passes/FrameOptimizer.cpp b/bolt/Passes/FrameOptimizer.cpp index 196dfe749951..30b3c8410e9e 100644 --- a/bolt/Passes/FrameOptimizer.cpp +++ b/bolt/Passes/FrameOptimizer.cpp @@ -40,6 +40,15 @@ FrameOptimization("frame-opt", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +cl::opt +RemoveStores("frame-opt-rm-stores", + cl::init(FOP_NONE), + cl::desc("apply additional analysis to remove stores (experimental)"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + + } // namespace opts namespace llvm { @@ -243,7 +252,7 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC, NamedRegionTimer T1("remove loads", "FOP breakdown", opts::TimeOpts); removeUnnecessaryLoads(RA, FA, BC, I.second); } - { + if (opts::RemoveStores) { NamedRegionTimer T1("remove stores", "FOP breakdown", opts::TimeOpts); removeUnusedStores(FA, BC, I.second); } diff --git a/bolt/Passes/ShrinkWrapping.cpp b/bolt/Passes/ShrinkWrapping.cpp index 47636c1c6019..99afc5e4312c 100644 --- 
a/bolt/Passes/ShrinkWrapping.cpp +++ b/bolt/Passes/ShrinkWrapping.cpp @@ -26,7 +26,7 @@ static cl::opt ShrinkWrappingThreshold( cl::desc("Percentage of prologue execution count to use as threshold when" " evaluating whether a block is cold enough to be profitable to" " move eligible spills there"), - cl::init(40), cl::ZeroOrMore, cl::cat(BoltOptCategory)); + cl::init(30), cl::ZeroOrMore, cl::cat(BoltOptCategory)); } namespace llvm { @@ -36,6 +36,7 @@ void CalleeSavedAnalysis::analyzeSaves() { ReachingDefOrUse &RD = Info.getReachingDefs(); StackReachingUses &SRU = Info.getStackReachingUses(); auto &InsnToBB = Info.getInsnToBBMap(); + BitVector BlacklistedRegs(BC.MRI->getNumRegs(), false); DEBUG(dbgs() << "Checking spill locations\n"); for (auto &BB : BF) { @@ -43,14 +44,26 @@ void CalleeSavedAnalysis::analyzeSaves() { const MCInst *Prev = nullptr; for (auto &Inst : BB) { if (auto FIE = FA.getFIEFor(Inst)) { - if (!FIE->IsStore || !FIE->IsSimple || !FIE->IsStoreFromReg || - FIE->StackOffset >= 0) { + // Blacklist weird stores we don't understand + if ((!FIE->IsSimple || FIE->StackOffset >= 0) && FIE->IsStore && + FIE->IsStoreFromReg) { + BlacklistedRegs.set(FIE->RegOrImm); + CalleeSaved.reset(FIE->RegOrImm); + Prev = &Inst; + continue; + } + + if (!FIE->IsStore || !FIE->IsStoreFromReg || + BlacklistedRegs[FIE->RegOrImm]) { Prev = &Inst; continue; } + // If this reg is defined locally, it is not a callee-saved reg if (RD.isReachedBy(FIE->RegOrImm, Prev ? RD.expr_begin(*Prev) : RD.expr_begin(BB))) { + BlacklistedRegs.set(FIE->RegOrImm); + CalleeSaved.reset(FIE->RegOrImm); Prev = &Inst; continue; } @@ -61,13 +74,32 @@ void CalleeSavedAnalysis::analyzeSaves() { if (SRU.isStoreUsed(*FIE, Prev ? 
SRU.expr_begin(*Prev) : SRU.expr_begin(BB)), /*IncludeLocalAccesses=*/false) { + BlacklistedRegs.set(FIE->RegOrImm); + CalleeSaved.reset(FIE->RegOrImm); + Prev = &Inst; + continue; + } + + // If this stack position is loaded elsewhere in another reg, we can't + // update it, so blacklist it. + if (SRU.isLoadedInDifferentReg(*FIE, Prev ? SRU.expr_begin(*Prev) + : SRU.expr_begin(BB))) { + BlacklistedRegs.set(FIE->RegOrImm); + CalleeSaved.reset(FIE->RegOrImm); + Prev = &Inst; + continue; + } + + // Ignore regs with multiple saves + if (CalleeSaved[FIE->RegOrImm]) { + BlacklistedRegs.set(FIE->RegOrImm); + CalleeSaved.reset(FIE->RegOrImm); Prev = &Inst; continue; } CalleeSaved.set(FIE->RegOrImm); - if (SaveFIEByReg[FIE->RegOrImm] == nullptr) - SaveFIEByReg[FIE->RegOrImm] = &*FIE; + SaveFIEByReg[FIE->RegOrImm] = &*FIE; SavingCost[FIE->RegOrImm] += InsnToBB[&Inst]->getKnownExecutionCount(); BC.MIA->addAnnotation(BC.Ctx.get(), Inst, getSaveTag(), FIE->RegOrImm); OffsetsByReg[FIE->RegOrImm] = FIE->StackOffset; @@ -88,8 +120,7 @@ void CalleeSavedAnalysis::analyzeRestores() { for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { auto &Inst = *I; if (auto FIE = FA.getFIEFor(Inst)) { - if (!FIE->IsLoad || !FIE->IsSimple || !CalleeSaved[FIE->RegOrImm] || - FIE->StackOffset >= 0) { + if (!FIE->IsLoad || !CalleeSaved[FIE->RegOrImm]) { Prev = &Inst; continue; } @@ -97,8 +128,12 @@ void CalleeSavedAnalysis::analyzeRestores() { // If this reg is used locally after a restore, then we are probably // not dealing with a callee-saved reg. Except if this use is by // another store, but we don't cover this case yet. - if (RU.isReachedBy(FIE->RegOrImm, + // Also not callee-saved if this load accesses caller stack or isn't + // simple. + if (!FIE->IsSimple || FIE->StackOffset >= 0 || + RU.isReachedBy(FIE->RegOrImm, Prev ? 
RU.expr_begin(*Prev) : RU.expr_begin(BB))) { + CalleeSaved.reset(FIE->RegOrImm); Prev = &Inst; continue; } @@ -568,9 +603,9 @@ void StackLayoutModifier::performChanges() { bool IsStoreFromReg{false}; uint8_t Size{0}; bool Success{false}; - Success = BC.MIA->isStackAccess(Inst, IsLoad, IsStore, IsStoreFromReg, - Reg, SrcImm, StackPtrReg, StackOffset, - Size, IsSimple, IsIndexed); + Success = BC.MIA->isStackAccess(*BC.MRI, Inst, IsLoad, IsStore, + IsStoreFromReg, Reg, SrcImm, StackPtrReg, + StackOffset, Size, IsSimple, IsIndexed); assert(Success && IsSimple && !IsIndexed && (!IsStore || IsStoreFromReg)); if (StackPtrReg != BC.MIA->getFramePointer()) Adjustment = -Adjustment; @@ -851,11 +886,13 @@ ShrinkWrapping::doRestorePlacement(MCInst *BestPosSave, unsigned CSR, CritEdgesFrom.emplace_back(FrontierBB); CritEdgesTo.emplace_back(0); auto &Dests = CritEdgesTo.back(); - bool MayNeedLPSplitting{false}; // Check for invoke instructions at the dominance frontier, which indicates // the landing pad is not dominated. - if (PP.isInst() && BC.MIA->isInvoke(*PP.getInst())) - MayNeedLPSplitting = true; + if (PP.isInst() && BC.MIA->isInvoke(*PP.getInst())) { + DEBUG(dbgs() << "Bailing on restore placement to avoid LP splitting\n"); + Frontier.clear(); + return Frontier; + } doForAllSuccs(*FrontierBB, [&](ProgramPoint P) { if (!DA.doesADominateB(*BestPosSave, P)) { Dests.emplace_back(Info.getParentBB(P)); @@ -863,12 +900,6 @@ ShrinkWrapping::doRestorePlacement(MCInst *BestPosSave, unsigned CSR, } HasCritEdges = true; }); - // This confirms LP splitting is necessary to continue. Bail. 
- if (MayNeedLPSplitting && Dests.empty()) { - DEBUG(dbgs() << "Bailing on restore placement to avoid LP splitting\n"); - Frontier.clear(); - return Frontier; - } IsCritEdge.push_back(HasCritEdges); } if (std::accumulate(IsCritEdge.begin(), IsCritEdge.end(), 0)) { @@ -1095,6 +1126,9 @@ void ShrinkWrapping::scheduleSaveRestoreInsertions( void ShrinkWrapping::moveSaveRestores() { bool DisablePushPopMode{false}; bool UsedPushPopMode{false}; + // Keeps info about successfully moved regs: reg index, save position and + // save size + std::vector> MovedRegs; for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) { MCInst *BestPosSave{nullptr}; @@ -1132,25 +1166,12 @@ void ShrinkWrapping::moveSaveRestores() { scheduleOldSaveRestoresRemoval(I, UsePushPops); scheduleSaveRestoreInsertions(I, BestPosSave, RestorePoints, UsePushPops); - - // Schedule modifications to stack-accessing instructions via - // StackLayoutModifier - if (UsePushPops) { - for (MCInst *Save : CSA.getSavesByReg(I)) { - SLM.collapseRegion(Save); - } - SLM.insertRegion(BestPosSave, SaveSize); - } - - // Stats collection - if (UsePushPops) - ++SpillsMovedPushPopMode; - else - ++SpillsMovedRegularMode; + MovedRegs.emplace_back(std::make_tuple(I, BestPosSave, SaveSize)); } // Revert push-pop mode if it failed for a single CSR if (DisablePushPopMode && UsedPushPopMode) { + UsedPushPopMode = false; for (auto &BB : BF) { auto WRI = Todo.find(&BB); if (WRI != Todo.end()) { @@ -1176,6 +1197,86 @@ void ShrinkWrapping::moveSaveRestores() { } } } + + // Update statistics + if (!UsedPushPopMode) { + SpillsMovedRegularMode += MovedRegs.size(); + return; + } + + // Schedule modifications to stack-accessing instructions via + // StackLayoutModifier. 
+ SpillsMovedPushPopMode += MovedRegs.size(); + for (auto &I : MovedRegs) { + unsigned RegNdx; + MCInst *SavePos; + size_t SaveSize; + std::tie(RegNdx, SavePos, SaveSize) = I; + for (MCInst *Save : CSA.getSavesByReg(RegNdx)) { + SLM.collapseRegion(Save); + } + SLM.insertRegion(SavePos, SaveSize); + } +} + +namespace { +/// Helper function to identify whether two basic blocks created by splitting +/// a critical edge have the same contents. +bool isIdenticalSplitEdgeBB(const BinaryBasicBlock &A, + const BinaryBasicBlock &B) { + if (A.succ_size() != B.succ_size()) + return false; + if (A.succ_size() != 1) + return false; + + if (*A.succ_begin() != *B.succ_begin()) + return false; + + if (A.size() != B.size()) + return false; + + // Compare instructions + auto I = A.begin(), E = A.end(); + auto OtherI = B.begin(), OtherE = B.end(); + while (I != E && OtherI != OtherE) { + if (I->getOpcode() != OtherI->getOpcode()) + return false; + if (!I->equals(*OtherI, + [](const MCSymbol *A, const MCSymbol *B) { return true; })) + return false; + ++I; + ++OtherI; + } + return true; +} +} + +bool ShrinkWrapping::foldIdenticalSplitEdges() { + bool Changed{false}; + for (auto Iter = BF.begin(); Iter != BF.end(); ++Iter) { + BinaryBasicBlock &BB = *Iter; + if (!BB.getName().startswith(".LSplitEdge")) + continue; + for (auto RIter = BF.rbegin(); RIter != BF.rend(); ++RIter) { + BinaryBasicBlock &RBB = *RIter; + if (&RBB == &BB) + break; + if (!RBB.getName().startswith(".LSplitEdge") || + !RBB.isValid() || + !isIdenticalSplitEdgeBB(*Iter, RBB)) + continue; + assert(RBB.pred_size() == 1 && "Invalid split edge BB"); + BinaryBasicBlock *Pred = *RBB.pred_begin(); + uint64_t OrigCount{Pred->branch_info_begin()->Count}; + uint64_t OrigMispreds{Pred->branch_info_begin()->MispredictedCount}; + Pred->replaceSuccessor(&RBB, &BB, OrigCount, OrigMispreds); + Changed = true; + // Remove the block from CFG + RBB.markValid(false); + } + } + + return Changed; } namespace { @@ -1275,11 +1376,11 @@ void 
ShrinkWrapping::insertUpdatedCFI(unsigned CSR, int SPValPush, bool IsSimple{false}; bool IsStoreFromReg{false}; uint8_t Size{0}; - if (!BC.MIA->isStackAccess(*InstIter, IsLoad, IsStore, IsStoreFromReg, - Reg, SrcImm, StackPtrReg, StackOffset, Size, - IsSimple, IsIndexed)) + if (!BC.MIA->isStackAccess(*BC.MRI, *InstIter, IsLoad, IsStore, + IsStoreFromReg, Reg, SrcImm, StackPtrReg, + StackOffset, Size, IsSimple, IsIndexed)) continue; - if (Reg != CSR || !IsStore) + if (Reg != CSR || !IsStore || !IsSimple) continue; SavePoint = &*InstIter; break; @@ -1317,11 +1418,20 @@ void ShrinkWrapping::insertUpdatedCFI(unsigned CSR, int SPValPush, } } } - if (InAffectedZoneAtBegin != PrevAffectedZone) { + // Are we at the hot-cold split point? + if (BF.isSplit() && PrevBB && BB->isCold() != PrevBB->isCold()) { if (InAffectedZoneAtBegin) { - insertCFIsForPushOrPop(*PrevBB, PrevBB->end(), CSR, true, 0, SPValPush); - } else { - insertCFIsForPushOrPop(*PrevBB, PrevBB->end(), CSR, false, 0, SPValPop); + insertCFIsForPushOrPop(*BB, BB->begin(), CSR, true, 0, SPValPush); + } + } else { + if (InAffectedZoneAtBegin != PrevAffectedZone) { + if (InAffectedZoneAtBegin) { + insertCFIsForPushOrPop(*PrevBB, PrevBB->end(), CSR, true, 0, + SPValPush); + } else { + insertCFIsForPushOrPop(*PrevBB, PrevBB->end(), CSR, false, 0, + SPValPop); + } } } PrevAffectedZone = InAffectedZoneAtEnd; @@ -1360,10 +1470,16 @@ void ShrinkWrapping::rebuildCFIForSP() { SPVal = CurVal; } } - if (SPValAtBegin != PrevSPVal) { + if (BF.isSplit() && PrevBB && BB->isCold() != PrevBB->isCold()) { BF.addCFIInstruction( - PrevBB, PrevBB->end(), + BB, BB->begin(), MCCFIInstruction::createDefCfaOffset(nullptr, SPValAtBegin)); + } else { + if (SPValAtBegin != PrevSPVal) { + BF.addCFIInstruction( + PrevBB, PrevBB->end(), + MCCFIInstruction::createDefCfaOffset(nullptr, SPValAtBegin)); + } } PrevSPVal = SPValAtEnd; PrevBB = BB; @@ -1699,6 +1815,12 @@ void ShrinkWrapping::perform() { if (!processInsertions()) return; 
processDeletions(); + if (foldIdenticalSplitEdges()) { + const auto Stats = BF.eraseInvalidBBs(); + DEBUG(dbgs() << "Deleted " << Stats.first << " redundant split edge BBs (" + << Stats.second << " bytes) for " << BF.getPrintName() + << "\n"); + } rebuildCFI(); // We may have split edges, creating BBs that need correct branching BF.fixBranches(); diff --git a/bolt/Passes/ShrinkWrapping.h b/bolt/Passes/ShrinkWrapping.h index 7c28dea5ba47..91549124148b 100644 --- a/bolt/Passes/ShrinkWrapping.h +++ b/bolt/Passes/ShrinkWrapping.h @@ -397,6 +397,10 @@ class ShrinkWrapping { /// by computeSaveLocations(). void moveSaveRestores(); + /// Compare multiple basic blocks created by splitting critical edges. If they + /// have the same contents and successor, fold them into one. + bool foldIdenticalSplitEdges(); + /// After the spill locations for reg \p CSR has been moved and all affected /// CFI has been removed, insert new updated CFI information for these /// locations. diff --git a/bolt/Passes/StackReachingUses.cpp b/bolt/Passes/StackReachingUses.cpp index a7a91e92b06a..6efa48db1f90 100644 --- a/bolt/Passes/StackReachingUses.cpp +++ b/bolt/Passes/StackReachingUses.cpp @@ -16,6 +16,22 @@ namespace llvm { namespace bolt { +bool StackReachingUses::isLoadedInDifferentReg(const FrameIndexEntry &StoreFIE, + ExprIterator Candidates) const { + for (auto I = Candidates; I != expr_end(); ++I) { + const MCInst *ReachingInst = *I; + if (auto FIEY = FA.getFIEFor(*ReachingInst)) { + assert(FIEY->IsLoad == 1); + if (StoreFIE.StackOffset + StoreFIE.Size > FIEY->StackOffset && + StoreFIE.StackOffset < FIEY->StackOffset + FIEY->Size && + StoreFIE.RegOrImm != FIEY->RegOrImm) { + return true; + } + } + } + return false; +} + bool StackReachingUses::isStoreUsed(const FrameIndexEntry &StoreFIE, ExprIterator Candidates, bool IncludeLocalAccesses) const { diff --git a/bolt/Passes/StackReachingUses.h b/bolt/Passes/StackReachingUses.h index 84a46754c748..bc34db3961c9 100644 --- 
a/bolt/Passes/StackReachingUses.h +++ b/bolt/Passes/StackReachingUses.h @@ -36,6 +36,17 @@ class StackReachingUses : InstrsDataflowAnalysis(BC, BF), FA(FA) {} virtual ~StackReachingUses() {} + /// Return true if the stack position written by the store in \p StoreFIE was + /// later consumed by a load to a different register (not the same one used in + /// the store). Useful for identifying loads/stores of callee-saved regs. + bool isLoadedInDifferentReg(const FrameIndexEntry &StoreFIE, + ExprIterator Candidates) const; + + /// Answer whether the stack position written by the store represented in + /// \p StoreFIE is loaded from or consumed in any way. The set of all + /// relevant expressions reaching this store should be in \p Candidates. + /// If \p IncludeLocalAccesses is false, we only consider whether there is + /// a callee that consumes this stack position. bool isStoreUsed(const FrameIndexEntry &StoreFIE, ExprIterator Candidates, bool IncludeLocalAccesses = true) const; From b9fd6817b74297c9ee00552e53495de0ffe2c95d Mon Sep 17 00:00:00 2001 From: Bohan Ren Date: Tue, 4 Jul 2017 15:59:29 -0700 Subject: [PATCH 285/904] [BOLT] Improved Jump-Distance Metric Summary: Current existing Jump-Distance Metric (Previously named Call-Distance) will ignore some traversals. This modified version adds those missing traversals back. The correlation remains the same: around 97% correlation with CPU and Cache Miss (which implies that even though some traversals are ignored, it doesn't affect correlation that much.)
(cherry picked from commit a3dadce99dbb64603ec60508e3e7c7f0a0f3eb53) --- bolt/CalcCacheMetrics.cpp | 21 +++++++++++++-------- bolt/RewriteInstance.cpp | 4 ++-- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/bolt/CalcCacheMetrics.cpp b/bolt/CalcCacheMetrics.cpp index b165483c0ea5..4e9e81d7ce2e 100644 --- a/bolt/CalcCacheMetrics.cpp +++ b/bolt/CalcCacheMetrics.cpp @@ -50,10 +50,10 @@ getPositionMap(const BinaryFunction &Function) { /// Initialize and return a vector of traversals for a given function and its /// entry point std::vector getTraversals(const BinaryFunction &Function, - BinaryBasicBlock *BB) { + BinaryBasicBlock *EntryBB) { std::vector AllTraversals; std::stack> Stack; - Stack.push(std::make_pair(BB, Traversal())); + Stack.push(std::make_pair(EntryBB, Traversal())); std::unordered_set BBSet; while (!Stack.empty()) { @@ -70,23 +70,28 @@ std::vector getTraversals(const BinaryFunction &Function, continue; } - uint64_t SuccTotalCount = 0; + bool HaveSuccCount = false; // Calculate total edges count of successors for (auto BI = CurrentBB->branch_info_begin(); BI != CurrentBB->branch_info_end(); ++BI) { - if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) { - SuccTotalCount += BI->Count; + if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) { + HaveSuccCount = true; + break; } } - if (SuccTotalCount == 0) { + if (!HaveSuccCount) { AllTraversals.push_back(PrevTraversal); continue; } auto BI = CurrentBB->branch_info_begin(); for (auto *SuccBB : CurrentBB->successors()) { - if (BBSet.find(SuccBB) == BBSet.end() && BI->Count != 0 && - BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) { + // If we have never seen SuccBB, or SuccBB indicates the + // end of traversal, SuccBB will be added into stack for + // further exploring. 
+ if ((BBSet.find(SuccBB) == BBSet.end() && BI->Count != 0 && + BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) || + SuccBB->succ_empty()) { Stack.push(std::make_pair(SuccBB, PrevTraversal)); } ++BI; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 04285f0d78dc..007f934af846 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -820,7 +820,7 @@ void RewriteInstance::run() { } if (opts::CalcCacheMetrics) { - outs() << "\nBOLT-INFO: After Optimization Call Graph Statistics: Call " + outs() << "\nBOLT-INFO: After Optimization CFG Graph Statistics: Jump " "Distance \n\n"; CalcCacheMetrics::calcGraphDistance(BinaryFunctions); } @@ -1874,7 +1874,7 @@ void RewriteInstance::disassembleFunctions() { } if (opts::CalcCacheMetrics) { - outs() << "\nBOLT-INFO: Before Optimization Call Graph Statistics: Call " + outs() << "\nBOLT-INFO: Before Optimization CFG Graph Statistics: Jump " "Distance \n\n"; CalcCacheMetrics::calcGraphDistance(BinaryFunctions); } From 26824103af7ef9c0e2e28913afe1d89e90c9674c Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 27 Jun 2017 16:25:59 -0700 Subject: [PATCH 286/904] [BOLT] Add cold symbols to the symbol table Summary: Create new .symtab and .strtab sections, so we can change their sizes and not only patch them. Remove local symbols and add symbols to identify the cold part of split functions. 
(cherry picked from commit 9e0f10a0bb39147600b60ccb47b7357b04c47e7c) --- bolt/RewriteInstance.cpp | 267 ++++++++++++++++++++++++++++----------- bolt/RewriteInstance.h | 55 +++++--- 2 files changed, 230 insertions(+), 92 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 007f934af846..91dde9dda7d4 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -403,6 +403,7 @@ size_t padFunction(const BinaryFunction &Function) { } // namespace opts constexpr const char *RewriteInstance::SectionsToOverwrite[]; +constexpr const char *RewriteInstance::SectionsToOverwriteRelocMode[]; const std::string RewriteInstance::OrgSecPrefix = ".bolt.org"; @@ -2719,10 +2720,6 @@ void RewriteInstance::rewriteNoteSections() { Size = appendPadding(OS, Size, Section.sh_addralign); } - if (Section.sh_type == ELF::SHT_SYMTAB) { - NewSymTabOffset = NextAvailableOffset; - } - // Address of extension to the section. uint64_t Address{0}; @@ -2853,47 +2850,52 @@ void RewriteInstance::addBoltInfoSection() { } } -// Rewrite section header table inserting new entries as needed. The sections -// header table size itself may affect the offsets of other sections, -// so we are placing it at the end of the binary. -// -// As we rewrite entries we need to track how many sections were inserted -// as it changes the sh_link value. We map old indices to new ones for -// existing sections. -// -// The following are assumptions about file modifications: -// * There are no modifications done to address and/or size of existing -// allocatable sections. -// * All new allocatable sections are written immediately after existing -// allocatable sections. -// * There could be modifications done to non-allocatable sections, e.g. -// size could be increased. -// * New non-allocatable sections are added to the end of the file. 
-template -void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { - using Elf_Shdr = typename ELFObjectFile::Elf_Shdr; - +// Provide a mapping of the existing input binary sections to the output binary +// section header table. +// Return the map from the section header old index to its new index. Optionally +// return in OutputSections an ordered list of the output sections. This is +// optional because for reference updating in the symbol table we only need the +// map of input to output indices, not the real output section list. +template +std::vector +RewriteInstance::getOutputSections(ELFObjectFile *File, + std::vector *OutputSections) { auto *Obj = File->getELFFile(); - auto &OS = Out->os(); - - std::vector SectionsToWrite; - NewSectionIndex.resize(Obj->getNumSections()); + std::vector NewSectionIndex(Obj->getNumSections(), 0); + NewTextSectionIndex = 0; + uint32_t CurIndex{0}; // Copy over entries for original allocatable sections with minor // modifications (e.g. name). for (auto &Section : Obj->sections()) { // Always ignore this section. if (Section.sh_type == ELF::SHT_NULL) { - NewSectionIndex[0] = SectionsToWrite.size(); - SectionsToWrite.emplace_back(Section); + NewSectionIndex[0] = CurIndex++; + if (OutputSections) + OutputSections->emplace_back(Section); continue; } + // Is this our new text? Then update our pointer indicating the new output + // text section + if (opts::UseOldText && Section.sh_flags & ELF::SHF_ALLOC && + Section.sh_addr <= NewTextSectionStartAddress && + Section.sh_addr + Section.sh_size > NewTextSectionStartAddress) { + NewTextSectionIndex = CurIndex; + } + // Skip non-allocatable sections. 
if (!(Section.sh_flags & ELF::SHF_ALLOC)) continue; + NewSectionIndex[std::distance(Obj->section_begin(), &Section)] = + CurIndex++; + + // If only computing the map, we're done with this iteration + if (!OutputSections) + continue; + ErrorOr SectionName = Obj->getSectionName(&Section); check_error(SectionName.getError(), "cannot get section name"); @@ -2910,12 +2912,16 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { NewSection.sh_name = SHStrTab.getOffset(*SectionName); } - NewSectionIndex[std::distance(Obj->section_begin(), &Section)] = - SectionsToWrite.size(); - SectionsToWrite.emplace_back(NewSection); + OutputSections->emplace_back(NewSection); + } + + // If we are creating our own .text section, it should be the first section + // we created in EFMM->SectionMapInfo, so this is the correct index. + if (!opts::UseOldText) { + NewTextSectionIndex = CurIndex; } - // Create entries for new allocatable sections. + // Process entries for all new allocatable sections. 
for (auto &SMII : EFMM->SectionMapInfo) { const auto &SectionName = SMII.first; const auto &SI = SMII.second; @@ -2926,9 +2932,15 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { << SMII.first << '\n'; continue; } + + ++CurIndex; + // If only computing the map, we're done with this iteration + if (!OutputSections) + continue; + if (opts::Verbosity >= 1) outs() << "BOLT-INFO: writing section header for " << SectionName << '\n'; - Elf_Shdr NewSection; + ELFShdrTy NewSection; NewSection.sh_name = SHStrTab.getOffset(SectionName); NewSection.sh_type = ELF::SHT_PROGBITS; NewSection.sh_addr = SI.FileAddress; @@ -2939,7 +2951,7 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { NewSection.sh_link = 0; NewSection.sh_info = 0; NewSection.sh_addralign = SI.Alignment; - SectionsToWrite.emplace_back(NewSection); + OutputSections->emplace_back(NewSection); } uint64_t LastFileOffset = 0; @@ -2955,6 +2967,13 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { if (Section.sh_type == ELF::SHT_RELA) continue; + NewSectionIndex[std::distance(Obj->section_begin(), &Section)] = + CurIndex++; + + // If only computing the map, we're done with this iteration + if (!OutputSections) + continue; + ErrorOr SectionName = Obj->getSectionName(&Section); check_error(SectionName.getError(), "cannot get section name"); @@ -2968,13 +2987,15 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { NewSection.sh_size = SI.Size; NewSection.sh_name = SHStrTab.getOffset(*SectionName); - NewSectionIndex[std::distance(Obj->section_begin(), &Section)] = - SectionsToWrite.size(); - SectionsToWrite.emplace_back(NewSection); + OutputSections->emplace_back(NewSection); LastFileOffset = SI.FileOffset; } + // Map input -> output is ready. Early return if that's all we need. + if (!OutputSections) + return NewSectionIndex; + // Create entries for new non-allocatable sections. 
for (auto &SII : EFMM->NoteSectionInfo) { const auto &SectionName = SII.first; @@ -2985,7 +3006,7 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { if (opts::Verbosity >= 1) outs() << "BOLT-INFO: writing section header for " << SectionName << '\n'; - Elf_Shdr NewSection; + ELFShdrTy NewSection; NewSection.sh_name = SHStrTab.getOffset(SectionName); NewSection.sh_type = (SI.IsStrTab ? ELF::SHT_STRTAB : ELF::SHT_PROGBITS); NewSection.sh_addr = 0; @@ -2996,12 +3017,40 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { NewSection.sh_link = 0; NewSection.sh_info = 0; NewSection.sh_addralign = SI.Alignment ? SI.Alignment : 1; - SectionsToWrite.emplace_back(NewSection); + OutputSections->emplace_back(NewSection); } + return NewSectionIndex; +} + +// Rewrite section header table inserting new entries as needed. The sections +// header table size itself may affect the offsets of other sections, +// so we are placing it at the end of the binary. +// +// As we rewrite entries we need to track how many sections were inserted +// as it changes the sh_link value. We map old indices to new ones for +// existing sections. +// +// The following are assumptions about file modifications: +// * There are no modifications done to address and/or size of existing +// allocatable sections. +// * All new allocatable sections are written immediately after existing +// allocatable sections. +// * There could be modifications done to non-allocatable sections, e.g. +// size could be increased. +// * New non-allocatable sections are added to the end of the file. +template +void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { + using Elf_Shdr = typename ELFObjectFile::Elf_Shdr; + std::vector OutputSections; + auto &OS = Out->os(); + auto *Obj = File->getELFFile(); + + auto NewSectionIndex = getOutputSections(File, &OutputSections); + // Sort sections by their offset prior to writing. 
Only newly created sections // were unsorted, hence this wouldn't ruin indices in NewSectionIndex. - std::stable_sort(SectionsToWrite.begin(), SectionsToWrite.end(), + std::stable_sort(OutputSections.begin(), OutputSections.end(), [] (Elf_Shdr A, Elf_Shdr B) { return A.sh_offset < B.sh_offset; }); @@ -3018,13 +3067,8 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { SHTOffset = appendPadding(OS, SHTOffset, sizeof(Elf_Shdr)); // Write all section header entries while patching section references. - for (uint64_t Index = 0; Index < SectionsToWrite.size(); ++Index) { - auto &Section = SectionsToWrite[Index]; - if (Section.sh_flags & ELF::SHF_ALLOC && - Section.sh_addr <= NewTextSectionStartAddress && - Section.sh_addr + Section.sh_size > NewTextSectionStartAddress) { - NewTextSectionIndex = Index; - } + for (uint64_t Index = 0; Index < OutputSections.size(); ++Index) { + auto &Section = OutputSections[Index]; Section.sh_link = NewSectionIndex[Section.sh_link]; if (Section.sh_type == ELF::SHT_REL || Section.sh_type == ELF::SHT_RELA) { if (Section.sh_info) @@ -3043,7 +3087,7 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { NewEhdr.e_phoff = PHDRTableOffset; NewEhdr.e_phnum = Phnum; NewEhdr.e_shoff = SHTOffset; - NewEhdr.e_shnum = SectionsToWrite.size(); + NewEhdr.e_shnum = OutputSections.size(); NewEhdr.e_shstrndx = NewSectionIndex[NewEhdr.e_shstrndx]; OS.pwrite(reinterpret_cast(&NewEhdr), sizeof(NewEhdr), 0); } @@ -3054,26 +3098,54 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { return; auto *Obj = File->getELFFile(); - auto &OS = Out->os(); + // Set pointer at the end of the output file, so we can pwrite old symbol + // tables if we need to. 
+ uint64_t NextAvailableOffset = getFileOffsetForAddress(NextAvailableAddress); + assert(NextAvailableOffset >= FirstNonAllocatableOffset && + "next available offset calculation failure"); + Out->os().seek(NextAvailableOffset); using Elf_Shdr = typename ELFObjectFile::Elf_Shdr; using Elf_Sym = typename ELFObjectFile::Elf_Sym; - auto updateSymbolTable = [&](uint64_t SymTabOffset, const Elf_Shdr *Section) { - auto StringSectionOrError = Obj->getStringTableForSymtab(*Section); + // Compute a preview of how section indices will change after rewriting, so + // we can properly update the symbol table. + auto NewSectionIndex = + getOutputSections(File, (std::vector *)nullptr); + + auto updateSymbolTable = [&](bool PatchExisting, const Elf_Shdr *Section, + std::function + Write, + std::function AddToStrTab) { + auto StringSection = *Obj->getStringTableForSymtab(*Section); + for (const Elf_Sym &Symbol : Obj->symbols(Section)) { auto NewSymbol = Symbol; - if (const auto *Function = getBinaryFunctionAtAddress(Symbol.st_value)) { + const auto *Function = getBinaryFunctionAtAddress(Symbol.st_value); + // Some section symbols may be mistakenly associated with the first + // function emitted in the section. Dismiss if it is a section symbol. 
+ if (Function && NewSymbol.getType() != ELF::STT_SECTION) { NewSymbol.st_value = Function->getOutputAddress(); NewSymbol.st_size = Function->getOutputSize(); NewSymbol.st_shndx = NewTextSectionIndex; + if (!PatchExisting && Function->isSplit()) { + auto NewColdSym = NewSymbol; + SmallVector Buf; + NewColdSym.st_name = AddToStrTab(Twine(*Symbol.getName(StringSection)) + .concat(".cold.0") + .toStringRef(Buf)); + NewColdSym.st_value = Function->cold().getAddress(); + NewColdSym.st_size = Function->cold().getImageSize(); + Write(0, reinterpret_cast(&NewColdSym), + sizeof(NewColdSym)); + } } else { if (NewSymbol.st_shndx < ELF::SHN_LORESERVE) { NewSymbol.st_shndx = NewSectionIndex[NewSymbol.st_shndx]; } - // Set to zero local syms in the text section that we didn't update + // Detect local syms in the text section that we didn't update and were preserved by the linker to support relocations against - // .text (t15274167). + // .text (t15274167). Remove them from the symtab. if (opts::Relocs && NewSymbol.getType() == ELF::STT_NOTYPE && NewSymbol.getBinding() == ELF::STB_LOCAL && NewSymbol.st_size == 0) { @@ -3083,6 +3155,11 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { if (Section->sh_type == ELF::SHT_PROGBITS && Section->sh_flags & ELF::SHF_ALLOC && Section->sh_flags & ELF::SHF_EXECINSTR) { + // This will cause the symbol to not be emitted if we are + // creating a new symtab from scratch instead of patching one. + if (!PatchExisting) + continue; + // If patching an existing symtab, patch this value to zero.
NewSymbol.st_value = 0; } } @@ -3098,16 +3175,14 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { return true; }; - auto SymbolName = Symbol.getName(*StringSectionOrError); + auto SymbolName = Symbol.getName(StringSection); assert(SymbolName && "cannot get symbol name"); if (*SymbolName == "__hot_start" || *SymbolName == "__hot_end") updateSymbolValue(*SymbolName); } - OS.pwrite(reinterpret_cast(&NewSymbol), - sizeof(NewSymbol), - SymTabOffset + - (&Symbol - Obj->symbol_begin(Section)) * sizeof(Elf_Sym)); + Write((&Symbol - Obj->symbol_begin(Section)) * sizeof(Elf_Sym), + reinterpret_cast(&NewSymbol), sizeof(NewSymbol)); } }; @@ -3120,9 +3195,14 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { } } assert(DynSymSection && "no dynamic symbol table found"); - updateSymbolTable(DynSymSection->sh_offset, DynSymSection); - - // Update regular symbol table. + updateSymbolTable(/*patch existing table?*/ true, DynSymSection, + [&](size_t Offset, const char *Buf, size_t Size) { + Out->os().pwrite(Buf, Size, + DynSymSection->sh_offset + Offset); + }, + [](StringRef) -> size_t { return 0; }); + + // (re)create regular symbol table. 
const Elf_Shdr *SymTabSection = nullptr; for (const auto &Section : Obj->sections()) { if (Section.sh_type == ELF::SHT_SYMTAB) { @@ -3134,8 +3214,41 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { errs() << "BOLT-WARNING: no symbol table found\n"; return; } - assert(NewSymTabOffset && "expected symbol table offset to be set"); - updateSymbolTable(NewSymTabOffset, SymTabSection); + + const Elf_Shdr *StrTabSection = *Obj->getSection(SymTabSection->sh_link); + std::string NewContents; + std::string NewStrTab = + File->getData().substr(StrTabSection->sh_offset, StrTabSection->sh_size); + auto SecName = *Obj->getSectionName(SymTabSection); + auto StrSecName = *Obj->getSectionName(StrTabSection); + + updateSymbolTable(/*patch existing table?*/false, SymTabSection, + [&](size_t Offset, const char *Buf, size_t Size) { + NewContents.append(Buf, Size); + }, [&](StringRef Str) { + size_t Idx = NewStrTab.size(); + NewStrTab.append(Str.data(), Str.size()); + NewStrTab.append(1, '\0'); + return Idx; + }); + + uint8_t *DataCopy = new uint8_t[NewContents.size()]; + memcpy(DataCopy, NewContents.data(), NewContents.size()); + EFMM->NoteSectionInfo[SecName] = + SectionInfo(reinterpret_cast(DataCopy), NewContents.size(), + /*Alignment*/ 1, + /*IsCode=*/false, + /*IsReadOnly=*/false, + /*IsLocal=*/false); + DataCopy = new uint8_t[NewStrTab.size()]; + memcpy(DataCopy, NewStrTab.data(), NewStrTab.size()); + EFMM->NoteSectionInfo[StrSecName] = + SectionInfo(reinterpret_cast(DataCopy), NewStrTab.size(), + /*Alignment*/ 1, + /*IsCode=*/false, + /*IsReadOnly=*/false, + /*IsLocal=*/false); + EFMM->NoteSectionInfo[StrSecName].IsStrTab = true; } template @@ -3432,6 +3545,11 @@ void RewriteInstance::rewriteFile() { // Finalize memory image of section string table. finalizeSectionStringTable(); + if (opts::Relocs) { + // Update symbol tables. + patchELFSymTabs(); + } + // Copy non-allocatable sections once allocatable part is finished. 
rewriteNoteSections(); @@ -3447,10 +3565,6 @@ void RewriteInstance::rewriteFile() { // Update ELF book-keeping info. patchELFSectionHeaderTable(); - // Update symbol tables. - if (opts::Relocs) - patchELFSymTabs(); - // TODO: we should find a way to mark the binary as optimized by us. Out->keep(); @@ -3555,9 +3669,16 @@ uint64_t RewriteInstance::getFileOffsetForAddress(uint64_t Address) const { } bool RewriteInstance::willOverwriteSection(StringRef SectionName) { - for (auto &OverwriteName : SectionsToOverwrite) { - if (SectionName == OverwriteName) - return true; + if (opts::Relocs) { + for (auto &OverwriteName : SectionsToOverwriteRelocMode) { + if (SectionName == OverwriteName) + return true; + } + } else { + for (auto &OverwriteName : SectionsToOverwrite) { + if (SectionName == OverwriteName) + return true; + } } auto SMII = EFMM->SectionMapInfo.find(SectionName); diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index f09c6ebb93a7..71e368de22f0 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -288,28 +288,27 @@ class RewriteInstance { orc::ObjectLinkingLayer<> OLT; /// ELF-specific part. TODO: refactor into new class. - #define ELF_FUNCTION(FUNC) \ - template \ - void FUNC(ELFObjectFile *Obj); \ - void FUNC() { \ - if (auto *ELF32LE = dyn_cast(InputFile)) \ - return FUNC(ELF32LE); \ - if (auto *ELF64LE = dyn_cast(InputFile)) \ - return FUNC(ELF64LE); \ - if (auto *ELF32BE = dyn_cast(InputFile)) \ - return FUNC(ELF32BE); \ - auto *ELF64BE = cast(InputFile); \ - return FUNC(ELF64BE); \ - } +#define ELF_FUNCTION(FUNC) \ + template void FUNC(ELFObjectFile *Obj); \ + void FUNC() { \ + if (auto *ELF32LE = dyn_cast(InputFile)) \ + return FUNC(ELF32LE); \ + if (auto *ELF64LE = dyn_cast(InputFile)) \ + return FUNC(ELF64LE); \ + if (auto *ELF32BE = dyn_cast(InputFile)) \ + return FUNC(ELF32BE); \ + auto *ELF64BE = cast(InputFile); \ + return FUNC(ELF64BE); \ + } /// Patch ELF book-keeping info. 
void patchELF(); void patchELFPHDRTable(); - /// Patch section header table. + /// Create section header table. ELF_FUNCTION(patchELFSectionHeaderTable); - /// Patch symbol tables. + /// Create the regular symbol table and patch dyn symbol tables. ELF_FUNCTION(patchELFSymTabs); /// Patch dynamic section/segment of ELF. @@ -324,6 +323,14 @@ class RewriteInstance { /// Finalize memory image of section header string table. ELF_FUNCTION(finalizeSectionStringTable); + /// Get a list of all the sections to include in the output binary along + /// with a map of input to output indices. + template ::Elf_Shdr> + std::vector + getOutputSections(ELFObjectFile *File, + std::vector *OutputSections); + /// Add a notes section containing the BOLT revision and command line options. void addBoltInfoSection(); @@ -386,6 +393,17 @@ class RewriteInstance { ".gdb_index", }; + static constexpr const char *SectionsToOverwriteRelocMode[] = { + ".shstrtab", + ".symtab", + ".strtab", + ".debug_aranges", + ".debug_line", + ".debug_loc", + ".debug_ranges", + ".gdb_index", + }; + /// Huge page size used for alignment. static constexpr unsigned PageAlign = 0x200000; @@ -450,10 +468,6 @@ class RewriteInstance { /// Maps section name -> patcher. std::map> SectionPatchers; - /// [old section index] -> [new section index] map. Used for adjusting - /// referenced section indices. - std::vector NewSectionIndex; - uint64_t NewTextSectionStartAddress{0}; uint64_t NewTextSectionIndex{0}; @@ -489,6 +503,9 @@ class RewriteInstance { /// Section header string table. 
StringTableBuilder SHStrTab; + /// A rewrite of strtab + std::string NewStrTab; + static const std::string OrgSecPrefix; static const std::string BOLTSecPrefix; From d7f87302e156100701baf5f7a0d150a685ea4c86 Mon Sep 17 00:00:00 2001 From: Yue Zhao Date: Tue, 13 Jun 2017 17:24:27 -0700 Subject: [PATCH 287/904] get analysis information of functions Summary: complete the StokeInfo pass, ignore previous arc diff (cherry picked from commit 742934c271e855fa26a71a45280f4ef7be38ada2) --- bolt/BinaryFunction.h | 11 +++ bolt/BinaryPassManager.cpp | 19 ++++ bolt/Passes/CMakeLists.txt | 1 + bolt/Passes/RegAnalysis.h | 2 +- bolt/Passes/StokeInfo.cpp | 194 +++++++++++++++++++++++++++++++++++++ bolt/Passes/StokeInfo.h | 133 +++++++++++++++++++++++++ 6 files changed, 359 insertions(+), 1 deletion(-) create mode 100644 bolt/Passes/StokeInfo.cpp create mode 100644 bolt/Passes/StokeInfo.h diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index ecd146078598..385b4ff6dd4d 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -923,6 +923,17 @@ class BinaryFunction { return BLI != nullptr; } + const BinaryLoopInfo &getLoopInfo() { + return *BLI.get(); + } + + bool isLoopFree() { + if (!hasLoopInfo()) { + calculateLoopInfo(); + } + return BLI->empty(); + } + /// Print loop information about the function. 
void printLoopInfo(raw_ostream &OS) const; diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 846d12f9ad53..06e76de036b9 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -15,6 +15,7 @@ #include "Passes/IndirectCallPromotion.h" #include "Passes/Inliner.h" #include "Passes/ReorderFunctions.h" +#include "Passes/StokeInfo.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" #include @@ -200,6 +201,20 @@ VerifyCFG("verify-cfg", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +static llvm::cl::opt +Stoke("stoke", + cl::desc("turn on the stoke analysis"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static llvm::cl::opt +PrintStoke("print-stoke", + cl::desc("print functions after stoke analysis"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + } // namespace opts namespace llvm { @@ -345,6 +360,10 @@ void BinaryFunctionPassManager::runAllPasses( llvm::make_unique(PrintSCTC), opts::SimplifyConditionalTailCalls); + // Add the StokeInfo pass, which extract functions for stoke optimization and + // get the liveness information for them + Manager.registerPass(llvm::make_unique(PrintStoke), opts::Stoke); + // This pass should always run last.* Manager.registerPass(llvm::make_unique(PrintFinalized)); diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt index b3114c2a05e6..b5a51a553074 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/Passes/CMakeLists.txt @@ -22,6 +22,7 @@ add_llvm_library(LLVMBOLTPasses StackAvailableExpressions.cpp StackPointerTracking.cpp StackReachingUses.cpp + StokeInfo.cpp ) include_directories( ${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt ) diff --git a/bolt/Passes/RegAnalysis.h b/bolt/Passes/RegAnalysis.h index dd802bcfb5f3..5a6b9306381c 100644 --- a/bolt/Passes/RegAnalysis.h +++ b/bolt/Passes/RegAnalysis.h @@ -44,6 +44,7 @@ class RegAnalysis { /// we know nothing about the function. 
void beConservative(BitVector &Result) const; +public: /// Compute the set of registers \p Func may read from during its execution. BitVector getFunctionUsedRegsList(const BinaryFunction *Func); @@ -53,7 +54,6 @@ class RegAnalysis { /// set of clobbered registers. BitVector getFunctionClobberList(const BinaryFunction *Func); -public: RegAnalysis(BinaryContext &BC, std::map &BFs, BinaryFunctionCallGraph &CG); diff --git a/bolt/Passes/StokeInfo.cpp b/bolt/Passes/StokeInfo.cpp new file mode 100644 index 000000000000..ebb729d5eecf --- /dev/null +++ b/bolt/Passes/StokeInfo.cpp @@ -0,0 +1,194 @@ +#include "StokeInfo.h" +#include "llvm/Support/Options.h" + +using namespace llvm; +using namespace bolt; + +namespace opts { + +cl::OptionCategory StokeOptCategory("STOKE generic options"); + +static cl::opt +StokeOutputDataFilename("stoke-data", + cl::desc(""), + cl::Optional, + cl::cat(StokeOptCategory)); +} + +namespace llvm { +namespace bolt { + +void dumpRegNameFromBitVec(const BitVector &RegV, const BinaryContext &BC) { + dbgs() << "\t "; + int RegIdx = RegV.find_first(); + while (RegIdx != -1) { + dbgs() << RegIdx << ":" << BC.MRI->getName(RegIdx) << " "; + RegIdx = RegV.find_next(RegIdx); + } + dbgs() << "\n"; +} + +void getRegNameFromBitVec(const BitVector &RegV, std::set &NameVec, + const BinaryContext &BC) { + int RegIdx = RegV.find_first(); + while (RegIdx != -1) { + dbgs() << RegIdx << BC.MRI->getName(RegIdx) << "<>"; + NameVec.insert(std::string(BC.MRI->getName(RegIdx))); + RegIdx = RegV.find_next(RegIdx); + } + dbgs() << "\n"; +} + +void StokeInfo::checkInstr(const BinaryContext &BC, const BinaryFunction &BF, + StokeFuncInfo &FuncInfo) { + + BitVector RegV(NumRegs, false); + for (auto *BB : BF.layout()) { + if (BB->empty()) { + continue; + } + for (auto &It : *BB) { + auto &InstDesc = BC.MII->get(It.getOpcode()); + if (InstDesc.isPseudo()) { + continue; + } + // skip function with exception handling yet + if (BC.MIA->isEHLabel(It) || BC.MIA->isInvoke(It) || 
BC.MIA->hasEHInfo(It)) { + outs() << "\t exception\n"; + FuncInfo.Omitted = true; + return; + } + if (BC.MIA->hasRIPOperand(It)) { + outs() << "\t rip operand\n"; + } + // check if this function contains call instruction + if (BC.MIA->isCall(It)) { + FuncInfo.HasCall = true; + const auto *TargetSymbol = BC.MIA->getTargetSymbol(It); + // if it is an indirect call, skip + if (TargetSymbol == nullptr) { + FuncInfo.Omitted = true; + return; + } else { + outs() << "\t calling " << TargetSymbol->getName() << "\n"; + } + } + // check if this function modify stack or heap + // TODO: more accurate analysis + auto IsPush = BC.MIA->isPush(It); + if (IsPush) { + FuncInfo.StackOut = true; + } + if (BC.MIA->isStore(It) && !IsPush && !BC.MIA->hasRIPOperand(It)) { + FuncInfo.HeapOut = true; + } + + } // end of for (auto &It : ...) + } // end of for (auto *BB : ...) +} + +bool StokeInfo::analyze(const BinaryContext &BC, BinaryFunction &BF, + DataflowInfoManager &DInfo, RegAnalysis &RA, + StokeFuncInfo &FuncInfo) { + + std::string Name = BF.getSymbol()->getName().str(); + + if (!BF.isSimple() || BF.isMultiEntry() || BF.empty()) { + return false; + } + outs() << " STOKE-INFO: analyzing function " << Name << "\n"; + + FuncInfo.FuncName = Name; + FuncInfo.Offset = BF.getFileOffset(); + FuncInfo.Size = BF.getMaxSize(); + FuncInfo.NumInstrs = BF.getNumNonPseudos(); + FuncInfo.IsLoopFree = BF.isLoopFree(); + FuncInfo.HotSize = BF.estimateHotSize(); + FuncInfo.TotalSize = BF.estimateSize(); + + if (!FuncInfo.IsLoopFree) { + auto &BLI = BF.getLoopInfo(); + FuncInfo.NumLoops = BLI.OuterLoops; + FuncInfo.MaxLoopDepth = BLI.MaximumDepth; + } + // early stop for large functions + if (FuncInfo.NumInstrs > 500) { + return false; + } + + BinaryBasicBlock &EntryBB = BF.front(); + assert(EntryBB.isEntryPoint() && "Weird, this block should be the entry block!"); + + dbgs() << "\t EntryBB offset: " << EntryBB.getInputOffset() << "\n"; + auto *FirstNonPseudo = EntryBB.getFirstNonPseudoInstr(); + if 
(!FirstNonPseudo) { + return false; + } + dbgs() << "\t " << BC.InstPrinter->getOpcodeName(FirstNonPseudo->getOpcode()) << "\n"; + + dbgs() << "\t [DefIn at entry point]\n\t "; + auto LiveInBV = *(DInfo.getLivenessAnalysis().getStateAt(FirstNonPseudo)); + LiveInBV &= DefaultDefInMask; + getRegNameFromBitVec(LiveInBV, FuncInfo.DefIn, BC); + + outs() << "\t [LiveOut at return point]\n\t "; + auto LiveOutBV = RA.getFunctionClobberList(&BF); + LiveOutBV &= DefaultLiveOutMask; + getRegNameFromBitVec(LiveOutBV, FuncInfo.LiveOut, BC); + + checkInstr(BC, BF, FuncInfo); + + outs() << " STOKE-INFO: end function \n"; + return true; +} + +void StokeInfo::runOnFunctions( + BinaryContext &BC, + std::map &BFs, + std::set &) { + outs() << "STOKE-INFO: begin of stoke pass\n"; + + std::ofstream Outfile; + if (!opts::StokeOutputDataFilename.empty()) { + Outfile.open(opts::StokeOutputDataFilename); + } else { + outs() << "STOKE-INFO: output file is required\n"; + return; + } + + // check some context meta data + outs() << "\tTarget: " << BC.TheTarget->getName() << "\n"; + outs() << "\tTripleName " << BC.TripleName << "\n"; + outs() << "\tgetNumRegs " << BC.MRI->getNumRegs() << "\n"; + + auto CG = buildCallGraph(BC, BFs); + RegAnalysis RA(BC, BFs, CG); + + NumRegs = BC.MRI->getNumRegs(); + assert(NumRegs > 0 && "STOKE-INFO: the target register number is incorrect!"); + + DefaultDefInMask.resize(NumRegs, false); + DefaultLiveOutMask.resize(NumRegs, false); + + BC.MIA->getDefaultDefIn(DefaultDefInMask, *BC.MRI); + BC.MIA->getDefaultLiveOut(DefaultLiveOutMask, *BC.MRI); + + dumpRegNameFromBitVec(DefaultDefInMask, BC); + dumpRegNameFromBitVec(DefaultLiveOutMask, BC); + + StokeFuncInfo FuncInfo; + // analyze all functions + FuncInfo.printCsvHeader(Outfile); + for (auto &BF : BFs) { + DataflowInfoManager DInfo(BC, BF.second, &RA/*RA.get()*/, nullptr); + FuncInfo.reset(); + if (analyze(BC, BF.second, DInfo, RA, FuncInfo)) { + FuncInfo.printData(Outfile); + } + } + + outs() << "STOKE-INFO: end 
of stoke pass\n"; +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/Passes/StokeInfo.h b/bolt/Passes/StokeInfo.h new file mode 100644 index 000000000000..067167469d63 --- /dev/null +++ b/bolt/Passes/StokeInfo.h @@ -0,0 +1,133 @@ +//===--- Passes/StokeInfo.h -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Pass to get information for functions for the Stoke Optimization +// To use the Stoke optimization technique to optimize the HHVM. +// This Pass solves the two major problems to use the Stoke program without +// proting its code: +// +// 1. Stoke works on function level, but it is only limited to relative +// small functions which are loop-free, call-free, exception-free, etc. +// +// 2. Stoke requires much information being manually provided, such as the +// register usages and memory modification, etc. +// +// This Pass analyzes all functions and get the required information into +// .csv file. Next, we use python scripts to process the file, filter +// out functions for optimization and automatically generate configure files. +// Finally, these configure files are feed to the Stoke to do the job. 
+//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_STOKEINFO_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_STOKEINFO_H + +#include +#include "BinaryPasses.h" +#include "DataflowInfoManager.h" + +namespace llvm { +namespace bolt { + +/// Structure to hold information needed by Stoke for a function +struct StokeFuncInfo { + std::string FuncName; + uint64_t Offset; + uint64_t Size; + uint64_t NumInstrs; + bool IsLoopFree; + unsigned NumLoops; + unsigned MaxLoopDepth; + uint64_t HotSize; + uint64_t TotalSize; + bool HasCall; + std::set DefIn; + std::set LiveOut; + bool HeapOut; + bool StackOut; + bool Omitted; + + StokeFuncInfo() { + reset(); + } + + void reset() { + FuncName = ""; + Offset = Size = NumInstrs = 0; + NumLoops = MaxLoopDepth = 0; + HotSize = TotalSize = 0; + IsLoopFree = HasCall = HeapOut = StackOut = Omitted = false; + DefIn.clear(); + LiveOut.clear(); + } + + void printCsvHeader(std::ofstream &Outfile) { + if (Outfile.is_open()) { + Outfile + << "FuncName,Offset,Size,NumInstrs," + << "IsLoopFree,NumLoops,MaxLoopDepth," + << "HotSize,TotalSize," + << "HasCall," + << "DefIn,LiveOut,HeapOut,StackOut,Omitted\n"; + } + } + + void printData(std::ofstream &Outfile) { + if (Outfile.is_open()) { + Outfile + << FuncName << "," << Offset << "," << Size << "," << NumInstrs << "," + << IsLoopFree << "," << NumLoops << "," << MaxLoopDepth << "," + << HotSize << "," << TotalSize << "," + << HasCall << ",{ "; + for (auto s : DefIn) { + Outfile << s << " "; + } + Outfile << "},{ "; + for (auto s : LiveOut) { + Outfile << s << " "; + } + Outfile << "}," << HeapOut << "," << StackOut << "," << Omitted << "\n"; + } + } +}; + +class StokeInfo : public BinaryFunctionPass { + +private: + // stoke --def_in option default value, for X86: + // rax, rcx, rdx, rsi, rdi, r8, r9, xmm0-xmm7 + BitVector DefaultDefInMask; + // --live_out option default value: rax, rdx, xmm0, xmm1 + BitVector DefaultLiveOutMask; + + 
uint16_t NumRegs; + +public: + StokeInfo(const cl::opt &PrintPass) : BinaryFunctionPass(PrintPass) {} + + const char *getName() const override { + return "stoke-get-stat"; + } + + void checkInstr(const BinaryContext &BC, const BinaryFunction &BF, + StokeFuncInfo &FuncInfo); + + /// Get all required information for the stoke optimization + bool analyze(const BinaryContext &BC, BinaryFunction &BF, + DataflowInfoManager &DInfo, RegAnalysis &RA, + StokeFuncInfo &FuncInfo); + + void runOnFunctions(BinaryContext &BC, std::map &BFs, + std::set &LargeFunctions) override; + +}; + +} // namespace bolt +} // namespace llvm + + +#endif From ca7cc93fb39e49e155b39b22bfce3ecc2609b69f Mon Sep 17 00:00:00 2001 From: Yue Zhao Date: Thu, 13 Jul 2017 15:02:52 -0700 Subject: [PATCH 288/904] add: get function score to find hot functions refine the dumped csv format Summary: minor modification of the bolt stoke pass (cherry picked from commit 2886c405b367b9c6e94578cd26262ae15fd71261) --- bolt/Passes/StokeInfo.cpp | 88 ++++++++++++++++++--------------------- bolt/Passes/StokeInfo.h | 27 +++++++++--- 2 files changed, 62 insertions(+), 53 deletions(-) diff --git a/bolt/Passes/StokeInfo.cpp b/bolt/Passes/StokeInfo.cpp index ebb729d5eecf..65434eb03aad 100644 --- a/bolt/Passes/StokeInfo.cpp +++ b/bolt/Passes/StokeInfo.cpp @@ -1,16 +1,18 @@ #include "StokeInfo.h" #include "llvm/Support/Options.h" +#undef DEBUG_TYPE +#define DEBUG_TYPE "stoke" + using namespace llvm; using namespace bolt; namespace opts { - -cl::OptionCategory StokeOptCategory("STOKE generic options"); +cl::OptionCategory StokeOptCategory("STOKE pass options"); static cl::opt -StokeOutputDataFilename("stoke-data", - cl::desc(""), +StokeOutputDataFilename("stoke-out", + cl::desc("output data for stoke's use"), cl::Optional, cl::cat(StokeOptCategory)); } @@ -18,25 +20,18 @@ StokeOutputDataFilename("stoke-data", namespace llvm { namespace bolt { -void dumpRegNameFromBitVec(const BitVector &RegV, const BinaryContext &BC) { - dbgs() 
<< "\t "; - int RegIdx = RegV.find_first(); - while (RegIdx != -1) { - dbgs() << RegIdx << ":" << BC.MRI->getName(RegIdx) << " "; - RegIdx = RegV.find_next(RegIdx); - } - dbgs() << "\n"; -} -void getRegNameFromBitVec(const BitVector &RegV, std::set &NameVec, - const BinaryContext &BC) { +void getRegNameFromBitVec(const BinaryContext &BC, const BitVector &RegV, + std::set *NameVec = nullptr) { int RegIdx = RegV.find_first(); while (RegIdx != -1) { - dbgs() << RegIdx << BC.MRI->getName(RegIdx) << "<>"; - NameVec.insert(std::string(BC.MRI->getName(RegIdx))); + DEBUG(dbgs() << BC.MRI->getName(RegIdx) << " "); + if (NameVec) { + NameVec->insert(std::string(BC.MRI->getName(RegIdx))); + } RegIdx = RegV.find_next(RegIdx); } - dbgs() << "\n"; + DEBUG(dbgs() << "\n"); } void StokeInfo::checkInstr(const BinaryContext &BC, const BinaryFunction &BF, @@ -54,13 +49,9 @@ void StokeInfo::checkInstr(const BinaryContext &BC, const BinaryFunction &BF, } // skip function with exception handling yet if (BC.MIA->isEHLabel(It) || BC.MIA->isInvoke(It) || BC.MIA->hasEHInfo(It)) { - outs() << "\t exception\n"; FuncInfo.Omitted = true; return; } - if (BC.MIA->hasRIPOperand(It)) { - outs() << "\t rip operand\n"; - } // check if this function contains call instruction if (BC.MIA->isCall(It)) { FuncInfo.HasCall = true; @@ -69,19 +60,21 @@ void StokeInfo::checkInstr(const BinaryContext &BC, const BinaryFunction &BF, if (TargetSymbol == nullptr) { FuncInfo.Omitted = true; return; - } else { - outs() << "\t calling " << TargetSymbol->getName() << "\n"; } } // check if this function modify stack or heap // TODO: more accurate analysis auto IsPush = BC.MIA->isPush(It); + auto IsRipAddr = BC.MIA->hasRIPOperand(It); if (IsPush) { FuncInfo.StackOut = true; } - if (BC.MIA->isStore(It) && !IsPush && !BC.MIA->hasRIPOperand(It)) { + if (BC.MIA->isStore(It) && !IsPush && !IsRipAddr) { FuncInfo.HeapOut = true; } + if (IsRipAddr) { + FuncInfo.HasRipAddr = true; + } } // end of for (auto &It : ...) 
} // end of for (auto *BB : ...) @@ -102,41 +95,42 @@ bool StokeInfo::analyze(const BinaryContext &BC, BinaryFunction &BF, FuncInfo.Offset = BF.getFileOffset(); FuncInfo.Size = BF.getMaxSize(); FuncInfo.NumInstrs = BF.getNumNonPseudos(); - FuncInfo.IsLoopFree = BF.isLoopFree(); - FuncInfo.HotSize = BF.estimateHotSize(); - FuncInfo.TotalSize = BF.estimateSize(); + // early stop for large functions + if (FuncInfo.NumInstrs > 500) { + return false; + } + FuncInfo.IsLoopFree = BF.isLoopFree(); if (!FuncInfo.IsLoopFree) { auto &BLI = BF.getLoopInfo(); FuncInfo.NumLoops = BLI.OuterLoops; FuncInfo.MaxLoopDepth = BLI.MaximumDepth; } - // early stop for large functions - if (FuncInfo.NumInstrs > 500) { - return false; - } + FuncInfo.HotSize = BF.estimateHotSize(); + FuncInfo.TotalSize = BF.estimateSize(); + FuncInfo.Score = BF.getFunctionScore(); + + checkInstr(BC, BF, FuncInfo); + + // register analysis BinaryBasicBlock &EntryBB = BF.front(); - assert(EntryBB.isEntryPoint() && "Weird, this block should be the entry block!"); + assert(EntryBB.isEntryPoint() && "Weird, this should be the entry block!"); - dbgs() << "\t EntryBB offset: " << EntryBB.getInputOffset() << "\n"; auto *FirstNonPseudo = EntryBB.getFirstNonPseudoInstr(); if (!FirstNonPseudo) { return false; } - dbgs() << "\t " << BC.InstPrinter->getOpcodeName(FirstNonPseudo->getOpcode()) << "\n"; - dbgs() << "\t [DefIn at entry point]\n\t "; + DEBUG(dbgs() << "\t [DefIn]\n\t "); auto LiveInBV = *(DInfo.getLivenessAnalysis().getStateAt(FirstNonPseudo)); LiveInBV &= DefaultDefInMask; - getRegNameFromBitVec(LiveInBV, FuncInfo.DefIn, BC); + getRegNameFromBitVec(BC, LiveInBV, &FuncInfo.DefIn); - outs() << "\t [LiveOut at return point]\n\t "; + DEBUG(dbgs() << "\t [LiveOut]\n\t "); auto LiveOutBV = RA.getFunctionClobberList(&BF); LiveOutBV &= DefaultLiveOutMask; - getRegNameFromBitVec(LiveOutBV, FuncInfo.LiveOut, BC); - - checkInstr(BC, BF, FuncInfo); + getRegNameFromBitVec(BC, LiveOutBV, &FuncInfo.LiveOut); outs() << " 
STOKE-INFO: end function \n"; return true; @@ -152,14 +146,14 @@ void StokeInfo::runOnFunctions( if (!opts::StokeOutputDataFilename.empty()) { Outfile.open(opts::StokeOutputDataFilename); } else { - outs() << "STOKE-INFO: output file is required\n"; + errs() << "STOKE-INFO: output file is required\n"; return; } // check some context meta data - outs() << "\tTarget: " << BC.TheTarget->getName() << "\n"; - outs() << "\tTripleName " << BC.TripleName << "\n"; - outs() << "\tgetNumRegs " << BC.MRI->getNumRegs() << "\n"; + DEBUG(dbgs() << "\tTarget: " << BC.TheTarget->getName() << "\n"); + DEBUG(dbgs() << "\tTripleName " << BC.TripleName << "\n"); + DEBUG(dbgs() << "\tgetNumRegs " << BC.MRI->getNumRegs() << "\n"); auto CG = buildCallGraph(BC, BFs); RegAnalysis RA(BC, BFs, CG); @@ -173,8 +167,8 @@ void StokeInfo::runOnFunctions( BC.MIA->getDefaultDefIn(DefaultDefInMask, *BC.MRI); BC.MIA->getDefaultLiveOut(DefaultLiveOutMask, *BC.MRI); - dumpRegNameFromBitVec(DefaultDefInMask, BC); - dumpRegNameFromBitVec(DefaultLiveOutMask, BC); + getRegNameFromBitVec(BC, DefaultDefInMask); + getRegNameFromBitVec(BC, DefaultLiveOutMask); StokeFuncInfo FuncInfo; // analyze all functions diff --git a/bolt/Passes/StokeInfo.h b/bolt/Passes/StokeInfo.h index 067167469d63..084947976e20 100644 --- a/bolt/Passes/StokeInfo.h +++ b/bolt/Passes/StokeInfo.h @@ -44,11 +44,13 @@ struct StokeFuncInfo { unsigned MaxLoopDepth; uint64_t HotSize; uint64_t TotalSize; + uint64_t Score; bool HasCall; std::set DefIn; std::set LiveOut; bool HeapOut; bool StackOut; + bool HasRipAddr; bool Omitted; StokeFuncInfo() { @@ -60,7 +62,14 @@ struct StokeFuncInfo { Offset = Size = NumInstrs = 0; NumLoops = MaxLoopDepth = 0; HotSize = TotalSize = 0; - IsLoopFree = HasCall = HeapOut = StackOut = Omitted = false; + Score = 0; + IsLoopFree + = HasCall + = HeapOut + = StackOut + = HasRipAddr + = Omitted + = false; DefIn.clear(); LiveOut.clear(); } @@ -71,8 +80,11 @@ struct StokeFuncInfo { << "FuncName,Offset,Size,NumInstrs," 
<< "IsLoopFree,NumLoops,MaxLoopDepth," << "HotSize,TotalSize," + << "Score," << "HasCall," - << "DefIn,LiveOut,HeapOut,StackOut,Omitted\n"; + << "DefIn,LiveOut,HeapOut,StackOut," + << "HasRipAddr," + << "Omitted\n"; } } @@ -82,15 +94,18 @@ struct StokeFuncInfo { << FuncName << "," << Offset << "," << Size << "," << NumInstrs << "," << IsLoopFree << "," << NumLoops << "," << MaxLoopDepth << "," << HotSize << "," << TotalSize << "," - << HasCall << ",{ "; + << Score << "," + << HasCall << ",\"{ "; for (auto s : DefIn) { - Outfile << s << " "; + Outfile << "%" << s << " "; } - Outfile << "},{ "; + Outfile << "}\",\"{ "; for (auto s : LiveOut) { Outfile << s << " "; } - Outfile << "}," << HeapOut << "," << StackOut << "," << Omitted << "\n"; + Outfile << "}\"," << HeapOut << "," << StackOut << "," + << HasRipAddr << "," + << Omitted << "\n"; } } }; From af08aebe77f1fc2402f0abb85e8185e142787ec4 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 25 Jul 2017 09:11:42 -0700 Subject: [PATCH 289/904] Recognize AArch64 as a valid input Summary: BOLT needs to be configured with the LLVM AArch64 backend. If the backend is linked into the LLVM library, start processing AArch64 binaries. (cherry picked from commit 95344c79f426759934c9b24763dfcc87d4a5b2c4) --- bolt/RewriteInstance.cpp | 52 ++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 91dde9dda7d4..8f65bc27a3c4 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -510,14 +510,24 @@ namespace { /// Create BinaryContext for a given architecture \p ArchName and /// triple \p TripleName. 
-std::unique_ptr createBinaryContext( - std::string ArchName, - std::string TripleName, - const DataReader &DR, - std::unique_ptr DwCtx) { +std::unique_ptr +createBinaryContext(ELFObjectFileBase *File, const DataReader &DR, + std::unique_ptr DwCtx) { + std::string ArchName; + std::string TripleName; + llvm::Triple::ArchType Arch = (llvm::Triple::ArchType)File->getArch(); + if (Arch == llvm::Triple::x86_64) { + ArchName = "x86-64"; + TripleName = "x86_64-unknown-linux"; + } else if (Arch == llvm::Triple::aarch64) { + ArchName = "aarch64"; + TripleName = "aarch64-unknown-linux"; + } else { + errs() << "BOLT-ERROR: Unrecognized machine in ELF file.\n"; + return nullptr; + } std::string Error; - std::unique_ptr TheTriple = llvm::make_unique(TripleName); const Target *TheTarget = TargetRegistry::lookupTarget(ArchName, *TheTriple, @@ -619,17 +629,13 @@ std::unique_ptr createBinaryContext( } // namespace -RewriteInstance::RewriteInstance(ELFObjectFileBase *File, - const DataReader &DR, - const int Argc, - const char *const *Argv) - : InputFile(File), - Argc(Argc), - Argv(Argv), - BC(createBinaryContext("x86-64", "x86_64-unknown-linux", DR, - std::unique_ptr( - new DWARFContextInMemory(*InputFile, nullptr, true)))) { -} +RewriteInstance::RewriteInstance(ELFObjectFileBase *File, const DataReader &DR, + const int Argc, const char *const *Argv) + : InputFile(File), Argc(Argc), Argv(Argv), + BC(createBinaryContext( + File, DR, + std::unique_ptr( + new DWARFContextInMemory(*InputFile, nullptr, true)))) {} RewriteInstance::~RewriteInstance() {} @@ -637,9 +643,10 @@ void RewriteInstance::reset() { BinaryFunctions.clear(); FileSymRefs.clear(); auto &DR = BC->DR; - BC = createBinaryContext("x86-64", "x86_64-unknown-linux", DR, - std::unique_ptr( - new DWARFContextInMemory(*InputFile, nullptr, true))); + BC = createBinaryContext( + InputFile, DR, + std::unique_ptr( + new DWARFContextInMemory(*InputFile, nullptr, true))); CFIRdWrt.reset(nullptr); EFMM.reset(nullptr); 
Out.reset(nullptr); @@ -765,6 +772,11 @@ void RewriteInstance::run() { unsigned PassNumber = 1; + outs() << "BOLT-INFO: Target architecture: " + << Triple::getArchTypeName( + (llvm::Triple::ArchType)InputFile->getArch()) + << "\n"; + // Main "loop". discoverStorage(); readSpecialSections(); From af8c02ededa475413e8710a36baba35fed88d866 Mon Sep 17 00:00:00 2001 From: Bohan Ren Date: Tue, 25 Jul 2017 16:27:00 -0700 Subject: [PATCH 290/904] [BOLT] Improve Jump-Distance Metric -- Consider Function Execution Count Summary: Function execution count is very important. When calculating metric, we should care more about functions which are known to be executed. The correlations between this metric and both CPU time is slightly improved to be close to 96% and the correlation between this metric and Cache Miss remains the same 96%. Thanks the suggestion from Sergey! (cherry picked from commit 5ba8d121ea72403203879d00ca7db8eddef0d5e8) --- bolt/CalcCacheMetrics.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bolt/CalcCacheMetrics.cpp b/bolt/CalcCacheMetrics.cpp index 4e9e81d7ce2e..166b3c3b7dd4 100644 --- a/bolt/CalcCacheMetrics.cpp +++ b/bolt/CalcCacheMetrics.cpp @@ -141,6 +141,9 @@ void CalcCacheMetrics::calcGraphDistance( uint64_t FuncCount = 0; for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; + // Only consider functions which are known to be executed + if (Function.getKnownExecutionCount() == 0) + continue; std::unordered_map TraversalMap; uint64_t TraversalCount = 0; @@ -159,7 +162,7 @@ void CalcCacheMetrics::calcGraphDistance( double AverageValue = TraversalMap.empty() ? 0 : (TotalValue * 1.0 / TraversalMap.size()); TotalFuncValue += AverageValue; - ++FuncCount; + FuncCount += TraversalMap.empty() ? 
0 : 1; } outs() << format(" Sum of averages of traversal distance for all " From 4b3edbe856f567a8abaec53221dc79ec98cf76fb Mon Sep 17 00:00:00 2001 From: Yue Zhao Date: Thu, 27 Jul 2017 12:52:56 -0700 Subject: [PATCH 291/904] Reformat the register strings in the output so Stoke can parse without preprocessing. Summary: Minor change. Reformat the def-in, live-out register strings so that Stoke can parse without doing preprocessing. (cherry picked from commit c25f19c2866e5e433c76eb4079b155d53110217a) --- bolt/Passes/StokeInfo.cpp | 7 ++++--- bolt/Passes/StokeInfo.h | 12 +++++++----- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/bolt/Passes/StokeInfo.cpp b/bolt/Passes/StokeInfo.cpp index 65434eb03aad..dda3982500e3 100644 --- a/bolt/Passes/StokeInfo.cpp +++ b/bolt/Passes/StokeInfo.cpp @@ -12,7 +12,7 @@ cl::OptionCategory StokeOptCategory("STOKE pass options"); static cl::opt StokeOutputDataFilename("stoke-out", - cl::desc("output data for stoke's use"), + cl::desc("output data (.csv) for Stoke's use"), cl::Optional, cl::cat(StokeOptCategory)); } @@ -80,7 +80,7 @@ void StokeInfo::checkInstr(const BinaryContext &BC, const BinaryFunction &BF, } // end of for (auto *BB : ...) 
} -bool StokeInfo::analyze(const BinaryContext &BC, BinaryFunction &BF, +bool StokeInfo::checkFunction(const BinaryContext &BC, BinaryFunction &BF, DataflowInfoManager &DInfo, RegAnalysis &RA, StokeFuncInfo &FuncInfo) { @@ -95,6 +95,7 @@ bool StokeInfo::analyze(const BinaryContext &BC, BinaryFunction &BF, FuncInfo.Offset = BF.getFileOffset(); FuncInfo.Size = BF.getMaxSize(); FuncInfo.NumInstrs = BF.getNumNonPseudos(); + FuncInfo.NumBlocks = BF.size(); // early stop for large functions if (FuncInfo.NumInstrs > 500) { return false; @@ -176,7 +177,7 @@ void StokeInfo::runOnFunctions( for (auto &BF : BFs) { DataflowInfoManager DInfo(BC, BF.second, &RA/*RA.get()*/, nullptr); FuncInfo.reset(); - if (analyze(BC, BF.second, DInfo, RA, FuncInfo)) { + if (checkFunction(BC, BF.second, DInfo, RA, FuncInfo)) { FuncInfo.printData(Outfile); } } diff --git a/bolt/Passes/StokeInfo.h b/bolt/Passes/StokeInfo.h index 084947976e20..d63a77668500 100644 --- a/bolt/Passes/StokeInfo.h +++ b/bolt/Passes/StokeInfo.h @@ -39,6 +39,7 @@ struct StokeFuncInfo { uint64_t Offset; uint64_t Size; uint64_t NumInstrs; + uint64_t NumBlocks; bool IsLoopFree; unsigned NumLoops; unsigned MaxLoopDepth; @@ -59,7 +60,7 @@ struct StokeFuncInfo { void reset() { FuncName = ""; - Offset = Size = NumInstrs = 0; + Offset = Size = NumInstrs = NumBlocks = 0; NumLoops = MaxLoopDepth = 0; HotSize = TotalSize = 0; Score = 0; @@ -77,7 +78,7 @@ struct StokeFuncInfo { void printCsvHeader(std::ofstream &Outfile) { if (Outfile.is_open()) { Outfile - << "FuncName,Offset,Size,NumInstrs," + << "FuncName,Offset,Size,NumInstrs,NumBlocks," << "IsLoopFree,NumLoops,MaxLoopDepth," << "HotSize,TotalSize," << "Score," @@ -91,7 +92,8 @@ struct StokeFuncInfo { void printData(std::ofstream &Outfile) { if (Outfile.is_open()) { Outfile - << FuncName << "," << Offset << "," << Size << "," << NumInstrs << "," + << FuncName << "," + << Offset << "," << Size << "," << NumInstrs << "," << NumBlocks << "," << IsLoopFree << "," << NumLoops << "," 
<< MaxLoopDepth << "," << HotSize << "," << TotalSize << "," << Score << "," @@ -101,7 +103,7 @@ struct StokeFuncInfo { } Outfile << "}\",\"{ "; for (auto s : LiveOut) { - Outfile << s << " "; + Outfile << "%" << s << " "; } Outfile << "}\"," << HeapOut << "," << StackOut << "," << HasRipAddr << "," @@ -132,7 +134,7 @@ class StokeInfo : public BinaryFunctionPass { StokeFuncInfo &FuncInfo); /// Get all required information for the stoke optimization - bool analyze(const BinaryContext &BC, BinaryFunction &BF, + bool checkFunction(const BinaryContext &BC, BinaryFunction &BF, DataflowInfoManager &DInfo, RegAnalysis &RA, StokeFuncInfo &FuncInfo); From 0f3f1d00f6e1aef97957cb72f4f1d31f0e5276ed Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 1 Aug 2017 11:19:01 -0700 Subject: [PATCH 292/904] [BOLT] Fix reading LSDA address for PIC code Summary: Fix a bug while reading LSDA address in PIC format. The base address was wrong for PC-relative value. There's more work involved in making PIC code with C++ exceptions work. (cherry picked from commit f71205559b19f52e229d25a56693cd1d0748616a) --- bolt/Exceptions.cpp | 4 ++-- bolt/RewriteInstance.cpp | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 4fd5e7494aed..110cda04168c 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -64,7 +64,7 @@ namespace bolt { // // The best visual representation of the tables comprising LSDA and // relationships between them is illustrated at: -// http://mentorembedded.github.io/cxx-abi/exceptions.pdf +// https://github.com/itanium-cxx-abi/cxx-abi/blob/master/exceptions.pdf // Keep in mind that GCC implementation deviates slightly from that document. 
// // To summarize, there are 4 tables in LSDA: call site table, actions table, @@ -145,7 +145,7 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, intptr_t MaxTypeIndexTableOffset = 0; // The actual type info table starts at the same location, but grows in - // different direction. Encoding is different too (TTypeEncoding). + // opposite direction. TTypeEncoding is used to encode stored values. auto TypeTableStart = reinterpret_cast(Ptr + TTypeEnd); uint8_t CallSiteEncoding = *Ptr++; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 8f65bc27a3c4..473f607d7101 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -3577,7 +3577,6 @@ void RewriteInstance::rewriteFile() { // Update ELF book-keeping info. patchELFSectionHeaderTable(); - // TODO: we should find a way to mark the binary as optimized by us. Out->keep(); // If requested, open again the binary we just wrote to dump its EH Frame From 18c449e214aa260a95dd9c30abae7210809fd0cb Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 17 Jul 2017 11:22:22 -0700 Subject: [PATCH 293/904] [BOLT] Better match LTO functions profile. Summary: * Improve profile matching for LTO binaries that don't match 100%. * Fix profile matching for '.LTHUNK*' functions. * Add external outgoing branches (calls) for profile validation. There's an improvement for 100% match profile and for stale LTO profile. However, we are still not fully closing the gap with stale profile when LTO is enabled. 
(NOTE: I haven't updated all test cases yet) (cherry picked from commit 34a0ab68177520c77f8968924eb4a665794e810f) --- bolt/BinaryFunction.cpp | 243 +++++++++++++++--------- bolt/BinaryFunction.h | 24 ++- bolt/DataReader.cpp | 66 ++++++- bolt/DataReader.h | 57 +++++- bolt/Passes/BinaryFunctionCallGraph.cpp | 10 +- bolt/Passes/IndirectCallPromotion.cpp | 17 +- bolt/RewriteInstance.cpp | 128 ++++++++----- bolt/RewriteInstance.h | 3 + 8 files changed, 375 insertions(+), 173 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 7de9ada88771..07132a56ae2d 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -961,7 +961,6 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { auto &Ctx = BC.Ctx; auto &MIA = BC.MIA; - auto BranchDataOrErr = BC.DR.getFuncBranchData(getNames()); DWARFUnitLineTable ULT = getDWARFUnitLineTable(); @@ -1220,7 +1219,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { MCSymbolRefExpr::VK_None, *Ctx))); - if (BranchDataOrErr) { + if (BranchData) { if (IsCall) { MIA->addAnnotation(Ctx.get(), Instruction, "EdgeCountData", Offset); } @@ -1243,7 +1242,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { auto Result = MIA->convertJmpToTailCall(Instruction); (void)Result; assert(Result); - if (BranchDataOrErr) { + if (BranchData) { MIA->addAnnotation(Ctx.get(), Instruction, "IndirectBranchData", Offset); } @@ -1259,7 +1258,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { // Keep processing. We'll do more checks and fixes in // postProcessIndirectBranches(). 
MaybeEdgeCountData = true; - if (BranchDataOrErr) { + if (BranchData) { MIA->addAnnotation(Ctx.get(), Instruction, "MaybeIndirectBranchData", @@ -1268,12 +1267,12 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { break; }; } else if (MIA->isCall(Instruction)) { - if (BranchDataOrErr) { + if (BranchData) { MIA->addAnnotation(Ctx.get(), Instruction, "IndirectBranchData", Offset); } } - if (BranchDataOrErr) { + if (BranchData) { const char* AttrName = MaybeEdgeCountData ? "MaybeEdgeCountData" : "EdgeCountData"; MIA->addAnnotation(Ctx.get(), Instruction, AttrName, Offset); @@ -1367,8 +1366,6 @@ void BinaryFunction::postProcessJumpTables() { } bool BinaryFunction::postProcessIndirectBranches() { - auto BranchDataOrErr = BC.DR.getFuncBranchData(getNames()); - for (auto *BB : layout()) { for (auto &Instr : *BB) { if (!BC.MIA->isIndirectBranch(Instr)) @@ -1527,13 +1524,6 @@ void BinaryFunction::recomputeLandingPads(const unsigned StartIndex, bool BinaryFunction::buildCFG() { auto &MIA = BC.MIA; - auto BranchDataOrErr = BC.DR.getFuncBranchData(getNames()); - if (!BranchDataOrErr) { - DEBUG(dbgs() << "no branch data found for \"" << *this << "\"\n"); - } else { - ExecutionCount = BranchDataOrErr->ExecutionCount; - } - if (!isSimple()) { assert(!opts::Relocs && "cannot process file with non-simple function in relocs mode"); @@ -1672,9 +1662,8 @@ bool BinaryFunction::buildCFG() { // e.g. exit(3), etc. Otherwise we'll see a false fall-through // blocks. - // Make sure we can use profile data for this function. - if (BranchDataOrErr) - evaluateProfileData(BranchDataOrErr.get()); + // Possibly assign/re-assign branch profile data. 
+ matchProfileData(); for (auto &Branch : TakenBranches) { DEBUG(dbgs() << "registering branch [0x" << Twine::utohexstr(Branch.first) @@ -1684,48 +1673,49 @@ bool BinaryFunction::buildCFG() { auto *ToBB = getBasicBlockAtOffset(Branch.second); assert(ToBB && "cannot find BB containing TO branch"); - if (BranchDataOrErr.getError()) { + if (!BranchData) { FromBB->addSuccessor(ToBB); - } else { - const FuncBranchData &BranchData = BranchDataOrErr.get(); - auto BranchInfoOrErr = BranchData.getBranch(Branch.first, Branch.second); - if (BranchInfoOrErr.getError()) { - FromBB->addSuccessor(ToBB); - } else { - const BranchInfo &BInfo = BranchInfoOrErr.get(); - FromBB->addSuccessor(ToBB, BInfo.Branches, BInfo.Mispreds); - // Populate profile counts for the jump table. - auto *LastInstr = FromBB->getLastNonPseudoInstr(); - if (!LastInstr) - continue; - auto JTAddress = BC.MIA->getJumpTable(*LastInstr); - if (!JTAddress) - continue; - auto *JT = getJumpTableContainingAddress(JTAddress); - if (!JT) - continue; - JT->Count += BInfo.Branches; - if (opts::IndirectCallPromotion < ICP_JUMP_TABLES && - opts::JumpTables < JTS_AGGRESSIVE) - continue; - if (JT->Counts.empty()) - JT->Counts.resize(JT->Entries.size()); - auto EI = JT->Entries.begin(); - auto Delta = (JTAddress - JT->Address) / JT->EntrySize; - EI += Delta; - while (EI != JT->Entries.end()) { - if (ToBB->getLabel() == *EI) { - assert(Delta < JT->Counts.size()); - JT->Counts[Delta].Mispreds += BInfo.Mispreds; - JT->Counts[Delta].Count += BInfo.Branches; - } - ++Delta; - ++EI; - // A label marks the start of another jump table. - if (JT->Labels.count(Delta * JT->EntrySize)) - break; - } + continue; + } + + auto BranchInfoOrErr = BranchData->getBranch(Branch.first, Branch.second); + if (!BranchInfoOrErr) { + FromBB->addSuccessor(ToBB); + continue; + } + + const BranchInfo &BInfo = BranchInfoOrErr.get(); + FromBB->addSuccessor(ToBB, BInfo.Branches, BInfo.Mispreds); + // Populate profile counts for the jump table. 
+ auto *LastInstr = FromBB->getLastNonPseudoInstr(); + if (!LastInstr) + continue; + auto JTAddress = BC.MIA->getJumpTable(*LastInstr); + if (!JTAddress) + continue; + auto *JT = getJumpTableContainingAddress(JTAddress); + if (!JT) + continue; + JT->Count += BInfo.Branches; + if (opts::IndirectCallPromotion < ICP_JUMP_TABLES && + opts::JumpTables < JTS_AGGRESSIVE) + continue; + if (JT->Counts.empty()) + JT->Counts.resize(JT->Entries.size()); + auto EI = JT->Entries.begin(); + auto Delta = (JTAddress - JT->Address) / JT->EntrySize; + EI += Delta; + while (EI != JT->Entries.end()) { + if (ToBB->getLabel() == *EI) { + assert(Delta < JT->Counts.size()); + JT->Counts[Delta].Mispreds += BInfo.Mispreds; + JT->Counts[Delta].Count += BInfo.Branches; } + ++Delta; + ++EI; + // A label marks the start of another jump table. + if (JT->Labels.count(Delta * JT->EntrySize)) + break; } } @@ -1762,9 +1752,8 @@ bool BinaryFunction::buildCFG() { // Does not add a successor if we can't find profile data, leave it to the // inference pass to guess its frequency - if (BranchDataOrErr) { - const FuncBranchData &BranchData = BranchDataOrErr.get(); - auto BranchInfoOrErr = BranchData.getBranch(Branch.first, Branch.second); + if (BranchData) { + auto BranchInfoOrErr = BranchData->getBranch(Branch.first, Branch.second); if (BranchInfoOrErr) { const BranchInfo &BInfo = BranchInfoOrErr.get(); FromBB->addSuccessor(ToBB, BInfo.Branches, BInfo.Mispreds); @@ -1774,9 +1763,8 @@ bool BinaryFunction::buildCFG() { for (auto &I : TailCallTerminatedBlocks) { TailCallInfo &TCInfo = I.second; - if (BranchDataOrErr) { - const FuncBranchData &BranchData = BranchDataOrErr.get(); - auto BranchInfoOrErr = BranchData.getDirectCallBranch(TCInfo.Offset); + if (BranchData) { + auto BranchInfoOrErr = BranchData->getDirectCallBranch(TCInfo.Offset); if (BranchInfoOrErr) { const BranchInfo &BInfo = BranchInfoOrErr.get(); TCInfo.Count = BInfo.Branches; @@ -1960,7 +1948,56 @@ void BinaryFunction::addEntryPoint(uint64_t 
Address) { } } -void BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { +void BinaryFunction::matchProfileData() { + if (BranchData) { + ProfileMatchRatio = evaluateProfileData(*BranchData); + if (ProfileMatchRatio == 1.0f) + return; + } + + // Check if the function name can fluctuate between several compilations + // possibly triggered by minor unrelated code changes in the source code + // of the input binary. + const auto HasVolatileName = [this]() { + for (const auto Name : getNames()) { + if (getLTOCommonName(Name)) + return true; + } + return false; + }(); + if (!HasVolatileName) + return; + + // Check for a profile that matches with 100% confidence. + const auto AllBranchData = BC.DR.getFuncBranchDataRegex(getNames()); + for (const auto *NewBranchData : AllBranchData) { + // Prevent functions from sharing the same profile. + if (NewBranchData->Used) + continue; + + if (evaluateProfileData(*NewBranchData) != 1.0f) + continue; + + if (BranchData) + BranchData->Used = false; + + // Update function profile data with the new set. + BranchData = NewBranchData; + ExecutionCount = NewBranchData->ExecutionCount; + ProfileMatchRatio = 1.0f; + BranchData->Used = true; + break; + } +} + +float BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { + // Until we define a minimal profile, we consider an empty branch data to be + // a valid profile. It could happen to a function without branches when we + // still have an EntryData for execution count. + if (BranchData.Data.empty()) { + return 1.0f; + } + BranchListType ProfileBranches(BranchData.Data.size()); std::transform(BranchData.Data.begin(), BranchData.Data.end(), @@ -1978,12 +2015,14 @@ void BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { return Branch.second != -1U; }); - // Until we define a minimal profile, we consider no branch data to be a valid - // profile. It could happen to a function without branches. 
- if (LocalProfileBranches.empty()) { - ProfileMatchRatio = 1.0f; - return; - } + // Profile referencing external functions. + BranchListType ExternProfileBranches; + std::copy_if(ProfileBranches.begin(), + ProfileBranches.end(), + std::back_inserter(ExternProfileBranches), + [](const std::pair &Branch) { + return Branch.second == -1U; + }); std::sort(LocalProfileBranches.begin(), LocalProfileBranches.end()); @@ -2014,7 +2053,9 @@ void BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { return false; // Check if it is a recursive call. - if (BC.MIA->isCall(SrcInstrI->second) && Branch.second == 0) + const auto &SrcInstr = SrcInstrI->second; + if ((BC.MIA->isCall(SrcInstr) || BC.MIA->isIndirectBranch(SrcInstr)) && + Branch.second == 0) return true; auto DstInstrI = Instructions.find(Branch.second); @@ -2022,9 +2063,9 @@ void BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { return false; // Check if it is a return from a recursive call. - bool IsSrcReturn = BC.MIA->isReturn(SrcInstrI->second); + bool IsSrcReturn = BC.MIA->isReturn(SrcInstr); // "rep ret" is considered to be 2 different instructions. - if (!IsSrcReturn && BC.MIA->isPrefix(SrcInstrI->second)) { + if (!IsSrcReturn && BC.MIA->isPrefix(SrcInstr)) { auto SrcInstrSuccessorI = SrcInstrI; ++SrcInstrSuccessorI; assert(SrcInstrSuccessorI != Instructions.end() && @@ -2046,15 +2087,37 @@ void BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { std::back_inserter(OrphanBranches), isRecursiveBranch); - ProfileMatchRatio = - (float) (LocalProfileBranches.size() - OrphanBranches.size()) / - (float) LocalProfileBranches.size(); + // Check all external branches. 
+ std::copy_if(ExternProfileBranches.begin(), + ExternProfileBranches.end(), + std::back_inserter(OrphanBranches), + [&](const std::pair &Branch) { + auto II = Instructions.find(Branch.first); + if (II == Instructions.end()) + return true; + const auto &Instr = II->second; + if (BC.MIA->isCall(Instr) || + BC.MIA->isIndirectBranch(Instr) || + BC.MIA->isReturn(Instr)) + return false; + // Check for "rep ret" + if (BC.MIA->isPrefix(Instr)) { + ++II; + if (II != Instructions.end() && BC.MIA->isReturn(II->second)) + return false; + } + return true; + }); + + float MatchRatio = + (float) (ProfileBranches.size() - OrphanBranches.size()) / + (float) ProfileBranches.size(); - if (opts::Verbosity >= 1 && !OrphanBranches.empty()) { + if (opts::Verbosity >= 2 && !OrphanBranches.empty()) { errs() << "BOLT-WARNING: profile branches match only " - << format("%.1f%%", ProfileMatchRatio * 100.0f) << " (" - << (LocalProfileBranches.size() - OrphanBranches.size()) << '/' - << LocalProfileBranches.size() << ") for function " + << format("%.1f%%", MatchRatio * 100.0f) << " (" + << (ProfileBranches.size() - OrphanBranches.size()) << '/' + << ProfileBranches.size() << ") for function " << *this << '\n'; DEBUG( for (auto &OBranch : OrphanBranches) @@ -2062,8 +2125,10 @@ void BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { << Twine::utohexstr(OBranch.second) << " (0x" << Twine::utohexstr(OBranch.first + getAddress()) << " -> 0x" << Twine::utohexstr(OBranch.second + getAddress()) << ")\n"; - ); + ); } + + return MatchRatio; } void BinaryFunction::clearProfile() { @@ -2081,8 +2146,7 @@ void BinaryFunction::clearProfile() { void BinaryFunction::inferFallThroughCounts() { assert(!BasicBlocks.empty() && "basic block list should not be empty"); - - auto BranchDataOrErr = BC.DR.getFuncBranchData(getNames()); + assert(BranchData && "cannot infer counts without branch data"); // Compute preliminary execution count for each basic block for (auto CurBB : BasicBlocks) { @@ 
-2106,13 +2170,10 @@ void BinaryFunction::inferFallThroughCounts() { } // Update execution counts of landing pad blocks. - if (!BranchDataOrErr.getError()) { - const FuncBranchData &BranchData = BranchDataOrErr.get(); - for (const auto &I : BranchData.EntryData) { - BinaryBasicBlock *BB = getBasicBlockAtOffset(I.To.Offset); - if (BB && LandingPads.find(BB->getLabel()) != LandingPads.end()) { - BB->setExecutionCount(BB->getExecutionCount() + I.Branches); - } + for (const auto &I : BranchData->EntryData) { + BinaryBasicBlock *BB = getBasicBlockAtOffset(I.To.Offset); + if (BB && LandingPads.find(BB->getLabel()) != LandingPads.end()) { + BB->setExecutionCount(BB->getExecutionCount() + I.Branches); } } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 385b4ff6dd4d..a35f706dc313 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -312,8 +312,11 @@ class BinaryFunction { /// The profile data for the number of times the function was executed. uint64_t ExecutionCount{COUNT_NO_PROFILE}; - /// Profile match ration. - float ProfileMatchRatio{0.0}; + /// Profile data for branches. + const FuncBranchData *BranchData{nullptr}; + + /// Profile match ratio for BranchData. + float ProfileMatchRatio{0.0f}; /// Score of the function (estimated number of instructions executed, /// according to profile data). -1 if the score has not been calculated yet. @@ -1278,7 +1281,8 @@ class BinaryFunction { } /// Add new names this function is known under. - void addNewNames(const std::vector &NewNames) { + template + void addNewNames(const ContainterTy &NewNames) { Names.insert(Names.begin(), NewNames.begin(), NewNames.end()); } @@ -1709,9 +1713,19 @@ class BinaryFunction { /// cannot be statically evaluated for any given indirect branch. bool postProcessIndirectBranches(); + /// Find the best matching profile for a function after the creation of basic + /// blocks. 
+ void matchProfileData(); + /// Check how closely the profile data matches the function and set - /// ProfileMatchRatio to reflect the accuracy. - void evaluateProfileData(const FuncBranchData &BranchData); + /// Return accuracy (ranging from 0.0 to 1.0) of matching. + float evaluateProfileData(const FuncBranchData &BranchData); + + /// Return profile data associated with this function, or nullptr if the + /// function has no associated profile. + const FuncBranchData *getBranchData() const { + return BranchData; + } /// Walks the list of basic blocks filling in missing information about /// edge frequency for fall-throughs. diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index 20d5e4e4d987..936c30dd0f81 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -14,11 +14,33 @@ #include "DataReader.h" +#include "llvm/Support/Debug.h" #include namespace llvm { namespace bolt { +Optional getLTOCommonName(const StringRef Name) { + auto LTOSuffixPos = Name.find(".lto_priv."); + if (LTOSuffixPos != StringRef::npos) { + return Name.substr(0, LTOSuffixPos + 10); + } else if ((LTOSuffixPos = Name.find(".constprop.")) != StringRef::npos) { + return Name.substr(0, LTOSuffixPos + 11); + } else { + return NoneType(); + } +} + +namespace { + +/// Return standard name of the function possibly renamed by BOLT. +StringRef normalizeName(StringRef Name) { + // Strip "PG." prefix used for globalized locals. + return Name.startswith("PG.") ? 
Name.substr(2) : Name; +} + +} // anonymous namespace + iterator_range FuncBranchData::getBranchRange(uint64_t From) const { assert(std::is_sorted(Data.begin(), Data.end())); @@ -173,6 +195,7 @@ DataReader::readPerfData(StringRef Path, raw_ostream &Diag) { } auto DR = make_unique(std::move(MB.get()), Diag); DR->parse(); + DR->buildLTONameMap(); return std::move(DR); } @@ -422,16 +445,51 @@ std::error_code DataReader::parse() { return std::error_code(); } -ErrorOr +void DataReader::buildLTONameMap() { + for (const auto &FuncData : FuncsMap) { + const auto FuncName = FuncData.getKey(); + const auto CommonName = getLTOCommonName(FuncName); + if (CommonName) + LTOCommonNameMap[*CommonName].push_back(&FuncData.getValue()); + } +} + +const FuncBranchData * DataReader::getFuncBranchData(const std::vector &FuncNames) const { // Do a reverse order iteration since the name in profile has a higher chance // of matching a name at the end of the list. for (auto FI = FuncNames.rbegin(), FE = FuncNames.rend(); FI != FE; ++FI) { - const auto I = FuncsMap.find(*FI); + const auto I = FuncsMap.find(normalizeName(*FI)); if (I != FuncsMap.end()) - return I->getValue(); + return &I->getValue(); } - return make_error_code(llvm::errc::invalid_argument); + return nullptr; +} + +std::vector +DataReader::getFuncBranchDataRegex(const std::vector &FuncNames) + const { + std::vector AllData; + // Do a reverse order iteration since the name in profile has a higher chance + // of matching a name at the end of the list. 
+ for (auto FI = FuncNames.rbegin(), FE = FuncNames.rend(); FI != FE; ++FI) { + StringRef Name = *FI; + Name = normalizeName(Name); + const auto LTOCommonName = getLTOCommonName(Name); + if (LTOCommonName) { + const auto I = LTOCommonNameMap.find(*LTOCommonName); + if (I != LTOCommonNameMap.end()) { + const auto &CommonData = I->getValue(); + AllData.insert(AllData.end(), CommonData.begin(), CommonData.end()); + } + } else { + const auto I = FuncsMap.find(Name); + if (I != FuncsMap.end()) { + return {&I->getValue()}; + } + } + } + return AllData; } bool DataReader::hasLocalsWithFileName() const { diff --git a/bolt/DataReader.h b/bolt/DataReader.h index 1f69424f704a..d94eab298ef6 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -15,6 +15,7 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_DATA_READER_H #define LLVM_TOOLS_LLVM_BOLT_DATA_READER_H +#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/Support/Allocator.h" @@ -22,11 +23,38 @@ #include "llvm/Support/ErrorOr.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" +#include #include namespace llvm { namespace bolt { +/// LTO-generated function names take a form: +// +/// .lto_priv./... +/// or +/// .constprop./... +/// +/// they can also be: +/// +/// .lto_priv..lto_priv./... +/// +/// The is a global counter used for the whole program. As a +/// result, a tiny change in a program may affect the naming of many LTO +/// functions. For us this means that if we do a precise name matching, then +/// a large set of functions could be left without a profile. +/// +/// To solve this issue, we try to match a function to a regex profile: +/// +/// .(lto_priv|consprop).* +/// +/// The name before an asterisk above represents a common LTO name for a family +/// of functions. Later out of all matching profiles we pick the one with the +/// best match. + +/// Return a common part of LTO name for a given \p Name. 
+Optional getLTOCommonName(const StringRef Name); + struct Location { bool IsSymbol; StringRef Name; @@ -109,6 +137,9 @@ struct FuncBranchData { /// Total execution count for the function. int64_t ExecutionCount{0}; + /// Indicate if the data was used. + mutable bool Used{false}; + FuncBranchData(StringRef Name, ContainerTy Data) : Name(Name), Data(std::move(Data)) {} @@ -137,8 +168,8 @@ class DataReader { explicit DataReader(raw_ostream &Diag) : Diag(Diag) {} DataReader(std::unique_ptr MemBuf, raw_ostream &Diag) - : FileBuf(std::move(MemBuf)), Diag(Diag), ParsingBuf(FileBuf->getBuffer()), - Line(0), Col(0) {} + : FileBuf(std::move(MemBuf)), Diag(Diag), + ParsingBuf(FileBuf->getBuffer()), Line(0), Col(0) {} static ErrorOr> readPerfData(StringRef Path, raw_ostream &Diag); @@ -187,8 +218,15 @@ class DataReader { /// offset d. std::error_code parse(); - ErrorOr getFuncBranchData( - const std::vector &FuncNames) const; + /// Return branch data matching one of the names in \p FuncNames. + const FuncBranchData * + getFuncBranchData(const std::vector &FuncNames) const; + + /// Return a vector of all FuncBranchData matching the list of names. + /// Internally use fuzzy matching to match special names like LTO-generated + /// function names. + std::vector + getFuncBranchDataRegex(const std::vector &FuncNames) const; using FuncsMapType = StringMap; @@ -200,7 +238,6 @@ class DataReader { /// that has a non-empty associated file name. bool hasLocalsWithFileName() const; - /// Dumps the entire data structures parsed. Used for debugging. void dump() const; @@ -216,7 +253,10 @@ class DataReader { ErrorOr parseBranchInfo(); bool hasData(); - // An in-memory copy of the input data file - owns strings used in reader + /// Build suffix map once the profile data is parsed. + void buildLTONameMap(); + + /// An in-memory copy of the input data file - owns strings used in reader. 
std::unique_ptr FileBuf; raw_ostream &Diag; StringRef ParsingBuf; @@ -224,9 +264,10 @@ class DataReader { unsigned Col; FuncsMapType FuncsMap; static const char FieldSeparator = ' '; -}; - + /// Map of common LTO names to possible matching profiles. + StringMap> LTOCommonNameMap; +}; } } diff --git a/bolt/Passes/BinaryFunctionCallGraph.cpp b/bolt/Passes/BinaryFunctionCallGraph.cpp index df152e2d99b3..bf976ff36965 100644 --- a/bolt/Passes/BinaryFunctionCallGraph.cpp +++ b/bolt/Passes/BinaryFunctionCallGraph.cpp @@ -129,7 +129,7 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, continue; } - auto BranchDataOrErr = BC.DR.getFuncBranchData(Function->getNames()); + const auto *BranchData = Function->getBranchData(); const auto SrcId = lookupNode(Function); uint64_t Offset = Function->getAddress(); uint64_t LastInstSize = 0; @@ -187,11 +187,11 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, const auto *DstSym = BC.MIA->getTargetSymbol(Inst); // If this is an indirect call use perf data directly. - if (!DstSym && BranchDataOrErr && + if (!DstSym && BranchData && BC.MIA->hasAnnotation(Inst, "EdgeCountData")) { const auto DataOffset = BC.MIA->getAnnotationAs(Inst, "EdgeCountData"); - for (const auto &BI : BranchDataOrErr->getBranchRange(DataOffset)) { + for (const auto &BI : BranchData->getBranchRange(DataOffset)) { Counts.push_back(getCallInfoFromBranchData(BI, false)); } } else { @@ -205,11 +205,11 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, // If the function has an invalid profile, try to use the perf data // directly (if requested). If there is no perf data for this function, // fall back to the CFG walker which attempts to handle missing data. 
- if (!Function->hasValidProfile() && CgFromPerfData && BranchDataOrErr) { + if (!Function->hasValidProfile() && CgFromPerfData && BranchData) { DEBUG(dbgs() << "BOLT-DEBUG: buildCallGraph: Falling back to perf data" << " for " << *Function << "\n"); ++NumFallbacks; - for (const auto &BI : BranchDataOrErr->Data) { + for (const auto &BI : BranchData->Data) { Offset = Function->getAddress() + BI.From.Offset; const auto CI = getCallInfoFromBranchData(BI, true); if (!CI.first && CI.second == COUNT_NO_PROFILE) // probably a branch diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/Passes/IndirectCallPromotion.cpp index b2e54906db1b..bb1478188c3c 100644 --- a/bolt/Passes/IndirectCallPromotion.cpp +++ b/bolt/Passes/IndirectCallPromotion.cpp @@ -194,10 +194,10 @@ IndirectCallPromotion::getCallTargets( Targets.erase(Result, Targets.end()); } else { - const auto BranchDataOrErr = BC.DR.getFuncBranchData(BF.getNames()); - const auto &BranchData = BranchDataOrErr.get(); + const auto *BranchData = BF.getBranchData(); + assert(BranchData && "expected initialized branch data"); auto Offset = BC.MIA->getAnnotationAs(Inst, "IndirectBranchData"); - for (const auto &BI : BranchData.getBranchRange(Offset)) { + for (const auto &BI : BranchData->getBranchRange(Offset)) { Callsite Site(BF, BI); if (Site.isValid()) { Targets.emplace_back(std::move(Site)); @@ -692,17 +692,14 @@ void IndirectCallPromotion::runOnFunctions( if (!Function.isSimple() || !opts::shouldProcess(Function)) continue; - const auto BranchDataOrErr = BC.DR.getFuncBranchData(Function.getNames()); - if (const auto EC = BranchDataOrErr.getError()) { - DEBUG(dbgs() << "BOLT-INFO: no branch data found for \"" - << Function << "\"\n"); + const auto *BranchData = Function.getBranchData(); + if (!BranchData) continue; - } - const FuncBranchData &BranchData = BranchDataOrErr.get(); + const bool HasLayout = !Function.layout_empty(); // Note: this is not just counting calls. 
- TotalCalls += BranchData.ExecutionCount; + TotalCalls += BranchData->ExecutionCount; // Total number of indirect calls issued from the current Function. // (a fraction of TotalIndirectCalls) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 473f607d7101..0bb55f9e7095 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -18,6 +18,7 @@ #include "DataReader.h" #include "Exceptions.h" #include "RewriteInstance.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/DWARF/DWARFDebugLine.h" @@ -770,21 +771,29 @@ void RewriteInstance::run() { return; } - unsigned PassNumber = 1; + auto executeRewritePass = [&](const std::set &NonSimpleFunctions) { + discoverStorage(); + readSpecialSections(); + discoverFileObjects(); + readDebugInfo(); + readProfileData(); + disassembleFunctions(); + for (uint64_t Address : NonSimpleFunctions) { + auto FI = BinaryFunctions.find(Address); + assert(FI != BinaryFunctions.end() && "bad non-simple function address"); + FI->second.setSimple(false); + } + runOptimizationPasses(); + emitFunctions(); + }; outs() << "BOLT-INFO: Target architecture: " << Triple::getArchTypeName( (llvm::Triple::ArchType)InputFile->getArch()) << "\n"; - // Main "loop". 
- discoverStorage(); - readSpecialSections(); - discoverFileObjects(); - readDebugInfo(); - disassembleFunctions(); - runOptimizationPasses(); - emitFunctions(); + unsigned PassNumber = 1; + executeRewritePass({}); if (opts::SplitFunctions == BinaryFunction::ST_LARGE && checkLargeFunctions()) { @@ -792,13 +801,7 @@ void RewriteInstance::run() { // Emit again because now some functions have been split outs() << "BOLT: split-functions: starting pass " << PassNumber << "...\n"; reset(); - discoverStorage(); - readSpecialSections(); - discoverFileObjects(); - readDebugInfo(); - disassembleFunctions(); - runOptimizationPasses(); - emitFunctions(); + executeRewritePass({}); } // Emit functions again ignoring functions which still didn't fit in their @@ -810,26 +813,7 @@ void RewriteInstance::run() { outs() << "BOLT: starting pass (ignoring large functions) " << PassNumber << "...\n"; reset(); - discoverStorage(); - readSpecialSections(); - discoverFileObjects(); - readDebugInfo(); - disassembleFunctions(); - - for (uint64_t Address : LargeFunctions) { - auto FunctionIt = BinaryFunctions.find(Address); - assert(FunctionIt != BinaryFunctions.end() && - "Invalid large function address."); - if (opts::Verbosity >= 1) { - errs() << "BOLT-WARNING: Function " << FunctionIt->second - << " is larger than its orginal size: emitting again marking it " - << "as not simple.\n"; - } - FunctionIt->second.setSimple(false); - } - - runOptimizationPasses(); - emitFunctions(); + executeRewritePass(LargeFunctions); } if (opts::CalcCacheMetrics) { @@ -943,7 +927,7 @@ void RewriteInstance::discoverFileObjects() { /// change the prefix to enforce global scope of the symbol. std::string Name = NameOrError->startswith(BC->AsmInfo->getPrivateGlobalPrefix()) - ? "PG." + std::string(*NameOrError) + ? "PG" + std::string(*NameOrError) : std::string(*NameOrError); // Disambiguate all local symbols before adding to symbol table. 
@@ -1144,9 +1128,11 @@ void RewriteInstance::discoverFileObjects() { const auto *FDE = FDEI.second; auto *BF = getBinaryFunctionContainingAddress(Address); if (!BF) { - errs() << "BOLT-WARNING: FDE [0x" << Twine::utohexstr(Address) << ", 0x" - << Twine::utohexstr(Address + FDE->getAddressRange()) - << ") has no corresponding symbol table entry\n"; + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: FDE [0x" << Twine::utohexstr(Address) << ", 0x" + << Twine::utohexstr(Address + FDE->getAddressRange()) + << ") has no corresponding symbol table entry\n"; + } auto Section = BC->getSectionForAddress(Address); assert(Section && "cannot get section for address from FDE"); StringRef SectionName; @@ -1540,16 +1526,18 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { if (!IsPCRelative && Addend != 0 && IsFromCode && !SymbolIsSection) { auto RefSection = BC->getSectionForAddress(SymbolAddress); if (RefSection && RefSection->isText()) { - errs() << "BOLT-WARNING: detected absolute reference from code into a " - << "middle of a function:\n" - << " offset = 0x" << Twine::utohexstr(Rel.getOffset()) - << "; symbol = " << SymbolName - << "; symbol address = 0x" << Twine::utohexstr(SymbolAddress) - << "; addend = 0x" << Twine::utohexstr(Addend) - << "; address = 0x" << Twine::utohexstr(Address) - << "; type = " << Rel.getType() - << "; type name = " << TypeName - << '\n'; + if (opts::Verbosity > 1) { + errs() << "BOLT-WARNING: detected absolute reference from code into " + << "a middle of a function:\n" + << " offset = 0x" << Twine::utohexstr(Rel.getOffset()) + << "; symbol = " << SymbolName + << "; symbol address = 0x" << Twine::utohexstr(SymbolAddress) + << "; addend = 0x" << Twine::utohexstr(Addend) + << "; address = 0x" << Twine::utohexstr(Address) + << "; type = " << Rel.getType() + << "; type name = " << TypeName + << '\n'; + } assert(ExtractedValue == SymbolAddress + Addend && "value mismatch"); Address = SymbolAddress; IsAbsoluteCodeRefWithAddend = 
true; @@ -1683,6 +1671,21 @@ void RewriteInstance::readDebugInfo() { BC->preprocessDebugInfo(BinaryFunctions); } +void RewriteInstance::readProfileData() { + if (BC->DR.getAllFuncsData().empty()) + return; + + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + const auto *FuncData = BC->DR.getFuncBranchData(Function.getNames()); + if (!FuncData) + continue; + Function.BranchData = FuncData; + Function.ExecutionCount = FuncData->ExecutionCount; + FuncData->Used = true; + } +} + void RewriteInstance::disassembleFunctions() { // Disassemble every function and build it's control flow graph. TotalScore = 0; @@ -1869,6 +1872,31 @@ void RewriteInstance::disassembleFunctions() { << " have invalid (possibly stale) profile.\n"; } + // Profile is marked as 'Used' if it either matches a function name + // exactly or if it 100% matches any of functions with matching common + // LTO names. + auto getUnusedObjects = [this]() -> Optional> { + std::vector UnusedObjects; + for (const auto &Func : BC->DR.getAllFuncsData()) { + if (!Func.getValue().Used) { + UnusedObjects.emplace_back(Func.getKey()); + } + } + if (UnusedObjects.empty()) + return NoneType(); + return UnusedObjects; + }; + + if (const auto UnusedObjects = getUnusedObjects()) { + outs() << "BOLT-INFO: profile for " << UnusedObjects->size() + << " objects was ignored\n"; + if (opts::Verbosity >= 1) { + for (auto Name : *UnusedObjects) { + outs() << " " << Name << '\n'; + } + } + } + if (ProfiledFunctions.size() > 10) { if (opts::Verbosity >= 1) { outs() << "BOLT-INFO: top called functions are:\n"; diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 71e368de22f0..5828f64a230f 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -171,6 +171,9 @@ class RewriteInstance { /// Read information from debug sections. void readDebugInfo(); + /// Associate profile data with functions. 
+ void readProfileData(); + /// Disassemble each function in the binary and associate it with a /// BinaryFunction object, preparing all information necessary for binary /// optimization. From f53544f542c6a78d57a14a4f33828ea3d79a4c95 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 4 Aug 2017 19:39:45 -0700 Subject: [PATCH 294/904] [BOLT] Disable last basic block assertion. Summary: While converting code from __builtin_unreachable() we were asserting that a basic block with a conditional jump and a single CFG successor was the last one before converting the jump to an unconditional one. However, if that code was executed after a conditional tail call conversion in the same function, the original last basic block will no longer be the last one in the post-conversion layout. I'm disabling the assertion since it doesn't seem worth it to add extra checks for the basic block that used to be the last one. (cherry picked from commit 81bdab415c84b9654b9e8fc4d738788aca124442) --- bolt/BinaryFunction.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 07132a56ae2d..6d8e2cc8314f 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1148,12 +1148,13 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { // Assign proper opcode for tail calls, so that they could be // treated as calls. 
if (!IsCall) { - if (!MIA->convertJmpToTailCall(Instruction) && - opts::Verbosity >= 2) { + if (!MIA->convertJmpToTailCall(Instruction)) { assert(IsCondBranch && "unknown tail call instruction"); - errs() << "BOLT-WARNING: conditional tail call detected in " - << "function " << *this << " at 0x" - << Twine::utohexstr(AbsoluteInstrAddr) << ".\n"; + if (opts::Verbosity >= 2) { + errs() << "BOLT-WARNING: conditional tail call detected in " + << "function " << *this << " at 0x" + << Twine::utohexstr(AbsoluteInstrAddr) << ".\n"; + } } // TODO: A better way to do this would be using annotations for // MCInst objects. @@ -3141,7 +3142,6 @@ void BinaryFunction::postProcessBranches() { // falls-through into the next function - hence the block will have only // one valid successor. Such behaviour is undefined and thus we remove // the conditional branch while leaving a valid successor. - assert(BB == BasicBlocksLayout.back() && "last basic block expected"); BB->eraseInstruction(std::next(LastInstrRI.base())); DEBUG(dbgs() << "BOLT-DEBUG: erasing conditional branch in " << BB->getName() << " in function " << *this << '\n'); From 25265587c0c8c4733c863bba31e9e6925b45fdb6 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 4 Aug 2017 20:14:24 -0700 Subject: [PATCH 295/904] [BOLT] Fix SCTC issue with hot-cold split Summary: SCTC was deleting an unconditional branch to a block in the cold area because it was the next block in the layout vector. Fix the condition to only delete such branches when source and target are in the same allocation area (either both hot or both cold). 
(cherry picked from commit 4898696f3ea0fb0994308dccd38d8bd26e8dacfa) --- bolt/Passes/BinaryPasses.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 3b77ba91ebf7..fffc47ec46f1 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -679,7 +679,8 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); // Only add a new branch if the target is not the fall-through. - if (BF.getBasicBlockAfter(BB, false) != CondSucc || isValid(BB)) { + if (BF.getBasicBlockAfter(BB) != CondSucc || isValid(BB) || + PredBB->isCold() != CondSucc->isCold()) { if (UncondBranch) { MIA->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), From 44440719a70e6e190513bad5520c9317a9d2f7c3 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 2 Aug 2017 18:14:01 -0700 Subject: [PATCH 296/904] Fix profiling for functions with multiple entry points Summary: Fix issue in memcpy where one of its entry points was getting no profiling data and was wrongly considered cold, being put in the cold region. 
(cherry picked from commit 56ab5d8081d8e6fe0245d5dfc637b8c368fe3ec3) --- bolt/BinaryContext.h | 4 +-- bolt/BinaryFunction.cpp | 72 +++++++++++++++++++++++++++++++++------- bolt/BinaryFunction.h | 7 +++- bolt/DataReader.cpp | 52 +++++++++++++++++++++-------- bolt/DataReader.h | 16 +++++---- bolt/RewriteInstance.cpp | 6 ++-- bolt/RewriteInstance.h | 2 +- 7 files changed, 120 insertions(+), 39 deletions(-) diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 55bfded435c9..e2308fe68ce1 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -141,7 +141,7 @@ class BinaryContext { std::function ErrorCheck; - const DataReader &DR; + DataReader &DR; /// Sum of execution count of all functions uint64_t SumExecutionCount{0}; @@ -163,7 +163,7 @@ class BinaryContext { std::unique_ptr MIA, std::unique_ptr MRI, std::unique_ptr DisAsm, - const DataReader &DR) : + DataReader &DR) : Ctx(std::move(Ctx)), DwCtx(std::move(DwCtx)), TheTriple(std::move(TheTriple)), diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 6d8e2cc8314f..f888389cf35d 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1949,11 +1949,49 @@ void BinaryFunction::addEntryPoint(uint64_t Address) { } } +bool BinaryFunction::fetchProfileForOtherEntryPoints() { + if (!BranchData) + return false; + + // Check if we are missing profiling data for secondary entry points + bool First{true}; + bool Updated{false}; + for (auto BB : BasicBlocks) { + if (First) { + First = false; + continue; + } + if (BB->isEntryPoint()) { + uint64_t EntryAddress = BB->getOffset() + getAddress(); + // Look for branch data associated with this entry point + std::vector Names; + std::multimap::iterator I, E; + for (std::tie(I, E) = BC.GlobalAddresses.equal_range(EntryAddress); + I != E; ++I) { + Names.push_back(I->second); + } + if (!Names.empty()) { + if (FuncBranchData *Data = BC.DR.getFuncBranchData(Names)) { + BranchData->appendFrom(*Data, BB->getOffset()); + Data->Used = true; + Updated = 
true; + } + } + } + } + return Updated; +} + void BinaryFunction::matchProfileData() { if (BranchData) { ProfileMatchRatio = evaluateProfileData(*BranchData); - if (ProfileMatchRatio == 1.0f) + if (ProfileMatchRatio == 1.0f) { + if (fetchProfileForOtherEntryPoints()) { + ProfileMatchRatio = evaluateProfileData(*BranchData); + ExecutionCount = BranchData->ExecutionCount; + } return; + } } // Check if the function name can fluctuate between several compilations @@ -1971,7 +2009,7 @@ void BinaryFunction::matchProfileData() { // Check for a profile that matches with 100% confidence. const auto AllBranchData = BC.DR.getFuncBranchDataRegex(getNames()); - for (const auto *NewBranchData : AllBranchData) { + for (auto *NewBranchData : AllBranchData) { // Prevent functions from sharing the same profile. if (NewBranchData->Used) continue; @@ -2097,8 +2135,12 @@ float BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { if (II == Instructions.end()) return true; const auto &Instr = II->second; + // Check for calls, tail calls, rets and indirect branches. + // When matching profiling info, we did not reach the stage + // when we identify tail calls, so they are still represented + // by regular branch instructions and we need isBranch() here. if (BC.MIA->isCall(Instr) || - BC.MIA->isIndirectBranch(Instr) || + BC.MIA->isBranch(Instr) || BC.MIA->isReturn(Instr)) return false; // Check for "rep ret" @@ -2153,27 +2195,33 @@ void BinaryFunction::inferFallThroughCounts() { for (auto CurBB : BasicBlocks) { CurBB->ExecutionCount = 0; } - BasicBlocks.front()->setExecutionCount(ExecutionCount); for (auto CurBB : BasicBlocks) { auto SuccCount = CurBB->branch_info_begin(); for (auto Succ : CurBB->successors()) { - // Do not update execution count of the entry block (when we have tail - // calls). We already accounted for those when computing the func count. 
- if (Succ == BasicBlocks.front()) { - ++SuccCount; - continue; - } if (SuccCount->Count != BinaryBasicBlock::COUNT_NO_PROFILE) Succ->setExecutionCount(Succ->getExecutionCount() + SuccCount->Count); ++SuccCount; } } - // Update execution counts of landing pad blocks. + // Set entry BBs to zero, we'll update their execution count next with entry + // data (we maintain a separate data structure for branches to function entry + // points) + for (auto BB : BasicBlocks) { + if (BB->isEntryPoint()) + BB->ExecutionCount = 0; + } + + // Update execution counts of landing pad blocks and entry BBs + // There is a slight skew introduced here as branches originated from RETs + // may be accounted for in the execution count of an entry block if the last + // instruction in a predecessor fall-through block is a call. This situation + // should rarely happen because there are few multiple-entry functions. for (const auto &I : BranchData->EntryData) { BinaryBasicBlock *BB = getBasicBlockAtOffset(I.To.Offset); - if (BB && LandingPads.find(BB->getLabel()) != LandingPads.end()) { + if (BB && (BB->isEntryPoint() || + LandingPads.find(BB->getLabel()) != LandingPads.end())) { BB->setExecutionCount(BB->getExecutionCount() + I.Branches); } } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index a35f706dc313..5d82874bb7a8 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -313,7 +313,7 @@ class BinaryFunction { uint64_t ExecutionCount{COUNT_NO_PROFILE}; /// Profile data for branches. - const FuncBranchData *BranchData{nullptr}; + FuncBranchData *BranchData{nullptr}; /// Profile match ratio for BranchData. float ProfileMatchRatio{0.0f}; @@ -1713,6 +1713,11 @@ class BinaryFunction { /// cannot be statically evaluated for any given indirect branch. bool postProcessIndirectBranches(); + /// In functions with multiple entry points, the profile collection records + /// data for other entry points in a different function entry. 
This function + /// attempts to fetch extra profile data for each secondary entry point. + bool fetchProfileForOtherEntryPoints(); + /// Find the best matching profile for a function after the creation of basic /// blocks. void matchProfileData(); diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index 936c30dd0f81..5031b27657f7 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -56,6 +56,28 @@ FuncBranchData::getBranchRange(uint64_t From) const { return iterator_range(Range.first, Range.second); } +void FuncBranchData::appendFrom(const FuncBranchData &FBD, uint64_t Offset) { + Data.insert(Data.end(), FBD.Data.begin(), FBD.Data.end()); + for (auto I = Data.begin(), E = Data.end(); I != E; ++I) { + if (I->From.Name == FBD.Name) { + I->From.Name = this->Name; + I->From.Offset += Offset; + } + if (I->To.Name == FBD.Name) { + I->To.Name = this->Name; + I->To.Offset += Offset; + } + } + std::stable_sort(Data.begin(), Data.end()); + ExecutionCount += FBD.ExecutionCount; + for (auto I = FBD.EntryData.begin(), E = FBD.EntryData.end(); I != E; ++I) { + assert(I->To.Name == FBD.Name); + auto NewElmt = EntryData.insert(EntryData.end(), *I); + NewElmt->To.Name = this->Name; + NewElmt->To.Offset += Offset; + } +} + void BranchInfo::mergeWith(const BranchInfo &BI) { // Merge branch and misprediction counts. @@ -160,7 +182,8 @@ void BranchInfo::print(raw_ostream &OS) const { ErrorOr FuncBranchData::getBranch(uint64_t From, uint64_t To) const { for (const auto &I : Data) { - if (I.From.Offset == From && I.To.Offset == To) + if (I.From.Offset == From && I.To.Offset == To && + I.From.Name == I.To.Name) return I; } return make_error_code(llvm::errc::invalid_argument); @@ -422,8 +445,10 @@ std::error_code DataReader::parse() { auto I = GetOrCreateFuncEntry(BI.From.Name); I->getValue().Data.emplace_back(std::move(BI)); - // Add entry data for branches from another function. 
- if (BI.To.IsSymbol && !BI.From.Name.equals(BI.To.Name)) { + // Add entry data for branches to another function or branches + // to entry points (including recursive calls) + if (BI.To.IsSymbol && + (!BI.From.Name.equals(BI.To.Name) || BI.To.Offset == 0)) { I = GetOrCreateFuncEntry(BI.To.Name); I->getValue().EntryData.emplace_back(std::move(BI)); } @@ -446,7 +471,7 @@ std::error_code DataReader::parse() { } void DataReader::buildLTONameMap() { - for (const auto &FuncData : FuncsMap) { + for (auto &FuncData : FuncsMap) { const auto FuncName = FuncData.getKey(); const auto CommonName = getLTOCommonName(FuncName); if (CommonName) @@ -454,22 +479,21 @@ void DataReader::buildLTONameMap() { } } -const FuncBranchData * -DataReader::getFuncBranchData(const std::vector &FuncNames) const { +FuncBranchData * +DataReader::getFuncBranchData(const std::vector &FuncNames) { // Do a reverse order iteration since the name in profile has a higher chance // of matching a name at the end of the list. for (auto FI = FuncNames.rbegin(), FE = FuncNames.rend(); FI != FE; ++FI) { - const auto I = FuncsMap.find(normalizeName(*FI)); + auto I = FuncsMap.find(normalizeName(*FI)); if (I != FuncsMap.end()) return &I->getValue(); } return nullptr; } -std::vector -DataReader::getFuncBranchDataRegex(const std::vector &FuncNames) - const { - std::vector AllData; +std::vector +DataReader::getFuncBranchDataRegex(const std::vector &FuncNames) { + std::vector AllData; // Do a reverse order iteration since the name in profile has a higher chance // of matching a name at the end of the list. 
for (auto FI = FuncNames.rbegin(), FE = FuncNames.rend(); FI != FE; ++FI) { @@ -477,13 +501,13 @@ DataReader::getFuncBranchDataRegex(const std::vector &FuncNames) Name = normalizeName(Name); const auto LTOCommonName = getLTOCommonName(Name); if (LTOCommonName) { - const auto I = LTOCommonNameMap.find(*LTOCommonName); + auto I = LTOCommonNameMap.find(*LTOCommonName); if (I != LTOCommonNameMap.end()) { - const auto &CommonData = I->getValue(); + auto &CommonData = I->getValue(); AllData.insert(AllData.end(), CommonData.begin(), CommonData.end()); } } else { - const auto I = FuncsMap.find(Name); + auto I = FuncsMap.find(Name); if (I != FuncsMap.end()) { return {&I->getValue()}; } diff --git a/bolt/DataReader.h b/bolt/DataReader.h index d94eab298ef6..483b3c966ffd 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -138,7 +138,7 @@ struct FuncBranchData { int64_t ExecutionCount{0}; /// Indicate if the data was used. - mutable bool Used{false}; + bool Used{false}; FuncBranchData(StringRef Name, ContainerTy Data) : Name(Name), Data(std::move(Data)) {} @@ -157,6 +157,10 @@ struct FuncBranchData { /// Find all the branches originating at From. iterator_range getBranchRange( uint64_t From) const; + + /// Append the branch data of another function located \p Offset bytes away + /// from the entry of this function. + void appendFrom(const FuncBranchData &FBD, uint64_t Offset); }; //===----------------------------------------------------------------------===// @@ -219,14 +223,14 @@ class DataReader { std::error_code parse(); /// Return branch data matching one of the names in \p FuncNames. - const FuncBranchData * - getFuncBranchData(const std::vector &FuncNames) const; + FuncBranchData * + getFuncBranchData(const std::vector &FuncNames); /// Return a vector of all FuncBranchData matching the list of names. /// Internally use fuzzy matching to match special names like LTO-generated /// function names. 
- std::vector - getFuncBranchDataRegex(const std::vector &FuncNames) const; + std::vector + getFuncBranchDataRegex(const std::vector &FuncNames); using FuncsMapType = StringMap; @@ -266,7 +270,7 @@ class DataReader { static const char FieldSeparator = ' '; /// Map of common LTO names to possible matching profiles. - StringMap> LTOCommonNameMap; + StringMap> LTOCommonNameMap; }; } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 0bb55f9e7095..6c140884b3ab 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -512,7 +512,7 @@ namespace { /// Create BinaryContext for a given architecture \p ArchName and /// triple \p TripleName. std::unique_ptr -createBinaryContext(ELFObjectFileBase *File, const DataReader &DR, +createBinaryContext(ELFObjectFileBase *File, DataReader &DR, std::unique_ptr DwCtx) { std::string ArchName; std::string TripleName; @@ -630,7 +630,7 @@ createBinaryContext(ELFObjectFileBase *File, const DataReader &DR, } // namespace -RewriteInstance::RewriteInstance(ELFObjectFileBase *File, const DataReader &DR, +RewriteInstance::RewriteInstance(ELFObjectFileBase *File, DataReader &DR, const int Argc, const char *const *Argv) : InputFile(File), Argc(Argc), Argv(Argv), BC(createBinaryContext( @@ -1677,7 +1677,7 @@ void RewriteInstance::readProfileData() { for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; - const auto *FuncData = BC->DR.getFuncBranchData(Function.getNames()); + auto *FuncData = BC->DR.getFuncBranchData(Function.getNames()); if (!FuncData) continue; Function.BranchData = FuncData; diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 5828f64a230f..c971debf4aae 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -146,7 +146,7 @@ class ExecutableFileMemoryManager : public SectionMemoryManager { /// events. 
class RewriteInstance { public: - RewriteInstance(llvm::object::ELFObjectFileBase *File, const DataReader &DR, + RewriteInstance(llvm::object::ELFObjectFileBase *File, DataReader &DR, const int Argc, const char *const *Argv); ~RewriteInstance(); From fb50cf5eb507ca217c7de82da0148e1697113fad Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 10 Aug 2017 13:18:44 -0700 Subject: [PATCH 297/904] [BOLT] Fix printing of dyno-stats Summary: We used to print dyno-stats after instruction lowering which was skewing our metrics as tail calls were no longer recognized as calls for one thing. The fix is to control the point at which dyno-stats printing pass is run and run it immediately before instruction lowering. In the future we may decide to run the pass before some other intervening pass. (cherry picked from commit e31828308118dec8d783ed36bf1bce6191a311a5) --- bolt/BinaryFunction.cpp | 9 +++++++-- bolt/BinaryPassManager.cpp | 35 +++++++++++++++++++++----------- bolt/BinaryPassManager.h | 11 ++++------ bolt/Passes/BinaryPasses.h | 41 ++++++++++++++++++++++++++++++++++++-- bolt/RewriteInstance.cpp | 22 +------------------- 5 files changed, 74 insertions(+), 44 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index f888389cf35d..17cfb4ea1cbd 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -48,7 +48,6 @@ extern cl::OptionCategory BoltRelocCategory; extern bool shouldProcess(const BinaryFunction &); -extern cl::opt PrintDynoStats; extern cl::opt Relocs; extern cl::opt UpdateDebugSections; extern cl::opt IndirectCallPromotion; @@ -103,6 +102,11 @@ JumpTables("jump-tables", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +cl::opt +PrintDynoStats("dyno-stats", + cl::desc("print execution info based on profile"), + cl::cat(BoltCategory)); + static cl::opt PrintJumpTables("print-jump-tables", cl::desc("print jump tables"), @@ -4376,9 +4380,10 @@ void DynoStats::print(raw_ostream &OS, const DynoStats *Other) const { OS << 
format("%'20lld : ", Stat * opts::DynoStatsScale) << Name; if (Other) { if (Stat != OtherStat) { + OtherStat = std::max(OtherStat, uint64_t(1)); // to prevent divide by 0 OS << format(" (%+.1f%%)", ( (float) Stat - (float) OtherStat ) * 100.0 / - (float) (OtherStat + 1) ); + (float) (OtherStat) ); } else { OS << " (=)"; } diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 06e76de036b9..982906a5bebb 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -25,17 +25,19 @@ using namespace llvm; namespace opts { extern cl::OptionCategory BoltOptCategory; +extern cl::OptionCategory BoltCategory; extern cl::opt Verbosity; extern cl::opt PrintAll; +extern cl::opt PrintDynoStats; extern cl::opt DumpDotAll; -extern cl::opt DynoStatsAll; static cl::opt -ICF("icf", - cl::desc("fold functions with identical code"), +DynoStatsAll("dyno-stats-all", + cl::desc("print dyno stats after each stage"), cl::ZeroOrMore, - cl::cat(BoltOptCategory)); + cl::Hidden, + cl::cat(BoltCategory)); static cl::opt EliminateUnreachable("eliminate-unreachable", @@ -44,6 +46,12 @@ EliminateUnreachable("eliminate-unreachable", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +static cl::opt +ICF("icf", + cl::desc("fold functions with identical code"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + static cl::opt InlineSmallFunctions("inline-small-functions", cl::desc("inline functions with a single basic block"), @@ -225,12 +233,6 @@ using namespace opts; const char BinaryFunctionPassManager::TimerGroupName[] = "Binary Function Pass Manager"; -cl::opt BinaryFunctionPassManager::AlwaysOn( - "always-run-pass", - cl::desc("Used for passes that are always enabled"), - cl::init(true), - cl::ReallyHidden); - void BinaryFunctionPassManager::runPasses() { for (const auto &OptPassPair : Passes) { if (!OptPassPair.first) @@ -296,6 +298,8 @@ void BinaryFunctionPassManager::runAllPasses( ) { BinaryFunctionPassManager Manager(BC, Functions, LargeFunctions); + const auto 
InitialDynoStats = getDynoStats(Functions); + // Here we manage dependencies/order manually, since passes are run in the // order they're registered. @@ -347,6 +351,12 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass( llvm::make_unique(PrintReorderedFunctions)); + // Print final dyno stats right while CFG and instruction analysis are intact. + Manager.registerPass( + llvm::make_unique( + InitialDynoStats, "after all optimizations before SCTC and FOP"), + opts::PrintDynoStats | opts::DynoStatsAll); + // This pass introduces conditional jumps into external functions. // Between extending CFG to support this and isolating this pass we chose // the latter. Thus this pass will do double jump removal and unreachable @@ -375,8 +385,9 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(llvm::make_unique(PrintFOP)); - // *except for this pass. This pass turns tail calls into jumps which - // makes them invisible to function reordering. + // This pass turns tail calls into jumps which makes them invisible to + // function reordering. It's unsafe to use any CFG or instruction analysis + // after this point. Manager.registerPass( llvm::make_unique(PrintAfterLowering)); diff --git a/bolt/BinaryPassManager.h b/bolt/BinaryPassManager.h index 07260640b906..f744b71d5de0 100644 --- a/bolt/BinaryPassManager.h +++ b/bolt/BinaryPassManager.h @@ -16,8 +16,6 @@ #include "BinaryFunction.h" #include "Passes/BinaryPasses.h" -#include "llvm/Support/Options.h" -#include "llvm/Support/CommandLine.h" #include #include #include @@ -28,11 +26,10 @@ namespace bolt { /// Simple class for managing analyses and optimizations on BinaryFunctions. 
class BinaryFunctionPassManager { private: - static cl::opt AlwaysOn; BinaryContext &BC; std::map &BFs; std::set &LargeFunctions; - std::vector &, + std::vector>> Passes; static const char TimerGroupName[]; @@ -45,13 +42,13 @@ class BinaryFunctionPassManager { /// Adds a pass to this manager based on the value of its corresponding /// command-line option. void registerPass(std::unique_ptr Pass, - const cl::opt &Opt) { - Passes.emplace_back(Opt, std::move(Pass)); + const bool Run) { + Passes.emplace_back(Run, std::move(Pass)); } /// Adds an unconditionally run pass to this manager. void registerPass(std::unique_ptr Pass) { - Passes.emplace_back(AlwaysOn, std::move(Pass)); + Passes.emplace_back(true, std::move(Pass)); } /// Run all registered passes in the order they were added. diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index a3e08f25c501..8c1bdb2bd560 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -31,9 +31,9 @@ namespace bolt { /// An optimization/analysis pass that runs on functions. class BinaryFunctionPass { protected: - const cl::opt &PrintPass; + bool PrintPass; - explicit BinaryFunctionPass(const cl::opt &PrintPass) + explicit BinaryFunctionPass(const bool PrintPass) : PrintPass(PrintPass) { } /// Control whether a specific function should be skipped during @@ -58,6 +58,43 @@ class BinaryFunctionPass { std::set &LargeFunctions) = 0; }; +/// A pass to print program-wide dynostats. 
+class DynoStatsPrintPass : public BinaryFunctionPass { +protected: + DynoStats PrevDynoStats; + std::string Title; + +public: + DynoStatsPrintPass(const DynoStats &PrevDynoStats, const char *Title) + : BinaryFunctionPass(false) + , PrevDynoStats(PrevDynoStats) + , Title(Title) { + } + + const char *getName() const { + return "print dyno-stats after optimizations"; + } + + bool shouldPrint(const BinaryFunction &BF) const override { + return false; + } + + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override { + const auto NewDynoStats = getDynoStats(BFs); + const auto Changed = (NewDynoStats != PrevDynoStats); + outs() << "BOLT-INFO: program-wide dynostats " + << Title << (Changed ? "" : " (no change)") << ":\n\n" + << PrevDynoStats; + if (Changed) { + outs() << '\n'; + NewDynoStats.print(outs(), &PrevDynoStats); + } + outs() << '\n'; + } +}; + /// Detects functions that simply do a tail call when they are called and /// optimizes calls to these functions. 
class OptimizeBodylessFunctions : public BinaryFunctionPass { diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 6c140884b3ab..462dac5de8c5 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -136,13 +136,6 @@ DumpEHFrame("dump-eh-frame", cl::Hidden, cl::cat(BoltCategory)); -cl::opt -DynoStatsAll("dyno-stats-all", - cl::desc("print dyno stats after each stage"), - cl::ZeroOrMore, - cl::Hidden, - cl::cat(BoltCategory)); - static cl::opt FixDebugInfoLargeFunctions("fix-debuginfo-large-functions", cl::init(true), @@ -222,11 +215,6 @@ PrintDisasm("print-disasm", cl::Hidden, cl::cat(BoltCategory)); -cl::opt -PrintDynoStats("dyno-stats", - cl::desc("print execution info based on profile"), - cl::cat(BoltCategory)); - static cl::opt PrintLoopInfo("print-loops", cl::desc("print loop related information"), @@ -1922,15 +1910,7 @@ void RewriteInstance::disassembleFunctions() { } void RewriteInstance::runOptimizationPasses() { - callWithDynoStats( - [this] { - BinaryFunctionPassManager::runAllPasses(*BC, - BinaryFunctions, - LargeFunctions); - }, - BinaryFunctions, - "optimizations", - opts::PrintDynoStats || opts::DynoStatsAll); + BinaryFunctionPassManager::runAllPasses(*BC, BinaryFunctions, LargeFunctions); } // Helper function to emit the contents of a function via a MCStreamer object. From 63d4a271ca7b7e5f24808b164494795140a58459 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 4 Aug 2017 11:21:05 -0700 Subject: [PATCH 298/904] [BOLT] PLT optimization Summary: Add an option to optimize PLT calls: -plt - optimize PLT calls (requires linking with -znow) =none - do not optimize PLT calls =hot - optimize executed (hot) PLT calls =all - optimize all PLT calls When optimized, the calls are converted to use GOT reference indirectly. GOT entries are guaranteed to contain a valid function pointer if lazy binding is disabled - hence the requirement for linker's -znow option. 
Note: we can add an entry to .dynamic and drop a requirement for -znow if we were moving .dynamic to a new segment. (cherry picked from commit 8f34277d65d3d37df695006d9b1dccb62c7d425b) --- bolt/BinaryContext.h | 13 +++ bolt/BinaryFunction.cpp | 19 ++-- bolt/BinaryFunction.h | 28 ++++-- bolt/BinaryPassManager.cpp | 11 +++ bolt/Exceptions.cpp | 3 +- bolt/Passes/CMakeLists.txt | 1 + bolt/Passes/PLTCall.cpp | 94 +++++++++++++++++++ bolt/Passes/PLTCall.h | 49 ++++++++++ bolt/RewriteInstance.cpp | 180 ++++++++++++++++++++++++++----------- bolt/RewriteInstance.h | 25 +++++- 10 files changed, 349 insertions(+), 74 deletions(-) create mode 100644 bolt/Passes/PLTCall.cpp create mode 100644 bolt/Passes/PLTCall.h diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index e2308fe68ce1..d808f3da0dfd 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -149,6 +149,9 @@ class BinaryContext { /// Number of functions with profile information uint64_t NumProfiledFuncs{0}; + /// True if the binary requires immediate relocation processing. + bool RequiresZNow{false}; + BinaryContext(std::unique_ptr Ctx, std::unique_ptr DwCtx, std::unique_ptr TheTriple, @@ -206,6 +209,16 @@ class BinaryContext { /// Register a symbol with \p Name at a given \p Address. MCSymbol *registerNameAtAddress(const std::string &Name, uint64_t Address) { + // Check if the Name was already registered. + const auto GSI = GlobalSymbols.find(Name); + if (GSI != GlobalSymbols.end()) { + assert(GSI->second == Address && "addresses do not match"); + auto *Symbol = Ctx->lookupSymbol(Name); + assert(Symbol && "symbol should be registered with MCContext"); + + return Symbol; + } + // Add the name to global symbols map. 
GlobalSymbols[Name] = Address; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 17cfb4ea1cbd..a4b6539df462 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -4289,19 +4289,14 @@ DynoStats BinaryFunction::getDynoStats() const { if (BC.MIA->getMemoryOperandNo(Instr) != -1) { Stats[DynoStats::INDIRECT_CALLS] += CallFreq; } else if (const auto *CallSymbol = BC.MIA->getTargetSymbol(Instr)) { - if (BC.getFunctionForSymbol(CallSymbol)) - continue; - auto GSI = BC.GlobalSymbols.find(CallSymbol->getName()); - if (GSI == BC.GlobalSymbols.end()) - continue; - auto Section = BC.getSectionForAddress(GSI->second); - if (!Section) - continue; - StringRef SectionName; - Section->getName(SectionName); - if (SectionName == ".plt") { + const auto *BF = BC.getFunctionForSymbol(CallSymbol); + if (BF && BF->isPLTFunction()) Stats[DynoStats::PLT_CALLS] += CallFreq; - } + + // We don't process PLT functions and hence have to adjust + // relevant dynostats here. + Stats[DynoStats::LOADS] += CallFreq; + Stats[DynoStats::INDIRECT_CALLS] += CallFreq; } } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 5d82874bb7a8..53830c42b1ae 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -339,13 +339,13 @@ class BinaryFunction { /// is referenced by UnitLineTable. DWARFUnitLineTable UnitLineTable{nullptr, nullptr}; - /// Offset of this function's address ranges in the .debug_ranges section of - /// the output binary. - uint32_t AddressRangesOffset{-1U}; - /// Last computed hash value. mutable uint64_t Hash{0}; + /// For PLT functions it contains a symbol associated with a function + /// reference. It is nullptr for non-PLT functions. + const MCSymbol *PLTSymbol{nullptr}; + /// Function order for streaming into the destination binary. uint32_t Index{-1U}; @@ -1165,6 +1165,23 @@ class BinaryFunction { return FunctionColdEndLabel; } + /// Return true if this is a function representing a PLT entry. 
+ bool isPLTFunction() const { + return PLTSymbol != nullptr; + } + + /// Return PLT function reference symbol for PLT functions and nullptr for + /// non-PLT functions. + const MCSymbol *getPLTSymbol() const { + return PLTSymbol; + } + + /// Set function PLT reference symbol for PLT functions. + void setPLTSymbol(const MCSymbol *Symbol) { + assert(Size == 0 && "function size should be 0 for PLT functions"); + PLTSymbol = Symbol; + } + /// Register relocation type \p RelType at a given \p Address in the function /// against \p Symbol. /// Assert if the \p Address is not inside this function. @@ -1614,9 +1631,6 @@ class BinaryFunction { return *this; } - /// Returns the offset of the function's address ranges in .debug_ranges. - uint32_t getAddressRangesOffset() const { return AddressRangesOffset; } - /// Return the profile information about the number of times /// the function was executed. /// diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 982906a5bebb..24bbed8a3bf1 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -14,6 +14,7 @@ #include "Passes/FrameOptimizer.h" #include "Passes/IndirectCallPromotion.h" #include "Passes/Inliner.h" +#include "Passes/PLTCall.h" #include "Passes/ReorderFunctions.h" #include "Passes/StokeInfo.h" #include "llvm/Support/Timer.h" @@ -31,6 +32,7 @@ extern cl::opt Verbosity; extern cl::opt PrintAll; extern cl::opt PrintDynoStats; extern cl::opt DumpDotAll; +extern cl::opt PLT; static cl::opt DynoStatsAll("dyno-stats-all", @@ -131,6 +133,13 @@ PrintOptimizeBodyless("print-optimize-bodyless", cl::Hidden, cl::cat(BoltOptCategory)); +static cl::opt +PrintPLT("print-plt", + cl::desc("print functions after PLT optimization"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + static cl::opt PrintPeepholes("print-peepholes", cl::desc("print functions after peephole optimization"), @@ -331,6 +340,8 @@ void BinaryFunctionPassManager::runAllPasses( 
Manager.registerPass(llvm::make_unique(PrintICF), opts::ICF); + Manager.registerPass(llvm::make_unique(PrintPLT)); + Manager.registerPass(llvm::make_unique(PrintReordered)); Manager.registerPass(llvm::make_unique(PrintPeepholes), diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 110cda04168c..929a66d6dbc6 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -530,7 +530,8 @@ const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0; bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { uint64_t Address = Function.getAddress(); auto I = FDEs.find(Address); - if (I == FDEs.end()) + // Ignore zero-length FDE ranges. + if (I == FDEs.end() || !I->second->getAddressRange()) return true; const FDE &CurFDE = *I->second; diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt index b5a51a553074..9bcd9e72bb85 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/Passes/CMakeLists.txt @@ -13,6 +13,7 @@ add_llvm_library(LLVMBOLTPasses IndirectCallPromotion.cpp Inliner.cpp LivenessAnalysis.cpp + PLTCall.cpp PettisAndHansen.cpp RegAnalysis.cpp ReorderAlgorithm.cpp diff --git a/bolt/Passes/PLTCall.cpp b/bolt/Passes/PLTCall.cpp new file mode 100644 index 000000000000..e530dba77137 --- /dev/null +++ b/bolt/Passes/PLTCall.cpp @@ -0,0 +1,94 @@ +//===--- Passes/PLTCall.h - PLT call optimization -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Replace calls to PLT entries with indirect calls against GOT. 
+// +//===----------------------------------------------------------------------===// + +#include "PLTCall.h" +#include "llvm/Support/Options.h" + +#define DEBUG_TYPE "bolt-plt" + +using namespace llvm; + +namespace opts { + +extern cl::OptionCategory BoltOptCategory; + +cl::opt +PLT("plt", + cl::desc("optimize PLT calls (requires linking with -znow)"), + cl::init(bolt::PLTCall::OT_NONE), + cl::values(clEnumValN(bolt::PLTCall::OT_NONE, + "none", + "do not optimize PLT calls"), + clEnumValN(bolt::PLTCall::OT_HOT, + "hot", + "optimize executed (hot) PLT calls"), + clEnumValN(bolt::PLTCall::OT_ALL, + "all", + "optimize all PLT calls"), + clEnumValEnd), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +} + +namespace llvm { +namespace bolt { + +void PLTCall::runOnFunctions( + BinaryContext &BC, + std::map &BFs, + std::set &) { + if (opts::PLT == OT_NONE) + return; + + uint64_t NumCallsOptimized = 0; + for (auto &It : BFs) { + auto &Function = It.second; + if (!shouldOptimize(Function)) + continue; + + if (opts::PLT == OT_HOT && + Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE) + continue; + + for (auto *BB : Function.layout()) { + if (opts::PLT == OT_HOT && !BB->getKnownExecutionCount()) + continue; + + for (auto &Instr : *BB) { + if (!BC.MIA->isCall(Instr)) + continue; + const auto *CallSymbol = BC.MIA->getTargetSymbol(Instr); + if (!CallSymbol) + continue; + const auto *CalleeBF = BC.getFunctionForSymbol(CallSymbol); + if (!CalleeBF || !CalleeBF->isPLTFunction()) + continue; + BC.MIA->convertCallToIndirectCall(Instr, + CalleeBF->getPLTSymbol(), + BC.Ctx.get()); + ++NumCallsOptimized; + } + } + } + + if (NumCallsOptimized) { + BC.RequiresZNow = true; + outs() << "BOLT-INFO: " << NumCallsOptimized + << " PLT calls in the binary were opitmized.\n"; + } +} + + +} // namespace bolt +} // namespace llvm diff --git a/bolt/Passes/PLTCall.h b/bolt/Passes/PLTCall.h new file mode 100644 index 000000000000..19daab31f2fb --- /dev/null +++ b/bolt/Passes/PLTCall.h 
@@ -0,0 +1,49 @@ +//===--- Passes/PLTCall.h - PLT call optimization -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_PLTCALL_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_PLTCALL_H + +#include "BinaryContext.h" +#include "BinaryFunction.h" +#include "BinaryPasses.h" + +namespace llvm { +namespace bolt { + +class PLTCall : public BinaryFunctionPass { +public: + + /// PLT optimization type + enum OptType : char { + OT_NONE = 0, /// Do not optimize + OT_HOT = 1, /// Optimize hot PLT calls + OT_ALL = 2 /// Optimize all PLT calls + }; + + explicit PLTCall(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { } + + const char *getName() const override { + return "PLT call optimization"; + } + bool shouldPrint(const BinaryFunction &BF) const override { + return BinaryFunctionPass::shouldPrint(BF); + } + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 462dac5de8c5..5dbb463a2ce8 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -961,6 +961,7 @@ void RewriteInstance::discoverFileObjects() { AlternativeName = uniquifyName(AltPrefix); } + // Register names even if it's not a function, e.g. for an entry point. BC->registerNameAtAddress(UniqueName, Address); if (!AlternativeName.empty()) BC->registerNameAtAddress(AlternativeName, Address); @@ -1047,8 +1048,6 @@ void RewriteInstance::discoverFileObjects() { continue; } - // TODO: populate address map with PLT entries for better readability. 
- // Checkout for conflicts with function data from FDEs. bool IsSimple = true; auto FDEI = CFIRdWrt->getFDEs().lower_bound(Address); @@ -1110,50 +1109,32 @@ void RewriteInstance::discoverFileObjects() { PreviousFunction = BF; } + // Process PLT section. + disassemblePLT(); + // See if we missed any functions marked by FDE. for (const auto &FDEI : CFIRdWrt->getFDEs()) { const auto Address = FDEI.first; const auto *FDE = FDEI.second; - auto *BF = getBinaryFunctionContainingAddress(Address); + const auto *BF = getBinaryFunctionAtAddress(Address); if (!BF) { - if (opts::Verbosity >= 1) { + if (const auto *PartialBF = getBinaryFunctionContainingAddress(Address)) { errs() << "BOLT-WARNING: FDE [0x" << Twine::utohexstr(Address) << ", 0x" << Twine::utohexstr(Address + FDE->getAddressRange()) - << ") has no corresponding symbol table entry\n"; - } - auto Section = BC->getSectionForAddress(Address); - assert(Section && "cannot get section for address from FDE"); - StringRef SectionName; - Section->getName(SectionName); - // PLT has a special FDE. - if (SectionName == ".plt") { - // Set the size to 0 to prevent PLT from being disassembled. 
- createBinaryFunction("__BOLT_PLT_PSEUDO" , *Section, Address, 0, false); - } else if (SectionName == ".plt.got") { - createBinaryFunction("__BOLT_PLT_GOT_PSEUDO" , *Section, Address, 0, - false); + << ") conflicts with function " << *PartialBF << '\n'; } else { + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: FDE [0x" << Twine::utohexstr(Address) + << ", 0x" << Twine::utohexstr(Address + FDE->getAddressRange()) + << ") has no corresponding symbol table entry\n"; + } + auto Section = BC->getSectionForAddress(Address); + assert(Section && "cannot get section for address from FDE"); std::string FunctionName = "__BOLT_FDE_FUNCat" + Twine::utohexstr(Address).str(); - BC->registerNameAtAddress(FunctionName, Address); createBinaryFunction(FunctionName, *Section, Address, FDE->getAddressRange(), true); } - } else if (BF->getAddress() != Address) { - errs() << "BOLT-WARNING: FDE [0x" << Twine::utohexstr(Address) << ", 0x" - << Twine::utohexstr(Address + FDE->getAddressRange()) - << ") conflicts with function " << *BF << '\n'; - } - } - - if (PLTGOTSection.getObject()) { - // Check if we need to create a function for .plt.got. Some linkers - // (depending on the version) would mark it with FDE while others wouldn't. - if (!getBinaryFunctionContainingAddress(PLTGOTSection.getAddress(), true)) { - DEBUG(dbgs() << "BOLT-DEBUG: creating .plt.got pseudo function at 0x" - << Twine::utohexstr(PLTGOTSection.getAddress()) << '\n'); - createBinaryFunction("__BOLT_PLT_GOT_PSEUDO" , PLTGOTSection, - PLTGOTSection.getAddress(), 0, false); } } @@ -1180,6 +1161,81 @@ void RewriteInstance::discoverFileObjects() { } } +void RewriteInstance::disassemblePLT() { + if (!PLTSection.getObject()) + return; + + const auto PLTAddress = PLTSection.getAddress(); + StringRef PLTContents; + PLTSection.getContents(PLTContents); + ArrayRef PLTData( + reinterpret_cast(PLTContents.data()), + PLTSection.getSize()); + + // Pseudo function for the start of PLT. 
The table could have a matching + // FDE that we want to match to pseudo function. + createBinaryFunction("__BOLT_PLT_PSEUDO" , PLTSection, PLTAddress, 0, false); + for (uint64_t Offset = 0; Offset < PLTSection.getSize(); Offset += 0x10) { + uint64_t InstrSize; + MCInst Instruction; + const uint64_t InstrAddr = PLTAddress + Offset; + if (!BC->DisAsm->getInstruction(Instruction, + InstrSize, + PLTData.slice(Offset), + InstrAddr, + nulls(), + nulls())) { + errs() << "BOLT-ERROR: unable to disassemble instruction in .plt " + << "at offset 0x" << Twine::utohexstr(Offset) << '\n'; + exit(1); + } + + if (!BC->MIA->isIndirectBranch(Instruction)) + continue; + + uint64_t TargetAddress; + if (!BC->MIA->evaluateMemOperandTarget(Instruction, + TargetAddress, + InstrAddr, + InstrSize)) { + errs() << "BOLT-ERROR: error evaluating PLT instruction at offset 0x" + << Twine::utohexstr(InstrAddr) << '\n'; + exit(1); + } + + // To get the name we have to read a relocation against the address. + for (const auto &Rel : RelaPLTSection.relocations()) { + if (Rel.getType() != ELF::R_X86_64_JUMP_SLOT) + continue; + if (Rel.getOffset() == TargetAddress) { + const auto SymbolIter = Rel.getSymbol(); + assert(SymbolIter != InputFile->symbol_end() && + "non-null symbol expected"); + const auto SymbolName = *(*SymbolIter).getName(); + std::string Name = SymbolName.str() + "@PLT"; + auto *BF = createBinaryFunction(Name, + PLTSection, + InstrAddr, + 0, + /*IsSimple=*/false); + auto TargetSymbol = BC->registerNameAtAddress(SymbolName.str() + "@GOT", + TargetAddress); + BF->setPLTSymbol(TargetSymbol); + break; + } + } + } + + if (PLTGOTSection.getObject()) { + // Check if we need to create a function for .plt.got. Some linkers + // (depending on the version) would mark it with FDE while others wouldn't. 
+ if (!getBinaryFunctionAtAddress(PLTGOTSection.getAddress())) { + createBinaryFunction("__BOLT_PLT_GOT_PSEUDO" , PLTGOTSection, + PLTGOTSection.getAddress(), 0, false); + } + } +} + void RewriteInstance::adjustFunctionBoundaries() { for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; @@ -1320,6 +1376,7 @@ BinaryFunction *RewriteInstance::createBinaryFunction( Address, BinaryFunction(Name, Section, Address, Size, *BC, IsSimple)); assert(Result.second == true && "unexpected duplicate function"); auto *BF = &Result.first->second; + BC->registerNameAtAddress(Name, Address); BC->SymbolToFunctionMap[BF->getSymbol()] = BF; return BF; } @@ -1349,8 +1406,14 @@ void RewriteInstance::readSpecialSections() { HasTextRelocations = true; } else if (SectionName == ".gdb_index") { GdbIndexSection = Section; + } else if (SectionName == ".plt") { + PLTSection = Section; + } else if (SectionName == ".got.plt") { + GOTPLTSection = Section; } else if (SectionName == ".plt.got") { PLTGOTSection = Section; + } else if (SectionName == ".rela.plt") { + RelaPLTSection = Section; } // Ignore zero-size allocatable sections as they present no interest to us. @@ -1733,7 +1796,6 @@ void RewriteInstance::disassembleFunctions() { abort(); } - if (opts::PrintAll || opts::PrintDisasm) Function.print(outs(), "after disassembly", true); @@ -3144,7 +3206,9 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { const auto *Function = getBinaryFunctionAtAddress(Symbol.st_value); // Some section symbols may be mistakenly associated with the first // function emitted in the section. Dismiss if it is a section symbol. 
- if (Function && NewSymbol.getType() != ELF::STT_SECTION) { + if (Function && + !Function->getPLTSymbol() && + NewSymbol.getType() != ELF::STT_SECTION) { NewSymbol.st_value = Function->getOutputAddress(); NewSymbol.st_size = Function->getOutputSize(); NewSymbol.st_shndx = NewTextSectionIndex; @@ -3275,15 +3339,6 @@ template void RewriteInstance::patchELFRelaPLT(ELFObjectFile *File) { auto &OS = Out->os(); - SectionRef RelaPLTSection; - for (const auto &Section : File->sections()) { - StringRef SectionName; - Section.getName(SectionName); - if (SectionName == ".rela.plt") { - RelaPLTSection = Section; - break; - } - } if (!RelaPLTSection.getObject()) { errs() << "BOLT-INFO: no .rela.plt section found\n"; return; @@ -3362,6 +3417,8 @@ void RewriteInstance::patchELFDynamic(ELFObjectFile *File) { } assert(DynamicPhdr && "missing dynamic in ELF binary"); + bool ZNowSet = false; + // Go through all dynamic entries and patch functions addresses with // new ones. ErrorOr DTB = Obj->dynamic_table_begin(DynamicPhdr); @@ -3376,10 +3433,24 @@ void RewriteInstance::patchELFDynamic(ELFObjectFile *File) { break; case ELF::DT_INIT: case ELF::DT_FINI: - if (auto NewAddress = getNewFunctionAddress(DE->getPtr())) { - DEBUG(dbgs() << "BOLT-DEBUG: patching dynamic entry of type " - << DE->getTag() << '\n'); - NewDE.d_un.d_ptr = NewAddress; + if (opts::Relocs) { + if (auto NewAddress = getNewFunctionAddress(DE->getPtr())) { + DEBUG(dbgs() << "BOLT-DEBUG: patching dynamic entry of type " + << DE->getTag() << '\n'); + NewDE.d_un.d_ptr = NewAddress; + } + } + break; + case ELF::DT_FLAGS: + if (BC->RequiresZNow) { + NewDE.d_un.d_val |= ELF::DF_BIND_NOW; + ZNowSet = true; + } + break; + case ELF::DT_FLAGS_1: + if (BC->RequiresZNow) { + NewDE.d_un.d_val |= ELF::DF_1_NOW; + ZNowSet = true; } break; } @@ -3388,6 +3459,13 @@ void RewriteInstance::patchELFDynamic(ELFObjectFile *File) { DynamicOffset + (DE - *DTB) * sizeof(*DE)); } } + + if (BC->RequiresZNow && !ZNowSet) { + errs() << 
"BOLT-ERROR: output binary requires immediate relocation " + "processing which depends on DT_FLAGS or DT_FLAGS_1 presence in " + ".dynamic. Please re-link the binary with -znow.\n"; + exit(1); + } } uint64_t RewriteInstance::getNewFunctionAddress(uint64_t OldAddress) { @@ -3573,10 +3651,10 @@ void RewriteInstance::rewriteFile() { // Copy non-allocatable sections once allocatable part is finished. rewriteNoteSections(); - if (opts::Relocs) { - // Patch dynamic section/segment. - patchELFDynamic(); + // Patch dynamic section/segment. + patchELFDynamic(); + if (opts::Relocs) { patchELFRelaPLT(); patchELFGOT(); diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index c971debf4aae..ece21e055011 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -290,6 +290,9 @@ class RewriteInstance { // Run ObjectLinkingLayer() with custom memory manager and symbol resolver. orc::ObjectLinkingLayer<> OLT; + /// Disassemble and create function entries for PLT. + void disassemblePLT(); + /// ELF-specific part. TODO: refactor into new class. #define ELF_FUNCTION(FUNC) \ template void FUNC(ELFObjectFile *Obj); \ @@ -481,12 +484,28 @@ class RewriteInstance { const llvm::DWARFFrame *EHFrame{nullptr}; SectionRef EHFrameSection; - /// .gdb_index section. - SectionRef GdbIndexSection; + /// .plt section. + SectionRef PLTSection; - /// .plt.got section. + /// .got.plt sections. + /// + /// Contains jump slots (addresses) indirectly referenced by + /// instructions in .plt section. + SectionRef GOTPLTSection; + + /// .plt.got section (#clowntown). + /// + /// A section sometimes generated by BFD linker. SectionRef PLTGOTSection; + /// .rela.plt section. + /// + /// Contains relocations against .got.plt. + SectionRef RelaPLTSection; + + /// .gdb_index section. + SectionRef GdbIndexSection; + uint64_t NewSymTabOffset{0}; /// Keep track of functions we fail to write in the binary. 
We need to avoid From fcaac10d663937e16b11b041663b7ef243603b1f Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Sun, 27 Aug 2017 17:04:06 -0700 Subject: [PATCH 299/904] [BOLT] Support PIC-style exception tables Summary: Exceptions tables for PIC may contain indirect type references that are also encoded using relative addresses. This diff adds support for such encodings. We read PIC-style type info table, and write it using new encoding. (cherry picked from commit 182ee160eec8d278c2710c4db3bb8d9cc3ac85c5) --- bolt/BinaryContext.cpp | 15 ++++ bolt/BinaryContext.h | 5 ++ bolt/BinaryFunction.h | 3 +- bolt/Exceptions.cpp | 155 ++++++++++++++++++++++++++++++++------- bolt/RewriteInstance.cpp | 2 +- 5 files changed, 151 insertions(+), 29 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 42b1b6e84f56..cf5a7d182a47 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -387,6 +387,21 @@ ErrorOr BinaryContext::getSectionForAddress(uint64_t Address) const{ return std::make_error_code(std::errc::bad_address); } +ErrorOr +BinaryContext::extractPointerAtAddress(uint64_t Address) const { + auto Section = getSectionForAddress(Address); + if (!Section) + return Section.getError(); + + StringRef SectionContents; + Section->getContents(SectionContents); + DataExtractor DE(SectionContents, + AsmInfo->isLittleEndian(), + AsmInfo->getPointerSize()); + uint32_t SectionOffset = Address - Section->getAddress(); + return DE.getAddress(&SectionOffset); +} + void BinaryContext::addSectionRelocation(SectionRef Section, uint64_t Offset, MCSymbol *Symbol, uint64_t Type, uint64_t Addend) { diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index d808f3da0dfd..e22d246905c7 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -207,6 +207,11 @@ class BinaryContext { /// Return (allocatable) section containing the given \p Address. 
ErrorOr getSectionForAddress(uint64_t Address) const; + /// Given \p Address in the binary, extract and return a pointer value at that + /// address. The address has to be a valid statically allocated address for + /// the binary. + ErrorOr extractPointerAtAddress(uint64_t Address) const; + /// Register a symbol with \p Name at a given \p Address. MCSymbol *registerNameAtAddress(const std::string &Name, uint64_t Address) { // Check if the Name was already registered. diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 53830c42b1ae..81b3ea20b90a 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -548,7 +548,8 @@ class BinaryFunction { /// Binary blobs reprsenting action, type, and type index tables for this /// function' LSDA (exception handling). - ArrayRef LSDAActionAndTypeTables; + ArrayRef LSDAActionTable; + std::vector LSDATypeTable; ArrayRef LSDATypeIndexTable; /// Marking for the beginning of language-specific data area for the function. diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 929a66d6dbc6..51021cfab77f 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -53,6 +53,28 @@ PrintExceptions("print-exceptions", namespace llvm { namespace bolt { +namespace { + +unsigned getEncodingSize(unsigned Encoding, BinaryContext &BC) { + switch (Encoding & 0x0f) { + default: llvm_unreachable("unknown encoding"); + case dwarf::DW_EH_PE_absptr: + case dwarf::DW_EH_PE_signed: + return BC.AsmInfo->getPointerSize(); + case dwarf::DW_EH_PE_udata2: + case dwarf::DW_EH_PE_sdata2: + return 2; + case dwarf::DW_EH_PE_udata4: + case dwarf::DW_EH_PE_sdata4: + return 4; + case dwarf::DW_EH_PE_udata8: + case dwarf::DW_EH_PE_sdata8: + return 8; + } +} + +} // anonymous namespace + // Read and dump the .gcc_exception_table section entry. // // .gcc_except_table section contains a set of Language-Specific Data Areas - @@ -88,19 +110,20 @@ namespace bolt { // these tables is encoded in LSDA header. 
Sizes for both of the tables are not // included anywhere. // -// For the purpose of rewriting exception handling tables, we can reuse action, -// types, and type index tables in their original binary format. -// This is only possible when type references are encoded as absolute addresses. -// We still have to parse all the tables to determine their sizes. Then we have +// We have to parse all of the tables to determine their sizes. Then we have // to parse the call site table and associate discovered information with // actual call instructions and landing pad blocks. // +// For the purpose of rewriting exception handling tables, we can reuse action, +// and type index tables in their original binary format. +// +// Type table could be encoded using position-independent references, and thus +// may require relocation. +// // Ideally we should be able to re-write LSDA in-place, without the need to // allocate a new space for it. Sadly there's no guarantee that the new call // site table will be the same size as GCC uses uleb encodings for PC offsets. // -// For split function re-writing we would need to split LSDA too. -// // Note: some functions have LSDA entries with 0 call site entries. void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, uint64_t LSDASectionAddress) { @@ -112,29 +135,37 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, assert(getLSDAAddress() < LSDASectionAddress + LSDASectionData.size() && "wrong LSDA address"); + // Given an address in memory corresponding to some entity in mapped + // LSDA section return address of this entity in a binary file. 
+ auto getFileAddress = [&](const uint8_t *InMemAddress) { + return InMemAddress - LSDASectionData.data() + LSDASectionAddress; + }; const uint8_t *Ptr = LSDASectionData.data() + getLSDAAddress() - LSDASectionAddress; uint8_t LPStartEncoding = *Ptr++; uintptr_t LPStart = 0; if (LPStartEncoding != DW_EH_PE_omit) { - LPStart = readEncodedPointer(Ptr, LPStartEncoding); + LPStart = readEncodedPointer(Ptr, LPStartEncoding, getFileAddress(Ptr)); } assert(LPStart == 0 && "support for split functions not implemented"); - uint8_t TTypeEncoding = *Ptr++; + const auto TTypeEncoding = *Ptr++; + size_t TTypeEncodingSize = 0; uintptr_t TTypeEnd = 0; if (TTypeEncoding != DW_EH_PE_omit) { TTypeEnd = readULEB128(Ptr); + TTypeEncodingSize = getEncodingSize(TTypeEncoding, BC); } if (opts::PrintExceptions) { outs() << "[LSDA at 0x" << Twine::utohexstr(getLSDAAddress()) << " for function " << *this << "]:\n"; - outs() << "LPStart Encoding = " << (unsigned)LPStartEncoding << '\n'; + outs() << "LPStart Encoding = 0x" + << Twine::utohexstr(LPStartEncoding) << '\n'; outs() << "LPStart = 0x" << Twine::utohexstr(LPStart) << '\n'; - outs() << "TType Encoding = " << (unsigned)TTypeEncoding << '\n'; + outs() << "TType Encoding = 0x" << Twine::utohexstr(TTypeEncoding) << '\n'; outs() << "TType End = " << TTypeEnd << '\n'; } @@ -144,9 +175,12 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, // Offset past the last decoded index. intptr_t MaxTypeIndexTableOffset = 0; + // Max positive index used in type table. + unsigned MaxTypeIndex = 0; + // The actual type info table starts at the same location, but grows in // opposite direction. TTypeEncoding is used to encode stored values. 
- auto TypeTableStart = reinterpret_cast(Ptr + TTypeEnd); + const auto TypeTableStart = Ptr + TTypeEnd; uint8_t CallSiteEncoding = *Ptr++; uint32_t CallSiteTableLength = readULEB128(Ptr); @@ -164,9 +198,12 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, HasEHRanges = CallSitePtr < CallSiteTableEnd; uint64_t RangeBase = getAddress(); while (CallSitePtr < CallSiteTableEnd) { - uintptr_t Start = readEncodedPointer(CallSitePtr, CallSiteEncoding); - uintptr_t Length = readEncodedPointer(CallSitePtr, CallSiteEncoding); - uintptr_t LandingPad = readEncodedPointer(CallSitePtr, CallSiteEncoding); + uintptr_t Start = readEncodedPointer(CallSitePtr, CallSiteEncoding, + getFileAddress(CallSitePtr)); + uintptr_t Length = readEncodedPointer(CallSitePtr, CallSiteEncoding, + getFileAddress(CallSitePtr)); + uintptr_t LandingPad = readEncodedPointer(CallSitePtr, CallSiteEncoding, + getFileAddress(CallSitePtr)); uintptr_t ActionEntry = readULEB128(CallSitePtr); if (opts::PrintExceptions) { @@ -220,13 +257,24 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, if (ActionEntry != 0) { auto printType = [&] (int Index, raw_ostream &OS) { assert(Index > 0 && "only positive indices are valid"); - assert(TTypeEncoding == DW_EH_PE_udata4 && - "only udata4 supported for TTypeEncoding"); - auto TypeAddress = *(TypeTableStart - Index); + const uint8_t *TTEntry = TypeTableStart - Index * TTypeEncodingSize; + const auto TTEntryAddress = getFileAddress(TTEntry); + auto TypeAddress = readEncodedPointer(TTEntry, + TTypeEncoding, + TTEntryAddress); + if ((TTypeEncoding & DW_EH_PE_pcrel) && + (TypeAddress == TTEntryAddress)) { + TypeAddress = 0; + } if (TypeAddress == 0) { OS << ""; return; } + if (TTypeEncoding & DW_EH_PE_indirect) { + auto PointerOrErr = BC.extractPointerAtAddress(TypeAddress); + assert(PointerOrErr && "failed to decode indirect address"); + TypeAddress = *PointerOrErr; + } auto NI = BC.GlobalAddresses.find(TypeAddress); if (NI != BC.GlobalAddresses.end()) { OS 
<< NI->second; @@ -251,6 +299,8 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, outs() << "cleanup"; } else if (ActionType > 0) { // It's an index into a type table. + MaxTypeIndex = std::max(MaxTypeIndex, + static_cast(ActionType)); if (opts::PrintExceptions) { outs() << "catch type "; printType(ActionType, outs()); @@ -265,6 +315,7 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, // encoded using uleb128 thus we cannot directly dereference them. auto TypeIndexTablePtr = TypeIndexTableStart - ActionType - 1; while (auto Index = readULEB128(TypeIndexTablePtr)) { + MaxTypeIndex = std::max(MaxTypeIndex, static_cast(Index)); if (opts::PrintExceptions) { outs() << TSep; printType(Index, outs()); @@ -293,9 +344,27 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, if (TTypeEnd) { // TypeIndexTableStart is a alias for TypeTableStart. - LSDAActionAndTypeTables = - ArrayRef(ActionTableStart, - TypeIndexTableStart - ActionTableStart); + LSDAActionTable = + ArrayRef(ActionTableStart, TypeIndexTableStart - + MaxTypeIndex * TTypeEncodingSize - ActionTableStart); + for (unsigned Index = 1; Index <= MaxTypeIndex; ++Index) { + const uint8_t *TTEntry = TypeTableStart - Index * TTypeEncodingSize; + const auto TTEntryAddress = getFileAddress(TTEntry); + auto TypeAddress = readEncodedPointer(TTEntry, + TTypeEncoding, + TTEntryAddress); + if ((TTypeEncoding & DW_EH_PE_pcrel) && + (TypeAddress == TTEntryAddress)) { + TypeAddress = 0; + } + if (TypeAddress && + (TTypeEncoding & DW_EH_PE_indirect)) { + auto PointerOrErr = BC.extractPointerAtAddress(TypeAddress); + assert(PointerOrErr && "failed to decode indirect address"); + TypeAddress = *PointerOrErr; + } + LSDATypeTable.emplace_back(TypeAddress); + } LSDATypeIndexTable = ArrayRef(TypeIndexTableStart, MaxTypeIndexTableOffset); } @@ -446,8 +515,8 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer, bool EmitColdPart) { Streamer->SwitchSection(BC.MOFI->getLSDASection()); - // When we read we make sure 
only the following encoding is supported. - constexpr unsigned TTypeEncoding = dwarf::DW_EH_PE_udata4; + const auto TTypeEncoding = BC.MOFI->getTTypeEncoding(); + const auto TTypeEncodingSize = getEncodingSize(TTypeEncoding, BC); // Type tables have to be aligned at 4 bytes. Streamer->EmitValueToAlignment(4); @@ -470,7 +539,8 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer, bool EmitColdPart) { sizeof(int8_t) + // Call site format CallSiteTableLengthSize + // Call site table length size CallSiteTableLength + // Call site table length - LSDAActionAndTypeTables.size(); // Actions + Types size + LSDAActionTable.size() + // Actions table size + LSDATypeTable.size() * TTypeEncodingSize; // Types table size unsigned TTypeBaseOffsetSize = getULEB128Size(TTypeBaseOffset); unsigned TotalSize = sizeof(int8_t) + // LPStart format @@ -514,12 +584,43 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer, bool EmitColdPart) { // Write out action, type, and type index tables at the end. // - // There's no need to change the original format we saw on input - // unless we are doing a function splitting in which case we can - // perhaps split and optimize the tables. - for (auto const &Byte : LSDAActionAndTypeTables) { + // For action and type index tables there's no need to change the original + // table format unless we are doing function splitting, in which case we can + // split and optimize the tables. + // + // For type table we (re-)encode the table using TTypeEncoding matching + // the current assembler mode. + for (auto const &Byte : LSDAActionTable) { Streamer->EmitIntValue(Byte, 1); } + assert(!(TTypeEncoding & dwarf::DW_EH_PE_indirect) && + "indirect type info encoding is not supported yet"); + for (int Index = LSDATypeTable.size() - 1; Index >= 0; --Index) { + // Note: the address could be an indirect one. 
+ const auto TypeAddress = LSDATypeTable[Index]; + switch (TTypeEncoding & 0x70) { + default: + llvm_unreachable("unsupported TTypeEncoding"); + case 0: + Streamer->EmitIntValue(TypeAddress, TTypeEncodingSize); + break; + case dwarf::DW_EH_PE_pcrel: { + if (TypeAddress) { + const auto *TypeSymbol = BC.getOrCreateGlobalSymbol(TypeAddress, "TI"); + auto *DotSymbol = BC.Ctx->createTempSymbol(); + Streamer->EmitLabel(DotSymbol); + const auto *SubDotExpr = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(TypeSymbol, *BC.Ctx), + MCSymbolRefExpr::create(DotSymbol, *BC.Ctx), + *BC.Ctx); + Streamer->EmitValue(SubDotExpr, TTypeEncodingSize); + } else { + Streamer->EmitIntValue(0, TTypeEncodingSize); + } + break; + } + } + } for (auto const &Byte : LSDATypeIndexTable) { Streamer->EmitIntValue(Byte, 1); } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 5dbb463a2ce8..957cfb547ee1 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -559,7 +559,7 @@ createBinaryContext(ELFObjectFileBase *File, DataReader &DR, std::unique_ptr Ctx = llvm::make_unique(AsmInfo.get(), MRI.get(), MOFI.get()); MOFI->InitMCObjectFileInfo(*TheTriple, Reloc::Default, - CodeModel::Default, *Ctx); + CodeModel::Small, *Ctx); std::unique_ptr DisAsm( TheTarget->createMCDisassembler(*STI, *Ctx)); From c45f13384699d072591a37a1340ee06f4f28e589 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 31 Aug 2017 17:28:14 -0700 Subject: [PATCH 300/904] [BOLT] Fix bug in SCTC Summary: After SCTC optimization fixDoubleJumps() was relying on CFG information on the number of successors of a basic block. It ignored the fact that conditional tail call had a successor outside of the function and deleted a containing basic block. Discovered while testing old HHVM with disabled jump tables. 
(cherry picked from commit 9355176bf48ee661b72d4bd3f9480bd708cc28a7) --- bolt/BinaryPassManager.cpp | 8 ++++---- bolt/Passes/BinaryPasses.cpp | 20 ++++++++++++-------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 24bbed8a3bf1..4d23fdf5bf0f 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -368,6 +368,10 @@ void BinaryFunctionPassManager::runAllPasses( InitialDynoStats, "after all optimizations before SCTC and FOP"), opts::PrintDynoStats | opts::DynoStatsAll); + // Add the StokeInfo pass, which extract functions for stoke optimization and + // get the liveness information for them + Manager.registerPass(llvm::make_unique(PrintStoke), opts::Stoke); + // This pass introduces conditional jumps into external functions. // Between extending CFG to support this and isolating this pass we chose // the latter. Thus this pass will do double jump removal and unreachable @@ -381,10 +385,6 @@ void BinaryFunctionPassManager::runAllPasses( llvm::make_unique(PrintSCTC), opts::SimplifyConditionalTailCalls); - // Add the StokeInfo pass, which extract functions for stoke optimization and - // get the liveness information for them - Manager.registerPass(llvm::make_unique(PrintStoke), opts::Stoke); - // This pass should always run last.* Manager.registerPass(llvm::make_unique(PrintFinalized)); diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index fffc47ec46f1..715acaea048a 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -487,6 +487,10 @@ uint64_t fixDoubleJumps(BinaryContext &BC, if (!BC.MIA->isUnconditionalBranch(*Inst) && !IsTailCall) continue; + // If we operate after SCTC make sure it's not a conditional tail call. 
+ if (IsTailCall && BC.MIA->isConditionalBranch(*Inst)) + continue; + const auto *SuccSym = BC.MIA->getTargetSymbol(*Inst); auto *Succ = BB.getSuccessor(); @@ -517,11 +521,11 @@ uint64_t fixDoubleJumps(BinaryContext &BC, } -bool -SimplifyConditionalTailCalls::shouldRewriteBranch(const BinaryBasicBlock *PredBB, - const MCInst &CondBranch, - const BinaryBasicBlock *BB, - const bool DirectionFlag) { +bool SimplifyConditionalTailCalls::shouldRewriteBranch( + const BinaryBasicBlock *PredBB, + const MCInst &CondBranch, + const BinaryBasicBlock *BB, + const bool DirectionFlag) { const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB); if (IsForward) @@ -564,8 +568,9 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, uint64_t NumLocalCTCs = 0; uint64_t LocalCTCTakenCount = 0; uint64_t LocalCTCExecCount = 0; - std::vector> - NeedsUncondBranch; + std::vector> NeedsUncondBranch; // Will block be deleted by UCE? auto isValid = [](const BinaryBasicBlock *BB) { @@ -733,7 +738,6 @@ void SimplifyConditionalTailCalls::runOnFunctions( if (!shouldOptimize(Function)) continue; - // Fix tail calls to reduce branch mispredictions. if (fixTailCalls(BC, Function)) { Modified.insert(&Function); } From 1de1c56e7cafd87b119381f109b32684ef186aa7 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 13 Sep 2017 11:21:47 -0700 Subject: [PATCH 301/904] [BOLT] Ignore TLS relocations types Summary: No special handling is required for TLS relocations types, and if we see them in the binary we can safely ignore those types. 
(cherry picked from commit 866e0951c01f32bfe1756ddcab2fa80038ff3590) --- bolt/RewriteInstance.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 957cfb547ee1..d023770a4faf 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1529,6 +1529,13 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { << "; type name = " << TypeName << '\n'); + if (Rel.getType() == ELF::R_X86_64_TLSGD || + Rel.getType() == ELF::R_X86_64_TLSLD || + Rel.getType() == ELF::R_X86_64_DTPOFF32) { + DEBUG(dbgs() << "skipping relocation\n"); + continue; + } + // Extract value. uint32_t RelocationOffset = Rel.getOffset() - RelocatedSection.getAddress(); From 549e9973f0940efab996d518b23281087d47f1c9 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 2 Aug 2017 10:59:33 -0700 Subject: [PATCH 302/904] [BOLT] Introduce non-LBR mode Summary: Add support to read profiles collected without LBR. This involves adapting our data aggregator perf2bolt and adding support in llvm-bolt itself to read this data. This patch also introduces different options to convert basic block execution count to edge count, so BOLT can operate with its regular algorithms to perform basic block layout. The most successful approach is the default one. 
(cherry picked from commit 710287ab7369ce16518245f4afe7936fa193adf8) --- bolt/BinaryBasicBlock.cpp | 10 ++ bolt/BinaryBasicBlock.h | 14 +++ bolt/BinaryFunction.cpp | 110 +++++++++++++++++++- bolt/BinaryFunction.h | 16 +++ bolt/BinaryPassManager.h | 3 +- bolt/DataReader.cpp | 166 ++++++++++++++++++++++++++++--- bolt/DataReader.h | 97 +++++++++++++++++- bolt/Passes/CMakeLists.txt | 3 +- bolt/merge-fdata/merge-fdata.cpp | 128 +++++++++++++++++++++--- 9 files changed, 513 insertions(+), 34 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 49eebb8ac5ac..8664f712dd96 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -373,6 +373,16 @@ void BinaryBasicBlock::addTailCallInstruction(const MCSymbol *Target) { Instructions.emplace_back(std::move(NewInst)); } +uint32_t BinaryBasicBlock::getNumCalls() const { + uint32_t N{0}; + auto &BC = Function->getBinaryContext(); + for (auto &Instr : Instructions) { + if (BC.MIA->isCall(Instr)) + ++N; + } + return N; +} + uint32_t BinaryBasicBlock::getNumPseudos() const { #ifndef NDEBUG auto &BC = Function->getBinaryContext(); diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index db610f7b313d..96a56f0e4d30 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -349,6 +349,17 @@ class BinaryBasicBlock { return BranchInfo[Condition == true ? 0 : 1]; }; + BinaryBranchInfo &getBranchInfo(const BinaryBasicBlock &Succ) { + auto BI = branch_info_begin(); + for (auto BB : successors()) { + if (&Succ == BB) + return *BI; + ++BI; + } + llvm_unreachable("Invalid successor"); + return *BI; + } + /// Try to compute the taken and misprediction frequencies for the given /// successor. The result is an error if no information can be found. ErrorOr> @@ -368,6 +379,9 @@ class BinaryBasicBlock { /// to the end of this basic block. void addTailCallInstruction(const MCSymbol *Target); + /// Return the number of call instructions in this basic block. 
+ uint32_t getNumCalls() const; + /// Get landing pad with given label. Returns nullptr if no such /// landing pad is found. BinaryBasicBlock *getLandingPad(const MCSymbol *Label) const; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index a4b6539df462..a3d1b62c1cfe 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -13,6 +13,7 @@ #include "BinaryBasicBlock.h" #include "BinaryFunction.h" #include "DataReader.h" +#include "Passes/MCF.h" #include "Passes/ReorderAlgorithm.h" #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" @@ -129,6 +130,27 @@ SplitEH("split-eh", cl::Hidden, cl::cat(BoltOptCategory)); +cl::opt +DoMCF("mcf", + cl::desc("solve a min cost flow problem on the CFG to fix edge counts " + "(default=disable)"), + cl::init(MCF_DISABLE), + cl::values( + clEnumValN(MCF_DISABLE, "none", + "disable MCF"), + clEnumValN(MCF_LINEAR, "linear", + "cost function is inversely proportional to edge count"), + clEnumValN(MCF_QUADRATIC, "quadratic", + "cost function is inversely proportional to edge count squared"), + clEnumValN(MCF_LOG, "log", + "cost function is inversely proportional to log of edge count"), + clEnumValN(MCF_BLAMEFTS, "blamefts", + "tune cost to blame fall-through edges for surplus flow"), + clEnumValEnd), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + bool shouldPrint(const BinaryFunction &Function) { if (PrintOnly.empty()) return true; @@ -1839,10 +1861,15 @@ bool BinaryFunction::buildCFG() { addLandingPads(0, BasicBlocks.size()); // Infer frequency for non-taken branches - if (hasValidProfile()) + if (hasValidProfile() && opts::DoMCF != MCF_DISABLE) { + // Convert COUNT_NO_PROFILE to 0 + removeTagsFromProfile(); + solveMCF(*this, opts::DoMCF); + } else if (hasValidProfile()) { inferFallThroughCounts(); - else + } else { clearProfile(); + } // Assign CFI information to each BB entry. 
annotateCFIState(); @@ -1875,6 +1902,14 @@ bool BinaryFunction::buildCFG() { // Eliminate inconsistencies between branch instructions and CFG. postProcessBranches(); + // If our profiling data comes from samples instead of LBR entries, + // now is the time to read this data and attach it to BBs. At this point, + // conditional tail calls are converted into a branch and a new basic block, + // making it slightly different than the original binary where profiled data + // was collected. However, this shouldn't matter for plain sampling events. + if (!BC.DR.hasLBR()) + readSampleData(); + // Clean-up memory taken by instructions and labels. // // NB: don't clear Labels list as we may need them if we mark the function @@ -1900,6 +1935,71 @@ bool BinaryFunction::buildCFG() { return true; } +void BinaryFunction::removeTagsFromProfile() { + for (auto *BB : BasicBlocks) { + if (BB->ExecutionCount == BinaryBasicBlock::COUNT_NO_PROFILE) + BB->ExecutionCount = 0; + for (auto &BI : BB->branch_info()) { + if (BI.Count != BinaryBasicBlock::COUNT_NO_PROFILE && + BI.MispredictedCount != BinaryBasicBlock::COUNT_NO_PROFILE) + continue; + BI.Count = 0; + BI.MispredictedCount = 0; + } + } +} + +void BinaryFunction::readSampleData() { + auto SampleDataOrErr = BC.DR.getFuncSampleData(getNames()); + + if (!SampleDataOrErr) + return; + + // Non-LBR mode territory + // First step is to assign BB execution count based on samples from perf + ProfileMatchRatio = 1.0f; + removeTagsFromProfile(); + bool NormalizeByInsnCount = + BC.DR.usesEvent("cycles") || BC.DR.usesEvent("instructions"); + bool NormalizeByCalls = BC.DR.usesEvent("branches"); + static bool NagUser{true}; + if (NagUser) { + outs() << "BOLT-INFO: operating with non-LBR profiling data.\n"; + if (NormalizeByInsnCount) { + outs() << "BOLT-INFO: normalizing samples by instruction count.\n"; + } else if (NormalizeByCalls) { + outs() << "BOLT-INFO: normalizing samples by branches.\n"; + } + NagUser = false; + } + uint64_t LastOffset = 
getSize(); + uint64_t TotalEntryCount{0}; + for (auto I = BasicBlockOffsets.rbegin(), E = BasicBlockOffsets.rend(); + I != E; ++I) { + uint64_t CurOffset = I->first; + // Always work with samples multiplied by 1000 to avoid losing them if we + // later need to normalize numbers + uint64_t NumSamples = + SampleDataOrErr->getSamples(CurOffset, LastOffset) * 1000; + if (NormalizeByInsnCount && I->second->getNumNonPseudos()) + NumSamples /= I->second->getNumNonPseudos(); + else if (NormalizeByCalls) { + uint32_t NumCalls = I->second->getNumCalls(); + NumSamples /= NumCalls + 1; + } + I->second->setExecutionCount(NumSamples); + if (I->second->isEntryPoint()) + TotalEntryCount += NumSamples; + LastOffset = CurOffset; + } + ExecutionCount = TotalEntryCount; + + estimateEdgeCounts(BC, *this); + + if (opts::DoMCF != MCF_DISABLE) + solveMCF(*this, opts::DoMCF); +} + void BinaryFunction::addEntryPoint(uint64_t Address) { assert(containsAddress(Address) && "address does not belong to the function"); @@ -1987,6 +2087,12 @@ bool BinaryFunction::fetchProfileForOtherEntryPoints() { } void BinaryFunction::matchProfileData() { + // This functionality is available for LBR-mode only + // TODO: Implement evaluateProfileData() for samples, checking whether + // sample addresses match instruction addresses in the function + if (!BC.DR.hasLBR()) + return; + if (BranchData) { ProfileMatchRatio = evaluateProfileData(*BranchData); if (ProfileMatchRatio == 1.0f) { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 81b3ea20b90a..6f8632b4b80b 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -1763,6 +1763,18 @@ class BinaryFunction { /// using profile information. 
void removeConditionalTailCalls(); + // Convert COUNT_NO_PROFILE to 0 + void removeTagsFromProfile(); + + /// If our profile data comes from sample addresses instead of LBR entries, + /// collect sample count for all addresses in this function address space, + /// aggregating them per basic block and assigning an execution count to each + /// basic block based on the number of samples recorded at those addresses. + /// The last step is to infer edge counts based on BB execution count. Note + /// this is the opposite of the LBR way, where we infer BB execution count + /// based on edge counts. + void readSampleData(); + /// Computes a function hotness score: the sum of the products of BB frequency /// and size. uint64_t getFunctionScore(); @@ -2038,9 +2050,11 @@ template <> struct GraphTraits : typedef bolt::BinaryBasicBlock * nodes_iterator; static nodes_iterator nodes_begin(bolt::BinaryFunction *F) { + llvm_unreachable("Not implemented"); return &(*F->begin()); } static nodes_iterator nodes_end(bolt::BinaryFunction *F) { + llvm_unreachable("Not implemented"); return &(*F->end()); } static size_t size(bolt::BinaryFunction *F) { @@ -2056,9 +2070,11 @@ template <> struct GraphTraits : typedef const bolt::BinaryBasicBlock * nodes_iterator; static nodes_iterator nodes_begin(const bolt::BinaryFunction *F) { + llvm_unreachable("Not implemented"); return &(*F->begin()); } static nodes_iterator nodes_end(const bolt::BinaryFunction *F) { + llvm_unreachable("Not implemented"); return &(*F->end()); } static size_t size(const bolt::BinaryFunction *F) { diff --git a/bolt/BinaryPassManager.h b/bolt/BinaryPassManager.h index f744b71d5de0..ceacc33cdca2 100644 --- a/bolt/BinaryPassManager.h +++ b/bolt/BinaryPassManager.h @@ -31,9 +31,10 @@ class BinaryFunctionPassManager { std::set &LargeFunctions; std::vector>> Passes; - static const char TimerGroupName[]; public: + static const char TimerGroupName[]; + BinaryFunctionPassManager(BinaryContext &BC, std::map &BFs, std::set 
&LargeFunctions) diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index 5031b27657f7..1112e748af28 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -78,6 +78,36 @@ void FuncBranchData::appendFrom(const FuncBranchData &FBD, uint64_t Offset) { } } +void SampleInfo::mergeWith(const SampleInfo &SI) { + Occurrences += SI.Occurrences; +} + +void SampleInfo::print(raw_ostream &OS) const { + OS << Address.IsSymbol << " " << Address.Name << " " + << Twine::utohexstr(Address.Offset) << " " + << Occurrences << "\n"; +} + +uint64_t +FuncSampleData::getSamples(uint64_t Start, uint64_t End) const { + assert(std::is_sorted(Data.begin(), Data.end())); + struct Compare { + bool operator()(const SampleInfo &SI, const uint64_t Val) const { + return SI.Address.Offset < Val; + } + bool operator()(const uint64_t Val, const SampleInfo &SI) const { + return Val < SI.Address.Offset; + } + }; + uint64_t Result{0}; + for (auto I = std::lower_bound(Data.begin(), Data.end(), Start, Compare()), + E = std::lower_bound(Data.begin(), Data.end(), End, Compare()); + I != E; ++I) { + Result += I->Occurrences; + } + return Result; +} + void BranchInfo::mergeWith(const BranchInfo &BI) { // Merge branch and misprediction counts. 
@@ -406,6 +436,48 @@ ErrorOr DataReader::parseBranchInfo() { std::move(Histories)); } +ErrorOr DataReader::parseSampleInfo() { + auto Res = parseLocation(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + Location Address = Res.get(); + + auto BRes = parseNumberField(FieldSeparator, /* EndNl = */ true); + if (std::error_code EC = BRes.getError()) + return EC; + int64_t Occurrences = BRes.get(); + + if (!checkAndConsumeNewLine()) { + reportError("expected end of line"); + return make_error_code(llvm::errc::io_error); + } + + return SampleInfo(std::move(Address), Occurrences); +} + +ErrorOr DataReader::maybeParseNoLBRFlag() { + if (ParsingBuf.size() < 6 || ParsingBuf.substr(0, 6) != "no_lbr") + return false; + ParsingBuf = ParsingBuf.drop_front(6); + Col += 6; + + if (ParsingBuf.size() > 0 && ParsingBuf[0] == ' ') + ParsingBuf = ParsingBuf.drop_front(1); + + while (ParsingBuf.size() > 0 && ParsingBuf[0] != '\n') { + auto EventName = parseString(' ', true); + if (!EventName) + return make_error_code(llvm::errc::io_error); + EventNames.insert(EventName.get()); + } + + if (!checkAndConsumeNewLine()) { + reportError("malformed no_lbr line"); + return make_error_code(llvm::errc::io_error); + } + return true; +} + bool DataReader::hasData() { if (ParsingBuf.size() == 0) return false; @@ -415,12 +487,48 @@ bool DataReader::hasData() { return false; } +std::error_code DataReader::parseInNoLBRMode() { + auto GetOrCreateFuncEntry = [&](StringRef Name) { + auto I = FuncsToSamples.find(Name); + if (I == FuncsToSamples.end()) { + bool success; + std::tie(I, success) = FuncsToSamples.insert(std::make_pair( + Name, FuncSampleData(Name, FuncSampleData::ContainerTy()))); + + assert(success && "unexpected result of insert"); + } + return I; + }; + + while (hasData()) { + auto Res = parseSampleInfo(); + if (std::error_code EC = Res.getError()) + return EC; + + SampleInfo SI = Res.get(); + + // Ignore samples not involving known locations + if 
(!SI.Address.IsSymbol) + continue; + + auto I = GetOrCreateFuncEntry(SI.Address.Name); + I->getValue().Data.emplace_back(std::move(SI)); + } + + for (auto &FuncSamples : FuncsToSamples) { + std::stable_sort(FuncSamples.second.Data.begin(), + FuncSamples.second.Data.end()); + } + + return std::error_code(); +} + std::error_code DataReader::parse() { auto GetOrCreateFuncEntry = [&](StringRef Name) { - auto I = FuncsMap.find(Name); - if (I == FuncsMap.end()) { + auto I = FuncsToBranches.find(Name); + if (I == FuncsToBranches.end()) { bool success; - std::tie(I, success) = FuncsMap.insert( + std::tie(I, success) = FuncsToBranches.insert( std::make_pair(Name, FuncBranchData(Name, FuncBranchData::ContainerTy(), FuncBranchData::ContainerTy()))); @@ -431,6 +539,13 @@ std::error_code DataReader::parse() { Col = 0; Line = 1; + auto FlagOrErr = maybeParseNoLBRFlag(); + if (!FlagOrErr) + return FlagOrErr.getError(); + NoLBRMode = *FlagOrErr; + if (NoLBRMode) + return parseInNoLBRMode(); + while (hasData()) { auto Res = parseBranchInfo(); if (std::error_code EC = Res.getError()) @@ -462,7 +577,7 @@ std::error_code DataReader::parse() { } } - for (auto &FuncBranches : FuncsMap) { + for (auto &FuncBranches : FuncsToBranches) { std::stable_sort(FuncBranches.second.Data.begin(), FuncBranches.second.Data.end()); } @@ -471,7 +586,7 @@ std::error_code DataReader::parse() { } void DataReader::buildLTONameMap() { - for (auto &FuncData : FuncsMap) { + for (auto &FuncData : FuncsToBranches) { const auto FuncName = FuncData.getKey(); const auto CommonName = getLTOCommonName(FuncName); if (CommonName) @@ -479,17 +594,30 @@ void DataReader::buildLTONameMap() { } } -FuncBranchData * -DataReader::getFuncBranchData(const std::vector &FuncNames) { +namespace { +template +decltype(MapTy::MapEntryTy::second) * +fetchMapEntry(MapTy &Map, const std::vector &FuncNames) { // Do a reverse order iteration since the name in profile has a higher chance // of matching a name at the end of the list. 
for (auto FI = FuncNames.rbegin(), FE = FuncNames.rend(); FI != FE; ++FI) { - auto I = FuncsMap.find(normalizeName(*FI)); - if (I != FuncsMap.end()) + auto I = Map.find(normalizeName(*FI)); + if (I != Map.end()) return &I->getValue(); } return nullptr; } +} + +FuncBranchData * +DataReader::getFuncBranchData(const std::vector &FuncNames) { + return fetchMapEntry(FuncsToBranches, FuncNames); +} + +FuncSampleData * +DataReader::getFuncSampleData(const std::vector &FuncNames) { + return fetchMapEntry(FuncsToSamples, FuncNames); +} std::vector DataReader::getFuncBranchDataRegex(const std::vector &FuncNames) { @@ -507,8 +635,8 @@ DataReader::getFuncBranchDataRegex(const std::vector &FuncNames) { AllData.insert(AllData.end(), CommonData.begin(), CommonData.end()); } } else { - auto I = FuncsMap.find(Name); - if (I != FuncsMap.end()) { + auto I = FuncsToBranches.find(Name); + if (I != FuncsToBranches.end()) { return {&I->getValue()}; } } @@ -517,7 +645,7 @@ DataReader::getFuncBranchDataRegex(const std::vector &FuncNames) { } bool DataReader::hasLocalsWithFileName() const { - for (const auto &Func : FuncsMap) { + for (const auto &Func : FuncsToBranches) { const auto &FuncName = Func.getKey(); if (FuncName.count('/') == 2 && FuncName[0] != '/') return true; @@ -526,7 +654,7 @@ bool DataReader::hasLocalsWithFileName() const { } void DataReader::dump() const { - for (const auto &Func : FuncsMap) { + for (const auto &Func : FuncsToBranches) { Diag << Func.getKey() << " branches:\n"; for (const auto &BI : Func.getValue().Data) { Diag << BI.From.Name << " " << BI.From.Offset << " " << BI.To.Name << " " @@ -552,6 +680,18 @@ void DataReader::dump() const { } } } + + for (auto I = EventNames.begin(), E = EventNames.end(); I != E; ++I) { + StringRef Event = I->getKey(); + Diag << "Data was collected with event: " << Event << "\n"; + } + for (const auto &Func : FuncsToSamples) { + Diag << Func.getKey() << " samples:\n"; + for (const auto &SI : Func.getValue().Data) { + Diag << 
SI.Address.Name << " " << SI.Address.Offset << " " + << SI.Occurrences << "\n"; + } + } } } // namespace bolt diff --git a/bolt/DataReader.h b/bolt/DataReader.h index 483b3c966ffd..b667fe186308 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -18,6 +18,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringSet.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Errc.h" #include "llvm/Support/ErrorOr.h" @@ -163,6 +164,47 @@ struct FuncBranchData { void appendFrom(const FuncBranchData &FBD, uint64_t Offset); }; +/// Similar to BranchInfo, but instead of recording from-to address (an edge), +/// it records the address of a perf event and the number of times samples hit +/// this address. +struct SampleInfo { + Location Address; // FIXME: Change this name to Loc + int64_t Occurrences; // FIXME: Variable name is horrible + + SampleInfo(Location Address, int64_t Occurrences) + : Address(std::move(Address)), Occurrences(Occurrences) {} + + bool operator==(const SampleInfo &RHS) const { + return Address == RHS.Address; + } + + bool operator<(const SampleInfo &RHS) const { + if (Address < RHS.Address) + return true; + + return false; + } + + void print(raw_ostream &OS) const; + + void mergeWith(const SampleInfo &SI); +}; + +/// Helper class to store samples recorded in the address space of a given +/// function, analogous to FuncBranchData but for samples instead of branches. +struct FuncSampleData { + typedef std::vector ContainerTy; + + StringRef Name; + ContainerTy Data; + + FuncSampleData(StringRef Name, ContainerTy Data) + : Name(Name), Data(std::move(Data)) {} + + /// Get the number of samples recorded in [Start, End) + uint64_t getSamples(uint64_t Start, uint64_t End) const; +}; + //===----------------------------------------------------------------------===// // /// DataReader Class @@ -222,21 +264,45 @@ class DataReader { /// offset d. 
std::error_code parse(); + /// When no_lbr is the first line of the file, activate No LBR mode. In this + /// mode we read the addresses where samples were recorded directly instead of + /// LBR entries. The line format is almost the same, except for a missing + /// triple and a missing mispredictions field: + /// + /// no_lbr + /// + /// ... + /// + /// Example: + /// + /// no_lbr # First line of fdata file + /// 1 BZ2_compressBlock 466c 3 + /// 1 BZ2_hbMakeCodeLengths 29c 1 + /// + std::error_code parseInNoLBRMode(); + /// Return branch data matching one of the names in \p FuncNames. FuncBranchData * getFuncBranchData(const std::vector &FuncNames); + FuncSampleData * + getFuncSampleData(const std::vector &FuncNames); + /// Return a vector of all FuncBranchData matching the list of names. /// Internally use fuzzy matching to match special names like LTO-generated /// function names. std::vector getFuncBranchDataRegex(const std::vector &FuncNames); - using FuncsMapType = StringMap; + using FuncsToBranchesMapTy = StringMap; + using FuncsToSamplesMapTy = StringMap; - FuncsMapType &getAllFuncsData() { return FuncsMap; } + FuncsToBranchesMapTy &getAllFuncsBranchData() { return FuncsToBranches; } + FuncsToSamplesMapTy &getAllFuncsSampleData() { return FuncsToSamples; } - const FuncsMapType &getAllFuncsData() const { return FuncsMap; } + const FuncsToBranchesMapTy &getAllFuncsData() const { + return FuncsToBranches; + } /// Return true if profile contains an entry for a local function /// that has a non-empty associated file name. @@ -245,6 +311,24 @@ class DataReader { /// Dumps the entire data structures parsed. Used for debugging. void dump() const; + /// Return false only if we are running with profiling data that lacks LBR. + bool hasLBR() const { return !NoLBRMode; } + + /// Return true if event named \p Name was used to collect this profile data. 
+ bool usesEvent(StringRef Name) const { + for (auto I = EventNames.begin(), E = EventNames.end(); I != E; ++I) { + StringRef Event = I->getKey(); + if (Event.find(Name) != StringRef::npos) + return true; + } + return false; + } + + /// Return all event names used to collect this profile + const StringSet<> &getEventNames() const { + return EventNames; + } + private: void reportError(StringRef ErrorMsg); @@ -255,6 +339,8 @@ class DataReader { ErrorOr parseLocation(char EndChar, bool EndNl=false); ErrorOr parseBranchHistory(); ErrorOr parseBranchInfo(); + ErrorOr parseSampleInfo(); + ErrorOr maybeParseNoLBRFlag(); bool hasData(); /// Build suffix map once the profile data is parsed. @@ -266,7 +352,10 @@ class DataReader { StringRef ParsingBuf; unsigned Line; unsigned Col; - FuncsMapType FuncsMap; + FuncsToBranchesMapTy FuncsToBranches; + FuncsToSamplesMapTy FuncsToSamples; + bool NoLBRMode; + StringSet<> EventNames; static const char FieldSeparator = ' '; /// Map of common LTO names to possible matching profiles. diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt index 9bcd9e72bb85..8619ad21f0c4 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/Passes/CMakeLists.txt @@ -13,8 +13,9 @@ add_llvm_library(LLVMBOLTPasses IndirectCallPromotion.cpp Inliner.cpp LivenessAnalysis.cpp - PLTCall.cpp + MCF.cpp PettisAndHansen.cpp + PLTCall.cpp RegAnalysis.cpp ReorderAlgorithm.cpp ReorderFunctions.cpp diff --git a/bolt/merge-fdata/merge-fdata.cpp b/bolt/merge-fdata/merge-fdata.cpp index 8e847ca3a3ad..8a0a17fa5841 100644 --- a/bolt/merge-fdata/merge-fdata.cpp +++ b/bolt/merge-fdata/merge-fdata.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "../DataReader.h" +#include "llvm/ADT/StringSet.h" #include "llvm/Object/Binary.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/PrettyStackTrace.h" @@ -91,7 +92,9 @@ int main(int argc, char **argv) { ToolName = argv[0]; // All merged data. 
- DataReader::FuncsMapType MergedFunctionsData; + DataReader::FuncsToBranchesMapTy MergedFunctionsBranchData; + DataReader::FuncsToSamplesMapTy MergedFunctionsSampleData; + StringSet<> EventNames; // Merged functions data has to replace strings refs with strings from the // pool. @@ -140,9 +143,20 @@ int main(int argc, char **argv) { AllStrings.emplace_back(ToNamePtr); // keep the reference }; + auto CopySampleInfo = [&](const SampleInfo &SI, + std::vector &SIData) { + auto NamePtr = MergedStringPool.intern(SI.Address.Name); + BranchHistories Histories; + SIData.emplace_back(SampleInfo(Location(SI.Address.IsSymbol, + *NamePtr, + SI.Address.Offset), + SI.Occurrences)); + AllStrings.emplace_back(NamePtr); // keep the reference + }; + // Simply replace string references in BranchInfo with internal storage // references. - auto replaceStringRefs = [&] (BranchInfo &BI) { + auto replaceBIStringRefs = [&] (BranchInfo &BI) { auto FromNamePtr = MergedStringPool.intern(BI.From.Name); BI.From.Name = *FromNamePtr; AllStrings.emplace_back(FromNamePtr); // keep the reference @@ -163,6 +177,12 @@ int main(int argc, char **argv) { } }; + auto replaceSIStringRefs = [&] (SampleInfo &SI) { + auto NamePtr = MergedStringPool.intern(SI.Address.Name); + SI.Address.Name = *NamePtr; + AllStrings.emplace_back(NamePtr); // keep the reference + }; + for (auto &InputDataFilename : opts::InputDataFilenames) { if (!sys::fs::exists(InputDataFilename)) report_error(InputDataFilename, errc::no_such_file_or_directory); @@ -175,9 +195,17 @@ int main(int argc, char **argv) { if (std::error_code EC = ReaderOrErr.getError()) report_error(InputDataFilename, EC); - for (auto &FI : ReaderOrErr.get()->getAllFuncsData()) { - auto MI = MergedFunctionsData.find(FI.second.Name); - if (MI != MergedFunctionsData.end()) { + if ((ReaderOrErr.get()->hasLBR() && MergedFunctionsSampleData.size() > 0) || + (!ReaderOrErr.get()->hasLBR() && + MergedFunctionsBranchData.size() > 0)) { + errs() << "Cannot merge LBR profile 
with non-LBR " + "profile\n"; + return EXIT_FAILURE; + } + + for (auto &FI : ReaderOrErr.get()->getAllFuncsBranchData()) { + auto MI = MergedFunctionsBranchData.find(FI.second.Name); + if (MI != MergedFunctionsBranchData.end()) { MI->second.ExecutionCount += FI.second.ExecutionCount; std::vector TmpBI; for (auto &BI : FI.second.Data) { @@ -186,7 +214,7 @@ int main(int argc, char **argv) { MI->second.Data.end(), BI); if (TI != MI->second.Data.end() && *TI == BI) { - replaceStringRefs(BI); + replaceBIStringRefs(BI); TI->mergeWith(BI); } else { CopyBranchInfo(BI, TmpBI); @@ -208,7 +236,7 @@ int main(int argc, char **argv) { auto NamePtr = MergedStringPool.intern(FI.second.Name); AllStrings.emplace_back(NamePtr); // keep the ref bool Success; - std::tie(MI, Success) = MergedFunctionsData.insert( + std::tie(MI, Success) = MergedFunctionsBranchData.insert( std::make_pair(*NamePtr, FuncBranchData(*NamePtr, FuncBranchData::ContainerTy()))); @@ -218,7 +246,7 @@ int main(int argc, char **argv) { BranchInfo *PrevBI = nullptr; for (auto &BI : FI.second.Data) { if (PrevBI && *PrevBI == BI) { - replaceStringRefs(BI); + replaceBIStringRefs(BI); PrevBI->mergeWith(BI); } else { CopyBranchInfo(BI, MI->second.Data); @@ -227,24 +255,98 @@ int main(int argc, char **argv) { } } } + + for (auto NameIter = ReaderOrErr.get()->getEventNames().begin(), + End = ReaderOrErr.get()->getEventNames().end(); + NameIter != End; ++NameIter) { + auto NamePtr = MergedStringPool.intern(NameIter->getKey()); + EventNames.insert(*NamePtr); + } + + for (auto &FI : ReaderOrErr.get()->getAllFuncsSampleData()) { + auto MI = MergedFunctionsSampleData.find(FI.second.Name); + if (MI != MergedFunctionsSampleData.end()) { + std::vector TmpSI; + for (auto &SI : FI.second.Data) { + // Find and merge a corresponding entry or copy data. 
+ auto TI = std::lower_bound(MI->second.Data.begin(), + MI->second.Data.end(), + SI); + if (TI != MI->second.Data.end() && *TI == SI) { + replaceSIStringRefs(SI); + TI->mergeWith(SI); + } else { + CopySampleInfo(SI, TmpSI); + } + } + // Merge in the temp vector making sure it doesn't contain duplicates. + std::sort(TmpSI.begin(), TmpSI.end()); + SampleInfo *PrevSI = nullptr; + for (auto &SI : TmpSI) { + if (PrevSI && *PrevSI == SI) { + PrevSI->mergeWith(SI); + } else { + MI->second.Data.emplace_back(SI); + PrevSI = &MI->second.Data.back(); + } + } + std::sort(MI->second.Data.begin(), MI->second.Data.end()); + } else { + auto NamePtr = MergedStringPool.intern(FI.second.Name); + AllStrings.emplace_back(NamePtr); // keep the ref + bool Success; + std::tie(MI, Success) = MergedFunctionsSampleData.insert( + std::make_pair(*NamePtr, + FuncSampleData(*NamePtr, + FuncSampleData::ContainerTy()))); + // Copy with string conversion while eliminating duplicates. + std::sort(FI.second.Data.begin(), FI.second.Data.end()); + SampleInfo *PrevSI = nullptr; + for (auto &SI : FI.second.Data) { + if (PrevSI && *PrevSI == SI) { + replaceSIStringRefs(SI); + PrevSI->mergeWith(SI); + } else { + CopySampleInfo(SI, MI->second.Data); + PrevSI = &MI->second.Data.back(); + } + } + } + } } if (!opts::SuppressMergedDataOutput) { // Print all the data in the original format - for (const auto &FDI : MergedFunctionsData) { + // Print mode + if (MergedFunctionsSampleData.size() > 0) { + outs() << "no_lbr"; + for (auto NameIter = EventNames.begin(), End = EventNames.end(); + NameIter != End; ++NameIter) { + outs() << " " << NameIter->getKey(); + } + outs() << "\n"; + } + for (const auto &FDI : MergedFunctionsBranchData) { for (const auto &BD : FDI.second.Data) { BD.print(outs()); } } + for (const auto &FDI : MergedFunctionsSampleData) { + for (const auto &SD : FDI.second.Data) { + SD.print(outs()); + } + } } - errs() << "Data for " << MergedFunctionsData.size() + errs() << "Data for " + << 
(MergedFunctionsBranchData.size() + + MergedFunctionsSampleData.size()) << " unique objects successfully merged.\n"; if (opts::PrintFunctionList != opts::ST_NONE) { // List of function names with execution count. std::vector> - FunctionList(MergedFunctionsData.size()); + FunctionList(MergedFunctionsBranchData.size()); using CountFuncType = std::function( const StringMapEntry&)>; @@ -264,8 +366,8 @@ int main(int argc, char **argv) { CountFuncType CountFunc = (opts::PrintFunctionList == opts::ST_EXEC_COUNT) ? ExecCountFunc : BranchCountFunc; - std::transform(MergedFunctionsData.begin(), - MergedFunctionsData.end(), + std::transform(MergedFunctionsBranchData.begin(), + MergedFunctionsBranchData.end(), FunctionList.begin(), CountFunc); std::stable_sort(FunctionList.rbegin(), FunctionList.rend()); From 132536baaec0d77b96ff66118cbea1cb0a2d0e5e Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 18 Sep 2017 16:26:00 -0700 Subject: [PATCH 303/904] [BOLT] Fix frameopt=all for gcc Summary: Fix two bugs. First, stack pointer tracking, the dataflow analysis, was converging to the "superposition" state (meaning that at this point there are multiple and conflicting states) too early in case the entry state in the BB was "empty" AND there was an SP computation in the block. In these cases, we need to propagate an "empty" value as well and wait for an iteration where the input is not empty (only entry BBs start with a non-empty well-defined value). Previously, it was propagating "superposition", meaning there is a conflict of states in this block, which is not true, since the input is empty and, therefore, there is no preceding state to justify a collision of states. Second, if SPT failed and has no idea about the stack values in a block (if it is in the superposition state at a given point in a BB), shrink wrapping should not attempt to insert computation into those blocks that we do not understand what is happening. Fix it to bail on those cases. 
(cherry picked from commit 6243ca36d7a2b97969e1daf914842d6c96ee558b) --- bolt/Passes/ShrinkWrapping.cpp | 15 ++++++++++++++- bolt/Passes/StackPointerTracking.h | 10 ++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/bolt/Passes/ShrinkWrapping.cpp b/bolt/Passes/ShrinkWrapping.cpp index 99afc5e4312c..abbc0fc6c6a5 100644 --- a/bolt/Passes/ShrinkWrapping.cpp +++ b/bolt/Passes/ShrinkWrapping.cpp @@ -696,6 +696,7 @@ void ShrinkWrapping::computeSaveLocations() { SavePos = std::vector>(BC.MRI->getNumRegs()); auto &RI = Info.getReachingInsnsBackwards(); auto &DA = Info.getDominatorAnalysis(); + auto &SPT = Info.getStackPointerTracking(); DEBUG(dbgs() << "Checking save/restore possibilities\n"); for (auto &BB : BF) { @@ -710,6 +711,12 @@ void ShrinkWrapping::computeSaveLocations() { if (RI.isInLoop(BB)) continue; + const auto SPFP = *SPT.getStateBefore(*First); + // If we don't know stack state at this point, bail + if ((SPFP.first == SPT.SUPERPOSITION || SPFP.first == SPT.EMPTY) && + (SPFP.second == SPT.SUPERPOSITION || SPFP.second == SPT.EMPTY)) + continue; + for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) { if (!CSA.CalleeSaved[I]) continue; @@ -1144,9 +1151,15 @@ void ShrinkWrapping::moveSaveRestores() { auto FIELoad = CSA.LoadFIEByReg[I]; assert(FIESave && FIELoad); auto &SPT = Info.getStackPointerTracking(); - auto SaveOffset = SPT.getStateBefore(*BestPosSave)->first; + const auto SPFP = *SPT.getStateBefore(*BestPosSave); + auto SaveOffset = SPFP.first; auto SaveSize = FIESave->Size; + // If we don't know stack state at this point, bail + if ((SPFP.first == SPT.SUPERPOSITION || SPFP.first == SPT.EMPTY) && + (SPFP.second == SPT.SUPERPOSITION || SPFP.second == SPT.EMPTY)) + continue; + // Operation mode: if true, will insert push/pops instead of loads/restores bool UsePushPops = validatePushPopsMode(I, BestPosSave, SaveOffset); diff --git a/bolt/Passes/StackPointerTracking.h b/bolt/Passes/StackPointerTracking.h index 
ce0cd26bbc5e..46cc0facae61 100644 --- a/bolt/Passes/StackPointerTracking.h +++ b/bolt/Passes/StackPointerTracking.h @@ -123,8 +123,11 @@ class StackPointerTrackingBase else FP = std::make_pair(0, 0); int64_t Output; - if (!MIA->evaluateSimple(Point, Output, SP, FP)) + if (!MIA->evaluateSimple(Point, Output, SP, FP)) { + if (SPVal == EMPTY && FPVal == EMPTY) + return SPVal; return SUPERPOSITION; + } return static_cast(Output); } @@ -155,8 +158,11 @@ class StackPointerTrackingBase else SP = std::make_pair(0, 0); int64_t Output; - if (!MIA->evaluateSimple(Point, Output, SP, FP)) + if (!MIA->evaluateSimple(Point, Output, SP, FP)) { + if (SPVal == EMPTY && FPVal == EMPTY) + return FPVal; return SUPERPOSITION; + } if (!HasFramePointer) { if (MIA->escapesVariable(Point, *this->BC.MRI, false)) { From f30ab2e82f9147345e86de96826f68d3ab501a03 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 20 Sep 2017 13:32:46 -0700 Subject: [PATCH 304/904] [BOLT] Fix issue with exception handlers splitting Summary: A cold part of a function can start with a landing pad. As a result, this landing pad will have offset 0 from the start of the corresponding FDE, and it wouldn't get registered by exception-handling runtime. The solution is to use a different landing pad base address (LPStart), such as (FDE_start - 1). (cherry picked from commit af1c61ab61e0e68be4ab09749a3c1916fb0c099f) --- bolt/Exceptions.cpp | 65 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 14 deletions(-) diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 51021cfab77f..e4daa4632e62 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -28,6 +28,7 @@ #include "llvm/Support/LEB128.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include #include #undef DEBUG_TYPE @@ -494,13 +495,11 @@ void BinaryFunction::updateEHRanges() { // The code is based on EHStreamer::emitExceptionTable(). 
void BinaryFunction::emitLSDA(MCStreamer *Streamer, bool EmitColdPart) { const auto *Sites = EmitColdPart ? &ColdCallSites : &CallSites; - - auto *StartSymbol = EmitColdPart ? getColdSymbol() : getSymbol(); - if (Sites->empty()) { return; } + // Calculate callsite table size. Size of each callsite entry is: // // sizeof(start) + sizeof(length) + sizeof(LP) + sizeof(uleb128(action)) @@ -526,8 +525,53 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer, bool EmitColdPart) { assert(LSDASymbol && "no LSDA symbol set"); Streamer->EmitLabel(LSDASymbol); + // Corresponding FDE start. + const auto *StartSymbol = EmitColdPart ? getColdSymbol() : getSymbol(); + // Emit the LSDA header. - Streamer->EmitIntValue(dwarf::DW_EH_PE_omit, 1); // LPStart format + + // If LPStart is omitted, then the start of the FDE is used as a base for + // landing pad displacements. Then if a cold fragment starts with + // a landing pad, this means that the first landing pad offset will be 0. + // As a result, an exception handling runtime will ignore this landing pad, + // because zero offset denotes the absence of a landing pad. + // + // To workaround this issue, we issue a special LPStart for cold fragments + // that is equal to FDE start minus 1 byte. + // + // Note that main function fragment cannot start with a landing pad and we + // omit LPStart. 
+ const MCExpr *LPStartExpr = nullptr; + std::function emitLandingPad; + if (EmitColdPart) { + Streamer->EmitIntValue(dwarf::DW_EH_PE_udata4, 1); // LPStart format + LPStartExpr = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(StartSymbol, *BC.Ctx.get()), + MCConstantExpr::create(1, *BC.Ctx.get()), + *BC.Ctx.get()); + Streamer->EmitValue(LPStartExpr, 4); + emitLandingPad = [&](const MCSymbol *LPSymbol) { + if (!LPSymbol) { + Streamer->EmitIntValue(0, 4); + return; + } + Streamer->EmitValue(MCBinaryExpr::createSub( + MCSymbolRefExpr::create(LPSymbol, *BC.Ctx.get()), + LPStartExpr, + *BC.Ctx.get()), + 4); + }; + } else { + Streamer->EmitIntValue(dwarf::DW_EH_PE_omit, 1); // LPStart format + emitLandingPad = [&](const MCSymbol *LPSymbol) { + if (!LPSymbol) { + Streamer->EmitIntValue(0, 4); + return; + } + Streamer->emitAbsoluteSymbolDiff(LPSymbol, StartSymbol, 4); + }; + } + Streamer->EmitIntValue(TTypeEncoding, 1); // TType format // See the comment in EHStreamer::emitExceptionTable() on to use @@ -561,8 +605,8 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer, bool EmitColdPart) { for (const auto &CallSite : *Sites) { - const MCSymbol *BeginLabel = CallSite.Start; - const MCSymbol *EndLabel = CallSite.End; + const auto *BeginLabel = CallSite.Start; + const auto *EndLabel = CallSite.End; assert(BeginLabel && "start EH label expected"); assert(EndLabel && "end EH label expected"); @@ -571,14 +615,7 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer, bool EmitColdPart) { // function split part. Streamer->emitAbsoluteSymbolDiff(BeginLabel, StartSymbol, 4); Streamer->emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 4); - - if (!CallSite.LP) { - Streamer->EmitIntValue(0, 4); - } else { - // Difference can get negative if the handler is in hot part. 
- Streamer->emitAbsoluteSymbolDiff(CallSite.LP, StartSymbol, 4); - } - + emitLandingPad(CallSite.LP); Streamer->EmitULEB128IntValue(CallSite.Action); } From 31660a4613fd96e805e75e6e9043c06462c1cfc9 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 19 Sep 2017 16:59:05 -0700 Subject: [PATCH 305/904] [BOLT] Fix SCTC bug Summary: If conditional branch has been converted to conditional tail call, it may be considered for SCTC optimization later since it will appear as a tail call. We have to make sure that the tail call we are considering is not a conditional branch. (cherry picked from commit 903db56eea1766919126be2e3e27b0ffa1f8c44d) --- bolt/Passes/BinaryPasses.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 715acaea048a..559e979b38f5 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -585,7 +585,7 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, continue; auto *Instr = BB->getFirstNonPseudoInstr(); - if (!MIA->isTailCall(*Instr)) + if (!MIA->isTailCall(*Instr) || BC.MIA->isConditionalBranch(*Instr)) continue; auto *CalleeSymbol = MIA->getTargetSymbol(*Instr); From 5ad9b848732119d49edf0588b2e4bf0a27afeec9 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 1 Sep 2017 18:13:51 -0700 Subject: [PATCH 306/904] [BOLT] Integrate perf2bolt into llvm-bolt Summary: Move the data aggregator logic from our python script to our C++ LLVM/BOLT libs. This has a dramatic reduction in processing time for profiling data (from 45 minutes for HHVM to 5 minutes) because we directly use BOLT as a disassembler in order to validate traces found in the LBR and to add the fallthrough counts. Previously, the python approach relied on parsing the output objdump to check traces. 
(cherry picked from commit eaa79f52c3a137f28354df41f83c336eb0bddb2e) --- bolt/BinaryFunction.cpp | 46 +++ bolt/BinaryFunction.h | 13 + bolt/CMakeLists.txt | 3 + bolt/DataAggregator.cpp | 648 +++++++++++++++++++++++++++++++++++++++ bolt/DataAggregator.h | 197 ++++++++++++ bolt/DataReader.cpp | 47 +++ bolt/DataReader.h | 36 ++- bolt/RewriteInstance.cpp | 42 ++- bolt/RewriteInstance.h | 9 +- bolt/llvm-bolt.cpp | 96 +++++- 10 files changed, 1118 insertions(+), 19 deletions(-) create mode 100644 bolt/DataAggregator.cpp create mode 100644 bolt/DataAggregator.h diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index a3d1b62c1cfe..9e10d76e7e77 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -4475,6 +4475,52 @@ DynoStats BinaryFunction::getDynoStats() const { return Stats; } +Optional, 16>> +BinaryFunction::getFallthroughsInTrace(uint64_t From, uint64_t To) const { + SmallVector, 16> Res; + + if (CurrentState != State::Disassembled) + return NoneType(); + + // Get iterators and validate trace start/end + auto FromIter = Instructions.find(From); + if (FromIter == Instructions.end()) + return NoneType(); + + auto ToIter = Instructions.find(To); + if (ToIter == Instructions.end()) + return NoneType(); + + // Trace needs to go forward + if (FromIter->first > ToIter->first) + return NoneType(); + + // Trace needs to finish in a branch + if (!BC.MIA->isBranch(ToIter->second) && !BC.MIA->isCall(ToIter->second) && + !BC.MIA->isReturn(ToIter->second)) + return NoneType(); + + // Analyze intermediate instructions + for (; FromIter != ToIter; ++FromIter) { + // This operates under an assumption that we collect all branches in LBR + // No unconditional branches in the middle of the trace + if (BC.MIA->isUnconditionalBranch(FromIter->second) || + BC.MIA->isReturn(FromIter->second) || + BC.MIA->isCall(FromIter->second)) + return NoneType(); + + if (!BC.MIA->isConditionalBranch(FromIter->second)) + continue; + + const uint64_t Src = FromIter->first; + 
auto Next = std::next(FromIter); + const uint64_t Dst = Next->first; + Res.push_back(std::make_pair(Src, Dst)); + } + + return Res; +} + void DynoStats::print(raw_ostream &OS, const DynoStats *Other) const { auto printStatWithDelta = [&](const std::string &Name, uint64_t Stat, uint64_t OtherStat) { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 6f8632b4b80b..7f58d4b92b84 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -1747,6 +1747,15 @@ class BinaryFunction { return BranchData; } + FuncBranchData *getBranchData() { + return BranchData; + } + + /// Updates profile data associated with this function + void setBranchData(FuncBranchData *Data) { + BranchData = Data; + } + /// Walks the list of basic blocks filling in missing information about /// edge frequency for fall-throughs. /// @@ -1895,6 +1904,10 @@ class BinaryFunction { return UnitLineTable; } + /// Scan from - to offsets for conditional jumps + Optional, 16>> + getFallthroughsInTrace(uint64_t From, uint64_t To) const; + /// Returns an estimate of the function's hot part after splitting. 
/// This is a very rough estimate, as with C++ exceptions there are /// blocks we don't move, and it makes no attempt at estimating the size diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 0ddfb353a599..1194d7da67a3 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -65,9 +65,12 @@ add_llvm_tool(llvm-bolt BinaryFunction.cpp BinaryPassManager.cpp CalcCacheMetrics.cpp + DataAggregator.cpp DataReader.cpp DebugData.cpp DWARFRewriter.cpp Exceptions.cpp RewriteInstance.cpp ) + +add_llvm_tool_symlink(perf2bolt llvm-bolt) diff --git a/bolt/DataAggregator.cpp b/bolt/DataAggregator.cpp new file mode 100644 index 000000000000..84bad029267e --- /dev/null +++ b/bolt/DataAggregator.cpp @@ -0,0 +1,648 @@ +//===-- DataAggregator.cpp - Perf data aggregator ---------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This family of functions reads profile data written by perf record, +// aggregate it and then write it back to an output file. 
+// +//===----------------------------------------------------------------------===// + +#include "BinaryContext.h" +#include "BinaryFunction.h" +#include "DataAggregator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Options.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Timer.h" + +#include + +#define DEBUG_TYPE "aggregator" + +using namespace llvm; +using namespace bolt; + +namespace opts { + +extern cl::OptionCategory AggregatorCategory; + +static llvm::cl::opt +TimeAggregator("time-aggr", + cl::desc("time BOLT aggregator"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(AggregatorCategory)); + +} + +namespace { + +const char TimerGroupName[] = "Aggregator"; + +} + +void DataAggregator::findPerfExecutable() { + auto PerfExecutable = sys::Process::FindInEnvPath("PATH", "perf"); + if (!PerfExecutable) { + outs() << "PERF2BOLT: No perf executable found!\n"; + exit(1); + } + PerfPath = *PerfExecutable; +} + +void DataAggregator::start(StringRef PerfDataFilename) { + Enabled = true; + outs() << "PERF2BOLT: Starting data aggregation job for " << PerfDataFilename + << "\n"; + findPerfExecutable(); + launchPerfEventsNoWait(PerfDataFilename); + launchPerfTasksNoWait(PerfDataFilename); +} + +bool DataAggregator::launchPerfEventsNoWait(StringRef PerfDataFilename) { + SmallVector Argv; + SmallVector Redirects; + SmallVector RedirectPtrs; + + outs() << "PERF2BOLT: Spawning perf-script job to read events\n"; + Argv.push_back(PerfPath.data()); + Argv.push_back("script"); + Argv.push_back("-F"); + Argv.push_back("pid,brstack"); + Argv.push_back("-i"); + Argv.push_back(PerfDataFilename.data()); + Argv.push_back(nullptr); + + if (auto Errc = sys::fs::createTemporaryFile("perf.script", "out", + PerfEventsOutputPath)) { + outs() << "PERF2BOLT: Failed to create temporary file " + << PerfEventsOutputPath << " with error " << Errc.message() << 
"\n"; + exit(1); + } + + if (auto Errc = sys::fs::createTemporaryFile("perf.script", "err", + PerfEventsErrPath)) { + outs() << "PERF2BOLT: Failed to create temporary file " + << PerfEventsErrPath << " with error " << Errc.message() << "\n"; + exit(1); + } + + Redirects.push_back(""); // Stdin + Redirects.push_back(StringRef(PerfEventsOutputPath.data())); // Stdout + Redirects.push_back(StringRef(PerfEventsErrPath.data())); // Stderr + RedirectPtrs.push_back(&Redirects[0]); + RedirectPtrs.push_back(&Redirects[1]); + RedirectPtrs.push_back(&Redirects[2]); + + DEBUG(dbgs() << "Launching perf: " << PerfPath.data() << " 1> " + << PerfEventsOutputPath.data() << " 2> " + << PerfEventsErrPath.data() << "\n"); + + EventsPI = sys::ExecuteNoWait(PerfPath.data(), Argv.data(), + /*envp*/ nullptr, &RedirectPtrs[0]); + + return true; +} + +bool DataAggregator::launchPerfTasksNoWait(StringRef PerfDataFilename) { + SmallVector Argv; + SmallVector Redirects; + SmallVector RedirectPtrs; + + outs() << "PERF2BOLT: Spawning perf-script job to read tasks\n"; + Argv.push_back(PerfPath.data()); + Argv.push_back("script"); + Argv.push_back("--show-task-events"); + Argv.push_back("-i"); + Argv.push_back(PerfDataFilename.data()); + Argv.push_back(nullptr); + + if (auto Errc = sys::fs::createTemporaryFile("perf.script", "out", + PerfTasksOutputPath)) { + outs() << "PERF2BOLT: Failed to create temporary file " + << PerfTasksOutputPath << " with error " << Errc.message() << "\n"; + exit(1); + } + + if (auto Errc = sys::fs::createTemporaryFile("perf.script", "err", + PerfTasksErrPath)) { + outs() << "PERF2BOLT: Failed to create temporary file " + << PerfTasksErrPath << " with error " << Errc.message() << "\n"; + exit(1); + } + + Redirects.push_back(""); // Stdin + Redirects.push_back(StringRef(PerfTasksOutputPath.data())); // Stdout + Redirects.push_back(StringRef(PerfTasksErrPath.data())); // Stderr + RedirectPtrs.push_back(&Redirects[0]); + RedirectPtrs.push_back(&Redirects[1]); + 
RedirectPtrs.push_back(&Redirects[2]); + + DEBUG(dbgs() << "Launching perf: " << PerfPath.data() << " 1> " + << PerfTasksOutputPath.data() << " 2> " + << PerfTasksErrPath.data() << "\n"); + + TasksPI = sys::ExecuteNoWait(PerfPath.data(), Argv.data(), + /*envp*/ nullptr, &RedirectPtrs[0]); + + return true; +} + +bool DataAggregator::checkPerfDataMagic(StringRef FileName) { + int FD; + if (sys::fs::openFileForRead(FileName, FD)) { + return false; + } + + char Buf[7] = {0, 0, 0, 0, 0, 0, 0}; + + if (::read(FD, Buf, 7) == -1) { + ::close(FD); + return false; + } + ::close(FD); + + if (strncmp(Buf, "PERFILE", 7) == 0) + return true; + return false; +} + +void DataAggregator::deleteTempFiles() { + if (auto Errc = sys::fs::remove(PerfEventsErrPath.data())) { + outs() << "PERF2BOLT: Failed to delete temporary file " + << PerfEventsErrPath << " with error " << Errc.message() << "\n"; + } + + if (auto Errc = sys::fs::remove(PerfEventsOutputPath.data())) { + outs() << "PERF2BOLT: Failed to delete temporary file " + << PerfEventsOutputPath << " with error " << Errc.message() << "\n"; + } + + if (auto Errc = sys::fs::remove(PerfTasksErrPath.data())) { + outs() << "PERF2BOLT: Failed to delete temporary file " + << PerfTasksErrPath << " with error " << Errc.message() << "\n"; + } + + if (auto Errc = sys::fs::remove(PerfTasksOutputPath.data())) { + outs() << "PERF2BOLT: Failed to delete temporary file " + << PerfTasksOutputPath << " with error " << Errc.message() << "\n"; + } +} + +bool DataAggregator::aggregate(BinaryContext &BC, + std::map &BFs) { + std::string Error; + + this->BC = &BC; + this->BFs = &BFs; + + outs() << "PERF2BOLT: Waiting for perf tasks collection to finish...\n"; + auto PI1 = sys::Wait(TasksPI, 0, true, &Error); + + if (!Error.empty()) { + errs() << "PERF-ERROR: " << Error << "\n"; + deleteTempFiles(); + exit(1); + } + + if (PI1.ReturnCode != 0) { + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(PerfTasksErrPath.data()); + StringRef ErrBuf = (*MB)->getBuffer(); 
+ + errs() << "PERF-ERROR: Return code " << PI1.ReturnCode << "\n"; + errs() << ErrBuf; + deleteTempFiles(); + exit(1); + } + + ErrorOr> MB1 = + MemoryBuffer::getFileOrSTDIN(PerfTasksOutputPath.data()); + if (std::error_code EC = MB1.getError()) { + errs() << "Cannot open " << PerfTasksOutputPath.data() << ": " + << EC.message() << "\n"; + deleteTempFiles(); + exit(1); + } + + FileBuf.reset(MB1->release()); + ParsingBuf = FileBuf->getBuffer(); + Col = 0; + Line = 1; + if (parseTasks()) { + outs() << "PERF2BOLT: Failed to parse tasks\n"; + } + + outs() + << "PERF2BOLT: Waiting for perf events collection to finish...\n"; + auto PI2 = sys::Wait(EventsPI, 0, true, &Error); + + if (!Error.empty()) { + errs() << "PERF-ERROR: " << Error << "\n"; + deleteTempFiles(); + exit(1); + } + + if (PI2.ReturnCode != 0) { + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(PerfEventsErrPath.data()); + StringRef ErrBuf = (*MB)->getBuffer(); + + errs() << "PERF-ERROR: Return code " << PI2.ReturnCode << "\n"; + errs() << ErrBuf; + deleteTempFiles(); + exit(1); + } + + ErrorOr> MB2 = + MemoryBuffer::getFileOrSTDIN(PerfEventsOutputPath.data()); + if (std::error_code EC = MB2.getError()) { + errs() << "Cannot open " << PerfEventsOutputPath.data() << ": " + << EC.message() << "\n"; + deleteTempFiles(); + exit(1); + } + + FileBuf.reset(MB2->release()); + deleteTempFiles(); + ParsingBuf = FileBuf->getBuffer(); + Col = 0; + Line = 1; + if (parseEvents()) { + outs() << "PERF2BOLT: Failed to parse events\n"; + } + + return true; +} + +BinaryFunction * +DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) { + auto FI = BFs->upper_bound(Address); + if (FI == BFs->begin()) + return nullptr; + --FI; + + const auto UsedSize = FI->second.getMaxSize(); + if (Address >= FI->first + UsedSize) + return nullptr; + return &FI->second; +} + +bool DataAggregator::doIntraBranch(BinaryFunction *Func, uint64_t From, + uint64_t To, bool Mispred) { + FuncBranchData *AggrData = Func->getBranchData(); + 
if (!AggrData) { + AggrData = &FuncsToBranches[Func->getNames()[0]]; + AggrData->Name = Func->getNames()[0]; + Func->setBranchData(AggrData); + } + + From -= Func->getAddress(); + To -= Func->getAddress(); + AggrData->bumpBranchCount(From, To, Mispred); + return true; +} + +bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, + BinaryFunction *ToFunc, uint64_t From, + uint64_t To, bool Mispred) { + FuncBranchData *FromAggrData{nullptr}; + FuncBranchData *ToAggrData{nullptr}; + StringRef SrcFunc; + StringRef DstFunc; + if (FromFunc) { + SrcFunc = FromFunc->getNames()[0]; + FromAggrData = FromFunc->getBranchData(); + if (!FromAggrData) { + FromAggrData = &FuncsToBranches[SrcFunc]; + FromAggrData->Name = SrcFunc; + FromFunc->setBranchData(FromAggrData); + } + From -= FromFunc->getAddress(); + } + if (ToFunc) { + DstFunc = ToFunc->getNames()[0]; + ToAggrData = ToFunc->getBranchData(); + if (!ToAggrData) { + ToAggrData = &FuncsToBranches[DstFunc]; + ToAggrData->Name = DstFunc; + ToFunc->setBranchData(ToAggrData); + } + To -= ToFunc->getAddress(); + } + + if (FromAggrData) + FromAggrData->bumpCallCount(From, Location(!DstFunc.empty(), DstFunc, To), + Mispred); + if (ToAggrData) + ToAggrData->bumpEntryCount(Location(!SrcFunc.empty(), SrcFunc, From), To, + Mispred); + return true; +} + +bool DataAggregator::doBranch(uint64_t From, uint64_t To, bool Mispred) { + auto *FromFunc = getBinaryFunctionContainingAddress(From); + auto *ToFunc = getBinaryFunctionContainingAddress(To); + if (!FromFunc && !ToFunc) + return false; + + if (FromFunc == ToFunc) + return doIntraBranch(FromFunc, From, To, Mispred); + + return doInterBranch(FromFunc, ToFunc, From, To, Mispred); +} + +bool DataAggregator::doTrace(uint64_t From, uint64_t To) { + auto *FromFunc = getBinaryFunctionContainingAddress(From); + auto *ToFunc = getBinaryFunctionContainingAddress(To); + if (!FromFunc || !ToFunc) { + ++NumLongRangeTraces; + return false; + } + if (FromFunc != ToFunc) { + ++NumInvalidTraces; + 
DEBUG(dbgs() << "Trace starting in " << FromFunc->getPrintName() << " @ " + << Twine::utohexstr(From - FromFunc->getAddress()) + << " and ending in " << ToFunc->getPrintName() << " @ " + << ToFunc->getPrintName() << " @ " + << Twine::utohexstr(To - ToFunc->getAddress()) << "\n"); + return false; + } + if (FromFunc) { + From -= FromFunc->getAddress(); + To -= ToFunc->getAddress(); + } + + auto FTs = FromFunc->getFallthroughsInTrace(From, To); + if (!FTs) { + ++NumInvalidTraces; + return false; + } + + for (const auto &Pair : *FTs) { + doIntraBranch(FromFunc, Pair.first + FromFunc->getAddress(), + Pair.second + FromFunc->getAddress(), false); + } + + return true; +} + +ErrorOr DataAggregator::parseLBREntry() { + LBREntry Res; + auto FromStrRes = parseString('/'); + if (std::error_code EC = FromStrRes.getError()) + return EC; + StringRef OffsetStr = FromStrRes.get(); + if (OffsetStr.getAsInteger(0, Res.From)) { + reportError("expected hexadecimal number with From address"); + Diag << "Found: " << OffsetStr << "\n"; + return make_error_code(llvm::errc::io_error); + } + + auto ToStrRes = parseString('/'); + if (std::error_code EC = ToStrRes.getError()) + return EC; + OffsetStr = ToStrRes.get(); + if (OffsetStr.getAsInteger(0, Res.To)) { + reportError("expected hexadecimal number with To address"); + Diag << "Found: " << OffsetStr << "\n"; + return make_error_code(llvm::errc::io_error); + } + + auto MispredStrRes = parseString('/'); + if (std::error_code EC = MispredStrRes.getError()) + return EC; + StringRef MispredStr = MispredStrRes.get(); + if (MispredStr.size() != 1 || + (MispredStr[0] != 'P' && MispredStr[0] != 'M')) { + reportError("expected single char for mispred bit"); + Diag << "Found: " << OffsetStr << "\n"; + return make_error_code(llvm::errc::io_error); + } + Res.Mispred = MispredStr[0] == 'M'; + + auto Rest = parseString(FieldSeparator, true); + if (std::error_code EC = Rest.getError()) + return EC; + if (Rest.get().size() < 5) { + reportError("expected 
rest of LBR entry"); + Diag << "Found: " << OffsetStr << "\n"; + return make_error_code(llvm::errc::io_error); + } + return Res; +} + +bool DataAggregator::checkAndConsumeFS() { + if (ParsingBuf[0] != FieldSeparator) { + return false; + } + ParsingBuf = ParsingBuf.drop_front(1); + Col += 1; + return true; +} + +void DataAggregator::consumeRestOfLine() { + auto LineEnd = ParsingBuf.find_first_of('\n'); + if (LineEnd == StringRef::npos) { + ParsingBuf = StringRef(); + Col = 0; + Line += 1; + return; + } + ParsingBuf = ParsingBuf.drop_front(LineEnd + 1); + Col = 0; + Line += 1; +} + +ErrorOr DataAggregator::parseSample() { + PerfSample Res; + + while (checkAndConsumeFS()) {} + + auto PIDRes = parseNumberField(FieldSeparator, true); + if (std::error_code EC = PIDRes.getError()) + return EC; + if (!PIDs.empty() && !PIDs.count(PIDRes.get())) { + consumeRestOfLine(); + return Res; + } + + while (!checkAndConsumeNewLine()) { + if (!expectAndConsumeFS()) + return make_error_code(llvm::errc::io_error); + + auto LBRRes = parseLBREntry(); + if (std::error_code EC = LBRRes.getError()) + return EC; + Res.LBR.push_back(LBRRes.get()); + } + + return Res; +} + +bool DataAggregator::hasData() { + if (ParsingBuf.size() == 0) + return false; + + return true; +} + +std::error_code DataAggregator::parseEvents() { + outs() << "PERF2BOLT: Aggregating...\n"; + NamedRegionTimer T("Samples parsing", TimerGroupName, opts::TimeAggregator); + uint64_t NumEntries{0}; + uint64_t NumSamples{0}; + while (hasData()) { + auto SampleRes = parseSample(); + if (std::error_code EC = SampleRes.getError()) + return EC; + + auto &Sample = SampleRes.get(); + if (Sample.LBR.empty()) + continue; + + ++NumSamples; + NumEntries += Sample.LBR.size(); + + // Parser semantic actions + uint64_t Last{0}; + for (const auto &LBR : Sample.LBR) { + if (Last) + doTrace(LBR.To, Last); + doBranch(LBR.From, LBR.To, LBR.Mispred); + Last = LBR.From; + } + } + outs() << "PERF2BOLT: Read " << NumSamples << " samples and " + << 
NumEntries << " LBR entries\n"; + outs() << "PERF2BOLT: Invalid traces: " << NumInvalidTraces << "\n"; + outs() << "PERF2BOLT: Traces straddling multiple functions (discarded): " + << NumLongRangeTraces << "\n"; + + return std::error_code(); +} + +ErrorOr DataAggregator::parseTaskPID() { + while (checkAndConsumeFS()) {} + + auto CommNameStr = parseString(FieldSeparator, true); + if (std::error_code EC = CommNameStr.getError()) + return EC; + if (CommNameStr.get() != BinaryName) { + consumeRestOfLine(); + return -1; + } + + auto LineEnd = ParsingBuf.find_first_of("\n"); + if (LineEnd == StringRef::npos) { + reportError("expected rest of line"); + Diag << "Found: " << ParsingBuf << "\n"; + return make_error_code(llvm::errc::io_error); + } + + StringRef Line = ParsingBuf.substr(0, LineEnd); + + if (Line.find("PERF_RECORD_COMM") != StringRef::npos) { + int64_t PID; + StringRef PIDStr = Line.rsplit(':').second.split('/').first; + if (PIDStr.getAsInteger(10, PID)) { + reportError("expected PID"); + Diag << "Found: " << PIDStr << "\n"; + return make_error_code(llvm::errc::io_error); + } + return PID; + } + + consumeRestOfLine(); + return -1; +} + +std::error_code DataAggregator::parseTasks() { + outs() << "PERF2BOLT: Parsing perf-script tasks output\n"; + NamedRegionTimer T("Tasks parsing", TimerGroupName, opts::TimeAggregator); + + while (hasData()) { + auto PIDRes = parseTaskPID(); + if (std::error_code EC = PIDRes.getError()) + return EC; + + auto PID = PIDRes.get(); + if (PID == -1) { + continue; + } + + PIDs.insert(PID); + } + if (!PIDs.empty()) + outs() << "PERF2BOLT: Input binary is associated with " << PIDs.size() + << " PID(s)\n"; + else + outs() << "PERF2BOLT: Could not bind input binary to a PID - will parse " + "all samples in perf data.\n"; + + return std::error_code(); +} + +std::error_code DataAggregator::writeAggregatedFile() const { + std::error_code EC; + raw_fd_ostream OutFile(OutputFDataName, EC, sys::fs::OpenFlags::F_None); + if (EC) + return EC; + + 
uint64_t Values{0}; + for (const auto &Func : FuncsToBranches) { + for (const auto &BI : Func.getValue().Data) { + OutFile << (BI.From.IsSymbol ? "1 " : "0 ") + << (BI.From.Name.empty() ? "[unknown]" : BI.From.Name) << " " + << Twine::utohexstr(BI.From.Offset) << " " + << (BI.To.IsSymbol ? "1 " : "0 ") + << (BI.To.Name.empty() ? "[unknown]" : BI.To.Name) << " " + << Twine::utohexstr(BI.To.Offset) << " " << BI.Mispreds << " " + << BI.Branches << "\n"; + ++Values; + } + for (const auto &BI : Func.getValue().EntryData) { + // Do not output if source is a known symbol, since this was already + // accounted for in the source function + if (BI.From.IsSymbol) + continue; + OutFile << (BI.From.IsSymbol ? "1 " : "0 ") + << (BI.From.Name.empty() ? "[unknown]" : BI.From.Name) << " " + << Twine::utohexstr(BI.From.Offset) << " " + << (BI.To.IsSymbol ? "1 " : "0 ") + << (BI.To.Name.empty() ? "[unknown]" : BI.To.Name) << " " + << Twine::utohexstr(BI.To.Offset) << " " << BI.Mispreds << " " + << BI.Branches << "\n"; + ++Values; + } + } + outs() << "PERF2BOLT: Wrote " << Values << " objects to " + << OutputFDataName << "\n"; + + return std::error_code(); +} + +void DataAggregator::dump() const { + DataReader::dump(); +} + +void DataAggregator::dump(const LBREntry &LBR) const { + Diag << "From: " << Twine::utohexstr(LBR.From) + << " To: " << Twine::utohexstr(LBR.To) << " Mispred? " << LBR.Mispred + << "\n"; +} + +void DataAggregator::dump(const PerfSample &Sample) const { + Diag << "Sample LBR entries: " << Sample.LBR.size() << "\n"; + for (const auto &LBR : Sample.LBR) { + dump(LBR); + } +} diff --git a/bolt/DataAggregator.h b/bolt/DataAggregator.h new file mode 100644 index 000000000000..9e953d0d340a --- /dev/null +++ b/bolt/DataAggregator.h @@ -0,0 +1,197 @@ +//===-- DataAggregator.h - Perf data aggregator -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This family of functions reads profile data written by perf record, +// aggregates it and then writes it back to an output file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_DATA_AGGREGATOR_H +#define LLVM_TOOLS_LLVM_BOLT_DATA_AGGREGATOR_H + +#include "DataReader.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include + +namespace llvm { +namespace bolt { + +class BinaryFunction; +class BinaryContext; + +struct LBREntry { + uint64_t From; + uint64_t To; + bool Mispred; +}; + +struct PerfSample { + SmallVector LBR; +}; + +/// DataAggregator inherits all parsing logic from DataReader as well as +/// its data structures used to represent aggregated profile data in memory. +/// +/// The aggregator works by dispatching two separate perf-script jobs that +/// read perf samples and perf task annotations. Later, we read the output +/// files to extract information about which PID was used for this binary. +/// With the PID, we filter the samples and extract all LBR entries. +/// +/// To aggregate LBR entries, we rely on a BinaryFunction map to locate the +/// original function where the event happened. Then, we convert a raw address +/// to an offset relative to the start of this function and aggregate branch +/// information for each function. +/// +/// This must be coordinated with RewriteInstance so we have BinaryFunctions in +/// State::Disassembled. After this state, BinaryFunction will drop the +/// instruction map with original addresses we rely on to validate the traces +/// found in the LBR. +/// +/// The last step is to write the aggregated data to disk in the output file +/// specified by the user. 
+class DataAggregator : public DataReader { + // Perf process spawning bookkeeping + std::string PerfPath; + sys::ProcessInfo EventsPI; + sys::ProcessInfo TasksPI; + SmallVector PerfEventsOutputPath; + SmallVector PerfEventsErrPath; + SmallVector PerfTasksOutputPath; + SmallVector PerfTasksErrPath; + + /// Whether aggregator was scheduled to run + bool Enabled{false}; + + /// Output file name to write aggregated fdata to + StringRef OutputFDataName; + + /// Our sampled binary name to look for in perf.data + StringRef BinaryName; + + DenseSet PIDs; + + /// References to core BOLT data structures + BinaryContext *BC{nullptr}; + std::map *BFs{nullptr}; + + /// Aggregation statistics + uint64_t NumInvalidTraces{0}; + uint64_t NumLongRangeTraces{0}; + + /// Looks into system PATH for Linux Perf and set up the aggregator to use it + void findPerfExecutable(); + + /// Launch a subprocess to read all perf samples and write them to an output + /// file we will parse later + bool launchPerfEventsNoWait(StringRef PerfDataFilename); + + /// Launch a subprocess to read all perf task events. They contain the mapping + /// of binary file name to PIDs used during data collection time. We later use + /// the PIDs to filter samples. + bool launchPerfTasksNoWait(StringRef PerfDataFilename); + + /// Delete all temporary files created to hold the output generated by spawned + /// subprocesses during the aggregation job + void deleteTempFiles(); + + // Semantic pass helpers + /// Look up which function contains an address by using out map of + /// disassembled BinaryFunctions + BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address); + + /// Semantic actions - parser hooks to interpret parsed perf samples + /// Register an intraprocedural branch in \p Func with offsets \p From and + /// \p To (relative to \p Func start address). 
+ bool doIntraBranch(BinaryFunction *Func, uint64_t From, uint64_t To, + bool Mispred); + + /// Register an interprocedural branch from \p FromFunc to \p ToFunc with + /// offsets \p From and \p To, respectively. + bool doInterBranch(BinaryFunction *FromFunc, BinaryFunction *ToFunc, + uint64_t From, uint64_t To, bool Mispred); + + /// Register a branch with raw addresses \p From and \p To extracted from the + /// LBR + bool doBranch(uint64_t From, uint64_t To, bool Mispred); + + /// Register a trace starting in raw address \p From and ending in \p To + /// This will add all intermediate conditional branches in this trace as not + /// taken. + bool doTrace(uint64_t From, uint64_t To); + + /// Parser helpers + /// Return false if we exhausted our parser buffer and finished parsing + /// everything + bool hasData(); + + /// Parse a single perf sample containing a PID associated with a sequence of + /// LBR entries + ErrorOr parseSample(); + + /// Check if a field separator is the next char to parse and, if yes, consume + /// it and return true + bool checkAndConsumeFS(); + + /// Consume the entire line + void consumeRestOfLine(); + + /// Parse a single LBR entry as output by perf script -Fbrstack + ErrorOr parseLBREntry(); + + /// Parse the full output generated by perf script to report LBR samples + std::error_code parseEvents(); + + /// Parse a single line of a PERF_RECORD_COMM event looking for an association + /// between the binary name and its PID. Return -1 if binary name is not + /// correct. + ErrorOr parseTaskPID(); + + /// Parse the full output generated by perf script to report PERF_RECORD_COMM + /// events with the association of binary file names and their PIDs. 
+ std::error_code parseTasks(); + +public: + DataAggregator(raw_ostream &Diag, StringRef BinaryName) + : DataReader(Diag), BinaryName(llvm::sys::path::filename(BinaryName)) {} + + /// Set the file name to save aggregate data to + void setOutputFDataName(StringRef Name) { OutputFDataName = Name; } + + /// Start an aggregation job asynchronously. Call "aggregate" to finish it + /// with a list of disassembled functions. + void start(StringRef PerfDataFilename); + + /// True if DataAggregator has asynchronously been started and an aggregation + /// job is in progress + bool started() const { return Enabled; } + + /// Dump data structures into a file readable by llvm-bolt + std::error_code writeAggregatedFile() const; + + /// Join child subprocesses and finalize aggregation populating data + /// structures + bool aggregate(BinaryContext &BC, std::map &BFs); + + /// Check whether \p FileName is a perf.data file + static bool checkPerfDataMagic(StringRef FileName); + + /// Debugging dump methods + void dump() const; + void dump(const LBREntry &LBR) const; + void dump(const PerfSample &Sample) const; +}; + + +} +} + +#endif diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index 1112e748af28..4a92adaf5942 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -108,6 +108,52 @@ FuncSampleData::getSamples(uint64_t Start, uint64_t End) const { return Result; } +void FuncBranchData::bumpBranchCount(uint64_t OffsetFrom, uint64_t OffsetTo, + bool Mispred) { + auto Iter = IntraIndex[OffsetFrom].find(OffsetTo); + if (Iter == IntraIndex[OffsetFrom].end()) { + Data.emplace_back(Location(true, Name, OffsetFrom), + Location(true, Name, OffsetTo), Mispred, 1, + BranchHistories()); + IntraIndex[OffsetFrom][OffsetTo] = Data.size() - 1; + return; + } + auto &BI = Data[Iter->second]; + ++BI.Branches; + if (Mispred) + ++BI.Mispreds; +} + +void FuncBranchData::bumpCallCount(uint64_t OffsetFrom, const Location &To, + bool Mispred) { + auto Iter = InterIndex[OffsetFrom].find(To); 
+ if (Iter == InterIndex[OffsetFrom].end()) { + Data.emplace_back(Location(true, Name, OffsetFrom), To, Mispred, 1, + BranchHistories()); + InterIndex[OffsetFrom][To] = Data.size() - 1; + return; + } + auto &BI = Data[Iter->second]; + ++BI.Branches; + if (Mispred) + ++BI.Mispreds; +} + +void FuncBranchData::bumpEntryCount(const Location &From, uint64_t OffsetTo, + bool Mispred) { + auto Iter = EntryIndex[OffsetTo].find(From); + if (Iter == EntryIndex[OffsetTo].end()) { + EntryData.emplace_back(From, Location(true, Name, OffsetTo), Mispred, 1, + BranchHistories()); + EntryIndex[OffsetTo][From] = EntryData.size() - 1; + return; + } + auto &BI = EntryData[Iter->second]; + ++BI.Branches; + if (Mispred) + ++BI.Mispreds; +} + void BranchInfo::mergeWith(const BranchInfo &BI) { // Merge branch and misprediction counts. @@ -245,6 +291,7 @@ DataReader::readPerfData(StringRef Path, raw_ostream &Diag) { MemoryBuffer::getFileOrSTDIN(Path); if (std::error_code EC = MB.getError()) { Diag << "Cannot open " << Path << ": " << EC.message() << "\n"; + return EC; } auto DR = make_unique(std::move(MB.get()), Diag); DR->parse(); diff --git a/bolt/DataReader.h b/bolt/DataReader.h index b667fe186308..1243e5ebe7e7 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -15,6 +15,7 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_DATA_READER_H #define LLVM_TOOLS_LLVM_BOLT_DATA_READER_H +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringMap.h" @@ -141,6 +142,8 @@ struct FuncBranchData { /// Indicate if the data was used. bool Used{false}; + FuncBranchData() {} + FuncBranchData(StringRef Name, ContainerTy Data) : Name(Name), Data(std::move(Data)) {} @@ -162,6 +165,15 @@ struct FuncBranchData { /// Append the branch data of another function located \p Offset bytes away /// from the entry of this function. 
void appendFrom(const FuncBranchData &FBD, uint64_t Offset); + + /// Aggregation helpers + DenseMap> IntraIndex; + DenseMap> InterIndex; + DenseMap> EntryIndex; + + void bumpBranchCount(uint64_t OffsetFrom, uint64_t OffsetTo, bool Mispred); + void bumpCallCount(uint64_t OffsetFrom, const Location &To, bool Mispred); + void bumpEntryCount(const Location &From, uint64_t OffsetTo, bool Mispred); }; /// Similar to BranchInfo, but instead of recording from-to address (an edge), @@ -329,7 +341,7 @@ class DataReader { return EventNames; } -private: +protected: void reportError(StringRef ErrorMsg); bool expectAndConsumeFS(); @@ -363,6 +375,28 @@ class DataReader { }; } + +/// DenseMapInfo allows us to use the DenseMap LLVM data structure to store +/// Locations +template<> struct DenseMapInfo { + static inline bolt::Location getEmptyKey() { + return bolt::Location(true, StringRef(), static_cast(-1LL)); + } + static inline bolt::Location getTombstoneKey() { + return bolt::Location(true, StringRef(), static_cast(-2LL));; + } + static unsigned getHashValue(const bolt::Location &L) { + return (unsigned(DenseMapInfo::getHashValue(L.Name)) >> 4) ^ + (unsigned(L.Offset)); + } + static bool isEqual(const bolt::Location &LHS, + const bolt::Location &RHS) { + return LHS.IsSymbol == RHS.IsSymbol && LHS.Name == RHS.Name && + LHS.Offset == RHS.Offset; + } +}; + + } #endif diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index d023770a4faf..ef884a6e4694 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -15,6 +15,7 @@ #include "BinaryFunction.h" #include "BinaryPassManager.h" #include "CalcCacheMetrics.h" +#include "DataAggregator.h" #include "DataReader.h" #include "Exceptions.h" #include "RewriteInstance.h" @@ -71,6 +72,8 @@ namespace opts { extern cl::OptionCategory BoltCategory; extern cl::OptionCategory BoltOptCategory; +extern cl::OptionCategory BoltOutputCategory; +extern cl::OptionCategory AggregatorCategory; extern cl::opt JumpTables; extern 
cl::opt ReorderFunctions; @@ -82,11 +85,11 @@ CalcCacheMetrics("calc-cache-metrics", cl::ZeroOrMore, cl::cat(BoltOptCategory)); -static cl::opt +cl::opt OutputFilename("o", cl::desc(""), cl::Required, - cl::cat(BoltCategory)); + cl::cat(BoltOutputCategory)); static cl::opt AlignFunctions("align-functions", @@ -316,6 +319,12 @@ AddBoltInfo("add-bolt-info", cl::init(true), cl::cat(BoltCategory)); +cl::opt +AggregateOnly("aggregate-only", + cl::desc("exit after writing aggregated data file"), + cl::Hidden, + cl::cat(AggregatorCategory)); + // Check against lists of functions from options if we should // optimize the function with a given name. bool shouldProcess(const BinaryFunction &Function) { @@ -619,8 +628,9 @@ createBinaryContext(ELFObjectFileBase *File, DataReader &DR, } // namespace RewriteInstance::RewriteInstance(ELFObjectFileBase *File, DataReader &DR, - const int Argc, const char *const *Argv) - : InputFile(File), Argc(Argc), Argv(Argv), + DataAggregator &DA, const int Argc, + const char *const *Argv) + : InputFile(File), Argc(Argc), Argv(Argv), DA(DA), BC(createBinaryContext( File, DR, std::unique_ptr( @@ -646,6 +656,17 @@ void RewriteInstance::reset() { TotalScore = 0; } +void RewriteInstance::aggregateData() { + DA.aggregate(*BC.get(), BinaryFunctions); + + if (!opts::AggregateOnly) + return; + + if (std::error_code EC = DA.writeAggregatedFile()) { + check_error(EC, "cannot create output data file"); + } +} + void RewriteInstance::discoverStorage() { EFMM.reset(new ExecutableFileMemoryManager()); @@ -766,6 +787,11 @@ void RewriteInstance::run() { readDebugInfo(); readProfileData(); disassembleFunctions(); + if (DA.started()) { + aggregateData(); + if (opts::AggregateOnly) + return; + } for (uint64_t Address : NonSimpleFunctions) { auto FI = BinaryFunctions.find(Address); assert(FI != BinaryFunctions.end() && "bad non-simple function address"); @@ -782,6 +808,8 @@ void RewriteInstance::run() { unsigned PassNumber = 1; executeRewritePass({}); + if 
(opts::AggregateOnly) + return; if (opts::SplitFunctions == BinaryFunction::ST_LARGE && checkLargeFunctions()) { @@ -1860,6 +1888,9 @@ void RewriteInstance::disassembleFunctions() { } BC->InterproceduralReferences.clear(); + if (opts::AggregateOnly) + continue; + // Fill in CFI information for this function if (Function.isSimple()) { if (!CFIRdWrt->fillCFIInfoFor(Function)) { @@ -1895,6 +1926,9 @@ void RewriteInstance::disassembleFunctions() { } // Iterate over all functions + if (opts::AggregateOnly) + return; + uint64_t NumSimpleFunctions{0}; uint64_t NumStaleProfileFunctions{0}; std::vector ProfiledFunctions; diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index ece21e055011..061a5ad5bf53 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -35,6 +35,7 @@ namespace bolt { class BinaryContext; class CFIReaderWriter; +class DataAggregator; class DataReader; /// Section information for mapping and re-writing. @@ -147,7 +148,7 @@ class ExecutableFileMemoryManager : public SectionMemoryManager { class RewriteInstance { public: RewriteInstance(llvm::object::ELFObjectFileBase *File, DataReader &DR, - const int Argc, const char *const *Argv); + DataAggregator &DA, const int Argc, const char *const *Argv); ~RewriteInstance(); /// Reset all state except for split hints. Used to run a second pass with @@ -270,6 +271,9 @@ class RewriteInstance { void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, bool EmitColdPart); + /// Perform a perf.data aggregation job instead of a binary rewriting one + void aggregateData(); + /// Detect addresses and offsets available in the binary for allocating /// new sections. 
void discoverStorage(); @@ -423,6 +427,9 @@ class RewriteInstance { const int Argc; const char *const *Argv; + /// Holds our data aggregator in case user supplied a raw perf data file + DataAggregator &DA; + std::unique_ptr BC; std::unique_ptr CFIRdWrt; diff --git a/bolt/llvm-bolt.cpp b/bolt/llvm-bolt.cpp index f492dfadc679..55aa6bb920ff 100644 --- a/bolt/llvm-bolt.cpp +++ b/bolt/llvm-bolt.cpp @@ -13,6 +13,7 @@ // //===----------------------------------------------------------------------===// +#include "DataAggregator.h" #include "DataReader.h" #include "RewriteInstance.h" #include "llvm/Object/Binary.h" @@ -35,10 +36,19 @@ namespace opts { cl::OptionCategory BoltCategory("BOLT generic options"); cl::OptionCategory BoltOptCategory("BOLT optimization options"); cl::OptionCategory BoltRelocCategory("BOLT options in relocation mode"); +cl::OptionCategory BoltOutputCategory("Output options"); +cl::OptionCategory AggregatorCategory("Data aggregation options"); static cl::OptionCategory *BoltCategories[] = {&BoltCategory, &BoltOptCategory, - &BoltRelocCategory}; + &BoltRelocCategory, + &BoltOutputCategory}; + +static cl::OptionCategory *Perf2BoltCategories[] = {&AggregatorCategory, + &BoltOutputCategory}; + +extern cl::opt OutputFilename; +extern cl::opt AggregateOnly; static cl::opt DumpData("dump-data", @@ -59,6 +69,18 @@ InputFilename( cl::Required, cl::cat(BoltCategory)); +static cl::opt +PerfData("perfdata", + cl::desc(""), + cl::Optional, + cl::cat(AggregatorCategory)); + +static cl::alias +PerfDataA("p", + cl::desc("Alias for -perfdata"), + cl::aliasopt(PerfData), + cl::cat(AggregatorCategory)); + } // namespace opts static StringRef ToolName; @@ -81,6 +103,40 @@ static void printBoltRevision() { errs() << "BOLT revision " << BoltRevision << "\n"; } +void perf2boltMode(int argc, char **argv) { + cl::HideUnrelatedOptions(makeArrayRef(opts::Perf2BoltCategories)); + cl::ParseCommandLineOptions( + argc, argv, + "perf2bolt - BOLT data aggregator\n" + "\nEXAMPLE: 
perf2bolt -p=perf.data executable -o data.fdata\n"); + if (opts::PerfData.empty()) { + errs() << ToolName << ": expected -perfdata= option.\n"; + exit(1); + } + if (!opts::InputDataFilename.empty()) { + errs() << ToolName << ": unknown -data option.\n"; + exit(1); + } + if (!sys::fs::exists(opts::PerfData)) + report_error(opts::PerfData, errc::no_such_file_or_directory); + if (!DataAggregator::checkPerfDataMagic(opts::PerfData)) { + errs() << ToolName << ": '" << opts::PerfData + << "': expected valid perf.data file.\n"; + exit(1); + } + opts::AggregateOnly = true; +} + +void boltMode(int argc, char **argv) { + cl::HideUnrelatedOptions(makeArrayRef(opts::BoltCategories)); + // Register the target printer for --version. + cl::AddExtraVersionPrinter(printBoltRevision); + cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion); + + cl::ParseCommandLineOptions(argc, argv, + "BOLT - Binary Optimization and Layout Tool\n"); +} + int main(int argc, char **argv) { // Print a stack trace if we signal out. sys::PrintStackTraceOnErrorSignal(); @@ -97,28 +153,42 @@ int main(int argc, char **argv) { llvm::InitializeAllTargets(); llvm::InitializeAllAsmPrinters(); - cl::HideUnrelatedOptions(makeArrayRef(opts::BoltCategories)); - - // Register the target printer for --version. 
- cl::AddExtraVersionPrinter(printBoltRevision); - cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion); + ToolName = argv[0]; - cl::ParseCommandLineOptions(argc, argv, - "BOLT - Binary Optimization and Layout Tool\n"); + if (llvm::sys::path::filename(ToolName) == "perf2bolt") + perf2boltMode(argc, argv); + else + boltMode(argc, argv); - ToolName = argv[0]; if (!sys::fs::exists(opts::InputFilename)) report_error(opts::InputFilename, errc::no_such_file_or_directory); std::unique_ptr DR(new DataReader(errs())); - if (!opts::InputDataFilename.empty()) { + std::unique_ptr DA( + new DataAggregator(errs(), opts::InputFilename)); + + if (opts::AggregateOnly) { + DA->setOutputFDataName(opts::OutputFilename); + if (opts::PerfData.empty()) { + errs() << ToolName << ": missing required -perfdata option.\n"; + exit(1); + } + } + if (!opts::PerfData.empty()) { + if (!opts::AggregateOnly) { + errs() << ToolName + << ": reading perf data directly is unsupported, please use " + "-aggregate-only or perf2bolt\n"; + exit(1); + } + DA->start(opts::PerfData); + } else if (!opts::InputDataFilename.empty()) { if (!sys::fs::exists(opts::InputDataFilename)) report_error(opts::InputDataFilename, errc::no_such_file_or_directory); - // Attempt to read input bolt data auto ReaderOrErr = - bolt::DataReader::readPerfData(opts::InputDataFilename, errs()); + bolt::DataReader::readPerfData(opts::InputDataFilename, errs()); if (std::error_code EC = ReaderOrErr.getError()) report_error(opts::InputDataFilename, EC); DR.reset(ReaderOrErr.get().release()); @@ -135,7 +205,7 @@ int main(int argc, char **argv) { Binary &Binary = *BinaryOrErr.get().getBinary(); if (auto *e = dyn_cast(&Binary)) { - RewriteInstance RI(e, *DR.get(), argc, argv); + RewriteInstance RI(e, *DR.get(), *DA.get(), argc, argv); RI.run(); } else { report_error(opts::InputFilename, object_error::invalid_file_type); From bab8ca5680cae4f48f894a3540eb00e4e8ce16ea Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 
21 Sep 2017 15:45:39 -0700 Subject: [PATCH 307/904] Fix SCTC bug when two pred/succ BB are in a loop. Summary: It's possible that two basic blocks being considered for SCTC are in a loop in the CFG. In this case a block that is both a predecessor and a successor may have been processed and marked invalid by a previous iteration of the SCTC loop. We should skip rewriting in this case. (cherry picked from commit c57befbb5f6ad7be8f95f01109c263d587f85b67) --- bolt/Passes/BinaryPasses.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 559e979b38f5..302152d121ae 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -619,6 +619,16 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, assert(Result && "internal error analyzing conditional branch"); assert(CondBranch && "conditional branch expected"); + // It's possible that PredBB is also a successor to BB that may have + // been processed by a previous iteration of the SCTC loop, in which + // case it may have been marked invalid. We should skip rewriting in + // this case. + if (!PredBB->isValid()) { + assert(PredBB->isSuccessor(BB) && + "PredBB should be valid if it is not a successor to BB"); + continue; + } + // We don't want to reverse direction of the branch in new order // without further profile analysis. const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC; From b2f65e42d227442d1dff188e161c8bd9a4eba923 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 25 Sep 2017 18:05:37 -0700 Subject: [PATCH 308/904] [BOLT] Ignore Clang LTO artifact file symbol Summary: The presence of ld-temp.o symbol is somewhat indeterministic. I couldn't find out exactly when it's generated, it could be related to LTO vs ThinLTO, but not always. If the symbol is there, it could affect names of most of functions in LTO binary. 
The status of the symbol may change between the binary the profile was collected on, and the binary BOLT is called on. As a result, we may mismatch many function names. It is safe to ignore this symbol. (cherry picked from commit 1f10c82bb4cc07791d8554de40da052a295dead4) --- bolt/RewriteInstance.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index ef884a6e4694..ec761a70ba69 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -887,6 +887,10 @@ void RewriteInstance::discoverFileObjects() { if (Symbol.getType() == SymbolRef::ST_File) { check_error(NameOrError.getError(), "cannot get symbol name for file"); + // Ignore Clang LTO artificial FILE symbol as it is not always generated, + // and this uncertainty is causing havoc in function name matching. + if (*NameOrError == "ld-temp.o") + continue; FileSymbolName = *NameOrError; SeenFileName = true; continue; From 79013ed9c33c115f6c03f388369099fd112c34ba Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 26 Sep 2017 14:42:43 -0700 Subject: [PATCH 309/904] [PERF2BOLT] Improve user messages about profiling stats Summary: Improve messages and color-code bad traces percentage, warning user about a potential input binary mismatch. 
(cherry picked from commit 34d3184d9f777257f7ae2473b0448a76d44fe738) --- bolt/DataAggregator.cpp | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/bolt/DataAggregator.cpp b/bolt/DataAggregator.cpp index 84bad029267e..1e23ba1fd954 100644 --- a/bolt/DataAggregator.cpp +++ b/bolt/DataAggregator.cpp @@ -500,6 +500,7 @@ std::error_code DataAggregator::parseEvents() { NamedRegionTimer T("Samples parsing", TimerGroupName, opts::TimeAggregator); uint64_t NumEntries{0}; uint64_t NumSamples{0}; + uint64_t NumTraces{0}; while (hasData()) { auto SampleRes = parseSample(); if (std::error_code EC = SampleRes.getError()) @@ -515,17 +516,49 @@ std::error_code DataAggregator::parseEvents() { // Parser semantic actions uint64_t Last{0}; for (const auto &LBR : Sample.LBR) { - if (Last) + if (Last) { doTrace(LBR.To, Last); + ++NumTraces; + } doBranch(LBR.From, LBR.To, LBR.Mispred); Last = LBR.From; } } outs() << "PERF2BOLT: Read " << NumSamples << " samples and " << NumEntries << " LBR entries\n"; - outs() << "PERF2BOLT: Invalid traces: " << NumInvalidTraces << "\n"; - outs() << "PERF2BOLT: Traces straddling multiple functions (discarded): " - << NumLongRangeTraces << "\n"; + outs() << "PERF2BOLT: Traces mismatching disassembled function contents: " + << NumInvalidTraces; + float Perc{0.0f}; + if (NumTraces > 0) { + outs() << " ("; + Perc = NumInvalidTraces * 100.0f / NumTraces; + if (outs().has_colors()) { + if (Perc > 10.0f) { + outs().changeColor(raw_ostream::RED); + } else if (Perc > 5.0f) { + outs().changeColor(raw_ostream::YELLOW); + } else { + outs().changeColor(raw_ostream::GREEN); + } + } + outs() << format("%.1f%%", Perc); + outs().resetColor(); + outs() << ")"; + } + outs() << "\n"; + if (Perc > 10.0f) { + outs() << "\n !! WARNING !! This high mismatch ratio indicates the input " + "binary is probably not the same binary used during profiling " + "collection. 
The generated data may be ineffective for improving " + "performance.\n\n"; + } + + outs() << "PERF2BOLT: Out of range traces involving unknown regions: " + << NumLongRangeTraces; + if (NumTraces > 0) { + outs() << format(" (%.1f%%)", NumLongRangeTraces * 100.0f / NumTraces); + } + outs() << "\n"; return std::error_code(); } From 932d939f03ebdba86492171fee6e9ddc6a194a65 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 9 Oct 2017 15:52:13 -0700 Subject: [PATCH 310/904] [PERF2BOLT] Fix aggregator wrt new output format of perf Summary: Perf is now outputting one less space, which broke our previous (flaky) assumptions about field separators when processing the output file. Make it more resilient by accepting any number of spaces before reading LBR entries. (cherry picked from commit 224f400e9840f103c273f74064dbfb6490cdcc7e) --- bolt/DataAggregator.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bolt/DataAggregator.cpp b/bolt/DataAggregator.cpp index 1e23ba1fd954..bf855f798d10 100644 --- a/bolt/DataAggregator.cpp +++ b/bolt/DataAggregator.cpp @@ -476,8 +476,7 @@ ErrorOr DataAggregator::parseSample() { } while (!checkAndConsumeNewLine()) { - if (!expectAndConsumeFS()) - return make_error_code(llvm::errc::io_error); + checkAndConsumeFS(); auto LBRRes = parseLBREntry(); if (std::error_code EC = LBRRes.getError()) From 9b59f428cf4c3f3b615dcc2e252b471f1fccc1f9 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Mon, 9 Oct 2017 14:15:38 -0700 Subject: [PATCH 311/904] fixing sizes Summary: In some (weird) cases, a Function is marked 'split' but doesn't contain any 'cold' basic block. In that case, the size of the last basic block of the function is computed incorrectly. Hence, this fix. 
(cherry picked from commit 15af773c38a0a9f5b7dcf30261eb9844dbfcab2a) --- bolt/BinaryBasicBlock.h | 5 +++++ bolt/RewriteInstance.cpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 96a56f0e4d30..3534711a2623 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -713,6 +713,11 @@ class BinaryBasicBlock { return OutputAddressRange; } + /// Return size of the basic block in the output binary. + uint64_t getOutputSize() const { + return OutputAddressRange.second - OutputAddressRange.first; + } + BinaryFunction *getFunction() const { return Function; } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index ec761a70ba69..075126a14246 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2604,7 +2604,7 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { } PrevBB = BB; } - PrevBB->setOutputEndAddress(Function.isSplit() ? + PrevBB->setOutputEndAddress(PrevBB->isCold() ? Function.cold().getAddress() + Function.cold().getImageSize() : Function.getOutputAddress() + Function.getOutputSize()); } From 4ad672756823932c399f3e0a712327f41d783286 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 6 Oct 2017 14:42:46 -0700 Subject: [PATCH 312/904] [PERF2BOLT] Check build-ids of binaries when aggregating Summary: Check the build-id of the input binary against the build-id of the binary used during profiling data collection with perf, as reported in perf.data. If they differ, issue a warning, since the user should use exactly the same binary. If we cannot determine the build-id of either the input binary or the one registered in the input perf.data, cancel the build-id check but print a log message. 
(cherry picked from commit e5a5e3adbaae4fd36d1dd922f756825e6113c7b5) --- bolt/DataAggregator.cpp | 160 +++++++++++++++++++++++++++++++++------ bolt/DataAggregator.h | 26 ++++++- bolt/RewriteInstance.cpp | 88 +++++++++++++++++++++ bolt/RewriteInstance.h | 7 ++ 4 files changed, 255 insertions(+), 26 deletions(-) diff --git a/bolt/DataAggregator.cpp b/bolt/DataAggregator.cpp index bf855f798d10..a4d586708bfa 100644 --- a/bolt/DataAggregator.cpp +++ b/bolt/DataAggregator.cpp @@ -60,14 +60,25 @@ void DataAggregator::findPerfExecutable() { void DataAggregator::start(StringRef PerfDataFilename) { Enabled = true; + this->PerfDataFilename = PerfDataFilename; outs() << "PERF2BOLT: Starting data aggregation job for " << PerfDataFilename << "\n"; findPerfExecutable(); - launchPerfEventsNoWait(PerfDataFilename); - launchPerfTasksNoWait(PerfDataFilename); + launchPerfEventsNoWait(); + launchPerfTasksNoWait(); } -bool DataAggregator::launchPerfEventsNoWait(StringRef PerfDataFilename) { +void DataAggregator::abort() { + std::string Error; + + // Kill subprocesses in case they are not finished + sys::Wait(TasksPI, 1, false, &Error); + sys::Wait(EventsPI, 1, false, &Error); + + deleteTempFiles(); +} + +bool DataAggregator::launchPerfEventsNoWait() { SmallVector Argv; SmallVector Redirects; SmallVector RedirectPtrs; @@ -112,7 +123,7 @@ bool DataAggregator::launchPerfEventsNoWait(StringRef PerfDataFilename) { return true; } -bool DataAggregator::launchPerfTasksNoWait(StringRef PerfDataFilename) { +bool DataAggregator::launchPerfTasksNoWait() { SmallVector Argv; SmallVector Redirects; SmallVector RedirectPtrs; @@ -156,6 +167,88 @@ bool DataAggregator::launchPerfTasksNoWait(StringRef PerfDataFilename) { return true; } +Optional DataAggregator::getPerfBuildID() { + SmallVector Argv; + SmallVector Redirects; + SmallVector RedirectPtrs; + SmallVector OutputPath; + SmallVector ErrPath; + + Argv.push_back(PerfPath.data()); + Argv.push_back("buildid-list"); + Argv.push_back("-i"); + 
Argv.push_back(PerfDataFilename.data()); + Argv.push_back(nullptr); + + if (auto Errc = sys::fs::createTemporaryFile("perf.buildid", "out", + OutputPath)) { + outs() << "PERF2BOLT: Failed to create temporary file " + << OutputPath << " with error " << Errc.message() << "\n"; + exit(1); + } + + if (auto Errc = sys::fs::createTemporaryFile("perf.script", "err", + ErrPath)) { + outs() << "PERF2BOLT: Failed to create temporary file " + << ErrPath << " with error " << Errc.message() << "\n"; + exit(1); + } + + Redirects.push_back(""); // Stdin + Redirects.push_back(StringRef(OutputPath.data())); // Stdout + Redirects.push_back(StringRef(ErrPath.data())); // Stderr + RedirectPtrs.push_back(&Redirects[0]); + RedirectPtrs.push_back(&Redirects[1]); + RedirectPtrs.push_back(&Redirects[2]); + + DEBUG(dbgs() << "Launching perf: " << PerfPath.data() << " 1> " + << OutputPath.data() << " 2> " + << ErrPath.data() << "\n"); + + auto RetCode = sys::ExecuteAndWait(PerfPath.data(), Argv.data(), + /*envp*/ nullptr, &RedirectPtrs[0]); + + if (RetCode != 0) { + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(ErrPath.data()); + StringRef ErrBuf = (*MB)->getBuffer(); + + errs() << "PERF-ERROR: Return code " << RetCode << "\n"; + errs() << ErrBuf; + deleteTempFile(ErrPath.data()); + deleteTempFile(OutputPath.data()); + return NoneType(); + } + + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(OutputPath.data()); + if (std::error_code EC = MB.getError()) { + errs() << "Cannot open " << PerfTasksOutputPath.data() << ": " + << EC.message() << "\n"; + deleteTempFile(ErrPath.data()); + deleteTempFile(OutputPath.data()); + return NoneType(); + } + + FileBuf.reset(MB->release()); + ParsingBuf = FileBuf->getBuffer(); + Col = 0; + Line = 1; + auto ParseResult = parsePerfBuildID(); + if (!ParseResult) { + outs() << "PERF2BOLT: Failed to parse build-id from perf output\n"; + deleteTempFile(ErrPath.data()); + deleteTempFile(OutputPath.data()); + return NoneType(); + } + + outs() << "PERF2BOLT: Perf.data 
build-id is: " << *ParseResult << "\n"; + + deleteTempFile(ErrPath.data()); + deleteTempFile(OutputPath.data()); + return std::string(ParseResult->data(), ParseResult->size()); +} + bool DataAggregator::checkPerfDataMagic(StringRef FileName) { int FD; if (sys::fs::openFileForRead(FileName, FD)) { @@ -175,26 +268,18 @@ bool DataAggregator::checkPerfDataMagic(StringRef FileName) { return false; } -void DataAggregator::deleteTempFiles() { - if (auto Errc = sys::fs::remove(PerfEventsErrPath.data())) { +void DataAggregator::deleteTempFile(StringRef File) { + if (auto Errc = sys::fs::remove(File.data())) { outs() << "PERF2BOLT: Failed to delete temporary file " - << PerfEventsErrPath << " with error " << Errc.message() << "\n"; - } - - if (auto Errc = sys::fs::remove(PerfEventsOutputPath.data())) { - outs() << "PERF2BOLT: Failed to delete temporary file " - << PerfEventsOutputPath << " with error " << Errc.message() << "\n"; - } - - if (auto Errc = sys::fs::remove(PerfTasksErrPath.data())) { - outs() << "PERF2BOLT: Failed to delete temporary file " - << PerfTasksErrPath << " with error " << Errc.message() << "\n"; + << File << " with error " << Errc.message() << "\n"; } +} - if (auto Errc = sys::fs::remove(PerfTasksOutputPath.data())) { - outs() << "PERF2BOLT: Failed to delete temporary file " - << PerfTasksOutputPath << " with error " << Errc.message() << "\n"; - } +void DataAggregator::deleteTempFiles() { + deleteTempFile(PerfEventsErrPath.data()); + deleteTempFile(PerfEventsOutputPath.data()); + deleteTempFile(PerfTasksErrPath.data()); + deleteTempFile(PerfTasksOutputPath.data()); } bool DataAggregator::aggregate(BinaryContext &BC, @@ -541,7 +626,8 @@ std::error_code DataAggregator::parseEvents() { } } outs() << format("%.1f%%", Perc); - outs().resetColor(); + if (outs().has_colors()) + outs().resetColor(); outs() << ")"; } outs() << "\n"; @@ -623,6 +709,36 @@ std::error_code DataAggregator::parseTasks() { return std::error_code(); } +Optional> 
+DataAggregator::parseNameBuildIDPair() { + while (checkAndConsumeFS()) {} + + auto BuildIDStr = parseString(FieldSeparator, true); + if (std::error_code EC = BuildIDStr.getError()) + return NoneType(); + + auto NameStr = parseString(FieldSeparator, true); + if (std::error_code EC = NameStr.getError()) + return NoneType(); + + consumeRestOfLine(); + return std::make_pair(NameStr.get(), BuildIDStr.get()); +} + +Optional DataAggregator::parsePerfBuildID() { + while (hasData()) { + auto IDPair = parseNameBuildIDPair(); + if (!IDPair) + return NoneType(); + + if (sys::path::filename(IDPair->first) != BinaryName) + continue; + + return IDPair->second; + } + return NoneType(); +} + std::error_code DataAggregator::writeAggregatedFile() const { std::error_code EC; raw_fd_ostream OutFile(OutputFDataName, EC, sys::fs::OpenFlags::F_None); diff --git a/bolt/DataAggregator.h b/bolt/DataAggregator.h index 9e953d0d340a..c8a1de470a91 100644 --- a/bolt/DataAggregator.h +++ b/bolt/DataAggregator.h @@ -71,6 +71,9 @@ class DataAggregator : public DataReader { /// Whether aggregator was scheduled to run bool Enabled{false}; + /// Input perf.data file + StringRef PerfDataFilename; + /// Output file name to write aggregated fdata to StringRef OutputFDataName; @@ -92,16 +95,17 @@ class DataAggregator : public DataReader { /// Launch a subprocess to read all perf samples and write them to an output /// file we will parse later - bool launchPerfEventsNoWait(StringRef PerfDataFilename); + bool launchPerfEventsNoWait(); /// Launch a subprocess to read all perf task events. They contain the mapping /// of binary file name to PIDs used during data collection time. We later use /// the PIDs to filter samples. 
- bool launchPerfTasksNoWait(StringRef PerfDataFilename); + bool launchPerfTasksNoWait(); /// Delete all temporary files created to hold the output generated by spawned /// subprocesses during the aggregation job void deleteTempFiles(); + void deleteTempFile(StringRef File); // Semantic pass helpers /// Look up which function contains an address by using out map of @@ -159,6 +163,13 @@ class DataAggregator : public DataReader { /// events with the association of binary file names and their PIDs. std::error_code parseTasks(); + /// Parse a single pair of binary full path and associated build-id + Optional> parseNameBuildIDPair(); + + /// Parse the output generated by perf buildid-list to extract the build-id + /// of the binary used when collecting profiling + Optional parsePerfBuildID(); + public: DataAggregator(raw_ostream &Diag, StringRef BinaryName) : DataReader(Diag), BinaryName(llvm::sys::path::filename(BinaryName)) {} @@ -174,6 +185,9 @@ class DataAggregator : public DataReader { /// job is in progress bool started() const { return Enabled; } + /// Force all subprocesses to stop and cancel aggregation + void abort(); + /// Dump data structures into a file readable by llvm-bolt std::error_code writeAggregatedFile() const; @@ -184,13 +198,17 @@ class DataAggregator : public DataReader { /// Check whether \p FileName is a perf.data file static bool checkPerfDataMagic(StringRef FileName); + /// Launch a subprocess with perf buildid-list to extract the build-id of the + /// binary used when collecting profiling. Different than launchPerf*, this + /// one spawns the subprocess and blocks. Then it parses the result and + /// returns the build-id. 
+ Optional getPerfBuildID(); + /// Debugging dump methods void dump() const; void dump(const LBREntry &LBR) const; void dump(const PerfSample &Sample) const; }; - - } } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 075126a14246..d4385693d7b9 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -325,6 +325,12 @@ AggregateOnly("aggregate-only", cl::Hidden, cl::cat(AggregatorCategory)); +static cl::opt +IgnoreBuildID("ignore-build-id", + cl::desc("continue even if build-ids in input binary and perf.data mismatch"), + cl::init(false), + cl::cat(AggregatorCategory)); + // Check against lists of functions from options if we should // optimize the function with a given name. bool shouldProcess(const BinaryFunction &Function) { @@ -774,6 +780,86 @@ void RewriteInstance::discoverStorage() { NewTextSegmentOffset = NextAvailableOffset; } +Optional +RewriteInstance::getBuildID() { + for (auto &Section : InputFile->sections()) { + StringRef SectionName; + Section.getName(SectionName); + + if (SectionName != ".note.gnu.build-id") + continue; + + StringRef SectionContents; + Section.getContents(SectionContents); + + // Reading notes section (see Portable Formats Specification, Version 1.1, + // pg 2-5, section "Note Section"). 
+ DataExtractor DE = DataExtractor(SectionContents, true, 8); + uint32_t Offset = 0; + if (!DE.isValidOffset(Offset)) + return NoneType(); + uint32_t NameSz = DE.getU32(&Offset); + if (!DE.isValidOffset(Offset)) + return NoneType(); + uint32_t DescSz = DE.getU32(&Offset); + if (!DE.isValidOffset(Offset)) + return NoneType(); + uint32_t Type = DE.getU32(&Offset); + + DEBUG(dbgs() << "NameSz = " << NameSz << "; DescSz = " << DescSz + << "; Type = " << Type << "\n"); + + // Type 3 is a GNU build-id note section + if (Type != 3) + return NoneType(); + + StringRef Name = SectionContents.slice(Offset, Offset + NameSz); + Offset = RoundUpToAlignment(Offset + NameSz, 4); + StringRef BinaryBuildID = SectionContents.slice(Offset, Offset + DescSz); + if (Name.substr(0, 3) != "GNU") + return NoneType(); + + std::string Str; + raw_string_ostream OS(Str); + auto CharIter = BinaryBuildID.bytes_begin(); + while (CharIter != BinaryBuildID.bytes_end()) { + if (*CharIter < 0x10) + OS << "0"; + OS << Twine::utohexstr(*CharIter); + ++CharIter; + } + outs() << "BOLT-INFO: Binary build-id is: " << OS.str() << "\n"; + return OS.str(); + } + return NoneType(); +} + +void RewriteInstance::checkBuildID() { + auto FileBuildID = getBuildID(); + if (!FileBuildID) { + outs() << "BOLT-WARNING: Build ID will not be checked because we could not " + "read one from input binary\n"; + return; + } + auto PerfBuildID = DA.getPerfBuildID(); + if (!PerfBuildID) { + outs() << "BOLT-WARNING: Build ID will not be checked because we could not " + "read one from perf.data\n"; + return; + } + if (*FileBuildID == *PerfBuildID) + return; + + outs() << "BOLT-ERROR: Build ID mismatch! 
This indicates the input binary " + "supplied for data aggregation is not the same recorded by perf " + "when collecting profiling data.\n"; + + if (!opts::IgnoreBuildID) { + DA.abort(); + exit(1); + } +} + void RewriteInstance::run() { if (!BC) { errs() << "BOLT-ERROR: failed to create a binary context\n"; @@ -806,6 +892,8 @@ void RewriteInstance::run() { (llvm::Triple::ArchType)InputFile->getArch()) << "\n"; + if (DA.started()) + checkBuildID(); unsigned PassNumber = 1; executeRewritePass({}); if (opts::AggregateOnly) diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 061a5ad5bf53..1d393f76ca6e 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -158,6 +158,10 @@ class RewriteInstance { /// Run all the necessary steps to read, optimize and rewrite the binary. void run(); + /// Check that binary build ID matches the one used in perf.data to collect + /// profile + void checkBuildID(); + /// Populate array of binary functions and other objects of interest /// from meta data in the file. void discoverFileObjects(); @@ -278,6 +282,9 @@ class RewriteInstance { /// new sections. void discoverStorage(); + /// Read binary sections and find a gnu note section with the build-id + Optional getBuildID(); + /// Adjust function sizes and set proper maximum size values after the whole /// symbol table has been processed. void adjustFunctionBoundaries(); From c2c02ee016454e39410752e78bf83e8b638b919b Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 6 Oct 2017 17:54:26 -0700 Subject: [PATCH 313/904] [BOLT] Write bolt info according to ELF spec Summary: Follow ELF spec for NOTE sections when writing bolt info. Since tools such as "readelf -n" will not recognize a custom code identifying our new note section, we use GNU "gold linker version" note, tricking readelf into printing bolt info. 
(cherry picked from commit a8acae3b4f6c67841b552bb5ace8aa1885787e01) --- bolt/RewriteInstance.cpp | 43 +++++++++++++++++++++++++++++----------- bolt/RewriteInstance.h | 15 +++++++------- 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index d4385693d7b9..d0a848e8bddd 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -3043,25 +3043,42 @@ void RewriteInstance::finalizeSectionStringTable(ELFObjectFile *File) { void RewriteInstance::addBoltInfoSection() { if (opts::AddBoltInfo) { - std::string Str; - raw_string_ostream OS(Str); + std::string DescStr; + raw_string_ostream DescOS(DescStr); - OS << "BOLT revision: " << BoltRevision << ", " << "command line:"; + DescOS << "BOLT revision: " << BoltRevision << ", " << "command line:"; for (auto I = 0; I < Argc; ++I) { - OS << " " << Argv[I]; + DescOS << " " << Argv[I]; } + DescOS.flush(); + + std::string Str; + raw_string_ostream OS(Str); + std::string NameStr = "GNU"; + const uint32_t NameSz = NameStr.size() + 1; + const uint32_t DescSz = DescStr.size() + 1; + const uint32_t Type = 4; // NT_GNU_GOLD_VERSION (gold version) + OS.write(reinterpret_cast(&(NameSz)), 4); + OS.write(reinterpret_cast(&(DescSz)), 4); + OS.write(reinterpret_cast(&(Type)), 4); + OS << NameStr << '\0'; + for (uint64_t I = NameStr.size() + 1; + I < RoundUpToAlignment(NameStr.size() + 1, 4); ++I) { + OS << '\0'; + } + OS << DescStr << '\0'; const auto BoltInfo = OS.str(); const auto SectionSize = BoltInfo.size(); uint8_t *SectionData = new uint8_t[SectionSize]; memcpy(SectionData, BoltInfo.data(), SectionSize); - EFMM->NoteSectionInfo[".bolt_info"] = - SectionInfo(reinterpret_cast(SectionData), - SectionSize, - /*Alignment=*/1, - /*IsCode=*/false, - /*IsReadOnly=*/true, - /*IsLocal=*/false); + EFMM->NoteSectionInfo[".note.bolt_info"] = + SectionInfo(reinterpret_cast(SectionData), SectionSize, + /*Alignment=*/1, + /*IsCode=*/false, + /*IsReadOnly=*/true, + 
/*IsLocal=*/false, 0, 0, 0, + /*IsELFNote=*/true); } } @@ -3223,7 +3240,9 @@ RewriteInstance::getOutputSections(ELFObjectFile *File, outs() << "BOLT-INFO: writing section header for " << SectionName << '\n'; ELFShdrTy NewSection; NewSection.sh_name = SHStrTab.getOffset(SectionName); - NewSection.sh_type = (SI.IsStrTab ? ELF::SHT_STRTAB : ELF::SHT_PROGBITS); + NewSection.sh_type = + (SI.IsStrTab ? ELF::SHT_STRTAB + : SI.IsELFNote ? ELF::SHT_NOTE : ELF::SHT_PROGBITS); NewSection.sh_addr = 0; NewSection.sh_offset = SI.FileOffset; NewSection.sh_size = SI.Size; diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 1d393f76ca6e..f557e6fa3fa4 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -51,6 +51,7 @@ struct SectionInfo { uint64_t FileAddress{0}; /// Address for the output file (final address). uint64_t FileOffset{0}; /// Offset in the output file. unsigned SectionID{0}; /// Unique ID used for address mapping. + bool IsELFNote{false}; /// Is ELF note section? struct Reloc { uint32_t Offset; @@ -62,13 +63,13 @@ struct SectionInfo { /// Pending relocations for the section. 
std::vector PendingRelocs; - SectionInfo(uint64_t Address, uint64_t Size, unsigned Alignment, - bool IsCode, bool IsReadOnly, - bool IsLocal, uint64_t FileAddress = 0, - uint64_t FileOffset = 0, unsigned SectionID = 0) - : AllocAddress(Address), Size(Size), Alignment(Alignment), IsCode(IsCode), - IsReadOnly(IsReadOnly), IsLocal(IsLocal), FileAddress(FileAddress), - FileOffset(FileOffset), SectionID(SectionID) {} + SectionInfo(uint64_t Address, uint64_t Size, unsigned Alignment, bool IsCode, + bool IsReadOnly, bool IsLocal, uint64_t FileAddress = 0, + uint64_t FileOffset = 0, unsigned SectionID = 0, + bool IsELFNote = false) + : AllocAddress(Address), Size(Size), Alignment(Alignment), IsCode(IsCode), + IsReadOnly(IsReadOnly), IsLocal(IsLocal), FileAddress(FileAddress), + FileOffset(FileOffset), SectionID(SectionID), IsELFNote(IsELFNote) {} SectionInfo() {} }; From 5988bbaf89488f40d9b7959748febd1582dc127c Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 10 Oct 2017 13:30:05 -0700 Subject: [PATCH 314/904] [BOLT] Fix bolt_info ELF note Summary: Small fix - align the end of the descriptor string as well, since readelf will detect when it is not aligned and print an error instead of printing BOLT version and command line. 
(cherry picked from commit 5c9811b0f22f42eb6aa50a30ced00981b8158041) --- bolt/RewriteInstance.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index d0a848e8bddd..612babbe2345 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -3056,17 +3056,21 @@ void RewriteInstance::addBoltInfoSection() { raw_string_ostream OS(Str); std::string NameStr = "GNU"; const uint32_t NameSz = NameStr.size() + 1; - const uint32_t DescSz = DescStr.size() + 1; + const uint32_t DescSz = DescStr.size(); const uint32_t Type = 4; // NT_GNU_GOLD_VERSION (gold version) OS.write(reinterpret_cast(&(NameSz)), 4); OS.write(reinterpret_cast(&(DescSz)), 4); OS.write(reinterpret_cast(&(Type)), 4); - OS << NameStr << '\0'; - for (uint64_t I = NameStr.size() + 1; - I < RoundUpToAlignment(NameStr.size() + 1, 4); ++I) { + OS << NameStr; + for (uint64_t I = NameStr.size(); + I < RoundUpToAlignment(NameStr.size(), 4); ++I) { + OS << '\0'; + } + OS << DescStr; + for (uint64_t I = DescStr.size(); + I < RoundUpToAlignment(DescStr.size(), 4); ++I) { OS << '\0'; } - OS << DescStr << '\0'; const auto BoltInfo = OS.str(); const auto SectionSize = BoltInfo.size(); From 79c69419201026f9f3c65c0c44788acf4c7267a6 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 10 Oct 2017 16:36:01 -0700 Subject: [PATCH 315/904] [BOLT] Use 32 as the default max bytes for function alignment Summary: Several benchmarks (hhvm, compilers) show that 32 provides a good balance between I-Cache performance and iTLB misses. 
(cherry picked from commit dade94ab2ef7bea9d284cf6ed6bcd35934930606) --- bolt/RewriteInstance.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 612babbe2345..398b9d6fdeeb 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -101,7 +101,7 @@ AlignFunctions("align-functions", static cl::opt AlignFunctionsMaxBytes("align-functions-max-bytes", cl::desc("maximum number of bytes to use to align functions"), - cl::init(7), + cl::init(32), cl::ZeroOrMore, cl::cat(BoltOptCategory)); From 0d348ca0da946a2e94146e4a0654cfcaff6dfb9a Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 10 Oct 2017 18:06:45 -0700 Subject: [PATCH 316/904] [BOLT] Create symbol table entries under -hot-text if they did not exist Summary: If "-hot-text" options is specified and the input binary did not have __hot_start/__hot_end symbols, then add them to the symbol table. (cherry picked from commit 87e7a24b05f3e589df879016ba86f76d027203c8) --- bolt/RewriteInstance.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 398b9d6fdeeb..dbdb521bd149 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -3356,6 +3356,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { Write, std::function AddToStrTab) { auto StringSection = *Obj->getStringTableForSymtab(*Section); + unsigned IsHotTextUpdated = 0; for (const Elf_Sym &Symbol : Obj->symbols(Section)) { auto NewSymbol = Symbol; @@ -3412,6 +3413,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { NewSymbol.st_shndx = ELF::SHN_ABS; outs() << "BOLT-INFO: setting " << Name << " to 0x" << Twine::utohexstr(NewSymbol.st_value) << '\n'; + ++IsHotTextUpdated; return true; }; @@ -3424,6 +3426,28 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { Write((&Symbol - Obj->symbol_begin(Section)) * sizeof(Elf_Sym), 
reinterpret_cast(&NewSymbol), sizeof(NewSymbol)); } + + assert((!IsHotTextUpdated || IsHotTextUpdated == 2) && + "either none or both __hot_start/__hot_end symbols were expected"); + if (opts::HotText && !IsHotTextUpdated && !PatchExisting) { + auto addSymbol = [&](const std::string &Name) { + Elf_Sym Symbol; + Symbol.st_value = getNewValueForSymbol(Name); + Symbol.st_shndx = ELF::SHN_ABS; + Symbol.st_name = AddToStrTab(Name); + Symbol.st_size = 0; + Symbol.st_shndx = 0; + Symbol.st_other = 0; + Symbol.setBindingAndType(ELF::STB_WEAK, ELF::STT_NOTYPE); + + outs() << "BOLT-INFO: setting " << Name << " to 0x" + << Twine::utohexstr(Symbol.st_value) << '\n'; + + Write(0, reinterpret_cast(&Symbol), sizeof(Symbol)); + }; + addSymbol("__hot_start"); + addSymbol("__hot_end"); + } }; // Update dynamic symbol table. From 3fd9a0a06a068a76630161e1fec2078de64fc86f Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 16 Oct 2017 15:22:05 -0700 Subject: [PATCH 317/904] [BOLT] Change function order file format for linker script Summary: Change output of "-generate-function-order=" to match expected format used for a linker script: * Prefix function names with ".text". * Strip internal suffix from local function names. E.g. for function with names "foo/1" and "foo/foo.c/1" we will only output "foo". * Output (with indentation) duplicate names for folded functions. 
(cherry picked from commit bbd66431938b4270ddd1f6f572f5ccb026c5e692) --- bolt/Passes/ReorderFunctions.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/bolt/Passes/ReorderFunctions.cpp b/bolt/Passes/ReorderFunctions.cpp index 5f5ed717c73b..ed61058b73ac 100644 --- a/bolt/Passes/ReorderFunctions.cpp +++ b/bolt/Passes/ReorderFunctions.cpp @@ -444,7 +444,20 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC, for (const auto *Func : SortedFunctions) { if (!Func->hasValidIndex()) break; - FuncsFile << Func->getSymbol()->getName().data() << "\n"; + if (Func->isPLTFunction()) + continue; + const char *Indent = ""; + for (auto Name : Func->getNames()) { + const auto SlashPos = Name.find('/'); + if (SlashPos != std::string::npos) { + // Avoid duplicates for local functions. + if (Name.find('/', SlashPos + 1) != std::string::npos) + continue; + Name = Name.substr(0, SlashPos); + } + FuncsFile << Indent << ".text." << Name << "\n"; + Indent = " "; + } } FuncsFile.close(); From 4d88c02ead45f8043252f15e3a352e5feeaec5c8 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 17 Oct 2017 10:05:16 -0700 Subject: [PATCH 318/904] [BOLT] Fix function order output option Summary: Add support to output both function order and section order files as the former is useful for offloading functions sorting and the latter is useful for linker script generation: -generate-function-order= -generate-link-sections= (cherry picked from commit e602c60f8ac645710083b565ca8fa4912f4111f9) --- bolt/Passes/ReorderFunctions.cpp | 70 ++++++++++++++++++++++++-------- 1 file changed, 52 insertions(+), 18 deletions(-) diff --git a/bolt/Passes/ReorderFunctions.cpp b/bolt/Passes/ReorderFunctions.cpp index ed61058b73ac..bb5e55ca752b 100644 --- a/bolt/Passes/ReorderFunctions.cpp +++ b/bolt/Passes/ReorderFunctions.cpp @@ -75,6 +75,12 @@ GenerateFunctionOrderFile("generate-function-order", "reordering"), cl::cat(BoltOptCategory)); +static cl::opt 
+LinkSectionsFile("generate-link-sections", + cl::desc("generate a list of function sections in a format suitable for " + "inclusion in a linker script"), + cl::cat(BoltOptCategory)); + static cl::opt UseEdgeCounts("use-edge-counts", cl::desc("use edge count data when doing clustering"), @@ -408,16 +414,32 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC, reorder(std::move(Clusters), BFs); + std::unique_ptr FuncsFile; if (!opts::GenerateFunctionOrderFile.empty()) { - std::ofstream FuncsFile(opts::GenerateFunctionOrderFile, std::ios::out); + FuncsFile = + llvm::make_unique(opts::GenerateFunctionOrderFile, + std::ios::out); if (!FuncsFile) { - errs() << "Ordered functions file \"" << opts::GenerateFunctionOrderFile - << "\" can't be opened.\n"; + errs() << "BOLT-ERROR: ordered functions file " + << opts::GenerateFunctionOrderFile << " cannot be opened\n"; exit(1); } + } - std::vector SortedFunctions(BFs.size()); + std::unique_ptr LinkSectionsFile; + if (!opts::LinkSectionsFile.empty()) { + LinkSectionsFile = + llvm::make_unique(opts::LinkSectionsFile, + std::ios::out); + if (!LinkSectionsFile) { + errs() << "BOLT-ERROR: link sections file " + << opts::LinkSectionsFile << " cannot be opened\n"; + exit(1); + } + } + if (FuncsFile || LinkSectionsFile) { + std::vector SortedFunctions(BFs.size()); std::transform(BFs.begin(), BFs.end(), SortedFunctions.begin(), @@ -446,25 +468,37 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC, break; if (Func->isPLTFunction()) continue; - const char *Indent = ""; - for (auto Name : Func->getNames()) { - const auto SlashPos = Name.find('/'); - if (SlashPos != std::string::npos) { - // Avoid duplicates for local functions. 
- if (Name.find('/', SlashPos + 1) != std::string::npos) - continue; - Name = Name.substr(0, SlashPos); + + if (FuncsFile) + *FuncsFile << Func->getSymbol()->getName().data() << "\n"; + + if (LinkSectionsFile) { + const char *Indent = ""; + for (auto Name : Func->getNames()) { + const auto SlashPos = Name.find('/'); + if (SlashPos != std::string::npos) { + // Avoid duplicates for local functions. + if (Name.find('/', SlashPos + 1) != std::string::npos) + continue; + Name = Name.substr(0, SlashPos); + } + *LinkSectionsFile << Indent << ".text." << Name << "\n"; + Indent = " "; } - FuncsFile << Indent << ".text." << Name << "\n"; - Indent = " "; } } - FuncsFile.close(); - outs() << "BOLT-INFO: dumped function order to \"" - << opts::GenerateFunctionOrderFile << "\"\n"; + if (FuncsFile) { + FuncsFile->close(); + outs() << "BOLT-INFO: dumped function order to " + << opts::GenerateFunctionOrderFile << '\n'; + } - exit(0); + if (LinkSectionsFile) { + LinkSectionsFile->close(); + outs() << "BOLT-INFO: dumped linker section order to " + << opts::LinkSectionsFile << '\n'; + } } } From 47d2291d8c8b7687aac8da0ac60bde0f40cfc320 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Mon, 16 Oct 2017 16:53:50 -0700 Subject: [PATCH 319/904] updating cache metrics Summary: This is a replacement of a previous diff. The implemented metric ('graph distance') is not very useful at the moment but I plan to add more relevant metrics in the subsequent diff. This diff fixes some obvious problems and moves the call of CalcMetrics::printAll to the right place. 
(cherry picked from commit 905831265f7e9ecb8cf50edc24108d968c99b8a0) --- bolt/CMakeLists.txt | 2 +- ...{CalcCacheMetrics.cpp => CacheMetrics.cpp} | 123 ++++++++---------- bolt/CacheMetrics.h | 27 ++++ bolt/CalcCacheMetrics.h | 27 ---- bolt/RewriteInstance.cpp | 23 ++-- 5 files changed, 88 insertions(+), 114 deletions(-) rename bolt/{CalcCacheMetrics.cpp => CacheMetrics.cpp} (53%) create mode 100644 bolt/CacheMetrics.h delete mode 100644 bolt/CalcCacheMetrics.h diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 1194d7da67a3..dd8a44975134 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -64,7 +64,7 @@ add_llvm_tool(llvm-bolt BinaryContext.cpp BinaryFunction.cpp BinaryPassManager.cpp - CalcCacheMetrics.cpp + CacheMetrics.cpp DataAggregator.cpp DataReader.cpp DebugData.cpp diff --git a/bolt/CalcCacheMetrics.cpp b/bolt/CacheMetrics.cpp similarity index 53% rename from bolt/CalcCacheMetrics.cpp rename to bolt/CacheMetrics.cpp index 166b3c3b7dd4..4e5b08fe2729 100644 --- a/bolt/CalcCacheMetrics.cpp +++ b/bolt/CacheMetrics.cpp @@ -1,4 +1,4 @@ -//===------ CalcCacheMetrics.cpp - Calculate metrics of cache lines -------===// +//===------ CacheMetrics.cpp - Calculate metrics for instruction cache ----===// // // Functions to show metrics of cache lines // @@ -7,31 +7,12 @@ // //===----------------------------------------------------------------------===// - -#include "BinaryBasicBlock.h" -#include "BinaryContext.h" -#include "BinaryFunction.h" -#include "BinaryPassManager.h" -#include "CalcCacheMetrics.h" -#include "Exceptions.h" -#include "RewriteInstance.h" -#include "llvm/MC/MCAsmLayout.h" -#include "llvm/MC/MCObjectStreamer.h" -#include "llvm/MC/MCSectionELF.h" -#include +#include "CacheMetrics.h" using namespace llvm; -using namespace object; using namespace bolt; using Traversal = std::vector; -namespace opts { - -extern cl::OptionCategory BoltOptCategory; - -} // namespace opts - - namespace { /// Initialize and return a position map for binary 
basic blocks. @@ -47,10 +28,8 @@ getPositionMap(const BinaryFunction &Function) { return DistMap; } -/// Initialize and return a vector of traversals for a given function and its -/// entry point -std::vector getTraversals(const BinaryFunction &Function, - BinaryBasicBlock *EntryBB) { +/// Initialize and return a vector of traversals for a given entry block +std::vector getTraversals(BinaryBasicBlock *EntryBB) { std::vector AllTraversals; std::stack> Stack; Stack.push(std::make_pair(EntryBB, Traversal())); @@ -105,10 +84,6 @@ std::vector getTraversals(const BinaryFunction &Function, double getTraversalLength(std::unordered_map &DistMap, Traversal const &Path) { - if (Path.size() <= 1) { - return 0.0; - } - double Length = 0.0; BinaryBasicBlock *PrevBB = Path.front(); for (auto BBI = std::next(Path.begin()); BBI != Path.end(); ++BBI) { @@ -119,56 +94,62 @@ getTraversalLength(std::unordered_map &DistMap, return Length; } -/// Helper function of calcGraphDistance to go through the call traversals of -/// certain function and to calculate and record the length of each -/// traversal. 
-void graphDistHelper(std::vector &AllTraversals, - const BinaryFunction &Function, - std::unordered_map &TraversalMap, - uint64_t &TraversalCount) { - auto DistMap = getPositionMap(Function); - - for (auto const &Path : AllTraversals) { - TraversalMap[++TraversalCount] = getTraversalLength(DistMap, Path); - } -} -} - -void CalcCacheMetrics::calcGraphDistance( - const std::map &BinaryFunctions) { - - double TotalFuncValue = 0; - uint64_t FuncCount = 0; - for (auto &BFI : BinaryFunctions) { - auto &Function = BFI.second; +/// Calculate average number of call distance for every graph traversal +double calcGraphDistance(const std::vector &BinaryFunctions) { + double TotalTraversalLength = 0; + double NumTraversals = 0; + for (auto BF : BinaryFunctions) { // Only consider functions which are known to be executed - if (Function.getKnownExecutionCount() == 0) + if (BF->getKnownExecutionCount() == 0) continue; - std::unordered_map TraversalMap; - uint64_t TraversalCount = 0; - for (auto *BB : Function.layout()) { + for (auto BB : BF->layout()) { if (BB->isEntryPoint()) { - auto AllTraversals = getTraversals(Function, BB); - graphDistHelper(AllTraversals, Function, TraversalMap, TraversalCount); + auto AllTraversals = getTraversals(BB); + auto DistMap = getPositionMap(*BF); + for (auto const &Path : AllTraversals) { + // Ignore short traversals + if (Path.size() <= 1) + continue; + TotalTraversalLength += getTraversalLength(DistMap, Path); + NumTraversals++; + } } } + } - double TotalValue = 0; - for (auto const &Entry : TraversalMap) { - TotalValue += Entry.second; - } + return TotalTraversalLength / NumTraversals; +} - double AverageValue = - TraversalMap.empty() ? 0 : (TotalValue * 1.0 / TraversalMap.size()); - TotalFuncValue += AverageValue; - FuncCount += TraversalMap.empty() ? 
0 : 1; +} + +void CacheMetrics::printAll( + const std::vector &BinaryFunctions) { + + size_t NumFunctions = 0; + size_t NumHotFunctions = 0; + size_t NumBlocks = 0; + size_t NumHotBlocks = 0; + + for (auto BF : BinaryFunctions) { + NumFunctions++; + if (BF->getKnownExecutionCount() > 0) + NumHotFunctions++; + for (auto BB : BF->layout()) { + NumBlocks++; + if (BB->getKnownExecutionCount() > 0) + NumHotBlocks++; + } } - outs() << format(" Sum of averages of traversal distance for all " - "functions is: %.2f\n", - TotalFuncValue) - << format(" There are %u functions in total\n", FuncCount) - << format(" On average, every traversal is %.2f long\n\n", - TotalFuncValue / FuncCount); + outs() << format(" There are %zu functions;", NumFunctions) + << format(" %zu (%.2lf%%) have non-empty execution count\n", + NumHotFunctions, 100.0 * NumHotFunctions / NumFunctions); + outs() << format(" There are %zu basic blocks;", NumBlocks) + << format(" %zu (%.2lf%%) have non-empty execution count\n", + NumHotBlocks, 100.0 * NumHotBlocks / NumBlocks); + + const auto GraphDistance = calcGraphDistance(BinaryFunctions); + outs() << " An average length of graph traversal is " + << format("%.2lf\n", GraphDistance); } diff --git a/bolt/CacheMetrics.h b/bolt/CacheMetrics.h new file mode 100644 index 000000000000..e4ca3abc34f9 --- /dev/null +++ b/bolt/CacheMetrics.h @@ -0,0 +1,27 @@ +//===- CacheMetrics.h - Interface for instruction cache evaluation --===// +// +// Functions to show metrics of cache lines +// +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_CACHEMETRICS_H +#define LLVM_TOOLS_LLVM_BOLT_CACHEMETRICS_H + +#include "BinaryFunction.h" +#include + +namespace llvm { +namespace bolt { +namespace CacheMetrics { + +/// Calculate and print various metrics related to instruction cache performance +void printAll(const std::vector 
&BinaryFunctions); + +} // namespace CacheMetrics +} // namespace bolt +} // namespace llvm + +#endif //LLVM_CACHEMETRICS_H diff --git a/bolt/CalcCacheMetrics.h b/bolt/CalcCacheMetrics.h deleted file mode 100644 index 07ca4551e28f..000000000000 --- a/bolt/CalcCacheMetrics.h +++ /dev/null @@ -1,27 +0,0 @@ -//===- CalcCacheMetrics.h - Interface for metrics printing of cache lines --===// -// -// Functions to show metrics of cache lines -// -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - - -#ifndef LLVM_CALCCACHEMETRICS_H -#define LLVM_CALCCACHEMETRICS_H - -#include "BinaryFunction.h" -#include - -using namespace llvm; -using namespace object; -using namespace bolt; - -namespace CalcCacheMetrics { -/// Calculate average number of call distance for every graph traversal. -void calcGraphDistance( - const std::map &BinaryFunctions); -} - -#endif //LLVM_CALCCACHEMETRICS_H diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index dbdb521bd149..3274f213d908 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -14,7 +14,7 @@ #include "BinaryContext.h" #include "BinaryFunction.h" #include "BinaryPassManager.h" -#include "CalcCacheMetrics.h" +#include "CacheMetrics.h" #include "DataAggregator.h" #include "DataReader.h" #include "Exceptions.h" @@ -79,8 +79,8 @@ extern cl::opt JumpTables; extern cl::opt ReorderFunctions; static cl::opt -CalcCacheMetrics("calc-cache-metrics", - cl::desc("calculate metrics of cache lines"), +PrintCacheMetrics("print-cache-metrics", + cl::desc("calculate and print various metrics for instruction cache"), cl::init(false), cl::ZeroOrMore, cl::cat(BoltOptCategory)); @@ -920,12 +920,6 @@ void RewriteInstance::run() { executeRewritePass(LargeFunctions); } - if (opts::CalcCacheMetrics) { - outs() << "\nBOLT-INFO: After Optimization CFG Graph Statistics: Jump " - "Distance \n\n"; - 
CalcCacheMetrics::calcGraphDistance(BinaryFunctions); - } - if (opts::UpdateDebugSections) updateDebugInfo(); @@ -2096,12 +2090,6 @@ void RewriteInstance::disassembleFunctions() { } } } - - if (opts::CalcCacheMetrics) { - outs() << "\nBOLT-INFO: Before Optimization CFG Graph Statistics: Jump " - "Distance \n\n"; - CalcCacheMetrics::calcGraphDistance(BinaryFunctions); - } } void RewriteInstance::runOptimizationPasses() { @@ -2424,6 +2412,11 @@ void RewriteInstance::emitFunctions() { OLT.emitAndFinalize(ObjectsHandle); + if (opts::PrintCacheMetrics) { + outs() << "BOLT-INFO: cache metrics after optimization\n"; + CacheMetrics::printAll(SortedFunctions); + } + if (opts::KeepTmp) TempOut->keep(); } From 6fd5607716805e103217b09bd4156be817b1ee53 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 12 Oct 2017 14:57:11 -0700 Subject: [PATCH 320/904] [BOLT][Refactoring] Make CTC first class operand, etc. Summary: This diff is a preparation for decoupling function disassembly, profile association, and CFG construction phases. We used to have multiple ways to mark conditional tail calls with annotations or TailCallOffsets map. Since CTC information is affecting the correctness, it is justifiable to have it as a operand class for instruction with a destination (0 is a valid one). "Offset" annotation now replaces "EdgeCountData" and "IndirectBranchData" annotations to extract profile data for any given instruction. Inlining for small functions was broken in a presence of profiled (annotated) instructions and hence I had to remove "-inline-small-functions" from the test case. Also fix an issue with UNDEF section for created __hot_start/__hot_end symbols. Now the symbols use ABS section. 
(cherry picked from commit 1cb088f7e15412993bc532a016dc5cfe0a075f66) --- bolt/BinaryFunction.cpp | 107 +++++++----------------- bolt/BinaryFunction.h | 6 -- bolt/Passes/BinaryFunctionCallGraph.cpp | 8 +- bolt/Passes/BinaryPasses.cpp | 2 +- bolt/Passes/IndirectCallPromotion.cpp | 14 ++-- bolt/Passes/Inliner.cpp | 4 +- bolt/RewriteInstance.cpp | 1 - 7 files changed, 45 insertions(+), 97 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 9e10d76e7e77..048bd785a4b1 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -740,7 +740,7 @@ BinaryFunction::analyzeIndirectBranch(MCInst &Instruction, // // We handle PIC-style jump tables separately. // - if (Instruction.getNumOperands() == 1) { + if (Instruction.getNumPrimeOperands() == 1) { // If the indirect jump is on register - try to detect if the // register value is loaded from a memory location. assert(Instruction.getOperand(0).isReg() && "register operand expected"); @@ -1064,7 +1064,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { if (!IsZeroPadding) { // Ignore this function. Skip to the next one in non-relocs mode. - errs() << "BOLT-ERROR: unable to disassemble instruction at offset 0x" + errs() << "BOLT-WARNING: unable to disassemble instruction at offset 0x" << Twine::utohexstr(Offset) << " (address 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ") in function " << *this << '\n'; @@ -1098,7 +1098,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { int64_t Value; const auto Result = BC.MIA->replaceImmWithSymbol(Instruction, Relocation.Symbol, - Relocation.Addend, BC.Ctx.get(), Value); + Relocation.Addend, Ctx.get(), Value); (void)Result; assert(Result && "cannot replace immediate with relocation"); @@ -1125,7 +1125,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { // or a recursive call. 
bool IsCall = MIA->isCall(Instruction); const bool IsCondBranch = MIA->isConditionalBranch(Instruction); - MCSymbol *TargetSymbol{nullptr}; + MCSymbol *TargetSymbol = nullptr; if (IsCall && containsAddress(TargetAddress)) { if (TargetAddress == getAddress()) { @@ -1154,7 +1154,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { << " : replacing with nop.\n"); BC.MIA->createNoop(Instruction); if (IsCondBranch) { - // Register branch function profile validation. + // Register branch offset for profile validation. IgnoredBranches.emplace_back(Offset, Offset + Size); } goto add_instruction; @@ -1182,10 +1182,6 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { << Twine::utohexstr(AbsoluteInstrAddr) << ".\n"; } } - // TODO: A better way to do this would be using annotations for - // MCInst objects. - TailCallOffsets.emplace(std::make_pair(Offset, - TargetAddress)); IsCall = true; } @@ -1231,14 +1227,6 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { // Add taken branch info. TakenBranches.emplace_back(Offset, TargetAddress - getAddress()); } - if (IsCondBranch) { - // Add fallthrough branch info. - FTBranches.emplace_back(Offset, Offset + Size); - } - - const bool isIndirect = - ((IsCall || !IsCondBranch) && MIA->isIndirectBranch(Instruction)); - Instruction.clear(); Instruction.addOperand( MCOperand::createExpr( @@ -1246,19 +1234,21 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { MCSymbolRefExpr::VK_None, *Ctx))); - if (BranchData) { + // Record call offset for profile matching. + if (IsCall) { + MIA->addAnnotation(Ctx.get(), Instruction, "Offset", Offset); + } + if (IsCondBranch) { + // Add fallthrough branch info. 
+ FTBranches.emplace_back(Offset, Offset + Size); if (IsCall) { - MIA->addAnnotation(Ctx.get(), Instruction, "EdgeCountData", Offset); - } - if (isIndirect) { - MIA->addAnnotation(Ctx.get(), Instruction, "IndirectBranchData", - Offset); + MIA->setConditionalTailCall(Instruction, TargetAddress); } } } else { // Could not evaluate branch. Should be an indirect call or an // indirect branch. Bail out on the latter case. - bool MaybeEdgeCountData = false; + MIA->addAnnotation(Ctx.get(), Instruction, "Offset", Offset); if (MIA->isIndirectBranch(Instruction)) { auto Result = analyzeIndirectBranch(Instruction, Size, Offset); switch (Result) { @@ -1269,40 +1259,18 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { auto Result = MIA->convertJmpToTailCall(Instruction); (void)Result; assert(Result); - if (BranchData) { - MIA->addAnnotation(Ctx.get(), Instruction, "IndirectBranchData", - Offset); - } } break; case IndirectBranchType::POSSIBLE_JUMP_TABLE: case IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE: if (opts::JumpTables == JTS_NONE) IsSimple = false; - MaybeEdgeCountData = true; break; case IndirectBranchType::UNKNOWN: // Keep processing. We'll do more checks and fixes in // postProcessIndirectBranches(). - MaybeEdgeCountData = true; - if (BranchData) { - MIA->addAnnotation(Ctx.get(), - Instruction, - "MaybeIndirectBranchData", - Offset); - } break; }; - } else if (MIA->isCall(Instruction)) { - if (BranchData) { - MIA->addAnnotation(Ctx.get(), Instruction, "IndirectBranchData", - Offset); - } - } - if (BranchData) { - const char* AttrName = - MaybeEdgeCountData ? "MaybeEdgeCountData" : "EdgeCountData"; - MIA->addAnnotation(Ctx.get(), Instruction, AttrName, Offset); } // Indirect call. 
We only need to fix it if the operand is RIP-relative if (IsSimple && MIA->hasRIPOperand(Instruction)) { @@ -1310,6 +1278,8 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { errs() << "BOLT-ERROR: cannot handle RIP operand at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ". Skipping function " << *this << ".\n"; + if (opts::Relocs) + exit(1); IsSimple = false; } } @@ -1320,6 +1290,8 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { errs() << "BOLT-ERROR: cannot handle RIP operand at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ". Skipping function " << *this << ".\n"; + if (opts::Relocs) + exit(1); IsSimple = false; } } @@ -1336,7 +1308,6 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { postProcessJumpTables(); - // Update state. updateState(State::Disassembled); } @@ -1402,12 +1373,6 @@ bool BinaryFunction::postProcessIndirectBranches() { // it must be a tail call. if (layout_size() == 1) { BC.MIA->convertJmpToTailCall(Instr); - BC.MIA->renameAnnotation(Instr, - "MaybeEdgeCountData", - "EdgeCountData"); - BC.MIA->renameAnnotation(Instr, - "MaybeIndirectBranchData", - "IndirectBranchData"); return true; } @@ -1487,12 +1452,6 @@ bool BinaryFunction::postProcessIndirectBranches() { return false; } BC.MIA->convertJmpToTailCall(Instr); - BC.MIA->renameAnnotation(Instr, - "MaybeEdgeCountData", - "EdgeCountData"); - BC.MIA->renameAnnotation(Instr, - "MaybeIndirectBranchData", - "IndirectBranchData"); } } return true; @@ -1573,7 +1532,7 @@ bool BinaryFunction::buildCFG() { // unconditional branch, and the unconditional branch is not // a destination of another branch. In the latter case, the // basic block will consist of a single unconditional branch - // (missed optimization opportunity?). + // (missed "double-jump" optimization). 
// // Created basic blocks are sorted in layout order since they are // created in the same order as instructions, and instructions are @@ -1581,7 +1540,6 @@ bool BinaryFunction::buildCFG() { BinaryBasicBlock *InsertBB{nullptr}; BinaryBasicBlock *PrevBB{nullptr}; bool IsLastInstrNop{false}; - bool IsPreviousInstrTailCall{false}; const MCInst *PrevInstr{nullptr}; auto addCFIPlaceholders = @@ -1615,11 +1573,13 @@ bool BinaryFunction::buildCFG() { } if (!InsertBB) { // It must be a fallthrough or unreachable code. Create a new block unless - // we see an unconditional branch following a conditional one. + // we see an unconditional branch following a conditional one. The latter + // should not be a conditional tail call. assert(PrevBB && "no previous basic block for a fall through"); assert(PrevInstr && "no previous instruction for a fall through"); if (MIA->isUnconditionalBranch(Instr) && - !MIA->isUnconditionalBranch(*PrevInstr) && !IsPreviousInstrTailCall) { + !MIA->isUnconditionalBranch(*PrevInstr) && + !MIA->getConditionalTailCall(*PrevInstr)) { // Temporarily restore inserter basic block. InsertBB = PrevBB; } else { @@ -1637,16 +1597,10 @@ bool BinaryFunction::buildCFG() { uint32_t InsertIndex = InsertBB->addInstruction(Instr); PrevInstr = &Instr; - // Record whether this basic block is terminated with a tail call. - auto TCI = TailCallOffsets.find(Offset); - if (TCI != TailCallOffsets.end()) { - uint64_t TargetAddr = TCI->second; + // Record conditional tail call info. + if (const auto CTCDest = MIA->getConditionalTailCall(Instr)) { TailCallTerminatedBlocks.emplace( - std::make_pair(InsertBB, - TailCallInfo(Offset, InsertIndex, TargetAddr))); - IsPreviousInstrTailCall = true; - } else { - IsPreviousInstrTailCall = false; + std::make_pair(InsertBB, TailCallInfo(Offset, InsertIndex, *CTCDest))); } // Add associated CFI instrs. 
We always add the CFI instruction that is @@ -1821,9 +1775,7 @@ bool BinaryFunction::buildCFG() { // Check if the last instruction is a conditional jump that serves as a tail // call. - bool IsCondTailCall = MIA->isConditionalBranch(*LastInstIter) && - TailCallTerminatedBlocks.count(BB); - + const auto IsCondTailCall = MIA->getConditionalTailCall(*LastInstIter); if (BB->succ_size() == 0) { if (IsCondTailCall) { // Conditional tail call without profile data for non-taken branch. @@ -1915,7 +1867,6 @@ bool BinaryFunction::buildCFG() { // NB: don't clear Labels list as we may need them if we mark the function // as non-simple later in the process of discovering extra entry points. clearList(Instructions); - clearList(TailCallOffsets); clearList(TailCallTerminatedBlocks); clearList(OffsetToCFI); clearList(TakenBranches); @@ -4384,7 +4335,7 @@ DynoStats BinaryFunction::getDynoStats() const { if (!BC.MIA->isCall(Instr)) continue; uint64_t CallFreq = BBExecutionCount; - if (BC.MIA->isCTC(Instr)) { + if (BC.MIA->getConditionalTailCall(Instr)) { CallFreq = 0; if (auto FreqOrErr = BC.MIA->tryGetAnnotationAs(Instr, "CTCTakenFreq")) { @@ -4444,7 +4395,7 @@ DynoStats BinaryFunction::getDynoStats() const { } // CTCs - if (BC.MIA->isCTC(*CondBranch)) { + if (BC.MIA->getConditionalTailCall(*CondBranch)) { if (BB->branch_info_begin() != BB->branch_info_end()) Stats[DynoStats::UNCOND_BRANCHES] += BB->branch_info_begin()->Count; continue; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 7f58d4b92b84..4f9a45ef495c 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -482,12 +482,6 @@ class BinaryFunction { using InstrMapType = std::map; InstrMapType Instructions; - /// Temporary holder of offsets of tail call instructions before CFG is - /// constructed. Map from offset to the corresponding target address of the - /// tail call. 
- using TailCallOffsetMapType = std::map; - TailCallOffsetMapType TailCallOffsets; - /// Temporary holder of tail call terminated basic blocks used during CFG /// construction. Map from tail call terminated basic block to a struct with /// information about the tail call. diff --git a/bolt/Passes/BinaryFunctionCallGraph.cpp b/bolt/Passes/BinaryFunctionCallGraph.cpp index bf976ff36965..cb6dbd6b5471 100644 --- a/bolt/Passes/BinaryFunctionCallGraph.cpp +++ b/bolt/Passes/BinaryFunctionCallGraph.cpp @@ -188,10 +188,10 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, // If this is an indirect call use perf data directly. if (!DstSym && BranchData && - BC.MIA->hasAnnotation(Inst, "EdgeCountData")) { - const auto DataOffset = - BC.MIA->getAnnotationAs(Inst, "EdgeCountData"); - for (const auto &BI : BranchData->getBranchRange(DataOffset)) { + BC.MIA->hasAnnotation(Inst, "Offset")) { + const auto InstrOffset = + BC.MIA->getAnnotationAs(Inst, "Offset"); + for (const auto &BI : BranchData->getBranchRange(InstrOffset)) { Counts.push_back(getCallInfoFromBranchData(BI, false)); } } else { diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 302152d121ae..80be088faca2 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -655,7 +655,7 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, ? 
0 : PredBB->getBranchInfo(true).Count; // Annotate it, so "isCall" returns true for this jcc - MIA->addAnnotation(BC.Ctx.get(), *CondBranch, "IsCTC", true); + MIA->setConditionalTailCall(*CondBranch); // Add info abount the conditional tail call frequency, otherwise this // info will be lost when we delete the associated BranchInfo entry BC.MIA->addAnnotation(BC.Ctx.get(), *CondBranch, "CTCTakenFreq", diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/Passes/IndirectCallPromotion.cpp index bb1478188c3c..8f3e3986cf00 100644 --- a/bolt/Passes/IndirectCallPromotion.cpp +++ b/bolt/Passes/IndirectCallPromotion.cpp @@ -196,7 +196,7 @@ IndirectCallPromotion::getCallTargets( } else { const auto *BranchData = BF.getBranchData(); assert(BranchData && "expected initialized branch data"); - auto Offset = BC.MIA->getAnnotationAs(Inst, "IndirectBranchData"); + auto Offset = BC.MIA->getAnnotationAs(Inst, "Offset"); for (const auto &BI : BranchData->getBranchRange(Offset)) { Callsite Site(BF, BI); if (Site.isValid()) { @@ -309,7 +309,7 @@ IndirectCallPromotion::rewriteCall(BinaryContext &BC, auto TBB = Function.createBasicBlock(0, Sym); for (auto &Inst : Insts) { // sanitize new instructions. 
if (BC.MIA->isCall(Inst)) - BC.MIA->removeAnnotation(Inst, "IndirectBranchData"); + BC.MIA->removeAnnotation(Inst, "Offset"); } TBB->addInstructions(Insts.begin(), Insts.end()); NewBBs.emplace_back(std::move(TBB)); @@ -725,9 +725,9 @@ void IndirectCallPromotion::runOnFunctions( auto &Inst = BB->getInstructionAtIndex(Idx); const auto InstIdx = &Inst - &(*BB->begin()); const bool IsTailCall = BC.MIA->isTailCall(Inst); + const bool HasBranchData = Function.getBranchData() && + BC.MIA->hasAnnotation(Inst, "Offset"); const bool IsJumpTable = Function.getJumpTable(Inst); - const bool HasBranchData = - BC.MIA->hasAnnotation(Inst, "IndirectBranchData"); const bool OptimizeCalls = (opts::IndirectCallPromotion == ICP_CALLS || opts::IndirectCallPromotion == ICP_ALL); @@ -735,10 +735,14 @@ void IndirectCallPromotion::runOnFunctions( (opts::IndirectCallPromotion == ICP_JUMP_TABLES || opts::IndirectCallPromotion == ICP_ALL); - if (!((HasBranchData && OptimizeCalls) || + if (!((HasBranchData && !IsJumpTable && OptimizeCalls) || (IsJumpTable && OptimizeJumpTables))) continue; + // Ignore direct calls. 
+ if (BC.MIA->isCall(Inst) && BC.MIA->getTargetSymbol(Inst, 0)) + continue; + assert(BC.MIA->isCall(Inst) || BC.MIA->isIndirectBranch(Inst)); if (IsJumpTable) diff --git a/bolt/Passes/Inliner.cpp b/bolt/Passes/Inliner.cpp index c27d42a5562d..65347403c1da 100644 --- a/bolt/Passes/Inliner.cpp +++ b/bolt/Passes/Inliner.cpp @@ -447,7 +447,7 @@ bool InlineSmallFunctions::inlineCallsInFunction( auto &Inst = *InstIt; if (BC.MIA->isCall(Inst) && !BC.MIA->isTailCall(Inst) && - Inst.size() == 1 && + Inst.getNumPrimeOperands() == 1 && Inst.getOperand(0).isExpr()) { const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst); assert(TargetSymbol && "target symbol expected for direct call"); @@ -513,7 +513,7 @@ bool InlineSmallFunctions::inlineCallsInFunctionAggressive( for (auto InstIt = BB->begin(); InstIt != BB->end(); ) { auto &Inst = *InstIt; if (BC.MIA->isCall(Inst) && - Inst.size() == 1 && + Inst.getNumPrimeOperands() == 1 && Inst.getOperand(0).isExpr()) { assert(!BC.MIA->isInvoke(Inst)); const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst); diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 3274f213d908..33153877bb36 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -3429,7 +3429,6 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { Symbol.st_shndx = ELF::SHN_ABS; Symbol.st_name = AddToStrTab(Name); Symbol.st_size = 0; - Symbol.st_shndx = 0; Symbol.st_other = 0; Symbol.setBindingAndType(ELF::STB_WEAK, ELF::STT_NOTYPE); From a8a31d19a9c10b5f5c7d04a3f43a2afc2e037672 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 10 Oct 2017 14:54:09 -0700 Subject: [PATCH 321/904] [BOLT] Account for FDE functions when calculating max function size Summary: When we calculate maximum function size we only used to rely on the symbol table information, and ignore function info coming from FDEs. Invalid maximum function size can lead to code emission over the code of neighbouring function. 
Fix this by considering FDE functions when determining the maximum function size. (cherry picked from commit e7dbfb8b3a4a35b14f9f95899af23aa3ce5e7014) --- bolt/RewriteInstance.cpp | 41 ++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 33153877bb36..4fea432f280c 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1351,12 +1351,14 @@ void RewriteInstance::disassemblePLT() { } void RewriteInstance::adjustFunctionBoundaries() { - for (auto &BFI : BinaryFunctions) { - auto &Function = BFI.second; - - // Check if there's a symbol with a larger address in the same section. - // If there is - it determines the maximum size for the current function, - // otherwise, it is the size of containing section the defines it. + for (auto BFI = BinaryFunctions.begin(), BFE = BinaryFunctions.end(); + BFI != BFE; ++BFI) { + auto &Function = BFI->second; + + // Check if there's a symbol or a function with a larger address in the + // same section. If there is - it determines the maximum size for the + // current function. Otherwise, it is the size of a containing section + // the defines it. // // NOTE: ignore some symbols that could be tolerated inside the body // of a function. @@ -1387,22 +1389,21 @@ void RewriteInstance::adjustFunctionBoundaries() { ? InputFile->section_end() : NextSymRefI->second.getSection(); - uint64_t MaxSize; - if (NextSymRefI != FileSymRefs.end() && - NextSymRefI->second.getSection() && - *NextSymRefI->second.getSection() != InputFile->section_end() && - **NextSymRefI->second.getSection() == Function.getSection()) { - MaxSize = NextSymRefI->first - Function.getAddress(); - } else { - // Function runs till the end of the containing section. 
- uint64_t SectionEnd = Function.getSection().getAddress() + - Function.getSection().getSize(); - assert((NextSymRefI == FileSymRefs.end() || - NextSymRefI->first >= SectionEnd) && - "different sections should not overlap"); - MaxSize = SectionEnd - Function.getAddress(); + // Function runs at most till the end of the containing section. + uint64_t NextObjectAddress = Function.getSection().getAddress() + + Function.getSection().getSize(); + // Or till the next object marked by a symbol. + if (NextSymRefI != FileSymRefs.end()) { + NextObjectAddress = std::min(NextSymRefI->first, NextObjectAddress); + } + // Or till the next function not marked by a symbol. + if (std::next(BFI) != BFE) { + const auto &NextFunction = std::next(BFI)->second; + NextObjectAddress = std::min(NextFunction.getAddress(), + NextObjectAddress); } + const auto MaxSize = NextObjectAddress - Function.getAddress(); if (MaxSize < Function.getSize()) { errs() << "BOLT-ERROR: symbol seen in the middle of the function " << Function << ". Skipping.\n"; From 1dc5efbaf4535bbd4d3a3b687095b6231f4af4a2 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 19 Oct 2017 12:36:48 -0700 Subject: [PATCH 322/904] [BOLT] Add ability to specify custom printers for annotations. Summary: This will give us the ability to print annotations in a more meaningful way. Especially annotations that could be interpreted in multiple ways. I've added one register name printer for liveness analysis. We can update the other dataflow annotations as needed. I also noticed that BitVector annotations were leaking since they contain heap allocated memory. I made removeAnnotation call the annotation destructor explicitly to mitigate this but it won't fix the problem when annotations are just dropped en masse. 
(cherry picked from commit 3955ac4f23e59b97905f1d8f2f6f11283c16c8d3) --- bolt/Passes/DataflowAnalysis.cpp | 18 ++++++++++----- bolt/Passes/DataflowAnalysis.h | 39 +++++++++++++++++++++++++------- bolt/Passes/LivenessAnalysis.h | 17 +++++++++----- 3 files changed, 54 insertions(+), 20 deletions(-) diff --git a/bolt/Passes/DataflowAnalysis.cpp b/bolt/Passes/DataflowAnalysis.cpp index e3a1894a930b..9a96059ffb7e 100644 --- a/bolt/Passes/DataflowAnalysis.cpp +++ b/bolt/Passes/DataflowAnalysis.cpp @@ -1,6 +1,12 @@ #include "DataflowAnalysis.h" namespace llvm { + +raw_ostream &operator<<(raw_ostream &OS, const BitVector &Val) { + OS << "BitVector"; + return OS; +} + namespace bolt { void doForAllPreds(const BinaryContext &BC, const BinaryBasicBlock &BB, @@ -30,11 +36,11 @@ void doForAllSuccs(const BinaryBasicBlock &BB, } } +void RegStatePrinter::print(raw_ostream &OS, const BitVector &State) const { + for (auto I = State.find_first(); I != -1; I = State.find_next(I)) { + OS << BC.MRI->getName(I) << " "; + } +} + } // namespace bolt } // namespace llvm - -llvm::raw_ostream &llvm::operator<<(llvm::raw_ostream &OS, - const BitVector &Val) { - OS << "BitVector"; - return OS; -} diff --git a/bolt/Passes/DataflowAnalysis.h b/bolt/Passes/DataflowAnalysis.h index e28d2b085c29..63d28c88f986 100644 --- a/bolt/Passes/DataflowAnalysis.h +++ b/bolt/Passes/DataflowAnalysis.h @@ -105,6 +105,25 @@ void doForAllPreds(const BinaryContext &BC, const BinaryBasicBlock &BB, void doForAllSuccs(const BinaryBasicBlock &BB, std::function Task); +/// Default printer for State data. +template +class StatePrinter { +public: + void print(raw_ostream &OS, const StateTy &State) const { + OS << State; + } + explicit StatePrinter(const BinaryContext &) { } +}; + +/// Printer for State data that is a BitVector of registers. 
+class RegStatePrinter { +public: + void print(raw_ostream &OS, const BitVector &State) const; + explicit RegStatePrinter(const BinaryContext &BC) : BC(BC) { } +private: + const BinaryContext &BC; +}; + /// Base class for dataflow analyses. Depends on the type of whatever object is /// stored as the state (StateTy) at each program point. The dataflow then /// updates the state at each program point depending on the instruction being @@ -132,7 +151,10 @@ void doForAllSuccs(const BinaryBasicBlock &BB, /// Confluence operator = union (if a reg is alive in any succ, it is alive /// in the current block). /// -template +template > class DataflowAnalysis { /// CRTP convenience methods Derived &derived() { @@ -212,7 +234,7 @@ class DataflowAnalysis { StateTy &getOrCreateStateAt(MCInst &Point) { return BC.MIA->getOrCreateAnnotationAs( - BC.Ctx.get(), Point, derived().getAnnotationName()); + BC.Ctx.get(), Point, derived().getAnnotationName(), StatePrinterTy(BC)); } StateTy &getOrCreateStateAt(ProgramPoint Point) { @@ -272,7 +294,7 @@ class DataflowAnalysis { return getStateAt(PrevPoint[&Point]); } - ErrorOrgetStateBefore(ProgramPoint Point) { + ErrorOr getStateBefore(ProgramPoint Point) { if (Point.isBB()) return getStateAt(*Point.getBB()); return getStateAt(PrevPoint[Point.getInst()]); @@ -462,9 +484,11 @@ class ExprIterator /// Specialization of DataflowAnalysis whose state specifically stores /// a set of instructions. 
-template +template > class InstrsDataflowAnalysis - : public DataflowAnalysis { + : public DataflowAnalysis { public: /// These iterator functions offer access to the set of pointers to /// instructions in a given program point @@ -512,7 +536,7 @@ class InstrsDataflowAnalysis } InstrsDataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF) - : DataflowAnalysis(BC, BF) {} + : DataflowAnalysis(BC, BF) {} virtual ~InstrsDataflowAnalysis() {} }; @@ -541,8 +565,7 @@ template<> struct DenseMapInfo { } }; -llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - const BitVector &Val); +raw_ostream &operator<<(raw_ostream &OS, const BitVector &Val); } // namespace llvm diff --git a/bolt/Passes/LivenessAnalysis.h b/bolt/Passes/LivenessAnalysis.h index 2fde42863392..3ef1700824a9 100644 --- a/bolt/Passes/LivenessAnalysis.h +++ b/bolt/Passes/LivenessAnalysis.h @@ -26,14 +26,20 @@ namespace llvm { namespace bolt { class LivenessAnalysis - : public DataflowAnalysis { - friend class DataflowAnalysis; + : public DataflowAnalysis { + using Parent = DataflowAnalysis; + friend class DataflowAnalysis; public: LivenessAnalysis(const RegAnalysis &RA, const BinaryContext &BC, BinaryFunction &BF) - : DataflowAnalysis(BC, BF), RA(RA), - NumRegs(BC.MRI->getNumRegs()) {} + : Parent(BC, BF), RA(RA), NumRegs(BC.MRI->getNumRegs()) {} virtual ~LivenessAnalysis(); bool isAlive(ProgramPoint PP, MCPhysReg Reg) const { @@ -45,7 +51,7 @@ class LivenessAnalysis void run() { NamedRegionTimer T1("LA", "Dataflow", opts::TimeOpts); - DataflowAnalysis::run(); + Parent::run(); } // Return a usable general-purpose reg after point P. Return 0 if no reg is @@ -122,5 +128,4 @@ class LivenessAnalysis } // end namespace bolt } // end namespace llvm - #endif From a366b22683855a2a44f0fa90f9a23a2ce65d3d5e Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 23 Oct 2017 23:32:40 -0700 Subject: [PATCH 323/904] [BOLT][Refactoring] Get rid of TailCallTerminatedBlocks, etc. 
Summary: More changes to allow separation of CFG construction and profile assignment. Misc cleanups. (cherry picked from commit ad4718550f10390ac81d991856241c37ef264613) --- bolt/BinaryBasicBlock.h | 2 + bolt/BinaryFunction.cpp | 357 ++++++++++++++++++----------------- bolt/BinaryFunction.h | 24 +-- bolt/DataReader.h | 2 +- bolt/Passes/BinaryPasses.cpp | 2 +- 5 files changed, 197 insertions(+), 190 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 3534711a2623..4c8048c3a6d4 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -115,6 +115,8 @@ class BinaryBasicBlock { private: BinaryBasicBlock() = delete; + BinaryBasicBlock(const BinaryBasicBlock &) = delete; + BinaryBasicBlock& operator=(const BinaryBasicBlock &) = delete; explicit BinaryBasicBlock( BinaryFunction *Function, diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 048bd785a4b1..2f12e8ad3dc0 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1479,6 +1479,8 @@ void BinaryFunction::addLandingPads(const unsigned StartIndex, } } } + + clearList(LPToBBIndex); } void BinaryFunction::recomputeLandingPads(const unsigned StartIndex, @@ -1503,8 +1505,6 @@ void BinaryFunction::recomputeLandingPads(const unsigned StartIndex, } addLandingPads(StartIndex, NumBlocks); - - clearList(LPToBBIndex); } bool BinaryFunction::buildCFG() { @@ -1552,7 +1552,7 @@ bool BinaryFunction::buildCFG() { }; for (auto I = Instructions.begin(), E = Instructions.end(); I != E; ++I) { - const uint32_t Offset = I->first; + const auto Offset = I->first; const auto &Instr = I->second; auto LI = Labels.find(Offset); @@ -1594,15 +1594,9 @@ bool BinaryFunction::buildCFG() { } IsLastInstrNop = false; - uint32_t InsertIndex = InsertBB->addInstruction(Instr); + InsertBB->addInstruction(Instr); PrevInstr = &Instr; - // Record conditional tail call info. 
- if (const auto CTCDest = MIA->getConditionalTailCall(Instr)) { - TailCallTerminatedBlocks.emplace( - std::make_pair(InsertBB, TailCallInfo(Offset, InsertIndex, *CTCDest))); - } - // Add associated CFI instrs. We always add the CFI instruction that is // located immediately after this instruction, since the next CFI // instruction reflects the change in state caused by this instruction. @@ -1624,7 +1618,6 @@ bool BinaryFunction::buildCFG() { } } - // How well do we detect tail calls here? if (MIA->isTerminator(Instr)) { PrevBB = InsertBB; InsertBB = nullptr; @@ -1667,6 +1660,7 @@ bool BinaryFunction::buildCFG() { const BranchInfo &BInfo = BranchInfoOrErr.get(); FromBB->addSuccessor(ToBB, BInfo.Branches, BInfo.Mispreds); + // Populate profile counts for the jump table. auto *LastInstr = FromBB->getLastNonPseudoInstr(); if (!LastInstr) @@ -1742,15 +1736,24 @@ bool BinaryFunction::buildCFG() { } } - for (auto &I : TailCallTerminatedBlocks) { - TailCallInfo &TCInfo = I.second; - if (BranchData) { - auto BranchInfoOrErr = BranchData->getDirectCallBranch(TCInfo.Offset); - if (BranchInfoOrErr) { - const BranchInfo &BInfo = BranchInfoOrErr.get(); - TCInfo.Count = BInfo.Branches; - TCInfo.Mispreds = BInfo.Mispreds; - } + if (BranchData) { + for (auto BB : BasicBlocks) { + auto *CTCInstr = BB->getLastNonPseudoInstr(); + if (!CTCInstr || !MIA->getConditionalTailCall(*CTCInstr)) + continue; + + auto OffsetOrErr = + MIA->tryGetAnnotationAs(*CTCInstr, "Offset"); + assert(OffsetOrErr && "offset not set for conditional tail call"); + + auto BranchInfoOrErr = BranchData->getDirectCallBranch(*OffsetOrErr); + if (!BranchInfoOrErr) + continue; + + MIA->addAnnotation(BC.Ctx.get(), *CTCInstr, "CTCTakenCount", + BranchInfoOrErr->Branches); + MIA->addAnnotation(BC.Ctx.get(), *CTCInstr, "CTCMispredCount", + BranchInfoOrErr->Mispreds); } } @@ -1769,13 +1772,11 @@ bool BinaryFunction::buildCFG() { continue; } - auto LastInstIter = --BB->end(); - while (MIA->isCFI(*LastInstIter) && 
LastInstIter != BB->begin()) - --LastInstIter; + auto LastInstr = BB->getLastNonPseudoInstr(); + assert(LastInstr && + "should have non-pseudo instruction in non-empty block"); - // Check if the last instruction is a conditional jump that serves as a tail - // call. - const auto IsCondTailCall = MIA->getConditionalTailCall(*LastInstIter); + const auto IsCondTailCall = MIA->getConditionalTailCall(*LastInstr); if (BB->succ_size() == 0) { if (IsCondTailCall) { // Conditional tail call without profile data for non-taken branch. @@ -1783,7 +1784,7 @@ bool BinaryFunction::buildCFG() { } else { // Unless the last instruction is a terminator, control will fall // through to the next basic block. - IsPrevFT = MIA->isTerminator(*LastInstIter) ? false : true; + IsPrevFT = !MIA->isTerminator(*LastInstr); } } else if (BB->succ_size() == 1) { if (IsCondTailCall) { @@ -1793,7 +1794,7 @@ bool BinaryFunction::buildCFG() { } else { // Fall-through should be added if the last instruction is a conditional // jump, since there was no profile data for the non-taken branch. - IsPrevFT = MIA->isConditionalBranch(*LastInstIter) ? true : false; + IsPrevFT = MIA->isConditionalBranch(*LastInstr); } } else { // Ends with 2 branches, with an indirect jump or it is a conditional @@ -1826,10 +1827,6 @@ bool BinaryFunction::buildCFG() { // Assign CFI information to each BB entry. annotateCFIState(); - // Convert conditional tail call branches to conditional branches that jump - // to a tail call. - removeConditionalTailCalls(); - // Set the basic block layout to the original order. PrevBB = nullptr; for (auto BB : BasicBlocks) { @@ -1840,6 +1837,11 @@ bool BinaryFunction::buildCFG() { } PrevBB->setEndOffset(getSize()); + // Convert conditional tail call branches to conditional branches that jump + // to a tail call. + // TODO: make a separate pass + removeConditionalTailCalls(); + // Make any necessary adjustments for indirect branches. 
if (!postProcessIndirectBranches()) { if (opts::Verbosity) { @@ -1867,7 +1869,6 @@ bool BinaryFunction::buildCFG() { // NB: don't clear Labels list as we may need them if we mark the function // as non-simple later in the process of discovering extra entry points. clearList(Instructions); - clearList(TailCallTerminatedBlocks); clearList(OffsetToCFI); clearList(TakenBranches); clearList(FTBranches); @@ -2258,11 +2259,11 @@ void BinaryFunction::inferFallThroughCounts() { } for (auto CurBB : BasicBlocks) { - auto SuccCount = CurBB->branch_info_begin(); + auto SuccBIIter = CurBB->branch_info_begin(); for (auto Succ : CurBB->successors()) { - if (SuccCount->Count != BinaryBasicBlock::COUNT_NO_PROFILE) - Succ->setExecutionCount(Succ->getExecutionCount() + SuccCount->Count); - ++SuccCount; + if (SuccBIIter->Count != BinaryBasicBlock::COUNT_NO_PROFILE) + Succ->setExecutionCount(Succ->getExecutionCount() + SuccBIIter->Count); + ++SuccBIIter; } } @@ -2281,8 +2282,7 @@ void BinaryFunction::inferFallThroughCounts() { // should rarely happen because there are few multiple-entry functions. for (const auto &I : BranchData->EntryData) { BinaryBasicBlock *BB = getBasicBlockAtOffset(I.To.Offset); - if (BB && (BB->isEntryPoint() || - LandingPads.find(BB->getLabel()) != LandingPads.end())) { + if (BB && (BB->isEntryPoint() || BB->isLandingPad())) { BB->setExecutionCount(BB->getExecutionCount() + I.Branches); } } @@ -2290,28 +2290,28 @@ void BinaryFunction::inferFallThroughCounts() { // Work on a basic block at a time, propagating frequency information // forwards. // It is important to walk in the layout order. 
- for (auto CurBB : BasicBlocks) { - uint64_t BBExecCount = CurBB->getExecutionCount(); + for (auto BB : BasicBlocks) { + uint64_t BBExecCount = BB->getExecutionCount(); // Propagate this information to successors, filling in fall-through edges // with frequency information - if (CurBB->succ_size() == 0) + if (BB->succ_size() == 0) continue; // Calculate frequency of outgoing branches from this node according to // LBR data. uint64_t ReportedBranches = 0; - for (const auto &SuccCount : CurBB->branch_info()) { - if (SuccCount.Count != BinaryBasicBlock::COUNT_NO_PROFILE) - ReportedBranches += SuccCount.Count; + for (const auto &SuccBI : BB->branch_info()) { + if (SuccBI.Count != BinaryBasicBlock::COUNT_NO_PROFILE) + ReportedBranches += SuccBI.Count; } - // Calculate frequency of outgoing tail calls from this node according to - // LBR data. - uint64_t ReportedTailCalls = 0; - auto TCI = TailCallTerminatedBlocks.find(CurBB); - if (TCI != TailCallTerminatedBlocks.end()) { - ReportedTailCalls = TCI->second.Count; + // Get taken count of conditional tail call if the block ends with one. + uint64_t CTCTakenCount = 0; + const auto CTCInstr = BB->getLastNonPseudoInstr(); + if (CTCInstr && BC.MIA->getConditionalTailCall(*CTCInstr)) { + CTCTakenCount = + BC.MIA->getAnnotationWithDefault(*CTCInstr, "CTCTakenCount"); } // Calculate frequency of throws from this node according to LBR data @@ -2319,12 +2319,12 @@ void BinaryFunction::inferFallThroughCounts() { // for a landing pad to be associated with more than one basic blocks, // we may overestimate the frequency of throws for such blocks. 
uint64_t ReportedThrows = 0; - for (BinaryBasicBlock *LP: CurBB->landing_pads()) { + for (const auto *LP: BB->landing_pads()) { ReportedThrows += LP->getExecutionCount(); } - uint64_t TotalReportedJumps = - ReportedBranches + ReportedTailCalls + ReportedThrows; + const uint64_t TotalReportedJumps = + ReportedBranches + CTCTakenCount + ReportedThrows; // Infer the frequency of the fall-through edge, representing not taking the // branch. @@ -2332,121 +2332,111 @@ void BinaryFunction::inferFallThroughCounts() { if (BBExecCount > TotalReportedJumps) Inferred = BBExecCount - TotalReportedJumps; - DEBUG({ + DEBUG( if (opts::Verbosity >= 1 && BBExecCount < TotalReportedJumps) errs() << "BOLT-WARNING: Fall-through inference is slightly inconsistent. " "exec frequency is less than the outgoing edges frequency (" << BBExecCount << " < " << ReportedBranches << ") for BB at offset 0x" - << Twine::utohexstr(getAddress() + CurBB->getOffset()) << '\n'; - }); + << Twine::utohexstr(getAddress() + BB->getOffset()) << '\n'; + ); - if (CurBB->succ_size() <= 2) { + if (BB->succ_size() <= 2) { // If there is an FT it will be the last successor. - auto &SuccCount = *CurBB->branch_info_rbegin(); - auto &Succ = *CurBB->succ_rbegin(); - if (SuccCount.Count == BinaryBasicBlock::COUNT_NO_PROFILE) { - SuccCount.Count = Inferred; + auto &SuccBI = *BB->branch_info_rbegin(); + auto &Succ = *BB->succ_rbegin(); + if (SuccBI.Count == BinaryBasicBlock::COUNT_NO_PROFILE) { + SuccBI.Count = Inferred; Succ->ExecutionCount += Inferred; } } - - } // end for (CurBB : BasicBlocks) + } return; } void BinaryFunction::removeConditionalTailCalls() { - for (auto &I : TailCallTerminatedBlocks) { - BinaryBasicBlock *BB = I.first; - TailCallInfo &TCInfo = I.second; - - // Get the conditional tail call instruction. - MCInst &CondTailCallInst = BB->getInstructionAtIndex(TCInfo.Index); - if (!BC.MIA->isConditionalBranch(CondTailCallInst)) { - // The block is not terminated with a conditional tail call. 
+ CurrentState = State::CFG; + + // Blocks to be appended at the end. + std::vector> NewBlocks; + + for (auto BBI = begin(); BBI != end(); ++BBI) { + auto &BB = *BBI; + auto *CTCInstr = BB.getLastNonPseudoInstr(); + if (!CTCInstr) continue; + + auto TargetAddressOrNone = BC.MIA->getConditionalTailCall(*CTCInstr); + if (!TargetAddressOrNone) + continue; + + // Gather all necessary information about CTC instruction before + // annotations are destroyed. + const auto CFIStateBeforeCTC = BB.getCFIStateAtInstr(CTCInstr); + uint64_t CTCTakenCount = BinaryBasicBlock::COUNT_NO_PROFILE; + uint64_t CTCMispredCount = BinaryBasicBlock::COUNT_NO_PROFILE; + if (hasValidProfile()) { + CTCTakenCount = + BC.MIA->getAnnotationWithDefault(*CTCInstr, "CTCTakenCount"); + CTCMispredCount = + BC.MIA->getAnnotationWithDefault(*CTCInstr, + "CTCMispredCount"); } // Assert that the tail call does not throw. const MCSymbol *LP; uint64_t Action; - std::tie(LP, Action) = BC.MIA->getEHInfo(CondTailCallInst); + std::tie(LP, Action) = BC.MIA->getEHInfo(*CTCInstr); assert(!LP && "found tail call with associated landing pad"); - // Create the unconditional tail call instruction. - const auto *TailCallTargetLabel = BC.MIA->getTargetSymbol(CondTailCallInst); - assert(TailCallTargetLabel && "symbol expected for direct tail call"); - MCInst TailCallInst; - BC.MIA->createTailCall(TailCallInst, TailCallTargetLabel, BC.Ctx.get()); - - // The way we will remove this conditional tail call depends on the - // direction of the jump when it is taken. We want to preserve this - // direction. - BinaryBasicBlock *TailCallBB = nullptr; - MCSymbol *TCLabel = BC.Ctx->createTempSymbol("TC", true); - if (getAddress() >= TCInfo.TargetAddress) { - // Backward jump: We will reverse the condition of the tail call, change - // its target to the following (currently fall-through) block, and insert - // a new block between them that will contain the unconditional tail call. 
- - // Reverse the condition of the tail call and update its target. - unsigned InsertIdx = getIndex(BB) + 1; - assert(InsertIdx < size() && "no fall-through for conditional tail call"); - BinaryBasicBlock *NextBB = BasicBlocks[InsertIdx]; + // Create a basic block with an unconditional tail call instruction using + // the same destination. + const auto *CTCTargetLabel = BC.MIA->getTargetSymbol(*CTCInstr); + assert(CTCTargetLabel && "symbol expected for conditional tail call"); + MCInst TailCallInstr; + BC.MIA->createTailCall(TailCallInstr, CTCTargetLabel, BC.Ctx.get()); + auto TailCallBB = createBasicBlock(BinaryBasicBlock::INVALID_OFFSET, + BC.Ctx->createTempSymbol("TC", true)); + TailCallBB->addInstruction(TailCallInstr); + TailCallBB->setCFIState(CFIStateBeforeCTC); - BC.MIA->reverseBranchCondition( - CondTailCallInst, NextBB->getLabel(), BC.Ctx.get()); - - // Create a basic block containing the unconditional tail call instruction - // and place it between BB and NextBB. - std::vector> TailCallBBs; - TailCallBBs.emplace_back(createBasicBlock(NextBB->getOffset(), TCLabel)); - TailCallBBs[0]->addInstruction(TailCallInst); - insertBasicBlocks(BB, std::move(TailCallBBs), - /* UpdateLayout */ false, - /* UpdateCFIState */ false); - TailCallBB = BasicBlocks[InsertIdx]; - - // Add the correct CFI state for the new block. - TailCallBB->setCFIState(TCInfo.CFIStateBefore); - } else { - // Forward jump: we will create a new basic block at the end of the - // function containing the unconditional tail call and change the target - // of the conditional tail call to this basic block. - - // Create a basic block containing the unconditional tail call - // instruction and place it at the end of the function. - // We have to add 1 byte as there's potentially an existing branch past - // the end of the code as a result of __builtin_unreachable(). 
- const BinaryBasicBlock *LastBB = BasicBlocks.back(); - uint64_t NewBlockOffset = - LastBB->getOffset() - + BC.computeCodeSize(LastBB->begin(), LastBB->end()) + 1; - TailCallBB = addBasicBlock(NewBlockOffset, TCLabel); - TailCallBB->addInstruction(TailCallInst); - - // Add the correct CFI state for the new block. It has to be inserted in - // the one before last position (the last position holds the CFI state - // after the last block). - TailCallBB->setCFIState(TCInfo.CFIStateBefore); - - // Replace the target of the conditional tail call with the label of the - // new basic block. - BC.MIA->replaceBranchTarget(CondTailCallInst, TCLabel, BC.Ctx.get()); - } - - // Add CFG edge with profile info from BB to TailCallBB info and swap - // edges if the TailCallBB corresponds to the taken branch. - BB->addSuccessor(TailCallBB, TCInfo.Count, TCInfo.Mispreds); - if (getAddress() < TCInfo.TargetAddress) - BB->swapConditionalSuccessors(); + // Add CFG edge with profile info from BB to TailCallBB. + BB.addSuccessor(TailCallBB.get(), CTCTakenCount, CTCMispredCount); // Add execution count for the block. - if (hasValidProfile()) - TailCallBB->setExecutionCount(TCInfo.Count); + TailCallBB->setExecutionCount(CTCTakenCount); + + // In attempt to preserve the direction of the original conditional jump, + // we will either create an unconditional jump in a separate basic block + // at the end of the function, or reverse a condition of the jump + // and create a fall-through block right after the original tail call. + if (getAddress() >= *TargetAddressOrNone) { + // Insert the basic block right after the current one. 
+ std::vector> TCBB; + TCBB.emplace_back(std::move(TailCallBB)); + BBI = insertBasicBlocks(BBI, + std::move(TCBB), + /* UpdateLayout */ true, + /* UpdateCFIState */ false); + BC.MIA->reverseBranchCondition( + *CTCInstr, (*std::next(BBI)).getLabel(), BC.Ctx.get()); + + } else { + BC.MIA->replaceBranchTarget(*CTCInstr, TailCallBB->getLabel(), + BC.Ctx.get()); + // Add basic block to the list that will be added to the end. + NewBlocks.emplace_back(std::move(TailCallBB)); + // Swap edges as the TailCallBB corresponds to the taken branch. + BB.swapConditionalSuccessors(); + } } + + insertBasicBlocks(std::prev(end()), + std::move(NewBlocks), + /* UpdateLayout */ true, + /* UpdateCFIState */ false); } uint64_t BinaryFunction::getFunctionScore() { @@ -2470,7 +2460,7 @@ void BinaryFunction::annotateCFIState() { assert(!BasicBlocks.empty() && "basic block list should not be empty"); // This is an index of the last processed CFI in FDE CFI program. - int32_t State = 0; + uint32_t State = 0; // This is an index of RememberState CFI reflecting effective state right // after execution of RestoreState CFI. @@ -2480,42 +2470,37 @@ void BinaryFunction::annotateCFIState() { // // This allows us to generate shorter replay sequences when producing new // CFI programs. - int32_t EffectiveState = 0; + uint32_t EffectiveState = 0; // For tracking RememberState/RestoreState sequences. - std::stack StateStack; + std::stack StateStack; for (auto *BB : BasicBlocks) { BB->setCFIState(EffectiveState); - // While building the CFG, we want to save the CFI state before a tail call - // instruction, so that we can correctly remove conditional tail calls. 
- auto TCI = TailCallTerminatedBlocks.find(BB); - bool SaveState = TCI != TailCallTerminatedBlocks.end(); - - uint32_t Idx = 0; // instruction index in a current basic block for (const auto &Instr : *BB) { - ++Idx; - if (SaveState && Idx == TCI->second.Index) { - TCI->second.CFIStateBefore = EffectiveState; - SaveState = false; - } - const auto *CFI = getCFIFor(Instr); if (!CFI) continue; ++State; - if (CFI->getOperation() == MCCFIInstruction::OpRememberState) { + switch (CFI->getOperation()) { + case MCCFIInstruction::OpRememberState: StateStack.push(EffectiveState); - } else if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) { + break; + case MCCFIInstruction::OpRestoreState: assert(!StateStack.empty() && "corrupt CFI stack"); EffectiveState = StateStack.top(); StateStack.pop(); - } else if (CFI->getOperation() != MCCFIInstruction::OpGnuArgsSize) { + break; + case MCCFIInstruction::OpGnuArgsSize: // OpGnuArgsSize CFIs do not affect the CFI state. + break; + default: + // Any other CFI updates the state. 
EffectiveState = State; + break; } } } @@ -3588,10 +3573,10 @@ std::size_t BinaryFunction::hash(bool Recompute, bool UseDFS) const { } void BinaryFunction::insertBasicBlocks( - BinaryBasicBlock *Start, - std::vector> &&NewBBs, - const bool UpdateLayout, - const bool UpdateCFIState) { + BinaryBasicBlock *Start, + std::vector> &&NewBBs, + const bool UpdateLayout, + const bool UpdateCFIState) { const auto StartIndex = getIndex(Start); const auto NumNewBlocks = NewBBs.size(); @@ -3600,7 +3585,7 @@ void BinaryFunction::insertBasicBlocks( nullptr); auto I = StartIndex + 1; - for (auto &BB : NewBBs) { + for (auto &BB : NewBBs) { assert(!BasicBlocks[I]); BasicBlocks[I++] = BB.release(); } @@ -3621,6 +3606,42 @@ void BinaryFunction::insertBasicBlocks( } } +BinaryFunction::iterator BinaryFunction::insertBasicBlocks( + BinaryFunction::iterator StartBB, + std::vector> &&NewBBs, + const bool UpdateLayout, + const bool UpdateCFIState) { + const auto StartIndex = getIndex(&*StartBB); + const auto NumNewBlocks = NewBBs.size(); + + auto RetIter = BasicBlocks.insert(BasicBlocks.begin() + StartIndex + 1, + NumNewBlocks, + nullptr); + + auto I = StartIndex + 1; + for (auto &BB : NewBBs) { + assert(!BasicBlocks[I]); + BasicBlocks[I++] = BB.release(); + } + + updateBBIndices(StartIndex); + + recomputeLandingPads(StartIndex, NumNewBlocks + 1); + + // Make sure the basic blocks are sorted properly. 
+ assert(std::is_sorted(begin(), end())); + + if (UpdateLayout) { + updateLayout(*std::prev(RetIter), NumNewBlocks); + } + + if (UpdateCFIState) { + updateCFIState(*std::prev(RetIter), NumNewBlocks); + } + + return RetIter; +} + void BinaryFunction::updateBBIndices(const unsigned StartIndex) { for (auto I = StartIndex; I < BasicBlocks.size(); ++I) { BasicBlocks[I]->Index = I; @@ -3629,7 +3650,6 @@ void BinaryFunction::updateBBIndices(const unsigned StartIndex) { void BinaryFunction::updateCFIState(BinaryBasicBlock *Start, const unsigned NumNewBlocks) { - assert(TailCallTerminatedBlocks.empty()); const auto CFIState = Start->getCFIStateAtExit(); const auto StartIndex = getIndex(Start) + 1; for (unsigned I = 0; I < NumNewBlocks; ++I) { @@ -4336,11 +4356,8 @@ DynoStats BinaryFunction::getDynoStats() const { continue; uint64_t CallFreq = BBExecutionCount; if (BC.MIA->getConditionalTailCall(Instr)) { - CallFreq = 0; - if (auto FreqOrErr = - BC.MIA->tryGetAnnotationAs(Instr, "CTCTakenFreq")) { - CallFreq = *FreqOrErr; - } + CallFreq = + BC.MIA->getAnnotationWithDefault(Instr, "CTCTakenCount"); } Stats[DynoStats::FUNCTION_CALLS] += CallFreq; if (BC.MIA->getMemoryOperandNo(Instr) != -1) { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 4f9a45ef495c..be178e4bf8c4 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -482,24 +482,6 @@ class BinaryFunction { using InstrMapType = std::map; InstrMapType Instructions; - /// Temporary holder of tail call terminated basic blocks used during CFG - /// construction. Map from tail call terminated basic block to a struct with - /// information about the tail call. 
- struct TailCallInfo { - uint32_t Offset; // offset of the tail call from the function - // start - uint32_t Index; // index of the tail call in the basic block - uint64_t TargetAddress; // address of the callee - uint64_t Count{0}; // taken count from profile data - uint64_t Mispreds{0}; // mispredicted count from profile data - uint32_t CFIStateBefore{0}; // CFI state before the tail call instruction - - TailCallInfo(uint32_t Offset, uint32_t Index, uint64_t TargetAddress) : - Offset(Offset), Index(Index), TargetAddress(TargetAddress) { } - }; - using TailCallBasicBlockMapType = std::map; - TailCallBasicBlockMapType TailCallTerminatedBlocks; - /// List of DWARF CFI instructions. Original CFI from the binary must be /// sorted w.r.t. offset that it appears. We rely on this to replay CFIs /// if needed (to fix state after reordering BBs). @@ -1390,6 +1372,12 @@ class BinaryFunction { const bool UpdateLayout = true, const bool UpdateCFIState = true); + iterator insertBasicBlocks( + iterator StartBB, + std::vector> &&NewBBs, + const bool UpdateLayout = true, + const bool UpdateCFIState = true); + /// Update the basic block layout for this function. The BBs from /// [Start->Index, Start->Index + NumNewBlocks) are inserted into the /// layout after the BB indicated by Start. 
diff --git a/bolt/DataReader.h b/bolt/DataReader.h index 1243e5ebe7e7..07d7e990dd2b 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -366,7 +366,7 @@ class DataReader { unsigned Col; FuncsToBranchesMapTy FuncsToBranches; FuncsToSamplesMapTy FuncsToSamples; - bool NoLBRMode; + bool NoLBRMode{false}; StringSet<> EventNames; static const char FieldSeparator = ' '; diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 80be088faca2..93e583e53b0d 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -658,7 +658,7 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, MIA->setConditionalTailCall(*CondBranch); // Add info abount the conditional tail call frequency, otherwise this // info will be lost when we delete the associated BranchInfo entry - BC.MIA->addAnnotation(BC.Ctx.get(), *CondBranch, "CTCTakenFreq", + BC.MIA->addAnnotation(BC.Ctx.get(), *CondBranch, "CTCTakenCount", CTCTakenFreq); // Remove the unused successor which may be eliminated later From f96d474368c9b798b06648f3dcbf015ea0faab86 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Wed, 18 Oct 2017 15:18:52 -0700 Subject: [PATCH 324/904] using offsets for CG Summary: Arc->AvgOffset can be used for function/block ordering to distinguish between calls from the beginning of a function and calls from the end of the function. This makes a difference for large functions. 
(cherry picked from commit f4ede215ed64fa1652eb433cf05eaa64859a5e1d) --- bolt/Passes/BinaryFunctionCallGraph.cpp | 76 ++++++++++++++++--------- bolt/Passes/BinaryFunctionCallGraph.h | 4 +- bolt/Passes/CallGraph.cpp | 37 ++++++------ bolt/Passes/CallGraph.h | 2 +- bolt/Passes/ReorderFunctions.cpp | 4 +- 5 files changed, 69 insertions(+), 54 deletions(-) diff --git a/bolt/Passes/BinaryFunctionCallGraph.cpp b/bolt/Passes/BinaryFunctionCallGraph.cpp index cb6dbd6b5471..24dc378e1e4c 100644 --- a/bolt/Passes/BinaryFunctionCallGraph.cpp +++ b/bolt/Passes/BinaryFunctionCallGraph.cpp @@ -89,6 +89,13 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, BinaryFunctionCallGraph Cg; static constexpr auto COUNT_NO_PROFILE = BinaryBasicBlock::COUNT_NO_PROFILE; + // Compute function size + auto functionSize = [&](const BinaryFunction *Function) { + return UseFunctionHotSize && Function->isSplit() + ? Function->estimateHotSize(UseSplitHotSize) + : Function->estimateSize(); + }; + // Add call graph nodes. auto lookupNode = [&](BinaryFunction *Function) { const auto Id = Cg.maybeGetNodeId(Function); @@ -97,9 +104,7 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, // because emitFunctions will emit the hot part first in the order that is // computed by ReorderFunctions. The cold part will be emitted with the // rest of the cold functions and code. - const auto Size = UseFunctionHotSize && Function->isSplit() - ? Function->estimateHotSize(UseSplitHotSize) - : Function->estimateSize(); + const auto Size = functionSize(Function); // NOTE: for functions without a profile, we set the number of samples // to zero. This will keep these functions from appearing in the hot // section. 
This is a little weird because we wouldn't be trying to @@ -125,14 +130,14 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, for (auto &It : BFs) { auto *Function = &It.second; - if(Filter(*Function)) { + if (Filter(*Function)) { continue; } const auto *BranchData = Function->getBranchData(); const auto SrcId = lookupNode(Function); - uint64_t Offset = Function->getAddress(); - uint64_t LastInstSize = 0; + // Offset of the current basic block from the beginning of the function + uint64_t Offset = 0; auto recordCall = [&](const MCSymbol *DestSymbol, const uint64_t Count) { if (auto *DstFunc = @@ -145,11 +150,11 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, return false; } const auto DstId = lookupNode(DstFunc); - const auto AvgDelta = UseEdgeCounts ? 0 : Offset - DstFunc->getAddress(); const bool IsValidCount = Count != COUNT_NO_PROFILE; const auto AdjCount = UseEdgeCounts && IsValidCount ? Count : 1; - if (!IsValidCount) ++NoProfileCallsites; - Cg.incArcWeight(SrcId, DstId, AdjCount, AvgDelta); + if (!IsValidCount) + ++NoProfileCallsites; + Cg.incArcWeight(SrcId, DstId, AdjCount, Offset); DEBUG( if (opts::Verbosity > 1) { dbgs() << "BOLT-DEBUG: buildCallGraph: call " << *Function @@ -157,6 +162,7 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, }); return true; } + return false; }; @@ -209,8 +215,14 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, DEBUG(dbgs() << "BOLT-DEBUG: buildCallGraph: Falling back to perf data" << " for " << *Function << "\n"); ++NumFallbacks; + const auto Size = functionSize(Function); for (const auto &BI : BranchData->Data) { - Offset = Function->getAddress() + BI.From.Offset; + Offset = BI.From.Offset; + // The computed offset may exceed the hot part of the function; hence, + // bound it the size + if (Offset > Size) + Offset = Size; + const auto CI = getCallInfoFromBranchData(BI, true); if (!CI.first && CI.second == COUNT_NO_PROFILE) // probably a branch continue; @@ -225,30 +237,38 @@ 
BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, if (BB->isCold() && !IncludeColdCalls) continue; - for (auto &Inst : *BB) { - if (!UseEdgeCounts) { - Offset += LastInstSize; - LastInstSize = BC.computeCodeSize(&Inst, &Inst + 1); - } + // Determine whether the block is included in Function's (hot) size + // See BinaryFunction::estimateHotSize + bool BBIncludedInFunctionSize = false; + if (UseFunctionHotSize && Function->isSplit()) { + if (UseSplitHotSize) + BBIncludedInFunctionSize = !BB->isCold(); + else + BBIncludedInFunctionSize = BB->getKnownExecutionCount() != 0; + } else { + BBIncludedInFunctionSize = true; + } + for (auto &Inst : *BB) { // Find call instructions and extract target symbols from each one. - if (!BC.MIA->isCall(Inst)) - continue; - - const auto CallInfo = getCallInfo(BB, Inst); + if (BC.MIA->isCall(Inst)) { + const auto CallInfo = getCallInfo(BB, Inst); - if (CallInfo.empty()) { - ++TotalCallsites; - ++NotProcessed; - continue; - } - - for (const auto &CI : CallInfo) { - ++TotalCallsites; - if (!recordCall(CI.first, CI.second)) { + if (!CallInfo.empty()) { + for (const auto &CI : CallInfo) { + ++TotalCallsites; + if (!recordCall(CI.first, CI.second)) + ++NotProcessed; + } + } else { + ++TotalCallsites; ++NotProcessed; } } + // Increase Offset if needed + if (BBIncludedInFunctionSize) { + Offset += BC.computeCodeSize(&Inst, &Inst + 1); + } } } } diff --git a/bolt/Passes/BinaryFunctionCallGraph.h b/bolt/Passes/BinaryFunctionCallGraph.h index 513bb0ef5415..0bce5c9de92f 100644 --- a/bolt/Passes/BinaryFunctionCallGraph.h +++ b/bolt/Passes/BinaryFunctionCallGraph.h @@ -65,8 +65,8 @@ inline bool NoFilter(const BinaryFunction &) { return false; } /// graph, otherwise they are ignored. /// UseFunctionHotSize controls whether the hot size of a function is used when /// filling in the Size attribute of new Nodes. -/// UseEdgeCounts is used to control if the AvgCallOffset attribute on Arcs is -/// computed using the offsets of call instructions. 
+/// UseEdgeCounts is used to control if the Weight attribute on Arcs is computed +/// using the number of calls. BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, std::map &BFs, CgFilterFunction Filter = NoFilter, diff --git a/bolt/Passes/CallGraph.cpp b/bolt/Passes/CallGraph.cpp index 70544fe6da45..4533c23d681d 100644 --- a/bolt/Passes/CallGraph.cpp +++ b/bolt/Passes/CallGraph.cpp @@ -44,7 +44,7 @@ inline size_t hash_int64(int64_t k) { return hash_int64_fallback(k); #endif } - + inline size_t hash_int64_pair(int64_t k1, int64_t k2) { #if defined(USE_SSECRC) && defined(__SSE4_2__) // crc32 is commutative, so we need to perturb k1 so that (k1, k2) hashes @@ -56,7 +56,7 @@ inline size_t hash_int64_pair(int64_t k1, int64_t k2) { return (hash_int64(k1) << 1) ^ hash_int64(k2); #endif } - + } namespace llvm { @@ -79,36 +79,31 @@ CallGraph::NodeId CallGraph::addNode(uint32_t Size, uint64_t Samples) { const CallGraph::Arc &CallGraph::incArcWeight(NodeId Src, NodeId Dst, double W, double Offset) { + assert(Offset <= size(Src) && "Call offset exceeds function size"); + auto Res = Arcs.emplace(Src, Dst, W); if (!Res.second) { Res.first->Weight += W; + Res.first->AvgCallOffset += Offset * W; return *Res.first; } - Res.first->AvgCallOffset += Offset; + Res.first->AvgCallOffset = Offset * W; Nodes[Src].Succs.push_back(Dst); Nodes[Dst].Preds.push_back(Src); return *Res.first; } -void CallGraph::normalizeArcWeights(bool UseEdgeCounts) { - // Normalize arc weights. 
- if (!UseEdgeCounts) { - for (NodeId FuncId = 0; FuncId < numNodes(); ++FuncId) { - auto& Func = getNode(FuncId); - for (auto Caller : Func.predecessors()) { - auto Arc = findArc(Caller, FuncId); - Arc->NormalizedWeight = Arc->weight() / Func.samples(); +void CallGraph::normalizeArcWeights() { + // Normalize arc weights + for (NodeId FuncId = 0; FuncId < numNodes(); ++FuncId) { + auto& Func = getNode(FuncId); + for (auto Caller : Func.predecessors()) { + auto Arc = findArc(Caller, FuncId); + Arc->NormalizedWeight = Arc->weight() / Func.samples(); + if (Arc->weight() > 0) Arc->AvgCallOffset /= Arc->weight(); - assert(Arc->AvgCallOffset < size(Caller)); - } - } - } else { - for (NodeId FuncId = 0; FuncId < numNodes(); ++FuncId) { - auto &Func = getNode(FuncId); - for (auto Caller : Func.predecessors()) { - auto Arc = findArc(Caller, FuncId); - Arc->NormalizedWeight = Arc->weight() / Func.samples(); - } + assert(Arc->AvgCallOffset <= size(Caller) && + "Avg call offset exceeds function size"); } } } diff --git a/bolt/Passes/CallGraph.h b/bolt/Passes/CallGraph.h index 83837e55b67f..c5df85734d2e 100644 --- a/bolt/Passes/CallGraph.h +++ b/bolt/Passes/CallGraph.h @@ -153,7 +153,7 @@ class CallGraph { return double(Arcs.size()) / (Nodes.size()*Nodes.size()); } - void normalizeArcWeights(bool UseEdgeCounts); + void normalizeArcWeights(); template void printDot(char* fileName, L getLabel) const; diff --git a/bolt/Passes/ReorderFunctions.cpp b/bolt/Passes/ReorderFunctions.cpp index bb5e55ca752b..d90e621c7649 100644 --- a/bolt/Passes/ReorderFunctions.cpp +++ b/bolt/Passes/ReorderFunctions.cpp @@ -134,7 +134,7 @@ namespace bolt { using NodeId = CallGraph::NodeId; using Arc = CallGraph::Arc; -using Node = CallGraph::Node; +using Node = CallGraph::Node; void ReorderFunctions::reorder(std::vector &&Clusters, std::map &BFs) { @@ -310,7 +310,7 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC, opts::CgUseSplitHotSize, opts::UseEdgeCounts, opts::CgIgnoreRecursiveCalls); - 
Cg.normalizeArcWeights(opts::UseEdgeCounts); + Cg.normalizeArcWeights(); } std::vector Clusters; From 887aaf56b2d2a48c3e04810edc373406c6c475b5 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 26 Oct 2017 18:36:30 -0700 Subject: [PATCH 325/904] [BOLT][Refactoring] Change landing pads handling Summary: Change the way we store and handle landing pads and throwers. (cherry picked from commit 18aa3174b8173e6670f27ab64b628f57bc35bf0e) --- bolt/BinaryBasicBlock.cpp | 16 ----- bolt/BinaryBasicBlock.h | 10 +-- bolt/BinaryFunction.cpp | 143 ++++++++++++++++---------------------- bolt/BinaryFunction.h | 22 +----- bolt/Exceptions.cpp | 1 - 5 files changed, 62 insertions(+), 130 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 8664f712dd96..e2294df5bbdf 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -293,22 +293,6 @@ void BinaryBasicBlock::removeDuplicateConditionalSuccessor(MCInst *CondBranch) { BranchInfo.push_back({Count, 0}); } -void BinaryBasicBlock::addLandingPad(BinaryBasicBlock *LPBlock) { - if (std::find(LandingPads.begin(), LandingPads.end(), LPBlock) == LandingPads.end()) { - LandingPads.push_back(LPBlock); - } - LPBlock->Throwers.insert(this); -} - -void BinaryBasicBlock::clearLandingPads() { - for (auto *LPBlock : LandingPads) { - auto Count = LPBlock->Throwers.erase(this); - (void)Count; - assert(Count == 1 && "Possible duplicate entry in LandingPads"); - } - LandingPads.clear(); -} - bool BinaryBasicBlock::analyzeBranch(const MCSymbol *&TBB, const MCSymbol *&FBB, MCInst *&CondBranch, diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 4c8048c3a6d4..1c6e5ba64842 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -60,7 +60,7 @@ class BinaryBasicBlock { /// CFG information. 
std::vector Predecessors; std::vector Successors; - std::set Throwers; + std::vector Throwers; std::vector LandingPads; /// Each successor has a corresponding BranchInfo entry in the list. @@ -222,7 +222,7 @@ class BinaryBasicBlock { return (unsigned)Throwers.size(); } bool throw_empty() const { return Throwers.empty(); } - bool isLandingPad() const { return !Throwers.empty(); } + bool isLandingPad() const { return !Throwers.empty(); } lp_iterator lp_begin() { return LandingPads.begin(); } const_lp_iterator lp_begin() const { return LandingPads.begin(); } @@ -524,9 +524,6 @@ class BinaryBasicBlock { uint64_t Count = 0, uint64_t MispredictedCount = 0); - /// Adds block to landing pad list. - void addLandingPad(BinaryBasicBlock *LPBlock); - /// Remove /p Succ basic block from the list of successors. Update the /// list of predecessors of /p Succ and update branch info. void removeSuccessor(BinaryBasicBlock *Succ); @@ -781,9 +778,6 @@ class BinaryBasicBlock { /// use removeSuccessor() function. void removePredecessor(BinaryBasicBlock *Pred); - /// Remove landing pads of this basic block. - void clearLandingPads(); - /// Return offset of the basic block from the function start. 
uint32_t getOffset() const { return InputRange.first; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 2f12e8ad3dc0..94020fd20703 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -330,10 +330,8 @@ std::pair BinaryFunction::eraseInvalidBBs() { assert(BasicBlocks.size() == BasicBlocksLayout.size()); // Update CFG state if needed - if (Count > 0) { - updateBBIndices(0); - recomputeLandingPads(0, BasicBlocks.size()); - } + if (Count > 0) + recomputeLandingPads(); return std::make_pair(Count, Bytes); } @@ -1457,54 +1455,39 @@ bool BinaryFunction::postProcessIndirectBranches() { return true; } -void BinaryFunction::clearLandingPads(const unsigned StartIndex, - const unsigned NumBlocks) { - // remove all landing pads/throws for the given collection of blocks - for (auto I = StartIndex; I < StartIndex + NumBlocks; ++I) { - BasicBlocks[I]->clearLandingPads(); - } -} +void BinaryFunction::recomputeLandingPads() { + updateBBIndices(0); -void BinaryFunction::addLandingPads(const unsigned StartIndex, - const unsigned NumBlocks) { for (auto *BB : BasicBlocks) { - if (LandingPads.find(BB->getLabel()) != LandingPads.end()) { - const MCSymbol *LP = BB->getLabel(); - for (unsigned I : LPToBBIndex[LP]) { - assert(I < BasicBlocks.size()); - BinaryBasicBlock *ThrowBB = BasicBlocks[I]; - const unsigned ThrowBBIndex = getIndex(ThrowBB); - if (ThrowBBIndex >= StartIndex && ThrowBBIndex < StartIndex + NumBlocks) - ThrowBB->addLandingPad(BB); - } - } + BB->LandingPads.clear(); + BB->Throwers.clear(); } - clearList(LPToBBIndex); -} + for (auto *BB : BasicBlocks) { + for (auto &Instr : *BB) { + if (!BC.MIA->isInvoke(Instr)) + continue; -void BinaryFunction::recomputeLandingPads(const unsigned StartIndex, - const unsigned NumBlocks) { - assert(LPToBBIndex.empty()); - - clearLandingPads(StartIndex, NumBlocks); - - for (auto I = StartIndex; I < StartIndex + NumBlocks; ++I) { - auto *BB = BasicBlocks[I]; - for (auto &Instr : BB->instructions()) { - // 
Store info about associated landing pad. - if (BC.MIA->isInvoke(Instr)) { - const MCSymbol *LP; - uint64_t Action; - std::tie(LP, Action) = BC.MIA->getEHInfo(Instr); - if (LP) { - LPToBBIndex[LP].push_back(getIndex(BB)); - } - } + const MCSymbol *LPLabel; + uint64_t Action; + std::tie(LPLabel, Action) = BC.MIA->getEHInfo(Instr); + if (!LPLabel) + continue; + + auto *LPBlock = getBasicBlockForLabel(LPLabel); + BB->LandingPads.emplace_back(LPBlock); + LPBlock->Throwers.emplace_back(BB); } + std::sort(BB->lp_begin(), BB->lp_end()); + auto NewEnd = std::unique(BB->lp_begin(), BB->lp_end()); + BB->LandingPads.erase(NewEnd, BB->lp_end()); } - addLandingPads(StartIndex, NumBlocks); + for (auto *BB : BasicBlocks) { + std::sort(BB->throw_begin(), BB->throw_end()); + auto NewEnd = std::unique(BB->throw_begin(), BB->throw_end()); + BB->Throwers.erase(NewEnd, BB->throw_end()); + } } bool BinaryFunction::buildCFG() { @@ -1608,16 +1591,6 @@ bool BinaryFunction::buildCFG() { CFIOffset = getSize(); addCFIPlaceholders(CFIOffset, InsertBB); - // Store info about associated landing pad. - if (MIA->isInvoke(Instr)) { - const MCSymbol *LP; - uint64_t Action; - std::tie(LP, Action) = MIA->getEHInfo(Instr); - if (LP) { - LPToBBIndex[LP].push_back(getIndex(InsertBB)); - } - } - if (MIA->isTerminator(Instr)) { PrevBB = InsertBB; InsertBB = nullptr; @@ -1810,8 +1783,7 @@ bool BinaryFunction::buildCFG() { DEBUG(dbgs() << "last block was marked as a fall-through\n"); } - // Add associated landing pad blocks to each basic block. - addLandingPads(0, BasicBlocks.size()); + recomputeLandingPads(); // Infer frequency for non-taken branches if (hasValidProfile() && opts::DoMCF != MCF_DISABLE) { @@ -1873,7 +1845,6 @@ bool BinaryFunction::buildCFG() { clearList(TakenBranches); clearList(FTBranches); clearList(IgnoredBranches); - clearList(LPToBBIndex); clearList(EntryOffsets); // Update the state. 
@@ -3033,23 +3004,35 @@ bool BinaryFunction::validateCFG() const { return Valid; for (auto *BB : BasicBlocks) { - std::set Seen; + if (!std::is_sorted(BB->lp_begin(), BB->lp_end())) { + errs() << "BOLT-ERROR: unsorted list of landing pads in " + << BB->getName() << " in function " << *this << '\n'; + return false; + } + if (std::unique(BB->lp_begin(), BB->lp_end()) != BB->lp_end()) { + errs() << "BOLT-ERROR: duplicate landing pad detected in" + << BB->getName() << " in function " << *this << '\n'; + return false; + } + if (!std::is_sorted(BB->throw_begin(), BB->throw_end())) { + errs() << "BOLT-ERROR: unsorted list of throwers in " + << BB->getName() << " in function " << *this << '\n'; + return false; + } + if (std::unique(BB->throw_begin(), BB->throw_end()) != BB->throw_end()) { + errs() << "BOLT-ERROR: duplicate thrower detected in" + << BB->getName() << " in function " << *this << '\n'; + return false; + } for (auto *LPBlock : BB->LandingPads) { - Valid &= Seen.count(LPBlock) == 0; - if (!Valid) { - errs() << "BOLT-WARNING: Duplicate LP seen " << LPBlock->getName() - << "in " << *this << "\n"; - break; - } - Seen.insert(LPBlock); - auto count = LPBlock->Throwers.count(BB); - Valid &= (count == 1); - if (!Valid) { - errs() << "BOLT-WARNING: Inconsistent landing pad detected in " - << *this << ": " << LPBlock->getName() - << " is in LandingPads but not in " << BB->getName() - << "->Throwers\n"; - break; + if (!std::binary_search(LPBlock->throw_begin(), + LPBlock->throw_end(), + BB)) { + errs() << "BOLT-ERROR: inconsistent landing pad detected in " + << *this << ": " << BB->getName() + << " is in LandingPads but not in " << LPBlock->getName() + << " Throwers\n"; + return false; } } } @@ -3590,12 +3573,7 @@ void BinaryFunction::insertBasicBlocks( BasicBlocks[I++] = BB.release(); } - updateBBIndices(StartIndex); - - recomputeLandingPads(StartIndex, NumNewBlocks + 1); - - // Make sure the basic blocks are sorted properly. 
- assert(std::is_sorted(begin(), end())); + recomputeLandingPads(); if (UpdateLayout) { updateLayout(Start, NumNewBlocks); @@ -3624,12 +3602,7 @@ BinaryFunction::iterator BinaryFunction::insertBasicBlocks( BasicBlocks[I++] = BB.release(); } - updateBBIndices(StartIndex); - - recomputeLandingPads(StartIndex, NumNewBlocks + 1); - - // Make sure the basic blocks are sorted properly. - assert(std::is_sorted(begin(), end())); + recomputeLandingPads(); if (UpdateLayout) { updateLayout(*std::prev(RetIter), NumNewBlocks); diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index be178e4bf8c4..f62cce01c8ed 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -325,9 +325,6 @@ class BinaryFunction { /// Original LSDA address for the function. uint64_t LSDAAddress{0}; - /// Landing pads for the function. - std::set LandingPads; - /// Associated DIEs in the .debug_info section with their respective CUs. /// There can be multiple because of identical code folding. std::vector EntryOffsets; @@ -466,10 +452,6 @@ class BinaryFunction { BranchListType FTBranches; /// All fall-through branches. BranchListType IgnoredBranches; /// Branches ignored by CFG purposes. - /// Storage for all landing pads and their corresponding invokes. - using LandingPadsMapType = std::map >; - LandingPadsMapType LPToBBIndex; - /// Map offset in the function to a label. /// Labels are used for building CFG for simple functions. 
For non-simple /// function in relocation mode we need to emit them for relocations diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index e4daa4632e62..ac303bfd0207 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -233,7 +233,6 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, LPSymbol = BC.Ctx->createTempSymbol("LP", true); Labels[LandingPad] = LPSymbol; } - LandingPads.insert(LPSymbol); } } From 23a9d5be97709f962850b5faad806bd861319818 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 16 Oct 2017 13:09:43 -0700 Subject: [PATCH 326/904] [BOLT] Add value profiling to BOLT Summary: Add support for reading value profiling info from perf data. This diff adds support in DataReader/DataAggregator for value profiling data. Each event is recorded as two Locations (a PC and an address/value) and a count. For now, I'm assuming that the value profiling data is in the same file as the usual BOLT profiling data. Collecting both at the same time seems to work. (cherry picked from commit f1ba4770f1544b295a49c2ba90863804e84e1690) --- bolt/BinaryContext.cpp | 10 + bolt/DataAggregator.cpp | 302 ++++++++++++++++++++++++++----- bolt/DataAggregator.h | 42 +++-- bolt/DataReader.cpp | 264 +++++++++++++++++++++++---- bolt/DataReader.h | 100 +++++++++- bolt/merge-fdata/merge-fdata.cpp | 85 ++++++++- 6 files changed, 699 insertions(+), 104 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index cf5a7d182a47..5fc289a0aefd 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -369,6 +369,16 @@ void BinaryContext::printInstruction(raw_ostream &OS, } } + auto *MD = Function ? DR.getFuncMemData(Function->getNames()) : nullptr; + if (MD) { + bool DidPrint = false; + for (auto &MI : MD->getMemInfoRange(Offset)) { + OS << (DidPrint ? 
", " : " # Loads: "); + OS << MI.Addr << "/" << MI.Count; + DidPrint = true; + } + } + OS << "\n"; if (printMCInst) { diff --git a/bolt/DataAggregator.cpp b/bolt/DataAggregator.cpp index a4d586708bfa..8912657f2305 100644 --- a/bolt/DataAggregator.cpp +++ b/bolt/DataAggregator.cpp @@ -21,6 +21,7 @@ #include "llvm/Support/Process.h" #include "llvm/Support/Program.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Regex.h" #include "llvm/Support/Timer.h" #include @@ -64,7 +65,8 @@ void DataAggregator::start(StringRef PerfDataFilename) { outs() << "PERF2BOLT: Starting data aggregation job for " << PerfDataFilename << "\n"; findPerfExecutable(); - launchPerfEventsNoWait(); + launchPerfBranchEventsNoWait(); + launchPerfMemEventsNoWait(); launchPerfTasksNoWait(); } @@ -73,17 +75,18 @@ void DataAggregator::abort() { // Kill subprocesses in case they are not finished sys::Wait(TasksPI, 1, false, &Error); - sys::Wait(EventsPI, 1, false, &Error); + sys::Wait(BranchEventsPI, 1, false, &Error); + sys::Wait(MemEventsPI, 1, false, &Error); deleteTempFiles(); } -bool DataAggregator::launchPerfEventsNoWait() { +bool DataAggregator::launchPerfBranchEventsNoWait() { SmallVector Argv; SmallVector Redirects; SmallVector RedirectPtrs; - outs() << "PERF2BOLT: Spawning perf-script job to read events\n"; + outs() << "PERF2BOLT: Spawning perf-script job to read branch events\n"; Argv.push_back(PerfPath.data()); Argv.push_back("script"); Argv.push_back("-F"); @@ -93,32 +96,77 @@ bool DataAggregator::launchPerfEventsNoWait() { Argv.push_back(nullptr); if (auto Errc = sys::fs::createTemporaryFile("perf.script", "out", - PerfEventsOutputPath)) { + PerfBranchEventsOutputPath)) { outs() << "PERF2BOLT: Failed to create temporary file " - << PerfEventsOutputPath << " with error " << Errc.message() << "\n"; + << PerfBranchEventsOutputPath << " with error " << Errc.message() << "\n"; exit(1); } if (auto Errc = sys::fs::createTemporaryFile("perf.script", "err", - PerfEventsErrPath)) { + 
PerfBranchEventsErrPath)) { outs() << "PERF2BOLT: Failed to create temporary file " - << PerfEventsErrPath << " with error " << Errc.message() << "\n"; + << PerfBranchEventsErrPath << " with error " << Errc.message() << "\n"; exit(1); } - Redirects.push_back(""); // Stdin - Redirects.push_back(StringRef(PerfEventsOutputPath.data())); // Stdout - Redirects.push_back(StringRef(PerfEventsErrPath.data())); // Stderr + Redirects.push_back(""); // Stdin + Redirects.push_back(StringRef(PerfBranchEventsOutputPath.data())); // Stdout + Redirects.push_back(StringRef(PerfBranchEventsErrPath.data())); // Stderr RedirectPtrs.push_back(&Redirects[0]); RedirectPtrs.push_back(&Redirects[1]); RedirectPtrs.push_back(&Redirects[2]); DEBUG(dbgs() << "Launching perf: " << PerfPath.data() << " 1> " - << PerfEventsOutputPath.data() << " 2> " - << PerfEventsErrPath.data() << "\n"); + << PerfBranchEventsOutputPath.data() << " 2> " + << PerfBranchEventsErrPath.data() << "\n"); - EventsPI = sys::ExecuteNoWait(PerfPath.data(), Argv.data(), - /*envp*/ nullptr, &RedirectPtrs[0]); + BranchEventsPI = sys::ExecuteNoWait(PerfPath.data(), Argv.data(), + /*envp*/ nullptr, &RedirectPtrs[0]); + + return true; +} + +bool DataAggregator::launchPerfMemEventsNoWait() { + SmallVector Argv; + SmallVector Redirects; + SmallVector RedirectPtrs; + + outs() << "PERF2BOLT: Spawning perf-script job to read mem events\n"; + Argv.push_back(PerfPath.data()); + Argv.push_back("script"); + Argv.push_back("-F"); + Argv.push_back("pid,event,addr,ip"); + Argv.push_back("-i"); + Argv.push_back(PerfDataFilename.data()); + Argv.push_back(nullptr); + + if (auto Errc = sys::fs::createTemporaryFile("perf.script", "out", + PerfMemEventsOutputPath)) { + outs() << "PERF2BOLT: Failed to create temporary file " + << PerfMemEventsOutputPath << " with error " << Errc.message() << "\n"; + exit(1); + } + + if (auto Errc = sys::fs::createTemporaryFile("perf.script", "err", + PerfMemEventsErrPath)) { + outs() << "PERF2BOLT: Failed to 
create temporary file " + << PerfMemEventsErrPath << " with error " << Errc.message() << "\n"; + exit(1); + } + + Redirects.push_back(""); // Stdin + Redirects.push_back(StringRef(PerfMemEventsOutputPath.data())); // Stdout + Redirects.push_back(StringRef(PerfMemEventsErrPath.data())); // Stderr + RedirectPtrs.push_back(&Redirects[0]); + RedirectPtrs.push_back(&Redirects[1]); + RedirectPtrs.push_back(&Redirects[2]); + + DEBUG(dbgs() << "Launching perf: " << PerfPath.data() << " 1> " + << PerfMemEventsOutputPath.data() << " 2> " + << PerfMemEventsErrPath.data() << "\n"); + + MemEventsPI = sys::ExecuteNoWait(PerfPath.data(), Argv.data(), + /*envp*/ nullptr, &RedirectPtrs[0]); return true; } @@ -276,8 +324,10 @@ void DataAggregator::deleteTempFile(StringRef File) { } void DataAggregator::deleteTempFiles() { - deleteTempFile(PerfEventsErrPath.data()); - deleteTempFile(PerfEventsOutputPath.data()); + deleteTempFile(PerfBranchEventsErrPath.data()); + deleteTempFile(PerfBranchEventsOutputPath.data()); + deleteTempFile(PerfMemEventsErrPath.data()); + deleteTempFile(PerfMemEventsOutputPath.data()); deleteTempFile(PerfTasksErrPath.data()); deleteTempFile(PerfTasksOutputPath.data()); } @@ -328,7 +378,7 @@ bool DataAggregator::aggregate(BinaryContext &BC, outs() << "PERF2BOLT: Waiting for perf events collection to finish...\n"; - auto PI2 = sys::Wait(EventsPI, 0, true, &Error); + auto PI2 = sys::Wait(BranchEventsPI, 0, true, &Error); if (!Error.empty()) { errs() << "PERF-ERROR: " << Error << "\n"; @@ -338,7 +388,7 @@ bool DataAggregator::aggregate(BinaryContext &BC, if (PI2.ReturnCode != 0) { ErrorOr> MB = - MemoryBuffer::getFileOrSTDIN(PerfEventsErrPath.data()); + MemoryBuffer::getFileOrSTDIN(PerfBranchEventsErrPath.data()); StringRef ErrBuf = (*MB)->getBuffer(); errs() << "PERF-ERROR: Return code " << PI2.ReturnCode << "\n"; @@ -348,23 +398,59 @@ bool DataAggregator::aggregate(BinaryContext &BC, } ErrorOr> MB2 = - MemoryBuffer::getFileOrSTDIN(PerfEventsOutputPath.data()); + 
MemoryBuffer::getFileOrSTDIN(PerfBranchEventsOutputPath.data()); if (std::error_code EC = MB2.getError()) { - errs() << "Cannot open " << PerfEventsOutputPath.data() << ": " + errs() << "Cannot open " << PerfBranchEventsOutputPath.data() << ": " << EC.message() << "\n"; deleteTempFiles(); exit(1); } FileBuf.reset(MB2->release()); - deleteTempFiles(); ParsingBuf = FileBuf->getBuffer(); Col = 0; Line = 1; - if (parseEvents()) { - outs() << "PERF2BOLT: Failed to parse events\n"; + if (parseBranchEvents()) { + outs() << "PERF2BOLT: Failed to parse branch events\n"; + } + + auto PI3 = sys::Wait(MemEventsPI, 0, true, &Error); + + if (PI3.ReturnCode != 0) { + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(PerfMemEventsErrPath.data()); + StringRef ErrBuf = (*MB)->getBuffer(); + + deleteTempFiles(); + + Regex NoData("Samples for '.*' event do not have ADDR attribute set. Cannot print 'addr' field."); + if (!NoData.match(ErrBuf)) { + errs() << "PERF-ERROR: Return code " << PI3.ReturnCode << "\n"; + errs() << ErrBuf; + exit(1); + } + return true; + } + + ErrorOr> MB3 = + MemoryBuffer::getFileOrSTDIN(PerfMemEventsOutputPath.data()); + if (std::error_code EC = MB3.getError()) { + errs() << "Cannot open " << PerfMemEventsOutputPath.data() << ": " + << EC.message() << "\n"; + deleteTempFiles(); + exit(1); + } + + FileBuf.reset(MB3->release()); + ParsingBuf = FileBuf->getBuffer(); + Col = 0; + Line = 1; + if (parseMemEvents()) { + outs() << "PERF2BOLT: Failed to parse memory events\n"; } + deleteTempFiles(); + return true; } @@ -547,8 +633,8 @@ void DataAggregator::consumeRestOfLine() { Line += 1; } -ErrorOr DataAggregator::parseSample() { - PerfSample Res; +ErrorOr DataAggregator::parseBranchSample() { + PerfBranchSample Res; while (checkAndConsumeFS()) {} @@ -572,6 +658,49 @@ ErrorOr DataAggregator::parseSample() { return Res; } +ErrorOr DataAggregator::parseMemSample() { + PerfMemSample Res{0,0}; + + while (checkAndConsumeFS()) {} + + auto PIDRes = 
parseNumberField(FieldSeparator, true); + if (std::error_code EC = PIDRes.getError()) + return EC; + if (!PIDs.empty() && !PIDs.count(PIDRes.get())) { + consumeRestOfLine(); + return Res; + } + + while (checkAndConsumeFS()) {} + + auto Event = parseString(FieldSeparator); + if (std::error_code EC = Event.getError()) + return EC; + if (Event.get().find("mem-loads") == StringRef::npos) { + consumeRestOfLine(); + return Res; + } + + while (checkAndConsumeFS()) {} + + auto AddrRes = parseHexField(FieldSeparator); + if (std::error_code EC = AddrRes.getError()) { + return EC; + } + + while (checkAndConsumeFS()) {} + + auto PCRes = parseHexField(FieldSeparator, true); + if (std::error_code EC = PCRes.getError()) { + consumeRestOfLine(); + return EC; + } + + checkAndConsumeNewLine(); + + return PerfMemSample{PCRes.get(), AddrRes.get()}; +} + bool DataAggregator::hasData() { if (ParsingBuf.size() == 0) return false; @@ -579,14 +708,14 @@ bool DataAggregator::hasData() { return true; } -std::error_code DataAggregator::parseEvents() { - outs() << "PERF2BOLT: Aggregating...\n"; - NamedRegionTimer T("Samples parsing", TimerGroupName, opts::TimeAggregator); +std::error_code DataAggregator::parseBranchEvents() { + outs() << "PERF2BOLT: Aggregating branch events...\n"; + NamedRegionTimer T("Branch samples parsing", TimerGroupName, opts::TimeAggregator); uint64_t NumEntries{0}; uint64_t NumSamples{0}; uint64_t NumTraces{0}; while (hasData()) { - auto SampleRes = parseSample(); + auto SampleRes = parseBranchSample(); if (std::error_code EC = SampleRes.getError()) return EC; @@ -648,6 +777,58 @@ std::error_code DataAggregator::parseEvents() { return std::error_code(); } +std::error_code DataAggregator::parseMemEvents() { + outs() << "PERF2BOLT: Aggregating memory events...\n"; + NamedRegionTimer T("Mem samples parsing", TimerGroupName, opts::TimeAggregator); + + while (hasData()) { + auto SampleRes = parseMemSample(); + if (std::error_code EC = SampleRes.getError()) + return EC; + + 
auto PC = SampleRes.get().PC; + auto Addr = SampleRes.get().Addr; + StringRef FuncName; + StringRef MemName; + + // Try to resolve symbol for PC + auto *Func = getBinaryFunctionContainingAddress(PC); + if (Func) { + FuncName = Func->getNames()[0]; + PC -= Func->getAddress(); + } + + // Try to resolve symbol for memory load + auto *MemFunc = getBinaryFunctionContainingAddress(Addr); + if (MemFunc) { + MemName = MemFunc->getNames()[0]; + Addr -= MemFunc->getAddress(); + } else { + // TODO: global symbol size? + auto Sym = BC->getGlobalSymbolAtAddress(Addr); + if (Sym) { + MemName = Sym->getName(); + Addr = 0; + } + } + + const Location FuncLoc(!FuncName.empty(), FuncName, PC); + const Location AddrLoc(!MemName.empty(), MemName, Addr); + + // TODO what does it mean when PC is 0 (or not a known function)? + DEBUG(if (!Func && PC != 0) { + dbgs() << "Skipped mem event: " << FuncLoc << " = " << AddrLoc << "\n"; + }); + + if (Func) { + FuncsToMemEvents[FuncName].update(FuncLoc, AddrLoc); + DEBUG(dbgs() << "Mem event: " << FuncLoc << " = " << AddrLoc << "\n"); + } + } + + return std::error_code(); +} + ErrorOr DataAggregator::parseTaskPID() { while (checkAndConsumeFS()) {} @@ -745,35 +926,52 @@ std::error_code DataAggregator::writeAggregatedFile() const { if (EC) return EC; - uint64_t Values{0}; + bool WriteMemLocs = false; + + auto writeLocation = [&OutFile,&WriteMemLocs](const Location &Loc) { + if (WriteMemLocs) + OutFile << (Loc.IsSymbol ? "4 " : "3 "); + else + OutFile << (Loc.IsSymbol ? "1 " : "0 "); + OutFile << (Loc.Name.empty() ? "[unknown]" : Loc.Name) << " " + << Twine::utohexstr(Loc.Offset) + << FieldSeparator; + }; + + uint64_t BranchValues{0}; + uint64_t MemValues{0}; + for (const auto &Func : FuncsToBranches) { for (const auto &BI : Func.getValue().Data) { - OutFile << (BI.From.IsSymbol ? "1 " : "0 ") - << (BI.From.Name.empty() ? "[unknown]" : BI.From.Name) << " " - << Twine::utohexstr(BI.From.Offset) << " " - << (BI.To.IsSymbol ? 
"1 " : "0 ") - << (BI.To.Name.empty() ? "[unknown]" : BI.To.Name) << " " - << Twine::utohexstr(BI.To.Offset) << " " << BI.Mispreds << " " - << BI.Branches << "\n"; - ++Values; + writeLocation(BI.From); + writeLocation(BI.To); + OutFile << BI.Mispreds << " " << BI.Branches << "\n"; + ++BranchValues; } for (const auto &BI : Func.getValue().EntryData) { // Do not output if source is a known symbol, since this was already // accounted for in the source function if (BI.From.IsSymbol) continue; - OutFile << (BI.From.IsSymbol ? "1 " : "0 ") - << (BI.From.Name.empty() ? "[unknown]" : BI.From.Name) << " " - << Twine::utohexstr(BI.From.Offset) << " " - << (BI.To.IsSymbol ? "1 " : "0 ") - << (BI.To.Name.empty() ? "[unknown]" : BI.To.Name) << " " - << Twine::utohexstr(BI.To.Offset) << " " << BI.Mispreds << " " - << BI.Branches << "\n"; - ++Values; + writeLocation(BI.From); + writeLocation(BI.To); + OutFile << BI.Mispreds << " " << BI.Branches << "\n"; + ++BranchValues; + } + } + + WriteMemLocs = true; + for (const auto &Func : FuncsToMemEvents) { + for (const auto &MemEvent : Func.getValue().Data) { + writeLocation(MemEvent.Offset); + writeLocation(MemEvent.Addr); + OutFile << MemEvent.Count << "\n"; + ++MemValues; } } - outs() << "PERF2BOLT: Wrote " << Values << " objects to " - << OutputFDataName << "\n"; + + outs() << "PERF2BOLT: Wrote " << BranchValues << " branch objects and " + << MemValues << " memory objects to " << OutputFDataName << "\n"; return std::error_code(); } @@ -788,9 +986,13 @@ void DataAggregator::dump(const LBREntry &LBR) const { << "\n"; } -void DataAggregator::dump(const PerfSample &Sample) const { +void DataAggregator::dump(const PerfBranchSample &Sample) const { Diag << "Sample LBR entries: " << Sample.LBR.size() << "\n"; for (const auto &LBR : Sample.LBR) { dump(LBR); } } + +void DataAggregator::dump(const PerfMemSample &Sample) const { + Diag << "Sample mem entries: " << Sample.PC << ": " << Sample.Addr << "\n"; +} diff --git a/bolt/DataAggregator.h 
b/bolt/DataAggregator.h index c8a1de470a91..6dcac3f7daed 100644 --- a/bolt/DataAggregator.h +++ b/bolt/DataAggregator.h @@ -34,10 +34,15 @@ struct LBREntry { bool Mispred; }; -struct PerfSample { +struct PerfBranchSample { SmallVector LBR; }; +struct PerfMemSample { + uint64_t PC; + uint64_t Addr; +}; + /// DataAggregator inherits all parsing logic from DataReader as well as /// its data structures used to represent aggregated profile data in memory. /// @@ -61,10 +66,13 @@ struct PerfSample { class DataAggregator : public DataReader { // Perf process spawning bookkeeping std::string PerfPath; - sys::ProcessInfo EventsPI; + sys::ProcessInfo BranchEventsPI; + sys::ProcessInfo MemEventsPI; sys::ProcessInfo TasksPI; - SmallVector PerfEventsOutputPath; - SmallVector PerfEventsErrPath; + SmallVector PerfBranchEventsOutputPath; + SmallVector PerfBranchEventsErrPath; + SmallVector PerfMemEventsOutputPath; + SmallVector PerfMemEventsErrPath; SmallVector PerfTasksOutputPath; SmallVector PerfTasksErrPath; @@ -93,9 +101,13 @@ class DataAggregator : public DataReader { /// Looks into system PATH for Linux Perf and set up the aggregator to use it void findPerfExecutable(); - /// Launch a subprocess to read all perf samples and write them to an output - /// file we will parse later - bool launchPerfEventsNoWait(); + /// Launch a subprocess to read all perf branch samples and write them to an + /// output file we will parse later + bool launchPerfBranchEventsNoWait(); + + /// Launch a subprocess to read all perf memory event samples and write them + /// to an output file we will parse later + bool launchPerfMemEventsNoWait(); /// Launch a subprocess to read all perf task events. They contain the mapping /// of binary file name to PIDs used during data collection time. 
We later use @@ -139,7 +151,11 @@ class DataAggregator : public DataReader { /// Parse a single perf sample containing a PID associated with a sequence of /// LBR entries - ErrorOr parseSample(); + ErrorOr parseBranchSample(); + + /// Parse a single perf sample containing a PID associated with an IP and + /// address. + ErrorOr parseMemSample(); /// Check if a field separator is the next char to parse and, if yes, consume /// it and return true @@ -151,8 +167,11 @@ class DataAggregator : public DataReader { /// Parse a single LBR entry as output by perf script -Fbrstack ErrorOr parseLBREntry(); - /// Parse the full output generated by perf script to report LBR samples - std::error_code parseEvents(); + /// Parse the full output generated by perf script to report LBR samples. + std::error_code parseBranchEvents(); + + /// Parse the full output generated by perf script to report memory events. + std::error_code parseMemEvents(); /// Parse a single line of a PERF_RECORD_COMM event looking for an association /// between the binary name and its PID. 
Return -1 if binary name is not @@ -207,7 +226,8 @@ class DataAggregator : public DataReader { /// Debugging dump methods void dump() const; void dump(const LBREntry &LBR) const; - void dump(const PerfSample &Sample) const; + void dump(const PerfBranchSample &Sample) const; + void dump(const PerfMemSample &Sample) const; }; } } diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index 4a92adaf5942..7a0eb57d3ef3 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -41,6 +41,17 @@ StringRef normalizeName(StringRef Name) { } // anonymous namespace +raw_ostream &operator<<(raw_ostream &OS, const Location &Loc) { + if (Loc.IsSymbol) { + OS << Loc.Name; + if (Loc.Offset) + OS << "+" << Twine::utohexstr(Loc.Offset); + } else { + OS << Twine::utohexstr(Loc.Offset); + } + return OS; +} + iterator_range FuncBranchData::getBranchRange(uint64_t From) const { assert(std::is_sorted(Data.begin(), Data.end())); @@ -285,6 +296,39 @@ FuncBranchData::getDirectCallBranch(uint64_t From) const { return make_error_code(llvm::errc::invalid_argument); } +void MemInfo::print(raw_ostream &OS) const { + OS << (Offset.IsSymbol + 3) << " " << Offset.Name << " " + << Twine::utohexstr(Offset.Offset) << " " + << (Addr.IsSymbol + 3) << " " << Addr.Name << " " + << Twine::utohexstr(Addr.Offset) << " " + << Count << "\n"; +} + +iterator_range +FuncMemData::getMemInfoRange(uint64_t Offset) const { + assert(std::is_sorted(Data.begin(), Data.end())); + struct Compare { + bool operator()(const MemInfo &MI, const uint64_t Val) const { + return MI.Offset.Offset < Val; + } + bool operator()(const uint64_t Val, const MemInfo &MI) const { + return Val < MI.Offset.Offset; + } + }; + auto Range = std::equal_range(Data.begin(), Data.end(), Offset, Compare()); + return iterator_range(Range.first, Range.second); +} + +void FuncMemData::update(const Location &Offset, const Location &Addr) { + auto Iter = EventIndex[Offset.Offset].find(Addr); + if (Iter == EventIndex[Offset.Offset].end()) { + 
Data.emplace_back(MemInfo(Offset, Addr, 1)); + EventIndex[Offset.Offset][Addr] = Data.size() - 1; + return; + } + ++Data[Iter->second].Count; +} + ErrorOr> DataReader::readPerfData(StringRef Path, raw_ostream &Diag) { ErrorOr> MB = @@ -295,7 +339,7 @@ DataReader::readPerfData(StringRef Path, raw_ostream &Diag) { } auto DR = make_unique(std::move(MB.get()), Diag); DR->parse(); - DR->buildLTONameMap(); + DR->buildLTONameMaps(); return std::move(DR); } @@ -366,16 +410,43 @@ ErrorOr DataReader::parseNumberField(char EndChar, bool EndNl) { return Num; } -ErrorOr DataReader::parseLocation(char EndChar, bool EndNl) { +ErrorOr DataReader::parseHexField(char EndChar, bool EndNl) { + auto NumStrRes = parseString(EndChar, EndNl); + if (std::error_code EC = NumStrRes.getError()) + return EC; + StringRef NumStr = NumStrRes.get(); + uint64_t Num; + if (NumStr.getAsInteger(16, Num)) { + reportError("expected hexidecimal number"); + Diag << "Found: " << NumStr << "\n"; + return make_error_code(llvm::errc::io_error); + } + return Num; +} + +ErrorOr DataReader::parseLocation(char EndChar, + bool EndNl, + bool ExpectMemLoc) { // Read whether the location of the branch should be DSO or a symbol // 0 means it is a DSO. 1 means it is a global symbol. 2 means it is a local // symbol. - if (ParsingBuf[0] != '0' && ParsingBuf[0] != '1' && ParsingBuf[0] != '2') { + // The symbol flag is also used to tag memory load events by adding 3 to the + // base values, i.e. 3 not a symbol, 4 global symbol and 5 local symbol. 
+ if (!ExpectMemLoc && + ParsingBuf[0] != '0' && ParsingBuf[0] != '1' && ParsingBuf[0] != '2') { reportError("expected 0, 1 or 2"); return make_error_code(llvm::errc::io_error); } - bool IsSymbol = ParsingBuf[0] == '1' || ParsingBuf[0] == '2'; + if (ExpectMemLoc && + ParsingBuf[0] != '3' && ParsingBuf[0] != '4' && ParsingBuf[0] != '5') { + reportError("expected 3, 4 or 5"); + return make_error_code(llvm::errc::io_error); + } + + bool IsSymbol = + (!ExpectMemLoc && (ParsingBuf[0] == '1' || ParsingBuf[0] == '2')) || + (ExpectMemLoc && (ParsingBuf[0] == '4' || ParsingBuf[0] == '5')); ParsingBuf = ParsingBuf.drop_front(1); Col += 1; @@ -389,18 +460,11 @@ ErrorOr DataReader::parseLocation(char EndChar, bool EndNl) { StringRef Name = NameRes.get(); // Read the offset - auto OffsetStrRes = parseString(EndChar, EndNl); - if (std::error_code EC = OffsetStrRes.getError()) + auto Offset = parseHexField(EndChar, EndNl); + if (std::error_code EC = Offset.getError()) return EC; - StringRef OffsetStr = OffsetStrRes.get(); - uint64_t Offset; - if (OffsetStr.getAsInteger(16, Offset)) { - reportError("expected hexadecimal number"); - Diag << "Found: " << OffsetStr << "\n"; - return make_error_code(llvm::errc::io_error); - } - return Location(IsSymbol, Name, Offset); + return Location(IsSymbol, Name, Offset.get()); } ErrorOr DataReader::parseBranchHistory() { @@ -483,6 +547,26 @@ ErrorOr DataReader::parseBranchInfo() { std::move(Histories)); } +ErrorOr DataReader::parseMemInfo() { + auto Res = parseMemLocation(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + Location Offset = Res.get(); + + Res = parseMemLocation(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + Location Addr = Res.get(); + + auto CountRes = parseNumberField(FieldSeparator, true); + if (std::error_code EC = CountRes.getError()) + return EC; + + checkAndConsumeNewLine(); + + return MemInfo(Offset, Addr, CountRes.get()); +} + ErrorOr DataReader::parseSampleInfo() { 
auto Res = parseLocation(FieldSeparator); if (std::error_code EC = Res.getError()) @@ -525,7 +609,7 @@ ErrorOr DataReader::maybeParseNoLBRFlag() { return true; } -bool DataReader::hasData() { +bool DataReader::hasBranchData() { if (ParsingBuf.size() == 0) return false; @@ -534,6 +618,15 @@ bool DataReader::hasData() { return false; } +bool DataReader::hasMemData() { + if (ParsingBuf.size() == 0) + return false; + + if (ParsingBuf[0] == '3' || ParsingBuf[0] == '4' || ParsingBuf[0] == '5') + return true; + return false; +} + std::error_code DataReader::parseInNoLBRMode() { auto GetOrCreateFuncEntry = [&](StringRef Name) { auto I = FuncsToSamples.find(Name); @@ -547,7 +640,18 @@ std::error_code DataReader::parseInNoLBRMode() { return I; }; - while (hasData()) { + auto GetOrCreateFuncMemEntry = [&](StringRef Name) { + auto I = FuncsToMemEvents.find(Name); + if (I == FuncsToMemEvents.end()) { + bool success; + std::tie(I, success) = FuncsToMemEvents.insert( + std::make_pair(Name, FuncMemData(Name, FuncMemData::ContainerTy()))); + assert(success && "unexpected result of insert"); + } + return I; + }; + + while (hasBranchData()) { auto Res = parseSampleInfo(); if (std::error_code EC = Res.getError()) return EC; @@ -562,11 +666,31 @@ std::error_code DataReader::parseInNoLBRMode() { I->getValue().Data.emplace_back(std::move(SI)); } + while (hasMemData()) { + auto Res = parseMemInfo(); + if (std::error_code EC = Res.getError()) + return EC; + + MemInfo MI = Res.get(); + + // Ignore memory events not involving known pc. 
+ if (!MI.Offset.IsSymbol) + continue; + + auto I = GetOrCreateFuncMemEntry(MI.Offset.Name); + I->getValue().Data.emplace_back(std::move(MI)); + } + for (auto &FuncSamples : FuncsToSamples) { std::stable_sort(FuncSamples.second.Data.begin(), FuncSamples.second.Data.end()); } + for (auto &MemEvents : FuncsToMemEvents) { + std::stable_sort(MemEvents.second.Data.begin(), + MemEvents.second.Data.end()); + } + return std::error_code(); } @@ -584,6 +708,17 @@ std::error_code DataReader::parse() { return I; }; + auto GetOrCreateFuncMemEntry = [&](StringRef Name) { + auto I = FuncsToMemEvents.find(Name); + if (I == FuncsToMemEvents.end()) { + bool success; + std::tie(I, success) = FuncsToMemEvents.insert( + std::make_pair(Name, FuncMemData(Name, FuncMemData::ContainerTy()))); + assert(success && "unexpected result of insert"); + } + return I; + }; + Col = 0; Line = 1; auto FlagOrErr = maybeParseNoLBRFlag(); @@ -593,7 +728,7 @@ std::error_code DataReader::parse() { if (NoLBRMode) return parseInNoLBRMode(); - while (hasData()) { + while (hasBranchData()) { auto Res = parseBranchInfo(); if (std::error_code EC = Res.getError()) return EC; @@ -624,21 +759,48 @@ std::error_code DataReader::parse() { } } + while (hasMemData()) { + auto Res = parseMemInfo(); + if (std::error_code EC = Res.getError()) + return EC; + + MemInfo MI = Res.get(); + + // Ignore memory events not involving known pc. 
+ if (!MI.Offset.IsSymbol) + continue; + + auto I = GetOrCreateFuncMemEntry(MI.Offset.Name); + I->getValue().Data.emplace_back(std::move(MI)); + } + for (auto &FuncBranches : FuncsToBranches) { std::stable_sort(FuncBranches.second.Data.begin(), FuncBranches.second.Data.end()); } + for (auto &MemEvents : FuncsToMemEvents) { + std::stable_sort(MemEvents.second.Data.begin(), + MemEvents.second.Data.end()); + } + return std::error_code(); } -void DataReader::buildLTONameMap() { +void DataReader::buildLTONameMaps() { for (auto &FuncData : FuncsToBranches) { const auto FuncName = FuncData.getKey(); const auto CommonName = getLTOCommonName(FuncName); if (CommonName) LTOCommonNameMap[*CommonName].push_back(&FuncData.getValue()); } + + for (auto &FuncData : FuncsToMemEvents) { + const auto FuncName = FuncData.getKey(); + const auto CommonName = getLTOCommonName(FuncName); + if (CommonName) + LTOCommonNameMemMap[*CommonName].push_back(&FuncData.getValue()); + } } namespace { @@ -654,21 +816,14 @@ fetchMapEntry(MapTy &Map, const std::vector &FuncNames) { } return nullptr; } -} - -FuncBranchData * -DataReader::getFuncBranchData(const std::vector &FuncNames) { - return fetchMapEntry(FuncsToBranches, FuncNames); -} - -FuncSampleData * -DataReader::getFuncSampleData(const std::vector &FuncNames) { - return fetchMapEntry(FuncsToSamples, FuncNames); -} -std::vector -DataReader::getFuncBranchDataRegex(const std::vector &FuncNames) { - std::vector AllData; +template +std::vector +fetchMapEntriesRegex( + MapTy &Map, + const StringMap> <OCommonNameMap, + const std::vector &FuncNames) { + std::vector AllData; // Do a reverse order iteration since the name in profile has a higher chance // of matching a name at the end of the list. 
for (auto FI = FuncNames.rbegin(), FE = FuncNames.rend(); FI != FE; ++FI) { @@ -682,8 +837,8 @@ DataReader::getFuncBranchDataRegex(const std::vector &FuncNames) { AllData.insert(AllData.end(), CommonData.begin(), CommonData.end()); } } else { - auto I = FuncsToBranches.find(Name); - if (I != FuncsToBranches.end()) { + auto I = Map.find(Name); + if (I != Map.end()) { return {&I->getValue()}; } } @@ -691,6 +846,33 @@ DataReader::getFuncBranchDataRegex(const std::vector &FuncNames) { return AllData; } +} + +FuncBranchData * +DataReader::getFuncBranchData(const std::vector &FuncNames) { + return fetchMapEntry(FuncsToBranches, FuncNames); +} + +FuncMemData * +DataReader::getFuncMemData(const std::vector &FuncNames) { + return fetchMapEntry(FuncsToMemEvents, FuncNames); +} + +FuncSampleData * +DataReader::getFuncSampleData(const std::vector &FuncNames) { + return fetchMapEntry(FuncsToSamples, FuncNames); +} + +std::vector +DataReader::getFuncBranchDataRegex(const std::vector &FuncNames) { + return fetchMapEntriesRegex(FuncsToBranches, LTOCommonNameMap, FuncNames); +} + +std::vector +DataReader::getFuncMemDataRegex(const std::vector &FuncNames) { + return fetchMapEntriesRegex(FuncsToMemEvents, LTOCommonNameMemMap, FuncNames); +} + bool DataReader::hasLocalsWithFileName() const { for (const auto &Func : FuncsToBranches) { const auto &FuncName = Func.getKey(); @@ -739,6 +921,20 @@ void DataReader::dump() const { << SI.Occurrences << "\n"; } } + + for (const auto &Func : FuncsToMemEvents) { + Diag << "Memory events for " << Func.getValue().Name; + Location LastOffset(0); + for (auto &MI : Func.getValue().Data) { + if (MI.Offset == LastOffset) { + Diag << ", " << MI.Addr << "/" << MI.Count; + } else { + Diag << "\n" << MI.Offset << ": " << MI.Addr << "/" << MI.Count; + } + LastOffset = MI.Offset; + } + Diag << "\n"; + } } } // namespace bolt diff --git a/bolt/DataReader.h b/bolt/DataReader.h index 07d7e990dd2b..a0623e61aa41 100644 --- a/bolt/DataReader.h +++ 
b/bolt/DataReader.h @@ -32,7 +32,7 @@ namespace llvm { namespace bolt { /// LTO-generated function names take a form: -// +/// /// .lto_priv./... /// or /// .constprop./... @@ -62,6 +62,9 @@ struct Location { StringRef Name; uint64_t Offset; + explicit Location(uint64_t Offset) + : IsSymbol(false), Name("[unknown]"), Offset(Offset) {} + Location(bool IsSymbol, StringRef Name, uint64_t Offset) : IsSymbol(IsSymbol), Name(Name), Offset(Offset) {} @@ -80,6 +83,8 @@ struct Location { return Name != "[heap]" && Offset < RHS.Offset; } + + friend raw_ostream &operator<<(raw_ostream &OS, const Location &Loc); }; typedef std::vector> BranchContext; @@ -123,7 +128,7 @@ struct BranchInfo { } /// Merges the branch and misprediction counts as well as the histories of BI - /// with those of this objetc. + /// with those of this object. void mergeWith(const BranchInfo &BI); void print(raw_ostream &OS) const; @@ -145,10 +150,10 @@ struct FuncBranchData { FuncBranchData() {} FuncBranchData(StringRef Name, ContainerTy Data) - : Name(Name), Data(std::move(Data)) {} + : Name(Name), Data(std::move(Data)) {} FuncBranchData(StringRef Name, ContainerTy Data, ContainerTy EntryData) - : Name(Name), Data(std::move(Data)), EntryData(std::move(EntryData)) {} + : Name(Name), Data(std::move(Data)), EntryData(std::move(EntryData)) {} ErrorOr getBranch(uint64_t From, uint64_t To) const; @@ -176,6 +181,63 @@ struct FuncBranchData { void bumpEntryCount(const Location &From, uint64_t OffsetTo, bool Mispred); }; +/// MemInfo represents a single memory load from an address \p Addr at an \p +/// Offset within a function. \p Count represents how many times a particular +/// address was seen. 
+struct MemInfo { + Location Offset; + Location Addr; + uint64_t Count; + + bool operator==(const MemInfo &RHS) const { + return Offset == RHS.Offset && Addr == RHS.Addr; + } + + bool operator<(const MemInfo &RHS) const { + if (Offset < RHS.Offset) + return true; + + if (Offset == RHS.Offset) + return (Addr < RHS.Addr); + + return false; + } + + void mergeWith(const MemInfo &MI) { + Count += MI.Count; + } + + void print(raw_ostream &OS) const; + + MemInfo(const Location &Offset, const Location &Addr, uint64_t Count = 0) + : Offset(Offset), Addr(Addr), Count(Count) {} +}; + +/// Helper class to store memory load events recorded in the address space of +/// a given function, analogous to FuncBranchData but for memory load events +/// instead of branches. +struct FuncMemData { + typedef std::vector ContainerTy; + + StringRef Name; + ContainerTy Data; + + DenseMap> EventIndex; + + /// Find all the memory events originating at Offset. + iterator_range getMemInfoRange( + uint64_t Offset) const; + + /// Update \p Data with a memory event. Events with the same + /// \p Offset and \p Addr will be coalesced. + void update(const Location &Offset, const Location &Addr); + + FuncMemData() {} + + FuncMemData(StringRef Name, ContainerTy Data) + : Name(Name), Data(std::move(Data)) {} +}; + /// Similar to BranchInfo, but instead of recording from-to address (an edge), /// it records the address of a perf event and the number of times samples hit /// this address. @@ -297,6 +359,9 @@ class DataReader { FuncBranchData * getFuncBranchData(const std::vector &FuncNames); + /// Return mem data matching one of the names in \p FuncNames. + FuncMemData *getFuncMemData(const std::vector &FuncNames); + FuncSampleData * getFuncSampleData(const std::vector &FuncNames); @@ -306,10 +371,18 @@ class DataReader { std::vector getFuncBranchDataRegex(const std::vector &FuncNames); + /// Return a vector of all FuncMemData matching the list of names. 
+ /// Internally use fuzzy matching to match special names like LTO-generated + /// function names. + std::vector + getFuncMemDataRegex(const std::vector &FuncNames); + using FuncsToBranchesMapTy = StringMap; using FuncsToSamplesMapTy = StringMap; + using FuncsToMemEventsMapTy = StringMap; FuncsToBranchesMapTy &getAllFuncsBranchData() { return FuncsToBranches; } + FuncsToMemEventsMapTy &getAllFuncsMemData() { return FuncsToMemEvents; } FuncsToSamplesMapTy &getAllFuncsSampleData() { return FuncsToSamples; } const FuncsToBranchesMapTy &getAllFuncsData() const { @@ -348,15 +421,24 @@ class DataReader { bool checkAndConsumeNewLine(); ErrorOr parseString(char EndChar, bool EndNl=false); ErrorOr parseNumberField(char EndChar, bool EndNl=false); - ErrorOr parseLocation(char EndChar, bool EndNl=false); + ErrorOr parseHexField(char EndChar, bool EndNl=false); + ErrorOr parseLocation(char EndChar, bool EndNl, bool ExpectMemLoc); + ErrorOr parseLocation(char EndChar, bool EndNl=false) { + return parseLocation(EndChar, EndNl, false); + } + ErrorOr parseMemLocation(char EndChar, bool EndNl=false) { + return parseLocation(EndChar, EndNl, true); + } ErrorOr parseBranchHistory(); ErrorOr parseBranchInfo(); ErrorOr parseSampleInfo(); + ErrorOr parseMemInfo(); ErrorOr maybeParseNoLBRFlag(); - bool hasData(); + bool hasBranchData(); + bool hasMemData(); /// Build suffix map once the profile data is parsed. - void buildLTONameMap(); + void buildLTONameMaps(); /// An in-memory copy of the input data file - owns strings used in reader. std::unique_ptr FileBuf; @@ -366,12 +448,14 @@ class DataReader { unsigned Col; FuncsToBranchesMapTy FuncsToBranches; FuncsToSamplesMapTy FuncsToSamples; + FuncsToMemEventsMapTy FuncsToMemEvents; bool NoLBRMode{false}; StringSet<> EventNames; static const char FieldSeparator = ' '; - /// Map of common LTO names to possible matching profiles. + /// Maps of common LTO names to possible matching profiles. 
StringMap> LTOCommonNameMap; + StringMap> LTOCommonNameMemMap; }; } diff --git a/bolt/merge-fdata/merge-fdata.cpp b/bolt/merge-fdata/merge-fdata.cpp index 8a0a17fa5841..7aca64561819 100644 --- a/bolt/merge-fdata/merge-fdata.cpp +++ b/bolt/merge-fdata/merge-fdata.cpp @@ -94,6 +94,7 @@ int main(int argc, char **argv) { // All merged data. DataReader::FuncsToBranchesMapTy MergedFunctionsBranchData; DataReader::FuncsToSamplesMapTy MergedFunctionsSampleData; + DataReader::FuncsToMemEventsMapTy MergedFunctionsMemData; StringSet<> EventNames; // Merged functions data has to replace strings refs with strings from the @@ -143,6 +144,22 @@ int main(int argc, char **argv) { AllStrings.emplace_back(ToNamePtr); // keep the reference }; + // Copy mem info replacing string references with internal storage + // references. + auto CopyMemInfo = [&](const MemInfo &MI, std::vector &MIData) { + auto OffsetNamePtr = MergedStringPool.intern(MI.Offset.Name); + auto AddrNamePtr = MergedStringPool.intern(MI.Addr.Name); + MIData.emplace_back(MemInfo(Location(MI.Offset.IsSymbol, + *OffsetNamePtr, + MI.Offset.Offset), + Location(MI.Addr.IsSymbol, + *AddrNamePtr, + MI.Addr.Offset), + MI.Count)); + AllStrings.emplace_back(OffsetNamePtr); // keep the reference + AllStrings.emplace_back(AddrNamePtr); // keep the reference + }; + auto CopySampleInfo = [&](const SampleInfo &SI, std::vector &SIData) { auto NamePtr = MergedStringPool.intern(SI.Address.Name); @@ -183,6 +200,16 @@ int main(int argc, char **argv) { AllStrings.emplace_back(NamePtr); // keep the reference }; + auto replaceMIStringRefs = [&] (MemInfo &MI) { + auto OffsetNamePtr = MergedStringPool.intern(MI.Offset.Name); + MI.Offset.Name = *OffsetNamePtr; + AllStrings.emplace_back(OffsetNamePtr); // keep the reference + + auto AddrNamePtr = MergedStringPool.intern(MI.Addr.Name); + MI.Addr.Name = *AddrNamePtr; + AllStrings.emplace_back(AddrNamePtr); // keep the reference + }; + for (auto &InputDataFilename : opts::InputDataFilenames) { if 
(!sys::fs::exists(InputDataFilename)) report_error(InputDataFilename, errc::no_such_file_or_directory); @@ -313,6 +340,56 @@ int main(int argc, char **argv) { } } } + + for (auto &FI : ReaderOrErr.get()->getAllFuncsMemData()) { + auto MI = MergedFunctionsMemData.find(FI.second.Name); + if (MI != MergedFunctionsMemData.end()) { + std::vector TmpMI; + for (auto &MMI : FI.second.Data) { + // Find and merge a corresponding entry or copy data. + auto TI = std::lower_bound(MI->second.Data.begin(), + MI->second.Data.end(), + MMI); + if (TI != MI->second.Data.end() && *TI == MMI) { + replaceMIStringRefs(MMI); + TI->mergeWith(MMI); + } else { + CopyMemInfo(MMI, TmpMI); + } + } + // Merge in the temp vector making sure it doesn't contain duplicates. + std::sort(TmpMI.begin(), TmpMI.end()); + MemInfo *PrevMI = nullptr; + for (auto &MMI : TmpMI) { + if (PrevMI && *PrevMI == MMI) { + PrevMI->mergeWith(MMI); + } else { + MI->second.Data.emplace_back(MMI); + PrevMI = &MI->second.Data.back(); + } + } + std::sort(MI->second.Data.begin(), MI->second.Data.end()); + } else { + auto NamePtr = MergedStringPool.intern(FI.second.Name); + AllStrings.emplace_back(NamePtr); // keep the ref + bool Success; + std::tie(MI, Success) = MergedFunctionsMemData.insert( + std::make_pair(*NamePtr, + FuncMemData(*NamePtr, FuncMemData::ContainerTy()))); + // Copy with string conversion while eliminating duplicates. 
+ std::sort(FI.second.Data.begin(), FI.second.Data.end()); + MemInfo *PrevMI = nullptr; + for (auto &MMI : FI.second.Data) { + if (PrevMI && *PrevMI == MMI) { + replaceMIStringRefs(MMI); + PrevMI->mergeWith(MMI); + } else { + CopyMemInfo(MMI, MI->second.Data); + PrevMI = &MI->second.Data.back(); + } + } + } + } } if (!opts::SuppressMergedDataOutput) { @@ -336,11 +413,17 @@ int main(int argc, char **argv) { SD.print(outs()); } } + for (const auto &FDI : MergedFunctionsMemData) { + for (const auto &MD : FDI.second.Data) { + MD.print(outs()); + } + } } errs() << "Data for " << (MergedFunctionsBranchData.size() + - MergedFunctionsSampleData.size()) + MergedFunctionsSampleData.size() + + MergedFunctionsMemData.size()) << " unique objects successfully merged.\n"; if (opts::PrintFunctionList != opts::ST_NONE) { From 0983f0084ab797193e764c7180733a693836be65 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 1 Nov 2017 10:26:07 -0700 Subject: [PATCH 327/904] [BOLT] Refactor branch analysis code. Summary: Move the indirect branch analysis code from BinaryFunction to MCInstrAnalysis/X86MCTargetDesc.cpp. In the process of doing this, I've added an MCRegInfo to MCInstrAnalysis which allowed me to remove a bunch of extra method parameters. I've also had to refactor how BinaryFunction held on to instructions/offsets so that it would be easy to pass a sequence of instructions to the analysis code (rather than a map keyed by offset). Note: I think there are a bunch of MCInstrAnalysis methods that have a BitVector output parameter that could be changed to a return value since the size of the vector is based on the number of registers, i.e. from MCRegisterInfo. I haven't done this in order to keep the diff a more manageable size. 
(cherry picked from commit 06e59b30ae479aa734072f01e5883dcde0088164) --- bolt/BinaryFunction.cpp | 332 ++++++----------------------- bolt/BinaryFunction.h | 39 ++-- bolt/Exceptions.cpp | 8 +- bolt/Passes/FrameAnalysis.cpp | 9 +- bolt/Passes/LivenessAnalysis.h | 8 +- bolt/Passes/ReachingDefOrUse.h | 4 +- bolt/Passes/RegAnalysis.cpp | 8 +- bolt/Passes/ShrinkWrapping.cpp | 20 +- bolt/Passes/StackPointerTracking.h | 2 +- bolt/Passes/StokeInfo.cpp | 4 +- bolt/RewriteInstance.cpp | 2 +- 11 files changed, 118 insertions(+), 318 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 94020fd20703..3ad749904c79 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -169,12 +169,6 @@ bool shouldPrint(const BinaryFunction &Function) { namespace llvm { namespace bolt { -// Temporary constant. -// -// TODO: move to architecture-specific file together with the code that is -// using it. -constexpr unsigned NoRegister = 0; - constexpr const char *DynoStats::Desc[]; constexpr unsigned BinaryFunction::MinAlign; @@ -430,9 +424,9 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, // Offset of the instruction in function. uint64_t Offset{0}; - if (BasicBlocks.empty() && !Instructions.empty()) { + if (BasicBlocks.empty() && !InstructionOffsets.empty()) { // Print before CFG was built. - for (const auto &II : Instructions) { + for (const auto &II : InstructionOffsets) { Offset = II.first; // Print label if exists at this offset. 
@@ -440,7 +434,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, if (LI != Labels.end()) OS << LI->second->getName() << ":\n"; - BC.printInstruction(OS, II.second, Offset, this); + BC.printInstruction(OS, Instructions[II.second], Offset, this); } } @@ -578,252 +572,53 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << "End of Function \"" << *this << "\"\n\n"; } -BinaryFunction::IndirectBranchType -BinaryFunction::analyzeIndirectBranch(MCInst &Instruction, - unsigned Size, - uint64_t Offset) { - auto &MIA = BC.MIA; - - IndirectBranchType Type = IndirectBranchType::UNKNOWN; +IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, + unsigned Size, + uint64_t Offset) { + const auto PtrSize = BC.AsmInfo->getPointerSize(); // An instruction referencing memory used by jump instruction (directly or // via register). This location could be an array of function pointers // in case of indirect tail call, or a jump table. - MCInst *MemLocInstr = nullptr; + const MCInst *MemLocInstr; // Address of the table referenced by MemLocInstr. Could be either an // array of function pointers, or a jump table. uint64_t ArrayStart = 0; - auto analyzePICJumpTable = - [&](InstrMapType::reverse_iterator II, - InstrMapType::reverse_iterator IE, - unsigned R1, - unsigned R2) { - // Analyze PIC-style jump table code template: - // - // lea PIC_JUMP_TABLE(%rip), {%r1|%r2} <- MemLocInstr - // mov ({%r1|%r2}, %index, 4), {%r2|%r1} - // add %r2, %r1 - // jmp *%r1 - // - // (with any irrelevant instructions in-between) - // - // When we call this helper we've already determined %r1 and %r2, and - // reverse instruction iterator \p II is pointing to the ADD instruction. - // - // PIC jump table looks like following: - // - // JT: ---------- - // E1:| L1 - JT | - // |----------| - // E2:| L2 - JT | - // |----------| - // | | - // ...... 
- // En:| Ln - JT | - // ---------- - // - // Where L1, L2, ..., Ln represent labels in the function. - // - // The actual relocations in the table will be of the form: - // - // Ln - JT - // = (Ln - En) + (En - JT) - // = R_X86_64_PC32(Ln) + En - JT - // = R_X86_64_PC32(Ln + offsetof(En)) - // - DEBUG(dbgs() << "BOLT-DEBUG: checking for PIC jump table\n"); - MCInst *MovInstr = nullptr; - while (++II != IE) { - auto &Instr = II->second; - const auto &InstrDesc = BC.MII->get(Instr.getOpcode()); - if (!InstrDesc.hasDefOfPhysReg(Instr, R1, *BC.MRI) && - !InstrDesc.hasDefOfPhysReg(Instr, R2, *BC.MRI)) { - // Ignore instructions that don't affect R1, R2 registers. - continue; - } else if (!MovInstr) { - // Expect to see MOV instruction. - if (!MIA->isMOVSX64rm32(Instr)) { - DEBUG(dbgs() << "BOLT-DEBUG: MOV instruction expected.\n"); - break; - } - - // Check if it's setting %r1 or %r2. In canonical form it sets %r2. - // If it sets %r1 - rename the registers so we have to only check - // a single form. - auto MovDestReg = Instr.getOperand(0).getReg(); - if (MovDestReg != R2) - std::swap(R1, R2); - if (MovDestReg != R2) { - DEBUG(dbgs() << "BOLT-DEBUG: MOV instruction expected to set %r2\n"); - break; - } - - // Verify operands for MOV. 
- unsigned BaseRegNum; - int64_t ScaleValue; - unsigned IndexRegNum; - int64_t DispValue; - unsigned SegRegNum; - if (!MIA->evaluateX86MemoryOperand(Instr, &BaseRegNum, - &ScaleValue, &IndexRegNum, - &DispValue, &SegRegNum)) - break; - if (BaseRegNum != R1 || - ScaleValue != 4 || - IndexRegNum == bolt::NoRegister || - DispValue != 0 || - SegRegNum != bolt::NoRegister) - break; - MovInstr = &Instr; - } else { - assert(MovInstr && "MOV instruction expected to be set"); - if (!InstrDesc.hasDefOfPhysReg(Instr, R1, *BC.MRI)) - continue; - if (!MIA->isLEA64r(Instr)) { - DEBUG(dbgs() << "BOLT-DEBUG: LEA instruction expected\n"); - break; - } - if (Instr.getOperand(0).getReg() != R1) { - DEBUG(dbgs() << "BOLT-DEBUG: LEA instruction expected to set %r1\n"); - break; - } - - // Verify operands for LEA. - unsigned BaseRegNum; - int64_t ScaleValue; - unsigned IndexRegNum; - const MCExpr *DispExpr = nullptr; - unsigned SegRegNum; - if (!MIA->evaluateX86MemoryOperand(Instr, &BaseRegNum, - &ScaleValue, &IndexRegNum, - nullptr, &SegRegNum, &DispExpr)) - break; - if (BaseRegNum != BC.MRI->getProgramCounter() || - IndexRegNum != bolt::NoRegister || - SegRegNum != bolt::NoRegister || - DispExpr == nullptr) - break; - MemLocInstr = &Instr; - break; - } - } - - if (!MemLocInstr) - return IndirectBranchType::UNKNOWN; - - DEBUG(dbgs() << "BOLT-DEBUG: checking potential PIC jump table\n"); - return IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE; - }; - - // Try to find a (base) memory location from where the address for - // the indirect branch is loaded. For X86-64 the memory will be specified - // in the following format: - // - // {%rip}/{%basereg} + Imm + IndexReg * Scale - // - // We are interested in the cases where Scale == sizeof(uintptr_t) and - // the contents of the memory are presumably a function array. - // - // Normal jump table: - // - // jmp *(JUMP_TABLE, %index, Scale) - // - // or - // - // mov (JUMP_TABLE, %index, Scale), %r1 - // ... 
- // jmp %r1 - // - // We handle PIC-style jump tables separately. - // - if (Instruction.getNumPrimeOperands() == 1) { - // If the indirect jump is on register - try to detect if the - // register value is loaded from a memory location. - assert(Instruction.getOperand(0).isReg() && "register operand expected"); - const auto R1 = Instruction.getOperand(0).getReg(); - // Check if one of the previous instructions defines the jump-on register. - // We will check that this instruction belongs to the same basic block - // in postProcessIndirectBranches(). - for (auto PrevII = Instructions.rbegin(); PrevII != Instructions.rend(); - ++PrevII) { - auto &PrevInstr = PrevII->second; - const auto &PrevInstrDesc = BC.MII->get(PrevInstr.getOpcode()); - - if (!PrevInstrDesc.hasDefOfPhysReg(PrevInstr, R1, *BC.MRI)) - continue; + unsigned BaseRegNum, IndexRegNum; + int64_t DispValue; + const MCExpr *DispExpr; - if (MIA->isMoveMem2Reg(PrevInstr)) { - MemLocInstr = &PrevInstr; - break; - } else if (MIA->isADD64rr(PrevInstr)) { - auto R2 = PrevInstr.getOperand(2).getReg(); - if (R1 == R2) - return IndirectBranchType::UNKNOWN; - Type = analyzePICJumpTable(PrevII, Instructions.rend(), R1, R2); - break; - } else { - return IndirectBranchType::UNKNOWN; - } - } - if (!MemLocInstr) { - // No definition seen for the register in this function so far. Could be - // an input parameter - which means it is an external code reference. - // It also could be that the definition happens to be in the code that - // we haven't processed yet. Since we have to be conservative, return - // as UNKNOWN case. 
- return IndirectBranchType::UNKNOWN; - } - } else { - MemLocInstr = &Instruction; - } + auto Type = BC.MIA->analyzeIndirectBranch(Instruction, + Instructions, + PtrSize, + MemLocInstr, + BaseRegNum, + IndexRegNum, + DispValue, + DispExpr); - const auto RIPRegister = BC.MRI->getProgramCounter(); - auto PtrSize = BC.AsmInfo->getPointerSize(); + if (Type == IndirectBranchType::UNKNOWN && !MemLocInstr) + return Type; - // Analyze the memory location. - unsigned BaseRegNum; - int64_t ScaleValue; - unsigned IndexRegNum; - int64_t DispValue; - unsigned SegRegNum; - const MCExpr *DispExpr; - if (!MIA->evaluateX86MemoryOperand(*MemLocInstr, &BaseRegNum, - &ScaleValue, &IndexRegNum, - &DispValue, &SegRegNum, - &DispExpr)) - return IndirectBranchType::UNKNOWN; - - // Do not set annotate with index reg if address was precomputed earlier - // and reg may not be live at the jump site. if (MemLocInstr != &Instruction) IndexRegNum = 0; - if ((BaseRegNum != bolt::NoRegister && BaseRegNum != RIPRegister) || - SegRegNum != bolt::NoRegister) - return IndirectBranchType::UNKNOWN; - - if (Type == IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE && - (ScaleValue != 1 || BaseRegNum != RIPRegister)) - return IndirectBranchType::UNKNOWN; - - if (Type != IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE && - ScaleValue != PtrSize) - return IndirectBranchType::UNKNOWN; - // RIP-relative addressing should be converted to symbol form by now // in processed instructions (but not in jump). 
if (DispExpr) { auto SI = BC.GlobalSymbols.find(DispExpr->getSymbol().getName()); assert(SI != BC.GlobalSymbols.end() && "global symbol needs a value"); ArrayStart = SI->second; + BaseRegNum = 0; } else { ArrayStart = static_cast(DispValue); - if (BaseRegNum == RIPRegister) - ArrayStart += getAddress() + Offset + Size; } + if (BaseRegNum == BC.MRI->getProgramCounter()) + ArrayStart += getAddress() + Offset + Size; + DEBUG(dbgs() << "BOLT-DEBUG: addressed memory is 0x" << Twine::utohexstr(ArrayStart) << '\n'); @@ -857,7 +652,8 @@ BinaryFunction::analyzeIndirectBranch(MCInst &Instruction, LI = Result.first; } - BC.MIA->replaceMemOperandDisp(*MemLocInstr, LI->second, BC.Ctx.get()); + BC.MIA->replaceMemOperandDisp(const_cast(*MemLocInstr), + LI->second, BC.Ctx.get()); BC.MIA->setJumpTable(BC.Ctx.get(), Instruction, ArrayStart, IndexRegNum); JTSites.emplace_back(Offset, ArrayStart); @@ -886,7 +682,7 @@ BinaryFunction::analyzeIndirectBranch(MCInst &Instruction, // Extract the value at the start of the array. StringRef SectionContents; Section.getContents(SectionContents); - auto EntrySize = + const auto EntrySize = Type == IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE ? 
4 : PtrSize; DataExtractor DE(SectionContents, BC.AsmInfo->isLittleEndian(), EntrySize); auto ValueOffset = static_cast(ArrayStart - Section.getAddress()); @@ -940,7 +736,8 @@ BinaryFunction::analyzeIndirectBranch(MCInst &Instruction, JumpTableType, std::move(JTOffsetCandidates), {{0, JTStartLabel}}}); - BC.MIA->replaceMemOperandDisp(*MemLocInstr, JTStartLabel, BC.Ctx.get()); + BC.MIA->replaceMemOperandDisp(const_cast(*MemLocInstr), + JTStartLabel, BC.Ctx.get()); BC.MIA->setJumpTable(BC.Ctx.get(), Instruction, ArrayStart, IndexRegNum); JTSites.emplace_back(Offset, ArrayStart); @@ -1003,7 +800,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { BC.InstPrinter->printInst(&Instruction, errs(), "", *BC.STI); errs() << '\n'; Instruction.dump_pretty(errs(), BC.InstPrinter.get()); - errs() << '\n';; + errs() << '\n'; return false; } if (TargetAddress == 0) { @@ -1248,7 +1045,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { // indirect branch. Bail out on the latter case. MIA->addAnnotation(Ctx.get(), Instruction, "Offset", Offset); if (MIA->isIndirectBranch(Instruction)) { - auto Result = analyzeIndirectBranch(Instruction, Size, Offset); + auto Result = processIndirectBranch(Instruction, Size, Offset); switch (Result) { default: llvm_unreachable("unexpected result"); @@ -1534,9 +1331,10 @@ bool BinaryFunction::buildCFG() { } }; - for (auto I = Instructions.begin(), E = Instructions.end(); I != E; ++I) { + for (auto I = InstructionOffsets.begin(), + E = InstructionOffsets.end(); I != E; ++I) { const auto Offset = I->first; - const auto &Instr = I->second; + const auto &Instr = Instructions[I->second]; auto LI = Labels.find(Offset); if (LI != Labels.end()) { @@ -1681,8 +1479,9 @@ bool BinaryFunction::buildCFG() { // basic block. 
auto *ToBB = getBasicBlockAtOffset(Branch.second); if (ToBB == nullptr) { - auto I = Instructions.find(Branch.second), E = Instructions.end(); - while (ToBB == nullptr && I != E && MIA->isNoop(I->second)) { + auto I = InstructionOffsets.find(Branch.second); + auto E = InstructionOffsets.end(); + while (ToBB == nullptr && I != E && MIA->isNoop(Instructions[I->second])) { ++I; if (I == E) break; @@ -1840,6 +1639,7 @@ bool BinaryFunction::buildCFG() { // // NB: don't clear Labels list as we may need them if we mark the function // as non-simple later in the process of discovering extra entry points. + clearList(InstructionOffsets); clearList(Instructions); clearList(OffsetToCFI); clearList(TakenBranches); @@ -2120,18 +1920,18 @@ float BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { // Eliminate recursive calls and returns from recursive calls from the list // of branches that have no match. They are not considered local branches. auto isRecursiveBranch = [&](std::pair &Branch) { - auto SrcInstrI = Instructions.find(Branch.first); - if (SrcInstrI == Instructions.end()) + auto SrcInstrI = InstructionOffsets.find(Branch.first); + if (SrcInstrI == InstructionOffsets.end()) return false; // Check if it is a recursive call. - const auto &SrcInstr = SrcInstrI->second; + const auto &SrcInstr = Instructions[SrcInstrI->second]; if ((BC.MIA->isCall(SrcInstr) || BC.MIA->isIndirectBranch(SrcInstr)) && Branch.second == 0) return true; - auto DstInstrI = Instructions.find(Branch.second); - if (DstInstrI == Instructions.end()) + auto DstInstrI = InstructionOffsets.find(Branch.second); + if (DstInstrI == InstructionOffsets.end()) return false; // Check if it is a return from a recursive call. 
@@ -2140,16 +1940,17 @@ float BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { if (!IsSrcReturn && BC.MIA->isPrefix(SrcInstr)) { auto SrcInstrSuccessorI = SrcInstrI; ++SrcInstrSuccessorI; - assert(SrcInstrSuccessorI != Instructions.end() && + assert(SrcInstrSuccessorI != InstructionOffsets.end() && "unexpected prefix instruction at the end of function"); - IsSrcReturn = BC.MIA->isReturn(SrcInstrSuccessorI->second); + IsSrcReturn = BC.MIA->isReturn(Instructions[SrcInstrSuccessorI->second]); } if (IsSrcReturn && Branch.second != 0) { // Make sure the destination follows the call instruction. auto DstInstrPredecessorI = DstInstrI; --DstInstrPredecessorI; - assert(DstInstrPredecessorI != Instructions.end() && "invalid iterator"); - if (BC.MIA->isCall(DstInstrPredecessorI->second)) + assert(DstInstrPredecessorI != InstructionOffsets.end() && + "invalid iterator"); + if (BC.MIA->isCall(Instructions[DstInstrPredecessorI->second])) return true; } return false; @@ -2164,10 +1965,10 @@ float BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { ExternProfileBranches.end(), std::back_inserter(OrphanBranches), [&](const std::pair &Branch) { - auto II = Instructions.find(Branch.first); - if (II == Instructions.end()) + auto II = InstructionOffsets.find(Branch.first); + if (II == InstructionOffsets.end()) return true; - const auto &Instr = II->second; + const auto &Instr = Instructions[II->second]; // Check for calls, tail calls, rets and indirect branches. 
// When matching profiling info, we did not reach the stage // when we identify tail calls, so they are still represented @@ -2179,13 +1980,14 @@ float BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { // Check for "rep ret" if (BC.MIA->isPrefix(Instr)) { ++II; - if (II != Instructions.end() && BC.MIA->isReturn(II->second)) + if (II != InstructionOffsets.end() && + BC.MIA->isReturn(Instructions[II->second])) return false; } return true; }); - float MatchRatio = + const float MatchRatio = (float) (ProfileBranches.size() - OrphanBranches.size()) / (float) ProfileBranches.size(); @@ -4225,7 +4027,7 @@ DWARFDebugLoc::LocationList BinaryFunction::translateInputToOutputLocationList( for(const auto &Entry : OutputLL.Entries) { if (Entry.Begin <= PrevEndAddress && *PrevLoc == Entry.Loc) { MergedLL.Entries.back().End = std::max(Entry.End, - MergedLL.Entries.back().End);; + MergedLL.Entries.back().End); } else { const auto Begin = std::max(Entry.Begin, PrevEndAddress); const auto End = std::max(Begin, Entry.End); @@ -4424,12 +4226,12 @@ BinaryFunction::getFallthroughsInTrace(uint64_t From, uint64_t To) const { return NoneType(); // Get iterators and validate trace start/end - auto FromIter = Instructions.find(From); - if (FromIter == Instructions.end()) + auto FromIter = InstructionOffsets.find(From); + if (FromIter == InstructionOffsets.end()) return NoneType(); - auto ToIter = Instructions.find(To); - if (ToIter == Instructions.end()) + auto ToIter = InstructionOffsets.find(To); + if (ToIter == InstructionOffsets.end()) return NoneType(); // Trace needs to go forward @@ -4437,20 +4239,22 @@ BinaryFunction::getFallthroughsInTrace(uint64_t From, uint64_t To) const { return NoneType(); // Trace needs to finish in a branch - if (!BC.MIA->isBranch(ToIter->second) && !BC.MIA->isCall(ToIter->second) && - !BC.MIA->isReturn(ToIter->second)) + auto &ToInst = Instructions[ToIter->second]; + if (!BC.MIA->isBranch(ToInst) && !BC.MIA->isCall(ToInst) && + 
!BC.MIA->isReturn(ToInst)) return NoneType(); // Analyze intermediate instructions for (; FromIter != ToIter; ++FromIter) { // This operates under an assumption that we collect all branches in LBR // No unconditional branches in the middle of the trace - if (BC.MIA->isUnconditionalBranch(FromIter->second) || - BC.MIA->isReturn(FromIter->second) || - BC.MIA->isCall(FromIter->second)) + auto &FromInst = Instructions[FromIter->second]; + if (BC.MIA->isUnconditionalBranch(FromInst) || + BC.MIA->isReturn(FromInst) || + BC.MIA->isCall(FromInst)) return NoneType(); - if (!BC.MIA->isConditionalBranch(FromIter->second)) + if (!BC.MIA->isConditionalBranch(FromInst)) continue; const uint64_t Src = FromIter->first; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index f62cce01c8ed..7b472d11a52e 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -461,8 +461,9 @@ class BinaryFunction { /// Temporary holder of instructions before CFG is constructed. /// Map offset in the function to MCInst. - using InstrMapType = std::map; - InstrMapType Instructions; + using InstrMapType = std::map; + InstrMapType InstructionOffsets; + std::vector Instructions; /// List of DWARF CFI instructions. Original CFI from the binary must be /// sorted w.r.t. offset that it appears. We rely on this to replay CFIs @@ -736,7 +737,10 @@ class BinaryFunction { } void addInstruction(uint64_t Offset, MCInst &&Instruction) { - Instructions.emplace(Offset, std::forward(Instruction)); + assert(InstructionOffsets.size() == Instructions.size() && + "There must be one instruction at every offset."); + Instructions.emplace_back(std::forward(Instruction)); + InstructionOffsets[Offset] = Instructions.size() - 1; } /// Return instruction at a given offset in the function. 
Valid before @@ -744,22 +748,14 @@ class BinaryFunction { MCInst *getInstructionAtOffset(uint64_t Offset) { assert(CurrentState == State::Disassembled && "can only call function in Disassembled state"); - auto II = Instructions.find(Offset); - return (II == Instructions.end()) ? nullptr : &II->second; + auto II = InstructionOffsets.find(Offset); + return (II == InstructionOffsets.end()) + ? nullptr : &Instructions[II->second]; } - /// Different types of indirect branches encountered during disassembly. - enum class IndirectBranchType : char { - UNKNOWN = 0, /// Unable to determine type. - POSSIBLE_TAIL_CALL, /// Possibly a tail call. - POSSIBLE_JUMP_TABLE, /// Possibly a switch/jump table. - POSSIBLE_PIC_JUMP_TABLE, /// Possibly a jump table for PIC. - POSSIBLE_GOTO /// Possibly a gcc's computed goto. - }; - - /// Analyze indirect branch \p Instruction before it is added to - /// Instructions list. - IndirectBranchType analyzeIndirectBranch(MCInst &Instruction, + /// Analyze and process indirect branch \p Instruction before it is + /// added to Instructions list. + IndirectBranchType processIndirectBranch(MCInst &Instruction, unsigned Size, uint64_t Offset); @@ -1439,22 +1435,23 @@ class BinaryFunction { // harder for us to recover this information, since we can create empty BBs // with NOPs and then reorder it away. // We fix this by moving the CFI instruction just before any NOPs. - auto I = Instructions.lower_bound(Offset); + auto I = InstructionOffsets.lower_bound(Offset); if (Offset == getSize()) { - assert(I == Instructions.end() && "unexpected iterator value"); + assert(I == InstructionOffsets.end() && "unexpected iterator value"); // Sometimes compiler issues restore_state after all instructions // in the function (even after nop). 
--I; Offset = I->first; } assert(I->first == Offset && "CFI pointing to unknown instruction"); - if (I == Instructions.begin()) { + if (I == InstructionOffsets.begin()) { CIEFrameInstructions.emplace_back(std::forward(Inst)); return; } --I; - while (I != Instructions.begin() && BC.MIA->isNoop(I->second)) { + while (I != InstructionOffsets.begin() && + BC.MIA->isNoop(Instructions[I->second])) { Offset = I->first; --I; } diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index ac303bfd0207..e5fa3f469b53 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -219,7 +219,7 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, // Create a handler entry if necessary. MCSymbol *LPSymbol{nullptr}; if (LandingPad) { - if (Instructions.find(LandingPad) == Instructions.end()) { + if (InstructionOffsets.find(LandingPad) == InstructionOffsets.end()) { if (opts::Verbosity >= 1) { errs() << "BOLT-WARNING: landing pad " << Twine::utohexstr(LandingPad) << " not pointing to an instruction in function " @@ -237,11 +237,11 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, } // Mark all call instructions in the range. 
- auto II = Instructions.find(Start); - auto IE = Instructions.end(); + auto II = InstructionOffsets.find(Start); + auto IE = InstructionOffsets.end(); assert(II != IE && "exception range not pointing to an instruction"); do { - auto &Instruction = II->second; + auto &Instruction = Instructions[II->second]; if (BC.MIA->isCall(Instruction)) { assert(!BC.MIA->isInvoke(Instruction) && "overlapping exception ranges detected"); diff --git a/bolt/Passes/FrameAnalysis.cpp b/bolt/Passes/FrameAnalysis.cpp index 348888e8cfde..9418eecad3b6 100644 --- a/bolt/Passes/FrameAnalysis.cpp +++ b/bolt/Passes/FrameAnalysis.cpp @@ -106,10 +106,9 @@ class FrameAccessAnalysis { MCPhysReg Reg{0}; int64_t StackOffset{0}; bool IsIndexed{false}; - if (!BC.MIA->isStackAccess(*BC.MRI, Inst, FIE.IsLoad, FIE.IsStore, - FIE.IsStoreFromReg, Reg, SrcImm, FIE.StackPtrReg, - StackOffset, FIE.Size, FIE.IsSimple, - IsIndexed)) { + if (!BC.MIA->isStackAccess(Inst, FIE.IsLoad, FIE.IsStore, FIE.IsStoreFromReg, + Reg, SrcImm, FIE.StackPtrReg, StackOffset, FIE.Size, + FIE.IsSimple, IsIndexed)) { return true; } @@ -205,7 +204,7 @@ class FrameAccessAnalysis { return true; } - if (BC.MIA->escapesVariable(Inst, *BC.MRI, SPT.HasFramePointer)) { + if (BC.MIA->escapesVariable(Inst, SPT.HasFramePointer)) { DEBUG(dbgs() << "Leaked stack address, giving up on this function.\n"); DEBUG(dbgs() << "Blame insn: "); DEBUG(Inst.dump()); diff --git a/bolt/Passes/LivenessAnalysis.h b/bolt/Passes/LivenessAnalysis.h index 3ef1700824a9..41909503b2dc 100644 --- a/bolt/Passes/LivenessAnalysis.h +++ b/bolt/Passes/LivenessAnalysis.h @@ -44,7 +44,7 @@ class LivenessAnalysis bool isAlive(ProgramPoint PP, MCPhysReg Reg) const { BitVector BV = (*this->getStateAt(PP)); - const BitVector &RegAliases = BC.MIA->getAliases(Reg, *BC.MRI); + const BitVector &RegAliases = BC.MIA->getAliases(Reg); BV &= RegAliases; return BV.any(); } @@ -60,7 +60,7 @@ class LivenessAnalysis BitVector BV = *this->getStateAt(P); BV.flip(); BitVector 
GPRegs(NumRegs, false); - this->BC.MIA->getGPRegs(GPRegs, *this->BC.MRI); + this->BC.MIA->getGPRegs(GPRegs); BV &= GPRegs; int Reg = BV.find_first(); return Reg != -1 ? Reg : 0; @@ -91,7 +91,7 @@ class LivenessAnalysis // Kill auto Written = BitVector(NumRegs, false); if (!IsCall) { - this->BC.MIA->getWrittenRegs(Point, Written, *this->BC.MRI); + this->BC.MIA->getWrittenRegs(Point, Written); } else { RA.getInstClobberList(Point, Written); // When clobber list is conservative, it is clobbering all/most registers, @@ -100,7 +100,7 @@ class LivenessAnalysis // because we don't really know what's going on. if (RA.isConservative(Written)) { Written.reset(); - BC.MIA->getCalleeSavedRegs(Written, *this->BC.MRI); + BC.MIA->getCalleeSavedRegs(Written); } } Written.flip(); diff --git a/bolt/Passes/ReachingDefOrUse.h b/bolt/Passes/ReachingDefOrUse.h index f241bff62f66..2113eb4590b6 100644 --- a/bolt/Passes/ReachingDefOrUse.h +++ b/bolt/Passes/ReachingDefOrUse.h @@ -45,7 +45,7 @@ class ReachingDefOrUse if (Def) { RA.getInstClobberList(**I, BV); } else { - this->BC.MIA->getTouchedRegs(**I, BV, *this->BC.MRI); + this->BC.MIA->getTouchedRegs(**I, BV); } if (BV[Reg]) return true; @@ -101,7 +101,7 @@ class ReachingDefOrUse if (Def) RA.getInstClobberList(*Y, YClobbers); else - this->BC.MIA->getTouchedRegs(*Y, YClobbers, *this->BC.MRI); + this->BC.MIA->getTouchedRegs(*Y, YClobbers); // X kills Y if it clobbers Y completely -- this is a conservative approach. // In practice, we may produce use-def links that may not exist. 
XClobbers &= YClobbers; diff --git a/bolt/Passes/RegAnalysis.cpp b/bolt/Passes/RegAnalysis.cpp index d9ab2d625f1c..8207ec9718e8 100644 --- a/bolt/Passes/RegAnalysis.cpp +++ b/bolt/Passes/RegAnalysis.cpp @@ -93,7 +93,7 @@ void RegAnalysis::beConservative(BitVector &Result) const { Result.set(); } else { BitVector BV(BC.MRI->getNumRegs(), false); - BC.MIA->getCalleeSavedRegs(BV, *BC.MRI); + BC.MIA->getCalleeSavedRegs(BV); BV.flip(); Result |= BV; } @@ -104,7 +104,7 @@ bool RegAnalysis::isConservative(BitVector &Vec) const { return Vec.all(); } else { BitVector BV(BC.MRI->getNumRegs(), false); - BC.MIA->getCalleeSavedRegs(BV, *BC.MRI); + BC.MIA->getCalleeSavedRegs(BV); BV |= Vec; return BV.all(); } @@ -114,9 +114,9 @@ void RegAnalysis::getInstUsedRegsList(const MCInst &Inst, BitVector &RegSet, bool GetClobbers) const { if (!BC.MIA->isCall(Inst)) { if (GetClobbers) - BC.MIA->getClobberedRegs(Inst, RegSet, *BC.MRI); + BC.MIA->getClobberedRegs(Inst, RegSet); else - BC.MIA->getUsedRegs(Inst, RegSet, *BC.MRI); + BC.MIA->getUsedRegs(Inst, RegSet); return; } diff --git a/bolt/Passes/ShrinkWrapping.cpp b/bolt/Passes/ShrinkWrapping.cpp index abbc0fc6c6a5..56c57b5507a9 100644 --- a/bolt/Passes/ShrinkWrapping.cpp +++ b/bolt/Passes/ShrinkWrapping.cpp @@ -603,9 +603,9 @@ void StackLayoutModifier::performChanges() { bool IsStoreFromReg{false}; uint8_t Size{0}; bool Success{false}; - Success = BC.MIA->isStackAccess(*BC.MRI, Inst, IsLoad, IsStore, - IsStoreFromReg, Reg, SrcImm, StackPtrReg, - StackOffset, Size, IsSimple, IsIndexed); + Success = BC.MIA->isStackAccess(Inst, IsLoad, IsStore, IsStoreFromReg, + Reg, SrcImm, StackPtrReg, StackOffset, + Size, IsSimple, IsIndexed); assert(Success && IsSimple && !IsIndexed && (!IsStore || IsStoreFromReg)); if (StackPtrReg != BC.MIA->getFramePointer()) Adjustment = -Adjustment; @@ -642,13 +642,13 @@ void ShrinkWrapping::classifyCSRUses() { BitVector(DA.NumInstrs, false)); const BitVector &FPAliases = - 
BC.MIA->getAliases(BC.MIA->getFramePointer(), *BC.MRI); + BC.MIA->getAliases(BC.MIA->getFramePointer()); for (auto &BB : BF) { for (auto &Inst : BB) { if (BC.MIA->isCFI(Inst)) continue; auto BV = BitVector(BC.MRI->getNumRegs(), false); - BC.MIA->getTouchedRegs(Inst, BV, *BC.MRI); + BC.MIA->getTouchedRegs(Inst, BV); BV &= CSA.CalleeSaved; for (int I = BV.find_first(); I != -1; I = BV.find_next(I)) { if (I == 0) @@ -668,7 +668,7 @@ void ShrinkWrapping::classifyCSRUses() { } void ShrinkWrapping::pruneUnwantedCSRs() { - BitVector ParamRegs = BC.MIA->getRegsUsedAsParams(*BC.MRI); + BitVector ParamRegs = BC.MIA->getRegsUsedAsParams(); for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) { if (!CSA.CalleeSaved[I]) continue; @@ -1080,7 +1080,7 @@ bool ShrinkWrapping::doesInstUsesCSR(const MCInst &Inst, uint16_t CSR) { CSA.getRestoredReg(Inst) == CSR) return false; BitVector BV = BitVector(BC.MRI->getNumRegs(), false); - BC.MIA->getTouchedRegs(Inst, BV, *BC.MRI); + BC.MIA->getTouchedRegs(Inst, BV); return BV[CSR]; } @@ -1389,9 +1389,9 @@ void ShrinkWrapping::insertUpdatedCFI(unsigned CSR, int SPValPush, bool IsSimple{false}; bool IsStoreFromReg{false}; uint8_t Size{0}; - if (!BC.MIA->isStackAccess(*BC.MRI, *InstIter, IsLoad, IsStore, - IsStoreFromReg, Reg, SrcImm, StackPtrReg, - StackOffset, Size, IsSimple, IsIndexed)) + if (!BC.MIA->isStackAccess(*InstIter, IsLoad, IsStore, IsStoreFromReg, + Reg, SrcImm, StackPtrReg, StackOffset, + Size, IsSimple, IsIndexed)) continue; if (Reg != CSR || !IsStore || !IsSimple) continue; diff --git a/bolt/Passes/StackPointerTracking.h b/bolt/Passes/StackPointerTracking.h index 46cc0facae61..cec3244f298f 100644 --- a/bolt/Passes/StackPointerTracking.h +++ b/bolt/Passes/StackPointerTracking.h @@ -165,7 +165,7 @@ class StackPointerTrackingBase } if (!HasFramePointer) { - if (MIA->escapesVariable(Point, *this->BC.MRI, false)) { + if (MIA->escapesVariable(Point, false)) { HasFramePointer = true; } } diff --git a/bolt/Passes/StokeInfo.cpp 
b/bolt/Passes/StokeInfo.cpp index dda3982500e3..54a879b6de9d 100644 --- a/bolt/Passes/StokeInfo.cpp +++ b/bolt/Passes/StokeInfo.cpp @@ -165,8 +165,8 @@ void StokeInfo::runOnFunctions( DefaultDefInMask.resize(NumRegs, false); DefaultLiveOutMask.resize(NumRegs, false); - BC.MIA->getDefaultDefIn(DefaultDefInMask, *BC.MRI); - BC.MIA->getDefaultLiveOut(DefaultLiveOutMask, *BC.MRI); + BC.MIA->getDefaultDefIn(DefaultDefInMask); + BC.MIA->getDefaultLiveOut(DefaultLiveOutMask); getRegNameFromBitVec(BC, DefaultDefInMask); getRegNameFromBitVec(BC, DefaultLiveOutMask); diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 4fea432f280c..e6274566bd94 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -585,7 +585,7 @@ createBinaryContext(ELFObjectFileBase *File, DataReader &DR, } std::unique_ptr MIA( - TheTarget->createMCInstrAnalysis(MII.get())); + TheTarget->createMCInstrAnalysis(MII.get(), MRI.get())); if (!MIA) { errs() << "BOLT-ERROR: failed to create instruction analysis for target" << TripleName << "\n"; From cce766b7a06f7b242a8748399f2f0551b766785f Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 6 Nov 2017 11:52:31 -0800 Subject: [PATCH 328/904] [BOLT][Refactoring] Move basic block reordering to BinaryPasses Summary: Refactor basic block reordering code out of the BinaryFunction. BinaryFunction::isSplit() is now checking if the first and the last blocks in the layout belong to the same fragment. As a result, it no longer returns true for functions that have their cold part optimized away. Change type for returned "size" from unsigned to size_t. Fix lines over 80 characters long. 
(cherry picked from commit 9eddb871c65cec97b6173b60c52742452650cc42) --- bolt/BinaryBasicBlock.h | 61 +++++----- bolt/BinaryFunction.cpp | 209 ++++------------------------------- bolt/BinaryFunction.h | 60 +++------- bolt/Passes/BinaryPasses.cpp | 189 +++++++++++++++++++++++++++---- bolt/Passes/BinaryPasses.h | 39 ++++++- 5 files changed, 278 insertions(+), 280 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 1c6e5ba64842..be9a8235fa51 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -150,7 +150,7 @@ class BinaryBasicBlock { using const_reverse_iterator = std::reverse_iterator; bool empty() const { return Instructions.empty(); } - unsigned size() const { return (unsigned)Instructions.size(); } + size_t size() const { return Instructions.size(); } MCInst &front() { return Instructions.front(); } MCInst &back() { return Instructions.back(); } const MCInst &front() const { return Instructions.front(); } @@ -176,9 +176,11 @@ class BinaryBasicBlock { using const_lp_iterator = decltype(LandingPads)::const_iterator; using pred_reverse_iterator = std::reverse_iterator; - using const_pred_reverse_iterator = std::reverse_iterator; + using const_pred_reverse_iterator = + std::reverse_iterator; using succ_reverse_iterator = std::reverse_iterator; - using const_succ_reverse_iterator = std::reverse_iterator; + using const_succ_reverse_iterator = + std::reverse_iterator; pred_iterator pred_begin() { return Predecessors.begin(); } const_pred_iterator pred_begin() const { return Predecessors.begin(); } @@ -192,8 +194,8 @@ class BinaryBasicBlock { { return Predecessors.rend(); } const_pred_reverse_iterator pred_rend() const { return Predecessors.rend(); } - unsigned pred_size() const { - return (unsigned)Predecessors.size(); + size_t pred_size() const { + return Predecessors.size(); } bool pred_empty() const { return Predecessors.empty(); } @@ -209,8 +211,8 @@ class BinaryBasicBlock { { return Successors.rend(); } 
const_succ_reverse_iterator succ_rend() const { return Successors.rend(); } - unsigned succ_size() const { - return (unsigned)Successors.size(); + size_t succ_size() const { + return Successors.size(); } bool succ_empty() const { return Successors.empty(); } @@ -218,8 +220,8 @@ class BinaryBasicBlock { const_throw_iterator throw_begin() const { return Throwers.begin(); } throw_iterator throw_end() { return Throwers.end(); } const_throw_iterator throw_end() const { return Throwers.end(); } - unsigned throw_size() const { - return (unsigned)Throwers.size(); + size_t throw_size() const { + return Throwers.size(); } bool throw_empty() const { return Throwers.empty(); } bool isLandingPad() const { return !Throwers.empty(); } @@ -228,8 +230,8 @@ class BinaryBasicBlock { const_lp_iterator lp_begin() const { return LandingPads.begin(); } lp_iterator lp_end() { return LandingPads.end(); } const_lp_iterator lp_end() const { return LandingPads.end(); } - unsigned lp_size() const { - return (unsigned)LandingPads.size(); + size_t lp_size() const { + return LandingPads.size(); } bool lp_empty() const { return LandingPads.empty(); } @@ -273,23 +275,28 @@ class BinaryBasicBlock { using const_branch_info_reverse_iterator = std::reverse_iterator; - branch_info_iterator branch_info_begin() { return BranchInfo.begin(); } - branch_info_iterator branch_info_end() { return BranchInfo.end(); } - const_branch_info_iterator branch_info_begin() const - { return BranchInfo.begin(); } - const_branch_info_iterator branch_info_end() const - { return BranchInfo.end(); } - branch_info_reverse_iterator branch_info_rbegin() - { return BranchInfo.rbegin(); } - branch_info_reverse_iterator branch_info_rend() - { return BranchInfo.rend(); } - const_branch_info_reverse_iterator branch_info_rbegin() const - { return BranchInfo.rbegin(); } - const_branch_info_reverse_iterator branch_info_rend() const - { return BranchInfo.rend(); } - unsigned branch_info_size() const { - return 
(unsigned)BranchInfo.size(); + branch_info_iterator branch_info_begin() { return BranchInfo.begin(); } + branch_info_iterator branch_info_end() { return BranchInfo.end(); } + const_branch_info_iterator branch_info_begin() const { + return BranchInfo.begin(); } + const_branch_info_iterator branch_info_end() const { + return BranchInfo.end(); + } + branch_info_reverse_iterator branch_info_rbegin() { + return BranchInfo.rbegin(); + } + branch_info_reverse_iterator branch_info_rend() { + return BranchInfo.rend(); + } + const_branch_info_reverse_iterator branch_info_rbegin() const { + return BranchInfo.rbegin(); + } + const_branch_info_reverse_iterator branch_info_rend() const { + return BranchInfo.rend(); + } + + size_t branch_info_size() const { return BranchInfo.size(); } bool branch_info_empty() const { return BranchInfo.empty(); } inline iterator_range branch_info() { diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 3ad749904c79..e38f57caf49e 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -14,7 +14,6 @@ #include "BinaryFunction.h" #include "DataReader.h" #include "Passes/MCF.h" -#include "Passes/ReorderAlgorithm.h" #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/MC/MCAsmInfo.h" @@ -53,18 +52,32 @@ extern cl::opt Relocs; extern cl::opt UpdateDebugSections; extern cl::opt IndirectCallPromotion; extern cl::opt Verbosity; -extern cl::opt PrintFuncStat; static cl::opt -AggressiveSplitting("split-all-cold", - cl::desc("outline as many cold basic blocks as possible"), +AlignBlocks("align-blocks", + cl::desc("try to align BBs inserting nops"), cl::ZeroOrMore, cl::cat(BoltOptCategory)); -static cl::opt -AlignBlocks("align-blocks", - cl::desc("try to align BBs inserting nops"), +static cl::opt +DoMCF("mcf", + cl::desc("solve a min cost flow problem on the CFG to fix edge counts " + "(default=disable)"), + cl::init(MCF_DISABLE), + cl::values( + clEnumValN(MCF_DISABLE, "none", + "disable 
MCF"), + clEnumValN(MCF_LINEAR, "linear", + "cost function is inversely proportional to edge count"), + clEnumValN(MCF_QUADRATIC, "quadratic", + "cost function is inversely proportional to edge count squared"), + clEnumValN(MCF_LOG, "log", + "cost function is inversely proportional to log of edge count"), + clEnumValN(MCF_BLAMEFTS, "blamefts", + "tune cost to blame fall-through edges for surplus flow"), + clEnumValEnd), cl::ZeroOrMore, + cl::Hidden, cl::cat(BoltOptCategory)); static cl::opt @@ -123,34 +136,6 @@ PrintOnly("print-only", cl::Hidden, cl::cat(BoltCategory)); -static cl::opt -SplitEH("split-eh", - cl::desc("split C++ exception handling code (experimental)"), - cl::ZeroOrMore, - cl::Hidden, - cl::cat(BoltOptCategory)); - -cl::opt -DoMCF("mcf", - cl::desc("solve a min cost flow problem on the CFG to fix edge counts " - "(default=disable)"), - cl::init(MCF_DISABLE), - cl::values( - clEnumValN(MCF_DISABLE, "none", - "disable MCF"), - clEnumValN(MCF_LINEAR, "linear", - "cost function is inversely proportional to edge count"), - clEnumValN(MCF_QUADRATIC, "quadratic", - "cost function is inversely proportional to edge count squared"), - clEnumValN(MCF_LOG, "log", - "cost function is inversely proportional to log of edge count"), - clEnumValN(MCF_BLAMEFTS, "blamefts", - "tune cost to blame fall-through edges for surplus flow"), - clEnumValEnd), - cl::ZeroOrMore, - cl::Hidden, - cl::cat(BoltOptCategory)); - bool shouldPrint(const BinaryFunction &Function) { if (PrintOnly.empty()) return true; @@ -386,8 +371,8 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, << "\n Orc Section : " << getCodeSectionName() << "\n LSDA : 0x" << Twine::utohexstr(getLSDAAddress()) << "\n IsSimple : " << IsSimple - << "\n IsSplit : " << IsSplit - << "\n BB Count : " << BasicBlocksLayout.size(); + << "\n IsSplit : " << isSplit() + << "\n BB Count : " << size(); if (hasCFG()) { OS << "\n Hash : " << Twine::utohexstr(hash()); @@ -2415,68 +2400,6 @@ bool 
BinaryFunction::fixCFIState() { return true; } -void BinaryFunction::modifyLayout(LayoutType Type, bool MinBranchClusters, - bool Split) { - if (BasicBlocksLayout.empty() || Type == LT_NONE) - return; - - BasicBlockOrderType NewLayout; - std::unique_ptr Algo; - - // Cannot do optimal layout without profile. - if (Type != LT_REVERSE && !hasValidProfile()) - return; - - if (Type == LT_REVERSE) { - Algo.reset(new ReverseReorderAlgorithm()); - } - else if (BasicBlocksLayout.size() <= FUNC_SIZE_THRESHOLD && - Type != LT_OPTIMIZE_SHUFFLE) { - // Work on optimal solution if problem is small enough - DEBUG(dbgs() << "finding optimal block layout for " << *this << "\n"); - Algo.reset(new OptimalReorderAlgorithm()); - } - else { - DEBUG(dbgs() << "running block layout heuristics on " << *this << "\n"); - - std::unique_ptr CAlgo; - if (MinBranchClusters) - CAlgo.reset(new MinBranchGreedyClusterAlgorithm()); - else - CAlgo.reset(new PHGreedyClusterAlgorithm()); - - switch(Type) { - case LT_OPTIMIZE: - Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo))); - break; - - case LT_OPTIMIZE_BRANCH: - Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo))); - break; - - case LT_OPTIMIZE_CACHE: - Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo))); - break; - - case LT_OPTIMIZE_SHUFFLE: - Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo))); - break; - - default: - llvm_unreachable("unexpected layout type"); - } - } - - Algo->reorderBasicBlocks(*this, NewLayout); - if (opts::PrintFuncStat > 0) - BasicBlocksPreviousLayout = BasicBlocksLayout; - BasicBlocksLayout.clear(); - BasicBlocksLayout.swap(NewLayout); - - if (Split) - splitFunction(); -} - uint64_t BinaryFunction::getInstructionCount() const { uint64_t Count = 0; for (auto &Block : BasicBlocksLayout) { @@ -2486,12 +2409,10 @@ uint64_t BinaryFunction::getInstructionCount() const { } bool BinaryFunction::hasLayoutChanged() const { - assert(opts::PrintFuncStat > 0 && "PrintFuncStat flag is not 
on"); return BasicBlocksPreviousLayout != BasicBlocksLayout; } uint64_t BinaryFunction::getEditDistance() const { - assert(opts::PrintFuncStat > 0 && "PrintFuncStat flag is not on"); const auto LayoutSize = BasicBlocksPreviousLayout.size(); if (LayoutSize < 2) { return 0; @@ -2899,83 +2820,6 @@ void BinaryFunction::fixBranches() { assert(validateCFG() && "Invalid CFG detected after fixing branches"); } -void BinaryFunction::splitFunction() { - bool AllCold = true; - for (BinaryBasicBlock *BB : BasicBlocksLayout) { - auto ExecCount = BB->getExecutionCount(); - if (ExecCount == BinaryBasicBlock::COUNT_NO_PROFILE) - return; - if (ExecCount != 0) - AllCold = false; - } - - if (AllCold) - return; - - assert(BasicBlocksLayout.size() > 0); - - // Never outline the first basic block. - BasicBlocks.front()->setCanOutline(false); - for (auto BB : BasicBlocks) { - if (!BB->canOutline()) - continue; - if (BB->getExecutionCount() != 0) { - BB->setCanOutline(false); - continue; - } - if (hasEHRanges() && !opts::SplitEH) { - // We cannot move landing pads (or rather entry points for landing - // pads). - if (BB->isLandingPad()) { - BB->setCanOutline(false); - continue; - } - // We cannot move a block that can throw since exception-handling - // runtime cannot deal with split functions. However, if we can guarantee - // that the block never throws, it is safe to move the block to - // decrease the size of the function. - for (auto &Instr : *BB) { - if (BC.MIA->isInvoke(Instr)) { - BB->setCanOutline(false); - break; - } - } - } - } - - if (opts::AggressiveSplitting) { - // All blocks with 0 count that we can move go to the end of the function. - // Even if they were natural to cluster formation and were seen in-between - // hot basic blocks. 
- std::stable_sort(BasicBlocksLayout.begin(), BasicBlocksLayout.end(), - [&] (BinaryBasicBlock *A, BinaryBasicBlock *B) { - return A->canOutline() < B->canOutline(); - }); - } else if (hasEHRanges() && !opts::SplitEH) { - // Typically functions with exception handling have landing pads at the end. - // We cannot move beginning of landing pads, but we can move 0-count blocks - // comprising landing pads to the end and thus facilitate splitting. - auto FirstLP = BasicBlocksLayout.begin(); - while ((*FirstLP)->isLandingPad()) - ++FirstLP; - - std::stable_sort(FirstLP, BasicBlocksLayout.end(), - [&] (BinaryBasicBlock *A, BinaryBasicBlock *B) { - return A->canOutline() < B->canOutline(); - }); - } - - // Separate hot from cold starting from the bottom. - for (auto I = BasicBlocksLayout.rbegin(), E = BasicBlocksLayout.rend(); - I != E; ++I) { - BinaryBasicBlock *BB = *I; - if (!BB->canOutline()) - break; - BB->setIsCold(true); - IsSplit = true; - } -} - void BinaryFunction::propagateGnuArgsSizeInfo() { assert(CurrentState == State::CFG && "unexpected function state"); @@ -3443,15 +3287,6 @@ void BinaryFunction::updateLayout(BinaryBasicBlock* Start, updateLayoutIndices(); } -void BinaryFunction::updateLayout(LayoutType Type, - bool MinBranchClusters, - bool Split) { - // Recompute layout with original parameters. - BasicBlocksLayout = BasicBlocks; - modifyLayout(Type, MinBranchClusters, Split); - updateLayoutIndices(); -} - bool BinaryFunction::replaceJumpTableEntryIn(BinaryBasicBlock *BB, BinaryBasicBlock *OldDest, BinaryBasicBlock *NewDest) { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 7b472d11a52e..058c4fac6b30 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -186,28 +186,6 @@ class BinaryFunction { ST_ALL, /// Split all functions }; - /// Choose which strategy should the block layout heuristic prioritize when - /// facing conflicting goals. 
- enum LayoutType : char { - /// LT_NONE - do not change layout of basic blocks - LT_NONE = 0, /// no reordering - /// LT_REVERSE - reverse the order of basic blocks, meant for testing - /// purposes. The first basic block is left intact and the rest are - /// put in the reverse order. - LT_REVERSE, - /// LT_OPTIMIZE - optimize layout of basic blocks based on profile. - LT_OPTIMIZE, - /// LT_OPTIMIZE_BRANCH is an implementation of what is suggested in Pettis' - /// paper (PLDI '90) about block reordering, trying to minimize branch - /// mispredictions. - LT_OPTIMIZE_BRANCH, - /// LT_OPTIMIZE_CACHE piggybacks on the idea from Ispike paper (CGO '04) - /// that suggests putting frequently executed chains first in the layout. - LT_OPTIMIZE_CACHE, - /// Create clusters and use random order for them. - LT_OPTIMIZE_SHUFFLE, - }; - enum ReorderType : char { RT_NONE = 0, RT_EXEC_COUNT, @@ -226,9 +204,6 @@ class BinaryFunction { static constexpr uint64_t COUNT_NO_PROFILE = BinaryBasicBlock::COUNT_NO_PROFILE; - // Function size, in number of BBs, above which we fallback to a heuristic - // solution to the layout problem instead of seeking the optimal one. - static constexpr uint64_t FUNC_SIZE_THRESHOLD = 10; /// We have to use at least 2-byte alignment for functions because of C++ ABI. static constexpr unsigned MinAlign = 2; @@ -280,10 +255,6 @@ class BinaryFunction { /// In relocation mode we still disassemble and re-assemble such functions. bool IsSimple{true}; - /// True if this function needs to be emitted in two separate parts, one for - /// the hot basic blocks and another for the cold basic blocks. - bool IsSplit{false}; - /// Indicate if this function has associated exception handling metadata. 
bool HasEHRanges{false}; @@ -811,7 +782,7 @@ class BinaryFunction { reverse_iterator rend () { return BasicBlocks.rend(); } const_reverse_iterator rend () const { return BasicBlocks.rend(); } - unsigned size() const { return (unsigned)BasicBlocks.size();} + size_t size() const { return BasicBlocks.size();} bool empty() const { return BasicBlocks.empty(); } const BinaryBasicBlock &front() const { return *BasicBlocks.front(); } BinaryBasicBlock &front() { return *BasicBlocks.front(); } @@ -832,7 +803,7 @@ class BinaryFunction { { return BasicBlocksLayout.rend(); } const_reverse_order_iterator layout_rend() const { return BasicBlocksLayout.rend(); } - unsigned layout_size() const { return (unsigned)BasicBlocksLayout.size(); } + size_t layout_size() const { return BasicBlocksLayout.size(); } bool layout_empty() const { return BasicBlocksLayout.empty(); } const BinaryBasicBlock *layout_front() const { return BasicBlocksLayout.front(); } @@ -864,14 +835,20 @@ class BinaryFunction { return iterator_range(cie_begin(), cie_end()); } + /// Update layout of basic blocks used for output. + void updateBasicBlockLayout(BasicBlockOrderType &NewLayout, + bool SavePrevLayout) { + if (SavePrevLayout) + BasicBlocksPreviousLayout = BasicBlocksLayout; + + BasicBlocksLayout.clear(); + BasicBlocksLayout.swap(NewLayout); + } + /// Return a list of basic blocks sorted using DFS and update layout indices /// using the same order. Does not modify the current layout. BasicBlockOrderType dfs() const; - /// Modify code layout making necessary adjustments to instructions at the - /// end of basic blocks. - void modifyLayout(LayoutType Type, bool MinBranchClusters, bool Split); - /// Find the loops in the CFG of the function and store information about /// them. void calculateLoopInfo(); @@ -1197,7 +1174,8 @@ class BinaryFunction { /// Return true if the function body is non-contiguous. 
bool isSplit() const { - return IsSplit; + return size() > 1 && + layout_front()->isCold() != layout_back()->isCold(); } /// Return true if the function has exception handling tables. @@ -1361,10 +1339,6 @@ class BinaryFunction { /// layout after the BB indicated by Start. void updateLayout(BinaryBasicBlock* Start, const unsigned NumNewBlocks); - /// Update the basic block layout for this function. The layout is - /// computed from scratch using modifyLayout. - void updateLayout(LayoutType Type, bool MinBranchClusters, bool Split); - /// Make sure basic blocks' indices match the current layout. void updateLayoutIndices() const { unsigned Index = 0; @@ -1794,10 +1768,6 @@ class BinaryFunction { CurrentState = State::Emitted; } - /// Split function in two: a part with warm or hot BBs and a part with never - /// executed BBs. The cold part is moved to a new BinaryFunction. - void splitFunction(); - /// Process LSDA information for the function. void parseLSDA(ArrayRef LSDAData, uint64_t LSDAAddress); @@ -1877,7 +1847,7 @@ class BinaryFunction { /// after relaxation. 
size_t estimateHotSize(const bool UseSplitSize = true) const { size_t Estimate = 0; - if (UseSplitSize && IsSplit) { + if (UseSplitSize && isSplit()) { for (const auto *BB : BasicBlocksLayout) { if (!BB->isCold()) { Estimate += BC.computeCodeSize(BB->begin(), BB->end()); diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 93e583e53b0d..f2f999063300 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -10,6 +10,7 @@ //===----------------------------------------------------------------------===// #include "BinaryPasses.h" +#include "Passes/ReorderAlgorithm.h" #include "llvm/Support/Options.h" #define DEBUG_TYPE "bolt" @@ -58,6 +59,12 @@ enum DynoStatsSortOrder : char { Descending }; +static cl::opt +AggressiveSplitting("split-all-cold", + cl::desc("outline as many cold basic blocks as possible"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + static cl::opt DynoStatsSortOrderOpt("print-sorted-by-order", cl::desc("use ascending or descending order when printing functions " @@ -81,6 +88,13 @@ MinBranchClusters("min-branch-clusters", cl::Hidden, cl::cat(BoltOptCategory)); +static cl::opt +PrintFuncStat("print-function-statistics", + cl::desc("print statistics about basic block ordering"), + cl::init(0), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + static cl::list PrintSortedBy("print-sorted-by", cl::CommaSeparated, @@ -97,36 +111,29 @@ PrintSortedBy("print-sorted-by", cl::ZeroOrMore, cl::cat(BoltOptCategory)); -cl::opt -PrintFuncStat("print-function-statistics", - cl::desc("print statistics about basic block ordering"), - cl::init(0), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - -static cl::opt +static cl::opt ReorderBlocks("reorder-blocks", cl::desc("change layout of basic blocks in a function"), - cl::init(bolt::BinaryFunction::LT_NONE), + cl::init(bolt::ReorderBasicBlocks::LT_NONE), cl::values( - clEnumValN(bolt::BinaryFunction::LT_NONE, + clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none", "do not 
reorder basic blocks"), - clEnumValN(bolt::BinaryFunction::LT_REVERSE, + clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse", "layout blocks in reverse order"), - clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE, + clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal", "perform optimal layout based on profile"), - clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE_BRANCH, + clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH, "branch-predictor", "perform optimal layout prioritizing branch " "predictions"), - clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE_CACHE, + clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache", "perform optimal layout prioritizing I-cache " "behavior"), - clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE_SHUFFLE, + clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE, "cluster-shuffle", "perform random layout of clusters"), clEnumValEnd), @@ -155,6 +162,13 @@ SctcMode("sctc-mode", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +static cl::opt +SplitEH("split-eh", + cl::desc("split C++ exception handling code (experimental)"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + } // namespace opts namespace llvm { @@ -296,14 +310,14 @@ void EliminateUnreachableBlocks::runOnFunctions( bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const { return (BinaryFunctionPass::shouldPrint(BF) && - opts::ReorderBlocks != BinaryFunction::LT_NONE); + opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE); } void ReorderBasicBlocks::runOnFunctions( BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) { - if (opts::ReorderBlocks == BinaryFunction::LT_NONE) + if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE) return; uint64_t ModifiedFuncCount = 0; @@ -318,8 +332,8 @@ void ReorderBasicBlocks::runOnFunctions( (opts::SplitFunctions == BinaryFunction::ST_EH && Function.hasEHRanges()) || (LargeFunctions.find(It.first) != LargeFunctions.end()); - Function.modifyLayout(opts::ReorderBlocks, opts::MinBranchClusters, - 
ShouldSplit); + modifyFunctionLayout(Function, opts::ReorderBlocks, opts::MinBranchClusters, + ShouldSplit); if (opts::PrintFuncStat > 0 && Function.hasLayoutChanged()) { ++ModifiedFuncCount; @@ -361,6 +375,141 @@ void ReorderBasicBlocks::runOnFunctions( } } +void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF, + LayoutType Type, bool MinBranchClusters, bool Split) const { + if (BF.size() == 0 || Type == LT_NONE) + return; + + BinaryFunction::BasicBlockOrderType NewLayout; + std::unique_ptr Algo; + + // Cannot do optimal layout without profile. + if (Type != LT_REVERSE && !BF.hasValidProfile()) + return; + + if (Type == LT_REVERSE) { + Algo.reset(new ReverseReorderAlgorithm()); + } + else if (BF.size() <= FUNC_SIZE_THRESHOLD && Type != LT_OPTIMIZE_SHUFFLE) { + // Work on optimal solution if problem is small enough + DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n"); + Algo.reset(new OptimalReorderAlgorithm()); + } else { + DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n"); + + std::unique_ptr CAlgo; + if (MinBranchClusters) + CAlgo.reset(new MinBranchGreedyClusterAlgorithm()); + else + CAlgo.reset(new PHGreedyClusterAlgorithm()); + + switch(Type) { + case LT_OPTIMIZE: + Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo))); + break; + + case LT_OPTIMIZE_BRANCH: + Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo))); + break; + + case LT_OPTIMIZE_CACHE: + Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo))); + break; + + case LT_OPTIMIZE_SHUFFLE: + Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo))); + break; + + default: + llvm_unreachable("unexpected layout type"); + } + } + + Algo->reorderBasicBlocks(BF, NewLayout); + + BF.updateBasicBlockLayout(NewLayout, /*SavePrevLayout=*/opts::PrintFuncStat); + + if (Split) + splitFunction(BF); +} + +void ReorderBasicBlocks::splitFunction(BinaryFunction &BF) const { + if (!BF.size()) + return; + + bool AllCold = true; + for (auto *BB : 
BF.layout()) { + auto ExecCount = BB->getExecutionCount(); + if (ExecCount == BinaryBasicBlock::COUNT_NO_PROFILE) + return; + if (ExecCount != 0) + AllCold = false; + } + + if (AllCold) + return; + + // Never outline the first basic block. + BF.layout_front()->setCanOutline(false); + for (auto *BB : BF.layout()) { + if (!BB->canOutline()) + continue; + if (BB->getExecutionCount() != 0) { + BB->setCanOutline(false); + continue; + } + if (BF.hasEHRanges() && !opts::SplitEH) { + // We cannot move landing pads (or rather entry points for landing + // pads). + if (BB->isLandingPad()) { + BB->setCanOutline(false); + continue; + } + // We cannot move a block that can throw since exception-handling + // runtime cannot deal with split functions. However, if we can guarantee + // that the block never throws, it is safe to move the block to + // decrease the size of the function. + for (auto &Instr : *BB) { + if (BF.getBinaryContext().MIA->isInvoke(Instr)) { + BB->setCanOutline(false); + break; + } + } + } + } + + if (opts::AggressiveSplitting) { + // All blocks with 0 count that we can move go to the end of the function. + // Even if they were natural to cluster formation and were seen in-between + // hot basic blocks. + std::stable_sort(BF.layout_begin(), BF.layout_end(), + [&] (BinaryBasicBlock *A, BinaryBasicBlock *B) { + return A->canOutline() < B->canOutline(); + }); + } else if (BF.hasEHRanges() && !opts::SplitEH) { + // Typically functions with exception handling have landing pads at the end. + // We cannot move beginning of landing pads, but we can move 0-count blocks + // comprising landing pads to the end and thus facilitate splitting. + auto FirstLP = BF.layout_begin(); + while ((*FirstLP)->isLandingPad()) + ++FirstLP; + + std::stable_sort(FirstLP, BF.layout_end(), + [&] (BinaryBasicBlock *A, BinaryBasicBlock *B) { + return A->canOutline() < B->canOutline(); + }); + } + + // Separate hot from cold starting from the bottom. 
+ for (auto I = BF.layout_rbegin(), E = BF.layout_rend(); + I != E; ++I) { + BinaryBasicBlock *BB = *I; + if (!BB->canOutline()) + break; + BB->setIsCold(true); + } +} + void FixupBranches::runOnFunctions( BinaryContext &BC, std::map &BFs, @@ -976,7 +1125,7 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC, BF.updateLayoutIndices(); // Pre-compute hash before pushing into hashtable. - BF.hash(/*Recompute=*/true, /*UseDFS*/UseDFS); + BF.hash(/*Recompute=*/true, /*UseDFS=*/UseDFS); CongruentBuckets[&BF].emplace(&BF); } diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index 8c1bdb2bd560..5de2099d6381 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -150,7 +150,44 @@ class EliminateUnreachableBlocks : public BinaryFunctionPass { // Reorder the basic blocks for each function based on hotness. class ReorderBasicBlocks : public BinaryFunctionPass { - public: +public: + /// Choose which strategy should the block layout heuristic prioritize when + /// facing conflicting goals. + enum LayoutType : char { + /// LT_NONE - do not change layout of basic blocks + LT_NONE = 0, /// no reordering + /// LT_REVERSE - reverse the order of basic blocks, meant for testing + /// purposes. The first basic block is left intact and the rest are + /// put in the reverse order. + LT_REVERSE, + /// LT_OPTIMIZE - optimize layout of basic blocks based on profile. + LT_OPTIMIZE, + /// LT_OPTIMIZE_BRANCH is an implementation of what is suggested in Pettis' + /// paper (PLDI '90) about block reordering, trying to minimize branch + /// mispredictions. + LT_OPTIMIZE_BRANCH, + /// LT_OPTIMIZE_CACHE piggybacks on the idea from Ispike paper (CGO '04) + /// that suggests putting frequently executed chains first in the layout. + LT_OPTIMIZE_CACHE, + /// Create clusters and use random order for them. 
+ LT_OPTIMIZE_SHUFFLE, + }; + +private: + // Function size, in number of BBs, above which we fallback to a heuristic + // solution to the layout problem instead of seeking the optimal one. + static constexpr uint64_t FUNC_SIZE_THRESHOLD = 10; + + void modifyFunctionLayout(BinaryFunction &Function, + LayoutType Type, + bool MinBranchClusters, + bool Split) const; + + /// Split function in two: a part with warm or hot BBs and a part with never + /// executed BBs. The cold part is moved to a new BinaryFunction. + void splitFunction(BinaryFunction &Function) const; + +public: explicit ReorderBasicBlocks(const cl::opt &PrintPass) : BinaryFunctionPass(PrintPass) { } From 5fa8477436f7fc393aa4e646c93bf66fc4f4c2ec Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 6 Nov 2017 21:04:28 -0800 Subject: [PATCH 329/904] [BOLT] Always call fixBranches in relocation mode. Summary: If you attempted to use a function filter on a binary when in relocation mode, the resulting binary would probably crash. This is because we weren't calling fixBranches on all functions. This was breaking bughunter.sh I also strengthened the validation of basic blocks. The cond branch should always be non-null when there are two successors. 
(cherry picked from commit e2de416fea7d6852af110084d8450261c9292c95) --- bolt/BinaryBasicBlock.cpp | 2 +- bolt/Passes/BinaryPasses.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index e2294df5bbdf..6b38b39c7460 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -104,7 +104,7 @@ bool BinaryBasicBlock::validateSuccessorInvariants() { break; case 2: Valid = - (!CondBranch || + (CondBranch && (TBB == getConditionalSuccessor(true)->getLabel() && ((!UncondBranch && !FBB) || (UncondBranch && diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index f2f999063300..68d70826b9a4 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -516,7 +516,7 @@ void FixupBranches::runOnFunctions( std::set &) { for (auto &It : BFs) { auto &Function = It.second; - if (shouldOptimize(Function)) { + if (opts::Relocs || shouldOptimize(Function)) { Function.fixBranches(); } } From 0c731a823dd171c3ac455e800cfd9f3656c1be42 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 7 Nov 2017 11:27:35 -0800 Subject: [PATCH 330/904] [BOLT] Fix BOLT build Summary: The latest change to MCInstrAnalysis broke then clang build. This fixes it. 
(cherry picked from commit 818900b92dc862d47546381f7121eedc9e31719a) --- bolt/Passes/BinaryPasses.h | 2 +- bolt/Passes/HFSortPlus.cpp | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index 5de2099d6381..1c5539199063 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -71,7 +71,7 @@ class DynoStatsPrintPass : public BinaryFunctionPass { , Title(Title) { } - const char *getName() const { + const char *getName() const override { return "print dyno-stats after optimizations"; } diff --git a/bolt/Passes/HFSortPlus.cpp b/bolt/Passes/HFSortPlus.cpp index ad041b7e7fbe..53bd062a844f 100644 --- a/bolt/Passes/HFSortPlus.cpp +++ b/bolt/Passes/HFSortPlus.cpp @@ -116,10 +116,7 @@ class AdjacencyMatrix { void dump(const Cluster *A) const { outs() << "Cluster " << A->id() << ":"; - forallAdjacent(A, - [this,A](const Cluster *B) { - outs() << " " << B->id(); - }); + forallAdjacent(A, [](const Cluster *B) { outs() << " " << B->id(); }); } void dump() const { From b75278231d8d2ec123670e904d23169b90163c2e Mon Sep 17 00:00:00 2001 From: spupyrev Date: Fri, 27 Oct 2017 21:15:57 -0700 Subject: [PATCH 331/904] improving hfsort+ algorithm Summary: A few improvements for hfsort+ algorithm. The goal of the diff is (i) to make the resulting function order more i-cache "friendly" and (ii) fix a bug with incorrect input edge weights. A specific list of changes is as follows: - The "samples" field of CallGraph.Node should be at least the sum of incoming edge weights. Fixed with a new method CallGraph::adjustArcWeights() - A new optimization pass for hfsort+ in which pairs of functions that call each other with very high probability (>=0.99) are always merged. This improves the resulting i-cache but may worsen i-TLB. 
See a new method HFSortPlus::runPassOne() - Adjusted optimization goal to make the resulting ordering more i-cache "friendly", see HFSortPlus::expectedCalls and HFSortPlus::mergeGain - Functions w/o samples are now reordered too (they're placed at the end of the list of hot functions). These functions do appear in the call graph, as some of their basic blocks have samples in the LBR dataset. See HfSortPlus::initializeClusters (cherry picked from commit 3426466e2b82eb1650dc5cba2627036e3ae145e4) --- bolt/Passes/CallGraph.cpp | 14 +- bolt/Passes/CallGraph.h | 10 + bolt/Passes/HFSort.h | 2 +- bolt/Passes/HFSortPlus.cpp | 378 +++++++++++++++++++------------ bolt/Passes/ReorderFunctions.cpp | 1 - 5 files changed, 254 insertions(+), 151 deletions(-) diff --git a/bolt/Passes/CallGraph.cpp b/bolt/Passes/CallGraph.cpp index 4533c23d681d..e076f84dddd6 100644 --- a/bolt/Passes/CallGraph.cpp +++ b/bolt/Passes/CallGraph.cpp @@ -94,7 +94,6 @@ const CallGraph::Arc &CallGraph::incArcWeight(NodeId Src, NodeId Dst, double W, } void CallGraph::normalizeArcWeights() { - // Normalize arc weights for (NodeId FuncId = 0; FuncId < numNodes(); ++FuncId) { auto& Func = getNode(FuncId); for (auto Caller : Func.predecessors()) { @@ -108,5 +107,18 @@ void CallGraph::normalizeArcWeights() { } } +void CallGraph::adjustArcWeights() { + for (NodeId FuncId = 0; FuncId < numNodes(); ++FuncId) { + auto& Func = getNode(FuncId); + uint64_t InWeight = 0; + for (auto Caller : Func.predecessors()) { + auto Arc = findArc(Caller, FuncId); + InWeight += (uint64_t)Arc->weight(); + } + if (Func.samples() < InWeight) + setSamples(FuncId, InWeight); + } +} + } } diff --git a/bolt/Passes/CallGraph.h b/bolt/Passes/CallGraph.h index c5df85734d2e..0eeb60c17f51 100644 --- a/bolt/Passes/CallGraph.h +++ b/bolt/Passes/CallGraph.h @@ -153,11 +153,21 @@ class CallGraph { return double(Arcs.size()) / (Nodes.size()*Nodes.size()); } + // Initialize NormalizedWeight field for every arc void normalizeArcWeights(); + // Make 
sure that the sum of incoming arc weights is at least the number of + // samples for every node + void adjustArcWeights(); template void printDot(char* fileName, L getLabel) const; + private: + void setSamples(const NodeId Id, uint64_t Samples) { + assert(Id < Nodes.size()); + Nodes[Id].Samples = Samples; + } + std::vector Nodes; ArcsType Arcs; }; diff --git a/bolt/Passes/HFSort.h b/bolt/Passes/HFSort.h index 9d7c447357c3..7c837e029397 100644 --- a/bolt/Passes/HFSort.h +++ b/bolt/Passes/HFSort.h @@ -103,7 +103,7 @@ std::vector clusterize(const CallGraph &Cg); /* * Optimize function placement for iTLB cache and i-cache. */ -std::vector hfsortPlus(const CallGraph &Cg, +std::vector hfsortPlus(CallGraph &Cg, bool UseGainCache = true, bool UseShortCallCache = true); diff --git a/bolt/Passes/HFSortPlus.cpp b/bolt/Passes/HFSortPlus.cpp index 53bd062a844f..d7006af2d005 100644 --- a/bolt/Passes/HFSortPlus.cpp +++ b/bolt/Passes/HFSortPlus.cpp @@ -27,6 +27,7 @@ +----------------------------------------------------------------------+ */ +#include "BinaryFunction.h" #include "HFSort.h" #include "llvm/ADT/BitVector.h" #include "llvm/Support/Debug.h" @@ -41,8 +42,35 @@ #undef DEBUG_TYPE #define DEBUG_TYPE "hfsort" +using namespace llvm; +using namespace bolt; + namespace opts { -extern llvm::cl::opt Verbosity; +extern cl::OptionCategory BoltCategory; +extern cl::OptionCategory BoltOptCategory; +extern cl::opt Verbosity; + +cl::opt +ITLBPageSizeParam("itlb-page-size", + cl::desc("The size of i-tlb cache page"), + cl::init(4096), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +cl::opt +ITLBEntriesParam("itlb-entries", + cl::desc("The number of entries in i-tlb cache"), + cl::init(16), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +cl::opt +MergeProbability("merge-probability", + cl::desc("The minimum probability of a call for merging two clusters"), + cl::init(0.99), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + } namespace llvm { @@ -50,20 +78,19 @@ namespace bolt { using 
NodeId = CallGraph::NodeId; using Arc = CallGraph::Arc; -using Node = CallGraph::Node; +using Node = CallGraph::Node; namespace { -// The size of a cache page -// Since we optimize both for iTLB cache (2MB pages) and i-cache (64b pages), -// using a value that fits both -constexpr uint32_t PageSize = uint32_t(1) << 12; +constexpr size_t InvalidAddr = -1; -// Capacity of the iTLB cache: larger values yield more iTLB-friendly result, -// while smaller values result in better i-cache performance -constexpr uint32_t ITLBEntries = 16; +// The size of a cache page: Since we optimize both for i-TLB cache (2MB pages) +// and i-cache (64b pages), using a value that fits both +int32_t ITLBPageSize; -constexpr size_t InvalidAddr = -1; +// Capacity of the iTLB cache: Larger values yield more iTLB-friendly result, +// while smaller values result in better i-cache performance +int32_t ITLBEntries; const char* cacheKindString(bool UseGainCache, bool UseShortCallCache) { if (UseGainCache && UseShortCallCache) @@ -80,7 +107,7 @@ const char* cacheKindString(bool UseGainCache, bool UseShortCallCache) { // processed. It is used to invalidate cache entries when merging // Clusters and for visiting all neighbors of any given Cluster. 
class AdjacencyMatrix { -public: + public: AdjacencyMatrix(const CallGraph &Cg, std::vector &Clusters, const std::vector &FuncCluster) @@ -126,7 +153,7 @@ class AdjacencyMatrix { outs() << "\n"; } } -private: + private: void set(const Cluster *A, const Cluster *B, bool Value) { assert(A != B); Bits[A->id()][B->id()] = Value; @@ -139,11 +166,17 @@ class AdjacencyMatrix { for (auto Succ : Cg.successors(TargetId)) { auto *B = FuncCluster[Succ]; if (!B || B == A) continue; + const auto &Arc = *Cg.findArc(TargetId, Succ); + if (Arc.weight() <= 0.0) continue; + set(A, B, true); } for (auto Pred : Cg.predecessors(TargetId)) { auto *B = FuncCluster[Pred]; if (!B || B == A) continue; + const auto &Arc = *Cg.findArc(Pred, TargetId); + if (Arc.weight() <= 0.0) continue; + set(A, B, true); } } @@ -225,8 +258,8 @@ double density(const Cluster *ClusterPred, const Cluster *ClusterSucc) { bool compareClusters(const Cluster *C1, const Cluster *C2) { const double D1 = C1->density(); const double D2 = C2->density(); - // making sure the sorting is deterministic if (D1 != D2) return D1 > D2; + // making sure the sorting is deterministic if (C1->size() != C2->size()) return C1->size() < C2->size(); if (C1->samples() != C2->samples()) return C1->samples() > C2->samples(); return C1->target(0) < C2->target(0); @@ -241,6 +274,7 @@ bool compareClusterPairs(const Cluster *A1, const Cluster *B1, const auto D1 = density(A1, B1); const auto D2 = density(A2, B2); if (D1 != D2) return D1 > D2; + // making sure the sorting is deterministic const auto Size1 = A1->size() + B1->size(); const auto Size2 = A2->size() + B2->size(); if (Size1 != Size2) return Size1 < Size2; @@ -260,36 +294,45 @@ std::vector sortByDensity(const C &Clusters_) { return Clusters; } -/* - * The probability that a page with a given weight is not present in the cache. 
- * - * Assume that the hot functions are called in a random order; then the - * probability of a TLB page being accessed after a function call is - * p=pageSamples/totalSamples. The probability that the page is not accessed - * is (1-p), and the probability that it is not in the cache (i.e. not accessed - * during the last kITLBEntries function calls) is (1-p)^kITLBEntries - */ -double expectedCalls(int64_t SrcAddr, int64_t DstAddr, double EdgeWeight) { - const auto Dist = std::abs(SrcAddr - DstAddr); - if (Dist > PageSize) { - return 0; - } - return (double(PageSize - Dist) / PageSize) * EdgeWeight; -} - /* * HFSortPlus - layout of hot functions with iTLB cache optimization + * + * Given an ordering of hot functions (and hence, their assignment to the + * iTLB pages), we can divide all functions calls into two categories: + * - 'short' ones that have a caller-callee distance less than a page; + * - 'long' ones where the distance exceeds a page. + * The short calls are likely to result in a iTLB cache hit. For the long ones, + * the hit/miss result depends on the 'hotness' of the page (i.e., how often + * the page is accessed). Assuming that functions are sent to the iTLB cache + * in a random order, the probability that a page is present in the cache is + * proportional to the number of samples corresponding to the functions on the + * page. The following algorithm detects short and long calls, and optimizes + * the expected number of cache misses for the long ones. */ class HFSortPlus { public: /* - * The probability that a page with a given weight is not present in the cache. 
+ * The expected number of calls on different i-TLB pages for an arc of the + * call graph with a specified weight + */ + double expectedCalls(int64_t SrcAddr, int64_t DstAddr, double Weight) const { + const auto Dist = std::abs(SrcAddr - DstAddr); + if (Dist > ITLBPageSize) + return 0; + + double X = double(Dist) / double(ITLBPageSize); + // Increasing the importance of shorter calls + return (1.0 - X * X) * Weight; + } + + /* + * The probability that a page with a given weight is not present in the cache * * Assume that the hot functions are called in a random order; then the - * probability of a TLB page being accessed after a function call is + * probability of a i-TLB page being accessed after a function call is * p=pageSamples/totalSamples. The probability that the page is not accessed * is (1-p), and the probability that it is not in the cache (i.e. not accessed - * during the last kITLBEntries function calls) is (1-p)^kITLBEntries + * during the last ITLBEntries function calls) is (1-p)^ITLBEntries */ double missProbability(double PageSamples) const { double P = PageSamples / TotalSamples; @@ -299,76 +342,9 @@ class HFSortPlus { return pow(1.0 - P, X); } - /* - * Expected hit ratio of the iTLB cache under the given order of clusters - * - * Given an ordering of hot functions (and hence, their assignment to the - * iTLB pages), we can divide all functions calls into two categories: - * - 'short' ones that have a caller-callee distance less than a page; - * - 'long' ones where the distance exceeds a page. - * The short calls are likely to result in a iTLB cache hit. For the long ones, - * the hit/miss result depends on the 'hotness' of the page (i.e., how often - * the page is accessed). Assuming that functions are sent to the iTLB cache - * in a random order, the probability that a page is present in the cache is - * proportional to the number of samples corresponding to the functions on the - * page. 
The following procedure detects short and long calls, and estimates - * the expected number of cache misses for the long ones. - */ - template - double expectedCacheHitRatio(const C &Clusters_) const { - // sort by density - std::vector Clusters(sortByDensity(Clusters_)); - - // generate function addresses with an alignment - std::vector Addr(Cg.numNodes(), InvalidAddr); - size_t CurAddr = 0; - // 'hotness' of the pages - std::vector PageSamples; - for (auto Cluster : Clusters) { - for (auto TargetId : Cluster->targets()) { - if (CurAddr & 0xf) CurAddr = (CurAddr & ~0xf) + 16; - Addr[TargetId] = CurAddr; - CurAddr += Cg.size(TargetId); - // update page weight - size_t Page = Addr[TargetId] / PageSize; - while (PageSamples.size() <= Page) PageSamples.push_back(0.0); - PageSamples[Page] += Cg.samples(TargetId); - } - } - - // computing expected number of misses for every function - double Misses = 0; - for (auto Cluster : Clusters) { - for (auto TargetId : Cluster->targets()) { - size_t Page = Addr[TargetId] / PageSize; - double Samples = Cg.samples(TargetId); - // probability that the page is not present in the cache - double MissProb = missProbability(PageSamples[Page]); - - for (auto Pred : Cg.predecessors(TargetId)) { - if (Cg.samples(Pred) == 0) continue; - const auto &Arc = *Cg.findArc(Pred, TargetId); - - // the source page - size_t SrcPage = (Addr[Pred] + (size_t)Arc.avgCallOffset()) / PageSize; - if (Page != SrcPage) { - // this is a miss - Misses += Arc.weight() * MissProb; - } - Samples -= Arc.weight(); - } - - // the remaining samples come from the jitted code - Misses += Samples * MissProb; - } - } - - return 100.0 * (1.0 - Misses / TotalSamples); - } - /* * The expected number of calls within a given cluster with both endpoints on - * the same TLB cache page + * the same cache page */ double shortCalls(const Cluster *Cluster) const { if (UseShortCallCache) { @@ -400,7 +376,7 @@ class HFSortPlus { /* * The number of calls between the two clusters with 
both endpoints on - * the same TLB page, assuming that a given pair of clusters gets merged + * the same i-TLB page, assuming that a given pair of clusters gets merged */ double shortCalls(const Cluster *ClusterPred, const Cluster *ClusterSucc) const { @@ -464,24 +440,24 @@ class HFSortPlus { // cache misses on the first cluster double LongCallsPred = ClusterPred->samples() - shortCalls(ClusterPred); - double ProbPred = missProbability(ClusterPred->density() * PageSize); + double ProbPred = missProbability(ClusterPred->density() * ITLBPageSize); double ExpectedMissesPred = LongCallsPred * ProbPred; // cache misses on the second cluster double LongCallsSucc = ClusterSucc->samples() - shortCalls(ClusterSucc); - double ProbSucc = missProbability(ClusterSucc->density() * PageSize); + double ProbSucc = missProbability(ClusterSucc->density() * ITLBPageSize); double ExpectedMissesSucc = LongCallsSucc * ProbSucc; // cache misses on the merged cluster double LongCallsNew = LongCallsPred + LongCallsSucc - shortCalls(ClusterPred, ClusterSucc); double NewDensity = density(ClusterPred, ClusterSucc); - double ProbNew = missProbability(NewDensity * PageSize); + double ProbNew = missProbability(NewDensity * ITLBPageSize); double MissesNew = LongCallsNew * ProbNew; double Gain = ExpectedMissesPred + ExpectedMissesSucc - MissesNew; // scaling the result to increase the importance of merging short clusters - Gain /= (ClusterPred->size() + ClusterSucc->size()); + Gain /= std::min(ClusterPred->size(), ClusterSucc->size()); if (UseGainCache) { Cache.set(ClusterPred, ClusterSucc, Gain); @@ -491,26 +467,100 @@ class HFSortPlus { } /* - * Run hfsort+ algorithm and return ordered set of function clusters. 
+ * For every active cluster, compute its total weight of outgoing edges */ - std::vector run() { - DEBUG(dbgs() << "Starting hfsort+ w/" - << cacheKindString(UseGainCache, UseShortCallCache) - << " for " << Clusters.size() << " clusters\n" - << format("Initial expected iTLB cache hit ratio: %.4lf\n", - expectedCacheHitRatio(Clusters))); + std::unordered_map computeOutgoingWeight() { + std::unordered_map OutWeight; + for (auto ClusterPred : Clusters) { + double Weight = 0; + for (auto TargetId : ClusterPred->targets()) { + for (auto Succ : Cg.successors(TargetId)) { + auto *ClusterSucc = FuncCluster[Succ]; + if (!ClusterSucc || ClusterSucc == ClusterPred) + continue; + const auto &Arc = *Cg.findArc(TargetId, Succ); + Weight += Arc.weight(); + } + } + OutWeight[ClusterPred] += Weight; + } + return OutWeight; + } + + /* + * Find pairs of clusters that call each other with high probability + */ + std::vector> findClustersToMerge() { + // compute total weight of outgoing edges for every cluster + auto OutWeight = computeOutgoingWeight(); + + std::vector> PairsToMerge; + std::unordered_set ClustersToMerge; + for (auto ClusterPred : Clusters) { + for (auto TargetId : ClusterPred->targets()) { + for (auto Succ : Cg.successors(TargetId)) { + auto *ClusterSucc = FuncCluster[Succ]; + if (!ClusterSucc || ClusterSucc == ClusterPred) + continue; + + const auto &Arc = *Cg.findArc(TargetId, Succ); - int Steps = 0; - // merge pairs of clusters while there is an improvement + const double CallsFromPred = OutWeight[ClusterPred]; + const double CallsToSucc = ClusterSucc->samples(); + const double CallsPredSucc = Arc.weight(); + + // probability that the first cluster is calling the second one + const double ProbOut = + CallsFromPred > 0 ? CallsPredSucc / CallsFromPred : 0; + assert(0.0 <= ProbOut && ProbOut <= 1.0 && "incorrect probability"); + + // probability that the second cluster is called from the first one + const double ProbIn = + CallsToSucc > 0 ? 
CallsPredSucc / CallsToSucc : 0; + assert(0.0 <= ProbIn && ProbIn <= 1.0 && "incorrect probability"); + + if (std::min(ProbOut, ProbIn) >= opts::MergeProbability) { + if (ClustersToMerge.count(ClusterPred) == 0 && + ClustersToMerge.count(ClusterSucc) == 0) { + PairsToMerge.push_back(std::make_pair(ClusterPred, ClusterSucc)); + ClustersToMerge.insert(ClusterPred); + ClustersToMerge.insert(ClusterSucc); + } + } + } + } + } + + return PairsToMerge; + } + + /* + * Run the first optimization pass of the hfsort+ algorithm: + * Merge clusters that call each other with high probability + */ + void runPassOne() { while (Clusters.size() > 1) { - DEBUG( - if (Steps % 500 == 0) { - dbgs() << format("step = %d clusters = %lu expected_hit_rate = %.4lf\n", - Steps, Clusters.size(), - expectedCacheHitRatio(Clusters)); - }); - ++Steps; + // pairs of clusters that will be merged on this iteration + auto PairsToMerge = findClustersToMerge(); + + // stop the pass when there are no pairs to merge + if (PairsToMerge.empty()) + break; + // merge the pairs of clusters + for (auto &Pair : PairsToMerge) { + mergeClusters(Pair.first, Pair.second); + } + } + } + + /* + * Run the second optimization pass of the hfsort+ algorithm: + * Merge pairs of clusters while there is an improvement in the + * expected cache miss ratio + */ + void runPassTwo() { + while (Clusters.size() > 1) { Cluster *BestClusterPred = nullptr; Cluster *BestClusterSucc = nullptr; double BestGain = -1; @@ -520,8 +570,8 @@ class HFSortPlus { ClusterPred, // find the best candidate [&](Cluster *ClusterSucc) { - assert(ClusterPred != ClusterSucc); - // get a cost of merging two clusters + assert(ClusterPred != ClusterSucc && "loop edges are not supported"); + // compute the gain of merging two clusters const double Gain = mergeGain(ClusterPred, ClusterSucc); // breaking ties by density to make the hottest clusters be merged first @@ -537,22 +587,37 @@ class HFSortPlus { }); } - if (BestGain <= 0.0) break; + // stop merging 
when there is no improvement + if (BestGain <= 0.0) + break; // merge the best pair of clusters mergeClusters(BestClusterPred, BestClusterSucc); - - // remove BestClusterSucc from the list of active clusters - auto Iter = std::remove(Clusters.begin(), Clusters.end(), BestClusterSucc); - Clusters.erase(Iter, Clusters.end()); } + } - DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size() << " clusters\n" - << format("Final expected iTLB cache hit ratio: %.4lf\n", - expectedCacheHitRatio(Clusters))); + /* + * Run hfsort+ algorithm and return ordered set of function clusters. + */ + std::vector run() { + DEBUG(dbgs() << "Starting hfsort+ w/" + << cacheKindString(UseGainCache, UseShortCallCache) + << " for " << Clusters.size() << " clusters " + << "with ITLBPageSize = " << ITLBPageSize << ", " + << "ITLBEntries = " << ITLBEntries << ", " + << "and MergeProbability = " << opts::MergeProbability << "\n"); + + + // Pass 1 + runPassOne(); + + // Pass 2 + runPassTwo(); + + DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size() << " clusters\n"); // Return the set of clusters that are left, which are the ones that - // didn't get merged (so their first func is its original func). + // didn't get merged (so their first func is its original func) std::vector Result; for (auto Cluster : sortByDensity(Clusters)) { Result.emplace_back(std::move(*Cluster)); @@ -578,17 +643,21 @@ class HFSortPlus { ShortCallPairCache(Clusters.size()) { } private: - // Initialize the set of active clusters, function id to cluster mapping, - // total number of samples and function addresses. + + /* + * Initialize the set of active clusters, function id to cluster mapping, + * total number of samples and function addresses. 
+ */ std::vector initializeClusters() { - std::vector Clusters; + ITLBPageSize = opts::ITLBPageSizeParam; + ITLBEntries = opts::ITLBEntriesParam; + // Initialize clusters + std::vector Clusters; Clusters.reserve(Cg.numNodes()); AllClusters.reserve(Cg.numNodes()); - - for (NodeId F = 0; F < Cg.numNodes(); F++) { + for (NodeId F = 0; F < Cg.numNodes(); ++F) { AllClusters.emplace_back(F, Cg.getNode(F)); - if (Cg.samples(F) == 0) continue; Clusters.emplace_back(&AllClusters[F]); Clusters.back()->setId(Clusters.size() - 1); FuncCluster[F] = &AllClusters[F]; @@ -600,7 +669,7 @@ class HFSortPlus { } /* - * Merge cluster From into cluster Into. + * Merge cluster From into cluster Into and update the list of active clusters */ void mergeClusters(Cluster *Into, Cluster *From) { DEBUG( @@ -621,9 +690,18 @@ class HFSortPlus { FuncCluster[TargetId] = Into; Addr[TargetId] = CurAddr; CurAddr += Cg.size(TargetId); + // Functions are aligned in the output binary, + // replicating the effect here using BinaryFunction::MinAlign + const auto Align = BinaryFunction::MinAlign; + CurAddr = ((CurAddr + Align - 1) / Align) * Align; } + // Update caches invalidateCaches(Into); + + // Remove cluster From from the list of active clusters + auto Iter = std::remove(Clusters.begin(), Clusters.end(), From); + Clusters.erase(Iter, Clusters.end()); } /* @@ -643,13 +721,13 @@ class HFSortPlus { } } - // the call graph + // The call graph const CallGraph &Cg; - // All clusters. + // All clusters std::vector AllClusters; - // target_id => cluster + // Target_id => cluster std::vector FuncCluster; // current address of the function from the beginning of its cluster @@ -658,17 +736,17 @@ class HFSortPlus { // the total number of samples in the graph double TotalSamples; - // All clusters with non-zero number of samples. This vector gets + // All clusters with non-zero number of samples. This vector gets // udpated at runtime when clusters are merged. std::vector Clusters; - // Cluster adjacency matrix. 
+ // Cluster adjacency matrix AdjacencyMatrix Adjacent; - // Use cache for mergeGain results. + // Use cache for mergeGain results bool UseGainCache; - // Use caches for shortCalls results. + // Use caches for shortCalls results bool UseShortCallCache; // A cache that keeps precomputed values of mergeGain for pairs of clusters; @@ -686,9 +764,13 @@ class HFSortPlus { } -std::vector hfsortPlus(const CallGraph &Cg, +std::vector hfsortPlus(CallGraph &Cg, bool UseGainCache, bool UseShortCallCache) { + // It is required that the sum of incoming arc weights is not greater + // than the number of samples for every function. + // Ensuring the call graph obeys the property before running the algorithm. + Cg.adjustArcWeights(); return HFSortPlus(Cg, UseGainCache, UseShortCallCache).run(); } diff --git a/bolt/Passes/ReorderFunctions.cpp b/bolt/Passes/ReorderFunctions.cpp index d90e621c7649..4676c1c2fa8a 100644 --- a/bolt/Passes/ReorderFunctions.cpp +++ b/bolt/Passes/ReorderFunctions.cpp @@ -145,7 +145,6 @@ void ReorderFunctions::reorder(std::vector &&Clusters, // Set order of hot functions based on clusters. for (const auto& Cluster : Clusters) { for (const auto FuncId : Cluster.targets()) { - assert(Cg.samples(FuncId) > 0); Cg.nodeIdToFunc(FuncId)->setIndex(Index++); FuncAddr[FuncId] = TotalSize; TotalSize += Cg.size(FuncId); From 023c78084b10083722f2abdf8ede76f04e72e9b0 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Thu, 24 Aug 2017 14:37:35 -0700 Subject: [PATCH 332/904] [BOLT-AArch64] Support rewriting bzip2 Summary: Add basic AArch64 read/write capability to be able to disassemble bzip2 for AArch64 compiled with gcc 5.4.0 and write it back after going through the basic BOLT pipeline with no block reordering (NOPs/unreachable blocks get removed). This is not for relocation mode. 
(cherry picked from commit 500d874c8e4e916d9639c15ae1578b1c7ae9d90a) --- bolt/BinaryFunction.cpp | 37 ++++++++++++++++++------------------ bolt/Passes/BinaryPasses.cpp | 8 ++++---- bolt/Passes/StokeInfo.cpp | 2 +- bolt/RewriteInstance.cpp | 8 ++++++-- bolt/RewriteInstance.h | 7 ++++--- 5 files changed, 33 insertions(+), 29 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index e38f57caf49e..03a926665135 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -775,13 +775,13 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { Labels[0] = Ctx->createTempSymbol("BB0", false); addEntryPointAtOffset(0); - auto handleRIPOperand = + auto handlePCRelOperand = [&](MCInst &Instruction, uint64_t Address, uint64_t Size) { uint64_t TargetAddress{0}; MCSymbol *TargetSymbol{nullptr}; if (!MIA->evaluateMemOperandTarget(Instruction, TargetAddress, Address, Size)) { - errs() << "BOLT-ERROR: rip-relative operand can't be evaluated:\n"; + errs() << "BOLT-ERROR: PC-relative operand can't be evaluated:\n"; BC.InstPrinter->printInst(&Instruction, errs(), "", *BC.STI); errs() << '\n'; Instruction.dump_pretty(errs(), BC.InstPrinter.get()); @@ -790,7 +790,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { } if (TargetAddress == 0) { if (opts::Verbosity >= 1) { - outs() << "BOLT-INFO: rip-relative operand is zero in function " + outs() << "BOLT-INFO: PC-relative operand is zero in function " << *this << ".\n"; } } @@ -816,8 +816,11 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { if (!TargetSymbol) TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, "DATAat"); MIA->replaceMemOperandDisp( - Instruction, MCOperand::createExpr(MCSymbolRefExpr::create( - TargetSymbol, MCSymbolRefExpr::VK_None, *BC.Ctx))); + Instruction, MCOperand::createExpr(BC.MIA->getTargetExprFor( + Instruction, + MCSymbolRefExpr::create( + TargetSymbol, MCSymbolRefExpr::VK_None, *BC.Ctx), + *BC.Ctx))); return true; }; @@ -954,7 +957,7 @@ void 
BinaryFunction::disassemble(ArrayRef FunctionData) { // Assign proper opcode for tail calls, so that they could be // treated as calls. if (!IsCall) { - if (!MIA->convertJmpToTailCall(Instruction)) { + if (!MIA->convertJmpToTailCall(Instruction, BC.Ctx.get())) { assert(IsCondBranch && "unknown tail call instruction"); if (opts::Verbosity >= 2) { errs() << "BOLT-WARNING: conditional tail call detected in " @@ -1007,12 +1010,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { // Add taken branch info. TakenBranches.emplace_back(Offset, TargetAddress - getAddress()); } - Instruction.clear(); - Instruction.addOperand( - MCOperand::createExpr( - MCSymbolRefExpr::create(TargetSymbol, - MCSymbolRefExpr::VK_None, - *Ctx))); + BC.MIA->replaceBranchTarget(Instruction, TargetSymbol, &*Ctx); // Record call offset for profile matching. if (IsCall) { @@ -1036,7 +1034,8 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { llvm_unreachable("unexpected result"); case IndirectBranchType::POSSIBLE_TAIL_CALL: { - auto Result = MIA->convertJmpToTailCall(Instruction); + auto Result = + MIA->convertJmpToTailCall(Instruction, BC.Ctx.get()); (void)Result; assert(Result); } @@ -1053,8 +1052,8 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { }; } // Indirect call. We only need to fix it if the operand is RIP-relative - if (IsSimple && MIA->hasRIPOperand(Instruction)) { - if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { + if (IsSimple && MIA->hasPCRelOperand(Instruction)) { + if (!handlePCRelOperand(Instruction, AbsoluteInstrAddr, Size)) { errs() << "BOLT-ERROR: cannot handle RIP operand at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ". 
Skipping function " << *this << ".\n"; @@ -1065,8 +1064,8 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { } } } else { - if (MIA->hasRIPOperand(Instruction)) { - if (!handleRIPOperand(Instruction, AbsoluteInstrAddr, Size)) { + if (MIA->hasPCRelOperand(Instruction)) { + if (!handlePCRelOperand(Instruction, AbsoluteInstrAddr, Size)) { errs() << "BOLT-ERROR: cannot handle RIP operand at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ". Skipping function " << *this << ".\n"; @@ -1152,7 +1151,7 @@ bool BinaryFunction::postProcessIndirectBranches() { // If there's an indirect branch in a single-block function - // it must be a tail call. if (layout_size() == 1) { - BC.MIA->convertJmpToTailCall(Instr); + BC.MIA->convertJmpToTailCall(Instr, BC.Ctx.get()); return true; } @@ -1231,7 +1230,7 @@ bool BinaryFunction::postProcessIndirectBranches() { } return false; } - BC.MIA->convertJmpToTailCall(Instr); + BC.MIA->convertJmpToTailCall(Instr, BC.Ctx.get()); } } return true; diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 68d70826b9a4..cae8fc694614 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -1005,12 +1005,12 @@ bool SimplifyRODataLoads::simplifyRODataLoads( // Try to statically evaluate the target memory address; uint64_t TargetAddress; - if (MIA->hasRIPOperand(Inst)) { - // Try to find the symbol that corresponds to the RIP-relative operand. + if (MIA->hasPCRelOperand(Inst)) { + // Try to find the symbol that corresponds to the PC-relative operand. auto DispOpI = MIA->getMemOperandDisp(Inst); - assert(DispOpI != Inst.end() && "expected RIP-relative displacement"); + assert(DispOpI != Inst.end() && "expected PC-relative displacement"); assert(DispOpI->isExpr() && - "found RIP-relative with non-symbolic displacement"); + "found PC-relative with non-symbolic displacement"); // Get displacement symbol. 
const MCSymbolRefExpr *DisplExpr; diff --git a/bolt/Passes/StokeInfo.cpp b/bolt/Passes/StokeInfo.cpp index 54a879b6de9d..4890bdfbeaf2 100644 --- a/bolt/Passes/StokeInfo.cpp +++ b/bolt/Passes/StokeInfo.cpp @@ -65,7 +65,7 @@ void StokeInfo::checkInstr(const BinaryContext &BC, const BinaryFunction &BF, // check if this function modify stack or heap // TODO: more accurate analysis auto IsPush = BC.MIA->isPush(It); - auto IsRipAddr = BC.MIA->hasRIPOperand(It); + auto IsRipAddr = BC.MIA->hasPCRelOperand(It); if (IsPush) { FuncInfo.StackOut = true; } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index e6274566bd94..2b3759015d85 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -675,7 +675,10 @@ void RewriteInstance::aggregateData() { void RewriteInstance::discoverStorage() { - EFMM.reset(new ExecutableFileMemoryManager()); + // Tell EE that we guarantee we don't need stubs for x86, but not for aarch64 + EFMM.reset(new ExecutableFileMemoryManager( + /*AllowStubs*/ (BC->TheTriple->getArch() == llvm::Triple::aarch64 && + opts::Relocs))); auto ELF64LEFile = dyn_cast(InputFile); if (!ELF64LEFile) { @@ -1224,7 +1227,8 @@ void RewriteInstance::discoverFileObjects() { } // Process PLT section. - disassemblePLT(); + if (BC->TheTriple->getArch() == Triple::x86_64) + disassemblePLT(); // See if we missed any functions marked by FDE. for (const auto &FDEI : CFIRdWrt->getFDEs()) { diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index f557e6fa3fa4..ca3a106dc56c 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -104,6 +104,8 @@ class ExecutableFileMemoryManager : public SectionMemoryManager { bool IsCode, bool IsReadOnly); + bool AllowStubs; + public: /// [start memory address] -> [segment info] mapping. std::map SegmentMapInfo; @@ -114,7 +116,7 @@ class ExecutableFileMemoryManager : public SectionMemoryManager { /// Information about non-allocatable sections. 
std::map NoteSectionInfo; - ExecutableFileMemoryManager() {} + ExecutableFileMemoryManager(bool AllowStubs) : AllowStubs(AllowStubs) {} ~ExecutableFileMemoryManager(); @@ -136,8 +138,7 @@ class ExecutableFileMemoryManager : public SectionMemoryManager { unsigned Alignment, unsigned SectionID, StringRef SectionName) override; - // Tell EE that we guarantee we don't need stubs. - bool allowStubAllocation() const override { return false; } + bool allowStubAllocation() const override { return AllowStubs; } bool finalizeMemory(std::string *ErrMsg = nullptr) override; }; From bcc8990506389d46acf204061b5f9628ed4cf621 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Thu, 31 Aug 2017 11:45:37 -0700 Subject: [PATCH 333/904] [BOLT-AArch64] Support reordering bzip2 no relocs Summary: Add functionality to support reordering bzip2 compiled to AArch64, with function splitting but without relocations: * Expand the AArch64 backend to support inverting branches and analyzing branches so BOLT reordering machinery is able to shuffle blocks and fix branches correctly; * Add a new pass named LongJmp to add stubs whenever code needs to jump to the cold area, when using function splitting, because of the limited target encoding capability in AArch64 (as a RISC architecture). 
(cherry picked from commit 3162c20429b9fe24b697338f320bcc24b3f42d62) --- bolt/BinaryBasicBlock.cpp | 15 ++ bolt/BinaryBasicBlock.h | 10 + bolt/BinaryContext.cpp | 23 +++ bolt/BinaryContext.h | 9 + bolt/BinaryFunction.cpp | 2 +- bolt/BinaryFunction.h | 14 +- bolt/BinaryPassManager.cpp | 14 ++ bolt/Passes/BinaryPasses.cpp | 2 +- bolt/Passes/CMakeLists.txt | 1 + bolt/Passes/LongJmp.cpp | 365 +++++++++++++++++++++++++++++++++++ bolt/Passes/LongJmp.h | 116 +++++++++++ bolt/RewriteInstance.cpp | 22 +-- 12 files changed, 571 insertions(+), 22 deletions(-) create mode 100644 bolt/Passes/LongJmp.cpp create mode 100644 bolt/Passes/LongJmp.h diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 6b38b39c7460..9ea9cf11bd1d 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -138,6 +138,21 @@ BinaryBasicBlock *BinaryBasicBlock::getSuccessor(const MCSymbol *Label) const { return nullptr; } +BinaryBasicBlock * +BinaryBasicBlock::getSuccessor(const MCSymbol *Label, + BinaryBranchInfo &BI) const { + auto BIIter = branch_info_begin(); + for (BinaryBasicBlock *BB : successors()) { + if (BB->getLabel() == Label) { + BI = *BIIter; + return BB; + } + ++BIIter; + } + + return nullptr; +} + BinaryBasicBlock *BinaryBasicBlock::getLandingPad(const MCSymbol *Label) const { for (BinaryBasicBlock *BB : landing_pads()) { if (BB->getLabel() == Label) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index be9a8235fa51..08a6a7e142e6 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -333,6 +333,10 @@ class BinaryBasicBlock { /// return the successor. BinaryBasicBlock *getSuccessor(const MCSymbol *Label = nullptr) const; + /// Return the related branch info as well as the successor. 
+ BinaryBasicBlock *getSuccessor(const MCSymbol *Label, + BinaryBranchInfo &BI) const; + /// If the basic block ends with a conditional branch (possibly followed by /// an unconditional branch) and thus has 2 successors, return a successor /// corresponding to a jump condition which could be true or false. @@ -637,6 +641,12 @@ class BinaryBasicBlock { return Instructions.erase(II); } + /// Erase all instructions + void clear() { + Instructions.clear(); + NumPseudos = 0; + } + /// Retrieve iterator for \p Inst or return end iterator if instruction is not /// from this basic block. decltype(Instructions)::iterator findInstruction(const MCInst *Inst) { diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 5fc289a0aefd..4415857b4338 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -28,6 +28,7 @@ namespace opts { extern cl::OptionCategory BoltCategory; extern cl::opt Relocs; +extern cl::opt ReorderFunctions; static cl::opt PrintDebugInfo("print-debug-info", @@ -186,6 +187,28 @@ unsigned BinaryContext::addDebugFilenameToUnit(const uint32_t DestCUID, return Ctx->getDwarfFile(Dir, FileNames[FileIndex - 1].Name, 0, DestCUID); } +std::vector BinaryContext::getSortedFunctions( + std::map &BinaryFunctions) { + std::vector SortedFunctions(BinaryFunctions.size()); + std::transform(BinaryFunctions.begin(), BinaryFunctions.end(), + SortedFunctions.begin(), + [](std::pair &BFI) { + return &BFI.second; + }); + + if (opts::ReorderFunctions != BinaryFunction::RT_NONE) { + std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(), + [](const BinaryFunction *A, const BinaryFunction *B) { + if (A->hasValidIndex() && B->hasValidIndex()) { + return A->getIndex() < B->getIndex(); + } else { + return A->hasValidIndex(); + } + }); + } + return SortedFunctions; +} + void BinaryContext::preprocessDebugInfo( std::map &BinaryFunctions) { // Populate MCContext with DWARF files. 
diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index e22d246905c7..1fccb073d9cc 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -149,6 +149,11 @@ class BinaryContext { /// Number of functions with profile information uint64_t NumProfiledFuncs{0}; + /// Track next available address for new allocatable sections. RewriteInstance + /// sets this prior to running BOLT passes, so layout passes are aware of the + /// final addresses functions will have. + uint64_t LayoutStartAddress{0}; + /// True if the binary requires immediate relocation processing. bool RequiresZNow{false}; @@ -272,6 +277,10 @@ class BinaryContext { const uint32_t SrcCUID, unsigned FileIndex); + /// Return functions in output layout order + static std::vector + getSortedFunctions(std::map &BinaryFunctions); + /// Compute the native code size for a range of instructions. /// Note: this can be imprecise wrt the final binary since happening prior to /// relaxation, as well as wrt the original binary because of opcode diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 03a926665135..e65b90225522 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -3969,7 +3969,7 @@ DynoStats BinaryFunction::getDynoStats() const { BC.MIA->getAnnotationWithDefault(Instr, "CTCTakenCount"); } Stats[DynoStats::FUNCTION_CALLS] += CallFreq; - if (BC.MIA->getMemoryOperandNo(Instr) != -1) { + if (BC.MIA->isIndirectCall(Instr)) { Stats[DynoStats::INDIRECT_CALLS] += CallFreq; } else if (const auto *CallSymbol = BC.MIA->getTargetSymbol(Instr)) { const auto *BF = BC.getFunctionForSymbol(CallSymbol); diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 058c4fac6b30..04b3a51697fe 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -711,7 +711,7 @@ class BinaryFunction { assert(InstructionOffsets.size() == Instructions.size() && "There must be one instruction at every offset."); Instructions.emplace_back(std::forward(Instruction)); - 
InstructionOffsets[Offset] = Instructions.size() - 1; + InstructionOffsets[Offset] = Instructions.size() - 1; } /// Return instruction at a given offset in the function. Valid before @@ -1863,6 +1863,18 @@ class BinaryFunction { return Estimate; } + size_t estimateColdSize() const { + if (!isSplit()) + return estimateSize(); + size_t Estimate = 0; + for (const auto *BB : BasicBlocksLayout) { + if (BB->isCold()) { + Estimate += BC.computeCodeSize(BB->begin(), BB->end()); + } + } + return Estimate; + } + size_t estimateSize() const { size_t Estimate = 0; for (const auto *BB : BasicBlocksLayout) { diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 4d23fdf5bf0f..c322b0d69d6c 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -14,6 +14,7 @@ #include "Passes/FrameOptimizer.h" #include "Passes/IndirectCallPromotion.h" #include "Passes/Inliner.h" +#include "Passes/LongJmp.h" #include "Passes/PLTCall.h" #include "Passes/ReorderFunctions.h" #include "Passes/StokeInfo.h" @@ -105,6 +106,13 @@ PrintFinalized("print-finalized", cl::Hidden, cl::cat(BoltOptCategory)); +static cl::opt +PrintLongJmp("print-longjmp", + cl::desc("print functions after longjmp pass"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + static cl::opt PrintICF("print-icf", cl::desc("print functions after ICF optimization"), @@ -396,6 +404,12 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(llvm::make_unique(PrintFOP)); + // Thighten branches according to offset differences between branch and + // targets. No extra instructions after this pass, otherwise we may have + // relocations out of range and crash during linking. + if (BC.TheTriple->getArch() == llvm::Triple::aarch64) + Manager.registerPass(llvm::make_unique(PrintLongJmp)); + // This pass turns tail calls into jumps which makes them invisible to // function reordering. It's unsafe to use any CFG or instruction analysis // after this point. 
diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index cae8fc694614..43021aa13d01 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -233,7 +233,7 @@ void OptimizeBodylessFunctions::optimizeCalls(BinaryFunction &BF, << ": replacing call to " << OriginalTarget->getName() << " by call to " << Target->getName() << " while folding " << CallSites << " call sites\n"); - BC.MIA->replaceCallTargetOperand(Inst, Target, BC.Ctx.get()); + BC.MIA->replaceBranchTarget(Inst, Target, BC.Ctx.get()); NumOptimizedCallSites += CallSites; if (BB->hasProfile()) { diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt index 8619ad21f0c4..5f66de76826f 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/Passes/CMakeLists.txt @@ -13,6 +13,7 @@ add_llvm_library(LLVMBOLTPasses IndirectCallPromotion.cpp Inliner.cpp LivenessAnalysis.cpp + LongJmp.cpp MCF.cpp PettisAndHansen.cpp PLTCall.cpp diff --git a/bolt/Passes/LongJmp.cpp b/bolt/Passes/LongJmp.cpp new file mode 100644 index 000000000000..412c71a6c6ec --- /dev/null +++ b/bolt/Passes/LongJmp.cpp @@ -0,0 +1,365 @@ +//===--- Passes/LongJmp.cpp -----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "LongJmp.h" + +#define DEBUG_TYPE "longjmp" + +using namespace llvm; + +namespace opts { +extern cl::opt Relocs; +} + +namespace llvm { +namespace bolt { + +namespace { +constexpr unsigned ColdFragAlign = 16; + +std::pair, MCSymbol *> +createNewStub(const BinaryContext &BC, BinaryFunction &Func, + const MCSymbol *TgtSym) { + auto *StubSym = BC.Ctx->createTempSymbol("Stub", true); + auto StubBB = Func.createBasicBlock(0, StubSym); + + std::vector Seq; + BC.MIA->createLongJmp(Seq, TgtSym, BC.Ctx.get()); + StubBB->addInstructions(Seq.begin(), Seq.end()); + StubBB->setExecutionCount(0); + return std::make_pair(std::move(StubBB), StubSym); +} + +void shrinkStubToShortJmp(const BinaryContext &BC, BinaryBasicBlock &StubBB, + const MCSymbol *Tgt) { + std::vector Seq; + BC.MIA->createShortJmp(Seq, Tgt, BC.Ctx.get()); + StubBB.clear(); + StubBB.addInstructions(Seq.begin(), Seq.end()); +} + +void shrinkStubToSingleInst(const BinaryContext &BC, BinaryBasicBlock &StubBB, + const MCSymbol *Tgt) { + MCInst Inst; + BC.MIA->createUncondBranch(Inst, Tgt, BC.Ctx.get()); + StubBB.clear(); + StubBB.addInstruction(Inst); +} + +BinaryBasicBlock *getBBAtHotColdSplitPoint(BinaryFunction &Func) { + if (!Func.isSplit() || Func.empty()) + return nullptr; + + assert(!(*Func.begin()).isCold() && "Entry cannot be cold"); + for (auto I = Func.layout_begin(), E = Func.layout_end(); I != E; ++I) { + auto Next = std::next(I); + if (Next != E && (*Next)->isCold()) + return *I; + } + llvm_unreachable("No hot-colt split point found"); +} +} + +std::unique_ptr +LongJmpPass::replaceTargetWithStub(const BinaryContext &BC, + BinaryFunction &Func, BinaryBasicBlock &BB, + MCInst &Inst) { + std::unique_ptr NewBB; + auto TgtSym = BC.MIA->getTargetSymbol(Inst); + assert (TgtSym && "getTargetSymbol failed"); + + 
BinaryBasicBlock::BinaryBranchInfo BI{0, 0}; + auto *TgtBB = BB.getSuccessor(TgtSym, BI); + // Do not issue a long jmp for blocks in the same region + if (TgtBB && TgtBB->isCold() == BB.isCold()) + return nullptr; + + BinaryBasicBlock *StubBB = + BB.isCold() ? ColdStubs[&Func][TgtSym] : HotStubs[&Func][TgtSym]; + MCSymbol *StubSymbol = StubBB ? StubBB->getLabel() : nullptr; + + if (!StubBB) { + std::tie(NewBB, StubSymbol) = createNewStub(BC, Func, TgtSym); + StubBB = NewBB.get(); + Stubs[&Func].insert(StubBB); + } + + // Local branch + if (TgtBB) { + uint64_t OrigCount{BI.Count}; + uint64_t OrigMispreds{BI.MispredictedCount}; + BB.replaceSuccessor(TgtBB, StubBB, OrigCount, OrigMispreds); + StubBB->setExecutionCount(StubBB->getExecutionCount() + OrigCount); + if (NewBB) { + StubBB->addSuccessor(TgtBB, OrigCount, OrigMispreds); + StubBB->setIsCold(BB.isCold()); + } + // Call / tail call + } else { + StubBB->setExecutionCount(StubBB->getExecutionCount() + + BB.getExecutionCount()); + if (NewBB) { + assert(TgtBB == nullptr); + StubBB->setIsCold(BB.isCold()); + // Set as entry point because this block is valid but we have no preds + StubBB->setEntryPoint(true); + } + } + BC.MIA->replaceBranchTarget(Inst, StubSymbol, BC.Ctx.get()); + ++StubRefCount[StubBB]; + StubBits[StubBB] = BC.AsmInfo->getPointerSize() * 8; + + if (NewBB) { + if (BB.isCold()) + ColdStubs[&Func][TgtSym] = StubBB; + else + HotStubs[&Func][TgtSym] = StubBB; + } + + return NewBB; +} + +namespace { + +bool shouldInsertStub(const BinaryContext &BC, const MCInst &Inst) { + return (BC.MIA->isBranch(Inst) || BC.MIA->isCall(Inst)) && + !BC.MIA->isIndirectBranch(Inst) && !BC.MIA->isIndirectCall(Inst); +} + +} + +void LongJmpPass::insertStubs(const BinaryContext &BC, BinaryFunction &Func) { + std::vector>> + Insertions; + + BinaryBasicBlock *Frontier = getBBAtHotColdSplitPoint(Func); + + for (auto &BB : Func) { + for (auto &Inst : BB) { + // Only analyze direct branches with target distance constraints + if 
(!shouldInsertStub(BC, Inst)) + continue; + + // Insert stubs close to the patched BB if call, but far away from the + // hot path if a branch, since this branch target is the cold region + BinaryBasicBlock *InsertionPoint = &BB; + if (!BC.MIA->isCall(Inst) && Frontier && !BB.isCold()) + InsertionPoint = Frontier; + // Create a stub to handle a far-away target + Insertions.emplace_back(std::make_pair( + InsertionPoint, replaceTargetWithStub(BC, Func, BB, Inst))); + } + } + + for (auto &Elmt : Insertions) { + if (!Elmt.second) + continue; + std::vector> NewBBs; + NewBBs.emplace_back(std::move(Elmt.second)); + Func.insertBasicBlocks(Elmt.first, std::move(NewBBs), true, true); + } + +} + +void LongJmpPass::tentativeBBLayout(const BinaryContext &BC, + const BinaryFunction &Func) { + uint64_t HotDot = HotAddresses[&Func]; + uint64_t ColdDot = ColdAddresses[&Func]; + bool Cold{false}; + for (auto *BB : Func.layout()) { + if (Cold || BB->isCold()) { + Cold = true; + BBAddresses[BB] = ColdDot; + ColdDot += BC.computeCodeSize(BB->begin(), BB->end()); + } else { + BBAddresses[BB] = HotDot; + HotDot += BC.computeCodeSize(BB->begin(), BB->end()); + } + } +} + +void LongJmpPass::tentativeLayout( + const BinaryContext &BC, + std::vector &SortedFunctions) { + assert(!opts::Relocs && "Pass is incompatible with relocs"); + uint64_t DotAddress = BC.LayoutStartAddress; + + for (auto Func : SortedFunctions) { + HotAddresses[Func] = Func->getAddress(); + DotAddress = RoundUpToAlignment(DotAddress, ColdFragAlign); + ColdAddresses[Func] = DotAddress; + if (Func->isSplit()) + DotAddress += Func->estimateColdSize(); + tentativeBBLayout(BC, *Func); + } +} + +void LongJmpPass::removeStubRef(const BinaryContext &BC, + BinaryBasicBlock *BB, MCInst &Inst, + BinaryBasicBlock *StubBB, + const MCSymbol *Target, + BinaryBasicBlock *TgtBB) { + BC.MIA->replaceBranchTarget(Inst, Target, BC.Ctx.get()); + + --StubRefCount[StubBB]; + assert(StubRefCount[StubBB] >= 0 && "Ref count is lost"); + + if (TgtBB 
&& BB->isSuccessor(StubBB)) { + const auto &BI = BB->getBranchInfo(*StubBB); + uint64_t OrigCount{BI.Count}; + uint64_t OrigMispreds{BI.MispredictedCount}; + BB->replaceSuccessor(StubBB, TgtBB, OrigCount, OrigMispreds); + } + + if (StubRefCount[StubBB] == 0) { + // Remove the block from CFG + StubBB->removeSuccessors(StubBB->succ_begin(), StubBB->succ_end()); + StubBB->markValid(false); + StubBB->setEntryPoint(false); + } +} + +bool LongJmpPass::usesStub(const BinaryContext &BC, const BinaryFunction &Func, + const MCInst &Inst) const { + auto TgtSym = BC.MIA->getTargetSymbol(Inst); + auto *TgtBB = Func.getBasicBlockForLabel(TgtSym); + auto Iter = Stubs.find(&Func); + if (Iter != Stubs.end()) + return Iter->second.count(TgtBB); + return false; +} + +uint64_t LongJmpPass::getSymbolAddress(const BinaryContext &BC, + const MCSymbol *Target, + const BinaryBasicBlock *TgtBB) const { + if (TgtBB) { + auto Iter = BBAddresses.find(TgtBB); + assert (Iter != BBAddresses.end() && "Unrecognized local BB"); + return Iter->second; + } + auto *TargetFunc = BC.getFunctionForSymbol(Target); + auto Iter = HotAddresses.find(TargetFunc); + if (Iter == HotAddresses.end()) { + // Look at BinaryContext's resolution for this symbol - this is a symbol not + // mapped to a BinaryFunction + auto SymIter = BC.GlobalSymbols.find(Target->getName()); + assert (SymIter != BC.GlobalSymbols.end() && "Unrecognized symbol"); + return SymIter->second; + } + return Iter->second; +} + +bool LongJmpPass::removeOrShrinkStubs(const BinaryContext &BC, + BinaryFunction &Func) { + bool Modified{false}; + + assert(BC.TheTriple->getArch() == llvm::Triple::aarch64 && + "Unsupported arch"); + constexpr auto InsnSize = 4; // AArch64 + // Remove unnecessary stubs for branch targets we know we can fit in the + // instruction + for (auto &BB : Func) { + uint64_t DotAddress = BBAddresses[&BB]; + for (auto &Inst : BB) { + if (!shouldInsertStub(BC, Inst) || !usesStub(BC, Func, Inst)) { + DotAddress += InsnSize; + 
continue; + } + + auto StubSym = BC.MIA->getTargetSymbol(Inst); + auto *StubBB = Func.getBasicBlockForLabel(StubSym); + auto *RealTargetSym = BC.MIA->getTargetSymbol(*StubBB->begin()); + auto *TgtBB = Func.getBasicBlockForLabel(RealTargetSym); + auto BitsAvail = BC.MIA->getPCRelEncodingSize(Inst) - 1; + uint64_t Mask = ~((1ULL << BitsAvail) - 1); + uint64_t Offset = getSymbolAddress(BC, RealTargetSym, TgtBB); + if (DotAddress > Offset) + Offset = DotAddress - Offset; + else + Offset -= DotAddress; + // If it fits in the original instr, remove the stub + if (!(Offset & Mask)) { + removeStubRef(BC, &BB, Inst, StubBB, RealTargetSym, TgtBB); + Modified = true; + } + DotAddress += InsnSize; + } + } + + auto RangeShortJmp = BC.MIA->getShortJmpEncodingSize(); + auto RangeSingleInstr = BC.MIA->getUncondBranchEncodingSize(); + uint64_t ShortJmpMask = ~((1ULL << RangeShortJmp) - 1); + uint64_t SingleInstrMask = ~((1ULL << (RangeSingleInstr - 1)) - 1); + // Shrink stubs from 64 to 32 or 28 bit whenever possible + for (auto &BB : Func) { + if (!Stubs[&Func].count(&BB) || !BB.isValid()) + continue; + + auto Bits = StubBits[&BB]; + // Already working with the tightest range? + if (Bits == RangeSingleInstr) + continue; + + // Attempt to tight to short jmp + auto *RealTargetSym = BC.MIA->getTargetSymbol(*BB.begin()); + auto *TgtBB = Func.getBasicBlockForLabel(RealTargetSym); + uint64_t DotAddress = BBAddresses[&BB]; + uint64_t TgtAddress = getSymbolAddress(BC, RealTargetSym, TgtBB); + if (TgtAddress & ShortJmpMask) + continue; + + // Attempt to tight to pc-relative single-instr branch + uint64_t PCRelTgtAddress = TgtAddress > DotAddress + ? 
TgtAddress - DotAddress + : DotAddress - TgtAddress; + if (PCRelTgtAddress & SingleInstrMask) { + if (Bits > RangeShortJmp) { + shrinkStubToShortJmp(BC, BB, RealTargetSym); + StubBits[&BB] = RangeShortJmp; + Modified = true; + } + continue; + } + + if (Bits > RangeSingleInstr) { + shrinkStubToSingleInst(BC, BB, RealTargetSym); + StubBits[&BB] = RangeSingleInstr; + Modified = true; + } + } + return Modified; +} + +void LongJmpPass::runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) { + auto Sorted = BinaryContext::getSortedFunctions(BFs); + for (auto Func : Sorted) { + insertStubs(BC, *Func); + Func->fixBranches(); + } + + bool Modified; + do { + Modified = false; + tentativeLayout(BC, Sorted); + for (auto Func : Sorted) { + if (removeOrShrinkStubs(BC, *Func)) { + Func->eraseInvalidBBs(); + Func->fixBranches(); + Modified = true; + } + } + } while (Modified); +} + +} +} diff --git a/bolt/Passes/LongJmp.h b/bolt/Passes/LongJmp.h new file mode 100644 index 000000000000..9c4b6c63e5bc --- /dev/null +++ b/bolt/Passes/LongJmp.h @@ -0,0 +1,116 @@ +//===--- Passes/LongJmp.h -------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_LONGJMP_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_LONGJMP_H + +#include "BinaryPasses.h" + +namespace llvm { +namespace bolt { + +/// LongJmp is veneer-insertion pass originally written for AArch64 that +/// compensates for its short-range branches, typically done during linking. We +/// pull this pass inside BOLT because here we can do a better job at stub +/// inserting by manipulating the CFG, something linkers can't do. 
+/// +/// LongJmp is a two-step process. In the first step, when function sizes are +/// still unknown because we can insert an arbitrary amount of code to reach +/// far-away code, this pass expands all PC-relative instructions that refer to +/// a symbol at an unknown location likely to violate the branch range. +/// This expansion inserts the equivalent of "linker stubs", small +/// blocks of code that load a 64-bit address into a pre-allocated register and +/// then execute an unconditional indirect branch on this register. By using a +/// 64-bit range, we guarantee it can reach any code location. +/// +/// In the second step, we iteratively repeat the following until no +/// modification is done: we do a tentative layout with the current function +/// sizes; then we remove stubs for branches that we know are close enough to be +/// encoded in a direct branch or a smaller stub (32-bit). +/// +/// Notice that this iteration is possible since step 2 strictly reduces sizes +/// and distances between branches and their destinations. +/// +class LongJmpPass : public BinaryFunctionPass { + using StubMapTy = DenseMap>; + /// Used to quickly fetch stubs based on the target they jump to + StubMapTy HotStubs; + StubMapTy ColdStubs; + + /// Used to quickly identify whether a BB is a stub, sharded by function + DenseMap> Stubs; + + using FuncAddressesMapTy = DenseMap; + /// Hold tentative addresses during step 2 + FuncAddressesMapTy HotAddresses; + FuncAddressesMapTy ColdAddresses; + DenseMap BBAddresses; + + /// Used to remove unused stubs + DenseMap StubRefCount; + /// Used to identify the stub size + DenseMap StubBits; + + /// Replace the target of call or conditional branch in \p Inst with a + /// stub that in turn will branch to the target (perform stub insertion). + /// If a new stub was created, return it. 
+ std::unique_ptr + replaceTargetWithStub(const BinaryContext &BC, BinaryFunction &BF, + BinaryBasicBlock &BB, MCInst &Inst); + + /// -- Step 1 methods -- + /// Process all functions and insert maximum-size stubs so every branch in the + /// program is encodable without violating relocation ranges (relax all + /// branches). + void insertStubs(const BinaryContext &BC, BinaryFunction &BF); + + /// -- Step 2 methods -- + /// Try to do layout before running the emitter, by looking at BinaryFunctions + /// and MCInsts -- this is an estimation. To be correct for longjmp inserter + /// purposes, we need to do a size worst-case estimation. Real layout is done + /// by RewriteInstance::mapFileSections() + void tentativeLayout(const BinaryContext &BC, + std::vector &SortedFunctions); + void tentativeBBLayout(const BinaryContext &BC, const BinaryFunction &Func); + + /// Helper to identify whether \p Inst is branching to a stub + bool usesStub(const BinaryContext &BC, const BinaryFunction &Func, + const MCInst &Inst) const; + + /// Helper to resolve a symbol address according to our tentative layout + uint64_t getSymbolAddress(const BinaryContext &BC, const MCSymbol *Target, + const BinaryBasicBlock *TgtBB) const; + /// Change \p Inst to not use a stub anymore, back to its original form + void removeStubRef(const BinaryContext &BC, + BinaryBasicBlock *BB, MCInst &Inst, + BinaryBasicBlock *StubBB, + const MCSymbol *Target, BinaryBasicBlock *TgtBB); + + /// Step 2 main entry point: Iterate through functions reducing stubs size + /// or completely removing them. 
+ bool removeOrShrinkStubs(const BinaryContext &BC, BinaryFunction &BF); + +public: + /// BinaryPass public interface + + explicit LongJmpPass(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) {} + + const char *getName() const override { return "long-jmp"; } + + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; +} +} + +#endif diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 2b3759015d85..da71de6b21e3 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -76,7 +76,6 @@ extern cl::OptionCategory BoltOutputCategory; extern cl::OptionCategory AggregatorCategory; extern cl::opt JumpTables; -extern cl::opt ReorderFunctions; static cl::opt PrintCacheMetrics("print-cache-metrics", @@ -781,6 +780,7 @@ void RewriteInstance::discoverStorage() { NewTextSegmentAddress = NextAvailableAddress; NewTextSegmentOffset = NextAvailableOffset; + BC->LayoutStartAddress = NextAvailableAddress; } Optional @@ -2265,24 +2265,8 @@ void RewriteInstance::emitFunctions() { Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("__hot_start")); // Sort functions for the output. 
- std::vector SortedFunctions(BinaryFunctions.size()); - std::transform(BinaryFunctions.begin(), - BinaryFunctions.end(), - SortedFunctions.begin(), - [](std::pair &BFI) { - return &BFI.second; - }); - - if (opts::ReorderFunctions != BinaryFunction::RT_NONE) { - std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(), - [](const BinaryFunction *A, const BinaryFunction *B) { - if (A->hasValidIndex() && B->hasValidIndex()) { - return A->getIndex() < B->getIndex(); - } else { - return A->hasValidIndex(); - } - }); - } + std::vector SortedFunctions = + BinaryContext::getSortedFunctions(BinaryFunctions); DEBUG( if (!opts::Relocs) { From 3bcf0ac877ca2b2ea0a206147e3368d7321fa7d3 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 20 Sep 2017 10:43:01 -0700 Subject: [PATCH 334/904] [BOLT-AArch64] Support relocation mode for bzip2 Summary: As we deal with incomplete addresses in address-computing sequences of code in AArch64, we found it is easier to handle them in relocation mode in the presence of relocations. Incomplete addresses may mislead BOLT into thinking there are instructions referring to a basic block when, in fact, this may be the base address of a data reference. If the relocation is present, we can easily spot such cases. This diff contains extensions in relocation mode to understand and deal with AArch64 relocations. It also adds code to process data inside functions as marked by AArch64 ABI (symbol table entries named "$d"). In our code, this is called constant islands handling. Last, it extends bughunter with a "cross" mode, in which the host generates the binaries and the user tests them (uploading to the target), useful when debugging in AArch64. 
(cherry picked from commit 2129321545f1a09dddabf988da42348521a580a4) --- bolt/BinaryContext.cpp | 68 ++++++++++++++++++ bolt/BinaryContext.h | 10 +++ bolt/BinaryFunction.cpp | 123 +++++++++++++++++++++++++++++++- bolt/BinaryFunction.h | 114 ++++++++++++++++++++++++++++++ bolt/Passes/LongJmp.cpp | 74 ++++++++++++++++++-- bolt/Passes/LongJmp.h | 4 ++ bolt/RewriteInstance.cpp | 147 +++++++++++++++++++++++++++------------ bolt/RewriteInstance.h | 14 ---- 8 files changed, 488 insertions(+), 66 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 4415857b4338..0cc4aadcb0d7 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -489,13 +489,71 @@ size_t Relocation::getSizeForType(uint64_t Type) { case ELF::R_X86_64_TPOFF32: case ELF::R_X86_64_GOTPCRELX: case ELF::R_X86_64_REX_GOTPCRELX: + case ELF::R_AARCH64_CALL26: + case ELF::R_AARCH64_ADR_PREL_PG_HI21: + case ELF::R_AARCH64_LDST64_ABS_LO12_NC: + case ELF::R_AARCH64_ADD_ABS_LO12_NC: + case ELF::R_AARCH64_LDST32_ABS_LO12_NC: + case ELF::R_AARCH64_LDST8_ABS_LO12_NC: + case ELF::R_AARCH64_ADR_GOT_PAGE: + case ELF::R_AARCH64_LD64_GOT_LO12_NC: + case ELF::R_AARCH64_JUMP26: return 4; case ELF::R_X86_64_PC64: case ELF::R_X86_64_64: + case ELF::R_AARCH64_ABS64: return 8; } } +uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents) { + switch (Type) { + default: + llvm_unreachable("unsupported relocation type"); + case ELF::R_AARCH64_ABS64: + return Contents; + case ELF::R_AARCH64_JUMP26: + case ELF::R_AARCH64_CALL26: + // Immediate goes in bits 25:0 of B and BL. 
+ Contents &= ~0xfffffffffc000000ULL; + return SignExtend64<28>(Contents << 2); + case ELF::R_AARCH64_ADR_GOT_PAGE: + case ELF::R_AARCH64_ADR_PREL_PG_HI21: { + // Bits 32:12 of Symbol address goes in bits 30:29 + 23:5 of ADRP + // instruction + Contents &= ~0xffffffff9f00001fU; + auto LowBits = (Contents >> 29) & 0x3; + auto HighBits = (Contents >> 5) & 0x7ffff; + Contents = LowBits | (HighBits << 2); + return SignExtend64<32>(Contents << 12); + } + case ELF::R_AARCH64_LD64_GOT_LO12_NC: + case ELF::R_AARCH64_LDST64_ABS_LO12_NC: { + // Immediate goes in bits 21:10 of LD/ST instruction, taken + // from bits 11:3 of Symbol address + Contents &= ~0xffffffffffc003ffU; + return Contents >> (10 - 3); + } + case ELF::R_AARCH64_ADD_ABS_LO12_NC: { + // Immediate goes in bits 21:10 of ADD instruction + Contents &= ~0xffffffffffc003ffU; + return Contents >> (10 - 0); + } + case ELF::R_AARCH64_LDST32_ABS_LO12_NC: { + // Immediate goes in bits 21:10 of ADD instruction, taken + // from bits 11:2 of Symbol address + Contents &= ~0xffffffffffc003ffU; + return Contents >> (10 - 2); + } + case ELF::R_AARCH64_LDST8_ABS_LO12_NC: { + // Immediate goes in bits 21:10 of ADD instruction, taken + // from bits 11:0 of Symbol address + Contents &= ~0xffffffffffc003ffU; + return Contents >> (10 - 0); + } + } +} + bool Relocation::isPCRelative(uint64_t Type) { switch (Type) { default: @@ -505,6 +563,12 @@ bool Relocation::isPCRelative(uint64_t Type) { case ELF::R_X86_64_32: case ELF::R_X86_64_32S: case ELF::R_X86_64_TPOFF32: + case ELF::R_AARCH64_ABS64: + case ELF::R_AARCH64_LDST64_ABS_LO12_NC: + case ELF::R_AARCH64_ADD_ABS_LO12_NC: + case ELF::R_AARCH64_LDST32_ABS_LO12_NC: + case ELF::R_AARCH64_LDST8_ABS_LO12_NC: + case ELF::R_AARCH64_LD64_GOT_LO12_NC: return false; case ELF::R_X86_64_PC8: @@ -514,6 +578,10 @@ bool Relocation::isPCRelative(uint64_t Type) { case ELF::R_X86_64_GOTTPOFF: case ELF::R_X86_64_GOTPCRELX: case ELF::R_X86_64_REX_GOTPCRELX: + case ELF::R_AARCH64_CALL26: + case 
ELF::R_AARCH64_ADR_PREL_PG_HI21: + case ELF::R_AARCH64_ADR_GOT_PAGE: + case ELF::R_AARCH64_JUMP26: return true; } } diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 1fccb073d9cc..7bf02a2651b9 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -64,6 +64,11 @@ struct Relocation { /// Return size of the given relocation \p Type. static size_t getSizeForType(uint64_t Type); + /// Extract current relocated value from binary contents. This is used for + /// RISC architectures where values are encoded in specific bits depending + /// on the relocation value. + static uint64_t extractValue(uint64_t Type, uint64_t Contents); + /// Return true if relocation type is PC-relative. Return false otherwise. static bool isPCRelative(uint64_t Type); @@ -154,6 +159,11 @@ class BinaryContext { /// final addresses functions will have. uint64_t LayoutStartAddress{0}; + /// Old .text info. + uint64_t OldTextSectionAddress{0}; + uint64_t OldTextSectionOffset{0}; + uint64_t OldTextSectionSize{0}; + /// True if the binary requires immediate relocation processing. bool RequiresZNow{false}; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index e65b90225522..198f1cf3d27d 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -795,10 +795,21 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { } } + if (BC.TheTriple->getArch() == llvm::Triple::aarch64 && + isInConstantIsland(TargetAddress)) { + TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, "ISLANDat"); + IslandSymbols[TargetAddress - getAddress()] = TargetSymbol; + } + // Note that the address does not necessarily have to reside inside // a section, it could be an absolute address too. auto Section = BC.getSectionForAddress(TargetAddress); - if (Section && Section->isText()) { + // Assume AArch64's ADRP never references code - it does, but this is fixed + // after reading relocations. ADRP contents now are not really meaningful + // without its supporting relocation. 
+ if (!TargetSymbol && Section && Section->isText() && + (BC.TheTriple->getArch() != llvm::Triple::aarch64 || + !BC.MIA->isADRP(Instruction))) { if (containsAddress(TargetAddress)) { if (TargetAddress != getAddress()) { // The address could potentially escape. Mark it as another entry @@ -829,6 +840,16 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { MCInst Instruction; const uint64_t AbsoluteInstrAddr = getAddress() + Offset; + // Check for data inside code and ignore it + if (DataOffsets.find(Offset) != DataOffsets.end()) { + auto Iter = CodeOffsets.upper_bound(Offset); + if (Iter != CodeOffsets.end()) { + Size = *Iter - Offset; + continue; + } + break; + } + if (!BC.DisAsm->getInstruction(Instruction, Size, FunctionData.slice(Offset), @@ -985,10 +1006,16 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { // code without re-assembly. size_t RelSize = (Size < 5) ? 1 : 4; auto RelOffset = Offset + Size - RelSize; + if (BC.TheTriple->getArch() == llvm::Triple::aarch64) { + RelSize = 0; + RelOffset = Offset; + } auto RI = MoveRelocations.find(RelOffset); if (RI == MoveRelocations.end()) { uint64_t RelType = (RelSize == 1) ? ELF::R_X86_64_PC8 : ELF::R_X86_64_PC32; + if (BC.TheTriple->getArch() == llvm::Triple::aarch64) + RelType = ELF::R_AARCH64_CALL26; DEBUG(dbgs() << "BOLT-DEBUG: creating relocation for static" << " function call to " << TargetSymbol->getName() << " at offset 0x" @@ -2485,6 +2512,9 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { LastIsPrefix = BC.MIA->isPrefix(Instr); } } + + if (!EmitColdPart) + emitConstantIslands(Streamer); } void BinaryFunction::emitBodyRaw(MCStreamer *Streamer) { @@ -2545,6 +2575,70 @@ void BinaryFunction::emitBodyRaw(MCStreamer *Streamer) { } } +void BinaryFunction::emitConstantIslands(MCStreamer &Streamer) { + if (DataOffsets.empty()) + return; + + Streamer.EmitLabel(getFunctionConstantIslandLabel()); + // Raw contents of the function. 
+ StringRef SectionContents; + Section.getContents(SectionContents); + + // Raw contents of the function. + StringRef FunctionContents = + SectionContents.substr(getAddress() - Section.getAddress(), + getMaxSize()); + + if (opts::Verbosity) + outs() << "BOLT-INFO: emitting constant island for function " << *this + << "\n"; + + auto IS = IslandSymbols.begin(); + + // We split the island into smaller blocks and output labels between them. + for (auto DataIter = DataOffsets.begin(); DataIter != DataOffsets.end(); + ++DataIter) { + uint64_t FunctionOffset = *DataIter; + uint64_t EndOffset = 0ULL; + + // Determine size of this data chunk + auto NextData = std::next(DataIter); + auto CodeIter = CodeOffsets.lower_bound(*DataIter); + if (CodeIter == CodeOffsets.end() && NextData == DataOffsets.end()) { + EndOffset = getMaxSize(); + } else if (CodeIter == CodeOffsets.end()) { + EndOffset = *NextData; + } else if (NextData == DataOffsets.end()) { + EndOffset = *CodeIter; + } else { + EndOffset = (*CodeIter > *NextData) ? 
*NextData : *CodeIter; + } + + if (FunctionOffset == EndOffset) + continue; // Size is zero, nothing to emit + + // Emit labels and data + while (IS != IslandSymbols.end() && IS->first < EndOffset) { + auto NextStop = IS->first; + assert(NextStop <= EndOffset && "internal overflow error"); + if (FunctionOffset < NextStop) { + Streamer.EmitBytes(FunctionContents.slice(FunctionOffset, NextStop)); + FunctionOffset = NextStop; + } + DEBUG(dbgs() << "BOLT-DEBUG: emitted label " << IS->second->getName() + << " at offset 0x" << Twine::utohexstr(IS->first) << '\n'); + Streamer.EmitLabel(IS->second); + ++IS; + } + assert(FunctionOffset <= EndOffset && "overflow error"); + if (FunctionOffset < EndOffset) { + Streamer.EmitBytes(FunctionContents.slice(FunctionOffset, EndOffset)); + } + } + + assert(IS == IslandSymbols.end() && "some symbols were not emitted!"); +} + namespace { #ifndef MAX_PATH @@ -3334,10 +3428,37 @@ BinaryBasicBlock *BinaryFunction::splitEdge(BinaryBasicBlock *From, return NewBBPtr; } +bool BinaryFunction::isDataMarker(const SymbolRef &Symbol, + uint64_t SymbolSize) const { + // For aarch64, the ABI defines mapping symbols so we identify data in the + // code section (see IHI0056B). $d identifies a symbol starting data contents. + if (BC.TheTriple->getArch() == llvm::Triple::aarch64 && + Symbol.getType() == SymbolRef::ST_Unknown && + SymbolSize == 0 && + (!Symbol.getName().getError() && *Symbol.getName() == "$d")) + return true; + return false; +} + +bool BinaryFunction::isCodeMarker(const SymbolRef &Symbol, + uint64_t SymbolSize) const { + // For aarch64, the ABI defines mapping symbols so we identify data in the + // code section (see IHI0056B). $x identifies a symbol starting code or the + // end of a data chunk inside code. 
+ if (BC.TheTriple->getArch() == llvm::Triple::aarch64 && + Symbol.getType() == SymbolRef::ST_Unknown && + SymbolSize == 0 && + (!Symbol.getName().getError() && *Symbol.getName() == "$x")) + return true; + return false; +} + bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol, uint64_t SymbolSize) const { // Some symbols are tolerated inside function bodies, others are not. // The real function boundaries may not be known at this point. + if (isDataMarker(Symbol, SymbolSize) || isCodeMarker(Symbol, SymbolSize)) + return true; // It's okay to have a zero-sized symbol in the middle of non-zero-sized // function. diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 04b3a51697fe..fd43bf33e38c 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -415,6 +415,18 @@ class BinaryFunction { /// Temporary holder of offsets that are potentially entry points. std::unordered_set EntryOffsets; + /// Temporary holder of offsets that are data markers (used in AArch) + /// It is possible to have data in code sections. To ease the identification + /// of data in code sections, the ABI requires the symbol table to have + /// symbols named "$d" identifying the start of data inside code and "$x" + /// identifying the end of a chunk of data inside code. DataOffsets contain + /// all offsets of $d symbols and CodeOffsets all offsets of $x symbols. + std::set DataOffsets; + std::set CodeOffsets; + /// The address offset where we emitted the constant island, that is, the + /// chunk of data in the function code area (AArch only) + int64_t OutputDataOffset; + /// Map labels to corresponding basic blocks. std::unordered_map LabelToBB; @@ -621,6 +633,10 @@ class BinaryFunction { /// Offsets in function that should have PC-relative relocation. 
std::set PCRelativeRelocationOffsets; + /// Offsets in function that are data values in a constant island identified + /// after disassembling + std::map IslandSymbols; + // Blocks are kept sorted in the layout order. If we need to change the // layout (if BasicBlocksLayout stores a different order than BasicBlocks), // the terminating instructions need to be modified. @@ -657,6 +673,8 @@ class BinaryFunction { /// Symbol at the end of the cold part of split function. mutable MCSymbol *FunctionColdEndLabel{nullptr}; + mutable MCSymbol *FunctionConstantIslandLabel{nullptr}; + /// Unique number associated with the function. uint64_t FunctionNumber; @@ -690,6 +708,16 @@ class BinaryFunction { /// of the function. MCSymbol *getOrCreateLocalLabel(uint64_t Address, bool CreatePastEnd = false); + /// Register an entry point at a given \p Offset into the function. + void markDataAtOffset(uint64_t Offset) { + DataOffsets.emplace(Offset); + } + + /// Register an entry point at a given \p Offset into the function. + void markCodeAtOffset(uint64_t Offset) { + CodeOffsets.emplace(Offset); + } + /// Register an entry point at a given \p Offset into the function. MCSymbol *addEntryPointAtOffset(uint64_t Offset) { EntryOffsets.emplace(Offset); @@ -1097,6 +1125,17 @@ class BinaryFunction { return FunctionColdEndLabel; } + /// Return a label used to identify where the constant island was emitted + /// (AArch only). This is used to update the symbol table accordingly, + /// emitting data marker symbols as required by the ABI. + MCSymbol *getFunctionConstantIslandLabel() const { + if (!FunctionConstantIslandLabel) { + FunctionConstantIslandLabel = + BC.Ctx->createTempSymbol("func_const_island", true); + } + return FunctionConstantIslandLabel; + } + /// Return true if this is a function representing a PLT entry. 
bool isPLTFunction() const { return PLTSymbol != nullptr; @@ -1126,6 +1165,13 @@ class BinaryFunction { case ELF::R_X86_64_32: case ELF::R_X86_64_32S: case ELF::R_X86_64_64: + case ELF::R_AARCH64_ABS64: + case ELF::R_AARCH64_LDST64_ABS_LO12_NC: + case ELF::R_AARCH64_ADD_ABS_LO12_NC: + case ELF::R_AARCH64_LDST32_ABS_LO12_NC: + case ELF::R_AARCH64_LDST8_ABS_LO12_NC: + case ELF::R_AARCH64_CALL26: + case ELF::R_AARCH64_ADR_PREL_PG_HI21: Relocations.emplace(Offset, Relocation{Offset, Symbol, RelType, Addend, Value}); break; @@ -1614,6 +1660,71 @@ class BinaryFunction { return ColdLSDASymbol; } + /// True if the symbol is a mapping symbol used in AArch64 to delimit + /// data inside code section. + bool isDataMarker(const SymbolRef &Symbol, uint64_t SymbolSize) const; + bool isCodeMarker(const SymbolRef &Symbol, uint64_t SymbolSize) const; + + void setOutputDataAddress(uint64_t Address) { + OutputDataOffset = Address; + } + + uint64_t getOutputDataAddress() const { + return OutputDataOffset; + } + + /// Detects whether \p Address is inside a data region in this function + /// (constant islands). 
+ bool isInConstantIsland(uint64_t Address) const { + if (Address <= getAddress()) + return false; + + auto Offset = Address - getAddress(); + + if (Offset >= getMaxSize()) + return false; + + auto DataIter = DataOffsets.upper_bound(Offset); + if (DataIter == DataOffsets.begin()) + return false; + DataIter = std::prev(DataIter); + + auto CodeIter = CodeOffsets.upper_bound(Offset); + if (CodeIter == CodeOffsets.begin()) + return true; + + return *std::prev(CodeIter) <= *DataIter; + } + + uint64_t estimateConstantIslandSize() const { + uint64_t Size = 0; + for (auto DataIter = DataOffsets.begin(); DataIter != DataOffsets.end(); + ++DataIter) { + auto NextData = std::next(DataIter); + auto CodeIter = CodeOffsets.lower_bound(*DataIter); + if (CodeIter == CodeOffsets.end() && + NextData == DataOffsets.end()) { + Size += getMaxSize() - *DataIter; + continue; + } + + uint64_t NextMarker; + if (CodeIter == CodeOffsets.end()) + NextMarker = *NextData; + else if (NextData == DataOffsets.end()) + NextMarker = *CodeIter; + else + NextMarker = (*CodeIter > *NextData) ? *NextData : *CodeIter; + + Size += NextMarker - *DataIter; + } + return Size; + } + + bool hasConstantIsland() const { + return !DataOffsets.empty(); + } + /// Return true iff the symbol could be seen inside this function otherwise /// it is probably another function. bool isSymbolValidInScope(const SymbolRef &Symbol, uint64_t SymbolSize) const; @@ -1787,6 +1898,9 @@ class BinaryFunction { /// Emit function as a blob with relocations and labels for relocations. void emitBodyRaw(MCStreamer *Streamer); + /// Helper for emitBody to write data inside a function (used for AArch64) + void emitConstantIslands(MCStreamer &Streamer); + /// Merge profile data of this function into those of the given /// function. The functions should have been proven identical with /// isIdenticalWith. 
diff --git a/bolt/Passes/LongJmp.cpp b/bolt/Passes/LongJmp.cpp index 412c71a6c6ec..c0e35e482b78 100644 --- a/bolt/Passes/LongJmp.cpp +++ b/bolt/Passes/LongJmp.cpp @@ -17,6 +17,9 @@ using namespace llvm; namespace opts { extern cl::opt Relocs; +extern cl::opt UseOldText; +extern cl::opt AlignFunctions; +extern cl::opt AlignFunctionsMaxBytes; } namespace llvm { @@ -24,13 +27,13 @@ namespace bolt { namespace { constexpr unsigned ColdFragAlign = 16; +constexpr unsigned PageAlign = 0x200000; std::pair, MCSymbol *> createNewStub(const BinaryContext &BC, BinaryFunction &Func, const MCSymbol *TgtSym) { auto *StubSym = BC.Ctx->createTempSymbol("Stub", true); auto StubBB = Func.createBasicBlock(0, StubSym); - std::vector Seq; BC.MIA->createLongJmp(Seq, TgtSym, BC.Ctx.get()); StubBB->addInstructions(Seq.begin(), Seq.end()); @@ -186,13 +189,36 @@ void LongJmpPass::tentativeBBLayout(const BinaryContext &BC, } } -void LongJmpPass::tentativeLayout( - const BinaryContext &BC, - std::vector &SortedFunctions) { - assert(!opts::Relocs && "Pass is incompatible with relocs"); - uint64_t DotAddress = BC.LayoutStartAddress; +uint64_t LongJmpPass::tentativeLayoutRelocMode( + const BinaryContext &BC, std::vector &SortedFunctions, + uint64_t DotAddress) { + // Hot for (auto Func : SortedFunctions) { + DotAddress = RoundUpToAlignment(DotAddress, BinaryFunction::MinAlign); + auto Pad = OffsetToAlignment(DotAddress, opts::AlignFunctions); + if (Pad <= opts::AlignFunctionsMaxBytes) + DotAddress += Pad; + HotAddresses[Func] = DotAddress; + DEBUG(dbgs() << Func->getPrintName() + << " tentative: " << Twine::utohexstr(DotAddress) << "\n"); + if (!Func->isSimple()) { + DotAddress += Func->getMaxSize(); + } else { + if (!Func->isSplit()) { + DotAddress += Func->estimateSize(); + } else { + DotAddress += Func->estimateHotSize(); + DotAddress += Func->estimateConstantIslandSize(); + } + } + } + // Cold + for (auto Func : SortedFunctions) { + DotAddress = RoundUpToAlignment(DotAddress, 
BinaryFunction::MinAlign); + auto Pad = OffsetToAlignment(DotAddress, opts::AlignFunctions); + if (Pad <= opts::AlignFunctionsMaxBytes) + DotAddress += Pad; HotAddresses[Func] = Func->getAddress(); DotAddress = RoundUpToAlignment(DotAddress, ColdFragAlign); ColdAddresses[Func] = DotAddress; @@ -200,6 +226,42 @@ void LongJmpPass::tentativeLayout( DotAddress += Func->estimateColdSize(); tentativeBBLayout(BC, *Func); } + return DotAddress; +} + +void LongJmpPass::tentativeLayout( + const BinaryContext &BC, + std::vector &SortedFunctions) { + uint64_t DotAddress = BC.LayoutStartAddress; + + if (!opts::Relocs) { + for (auto Func : SortedFunctions) { + HotAddresses[Func] = Func->getAddress(); + DotAddress = RoundUpToAlignment(DotAddress, 16); + ColdAddresses[Func] = DotAddress; + if (Func->isSplit()) + DotAddress += Func->estimateColdSize(); + tentativeBBLayout(BC, *Func); + } + + return; + } + + // Relocation mode + auto EstimatedTextSize = tentativeLayoutRelocMode(BC, SortedFunctions, 0); + + // Initial padding + if (opts::UseOldText && EstimatedTextSize <= BC.OldTextSectionSize) { + DotAddress = BC.OldTextSectionAddress; + auto Pad = OffsetToAlignment(DotAddress, PageAlign); + if (Pad + EstimatedTextSize <= BC.OldTextSectionSize) { + DotAddress += Pad; + } + } else { + DotAddress = RoundUpToAlignment(BC.LayoutStartAddress, PageAlign); + } + + tentativeLayoutRelocMode(BC, SortedFunctions, DotAddress); } void LongJmpPass::removeStubRef(const BinaryContext &BC, diff --git a/bolt/Passes/LongJmp.h b/bolt/Passes/LongJmp.h index 9c4b6c63e5bc..e54cc1ccb2b7 100644 --- a/bolt/Passes/LongJmp.h +++ b/bolt/Passes/LongJmp.h @@ -79,6 +79,10 @@ class LongJmpPass : public BinaryFunctionPass { /// by RewriteInstance::mapFileSections() void tentativeLayout(const BinaryContext &BC, std::vector &SortedFunctions); + uint64_t + tentativeLayoutRelocMode(const BinaryContext &BC, + std::vector &SortedFunctions, + uint64_t DotAddress); void tentativeBBLayout(const BinaryContext &BC, const 
BinaryFunction &Func); /// Helper to identify whether \p Inst is branching to a stub diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index da71de6b21e3..08f2850ea0d1 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -90,14 +90,14 @@ OutputFilename("o", cl::Required, cl::cat(BoltOutputCategory)); -static cl::opt +cl::opt AlignFunctions("align-functions", cl::desc("align functions at a given value (relocation mode)"), cl::init(64), cl::ZeroOrMore, cl::cat(BoltOptCategory)); -static cl::opt +cl::opt AlignFunctionsMaxBytes("align-functions-max-bytes", cl::desc("maximum number of bytes to use to align functions"), cl::init(32), @@ -406,7 +406,6 @@ size_t padFunction(const BinaryFunction &Function) { } // namespace opts constexpr const char *RewriteInstance::SectionsToOverwrite[]; -constexpr const char *RewriteInstance::SectionsToOverwriteRelocMode[]; const std::string RewriteInstance::OrgSecPrefix = ".bolt.org"; @@ -673,11 +672,12 @@ void RewriteInstance::aggregateData() { } void RewriteInstance::discoverStorage() { - - // Tell EE that we guarantee we don't need stubs for x86, but not for aarch64 + // Stubs are harmful because RuntimeDyld may try to increase the size of + // sections accounting for stubs when we need those sections to match the + // same size seen in the input binary, in case this section is a copy + // of the original one seen in the binary. 
EFMM.reset(new ExecutableFileMemoryManager( - /*AllowStubs*/ (BC->TheTriple->getArch() == llvm::Triple::aarch64 && - opts::Relocs))); + /*AllowStubs*/ false)); auto ELF64LEFile = dyn_cast(InputFile); if (!ELF64LEFile) { @@ -715,9 +715,9 @@ void RewriteInstance::discoverStorage() { StringRef SectionContents; Section.getContents(SectionContents); if (SectionName == ".text") { - OldTextSectionAddress = Section.getAddress(); - OldTextSectionSize = Section.getSize(); - OldTextSectionOffset = + BC->OldTextSectionAddress = Section.getAddress(); + BC->OldTextSectionSize = Section.getSize(); + BC->OldTextSectionOffset = SectionContents.data() - InputFile->getData().data(); } @@ -869,6 +869,22 @@ void RewriteInstance::run() { return; } + // Flip unsupported flags in AArch64 mode + if (BC->TheTriple->getArch() == llvm::Triple::aarch64) { + if (opts::BoostMacroops) { + opts::BoostMacroops = false; + outs() << "BOLT-INFO: disabling -boost-macroops for AArch64\n"; + } + if (opts::Relocs && opts::UseOldText) { + opts::UseOldText = false; + outs() << "BOLT-INFO: disabling -use-old-text for AArch64\n"; + } + if (!opts::Relocs) { + outs() << "BOLT-WARNING: non-relocation mode for AArch64 is not fully " + "supported\n"; + } + } + auto executeRewritePass = [&](const std::set &NonSimpleFunctions) { discoverStorage(); readSpecialSections(); @@ -1020,13 +1036,32 @@ void RewriteInstance::discoverFileObjects() { continue; } - FileSymRefs[Address] = Symbol; + // In aarch, make $x symbols be replaceable by a more meaningful one + // whenever possible + if (BC->TheTriple->getArch() != llvm::Triple::aarch64 || + FileSymRefs.find(Address) == FileSymRefs.end()) { + FileSymRefs[Address] = Symbol; + } else { + if (FileSymRefs[Address].getType() == SymbolRef::ST_Unknown && + *FileSymRefs[Address].getName() == "$x") + FileSymRefs[Address] = Symbol; + else if (Symbol.getType() != SymbolRef::ST_Unknown || + *NameOrError != "$x") + FileSymRefs[Address] = Symbol; + } // There's nothing horribly wrong 
with anonymous symbols, but let's // ignore them for now. if (NameOrError->empty()) continue; + // For aarch64, the ABI defines mapping symbols so we identify data in the + // code section (see IHI0056B). $d identifies data contents. + if (BC->TheTriple->getArch() == llvm::Triple::aarch64 && + Symbol.getType() == SymbolRef::ST_Unknown && + (*NameOrError == "$d" || *NameOrError == "$x")) + continue; + /// It is possible we are seeing a globalized local. LLVM might treat it as /// a local if it has a "private global" prefix, e.g. ".L". Thus we have to /// change the prefix to enforce global scope of the symbol. @@ -1376,16 +1411,21 @@ void RewriteInstance::adjustFunctionBoundaries() { // This is potentially another entry point into the function. auto EntryOffset = NextSymRefI->first - Function.getAddress(); - DEBUG(dbgs() << "BOLT-DEBUG: adding entry point to function " - << Function << " at offset 0x" - << Twine::utohexstr(EntryOffset) << '\n'); - Function.addEntryPointAtOffset(EntryOffset); - - // In non-relocation mode there's potentially an external undetectable - // reference to the entry point and hence we cannot move this entry point. - // Optimizing without moving could be difficult. - if (!opts::Relocs) - Function.setSimple(false); + if (Function.isDataMarker(Symbol, SymbolSize)) { + Function.markDataAtOffset(EntryOffset); + } else if (Function.isCodeMarker(Symbol, SymbolSize)) { + Function.markCodeAtOffset(EntryOffset); + } else { + DEBUG(dbgs() << "BOLT-DEBUG: adding entry point to function " + << Function << " at offset 0x" + << Twine::utohexstr(EntryOffset) << '\n'); + Function.addEntryPointAtOffset(EntryOffset); + // In non-relocation mode there's potentially an external undetectable + // reference to the entry point and hence we cannot move this entry + // point. Optimizing without moving could be difficult. 
+ if (!opts::Relocs) + Function.setSimple(false); + } ++NextSymRefI; } @@ -1662,6 +1702,9 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { DE.getSigned(&RelocationOffset, Relocation::getSizeForType(Rel.getType()))); + if (BC->TheTriple->getArch() == llvm::Triple::aarch64) + ExtractedValue = Relocation::extractValue(Rel.getType(), ExtractedValue); + bool IsPCRelative = Relocation::isPCRelative(Rel.getType()); auto Addend = getRelocationAddend(InputFile, Rel); uint64_t Address = 0; @@ -2420,13 +2463,13 @@ void RewriteInstance::mapFileSections( auto &SI = SMII->second; uint64_t NewTextSectionOffset = 0; - if (opts::UseOldText && SI.Size <= OldTextSectionSize) { + if (opts::UseOldText && SI.Size <= BC->OldTextSectionSize) { outs() << "BOLT-INFO: using original .text for new code\n"; // Utilize the original .text for storage. - NewTextSectionStartAddress = OldTextSectionAddress; - NewTextSectionOffset = OldTextSectionOffset; + NewTextSectionStartAddress = BC->OldTextSectionAddress; + NewTextSectionOffset = BC->OldTextSectionOffset; auto Padding = OffsetToAlignment(NewTextSectionStartAddress, PageAlign); - if (Padding + SI.Size <= OldTextSectionSize) { + if (Padding + SI.Size <= BC->OldTextSectionSize) { outs() << "BOLT-INFO: using 0x200000 alignment\n"; NewTextSectionStartAddress += Padding; NewTextSectionOffset += Padding; @@ -2434,7 +2477,7 @@ void RewriteInstance::mapFileSections( } else { if (opts::UseOldText) { errs() << "BOLT-ERROR: original .text too small to fit the new code. 
" - << SI.Size << " bytes needed, have " << OldTextSectionSize + << SI.Size << " bytes needed, have " << BC->OldTextSectionSize << " bytes available.\n"; } auto Padding = OffsetToAlignment(NewTextSectionStartAddress, PageAlign); @@ -2621,6 +2664,11 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { const auto StartOffset = Layout.getSymbolOffset(*Function.getSymbol()); const auto EndOffset = Layout.getSymbolOffset(*Function.getFunctionEndLabel()); + if (Function.hasConstantIsland()) { + const auto DataOffset = + Layout.getSymbolOffset(*Function.getFunctionConstantIslandLabel()); + Function.setOutputDataAddress(BaseAddress + DataOffset); + } Function.setOutputAddress(BaseAddress + StartOffset); Function.setOutputSize(EndOffset - StartOffset); if (Function.isSplit()) { @@ -3314,9 +3362,6 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { template void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { - if (!opts::Relocs) - return; - auto *Obj = File->getELFFile(); // Set pointer at the end of the output file, so we can pwrite old symbol // tables if we need to. 
@@ -3350,7 +3395,10 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { NewSymbol.getType() != ELF::STT_SECTION) { NewSymbol.st_value = Function->getOutputAddress(); NewSymbol.st_size = Function->getOutputSize(); - NewSymbol.st_shndx = NewTextSectionIndex; + if (opts::Relocs) + NewSymbol.st_shndx = NewTextSectionIndex; + else + NewSymbol.st_shndx = NewSectionIndex[NewSymbol.st_shndx]; if (!PatchExisting && Function->isSplit()) { auto NewColdSym = NewSymbol; SmallVector Buf; @@ -3362,6 +3410,24 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { Write(0, reinterpret_cast(&NewColdSym), sizeof(NewColdSym)); } + if (!PatchExisting && Function->hasConstantIsland()) { + auto DataMark = Function->getOutputDataAddress(); + auto CISize = Function->estimateConstantIslandSize(); + auto CodeMark = DataMark + CISize; + auto DataMarkSym = NewSymbol; + DataMarkSym.st_name = AddToStrTab("$d"); + DataMarkSym.st_value = DataMark; + DataMarkSym.st_size = 0; + DataMarkSym.setType(ELF::STT_NOTYPE); + DataMarkSym.setBinding(ELF::STB_LOCAL); + auto CodeMarkSym = DataMarkSym; + CodeMarkSym.st_name = AddToStrTab("$x"); + CodeMarkSym.st_value = CodeMark; + Write(0, reinterpret_cast(&DataMarkSym), + sizeof(DataMarkSym)); + Write(0, reinterpret_cast(&CodeMarkSym), + sizeof(CodeMarkSym)); + } } else { if (NewSymbol.st_shndx < ELF::SHN_LORESERVE) { NewSymbol.st_shndx = NewSectionIndex[NewSymbol.st_shndx]; @@ -3369,7 +3435,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { // Detect local syms in the text section that we didn't update // and were preserved by the linker to support relocations against // .text (t15274167). Remove then from the symtab. - if (opts::Relocs && NewSymbol.getType() == ELF::STT_NOTYPE && + if (NewSymbol.getType() == ELF::STT_NOTYPE && NewSymbol.getBinding() == ELF::STB_LOCAL && NewSymbol.st_size == 0) { if (auto SecOrErr = @@ -3804,10 +3870,8 @@ void RewriteInstance::rewriteFile() { // Finalize memory image of section string table. 
finalizeSectionStringTable(); - if (opts::Relocs) { - // Update symbol tables. - patchELFSymTabs(); - } + // Update symbol tables. + patchELFSymTabs(); // Copy non-allocatable sections once allocatable part is finished. rewriteNoteSections(); @@ -3927,16 +3991,9 @@ uint64_t RewriteInstance::getFileOffsetForAddress(uint64_t Address) const { } bool RewriteInstance::willOverwriteSection(StringRef SectionName) { - if (opts::Relocs) { - for (auto &OverwriteName : SectionsToOverwriteRelocMode) { - if (SectionName == OverwriteName) - return true; - } - } else { - for (auto &OverwriteName : SectionsToOverwrite) { - if (SectionName == OverwriteName) - return true; - } + for (auto &OverwriteName : SectionsToOverwrite) { + if (SectionName == OverwriteName) + return true; } auto SMII = EFMM->SectionMapInfo.find(SectionName); diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index ca3a106dc56c..a0b8a7bac360 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -404,15 +404,6 @@ class RewriteInstance { /// When updating debug info, these are the sections we overwrite. static constexpr const char *SectionsToOverwrite[] = { - ".shstrtab", - ".debug_aranges", - ".debug_line", - ".debug_loc", - ".debug_ranges", - ".gdb_index", - }; - - static constexpr const char *SectionsToOverwriteRelocMode[] = { ".shstrtab", ".symtab", ".strtab", @@ -458,11 +449,6 @@ class RewriteInstance { uint64_t PHDRTableOffset{0}; unsigned Phnum{0}; - /// Old .text info. - uint64_t OldTextSectionAddress{0}; - uint64_t OldTextSectionOffset{0}; - uint64_t OldTextSectionSize{0}; - /// New code segment info. uint64_t NewTextSegmentAddress{0}; uint64_t NewTextSegmentOffset{0}; From daed529531242949b9c81a603d2687113e45442f Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 6 Nov 2017 11:52:58 -0800 Subject: [PATCH 335/904] [BOLT] Fix implementation for TSP solution Summary: Fix a bug in reconstruction of an optimal path. 
When calculating the best path we need to take into account a path from new "last" node to the current last node. Add "-tsp-threshold" (defaults to 10) to control when the TSP algorithm should be used. (cherry picked from commit 3130d9cddcecdd07eea34f43a6ac65324dbd9cfd) --- bolt/BinaryBasicBlock.h | 28 +++++++------- bolt/Passes/BinaryPasses.cpp | 12 +++++- bolt/Passes/BinaryPasses.h | 4 -- bolt/Passes/ReorderAlgorithm.cpp | 64 +++++++++++++++++--------------- 4 files changed, 59 insertions(+), 49 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 08a6a7e142e6..f619b385d1bf 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -90,7 +90,7 @@ class BinaryBasicBlock { unsigned Index{InvalidIndex}; /// Index in the current layout. - unsigned LayoutIndex{InvalidIndex}; + mutable unsigned LayoutIndex{InvalidIndex}; /// Number of pseudo instructions in this block. uint32_t NumPseudos{0}; @@ -778,6 +778,19 @@ class BinaryBasicBlock { /// Returns an estimate of size of basic block during run time. uint64_t estimateSize() const; + /// Return index in the current layout. The user is responsible for + /// making sure the indices are up to date, + /// e.g. by calling BinaryFunction::updateLayoutIndices(); + unsigned getLayoutIndex() const { + assert(isValid()); + return LayoutIndex; + } + + /// Set layout index. To be used by BinaryFunction. + void setLayoutIndex(unsigned Index) const { + LayoutIndex = Index; + } + private: void adjustNumPseudos(const MCInst &Inst, int Sign); @@ -815,19 +828,6 @@ class BinaryBasicBlock { void setIndex(unsigned I) { Index = I; } - - /// Return index in the current layout. The user is responsible for - /// making sure the indices are up to date, - /// e.g. by calling BinaryFunction::updateLayoutIndices(); - unsigned getLayoutIndex() const { - assert(isValid()); - return LayoutIndex; - } - - /// Set layout index. To be used by BinaryFunction. 
- void setLayoutIndex(unsigned Index) { - LayoutIndex = Index; - } }; bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS); diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 43021aa13d01..4dc00fdc3805 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -169,6 +169,15 @@ SplitEH("split-eh", cl::Hidden, cl::cat(BoltOptCategory)); +static cl::opt +TSPThreshold("tsp-threshold", + cl::desc("maximum number of hot basic blocks in a function for which to use " + "a precise TSP solution while re-ordering basic blocks"), + cl::init(10), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + } // namespace opts namespace llvm { @@ -389,8 +398,7 @@ void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF, if (Type == LT_REVERSE) { Algo.reset(new ReverseReorderAlgorithm()); - } - else if (BF.size() <= FUNC_SIZE_THRESHOLD && Type != LT_OPTIMIZE_SHUFFLE) { + } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) { // Work on optimal solution if problem is small enough DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n"); Algo.reset(new OptimalReorderAlgorithm()); diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index 1c5539199063..5cf91bc387d2 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -174,10 +174,6 @@ class ReorderBasicBlocks : public BinaryFunctionPass { }; private: - // Function size, in number of BBs, above which we fallback to a heuristic - // solution to the layout problem instead of seeking the optimal one. 
- static constexpr uint64_t FUNC_SIZE_THRESHOLD = 10; - void modifyFunctionLayout(BinaryFunction &Function, LayoutType Type, bool MinBranchClusters, diff --git a/bolt/Passes/ReorderAlgorithm.cpp b/bolt/Passes/ReorderAlgorithm.cpp index b475b6f58bc9..6956a8207ba8 100644 --- a/bolt/Passes/ReorderAlgorithm.cpp +++ b/bolt/Passes/ReorderAlgorithm.cpp @@ -396,24 +396,26 @@ void MinBranchGreedyClusterAlgorithm::reset() { } void OptimalReorderAlgorithm::reorderBasicBlocks( - const BinaryFunction &BF, BasicBlockOrder &Order) const { + const BinaryFunction &BF, BasicBlockOrder &Order) const { std::vector> Weight; - std::unordered_map BBToIndex; std::vector IndexToBB; - unsigned N = BF.layout_size(); + const auto N = BF.layout_size(); + assert(N <= std::numeric_limits::digits && + "cannot use TSP solution for sizes larger than bits in uint64_t"); + // Populating weight map and index map - for (auto BB : BF.layout()) { - BBToIndex[BB] = IndexToBB.size(); + for (auto *BB : BF.layout()) { + BB->setLayoutIndex(IndexToBB.size()); IndexToBB.push_back(BB); } Weight.resize(N); - for (auto BB : BF.layout()) { + for (auto *BB : BF.layout()) { auto BI = BB->branch_info_begin(); - Weight[BBToIndex[BB]].resize(N); - for (auto I : BB->successors()) { + Weight[BB->getLayoutIndex()].resize(N); + for (auto *SuccBB : BB->successors()) { if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) - Weight[BBToIndex[BB]][BBToIndex[I]] = BI->Count; + Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count; ++BI; } } @@ -427,26 +429,26 @@ void OptimalReorderAlgorithm::reorderBasicBlocks( DP[1][0] = 0; // Walk through TSP solutions using a bitmask to represent state (current set // of BBs in the layout) - unsigned BestSet = 1; - unsigned BestLast = 0; + uint64_t BestSet = 1; + uint64_t BestLast = 0; int64_t BestWeight = 0; - for (unsigned Set = 1; Set < (1U << N); ++Set) { + for (uint64_t Set = 1; Set < (1ULL << N); ++Set) { // Traverse each possibility of Last BB visited in this layout - for 
(unsigned Last = 0; Last < N; ++Last) { + for (uint64_t Last = 0; Last < N; ++Last) { // Case 1: There is no possible layout with this BB as Last if (DP[Set][Last] == -1) continue; // Case 2: There is a layout with this Set and this Last, and we try // to expand this set with New - for (unsigned New = 1; New < N; ++New) { + for (uint64_t New = 1; New < N; ++New) { // Case 2a: BB "New" is already in this Set - if ((Set & (1 << New)) != 0) + if ((Set & (1ULL << New)) != 0) continue; // Case 2b: BB "New" is not in this set and we add it to this Set and // record total weight of this layout with "New" as the last BB. - unsigned NewSet = (Set | (1 << New)); + uint64_t NewSet = (Set | (1ULL << New)); if (DP[NewSet][New] == -1) DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New]; DP[NewSet][New] = std::max(DP[NewSet][New], @@ -462,38 +464,42 @@ void OptimalReorderAlgorithm::reorderBasicBlocks( } // Define final function layout based on layout that maximizes weight - unsigned Last = BestLast; - unsigned Set = BestSet; + uint64_t Last = BestLast; + uint64_t Set = BestSet; std::vector Visited; Visited.resize(N); Visited[Last] = true; Order.push_back(IndexToBB[Last]); - Set = Set & ~(1U << Last); + Set = Set & ~(1ULL << Last); while (Set != 0) { int64_t Best = -1; - for (unsigned I = 0; I < N; ++I) { + uint64_t NewLast; + for (uint64_t I = 0; I < N; ++I) { if (DP[Set][I] == -1) continue; - if (DP[Set][I] > Best) { - Last = I; - Best = DP[Set][I]; + int64_t AdjWeight = Weight[I][Last] > 0 ? 
Weight[I][Last] : 0; + if (DP[Set][I] + AdjWeight > Best) { + NewLast = I; + Best = DP[Set][I] + AdjWeight; } } + Last = NewLast; Visited[Last] = true; Order.push_back(IndexToBB[Last]); - Set = Set & ~(1U << Last); + Set = Set & ~(1ULL << Last); } std::reverse(Order.begin(), Order.end()); - // Finalize layout with BBs that weren't assigned to the layout - for (auto BB : BF.layout()) { - if (Visited[BBToIndex[BB]] == false) + // Finalize layout with BBs that weren't assigned to the layout using the + // input layout. + for (auto *BB : BF.layout()) { + if (Visited[BB->getLayoutIndex()] == false) Order.push_back(BB); } } void OptimizeReorderAlgorithm::reorderBasicBlocks( - const BinaryFunction &BF, BasicBlockOrder &Order) const { + const BinaryFunction &BF, BasicBlockOrder &Order) const { if (BF.layout_empty()) return; @@ -509,7 +515,7 @@ void OptimizeReorderAlgorithm::reorderBasicBlocks( } void OptimizeBranchReorderAlgorithm::reorderBasicBlocks( - const BinaryFunction &BF, BasicBlockOrder &Order) const { + const BinaryFunction &BF, BasicBlockOrder &Order) const { if (BF.layout_empty()) return; From b8a1ea7f04835e222edc171f0f4a7edf1ed3702d Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 16 Oct 2017 11:12:22 -0700 Subject: [PATCH 336/904] [BOLT-AArch64] Support reordering spec06 gcc relocs Summary: Enhance the basic infrastructure for relocation mode for AArch64 to make a reasonably large program work after reordering (gcc). Detect jump table patterns and skip optimizing functions with jump tables in AArch64, as those will require extra future effort to fully decode. To make these work in relocation mode, we skip changing the function body and introduce a mode to preserve even the original nops. By not changing any local offsets in the function, the input original jump tables should just work. Functions with no jump tables are optimized with BB reordering. No other optimizations have been tested. 
(cherry picked from commit 98be4fddbe5feb89597f1a73ff1c86b39e2b1913) --- bolt/BinaryBasicBlock.h | 1 + bolt/BinaryContext.cpp | 41 ++++++++++-- bolt/BinaryContext.h | 9 ++- bolt/BinaryFunction.cpp | 131 ++++++++++++++++++++++++++++++++------- bolt/BinaryFunction.h | 20 +++++- bolt/Passes/LongJmp.cpp | 10 ++- bolt/RewriteInstance.cpp | 37 ++++++++--- 7 files changed, 206 insertions(+), 43 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index f619b385d1bf..f42a4ceb241a 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -26,6 +26,7 @@ #include namespace llvm { + namespace bolt { class BinaryFunction; diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 0cc4aadcb0d7..c11f96375b09 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -493,11 +493,14 @@ size_t Relocation::getSizeForType(uint64_t Type) { case ELF::R_AARCH64_ADR_PREL_PG_HI21: case ELF::R_AARCH64_LDST64_ABS_LO12_NC: case ELF::R_AARCH64_ADD_ABS_LO12_NC: + case ELF::R_AARCH64_LDST128_ABS_LO12_NC: case ELF::R_AARCH64_LDST32_ABS_LO12_NC: + case ELF::R_AARCH64_LDST16_ABS_LO12_NC: case ELF::R_AARCH64_LDST8_ABS_LO12_NC: case ELF::R_AARCH64_ADR_GOT_PAGE: case ELF::R_AARCH64_LD64_GOT_LO12_NC: case ELF::R_AARCH64_JUMP26: + case ELF::R_AARCH64_PREL32: return 4; case ELF::R_X86_64_PC64: case ELF::R_X86_64_64: @@ -506,26 +509,31 @@ size_t Relocation::getSizeForType(uint64_t Type) { } } -uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents) { +uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents, + uint64_t PC) { switch (Type) { default: llvm_unreachable("unsupported relocation type"); case ELF::R_AARCH64_ABS64: return Contents; + case ELF::R_AARCH64_PREL32: + return static_cast(PC) + SignExtend64<32>(Contents & 0xffffffff); case ELF::R_AARCH64_JUMP26: case ELF::R_AARCH64_CALL26: // Immediate goes in bits 25:0 of B and BL. 
Contents &= ~0xfffffffffc000000ULL; - return SignExtend64<28>(Contents << 2); + return static_cast(PC) + SignExtend64<28>(Contents << 2); case ELF::R_AARCH64_ADR_GOT_PAGE: case ELF::R_AARCH64_ADR_PREL_PG_HI21: { // Bits 32:12 of Symbol address goes in bits 30:29 + 23:5 of ADRP // instruction - Contents &= ~0xffffffff9f00001fU; + Contents &= ~0xffffffff9f00001fUll; auto LowBits = (Contents >> 29) & 0x3; auto HighBits = (Contents >> 5) & 0x7ffff; Contents = LowBits | (HighBits << 2); - return SignExtend64<32>(Contents << 12); + Contents = static_cast(PC) + SignExtend64<32>(Contents << 12); + Contents &= ~0xfffUll; + return Contents; } case ELF::R_AARCH64_LD64_GOT_LO12_NC: case ELF::R_AARCH64_LDST64_ABS_LO12_NC: { @@ -539,12 +547,24 @@ uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents) { Contents &= ~0xffffffffffc003ffU; return Contents >> (10 - 0); } + case ELF::R_AARCH64_LDST128_ABS_LO12_NC: { + // Immediate goes in bits 21:10 of ADD instruction, taken + // from bits 11:4 of Symbol address + Contents &= ~0xffffffffffc003ffU; + return Contents >> (10 - 4); + } case ELF::R_AARCH64_LDST32_ABS_LO12_NC: { // Immediate goes in bits 21:10 of ADD instruction, taken // from bits 11:2 of Symbol address Contents &= ~0xffffffffffc003ffU; return Contents >> (10 - 2); } + case ELF::R_AARCH64_LDST16_ABS_LO12_NC: { + // Immediate goes in bits 21:10 of ADD instruction, taken + // from bits 11:1 of Symbol address + Contents &= ~0xffffffffffc003ffU; + return Contents >> (10 - 1); + } case ELF::R_AARCH64_LDST8_ABS_LO12_NC: { // Immediate goes in bits 21:10 of ADD instruction, taken // from bits 11:0 of Symbol address @@ -554,6 +574,16 @@ uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents) { } } +bool Relocation::isGOT(uint64_t Type) { + switch (Type) { + default: + return false; + case ELF::R_AARCH64_ADR_GOT_PAGE: + case ELF::R_AARCH64_LD64_GOT_LO12_NC: + return true; + } +} + bool Relocation::isPCRelative(uint64_t Type) { switch (Type) { default: @@ 
-566,7 +596,9 @@ bool Relocation::isPCRelative(uint64_t Type) { case ELF::R_AARCH64_ABS64: case ELF::R_AARCH64_LDST64_ABS_LO12_NC: case ELF::R_AARCH64_ADD_ABS_LO12_NC: + case ELF::R_AARCH64_LDST128_ABS_LO12_NC: case ELF::R_AARCH64_LDST32_ABS_LO12_NC: + case ELF::R_AARCH64_LDST16_ABS_LO12_NC: case ELF::R_AARCH64_LDST8_ABS_LO12_NC: case ELF::R_AARCH64_LD64_GOT_LO12_NC: return false; @@ -582,6 +614,7 @@ bool Relocation::isPCRelative(uint64_t Type) { case ELF::R_AARCH64_ADR_PREL_PG_HI21: case ELF::R_AARCH64_ADR_GOT_PAGE: case ELF::R_AARCH64_JUMP26: + case ELF::R_AARCH64_PREL32: return true; } } diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 7bf02a2651b9..b01aa0236e20 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -67,11 +67,14 @@ struct Relocation { /// Extract current relocated value from binary contents. This is used for /// RISC architectures where values are encoded in specific bits depending /// on the relocation value. - static uint64_t extractValue(uint64_t Type, uint64_t Contents); + static uint64_t extractValue(uint64_t Type, uint64_t Contents, uint64_t PC); /// Return true if relocation type is PC-relative. Return false otherwise. static bool isPCRelative(uint64_t Type); + /// Return true if relocation type implies the creation of a GOT entry + static bool isGOT(uint64_t Type); + /// Emit relocation at a current \p Streamer' position. The caller is /// responsible for setting the position correctly. 
size_t emit(MCStreamer *Streamer) const; @@ -303,6 +306,10 @@ class BinaryContext { SmallString<256> Code; SmallVector Fixups; raw_svector_ostream VecOS(Code); + if (MIA->isCFI(*Beg)) { + ++Beg; + continue; + } MCE->encodeInstruction(*Beg++, VecOS, Fixups, *STI); Size += Code.size(); } diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 198f1cf3d27d..236aa0dc3582 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -575,14 +575,36 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, int64_t DispValue; const MCExpr *DispExpr; + // In AArch, identify the instruction adding the PC-relative offset to + // jump table entries to correctly decode it. + MCInst *PCRelBaseInstr; + uint64_t PCRelAddr = 0; + + MutableArrayRef BB = Instructions; + + if (BC.TheTriple->getArch() == llvm::Triple::aarch64) { + PreserveNops = opts::Relocs; + // Start at the last label as an approximation of the current basic block. + // This is a heuristic, since the full set of labels have yet to be + // determined + for (auto LI = Labels.rbegin(); LI != Labels.rend(); ++LI) { + auto II = InstructionOffsets.find(LI->first); + if (II != InstructionOffsets.end()) { + BB = BB.slice(II->second); + break; + } + } + } + auto Type = BC.MIA->analyzeIndirectBranch(Instruction, - Instructions, + BB, PtrSize, MemLocInstr, BaseRegNum, IndexRegNum, DispValue, - DispExpr); + DispExpr, + PCRelBaseInstr); if (Type == IndirectBranchType::UNKNOWN && !MemLocInstr) return Type; @@ -590,13 +612,52 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, if (MemLocInstr != &Instruction) IndexRegNum = 0; + if (BC.TheTriple->getArch() == llvm::Triple::aarch64) { + const auto *Sym = BC.MIA->getTargetSymbol(*PCRelBaseInstr, 1); + assert (Sym && "Symbol extraction failed"); + auto SI = BC.GlobalSymbols.find(Sym->getName()); + if (SI != BC.GlobalSymbols.end()) { + PCRelAddr = SI->second; + } else { + for (auto &Elmt : Labels) { + if 
(Elmt.second == Sym) { + PCRelAddr = Elmt.first + getAddress(); + break; + } + } + } + uint64_t InstrAddr = 0; + for (auto II = InstructionOffsets.rbegin(); II != InstructionOffsets.rend(); + ++II) { + if (&Instructions[II->second] == PCRelBaseInstr) { + InstrAddr = II->first + getAddress(); + break; + } + } + assert(InstrAddr != 0 && "instruction not found"); + // We do this to avoid spurious references to code locations outside this + // function (for example, if the indirect jump lives in the last basic + // block of the function, it will create a reference to the next function). + // This replaces a symbol reference with an immediate. + BC.MIA->replaceMemOperandDisp(*PCRelBaseInstr, + MCOperand::createImm(PCRelAddr - InstrAddr)); + // FIXME: Disable full jump table processing for AArch64 until we have a + // proper way of determining the jump table limits. + return IndirectBranchType::UNKNOWN; + } + // RIP-relative addressing should be converted to symbol form by now // in processed instructions (but not in jump). if (DispExpr) { - auto SI = BC.GlobalSymbols.find(DispExpr->getSymbol().getName()); + auto SI = + BC.GlobalSymbols.find(BC.MIA->getTargetSymbol(DispExpr)->getName()); assert(SI != BC.GlobalSymbols.end() && "global symbol needs a value"); ArrayStart = SI->second; BaseRegNum = 0; + if (BC.TheTriple->getArch() == llvm::Triple::aarch64) { + ArrayStart &= ~0xFFFULL; + ArrayStart += DispValue & 0xFFFULL; + } } else { ArrayStart = static_cast(DispValue); } @@ -679,7 +740,9 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, << " is referencing address 0x" << Twine::utohexstr(Section.getAddress() + ValueOffset)); // Extract the value and increment the offset. 
- if (Type == IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE) { + if (BC.TheTriple->getArch() == llvm::Triple::aarch64) { + Value = PCRelAddr + DE.getSigned(&ValueOffset, EntrySize); + } else if (Type == IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE) { Value = ArrayStart + DE.getSigned(&ValueOffset, 4); } else { Value = DE.getAddress(&ValueOffset); @@ -810,7 +873,8 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { if (!TargetSymbol && Section && Section->isText() && (BC.TheTriple->getArch() != llvm::Triple::aarch64 || !BC.MIA->isADRP(Instruction))) { - if (containsAddress(TargetAddress)) { + if (containsAddress(TargetAddress, /*UseMaxSize=*/ + BC.TheTriple->getArch() == llvm::Triple::aarch64)) { if (TargetAddress != getAddress()) { // The address could potentially escape. Mark it as another entry // point into the function. @@ -831,7 +895,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { Instruction, MCSymbolRefExpr::create( TargetSymbol, MCSymbolRefExpr::VK_None, *BC.Ctx), - *BC.Ctx))); + *BC.Ctx, 0))); return true; }; @@ -890,6 +954,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { } // Check if there's a relocation associated with this instruction. 
+ bool UsedReloc{false}; if (!Relocations.empty()) { auto RI = Relocations.lower_bound(Offset); if (RI != Relocations.end() && RI->first < Offset + Size) { @@ -900,15 +965,21 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { << " for instruction at offset 0x" << Twine::utohexstr(Offset) << '\n'); int64_t Value; - const auto Result = - BC.MIA->replaceImmWithSymbol(Instruction, Relocation.Symbol, - Relocation.Addend, Ctx.get(), Value); + const auto Result = BC.MIA->replaceImmWithSymbol( + Instruction, Relocation.Symbol, Relocation.Addend, Ctx.get(), Value, + Relocation.Type); (void)Result; assert(Result && "cannot replace immediate with relocation"); + // For aarch, if we replaced an immediate with a symbol from a + // relocation, we mark it so we do not try to further process a + // pc-relative operand. All we need is the symbol. + if (BC.TheTriple->getArch() == llvm::Triple::aarch64) + UsedReloc = true; // Make sure we replaced the correct immediate (instruction // can have multiple immediate operands). - assert(static_cast(Value) == Relocation.Value && + assert((BC.TheTriple->getArch() == llvm::Triple::aarch64 || + static_cast(Value) == Relocation.Value) && "immediate value mismatch in function"); } } @@ -1081,7 +1152,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { // Indirect call. We only need to fix it if the operand is RIP-relative if (IsSimple && MIA->hasPCRelOperand(Instruction)) { if (!handlePCRelOperand(Instruction, AbsoluteInstrAddr, Size)) { - errs() << "BOLT-ERROR: cannot handle RIP operand at 0x" + errs() << "BOLT-ERROR: cannot handle PC-relative operand at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ". 
Skipping function " << *this << ".\n"; if (opts::Relocs) @@ -1091,9 +1162,9 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { } } } else { - if (MIA->hasPCRelOperand(Instruction)) { + if (MIA->hasPCRelOperand(Instruction) && !UsedReloc) { if (!handlePCRelOperand(Instruction, AbsoluteInstrAddr, Size)) { - errs() << "BOLT-ERROR: cannot handle RIP operand at 0x" + errs() << "BOLT-ERROR: cannot handle PC-relative operand at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ". Skipping function " << *this << ".\n"; if (opts::Relocs) @@ -1359,7 +1430,7 @@ bool BinaryFunction::buildCFG() { // Ignore nops. We use nops to derive alignment of the next basic block. // It will not always work, as some blocks are naturally aligned, but // it's just part of heuristic for block alignment. - if (MIA->isNoop(Instr)) { + if (MIA->isNoop(Instr) && !PreserveNops) { IsLastInstrNop = true; continue; } @@ -2593,9 +2664,8 @@ void BinaryFunction::emitConstantIslands(MCStreamer &Streamer) { outs() << "BOLT-INFO: emitting constant island for function " << *this << "\n"; - auto IS = IslandSymbols.begin(); - // We split the island into smaller blocks and output labels between them. + auto IS = IslandSymbols.begin(); for (auto DataIter = DataOffsets.begin(); DataIter != DataOffsets.end(); ++DataIter) { uint64_t FunctionOffset = *DataIter; @@ -2617,18 +2687,33 @@ void BinaryFunction::emitConstantIslands(MCStreamer &Streamer) { if (FunctionOffset == EndOffset) continue; // Size is zero, nothing to emit - // Emit labels and data - while (IS != IslandSymbols.end() && IS->first < EndOffset) { - auto NextStop = IS->first; + // Emit labels, relocs and data + auto RI = MoveRelocations.lower_bound(FunctionOffset); + while ((IS != IslandSymbols.end() && IS->first < EndOffset) || + (RI != MoveRelocations.end() && RI->first < EndOffset)) { + auto NextLabelOffset = IS == IslandSymbols.end() ? EndOffset : IS->first; + auto NextRelOffset = RI == MoveRelocations.end() ? 
EndOffset : RI->first; + auto NextStop = std::min(NextLabelOffset, NextRelOffset); assert(NextStop <= EndOffset && "internal overflow error"); if (FunctionOffset < NextStop) { Streamer.EmitBytes(FunctionContents.slice(FunctionOffset, NextStop)); FunctionOffset = NextStop; } - DEBUG(dbgs() << "BOLT-DEBUG: emitted label " << IS->second->getName() - << " at offset 0x" << Twine::utohexstr(IS->first) << '\n'); - Streamer.EmitLabel(IS->second); - ++IS; + if (IS != IslandSymbols.end() && FunctionOffset == IS->first) { + DEBUG(dbgs() << "BOLT-DEBUG: emitted label " << IS->second->getName() + << " at offset 0x" << Twine::utohexstr(IS->first) << '\n'); + Streamer.EmitLabel(IS->second); + ++IS; + } + if (RI != MoveRelocations.end() && FunctionOffset == RI->first) { + auto RelocationSize = RI->second.emit(&Streamer); + DEBUG(dbgs() << "BOLT-DEBUG: emitted relocation for symbol " + << RI->second.Symbol->getName() << " at offset 0x" + << Twine::utohexstr(RI->first) + << " with size " << RelocationSize << '\n'); + FunctionOffset += RelocationSize; + ++RI; + } } assert(FunctionOffset <= EndOffset && "overflow error"); if (FunctionOffset < EndOffset) { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index fd43bf33e38c..37bf235153d2 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -255,6 +255,10 @@ class BinaryFunction { /// In relocation mode we still disassemble and re-assemble such functions. bool IsSimple{true}; + /// In AArch64, preserve nops to maintain code equal to input (assuming no + /// optimizations are done). + bool PreserveNops{false}; + /// Indicate if this function has associated exception handling metadata. bool HasEHRanges{false}; @@ -758,6 +762,9 @@ class BinaryFunction { unsigned Size, uint64_t Offset); + DenseMap> + computeLocalUDChain(const MCInst *CurInstr); + /// Emit line number information corresponding to \p NewLoc. \p PrevLoc /// provides a context for de-duplication of line number info. 
/// @@ -1158,7 +1165,7 @@ class BinaryFunction { /// Assert if the \p Address is not inside this function. void addRelocation(uint64_t Address, MCSymbol *Symbol, uint64_t RelType, uint64_t Addend, uint64_t Value) { - assert(Address >= getAddress() && Address < getAddress() + getSize() && + assert(Address >= getAddress() && Address < getAddress() + getMaxSize() && "address is outside of the function"); auto Offset = Address - getAddress(); switch (RelType) { @@ -1167,10 +1174,13 @@ class BinaryFunction { case ELF::R_X86_64_64: case ELF::R_AARCH64_ABS64: case ELF::R_AARCH64_LDST64_ABS_LO12_NC: + case ELF::R_AARCH64_LD64_GOT_LO12_NC: case ELF::R_AARCH64_ADD_ABS_LO12_NC: + case ELF::R_AARCH64_LDST16_ABS_LO12_NC: case ELF::R_AARCH64_LDST32_ABS_LO12_NC: case ELF::R_AARCH64_LDST8_ABS_LO12_NC: - case ELF::R_AARCH64_CALL26: + case ELF::R_AARCH64_LDST128_ABS_LO12_NC: + case ELF::R_AARCH64_ADR_GOT_PAGE: case ELF::R_AARCH64_ADR_PREL_PG_HI21: Relocations.emplace(Offset, Relocation{Offset, Symbol, RelType, Addend, Value}); @@ -1180,6 +1190,8 @@ class BinaryFunction { case ELF::R_X86_64_PLT32: case ELF::R_X86_64_GOTPCRELX: case ELF::R_X86_64_REX_GOTPCRELX: + case ELF::R_AARCH64_JUMP26: + case ELF::R_AARCH64_CALL26: break; // The following relocations are ignored. @@ -1272,7 +1284,9 @@ class BinaryFunction { } /// Return true if the given address \p PC is inside the function body. 
- bool containsAddress(uint64_t PC) const { + bool containsAddress(uint64_t PC, bool UseMaxSize=false) const { + if (UseMaxSize) + return Address <= PC && PC < Address + MaxSize; return Address <= PC && PC < Address + Size; } diff --git a/bolt/Passes/LongJmp.cpp b/bolt/Passes/LongJmp.cpp index c0e35e482b78..ffddde6308f1 100644 --- a/bolt/Passes/LongJmp.cpp +++ b/bolt/Passes/LongJmp.cpp @@ -50,9 +50,11 @@ void shrinkStubToShortJmp(const BinaryContext &BC, BinaryBasicBlock &StubBB, } void shrinkStubToSingleInst(const BinaryContext &BC, BinaryBasicBlock &StubBB, - const MCSymbol *Tgt) { + const MCSymbol *Tgt, bool TgtIsFunc) { MCInst Inst; BC.MIA->createUncondBranch(Inst, Tgt, BC.Ctx.get()); + if (TgtIsFunc) + BC.MIA->convertJmpToTailCall(Inst, BC.Ctx.get()); StubBB.clear(); StubBB.addInstruction(Inst); } @@ -392,7 +394,7 @@ bool LongJmpPass::removeOrShrinkStubs(const BinaryContext &BC, } if (Bits > RangeSingleInstr) { - shrinkStubToSingleInst(BC, BB, RealTargetSym); + shrinkStubToSingleInst(BC, BB, RealTargetSym, /*is func?*/!TgtBB); StubBits[&BB] = RangeSingleInstr; Modified = true; } @@ -405,6 +407,10 @@ void LongJmpPass::runOnFunctions(BinaryContext &BC, std::set &LargeFunctions) { auto Sorted = BinaryContext::getSortedFunctions(BFs); for (auto Func : Sorted) { + // We are going to remove invalid BBs, so remove any previous marks + for (auto &BB : *Func) { + BB.markValid(true); + } insertStubs(BC, *Func); Func->fixBranches(); } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 08f2850ea0d1..24322fa1e08a 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1703,7 +1703,8 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { Relocation::getSizeForType(Rel.getType()))); if (BC->TheTriple->getArch() == llvm::Triple::aarch64) - ExtractedValue = Relocation::extractValue(Rel.getType(), ExtractedValue); + ExtractedValue = Relocation::extractValue(Rel.getType(), ExtractedValue, + Rel.getOffset()); bool IsPCRelative = 
Relocation::isPCRelative(Rel.getType()); auto Addend = getRelocationAddend(InputFile, Rel); @@ -1712,9 +1713,14 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { auto SymbolIter = Rel.getSymbol(); std::string SymbolName = ""; SymbolAddress = *SymbolIter->getAddress(); - if (!SymbolAddress) { + // If no symbol has been found or if it is a relocation requiring the + // creation of a GOT entry, do not link against the symbol but against + // whatever address was extracted from the instruction itself. We are + // not creating a GOT entry as this was already processed by the linker. + if (!SymbolAddress || Relocation::isGOT(Rel.getType())) { Address = ExtractedValue; - if (IsPCRelative) { + // For aarch, pc address has already been added in extractValue + if (IsPCRelative && BC->TheTriple->getArch() != llvm::Triple::aarch64) { Address += Rel.getOffset(); } } else { @@ -1731,7 +1737,8 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { StringRef SymbolSectionName; (*SymbolSection)->getName(SymbolSectionName); SymbolName = "section " + std::string(SymbolSectionName); - Address = Addend; + if (BC->TheTriple->getArch() != llvm::Triple::aarch64) + Address = Addend; } } } @@ -1761,6 +1768,8 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { assert(ExtractedValue == SymbolAddress + Addend && "value mismatch"); Address = SymbolAddress; IsAbsoluteCodeRefWithAddend = true; + } else if (BC->TheTriple->getArch() == llvm::Triple::aarch64) { + Addend = 0; // TODO: check if should apply for x86 as well } } else if (Addend < 0 && IsPCRelative) { Address -= Addend; @@ -1778,9 +1787,13 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { << "; type name = " << TypeName << '\n'); + if (Rel.getType() == ELF::R_AARCH64_ADR_GOT_PAGE) + ForceRelocation = true; + if (Rel.getType() != ELF::R_X86_64_TPOFF32 && Rel.getType() != ELF::R_X86_64_GOTTPOFF && - Rel.getType() != ELF::R_X86_64_GOTPCREL) { + Rel.getType() != 
ELF::R_X86_64_GOTPCREL && + BC->TheTriple->getArch() != llvm::Triple::aarch64) { if (!IsPCRelative) { if (!IsAbsoluteCodeRefWithAddend) { if (opts::Verbosity > 2 && @@ -1802,7 +1815,10 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { BinaryFunction *ContainingBF = nullptr; if (IsFromCode) { - ContainingBF = getBinaryFunctionContainingAddress(Rel.getOffset()); + ContainingBF = getBinaryFunctionContainingAddress( + Rel.getOffset(), + /*CheckPastEnd*/ false, + /*UseMaxSize*/ BC->TheTriple->getArch() == llvm::Triple::aarch64); assert(ContainingBF && "cannot find function for address in code"); DEBUG(dbgs() << "BOLT-DEBUG: relocation belongs to " << *ContainingBF << '\n'); @@ -1815,7 +1831,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { // between the two. If we blindly apply the relocation it will appear // that it references an arbitrary location in the code, possibly even // in a different function from that containing the jump table. - if (IsPCRelative) { + if (BC->TheTriple->getArch() != llvm::Triple::aarch64 && IsPCRelative) { // Just register the fact that we have PC-relative relocation at a given // address. The actual referenced label/address cannot be determined // from linker data alone. 
@@ -1869,7 +1885,8 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { } if (IsFromCode) { - if (ReferencedBF || ForceRelocation) { + if (ReferencedBF || ForceRelocation || + BC->TheTriple->getArch() == llvm::Triple::aarch64) { ContainingBF->addRelocation(Rel.getOffset(), ReferencedSymbol, Rel.getType(), Addend, ExtractedValue); } else { @@ -2015,8 +2032,8 @@ void RewriteInstance::disassembleFunctions() { errs() << "BOLT-WARNING: function " << *ContainingFunction << " has an object detected in a padding region at address 0x" << Twine::utohexstr(Addr) << '\n'; - ContainingFunction->setMaxSize( - Addr - ContainingFunction->getAddress()); + ContainingFunction->setMaxSize(Addr - + ContainingFunction->getAddress()); } } } From 61c54df56200975ff87c6662d576d92d4138b103 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Fri, 27 Oct 2017 15:05:31 -0700 Subject: [PATCH 337/904] [BOLT] Custom function alignment Summary: A new 'compact' function aligner that takes function sizes in consideration. The approach is based on the following assumptions: -- It is not desirable to introduce a large offset when aligning short functions, as it leads to a lot of "wasted" address space. -- For longer functions, the offset can be larger than the default 32 bytes; However, using 64 bytes for the offset still worsen performance, as again a lot of address space is wasted. -- Cold parts of functions can still use the default max-32 offset. The algorithm is switched on/off by flag 'use-compact-aligner' and is controlled by parameters align-functions-max-bytes and align-cold-functions-max-bytes described above. In my tests the best performance is produced with '-use-compact-aligner=true -align-functions-max-bytes=48 -align-cold-functions-max-bytes=32'. 
(cherry picked from commit 03340b4673081f7f4b430799ef3057933e982f0c) --- bolt/BinaryFunction.h | 30 +++++++++-- bolt/BinaryPassManager.cpp | 3 ++ bolt/Passes/Aligner.cpp | 101 +++++++++++++++++++++++++++++++++++++ bolt/Passes/Aligner.h | 38 ++++++++++++++ bolt/Passes/CMakeLists.txt | 1 + bolt/RewriteInstance.cpp | 21 ++------ 6 files changed, 175 insertions(+), 19 deletions(-) create mode 100644 bolt/Passes/Aligner.cpp create mode 100644 bolt/Passes/Aligner.h diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 37bf235153d2..25b01477bd7f 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -241,7 +241,13 @@ class BinaryFunction { uint64_t MaxSize{std::numeric_limits::max()}; /// Alignment requirements for the function. - uint64_t Alignment{2}; + uint16_t Alignment{2}; + + /// Maximum number of bytes used for alignment of hot part of the function. + uint16_t MaxAlignmentBytes{0}; + + /// Maximum number of bytes used for alignment of cold part of the function. + uint16_t MaxColdAlignmentBytes{0}; const MCSymbol *PersonalityFunction{nullptr}; uint8_t PersonalityEncoding{dwarf::DW_EH_PE_sdata4 | dwarf::DW_EH_PE_pcrel}; @@ -1580,15 +1586,33 @@ class BinaryFunction { return *this; } - BinaryFunction &setAlignment(uint64_t Align) { + BinaryFunction &setAlignment(uint16_t Align) { Alignment = Align; return *this; } - uint64_t getAlignment() const { + uint16_t getAlignment() const { return Alignment; } + BinaryFunction &setMaxAlignmentBytes(uint16_t MaxAlignBytes) { + MaxAlignmentBytes = MaxAlignBytes; + return *this; + } + + uint16_t getMaxAlignmentBytes() const { + return MaxAlignmentBytes; + } + + BinaryFunction &setMaxColdAlignmentBytes(uint16_t MaxAlignBytes) { + MaxColdAlignmentBytes = MaxAlignBytes; + return *this; + } + + uint16_t getMaxColdAlignmentBytes() const { + return MaxColdAlignmentBytes; + } + BinaryFunction &setImageAddress(uint64_t Address) { ImageAddress = Address; return *this; diff --git a/bolt/BinaryPassManager.cpp 
b/bolt/BinaryPassManager.cpp index c322b0d69d6c..abd67f675f6d 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -10,6 +10,7 @@ //===----------------------------------------------------------------------===// #include "BinaryPassManager.h" +#include "Passes/Aligner.h" #include "Passes/AllocCombiner.h" #include "Passes/FrameOptimizer.h" #include "Passes/IndirectCallPromotion.h" @@ -393,6 +394,8 @@ void BinaryFunctionPassManager::runAllPasses( llvm::make_unique(PrintSCTC), opts::SimplifyConditionalTailCalls); + Manager.registerPass(llvm::make_unique()); + // This pass should always run last.* Manager.registerPass(llvm::make_unique(PrintFinalized)); diff --git a/bolt/Passes/Aligner.cpp b/bolt/Passes/Aligner.cpp new file mode 100644 index 000000000000..c55379a19b0e --- /dev/null +++ b/bolt/Passes/Aligner.cpp @@ -0,0 +1,101 @@ +//===--- Aligner.cpp ------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "Aligner.h" + +using namespace llvm; + +namespace opts { +extern cl::OptionCategory BoltOptCategory; + +extern cl::opt Relocs; + +cl::opt +UseCompactAligner("use-compact-aligner", + cl::desc("Use compact approach for aligning functions"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +cl::opt +AlignFunctions("align-functions", + cl::desc("align functions at a given value (relocation mode)"), + cl::init(64), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +cl::opt +AlignFunctionsMaxBytes("align-functions-max-bytes", + cl::desc("maximum number of bytes to use to align functions"), + cl::init(32), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +} // end namespace opts + +namespace llvm { +namespace bolt { + +namespace { + +// Align function to the specified byte-boundary (typically, 64) offsetting +// the fuction by not more than the corresponding value +void alignMaxBytes(BinaryFunction &Function) { + Function.setAlignment(opts::AlignFunctions); + Function.setMaxAlignmentBytes(opts::AlignFunctionsMaxBytes); + Function.setMaxColdAlignmentBytes(opts::AlignFunctionsMaxBytes); +} + +// Align function to the specified byte-boundary (typically, 64) offsetting +// the fuction by not more than the minimum over +// -- the size of the function +// -- the specified number of bytes +void alignCompact(BinaryContext &BC, BinaryFunction &Function) { + size_t HotSize = 0; + size_t ColdSize = 0; + for (const auto *BB : Function.layout()) { + if (BB->isCold()) + ColdSize += BC.computeCodeSize(BB->begin(), BB->end()); + else + HotSize += BC.computeCodeSize(BB->begin(), BB->end()); + } + + Function.setAlignment(opts::AlignFunctions); + if (HotSize > 0) + Function.setMaxAlignmentBytes( + std::min(size_t(opts::AlignFunctionsMaxBytes), HotSize)); + + // using the same option, 
max-align-bytes, both for cold and hot parts of the + // functions, as aligning cold functions typically does not affect performance + if (ColdSize > 0) + Function.setMaxColdAlignmentBytes( + std::min(size_t(opts::AlignFunctionsMaxBytes), ColdSize)); +} + +} // end anonymous namespace + +void AlignerPass::runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) { + if (!opts::Relocs) + return; + + for (auto &It : BFs) { + auto &Function = It.second; + if (opts::UseCompactAligner) + alignCompact(BC, Function); + else + alignMaxBytes(Function); + } +} + +} // end namespace bolt +} // end namespace llvm diff --git a/bolt/Passes/Aligner.h b/bolt/Passes/Aligner.h new file mode 100644 index 000000000000..3164a47a91c8 --- /dev/null +++ b/bolt/Passes/Aligner.h @@ -0,0 +1,38 @@ +//===--------- Passes/Aligner.h -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_ALIGNER_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_ALIGNER_H + +#include "BinaryPasses.h" + +namespace llvm { +namespace bolt { + +class AlignerPass : public BinaryFunctionPass { + public: + explicit AlignerPass() : BinaryFunctionPass(false) {} + + const char *getName() const override { + return "aligner"; + } + + /// Pass entry point + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + +} // namespace bolt +} // namespace llvm + + +#endif diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt index 5f66de76826f..0b733d7ad846 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/Passes/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_library(LLVMBOLTPasses + Aligner.cpp AllocCombiner.cpp BinaryPasses.cpp BinaryFunctionCallGraph.cpp diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 24322fa1e08a..8d67ac160443 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -90,20 +90,6 @@ OutputFilename("o", cl::Required, cl::cat(BoltOutputCategory)); -cl::opt -AlignFunctions("align-functions", - cl::desc("align functions at a given value (relocation mode)"), - cl::init(64), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - -cl::opt -AlignFunctionsMaxBytes("align-functions-max-bytes", - cl::desc("maximum number of bytes to use to align functions"), - cl::init(32), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - cl::opt AllowStripped("allow-stripped", cl::desc("allow processing of stripped binaries"), @@ -2190,8 +2176,11 @@ void RewriteInstance::emitFunction(MCStreamer &Streamer, BinaryFunction &Functio if (opts::Relocs) { Streamer.EmitCodeAlignment(BinaryFunction::MinAlign); - Streamer.EmitCodeAlignment(opts::AlignFunctions, - opts::AlignFunctionsMaxBytes); + auto MaxAlignBytes = 
EmitColdPart + ? Function.getMaxColdAlignmentBytes() + : Function.getMaxAlignmentBytes(); + if (MaxAlignBytes > 0) + Streamer.EmitCodeAlignment(Function.getAlignment(), MaxAlignBytes); } else { Streamer.EmitCodeAlignment(Function.getAlignment()); Streamer.setCodeSkew(EmitColdPart ? 0 : Function.getAddress()); From 5e4d2c32d182065defec871f900e82d5a57069b6 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 8 Nov 2017 14:42:14 -0800 Subject: [PATCH 338/904] [BOLT] Fix segfault in debug print Summary: With "-debug" flag we are using a dump in intermediate state when basic block's list is initialized, but layout is not. In new isSplit() funciton we were checking the size() which uses basic block list, and then we were accessing the (uninitiazed) layout. Instead of checking size() we should be checking layout_size(). (cherry picked from commit 60ac977d047b980f2808083fe96325a0f60f798f) --- bolt/BinaryFunction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 25b01477bd7f..01a981c73813 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -1238,7 +1238,7 @@ class BinaryFunction { /// Return true if the function body is non-contiguous. bool isSplit() const { - return size() > 1 && + return layout_size() && layout_front()->isCold() != layout_back()->isCold(); } From 395cd4d40689d5510c1099cd9b7a38f8f5937f44 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 7 Nov 2017 16:00:26 -0800 Subject: [PATCH 339/904] [BOLT] Fix N-1'th sctc bug. Summary: The logic to append an unconditional branch at the end of a block that had the condition flipped on its conditional tail was broken. It should have been looking at the successor to PredBB instead of BB. It also wasn't skipping invalid blocks when finding the fallthrough block. This fixes the SCTC bug uncovered by @spupyrev's work on block reordering. 
(cherry picked from commit 2855275bc0203841562620fda9032f46df8b462d) --- bolt/Passes/BinaryPasses.cpp | 57 +++++++++++++++++++++--------------- bolt/Passes/BinaryPasses.h | 1 + 2 files changed, 35 insertions(+), 23 deletions(-) diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 4dc00fdc3805..e150b1db012c 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -682,7 +682,11 @@ bool SimplifyConditionalTailCalls::shouldRewriteBranch( const BinaryBasicBlock *PredBB, const MCInst &CondBranch, const BinaryBasicBlock *BB, - const bool DirectionFlag) { + const bool DirectionFlag +) { + if (BeenOptimized.count(PredBB)) + return false; + const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB); if (IsForward) @@ -725,9 +729,8 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, uint64_t NumLocalCTCs = 0; uint64_t LocalCTCTakenCount = 0; uint64_t LocalCTCExecCount = 0; - std::vector> NeedsUncondBranch; + std::vector> NeedsUncondBranch; // Will block be deleted by UCE? auto isValid = [](const BinaryBasicBlock *BB) { @@ -792,6 +795,9 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag)) continue; + // Record this block so that we don't try to optimize it twice. + BeenOptimized.insert(PredBB); + if (CondSucc != BB) { // Patch the new target address into the conditional branch. MIA->reverseBranchCondition(*CondBranch, CalleeSymbol, BC.Ctx.get()); @@ -799,7 +805,7 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, // the target for the unconditional branch or add a unconditional // branch to the old target. This has to be done manually since // fixupBranches is not called after SCTC. - NeedsUncondBranch.emplace_back(std::make_tuple(BB, PredBB, CondSucc)); + NeedsUncondBranch.emplace_back(std::make_pair(PredBB, CondSucc)); // Swap branch statistics after swapping the branch targets. 
auto BI = PredBB->branch_info_begin(); std::swap(*BI, *(BI + 1)); @@ -840,9 +846,8 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, // Add unconditional branches at the end of BBs to new successors // as long as the successor is not a fallthrough. for (auto &Entry : NeedsUncondBranch) { - auto *BB = std::get<0>(Entry); - auto *PredBB = std::get<1>(Entry); - auto *CondSucc = std::get<2>(Entry); + auto *PredBB = Entry.first; + auto *CondSucc = Entry.second; const MCSymbol *TBB = nullptr; const MCSymbol *FBB = nullptr; @@ -850,24 +855,30 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, MCInst *UncondBranch = nullptr; PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); - // Only add a new branch if the target is not the fall-through. - if (BF.getBasicBlockAfter(BB) != CondSucc || isValid(BB) || - PredBB->isCold() != CondSucc->isCold()) { - if (UncondBranch) { + // Find the next valid block. Invalid blocks will be deleted + // so they shouldn't be considered fallthrough targets. + const auto *NextBlock = BF.getBasicBlockAfter(PredBB, false); + while (NextBlock && !isValid(NextBlock)) { + NextBlock = BF.getBasicBlockAfter(NextBlock, false); + } + + // Get the unconditional successor to this block. 
+ const auto *PredSucc = PredBB->getSuccessor(); + assert(PredSucc && "The other branch should be a tail call"); + + const bool HasFallthrough = (NextBlock && PredSucc == NextBlock); + + if (UncondBranch) { + if (HasFallthrough) + PredBB->eraseInstruction(UncondBranch); + else MIA->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), BC.Ctx.get()); - } else { - MCInst Branch; - auto Result = MIA->createUncondBranch(Branch, - CondSucc->getLabel(), - BC.Ctx.get()); - (void)Result; - assert(Result); - PredBB->addInstruction(Branch); - } - } else if (UncondBranch) { - PredBB->eraseInstruction(UncondBranch); + } else if (!HasFallthrough) { + MCInst Branch; + MIA->createUncondBranch(Branch, CondSucc->getLabel(), BC.Ctx.get()); + PredBB->addInstruction(Branch); } } diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index 5cf91bc387d2..12eee5cb0ab6 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -276,6 +276,7 @@ class SimplifyConditionalTailCalls : public BinaryFunctionPass { uint64_t DeletedBlocks{0}; uint64_t DeletedBytes{0}; std::unordered_set Modified; + std::set BeenOptimized; bool shouldRewriteBranch(const BinaryBasicBlock *PredBB, const MCInst &CondBranch, From eb2a541e0eac16d3c1203dddccb5731cbd39f29a Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 8 Nov 2017 14:29:20 -0800 Subject: [PATCH 340/904] [BOLT] Fix ASAN bugs Summary: Fix a leak in DEBUGRewriter.cpp and an address out of bounds issue in edit distance calculation. 
(cherry picked from commit 78b03b305481751b864d60bbc6ef8a2b77d70fdd) --- bolt/BinaryFunction.cpp | 29 +++-------------------------- bolt/DWARFRewriter.cpp | 2 +- bolt/DebugData.h | 8 ++++---- 3 files changed, 8 insertions(+), 31 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 236aa0dc3582..a854a8cb3ad8 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -14,6 +14,7 @@ #include "BinaryFunction.h" #include "DataReader.h" #include "Passes/MCF.h" +#include "llvm/ADT/edit_distance.h" #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/MC/MCAsmInfo.h" @@ -2510,32 +2511,8 @@ bool BinaryFunction::hasLayoutChanged() const { } uint64_t BinaryFunction::getEditDistance() const { - const auto LayoutSize = BasicBlocksPreviousLayout.size(); - if (LayoutSize < 2) { - return 0; - } - - std::vector> ChangeMatrix( - LayoutSize + 1, std::vector(LayoutSize + 1)); - - for (uint64_t I = 0; I <= LayoutSize; ++I) { - ChangeMatrix[I][0] = I; - ChangeMatrix[0][I] = I; - } - - for (uint64_t I = 1; I <= LayoutSize; ++I) { - for (uint64_t J = 1; J <= LayoutSize; ++J) { - if (BasicBlocksPreviousLayout[I] != BasicBlocksLayout[J]) { - ChangeMatrix[I][J] = - std::min(std::min(ChangeMatrix[I - 1][J], ChangeMatrix[I][J - 1]), - ChangeMatrix[I - 1][J - 1]) + 1; - } else { - ChangeMatrix[I][J] = ChangeMatrix[I - 1][J - 1]; - } - } - } - - return ChangeMatrix[LayoutSize][LayoutSize]; + return ComputeEditDistance(BasicBlocksPreviousLayout, + BasicBlocksLayout); } void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp index b94c2c63b987..f7678f9a19c1 100644 --- a/bolt/DWARFRewriter.cpp +++ b/bolt/DWARFRewriter.cpp @@ -552,7 +552,7 @@ void RewriteInstance::updateGdbIndexSection() { OffsetToIndexMap[Offset] = Index; } - + // Ignore old address table. 
const auto OldAddressTableSize = SymbolTableOffset - AddressTableOffset; Data += OldAddressTableSize; diff --git a/bolt/DebugData.h b/bolt/DebugData.h index c448ebed0b0c..6b46ef071635 100644 --- a/bolt/DebugData.h +++ b/bolt/DebugData.h @@ -113,8 +113,8 @@ class DebugRangesSectionsWriter { return CUAddressRanges; } - SmallVectorImpl *finalize() { - return RangesBuffer.release(); + std::unique_ptr> finalize() { + return std::unique_ptr>(RangesBuffer.release()); } private: @@ -149,8 +149,8 @@ class DebugLocWriter { uint64_t getEmptyListOffset() const { return EmptyListOffset; } - SmallVectorImpl *finalize() { - return LocBuffer.release(); + std::unique_ptr> finalize() { + return std::unique_ptr>(LocBuffer.release()); } private: From dbff4347aef31d494e1dba5cd1a5bb59cb5ff259 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 8 Nov 2017 18:49:33 -0800 Subject: [PATCH 341/904] [BOLT] Add finer control of peephole pass. Summary: Add selective control over peephole options. This makes it easier to test which ones might have a positive effect. 
(cherry picked from commit a69c4afa50a34020c2efaf3689f036812e5187b1) --- bolt/BinaryPassManager.cpp | 12 ++------- bolt/Passes/BinaryPasses.cpp | 49 +++++++++++++++++++++++++++++++++--- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index abd67f675f6d..2f8cf3e345a9 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -76,12 +76,6 @@ OptimizeBodylessFunctions("optimize-bodyless-functions", cl::ZeroOrMore, cl::cat(BoltOptCategory)); -static cl::opt -Peepholes("peepholes", - cl::desc("run peephole optimizations"), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - static cl::opt PrintAfterBranchFixup("print-after-branch-fixup", cl::desc("print function after fixing local branches"), @@ -332,8 +326,7 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(llvm::make_unique(PrintICP)); - Manager.registerPass(llvm::make_unique(PrintPeepholes), - opts::Peepholes); + Manager.registerPass(llvm::make_unique(PrintPeepholes)); Manager.registerPass(llvm::make_unique(PrintInline), opts::InlineSmallFunctions); @@ -353,8 +346,7 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(llvm::make_unique(PrintReordered)); - Manager.registerPass(llvm::make_unique(PrintPeepholes), - opts::Peepholes); + Manager.registerPass(llvm::make_unique(PrintPeepholes)); Manager.registerPass( llvm::make_unique(PrintUCE), diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index e150b1db012c..3ad608edab32 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -12,6 +12,7 @@ #include "BinaryPasses.h" #include "Passes/ReorderAlgorithm.h" #include "llvm/Support/Options.h" +#include #define DEBUG_TYPE "bolt" @@ -178,6 +179,32 @@ TSPThreshold("tsp-threshold", cl::Hidden, cl::cat(BoltOptCategory)); +enum PeepholeOpts : char { + PEEP_NONE = 0x0, + PEEP_SHORTEN = 0x1, + PEEP_DOUBLE_JUMPS = 0x2, + PEEP_TAILCALL_TRAPS = 0x4, + 
PEEP_USELESS_BRANCHES = 0x8, + PEEP_ALL = 0xf +}; + +static cl::list +Peepholes("peepholes", + cl::CommaSeparated, + cl::desc("enable peephole optimizations"), + cl::value_desc("opt1,opt2,opt3,..."), + cl::values( + clEnumValN(PEEP_SHORTEN, "shorten", "perform instruction shortening"), + clEnumValN(PEEP_DOUBLE_JUMPS, "double-jumps", + "remove double jumps when able"), + clEnumValN(PEEP_TAILCALL_TRAPS, "tailcall-traps", "insert tail call traps"), + clEnumValN(PEEP_USELESS_BRANCHES, "useless-branches", + "remove useless conditional branches"), + clEnumValN(PEEP_ALL, "all", "enable all peephole optimizations"), + clEnumValEnd), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + } // namespace opts namespace llvm { @@ -986,13 +1013,27 @@ void Peepholes::removeUselessCondBranches(BinaryContext &BC, void Peepholes::runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) { + const char Opts = + std::accumulate(opts::Peepholes.begin(), + opts::Peepholes.end(), + 0, + [](const char A, const opts::PeepholeOpts B) { + return A | B; + }); + if (Opts == opts::PEEP_NONE) + return; + for (auto &It : BFs) { auto &Function = It.second; if (shouldOptimize(Function)) { - shortenInstructions(BC, Function); - NumDoubleJumps += fixDoubleJumps(BC, Function, false); - addTailcallTraps(BC, Function); - removeUselessCondBranches(BC, Function); + if (Opts & opts::PEEP_SHORTEN) + shortenInstructions(BC, Function); + if (Opts & opts::PEEP_DOUBLE_JUMPS) + NumDoubleJumps += fixDoubleJumps(BC, Function, false); + if (Opts & opts::PEEP_TAILCALL_TRAPS) + addTailcallTraps(BC, Function); + if (Opts & opts::PEEP_USELESS_BRANCHES) + removeUselessCondBranches(BC, Function); } } outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps From 173d4b3b5fad2aa3f4ce7b83bd4d752108c211eb Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 13 Nov 2017 11:05:47 -0800 Subject: [PATCH 342/904] [BOLT] Fix handling of RememberState CFI Summary: When RememberState CFI happens to be the last CFI in a 
basic block, we used to set the state of the next basic block to a CFI prior to executing RememberState instruction. This contradicts comments in annotateCFIState() function and also differs form behaviour of getCFIStateAtInstr(). As a result we were getting code like the following: .LBB0121166 (21 instructions, align : 1) CFI State : 0 .... 0000001a: !CFI $1 ; OpOffset Reg6 -16 0000001a: !CFI $2 ; OpRememberState .... Successors: .Ltmp4167600, .Ltmp4167601 CFI State: 3 .Ltmp4167601 (13 instructions, align : 1) CFI State : 2 .... Notice that the state at the entry of the 2nd basic block is less than the state at the exit of the previous basic block. In practice we have never seen basic blocks where RememberState was the last CFI instruction in the basic block, and hence we've never run into this issue before. The fix is a synchronization of handling of last RememberState instruction by annotateCFIState() and getCFIStateAtInstr(). In the example above, the CFI state at the entry to the second BB will be 3 after this diff. (cherry picked from commit 77e0e1d5d8bd5ae092f0775033a02d167660ccc1) --- bolt/BinaryFunction.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index a854a8cb3ad8..cfac6193d5de 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -2344,6 +2344,7 @@ void BinaryFunction::annotateCFIState() { switch (CFI->getOperation()) { case MCCFIInstruction::OpRememberState: StateStack.push(EffectiveState); + EffectiveState = State; break; case MCCFIInstruction::OpRestoreState: assert(!StateStack.empty() && "corrupt CFI stack"); From 759e25b0ee2e9e4fc50035c22ca79738c89083bc Mon Sep 17 00:00:00 2001 From: spupyrev Date: Wed, 15 Nov 2017 14:17:39 -0800 Subject: [PATCH 343/904] speeding up caches for hfsort+ Summary: When running hfsort+, we invalidate too many cache entries, which leads to inefficiencies. 
It seems we only need to invalidate cache for pairs of clusters (Into, X) and (X, Into) when modifying cluster Into (for all clusters X). With the modification, we do not really need ShortCache, since it is computed only once per pair of clusters. (cherry picked from commit 98d1f785a9c53061c4c9496d2feec52ac7979732) --- bolt/Passes/HFSort.h | 4 +- bolt/Passes/HFSortPlus.cpp | 102 ++++++------------------------- bolt/Passes/ReorderFunctions.cpp | 10 +-- 3 files changed, 22 insertions(+), 94 deletions(-) diff --git a/bolt/Passes/HFSort.h b/bolt/Passes/HFSort.h index 7c837e029397..2329ec171417 100644 --- a/bolt/Passes/HFSort.h +++ b/bolt/Passes/HFSort.h @@ -103,9 +103,7 @@ std::vector clusterize(const CallGraph &Cg); /* * Optimize function placement for iTLB cache and i-cache. */ -std::vector hfsortPlus(CallGraph &Cg, - bool UseGainCache = true, - bool UseShortCallCache = true); +std::vector hfsortPlus(CallGraph &Cg, bool UseGainCache = true); /* * Pettis-Hansen code layout algorithm diff --git a/bolt/Passes/HFSortPlus.cpp b/bolt/Passes/HFSortPlus.cpp index d7006af2d005..fb8f2cbcf2c2 100644 --- a/bolt/Passes/HFSortPlus.cpp +++ b/bolt/Passes/HFSortPlus.cpp @@ -46,7 +46,7 @@ using namespace llvm; using namespace bolt; namespace opts { -extern cl::OptionCategory BoltCategory; + extern cl::OptionCategory BoltOptCategory; extern cl::opt Verbosity; @@ -92,17 +92,6 @@ int32_t ITLBPageSize; // while smaller values result in better i-cache performance int32_t ITLBEntries; -const char* cacheKindString(bool UseGainCache, bool UseShortCallCache) { - if (UseGainCache && UseShortCallCache) - return "gain + short call cache"; - else if (UseGainCache) - return "gain cache"; - else if (UseShortCallCache) - return "short call cache"; - else - return "no cache"; -} - // This class maintains adjacency information for all Clusters being // processed. It is used to invalidate cache entries when merging // Clusters and for visiting all neighbors of any given Cluster. 
@@ -215,17 +204,16 @@ class PrecomputedResults { Valid[Index] = true; } - void invalidate(const AdjacencyMatrix &Adjacent, const Cluster *C) { - invalidate(C); - Adjacent.forallAdjacent(C, [&](const Cluster *A) { invalidate(A); }); - } - private: void invalidate(const Cluster *C) { Valid.reset(C->id() * Size, (C->id() + 1) * Size); + for (size_t Id = 0; Id < Size; Id++) { + Valid.reset(Id * Size + C->id()); + } } + private: size_t index(const Cluster *First, const Cluster *Second) const { - return (First->id() * Size) + Second->id(); + return First->id() * Size + Second->id(); } size_t Size; @@ -347,12 +335,6 @@ class HFSortPlus { * the same cache page */ double shortCalls(const Cluster *Cluster) const { - if (UseShortCallCache) { - auto Itr = ShortCallCache.find(Cluster); - if (Itr != ShortCallCache.end()) - return Itr->second; - } - double Calls = 0; for (auto TargetId : Cluster->targets()) { for (auto Succ : Cg.successors(TargetId)) { @@ -367,10 +349,6 @@ class HFSortPlus { } } - if (UseShortCallCache) { - ShortCallCache[Cluster] = Calls; - } - return Calls; } @@ -380,11 +358,6 @@ class HFSortPlus { */ double shortCalls(const Cluster *ClusterPred, const Cluster *ClusterSucc) const { - if (UseShortCallCache && - ShortCallPairCache.contains(ClusterPred, ClusterSucc)) { - return ShortCallPairCache.get(ClusterPred, ClusterSucc); - } - double Calls = 0; for (auto TargetId : ClusterPred->targets()) { for (auto Succ : Cg.successors(TargetId)) { @@ -413,10 +386,6 @@ class HFSortPlus { } } - if (UseShortCallCache) { - ShortCallPairCache.set(ClusterPred, ClusterSucc, Calls); - } - return Calls; } @@ -434,8 +403,8 @@ class HFSortPlus { */ double mergeGain(const Cluster *ClusterPred, const Cluster *ClusterSucc) const { - if (UseGainCache && Cache.contains(ClusterPred, ClusterSucc)) { - return Cache.get(ClusterPred, ClusterSucc); + if (UseGainCache && GainCache.contains(ClusterPred, ClusterSucc)) { + return GainCache.get(ClusterPred, ClusterSucc); } // cache misses on the 
first cluster @@ -460,7 +429,7 @@ class HFSortPlus { Gain /= std::min(ClusterPred->size(), ClusterSucc->size()); if (UseGainCache) { - Cache.set(ClusterPred, ClusterSucc, Gain); + GainCache.set(ClusterPred, ClusterSucc, Gain); } return Gain; @@ -513,7 +482,7 @@ class HFSortPlus { const double ProbOut = CallsFromPred > 0 ? CallsPredSucc / CallsFromPred : 0; assert(0.0 <= ProbOut && ProbOut <= 1.0 && "incorrect probability"); - + // probability that the second cluster is called from the first one const double ProbIn = CallsToSucc > 0 ? CallsPredSucc / CallsToSucc : 0; @@ -601,13 +570,12 @@ class HFSortPlus { */ std::vector run() { DEBUG(dbgs() << "Starting hfsort+ w/" - << cacheKindString(UseGainCache, UseShortCallCache) + << (UseGainCache ? "gain cache" : "no cache") << " for " << Clusters.size() << " clusters " << "with ITLBPageSize = " << ITLBPageSize << ", " << "ITLBEntries = " << ITLBEntries << ", " << "and MergeProbability = " << opts::MergeProbability << "\n"); - // Pass 1 runPassOne(); @@ -628,9 +596,7 @@ class HFSortPlus { return Result; } - HFSortPlus(const CallGraph &Cg, - bool UseGainCache, - bool UseShortCallCache) + HFSortPlus(const CallGraph &Cg, bool UseGainCache) : Cg(Cg), FuncCluster(Cg.numNodes(), nullptr), Addr(Cg.numNodes(), InvalidAddr), @@ -638,9 +604,7 @@ class HFSortPlus { Clusters(initializeClusters()), Adjacent(Cg, Clusters, FuncCluster), UseGainCache(UseGainCache), - UseShortCallCache(UseShortCallCache), - Cache(Clusters.size()), - ShortCallPairCache(Clusters.size()) { + GainCache(Clusters.size()) { } private: @@ -696,31 +660,16 @@ class HFSortPlus { CurAddr = ((CurAddr + Align - 1) / Align) * Align; } - // Update caches - invalidateCaches(Into); + // Invalidate all cache entries associated with cluster Into + if (UseGainCache) { + GainCache.invalidate(Into); + } // Remove cluster From from the list of active clusters auto Iter = std::remove(Clusters.begin(), Clusters.end(), From); Clusters.erase(Iter, Clusters.end()); } - /* - * 
Invalidate all cache entries associated with cluster C and its neighbors. - */ - void invalidateCaches(const Cluster *C) { - if (UseShortCallCache) { - maybeErase(ShortCallCache, C); - Adjacent.forallAdjacent(C, - [this](const Cluster *A) { - maybeErase(ShortCallCache, A); - }); - ShortCallPairCache.invalidate(Adjacent, C); - } - if (UseGainCache) { - Cache.invalidate(Adjacent, C); - } - } - // The call graph const CallGraph &Cg; @@ -746,32 +695,21 @@ class HFSortPlus { // Use cache for mergeGain results bool UseGainCache; - // Use caches for shortCalls results - bool UseShortCallCache; - // A cache that keeps precomputed values of mergeGain for pairs of clusters; // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs // containing both x and y and all clusters adjacent to x and y (and recompute // them on the next iteration). - mutable PrecomputedResults Cache; - - // Cache for shortCalls for a single cluster. - mutable std::unordered_map ShortCallCache; - - // Cache for shortCalls for a pair of Clusters - mutable PrecomputedResults ShortCallPairCache; + mutable PrecomputedResults GainCache; }; } -std::vector hfsortPlus(CallGraph &Cg, - bool UseGainCache, - bool UseShortCallCache) { +std::vector hfsortPlus(CallGraph &Cg, bool UseGainCache) { // It is required that the sum of incoming arc weights is not greater // than the number of samples for every function. // Ensuring the call graph obeys the property before running the algorithm. 
Cg.adjustArcWeights(); - return HFSortPlus(Cg, UseGainCache, UseShortCallCache).run(); + return HFSortPlus(Cg, UseGainCache).run(); } }} diff --git a/bolt/Passes/ReorderFunctions.cpp b/bolt/Passes/ReorderFunctions.cpp index 4676c1c2fa8a..bf4f178e2259 100644 --- a/bolt/Passes/ReorderFunctions.cpp +++ b/bolt/Passes/ReorderFunctions.cpp @@ -119,14 +119,6 @@ UseGainCache("hfsort+-use-cache", llvm::cl::Hidden, llvm::cl::cat(BoltOptCategory)); -static llvm::cl::opt -UseShortCallCache("hfsort+-use-short-call-cache", - llvm::cl::desc("Use a cache for shortCall results when computing hfsort+."), - llvm::cl::ZeroOrMore, - llvm::cl::init(true), - llvm::cl::Hidden, - llvm::cl::cat(BoltOptCategory)); - } // namespace opts namespace llvm { @@ -353,7 +345,7 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC, Clusters = clusterize(Cg); break; case BinaryFunction::RT_HFSORT_PLUS: - Clusters = hfsortPlus(Cg, opts::UseGainCache, opts::UseShortCallCache); + Clusters = hfsortPlus(Cg, opts::UseGainCache); break; case BinaryFunction::RT_PETTIS_HANSEN: Clusters = pettisAndHansen(Cg); From abfefb10a8657cf43fb01645107c23919f92b689 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 20 Oct 2017 12:11:34 -0700 Subject: [PATCH 344/904] [BOLT] Improve ICP for virtual method calls and jump tables using value profiling. Summary: Use value profiling data to remove the method pointer loads from vtables when doing ICP at virtual function and jump table callsites. The basic process is the following: 1. Work backwards from the callsite to find the most recent def of the call register. 2. Work back from the call register def to find the instruction where the vtable is loaded. 3. Find out if there is any value profiling data associated with the vtable load. If so, record all these addresses as potential vtables + method offsets. 4. Since the addresses extracted by #3 will be vtable + method offset, we need to figure out the method offset in order to determine the actual vtable base address. 
At this point I virtually execute all the instructions that occur between #3 and #2 that touch the method pointer register. The result of this execution should be the method offset. 5. Fetch the actual method address from the appropriate data section containing the vtable using the computed method offset. Make sure that this address maps to an actual function symbol. 6. Try to associate a vtable pointer with each target address in SymTargets. If every target has a vtable, then this is almost certainly a virtual method callsite. 7. Use the vtable address when generating the promoted call code. It's basically the same as regular ICP code except that the compare is against the vtable and not the method pointer. Additionally, the instructions to load up the method are dumped into the cold call block. For jump tables, the basic idea is the same. I use the memory profiling data to find the hottest slots in the jumptable and then use that information to compute the indices of the hottest entries. We can then compare the index register to the hot index values and avoid the load from the jump table. Note: I'm assuming the whole call is in a single BB. According to @rafaelauler, this isn't always the case on ARM. This also isn't always the case on X86 either. If there are non-trivial arguments that are passed by value, there could be branches in between the setup and the call. I'm going to leave fixing this until later since it makes things a bit more complicated. I've also fixed a bug where ICP was introducing a conditional tail call. I made sure that SCTC fixes these up afterwards. I have no idea why I made it introduce a CTC in the first place. 
(cherry picked from commit 4c2cd4b5f4c8b98c1147cf0bf4c14503fa775f7c) --- bolt/BinaryBasicBlock.h | 8 + bolt/BinaryContext.cpp | 98 +++- bolt/BinaryContext.h | 37 +- bolt/BinaryFunction.cpp | 80 ++- bolt/BinaryFunction.h | 36 +- bolt/DataAggregator.cpp | 4 +- bolt/DataReader.cpp | 3 +- bolt/DataReader.h | 3 + bolt/Passes/BinaryPasses.cpp | 14 + bolt/Passes/FrameAnalysis.cpp | 4 +- bolt/Passes/IndirectCallPromotion.cpp | 768 ++++++++++++++++++++++---- bolt/Passes/IndirectCallPromotion.h | 44 +- bolt/RewriteInstance.cpp | 63 +-- 13 files changed, 1000 insertions(+), 162 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index f42a4ceb241a..49949c9263c0 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -642,6 +642,14 @@ class BinaryBasicBlock { return Instructions.erase(II); } + /// Erase instructions in the specified range. + template + void eraseInstructions(ItrType Begin, ItrType End) { + while (End > Begin) { + eraseInstruction(*--End); + } + } + /// Erase all instructions void clear() { Instructions.clear(); diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index c11f96375b09..e4bba784961a 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -36,8 +36,28 @@ PrintDebugInfo("print-debug-info", cl::Hidden, cl::cat(BoltCategory)); +static cl::opt +PrintRelocations("print-relocations", + cl::desc("print relocations when printing functions"), + cl::Hidden, + cl::cat(BoltCategory)); + +static cl::opt +PrintMemData("print-mem-data", + cl::desc("print memory data annotations when printing functions"), + cl::Hidden, + cl::cat(BoltCategory)); + } // namespace opts +namespace llvm { +namespace bolt { +extern void check_error(std::error_code EC, StringRef Message); +} +} + +Triple::ArchType Relocation::Arch; + BinaryContext::~BinaryContext() { } MCObjectWriter *BinaryContext::createObjectWriter(raw_pwrite_stream &OS) { @@ -326,7 +346,9 @@ void BinaryContext::printInstruction(raw_ostream &OS, const MCInst 
&Instruction, uint64_t Offset, const BinaryFunction* Function, - bool printMCInst) const { + bool PrintMCInst, + bool PrintMemData, + bool PrintRelocations) const { if (MIA->isEHLabel(Instruction)) { OS << " EH_LABEL: " << *MIA->getTargetSymbol(Instruction) << '\n'; return; @@ -392,24 +414,58 @@ void BinaryContext::printInstruction(raw_ostream &OS, } } - auto *MD = Function ? DR.getFuncMemData(Function->getNames()) : nullptr; - if (MD) { - bool DidPrint = false; - for (auto &MI : MD->getMemInfoRange(Offset)) { - OS << (DidPrint ? ", " : " # Loads: "); - OS << MI.Addr << "/" << MI.Count; - DidPrint = true; + if ((opts::PrintMemData || PrintMemData) && Function) { + const auto *MD = Function->getMemData(); + const auto MemDataOffset = + MIA->tryGetAnnotationAs(Instruction, "MemDataOffset"); + if (MD && MemDataOffset) { + bool DidPrint = false; + for (auto &MI : MD->getMemInfoRange(MemDataOffset.get())) { + OS << (DidPrint ? ", " : " # Loads: "); + OS << MI.Addr << "/" << MI.Count; + DidPrint = true; + } } } + if ((opts::PrintRelocations || PrintRelocations) && Function) { + const auto Size = computeCodeSize(&Instruction, &Instruction + 1); + Function->printRelocations(OS, Offset, Size); + } + OS << "\n"; - if (printMCInst) { + if (PrintMCInst) { Instruction.dump_pretty(OS, InstPrinter.get()); OS << "\n"; } } +ErrorOr> +BinaryContext::getFunctionData(const BinaryFunction &Function) const { + auto Section = Function.getSection(); + assert(Section.getAddress() <= Function.getAddress() && + Section.getAddress() + Section.getSize() + >= Function.getAddress() + Function.getSize() && + "wrong section for function"); + + if (!Section.isText() || Section.isVirtual() || !Section.getSize()) { + return std::make_error_code(std::errc::bad_address); + } + + StringRef SectionContents; + check_error(Section.getContents(SectionContents), + "cannot get section contents"); + + assert(SectionContents.size() == Section.getSize() && + "section size mismatch"); + + // Function offset from 
the section start. + auto FunctionOffset = Function.getAddress() - Section.getAddress(); + auto *Bytes = reinterpret_cast(SectionContents.data()); + return ArrayRef(Bytes + FunctionOffset, Function.getSize()); +} + ErrorOr BinaryContext::getSectionForAddress(uint64_t Address) const{ auto SI = AllocatableSections.upper_bound(Address); if (SI != AllocatableSections.begin()) { @@ -640,3 +696,27 @@ size_t Relocation::emit(MCStreamer *Streamer) const { } return Size; } + +#define ELF_RELOC(name, value) #name, + +void Relocation::print(raw_ostream &OS) const { + static const char *X86RelocNames[] = { +#include "llvm/Support/ELFRelocs/x86_64.def" + }; + static const char *AArch64RelocNames[] = { +#include "llvm/Support/ELFRelocs/AArch64.def" + }; + if (Arch == Triple::aarch64) + OS << AArch64RelocNames[Type]; + else + OS << X86RelocNames[Type]; + OS << ", 0x" << Twine::utohexstr(Offset); + if (Symbol) { + OS << ", " << Symbol->getName(); + } + if (int64_t(Addend) < 0) + OS << ", -0x" << Twine::utohexstr(-int64_t(Addend)); + else + OS << ", 0x" << Twine::utohexstr(Addend); + OS << ", 0x" << Twine::utohexstr(Value); +} diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index b01aa0236e20..3cc4f1442738 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -55,6 +55,7 @@ class DataReader; /// Relocation class. struct Relocation { + static Triple::ArchType Arch; /// for printing, set by BinaryContext ctor. uint64_t Offset; mutable MCSymbol *Symbol; /// mutable to allow modification by emitter. uint64_t Type; @@ -78,6 +79,9 @@ struct Relocation { /// Emit relocation at a current \p Streamer' position. The caller is /// responsible for setting the position correctly. size_t emit(MCStreamer *Streamer) const; + + /// Print a relocation to \p OS. + void print(raw_ostream &OS) const; }; /// Relocation ordering by offset. 
@@ -85,6 +89,11 @@ inline bool operator<(const Relocation &A, const Relocation &B) { return A.Offset < B.Offset; } +inline raw_ostream &operator<<(raw_ostream &OS, const Relocation &Rel) { + Rel.print(OS); + return OS; +} + class BinaryContext { BinaryContext() = delete; @@ -199,7 +208,9 @@ class BinaryContext { MIA(std::move(MIA)), MRI(std::move(MRI)), DisAsm(std::move(DisAsm)), - DR(DR) {} + DR(DR) { + Relocation::Arch = this->TheTriple->getArch(); + } ~BinaryContext(); @@ -215,6 +226,15 @@ class BinaryContext { /// global symbol was registered at the location. MCSymbol *getGlobalSymbolAtAddress(uint64_t Address) const; + /// Find the address of the global symbol with the given \p Name. + /// return an error if no such symbol exists. + ErrorOr getAddressForGlobalSymbol(StringRef Name) const { + auto Itr = GlobalSymbols.find(Name); + if (Itr != GlobalSymbols.end()) + return Itr->second; + return std::make_error_code(std::errc::bad_address); + } + /// Return MCSymbol for the given \p Name or nullptr if no /// global symbol with that name exists. MCSymbol *getGlobalSymbolByName(const std::string &Name) const; @@ -222,6 +242,10 @@ class BinaryContext { /// Print the global symbol table. void printGlobalSymbols(raw_ostream& OS) const; + /// Get the raw bytes for a given function. + ErrorOr> + getFunctionData(const BinaryFunction &Function) const; + /// Return (allocatable) section containing the given \p Address. ErrorOr getSectionForAddress(uint64_t Address) const; @@ -340,7 +364,9 @@ class BinaryContext { const MCInst &Instruction, uint64_t Offset = 0, const BinaryFunction *Function = nullptr, - bool printMCInst = false) const; + bool PrintMCInst = false, + bool PrintMemData = false, + bool PrintRelocations = false) const; /// Print a range of instructions. 
template @@ -349,9 +375,12 @@ class BinaryContext { Itr End, uint64_t Offset = 0, const BinaryFunction *Function = nullptr, - bool printMCInst = false) const { + bool PrintMCInst = false, + bool PrintMemData = false, + bool PrintRelocations = false) const { while (Begin != End) { - printInstruction(OS, *Begin, Offset, Function, printMCInst); + printInstruction(OS, *Begin, Offset, Function, PrintMCInst, + PrintMemData, PrintRelocations); Offset += computeCodeSize(Begin, Begin + 1); ++Begin; } diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index cfac6193d5de..3852f7fd1415 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -30,6 +30,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Regex.h" #include #include #include @@ -137,8 +138,16 @@ PrintOnly("print-only", cl::Hidden, cl::cat(BoltCategory)); +static cl::list +PrintOnlyRegex("print-only-regex", + cl::CommaSeparated, + cl::desc("list of function regexes to print"), + cl::value_desc("func1,func2,func3,..."), + cl::Hidden, + cl::cat(BoltCategory)); + bool shouldPrint(const BinaryFunction &Function) { - if (PrintOnly.empty()) + if (PrintOnly.empty() && PrintOnlyRegex.empty()) return true; for (auto &Name : opts::PrintOnly) { @@ -147,6 +156,12 @@ bool shouldPrint(const BinaryFunction &Function) { } } + for (auto &Name : opts::PrintOnlyRegex) { + if (Function.hasNameRegex(Name)) { + return true; + } + } + return false; } @@ -160,6 +175,11 @@ constexpr unsigned BinaryFunction::MinAlign; namespace { +template +bool emptyRange(const R &Range) { + return Range.begin() == Range.end(); +} + /// Gets debug line information for the instruction located at the given /// address in the original binary. 
The SMLoc's pointer is used /// to point to this information, which is represented by a @@ -227,6 +247,14 @@ bool DynoStats::lessThan(const DynoStats &Other, uint64_t BinaryFunction::Count = 0; +bool BinaryFunction::hasNameRegex(const std::string &NameRegex) const { + Regex MatchName(NameRegex); + for (auto &Name : Names) + if (MatchName.match(Name)) + return true; + return false; +} + BinaryBasicBlock * BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) { if (Offset > Size) @@ -558,6 +586,31 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << "End of Function \"" << *this << "\"\n\n"; } +void BinaryFunction::printRelocations(raw_ostream &OS, + uint64_t Offset, + uint64_t Size) const { + const char* Sep = " # Relocs: "; + + auto RI = Relocations.lower_bound(Offset); + while (RI != Relocations.end() && RI->first < Offset + Size) { + OS << Sep << "(R: " << RI->second << ")"; + Sep = ", "; + ++RI; + } + + RI = MoveRelocations.lower_bound(Offset); + while (RI != MoveRelocations.end() && RI->first < Offset + Size) { + OS << Sep << "(M: " << RI->second << ")"; + Sep = ", "; + ++RI; + } + + auto PI = PCRelativeRelocationOffsets.lower_bound(Offset); + if (PI != PCRelativeRelocationOffsets.end() && *PI < Offset + Size) { + OS << Sep << "(pcrel)"; + } +} + IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, unsigned Size, uint64_t Offset) { @@ -566,7 +619,7 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, // An instruction referencing memory used by jump instruction (directly or // via register). This location could be an array of function pointers // in case of indirect tail call, or a jump table. - const MCInst *MemLocInstr; + MCInst *MemLocInstr; // Address of the table referenced by MemLocInstr. Could be either an // array of function pointers, or a jump table. 
@@ -834,6 +887,8 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { DWARFUnitLineTable ULT = getDWARFUnitLineTable(); + matchProfileMemData(); + // Insert a label at the beginning of the function. This will be our first // basic block. Labels[0] = Ctx->createTempSymbol("BB0", false); @@ -1181,6 +1236,10 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { findDebugLineInformationForInstructionAt(AbsoluteInstrAddr, ULT)); } + if (MemData && !emptyRange(MemData->getMemInfoRange(Offset))) { + MIA->addAnnotation(Ctx.get(), Instruction, "MemDataOffset", Offset); + } + addInstruction(Offset, std::move(Instruction)); } @@ -1892,6 +1951,23 @@ bool BinaryFunction::fetchProfileForOtherEntryPoints() { return Updated; } +void BinaryFunction::matchProfileMemData() { + const auto AllMemData = BC.DR.getFuncMemDataRegex(getNames()); + for (auto *NewMemData : AllMemData) { + // Prevent functions from sharing the same profile. + if (NewMemData->Used) + continue; + + if (MemData) + MemData->Used = false; + + // Update function profile data with the new set. + MemData = NewMemData; + MemData->Used = true; + break; + } +} + void BinaryFunction::matchProfileData() { // This functionality is available for LBR-mode only // TODO: Implement evaluateProfileData() for samples, checking whether diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 01a981c73813..c803f4c849e5 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -296,6 +296,9 @@ class BinaryFunction { /// Profile data for branches. FuncBranchData *BranchData{nullptr}; + /// Profile data for memory loads. + FuncMemData *MemData{nullptr}; + /// Profile match ratio for BranchData. float ProfileMatchRatio{0.0f}; @@ -453,7 +456,7 @@ class BinaryFunction { LabelsMapType Labels; /// Temporary holder of instructions before CFG is constructed. - /// Map offset in the function to MCInst. + /// Map offset in the function to MCInst index. 
using InstrMapType = std::map; InstrMapType InstructionOffsets; std::vector Instructions; @@ -1014,6 +1017,10 @@ class BinaryFunction { return false; } + /// Check if (possibly one out of many) function name matches the given + /// regex. + bool hasNameRegex(const std::string &NameRegex) const; + /// Return a vector of all possible names for the function. const std::vector &getNames() const { return Names; @@ -1455,6 +1462,10 @@ class BinaryFunction { void print(raw_ostream &OS, std::string Annotation = "", bool PrintInstructions = true) const; + /// Print all relocations between \p Offset and \p Offset + \p Size in + /// this function. + void printRelocations(raw_ostream &OS, uint64_t Offset, uint64_t Size) const; + /// Return true if function has a profile, even if the profile does not /// match CFG 100%. bool hasProfile() const { @@ -1821,6 +1832,10 @@ class BinaryFunction { /// blocks. void matchProfileData(); + /// Find the best matching memory data profile for a function before the + /// creation of basic blocks. + void matchProfileMemData(); + /// Check how closely the profile data matches the function and set /// Return accuracy (ranging from 0.0 to 1.0) of matching. float evaluateProfileData(const FuncBranchData &BranchData); @@ -1831,15 +1846,34 @@ class BinaryFunction { return BranchData; } + /// Return profile data associated with this function, or nullptr if the + /// function has no associated profile. FuncBranchData *getBranchData() { return BranchData; } + /// Return memory profile data associated with this function, or nullptr + /// if the function has no associated profile. + const FuncMemData *getMemData() const { + return MemData; + } + + /// Return memory profile data associated with this function, or nullptr + /// if the function has no associated profile. 
+ FuncMemData *getMemData() { + return MemData; + } + /// Updates profile data associated with this function void setBranchData(FuncBranchData *Data) { BranchData = Data; } + /// Updates the memory profile data associated with this function + void setMemData(FuncMemData *Data) { + MemData = Data; + } + /// Walks the list of basic blocks filling in missing information about /// edge frequency for fall-throughs. /// diff --git a/bolt/DataAggregator.cpp b/bolt/DataAggregator.cpp index 8912657f2305..dbce1e4ec465 100644 --- a/bolt/DataAggregator.cpp +++ b/bolt/DataAggregator.cpp @@ -821,7 +821,9 @@ std::error_code DataAggregator::parseMemEvents() { }); if (Func) { - FuncsToMemEvents[FuncName].update(FuncLoc, AddrLoc); + auto *MemData = &FuncsToMemEvents[FuncName]; + Func->setMemData(MemData); + MemData->update(FuncLoc, AddrLoc); DEBUG(dbgs() << "Mem event: " << FuncLoc << " = " << AddrLoc << "\n"); } } diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index 7a0eb57d3ef3..c58919b2a360 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -306,7 +306,8 @@ void MemInfo::print(raw_ostream &OS) const { iterator_range FuncMemData::getMemInfoRange(uint64_t Offset) const { - assert(std::is_sorted(Data.begin(), Data.end())); + // Commented out because it can be expensive. + //assert(std::is_sorted(Data.begin(), Data.end())); struct Compare { bool operator()(const MemInfo &MI, const uint64_t Val) const { return MI.Offset.Offset < Val; diff --git a/bolt/DataReader.h b/bolt/DataReader.h index a0623e61aa41..852e6f177417 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -222,6 +222,9 @@ struct FuncMemData { StringRef Name; ContainerTy Data; + /// Indicate if the data was used. + bool Used{false}; + DenseMap> EventIndex; /// Find all the memory events originating at Offset. 
diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 3ad608edab32..8dbaba51922d 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -194,6 +194,7 @@ Peepholes("peepholes", cl::desc("enable peephole optimizations"), cl::value_desc("opt1,opt2,opt3,..."), cl::values( + clEnumValN(PEEP_NONE, "none", "disable peepholes"), clEnumValN(PEEP_SHORTEN, "shorten", "perform instruction shortening"), clEnumValN(PEEP_DOUBLE_JUMPS, "double-jumps", "remove double jumps when able"), @@ -566,6 +567,13 @@ void FinalizeFunctions::runOnFunctions( auto &Function = It.second; const auto ShouldOptimize = shouldOptimize(Function); + // Strip all annotations. + for (auto &BB : Function) { + for (auto &Inst : BB) { + BC.MIA->removeAllAnnotations(Inst); + } + } + // Always fix functions in relocation mode. if (!opts::Relocs && !ShouldOptimize) continue; @@ -632,12 +640,18 @@ uint64_t fixDoubleJumps(BinaryContext &BC, // We must patch up any existing branch instructions to match up // with the new successor. auto *Ctx = BC.Ctx.get(); + assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) && + "Predecessor block has inconsistent number of successors"); if (CondBranch && BC.MIA->getTargetSymbol(*CondBranch) == BB.getLabel()) { BC.MIA->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx); } else if (UncondBranch && BC.MIA->getTargetSymbol(*UncondBranch) == BB.getLabel()) { BC.MIA->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx); + } else if (!UncondBranch) { + assert(Function.getBasicBlockAfter(Pred, false) != Succ && + "Don't add an explicit jump to a fallthrough block."); + Pred->addBranchInstruction(Succ); } } else { // Succ will be null in the tail call case. 
In this case we diff --git a/bolt/Passes/FrameAnalysis.cpp b/bolt/Passes/FrameAnalysis.cpp index 9418eecad3b6..a4e157192775 100644 --- a/bolt/Passes/FrameAnalysis.cpp +++ b/bolt/Passes/FrameAnalysis.cpp @@ -534,12 +534,12 @@ FrameAnalysis::FrameAnalysis(BinaryContext &BC, } void FrameAnalysis::printStats() { - outs() << "BOLT-INFO FRAME ANALYSIS: " << NumFunctionsNotOptimized + outs() << "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsNotOptimized << " function(s) " << format("(%.1lf%% dyn cov)", (100.0 * CountFunctionsNotOptimized / CountDenominator)) << " were not optimized.\n" - << "BOLT-INFO FRAME ANALYSIS: " << NumFunctionsFailedRestoreFI + << "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsFailedRestoreFI << " function(s) " << format("(%.1lf%% dyn cov)", (100.0 * CountFunctionsFailedRestoreFI / CountDenominator)) diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/Passes/IndirectCallPromotion.cpp index 8f3e3986cf00..25f1a678c30a 100644 --- a/bolt/Passes/IndirectCallPromotion.cpp +++ b/bolt/Passes/IndirectCallPromotion.cpp @@ -12,8 +12,10 @@ #include "IndirectCallPromotion.h" #include "DataflowInfoManager.h" #include "llvm/Support/Options.h" +#include #define DEBUG_TYPE "ICP" +#define DEBUG_VERBOSE(Level, X) if (opts::Verbosity >= (Level)) { X; } using namespace llvm; using namespace bolt; @@ -74,6 +76,52 @@ IndirectCallPromotionTopN( cl::ZeroOrMore, cl::cat(BoltOptCategory)); +static cl::opt +IndirectCallPromotionCallsTopN( + "indirect-call-promotion-calls-topn", + cl::desc("number of targets to consider when doing indirect " + "call promotion on calls"), + cl::init(0), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +IndirectCallPromotionJumpTablesTopN( + "indirect-call-promotion-jump-tables-topn", + cl::desc("number of targets to consider when doing indirect " + "call promotion on jump tables"), + cl::init(0), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +EliminateLoads( + "icp-eliminate-loads", + cl::desc("enable load 
elimination using memory profiling data when " + "performing ICP"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +ICPAlwaysOn( + "icp-always-on", + cl::desc("enable ICP for all eligible callsites"), + cl::init(false), + cl::Hidden, + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +ICPTopCallsites( + "icp-top-callsites", + cl::desc("only optimize calls that contribute to this percentage of all " + "indirect calls"), + cl::init(0), + cl::Hidden, + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + static cl::list ICPFuncsList("icp-funcs", cl::CommaSeparated, @@ -194,6 +242,11 @@ IndirectCallPromotion::getCallTargets( Targets.erase(Result, Targets.end()); } else { + // Don't try to optimize PC relative indirect calls. + if (Inst.getOperand(0).isReg() && + Inst.getOperand(0).getReg() == BC.MRI->getProgramCounter()) { + return Targets; + } const auto *BranchData = BF.getBranchData(); assert(BranchData && "expected initialized branch data"); auto Offset = BC.MIA->getAnnotationAs(Inst, "Offset"); @@ -247,37 +300,364 @@ IndirectCallPromotion::getCallTargets( return Targets; } -std::vector> +IndirectCallPromotion::JumpTableInfoType +IndirectCallPromotion::maybeGetHotJumpTableTargets( + BinaryContext &BC, + BinaryFunction &Function, + BinaryBasicBlock *BB, + MCInst &CallInst, + MCInst *&TargetFetchInst, + const BinaryFunction::JumpTable *JT, + const std::vector &Targets +) const { + const auto *MemData = Function.getMemData(); + JumpTableInfoType HotTargets; + + assert(JT && "Can't get jump table addrs for non-jump tables."); + + if (!MemData || !opts::EliminateLoads) + return JumpTableInfoType(); + + MCInst *MemLocInstr; + MCInst *PCRelBaseOut; + unsigned BaseReg, IndexReg; + int64_t DispValue; + const MCExpr *DispExpr; + MutableArrayRef Insts(&BB->front(), &CallInst); + const auto Type = BC.MIA->analyzeIndirectBranch(CallInst, + Insts, + BC.AsmInfo->getPointerSize(), + MemLocInstr, + BaseReg, + IndexReg, + DispValue, + 
DispExpr, + PCRelBaseOut); + + assert(MemLocInstr && "There should always be a load for jump tables"); + if (!MemLocInstr) + return JumpTableInfoType(); + + DEBUG({ + dbgs() << "BOLT-INFO: ICP attempting to find memory profiling data for " + << "jump table in " << Function << " at @ " + << (&CallInst - &BB->front()) << "\n" + << "BOLT-INFO: ICP target fetch instructions:\n"; + BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function); + if (MemLocInstr != &CallInst) { + BC.printInstruction(dbgs(), CallInst, 0, &Function); + } + }); + + DEBUG_VERBOSE(1, { + dbgs() << "Jmp info: Type = " << (unsigned)Type << ", " + << "BaseReg = " << BC.MRI->getName(BaseReg) << ", " + << "IndexReg = " << BC.MRI->getName(IndexReg) << ", " + << "DispValue = " << Twine::utohexstr(DispValue) << ", " + << "DispExpr = " << DispExpr << ", " + << "MemLocInstr = "; + BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function); + dbgs() << "\n"; + }); + + ++TotalIndexBasedCandidates; + + // Try to get value profiling data for the method load instruction. 
+ auto DataOffset = BC.MIA->tryGetAnnotationAs(*MemLocInstr, + "MemDataOffset"); + + if (!DataOffset) { + DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP no memory profiling data found\n"); + return JumpTableInfoType(); + } + + uint64_t ArrayStart; + if (DispExpr) { + auto SI = BC.GlobalSymbols.find(DispExpr->getSymbol().getName()); + assert(SI != BC.GlobalSymbols.end() && "global symbol needs a value"); + ArrayStart = SI->second; + } else { + ArrayStart = static_cast(DispValue); + } + + if (BaseReg == BC.MRI->getProgramCounter()) { + auto FunctionData = BC.getFunctionData(Function); + const uint64_t Address = Function.getAddress() + DataOffset.get(); + MCInst OrigJmp; + uint64_t Size; + assert(FunctionData); + auto Success = BC.DisAsm->getInstruction(OrigJmp, + Size, + *FunctionData, + Address, + nulls(), + nulls()); + assert(Success && "Must be able to disassmble original jump instruction"); + ArrayStart += Address + Size; + } + + for (const auto &MI : MemData->getMemInfoRange(DataOffset.get())) { + size_t Index; + if (MI.Addr.Offset % JT->EntrySize != 0) // ignore bogus data + continue; + + if (MI.Addr.IsSymbol) { + // Deal with bad/stale data + if (MI.Addr.Name != (std::string("JUMP_TABLEat0x") + + Twine::utohexstr(JT->Address).str()) && + MI.Addr.Name != (std::string("JUMP_TABLEat0x") + + Twine::utohexstr(ArrayStart).str())) + continue; + Index = MI.Addr.Offset / JT->EntrySize; + } else { + Index = (MI.Addr.Offset - ArrayStart) / JT->EntrySize; + } + + // If Index is out of range it probably means the memory profiling data is + // wrong for this instruction, bail out. 
+ if (Index >= JT->getSize()) + continue; + + assert(std::accumulate(Targets.begin(), + Targets.end(), + false, + [Index](bool Found, const Callsite &CS) { + return (Found || + std::find(CS.JTIndex.begin(), + CS.JTIndex.end(), + Index) != CS.JTIndex.end()); + }) && + "hot indices must be referred to by at least one callsite"); + + HotTargets.emplace_back(std::make_pair(MI.Count, Index)); + } + + // Sort with highest counts first. + std::sort(HotTargets.rbegin(), HotTargets.rend()); + + DEBUG({ + dbgs() << "BOLT-INFO: ICP jump table hot targets:\n"; + for (const auto &Target : HotTargets) { + dbgs() << "BOLT-INFO: Idx = " << Target.second << ", " + << "Count = " << Target.first << "\n"; + } + }); + + BC.MIA->getOrCreateAnnotationAs(BC.Ctx.get(), + CallInst, + "JTIndexReg") = IndexReg; + + TargetFetchInst = MemLocInstr; + + return HotTargets; +} + +IndirectCallPromotion::SymTargetsType IndirectCallPromotion::findCallTargetSymbols( BinaryContext &BC, - const std::vector &Targets, - const size_t N + std::vector &Targets, + const size_t N, + BinaryFunction &Function, + BinaryBasicBlock *BB, + MCInst &CallInst, + MCInst *&TargetFetchInst ) const { - std::vector> SymTargets; - - size_t TgtIdx = 0; - for (size_t I = 0; I < N; ++TgtIdx) { - assert(Targets[TgtIdx].To.IsSymbol && "All ICP targets must be to known symbols"); - if (Targets[TgtIdx].JTIndex.empty()) { - SymTargets.push_back(std::make_pair(Targets[TgtIdx].To.Sym, 0)); - ++I; + const auto *JT = Function.getJumpTable(CallInst); + SymTargetsType SymTargets; + + if (JT) { + std::vector NewTargets; + std::set ToDelete; + + auto findTargetSymbol = + [&](uint64_t Index, const std::vector &Targets) -> MCSymbol * { + size_t Idx = 0; + for (const auto &CS : Targets) { + assert(CS.To.IsSymbol && "All ICP targets must be to known symbols"); + assert(!CS.JTIndex.empty()); + if (std::find(CS.JTIndex.begin(), CS.JTIndex.end(), Index) != + CS.JTIndex.end()) { + ToDelete.insert(Idx); + NewTargets.push_back(CS); + // Since we know 
the hot index, delete the rest. + NewTargets.back().JTIndex.clear(); + NewTargets.back().JTIndex.push_back(Index); + return CS.To.Sym; + } + ++Idx; + } + return nullptr; + }; + + auto HotTargets = maybeGetHotJumpTableTargets(BC, + Function, + BB, + CallInst, + TargetFetchInst, + JT, + Targets); + + if (!HotTargets.empty()) { + HotTargets.resize(std::min(N, HotTargets.size())); + for (const auto &HT : HotTargets) { + auto *Sym = findTargetSymbol(HT.second, Targets); + assert(Sym); + SymTargets.push_back(std::make_pair(Sym, HT.second)); + } + for (size_t I = 0; I < Targets.size(); ++I) { + if (ToDelete.count(I) == 0) + NewTargets.push_back(Targets[I]); + } + std::swap(NewTargets, Targets); } else { - for (auto Idx : Targets[TgtIdx].JTIndex) { - SymTargets.push_back(std::make_pair(Targets[TgtIdx].To.Sym, Idx)); - ++I; + for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) { + assert(Targets[TgtIdx].To.IsSymbol && + "All ICP targets must be to known symbols"); + assert(!Targets[TgtIdx].JTIndex.empty() && + "Jump tables must have indices"); + for (auto Idx : Targets[TgtIdx].JTIndex) { + SymTargets.push_back(std::make_pair(Targets[TgtIdx].To.Sym, Idx)); + ++I; + } } } + } else { + for (size_t I = 0; I < N; ++I) { + assert(Targets[I].To.IsSymbol && + "All ICP targets must be to known symbols"); + assert(Targets[I].JTIndex.empty() && + "Can't have jump table indices for non-jump tables"); + SymTargets.push_back(std::make_pair(Targets[I].To.Sym, 0)); + } } return SymTargets; } +IndirectCallPromotion::MethodInfoType +IndirectCallPromotion::maybeGetVtableAddrs( + BinaryContext &BC, + BinaryFunction &Function, + BinaryBasicBlock *BB, + MCInst &Inst, + const SymTargetsType &SymTargets +) const { + const auto *MemData = Function.getMemData(); + std::vector VtableAddrs; + std::vector MethodFetchInsns; + unsigned VtableReg, MethodReg; + uint64_t MethodOffset; + + assert(!Function.getJumpTable(Inst) && + "Can't get vtable addrs for jump tables."); + + if (!MemData || 
!opts::EliminateLoads) + return MethodInfoType(); + + MutableArrayRef Insts(&BB->front(), &Inst + 1); + if (!BC.MIA->analyzeVirtualMethodCall(Insts, + MethodFetchInsns, + VtableReg, + MethodReg, + MethodOffset)) { + DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP unable to analyze method call in " + << Function << " at @ " << (&Inst - &BB->front()) + << "\n"); + return MethodInfoType(); + } + + ++TotalMethodLoadEliminationCandidates; + + DEBUG_VERBOSE(1, + dbgs() << "BOLT-INFO: ICP found virtual method call in " + << Function << " at @ " << (&Inst - &BB->front()) << "\n"; + dbgs() << "BOLT-INFO: ICP method fetch instructions:\n"; + for (auto *Inst : MethodFetchInsns) { + BC.printInstruction(dbgs(), *Inst, 0, &Function); + } + if (MethodFetchInsns.back() != &Inst) { + BC.printInstruction(dbgs(), Inst, 0, &Function); + } + ); + + // Try to get value profiling data for the method load instruction. + auto DataOffset = BC.MIA->tryGetAnnotationAs(*MethodFetchInsns.back(), + "MemDataOffset"); + + if (!DataOffset) { + DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP no memory profiling data found\n"); + return MethodInfoType(); + } + + // Find the vtable that each method belongs to. + std::map MethodToVtable; + + for (auto &MI : MemData->getMemInfoRange(DataOffset.get())) { + ErrorOr Address = MI.Addr.IsSymbol + ? BC.getAddressForGlobalSymbol(MI.Addr.Name) + : MI.Addr.Offset; + + // Ignore bogus data. 
+ if (!Address) + continue; + + if (MI.Addr.IsSymbol) + Address = Address.get() + MI.Addr.Offset; + + const auto VtableBase = Address.get() - MethodOffset; + + DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP vtable = " + << Twine::utohexstr(VtableBase) + << "+" << MethodOffset << "/" << MI.Count + << "\n"); + + if (auto MethodAddr = BC.extractPointerAtAddress(Address.get())) { + auto *MethodSym = BC.getGlobalSymbolAtAddress(MethodAddr.get()); + MethodToVtable[MethodSym] = VtableBase; + DEBUG_VERBOSE(1, + const auto *Method = BC.getFunctionForSymbol(MethodSym); + dbgs() << "BOLT-INFO: ICP found method = " + << Twine::utohexstr(MethodAddr.get()) << "/" + << (Method ? Method->getPrintName() : "") << "\n"; + ); + } + } + + // Find the vtable for each target symbol. + for (size_t I = 0; I < SymTargets.size(); ++I) { + auto Itr = MethodToVtable.find(SymTargets[I].first); + if (Itr != MethodToVtable.end()) { + VtableAddrs.push_back(Itr->second); + } else { + // Give up if we can't find the vtable for a method. 
+ DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP can't find vtable for " + << SymTargets[I].first->getName() << "\n"); + return MethodInfoType(); + } + } + + // Make sure the vtable reg is not clobbered by the argument passing code + if (VtableReg != MethodReg) { + for (auto *CurInst = MethodFetchInsns.front(); CurInst < &Inst; ++CurInst) { + const auto &InstrInfo = BC.MII->get(CurInst->getOpcode()); + if (InstrInfo.hasDefOfPhysReg(*CurInst, VtableReg, *BC.MRI)) { + return MethodInfoType(); + } + } + } + + return MethodInfoType(VtableAddrs, MethodFetchInsns); +} + std::vector> -IndirectCallPromotion::rewriteCall(BinaryContext &BC, - BinaryFunction &Function, - BinaryBasicBlock *IndCallBlock, - const MCInst &CallInst, - MCInstrAnalysis::ICPdata &&ICPcode) const { +IndirectCallPromotion::rewriteCall( + BinaryContext &BC, + BinaryFunction &Function, + BinaryBasicBlock *IndCallBlock, + const MCInst &CallInst, + MCInstrAnalysis::ICPdata &&ICPcode, + const std::vector &MethodFetchInsns +) const { // Create new basic blocks with correct code in each one first. std::vector> NewBBs; const bool IsTailCallOrJT = (BC.MIA->isTailCall(CallInst) || @@ -289,7 +669,7 @@ IndirectCallPromotion::rewriteCall(BinaryContext &BC, // Remember any pseudo instructions following a tail call. These // must be preserved and moved to the original block. 
std::vector TailInsts; - const auto *TailInst= &CallInst; + const auto *TailInst = &CallInst; if (IsTailCallOrJT) { while (TailInst + 1 < &(*IndCallBlock->end()) && BC.MII->get((TailInst + 1)->getOpcode()).isPseudo()) { @@ -299,7 +679,16 @@ IndirectCallPromotion::rewriteCall(BinaryContext &BC, auto MovedInst = IndCallBlock->splitInstructions(&CallInst); - IndCallBlock->replaceInstruction(&CallInst, ICPcode.front().second); + IndCallBlock->eraseInstructions(MethodFetchInsns.begin(), + MethodFetchInsns.end()); + if (IndCallBlock->empty() || + (!MethodFetchInsns.empty() && MethodFetchInsns.back() == &CallInst)) { + IndCallBlock->addInstructions(ICPcode.front().second.begin(), + ICPcode.front().second.end()); + } else { + IndCallBlock->replaceInstruction(&IndCallBlock->back(), + ICPcode.front().second); + } IndCallBlock->addInstructions(TailInsts.begin(), TailInsts.end()); for (auto Itr = ICPcode.begin() + 1; Itr != ICPcode.end(); ++Itr) { @@ -319,8 +708,6 @@ IndirectCallPromotion::rewriteCall(BinaryContext &BC, // the merge block. 
if (!IsTailCallOrJT) { NewBBs.back()->addInstructions(MovedInst.begin(), MovedInst.end()); - } else { - // assert(MovedInst.empty()); empty or just CFI } return NewBBs; @@ -381,7 +768,8 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( std::vector BBI; for (auto Itr = Targets.begin(); Itr != Targets.end(); ++Itr) { const auto BranchPct = (double)Itr->Branches / TotalIndirectBranches; - const auto MispredPct = (double)Itr->Mispreds / TotalIndirectMispreds; + const auto MispredPct = + (double)Itr->Mispreds / std::max(TotalIndirectMispreds, 1ul); if (Itr->JTIndex.empty()) { BBI.push_back(BinaryBranchInfo{uint64_t(TotalCount * BranchPct), uint64_t(TotalMispreds * MispredPct)}); @@ -402,10 +790,8 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( ++BI; }; - if (IsTailCall || IsJumpTable) { - if (IsJumpTable) { - moveSuccessors(IndCallBlock, NewBBs.back().get()); - } + if (IsJumpTable) { + moveSuccessors(IndCallBlock, NewBBs.back().get()); std::vector SymTargets; for (size_t I = 0; I < Targets.size(); ++I) { @@ -421,10 +807,8 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( // Fix up successors and execution counts. 
updateCurrentBranchInfo(); - if (IsJumpTable) { - auto *Succ = Function.getBasicBlockForLabel(SymTargets[0]); - IndCallBlock->addSuccessor(Succ, BBI[0]); // cond branch - } + auto *Succ = Function.getBasicBlockForLabel(SymTargets[0]); + IndCallBlock->addSuccessor(Succ, BBI[0]); // cond branch IndCallBlock->addSuccessor(NewBBs[0].get(), TotalCount); // fallthru branch for (size_t I = 0; I < NewBBs.size() - 1; ++I) { @@ -432,39 +816,40 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( TotalCount <= uint64_t(TotalIndirectBranches)); uint64_t ExecCount = BBI[I+1].Count; updateCurrentBranchInfo(); - if (IsJumpTable) { - auto *Succ = Function.getBasicBlockForLabel(SymTargets[I+1]); - NewBBs[I]->addSuccessor(Succ, BBI[I+1]); - } + auto *Succ = Function.getBasicBlockForLabel(SymTargets[I+1]); + NewBBs[I]->addSuccessor(Succ, BBI[I+1]); NewBBs[I]->addSuccessor(NewBBs[I+1].get(), TotalCount); // fallthru ExecCount += TotalCount; NewBBs[I]->setCanOutline(IndCallBlock->canOutline()); NewBBs[I]->setIsCold(IndCallBlock->isCold()); NewBBs[I]->setExecutionCount(ExecCount); } - } else { assert(NewBBs.size() >= 2); assert(NewBBs.size() % 2 == 1 || IndCallBlock->succ_empty()); - assert(NewBBs.size() % 2 == 1); - - MergeBlock = NewBBs.back().get(); + assert(NewBBs.size() % 2 == 1 || IsTailCall); - moveSuccessors(IndCallBlock, MergeBlock); + if (!IsTailCall) { + MergeBlock = NewBBs.back().get(); + moveSuccessors(IndCallBlock, MergeBlock); + } // Fix up successors and execution counts. updateCurrentBranchInfo(); IndCallBlock->addSuccessor(NewBBs[1].get(), TotalCount); // cond branch IndCallBlock->addSuccessor(NewBBs[0].get(), BBI[0]); // uncond branch - for (size_t I = 0; I < NewBBs.size() - 2; ++I) { + const size_t Adj = IsTailCall ? 
1 : 2; + for (size_t I = 0; I < NewBBs.size() - Adj; ++I) { assert(TotalCount <= IndCallBlock->getExecutionCount() || TotalCount <= uint64_t(TotalIndirectBranches)); uint64_t ExecCount = BBI[(I+1)/2].Count; NewBBs[I]->setCanOutline(IndCallBlock->canOutline()); NewBBs[I]->setIsCold(IndCallBlock->isCold()); if (I % 2 == 0) { - NewBBs[I]->addSuccessor(MergeBlock, BBI[(I+1)/2].Count); // uncond + if (MergeBlock) { + NewBBs[I]->addSuccessor(MergeBlock, BBI[(I+1)/2].Count); // uncond + } } else { assert(I + 2 < NewBBs.size()); updateCurrentBranchInfo(); @@ -475,16 +860,18 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( NewBBs[I]->setExecutionCount(ExecCount); } - // Arrange for the MergeBlock to be the fallthrough for the first - // promoted call block. - MergeBlock->setCanOutline(IndCallBlock->canOutline()); - MergeBlock->setIsCold(IndCallBlock->isCold()); - std::unique_ptr MBPtr; - std::swap(MBPtr, NewBBs.back()); - NewBBs.pop_back(); - NewBBs.emplace(NewBBs.begin() + 1, std::move(MBPtr)); - // TODO: is COUNT_FALLTHROUGH_EDGE the right thing here? - NewBBs.back()->addSuccessor(MergeBlock, TotalCount); // uncond branch + if (MergeBlock) { + // Arrange for the MergeBlock to be the fallthrough for the first + // promoted call block. + MergeBlock->setCanOutline(IndCallBlock->canOutline()); + MergeBlock->setIsCold(IndCallBlock->isCold()); + std::unique_ptr MBPtr; + std::swap(MBPtr, NewBBs.back()); + NewBBs.pop_back(); + NewBBs.emplace(NewBBs.begin() + 1, std::move(MBPtr)); + // TODO: is COUNT_FALLTHROUGH_EDGE the right thing here? + NewBBs.back()->addSuccessor(MergeBlock, TotalCount); // uncond branch + } } // cold call block @@ -507,7 +894,7 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB, uint64_t NumCalls) { const bool IsJumpTable = BB->getFunction()->getJumpTable(Inst); - // If we have no targets (or no calls), skip this callsite. + // If we have no targets (or no calls), skip this callsite. 
if (Targets.empty() || !NumCalls) { if (opts::Verbosity >= 1) { const auto InstIdx = &Inst - &(*BB->begin()); @@ -519,15 +906,21 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB, return 0; } - const auto TrialN = std::min(size_t(opts::IndirectCallPromotionTopN), - Targets.size()); + size_t TopN = opts::IndirectCallPromotionTopN; + if (IsJumpTable) { + if (opts::IndirectCallPromotionJumpTablesTopN != 0) + TopN = opts::IndirectCallPromotionJumpTablesTopN; + } else if (opts::IndirectCallPromotionCallsTopN != 0) { + TopN = opts::IndirectCallPromotionCallsTopN; + } + const auto TrialN = std::min(TopN, Targets.size()); - if (!opts::ICPFuncsList.empty()) { - for (auto &Name : opts::ICPFuncsList) { - if (BB->getFunction()->hasName(Name)) - return TrialN; - } - return 0; + if (opts::ICPAlwaysOn) + return TrialN; + + if (opts::ICPTopCallsites > 0) { + auto &BC = BB->getFunction()->getBinaryContext(); + return BC.MIA->hasAnnotation(Inst, "DoICP") ? TrialN : 0; } // Pick the top N targets. @@ -606,29 +999,37 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB, // Don't check misprediction frequency for jump tables -- we don't really // care as long as we are saving loads from the jump table. - if (IsJumpTable && !opts::ICPJumpTablesByTarget) - return N; - - // Compute the misprediction frequency of the top N call targets. If - // this frequency is less than the threshold, we should skip ICP at - // this callsite. 
- const double TopNMispredictFrequency = - (100.0 * TotalMispredictsTopN) / NumCalls; - - if (TopNMispredictFrequency < - opts::IndirectCallPromotionMispredictThreshold) { - if (opts::Verbosity >= 1) { - const auto InstIdx = &Inst - &(*BB->begin()); - outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ " - << InstIdx << " in " << BB->getName() << ", calls = " - << NumCalls << ", top N mispredict frequency " - << format("%.1f", TopNMispredictFrequency) << "% < " - << opts::IndirectCallPromotionMispredictThreshold << "%\n"; + if (!IsJumpTable || opts::ICPJumpTablesByTarget) { + // Compute the misprediction frequency of the top N call targets. If + // this frequency is less than the threshold, we should skip ICP at + // this callsite. + const double TopNMispredictFrequency = + (100.0 * TotalMispredictsTopN) / NumCalls; + + if (TopNMispredictFrequency < + opts::IndirectCallPromotionMispredictThreshold) { + if (opts::Verbosity >= 1) { + const auto InstIdx = &Inst - &(*BB->begin()); + outs() << "BOLT-INFO: ICP failed in " << *BB->getFunction() << " @ " + << InstIdx << " in " << BB->getName() << ", calls = " + << NumCalls << ", top N mispredict frequency " + << format("%.1f", TopNMispredictFrequency) << "% < " + << opts::IndirectCallPromotionMispredictThreshold << "%\n"; + } + return 0; } - return 0; } } + // Filter functions that can have ICP applied (for debugging) + if (!opts::ICPFuncsList.empty()) { + for (auto &Name : opts::ICPFuncsList) { + if (BB->getFunction()->hasName(Name)) + return N; + } + return 0; + } + return N; } @@ -662,6 +1063,11 @@ IndirectCallPromotion::printCallsiteInfo(const BinaryBasicBlock *BB, << ", mispreds = " << Targets[I].Mispreds << ", taken freq = " << format("%.1f", Frequency) << "%" << ", mis. freq = " << format("%.1f", MisFrequency) << "%"; + bool First = true; + for (auto JTIndex : Targets[I].JTIndex) { + outs() << (First ? 
", indices = " : ", ") << JTIndex; + First = false; + } } outs() << "\n"; @@ -679,6 +1085,13 @@ void IndirectCallPromotion::runOnFunctions( if (opts::IndirectCallPromotion == ICP_NONE) return; + const bool OptimizeCalls = + (opts::IndirectCallPromotion == ICP_CALLS || + opts::IndirectCallPromotion == ICP_ALL); + const bool OptimizeJumpTables = + (opts::IndirectCallPromotion == ICP_JUMP_TABLES || + opts::IndirectCallPromotion == ICP_ALL); + std::unique_ptr RA; std::unique_ptr CG; if (opts::IndirectCallPromotion >= ICP_JUMP_TABLES) { @@ -686,6 +1099,111 @@ void IndirectCallPromotion::runOnFunctions( RA.reset(new RegAnalysis(BC, BFs, *CG)); } + DEBUG_VERBOSE(2, { + for (auto &BFIt : BFs) { + auto &Function = BFIt.second; + const auto *MemData = Function.getMemData(); + bool DidPrintFunc = false; + uint64_t Offset = 0; + + if (!MemData || !Function.isSimple() || !opts::shouldProcess(Function)) + continue; + + for (auto &BB : Function) { + bool PrintBB = false; + for (auto &Inst : BB) { + if (auto Mem = + BC.MIA->tryGetAnnotationAs(Inst, "MemDataOffset")) { + for (auto &MI : MemData->getMemInfoRange(Mem.get())) { + if (MI.Addr.IsSymbol) { + PrintBB = true; + break; + } + if (auto Section = BC.getSectionForAddress(MI.Addr.Offset)) { + PrintBB = true; + break; + } + } + } + } + if (PrintBB && !DidPrintFunc) { + dbgs() << "\nNon-heap/stack memory data found in " + << Function << ":\n"; + DidPrintFunc = true; + } + Offset = BC.printInstructions(PrintBB ? dbgs() : nulls(), + BB.begin(), + BB.end(), + Offset, + &Function); + } + } + }); + + // If icp-top-callsites is enabled, compute the total number of indirect + // calls and then optimize the hottest callsites that contribute to that + // total. + if (opts::ICPTopCallsites > 0) { + using IndirectCallsite = std::pair; + std::vector IndirectCalls; + size_t TotalIndirectCalls = 0; + + // Find all the indirect callsites. 
+ for (auto &BFIt : BFs) { + auto &Function = BFIt.second; + + if (!Function.isSimple() || + !opts::shouldProcess(Function) || + !Function.getBranchData()) + continue; + + const bool HasLayout = !Function.layout_empty(); + + for (auto &BB : Function) { + if (HasLayout && Function.isSplit() && BB.isCold()) + continue; + + for (auto &Inst : BB) { + if ((BC.MIA->isIndirectCall(Inst) && OptimizeCalls) || + (Function.getJumpTable(Inst) && OptimizeJumpTables)) { + IndirectCalls.push_back(std::make_pair(&BB, &Inst)); + TotalIndirectCalls += BB.getKnownExecutionCount(); + } + } + } + } + + // Sort callsites by execution count. + std::sort(IndirectCalls.begin(), + IndirectCalls.end(), + [](const IndirectCallsite &A, const IndirectCallsite &B) { + const auto CountA = A.first->getKnownExecutionCount(); + const auto CountB = B.first->getKnownExecutionCount(); + return CountA > CountB; + }); + + // Find callsites that contribute to the top "opts::ICPTopCallsites"% + // number of calls. + const float TopPerc = opts::ICPTopCallsites / 100.0f; + int64_t MaxCalls = TotalIndirectCalls * TopPerc; + size_t Num = 0; + for (auto &IC : IndirectCalls) { + if (MaxCalls <= 0) + break; + MaxCalls -= IC.first->getKnownExecutionCount(); + ++Num; + } + outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls + << ", " << Num << " calls cover " << opts::ICPTopCallsites << "% " + << "of all indirect calls\n"; + + // Mark sites to optimize with "DoICP" annotation. 
+ for (size_t I = 0; I < Num; ++I) { + auto &Inst = *IndirectCalls[I].second; + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "DoICP", true); + } + } + for (auto &BFIt : BFs) { auto &Function = BFIt.second; @@ -728,12 +1246,6 @@ void IndirectCallPromotion::runOnFunctions( const bool HasBranchData = Function.getBranchData() && BC.MIA->hasAnnotation(Inst, "Offset"); const bool IsJumpTable = Function.getJumpTable(Inst); - const bool OptimizeCalls = - (opts::IndirectCallPromotion == ICP_CALLS || - opts::IndirectCallPromotion == ICP_ALL); - const bool OptimizeJumpTables = - (opts::IndirectCallPromotion == ICP_JUMP_TABLES || - opts::IndirectCallPromotion == ICP_ALL); if (!((HasBranchData && !IsJumpTable && OptimizeCalls) || (IsJumpTable && OptimizeJumpTables))) @@ -750,7 +1262,7 @@ void IndirectCallPromotion::runOnFunctions( else ++TotalIndirectCallsites; - const auto Targets = getCallTargets(Function, Inst); + auto Targets = getCallTargets(Function, Inst); // Compute the total number of calls from this particular callsite. uint64_t NumCalls = 0; @@ -764,15 +1276,18 @@ void IndirectCallPromotion::runOnFunctions( // If FLAGS regs is alive after this jmp site, do not try // promoting because we will clobber FLAGS. - if (IsJumpTable && (*Info.getLivenessAnalysis().getStateBefore( - Inst))[BC.MIA->getFlagsReg()]) { - if (opts::Verbosity >= 1) { - outs() << "BOLT-INFO: ICP failed in " << Function << " @ " - << InstIdx << " in " << BB->getName() - << ", calls = " << NumCalls - << ", cannot clobber flags reg.\n"; + if (IsJumpTable) { + auto State = Info.getLivenessAnalysis().getStateBefore(Inst); + if (!State || (State && (*State)[BC.MIA->getFlagsReg()])) { + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: ICP failed in " << Function << " @ " + << InstIdx << " in " << BB->getName() + << ", calls = " << NumCalls + << (State ? ", cannot clobber flags reg.\n" + : ", no liveness data available.\n"); + } + continue; } - continue; } // Should this callsite be optimized? 
Return the number of targets @@ -788,9 +1303,17 @@ void IndirectCallPromotion::runOnFunctions( } // Find MCSymbols or absolute addresses for each call target. - const auto SymTargets = findCallTargetSymbols(BC, Targets, N); + MCInst *TargetFetchInst = nullptr; + const auto SymTargets = findCallTargetSymbols(BC, + Targets, + N, + Function, + BB, + Inst, + TargetFetchInst); // If we can't resolve any of the target symbols, punt on this callsite. + // TODO: can this ever happen? if (SymTargets.size() < N) { const auto LastTarget = SymTargets.size(); if (opts::Verbosity >= 1) { @@ -803,12 +1326,33 @@ void IndirectCallPromotion::runOnFunctions( continue; } + MethodInfoType MethodInfo; + + if (!IsJumpTable) { + MethodInfo = maybeGetVtableAddrs(BC, + Function, + BB, + Inst, + SymTargets); + TotalMethodLoadsEliminated += MethodInfo.first.empty() ? 0 : 1; + DEBUG(dbgs() << "BOLT-INFO: ICP " + << (!MethodInfo.first.empty() ? "found" : "did not find") + << " vtables for all methods.\n"); + } else if (TargetFetchInst) { + ++TotalIndexBasedJumps; + MethodInfo.second.push_back(TargetFetchInst); + } + // Generate new promoted call code for this callsite. auto ICPcode = (IsJumpTable && !opts::ICPJumpTablesByTarget) - ? BC.MIA->jumpTablePromotion(Inst, SymTargets, BC.Ctx.get()) + ? BC.MIA->jumpTablePromotion(Inst, + SymTargets, + MethodInfo.second, + BC.Ctx.get()) : BC.MIA->indirectCallPromotion( - Inst, SymTargets, opts::ICPOldCodeSequence, BC.Ctx.get()); + Inst, SymTargets, MethodInfo.first, MethodInfo.second, + opts::ICPOldCodeSequence, BC.Ctx.get()); if (ICPcode.empty()) { if (opts::Verbosity >= 1) { @@ -836,7 +1380,12 @@ void IndirectCallPromotion::runOnFunctions( }); // Rewrite the CFG with the newly generated ICP code. - auto NewBBs = rewriteCall(BC, Function, BB, Inst, std::move(ICPcode)); + auto NewBBs = rewriteCall(BC, + Function, + BB, + Inst, + std::move(ICPcode), + MethodInfo.second); // Fix the CFG after inserting the new basic blocks. 
auto MergeBlock = fixCFG(BC, Function, BB, IsTailCall, IsJumpTable, @@ -889,14 +1438,31 @@ void IndirectCallPromotion::runOnFunctions( << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) / std::max(TotalIndirectCallsites, 1ul)) << "%\n" + << "BOLT-INFO: ICP number of method load elimination candidates = " + << TotalMethodLoadEliminationCandidates + << "\n" + << "BOLT-INFO: ICP percentage of method calls candidates that have " + "loads eliminated = " + << format("%.1f", (100.0 * TotalMethodLoadsEliminated) / + std::max(TotalMethodLoadEliminationCandidates, 1ul)) + << "%\n" << "BOLT-INFO: ICP percentage of indirect branches that are " "optimized = " << format("%.1f", (100.0 * TotalNumFrequentJmps) / std::max(TotalIndirectJmps, 1ul)) << "%\n" - << "BOLT-INFO: ICP percentage of jump table callsites that are optimized = " + << "BOLT-INFO: ICP percentage of jump table callsites that are " + << "optimized = " << format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) / std::max(TotalJumpTableCallsites, 1ul)) + << "%\n" + << "BOLT-INFO: ICP number of jump table callsites that can use hot " + << "indices = " << TotalIndexBasedCandidates + << "\n" + << "BOLT-INFO: ICP percentage of jump table callsites that use hot " + "indices = " + << format("%.1f", (100.0 * TotalIndexBasedJumps) / + std::max(TotalIndexBasedCandidates, 1ul)) << "%\n"; } diff --git a/bolt/Passes/IndirectCallPromotion.h b/bolt/Passes/IndirectCallPromotion.h index 43dc6183f00d..cd49933fbe30 100644 --- a/bolt/Passes/IndirectCallPromotion.h +++ b/bolt/Passes/IndirectCallPromotion.h @@ -99,6 +99,9 @@ namespace bolt { /// class IndirectCallPromotion : public BinaryFunctionPass { using BasicBlocksVector = std::vector>; + using MethodInfoType = std::pair, std::vector>; + using JumpTableInfoType = std::vector>; + using SymTargetsType = std::vector>; struct Location { bool IsSymbol{false}; MCSymbol *Sym{nullptr}; @@ -153,6 +156,12 @@ class IndirectCallPromotion : public BinaryFunctionPass { // (a fraction 
of TotalIndirectCallsites) uint64_t TotalOptimizedIndirectCallsites{0}; + // Total number of method callsites that can have loads eliminated. + mutable uint64_t TotalMethodLoadEliminationCandidates{0}; + + // Total number of method callsites that had loads eliminated. + uint64_t TotalMethodLoadsEliminated{0}; + // Total number of jump table callsites that are optimized by ICP. uint64_t TotalOptimizedJumpTableCallsites{0}; @@ -164,6 +173,12 @@ class IndirectCallPromotion : public BinaryFunctionPass { // (a fraction of TotalCalls) uint64_t TotalNumFrequentJmps{0}; + // Total number of jump table sites that can use hot indices. + mutable uint64_t TotalIndexBasedCandidates{0}; + + // Total number of jump table sites that use hot indices. + uint64_t TotalIndexBasedJumps{0}; + std::vector getCallTargets(BinaryFunction &BF, const MCInst &Inst) const; @@ -178,17 +193,36 @@ class IndirectCallPromotion : public BinaryFunctionPass { const size_t N, uint64_t NumCalls) const; - std::vector> - findCallTargetSymbols(BinaryContext &BC, - const std::vector &Targets, - const size_t N) const; + JumpTableInfoType + maybeGetHotJumpTableTargets(BinaryContext &BC, + BinaryFunction &Function, + BinaryBasicBlock *BB, + MCInst &Inst, + MCInst *&TargetFetchInst, + const BinaryFunction::JumpTable *JT, + const std::vector &Targets) const; + + SymTargetsType findCallTargetSymbols(BinaryContext &BC, + std::vector &Targets, + const size_t N, + BinaryFunction &Function, + BinaryBasicBlock *BB, + MCInst &Inst, + MCInst *&TargetFetchInst) const; + + MethodInfoType maybeGetVtableAddrs(BinaryContext &BC, + BinaryFunction &Function, + BinaryBasicBlock *BB, + MCInst &Inst, + const SymTargetsType &SymTargets) const; std::vector> rewriteCall(BinaryContext &BC, BinaryFunction &Function, BinaryBasicBlock *IndCallBlock, const MCInst &CallInst, - MCInstrAnalysis::ICPdata &&ICPcode) const; + MCInstrAnalysis::ICPdata &&ICPcode, + const std::vector &MethodFetchInsns) const; BinaryBasicBlock *fixCFG(BinaryContext 
&BC, BinaryFunction &Function, diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 8d67ac160443..fd6c148702da 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -319,8 +319,12 @@ IgnoreBuildID("ignore-build-id", // Check against lists of functions from options if we should // optimize the function with a given name. bool shouldProcess(const BinaryFunction &Function) { - if (opts::MaxFunctions && Function.getFunctionNumber() > opts::MaxFunctions) - return false; + if (opts::MaxFunctions && Function.getFunctionNumber() >= opts::MaxFunctions) { + if (Function.getFunctionNumber() == opts::MaxFunctions) + dbgs() << "BOLT-INFO: processing ending on " << Function << "\n"; + else + return false; + } auto populateFunctionNames = [](cl::opt &FunctionNamesFile, cl::list &FunctionNames) { @@ -400,21 +404,22 @@ const std::string RewriteInstance::BOLTSecPrefix = ".bolt"; namespace llvm { namespace bolt { extern const char *BoltRevision; -} -} -static void report_error(StringRef Message, std::error_code EC) { +void report_error(StringRef Message, std::error_code EC) { assert(EC); errs() << "BOLT-ERROR: '" << Message << "': " << EC.message() << ".\n"; exit(1); } -static void check_error(std::error_code EC, StringRef Message) { +void check_error(std::error_code EC, StringRef Message) { if (!EC) return; report_error(Message, EC); } +} +} + uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size, unsigned Alignment, unsigned SectionID, @@ -1900,12 +1905,15 @@ void RewriteInstance::readProfileData() { for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; - auto *FuncData = BC->DR.getFuncBranchData(Function.getNames()); - if (!FuncData) - continue; - Function.BranchData = FuncData; - Function.ExecutionCount = FuncData->ExecutionCount; - FuncData->Used = true; + if (auto *MemData = BC->DR.getFuncMemData(Function.getNames())) { + Function.MemData = MemData; + MemData->Used = true; + } + if (auto *FuncData = 
BC->DR.getFuncBranchData(Function.getNames())) { + Function.BranchData = FuncData; + Function.ExecutionCount = FuncData->ExecutionCount; + FuncData->Used = true; + } } } @@ -1923,12 +1931,9 @@ void RewriteInstance::disassembleFunctions() { continue; } - SectionRef Section = Function.getSection(); - assert(Section.getAddress() <= Function.getAddress() && - Section.getAddress() + Section.getSize() - >= Function.getAddress() + Function.getSize() && - "wrong section for function"); - if (!Section.isText() || Section.isVirtual() || !Section.getSize()) { + auto FunctionData = BC->getFunctionData(Function); + + if (!FunctionData) { // When could it happen? errs() << "BOLT-ERROR: corresponding section is non-executable or " << "empty for function " << Function << '\n'; @@ -1941,26 +1946,12 @@ void RewriteInstance::disassembleFunctions() { continue; } - StringRef SectionContents; - check_error(Section.getContents(SectionContents), - "cannot get section contents"); - - assert(SectionContents.size() == Section.getSize() && - "section size mismatch"); - - // Function offset from the section start. - auto FunctionOffset = Function.getAddress() - Section.getAddress(); - // Offset of the function in the file. - Function.setFileOffset( - SectionContents.data() - InputFile->getData().data() + FunctionOffset); - - ArrayRef FunctionData( - reinterpret_cast - (SectionContents.data()) + FunctionOffset, - Function.getSize()); + auto *FileBegin = + reinterpret_cast(InputFile->getData().data()); + Function.setFileOffset(FunctionData->begin() - FileBegin); - Function.disassemble(FunctionData); + Function.disassemble(*FunctionData); if (!Function.isSimple() && opts::Relocs) { errs() << "BOLT-ERROR: function " << Function << " cannot be properly " From 441d8b19eb64ff3768665d8078edd042188331c3 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 4 Nov 2017 19:22:05 -0700 Subject: [PATCH 345/904] [RFC] [BOLT] Use iterators for MC branch/call analysis code. 
Summary: Here's an implementation of an abstract instruction iterator for the branch/call analysis code in MCInstrAnalysis. I'm posting it up to see what you guys think. It's a bit sloppy with constness and probably needs more tidying up. (cherry picked from commit e2dbb77180cd4284eb5e42a1aec8ff0360491e80) --- bolt/BinaryBasicBlock.cpp | 7 ++- bolt/BinaryFunction.cpp | 84 +++++++++++++-------------- bolt/BinaryFunction.h | 26 ++++----- bolt/Exceptions.cpp | 8 +-- bolt/Passes/IndirectCallPromotion.cpp | 6 +- bolt/Passes/Inliner.cpp | 4 +- 6 files changed, 66 insertions(+), 69 deletions(-) diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 9ea9cf11bd1d..b3d9328f6a24 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -313,7 +313,12 @@ bool BinaryBasicBlock::analyzeBranch(const MCSymbol *&TBB, MCInst *&CondBranch, MCInst *&UncondBranch) { auto &MIA = Function->getBinaryContext().MIA; - return MIA->analyzeBranch(Instructions, TBB, FBB, CondBranch, UncondBranch); + return MIA->analyzeBranch(Instructions.begin(), + Instructions.end(), + TBB, + FBB, + CondBranch, + UncondBranch); } MCInst *BinaryBasicBlock::getTerminatorBefore(MCInst *Pos) { diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 3852f7fd1415..7d89b3ee4a19 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -438,9 +438,9 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, // Offset of the instruction in function. uint64_t Offset{0}; - if (BasicBlocks.empty() && !InstructionOffsets.empty()) { + if (BasicBlocks.empty() && !Instructions.empty()) { // Print before CFG was built. - for (const auto &II : InstructionOffsets) { + for (const auto &II : Instructions) { Offset = II.first; // Print label if exists at this offset. 
@@ -448,7 +448,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, if (LI != Labels.end()) OS << LI->second->getName() << ":\n"; - BC.printInstruction(OS, Instructions[II.second], Offset, this); + BC.printInstruction(OS, II.second, Offset, this); } } @@ -634,7 +634,8 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, MCInst *PCRelBaseInstr; uint64_t PCRelAddr = 0; - MutableArrayRef BB = Instructions; + auto Begin = Instructions.begin(); + auto End = Instructions.end(); if (BC.TheTriple->getArch() == llvm::Triple::aarch64) { PreserveNops = opts::Relocs; @@ -642,16 +643,17 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, // This is a heuristic, since the full set of labels have yet to be // determined for (auto LI = Labels.rbegin(); LI != Labels.rend(); ++LI) { - auto II = InstructionOffsets.find(LI->first); - if (II != InstructionOffsets.end()) { - BB = BB.slice(II->second); + auto II = Instructions.find(LI->first); + if (II != Instructions.end()) { + Begin = II; break; } } } auto Type = BC.MIA->analyzeIndirectBranch(Instruction, - BB, + Begin, + End, PtrSize, MemLocInstr, BaseRegNum, @@ -681,9 +683,8 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, } } uint64_t InstrAddr = 0; - for (auto II = InstructionOffsets.rbegin(); II != InstructionOffsets.rend(); - ++II) { - if (&Instructions[II->second] == PCRelBaseInstr) { + for (auto II = Instructions.rbegin(); II != Instructions.rend(); ++II) { + if (&II->second == PCRelBaseInstr) { InstrAddr = II->first + getAddress(); break; } @@ -1473,10 +1474,9 @@ bool BinaryFunction::buildCFG() { } }; - for (auto I = InstructionOffsets.begin(), - E = InstructionOffsets.end(); I != E; ++I) { + for (auto I = Instructions.begin(), E = Instructions.end(); I != E; ++I) { const auto Offset = I->first; - const auto &Instr = Instructions[I->second]; + const auto &Instr = I->second; auto LI = Labels.find(Offset); if (LI != 
Labels.end()) { @@ -1621,9 +1621,8 @@ bool BinaryFunction::buildCFG() { // basic block. auto *ToBB = getBasicBlockAtOffset(Branch.second); if (ToBB == nullptr) { - auto I = InstructionOffsets.find(Branch.second); - auto E = InstructionOffsets.end(); - while (ToBB == nullptr && I != E && MIA->isNoop(Instructions[I->second])) { + auto I = Instructions.find(Branch.second), E = Instructions.end(); + while (ToBB == nullptr && I != E && MIA->isNoop(I->second)) { ++I; if (I == E) break; @@ -1781,7 +1780,6 @@ bool BinaryFunction::buildCFG() { // // NB: don't clear Labels list as we may need them if we mark the function // as non-simple later in the process of discovering extra entry points. - clearList(InstructionOffsets); clearList(Instructions); clearList(OffsetToCFI); clearList(TakenBranches); @@ -2079,18 +2077,18 @@ float BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { // Eliminate recursive calls and returns from recursive calls from the list // of branches that have no match. They are not considered local branches. auto isRecursiveBranch = [&](std::pair &Branch) { - auto SrcInstrI = InstructionOffsets.find(Branch.first); - if (SrcInstrI == InstructionOffsets.end()) + auto SrcInstrI = Instructions.find(Branch.first); + if (SrcInstrI == Instructions.end()) return false; // Check if it is a recursive call. - const auto &SrcInstr = Instructions[SrcInstrI->second]; + const auto &SrcInstr = SrcInstrI->second; if ((BC.MIA->isCall(SrcInstr) || BC.MIA->isIndirectBranch(SrcInstr)) && Branch.second == 0) return true; - auto DstInstrI = InstructionOffsets.find(Branch.second); - if (DstInstrI == InstructionOffsets.end()) + auto DstInstrI = Instructions.find(Branch.second); + if (DstInstrI == Instructions.end()) return false; // Check if it is a return from a recursive call. 
@@ -2099,17 +2097,16 @@ float BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { if (!IsSrcReturn && BC.MIA->isPrefix(SrcInstr)) { auto SrcInstrSuccessorI = SrcInstrI; ++SrcInstrSuccessorI; - assert(SrcInstrSuccessorI != InstructionOffsets.end() && + assert(SrcInstrSuccessorI != Instructions.end() && "unexpected prefix instruction at the end of function"); - IsSrcReturn = BC.MIA->isReturn(Instructions[SrcInstrSuccessorI->second]); + IsSrcReturn = BC.MIA->isReturn(SrcInstrSuccessorI->second); } if (IsSrcReturn && Branch.second != 0) { // Make sure the destination follows the call instruction. auto DstInstrPredecessorI = DstInstrI; --DstInstrPredecessorI; - assert(DstInstrPredecessorI != InstructionOffsets.end() && - "invalid iterator"); - if (BC.MIA->isCall(Instructions[DstInstrPredecessorI->second])) + assert(DstInstrPredecessorI != Instructions.end() && "invalid iterator"); + if (BC.MIA->isCall(DstInstrPredecessorI->second)) return true; } return false; @@ -2124,10 +2121,10 @@ float BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { ExternProfileBranches.end(), std::back_inserter(OrphanBranches), [&](const std::pair &Branch) { - auto II = InstructionOffsets.find(Branch.first); - if (II == InstructionOffsets.end()) + auto II = Instructions.find(Branch.first); + if (II == Instructions.end()) return true; - const auto &Instr = Instructions[II->second]; + const auto &Instr = II->second; // Check for calls, tail calls, rets and indirect branches. 
// When matching profiling info, we did not reach the stage // when we identify tail calls, so they are still represented @@ -2139,8 +2136,7 @@ float BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { // Check for "rep ret" if (BC.MIA->isPrefix(Instr)) { ++II; - if (II != InstructionOffsets.end() && - BC.MIA->isReturn(Instructions[II->second])) + if (II != Instructions.end() && BC.MIA->isReturn(II->second)) return false; } return true; @@ -4320,12 +4316,12 @@ BinaryFunction::getFallthroughsInTrace(uint64_t From, uint64_t To) const { return NoneType(); // Get iterators and validate trace start/end - auto FromIter = InstructionOffsets.find(From); - if (FromIter == InstructionOffsets.end()) + auto FromIter = Instructions.find(From); + if (FromIter == Instructions.end()) return NoneType(); - auto ToIter = InstructionOffsets.find(To); - if (ToIter == InstructionOffsets.end()) + auto ToIter = Instructions.find(To); + if (ToIter == Instructions.end()) return NoneType(); // Trace needs to go forward @@ -4333,22 +4329,20 @@ BinaryFunction::getFallthroughsInTrace(uint64_t From, uint64_t To) const { return NoneType(); // Trace needs to finish in a branch - auto &ToInst = Instructions[ToIter->second]; - if (!BC.MIA->isBranch(ToInst) && !BC.MIA->isCall(ToInst) && - !BC.MIA->isReturn(ToInst)) + if (!BC.MIA->isBranch(ToIter->second) && !BC.MIA->isCall(ToIter->second) && + !BC.MIA->isReturn(ToIter->second)) return NoneType(); // Analyze intermediate instructions for (; FromIter != ToIter; ++FromIter) { // This operates under an assumption that we collect all branches in LBR // No unconditional branches in the middle of the trace - auto &FromInst = Instructions[FromIter->second]; - if (BC.MIA->isUnconditionalBranch(FromInst) || - BC.MIA->isReturn(FromInst) || - BC.MIA->isCall(FromInst)) + if (BC.MIA->isUnconditionalBranch(FromIter->second) || + BC.MIA->isReturn(FromIter->second) || + BC.MIA->isCall(FromIter->second)) return NoneType(); - if 
(!BC.MIA->isConditionalBranch(FromInst)) + if (!BC.MIA->isConditionalBranch(FromIter->second)) continue; const uint64_t Src = FromIter->first; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index c803f4c849e5..487f93cb7dba 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -456,10 +456,9 @@ class BinaryFunction { LabelsMapType Labels; /// Temporary holder of instructions before CFG is constructed. - /// Map offset in the function to MCInst index. - using InstrMapType = std::map; - InstrMapType InstructionOffsets; - std::vector Instructions; + /// Map offset in the function to MCInst. + using InstrMapType = std::map; + InstrMapType Instructions; /// List of DWARF CFI instructions. Original CFI from the binary must be /// sorted w.r.t. offset that it appears. We rely on this to replay CFIs @@ -749,10 +748,7 @@ class BinaryFunction { } void addInstruction(uint64_t Offset, MCInst &&Instruction) { - assert(InstructionOffsets.size() == Instructions.size() && - "There must be one instruction at every offset."); - Instructions.emplace_back(std::forward(Instruction)); - InstructionOffsets[Offset] = Instructions.size() - 1; + Instructions.emplace(Offset, std::forward(Instruction)); } /// Return instruction at a given offset in the function. Valid before @@ -760,9 +756,8 @@ class BinaryFunction { MCInst *getInstructionAtOffset(uint64_t Offset) { assert(CurrentState == State::Disassembled && "can only call function in Disassembled state"); - auto II = InstructionOffsets.find(Offset); - return (II == InstructionOffsets.end()) - ? nullptr : &Instructions[II->second]; + auto II = Instructions.find(Offset); + return (II == Instructions.end()) ? nullptr : &II->second; } /// Analyze and process indirect branch \p Instruction before it is @@ -1486,23 +1481,22 @@ class BinaryFunction { // harder for us to recover this information, since we can create empty BBs // with NOPs and then reorder it away. 
// We fix this by moving the CFI instruction just before any NOPs. - auto I = InstructionOffsets.lower_bound(Offset); + auto I = Instructions.lower_bound(Offset); if (Offset == getSize()) { - assert(I == InstructionOffsets.end() && "unexpected iterator value"); + assert(I == Instructions.end() && "unexpected iterator value"); // Sometimes compiler issues restore_state after all instructions // in the function (even after nop). --I; Offset = I->first; } assert(I->first == Offset && "CFI pointing to unknown instruction"); - if (I == InstructionOffsets.begin()) { + if (I == Instructions.begin()) { CIEFrameInstructions.emplace_back(std::forward(Inst)); return; } --I; - while (I != InstructionOffsets.begin() && - BC.MIA->isNoop(Instructions[I->second])) { + while (I != Instructions.begin() && BC.MIA->isNoop(I->second)) { Offset = I->first; --I; } diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index e5fa3f469b53..ac303bfd0207 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -219,7 +219,7 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, // Create a handler entry if necessary. MCSymbol *LPSymbol{nullptr}; if (LandingPad) { - if (InstructionOffsets.find(LandingPad) == InstructionOffsets.end()) { + if (Instructions.find(LandingPad) == Instructions.end()) { if (opts::Verbosity >= 1) { errs() << "BOLT-WARNING: landing pad " << Twine::utohexstr(LandingPad) << " not pointing to an instruction in function " @@ -237,11 +237,11 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, } // Mark all call instructions in the range. 
- auto II = InstructionOffsets.find(Start); - auto IE = InstructionOffsets.end(); + auto II = Instructions.find(Start); + auto IE = Instructions.end(); assert(II != IE && "exception range not pointing to an instruction"); do { - auto &Instruction = Instructions[II->second]; + auto &Instruction = II->second; if (BC.MIA->isCall(Instruction)) { assert(!BC.MIA->isInvoke(Instruction) && "overlapping exception ranges detected"); diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/Passes/IndirectCallPromotion.cpp index 25f1a678c30a..580795911b76 100644 --- a/bolt/Passes/IndirectCallPromotion.cpp +++ b/bolt/Passes/IndirectCallPromotion.cpp @@ -325,7 +325,8 @@ IndirectCallPromotion::maybeGetHotJumpTableTargets( const MCExpr *DispExpr; MutableArrayRef Insts(&BB->front(), &CallInst); const auto Type = BC.MIA->analyzeIndirectBranch(CallInst, - Insts, + Insts.begin(), + Insts.end(), BC.AsmInfo->getPointerSize(), MemLocInstr, BaseReg, @@ -555,7 +556,8 @@ IndirectCallPromotion::maybeGetVtableAddrs( return MethodInfoType(); MutableArrayRef Insts(&BB->front(), &Inst + 1); - if (!BC.MIA->analyzeVirtualMethodCall(Insts, + if (!BC.MIA->analyzeVirtualMethodCall(Insts.begin(), + Insts.end(), MethodFetchInsns, VtableReg, MethodReg, diff --git a/bolt/Passes/Inliner.cpp b/bolt/Passes/Inliner.cpp index 65347403c1da..dc7e62e34b6c 100644 --- a/bolt/Passes/Inliner.cpp +++ b/bolt/Passes/Inliner.cpp @@ -246,7 +246,9 @@ InlineSmallFunctions::inlineCall( const MCSymbol *OldFTLabel = nullptr; MCInst *CondBranch = nullptr; MCInst *UncondBranch = nullptr; - const bool Result = BC.MIA->analyzeBranch(Instruction, OldTargetLabel, + const bool Result = BC.MIA->analyzeBranch(&Instruction, + &Instruction + 1, + OldTargetLabel, OldFTLabel, CondBranch, UncondBranch); (void)Result; From d836428b6123ae7559a6213f96c2dd8fb38b5fd9 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 27 Nov 2017 12:58:21 -0800 Subject: [PATCH 346/904] [PERF2BOLT] Fix aggregator wrt traces with REP RET Summary: Previously 
the perf2bolt aggregator was rejecting traces finishing with REP RET (return instruction with REP prefix) as a result of the migration from objdump output to LLVM disassembler, which decodes REP as a separate instruction. Add code to detect REP RET and treat it as a single return instruction. (cherry picked from commit 55b087f2e7ff70b1053f6953cee954b4884c1991) --- bolt/BinaryFunction.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 7d89b3ee4a19..be26772a009f 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -179,7 +179,7 @@ template bool emptyRange(const R &Range) { return Range.begin() == Range.end(); } - + /// Gets debug line information for the instruction located at the given /// address in the original binary. The SMLoc's pointer is used /// to point to this information, which is represented by a @@ -254,7 +254,7 @@ bool BinaryFunction::hasNameRegex(const std::string &NameRegex) const { return true; return false; } - + BinaryBasicBlock * BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) { if (Offset > Size) @@ -610,7 +610,7 @@ void BinaryFunction::printRelocations(raw_ostream &OS, OS << Sep << "(pcrel)"; } } - + IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, unsigned Size, uint64_t Offset) { @@ -4330,8 +4330,16 @@ BinaryFunction::getFallthroughsInTrace(uint64_t From, uint64_t To) const { // Trace needs to finish in a branch if (!BC.MIA->isBranch(ToIter->second) && !BC.MIA->isCall(ToIter->second) && - !BC.MIA->isReturn(ToIter->second)) - return NoneType(); + !BC.MIA->isReturn(ToIter->second)) { + // Check for "rep ret" + if (!BC.MIA->isPrefix(ToIter->second)) { + return NoneType(); + } else { + ++ToIter; + if (!BC.MIA->isReturn(ToIter->second)) + return NoneType(); + } + } // Analyze intermediate instructions for (; FromIter != ToIter; ++FromIter) { From 6caf595743c93c561fd5b85086cc8b50f19573b8 Mon Sep 
17 00:00:00 2001 From: Bill Nell Date: Mon, 27 Nov 2017 18:00:24 -0800 Subject: [PATCH 347/904] [BOLT] Add timers for non-optimization related phases. Summary: Add timers for non-optimization related phases. There are two new options, -time-build for disassembling functions and building CFGs, and -time-rewrite for phases in executeRewritePass(). (cherry picked from commit c1bfb05ae094a848cbcbae40b9097f894da0d32c) --- bolt/BinaryFunction.cpp | 12 ++++++++++++ bolt/BinaryFunction.h | 2 ++ bolt/RewriteInstance.cpp | 22 ++++++++++++++++++++++ bolt/RewriteInstance.h | 2 ++ 4 files changed, 38 insertions(+) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index be26772a009f..10d2dbbbffb7 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GraphWriter.h" +#include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Regex.h" #include @@ -146,6 +147,13 @@ PrintOnlyRegex("print-only-regex", cl::Hidden, cl::cat(BoltCategory)); +cl::opt +TimeBuild("time-build", + cl::desc("print time spent constructing binary functions"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); + bool shouldPrint(const BinaryFunction &Function) { if (PrintOnly.empty() && PrintOnlyRegex.empty()) return true; @@ -172,6 +180,7 @@ namespace bolt { constexpr const char *DynoStats::Desc[]; constexpr unsigned BinaryFunction::MinAlign; +const char BinaryFunction::TimerGroupName[] = "Build binary functions"; namespace { @@ -880,6 +889,8 @@ MCSymbol *BinaryFunction::getOrCreateLocalLabel(uint64_t Address, } void BinaryFunction::disassemble(ArrayRef FunctionData) { + NamedRegionTimer T("disassemble", TimerGroupName, opts::TimeBuild); + assert(FunctionData.size() == getSize() && "function size does not match raw data size"); @@ -1431,6 +1442,7 @@ void BinaryFunction::recomputeLandingPads() { } bool BinaryFunction::buildCFG() { + 
NamedRegionTimer T("build cfg", TimerGroupName, opts::TimeBuild); auto &MIA = BC.MIA; if (!isSimple()) { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 487f93cb7dba..30a5112b8b1a 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -208,6 +208,8 @@ class BinaryFunction { /// We have to use at least 2-byte alignment for functions because of C++ ABI. static constexpr unsigned MinAlign = 2; + static const char TimerGroupName[]; + using BasicBlockOrderType = std::vector; private: diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index fd6c148702da..815c26d80f02 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -53,6 +53,7 @@ #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/Timer.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -316,6 +317,13 @@ IgnoreBuildID("ignore-build-id", cl::init(false), cl::cat(AggregatorCategory)); +static cl::opt +TimeRewrite("time-rewrite", + cl::desc("print time spent in rewriting passes"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); + // Check against lists of functions from options if we should // optimize the function with a given name. 
bool shouldProcess(const BinaryFunction &Function) { @@ -401,6 +409,8 @@ const std::string RewriteInstance::OrgSecPrefix = ".bolt.org"; const std::string RewriteInstance::BOLTSecPrefix = ".bolt"; +const char RewriteInstance::TimerGroupName[] = "Rewrite passes"; + namespace llvm { namespace bolt { extern const char *BoltRevision; @@ -652,6 +662,7 @@ void RewriteInstance::reset() { } void RewriteInstance::aggregateData() { + NamedRegionTimer T("aggregate data", TimerGroupName, opts::TimeRewrite); DA.aggregate(*BC.get(), BinaryFunctions); if (!opts::AggregateOnly) @@ -663,6 +674,8 @@ void RewriteInstance::aggregateData() { } void RewriteInstance::discoverStorage() { + NamedRegionTimer T("discover storage", TimerGroupName, opts::TimeRewrite); + // Stubs are harmful because RuntimeDyld may try to increase the size of // sections accounting for stubs when we need those sections to match the // same size seen in the input binary, in case this section is a copy @@ -947,6 +960,8 @@ void RewriteInstance::run() { } void RewriteInstance::discoverFileObjects() { + NamedRegionTimer T("discover file objects", TimerGroupName, opts::TimeRewrite); + FileSymRefs.clear(); BinaryFunctions.clear(); BC->GlobalAddresses.clear(); @@ -1532,6 +1547,8 @@ BinaryFunction *RewriteInstance::createBinaryFunction( } void RewriteInstance::readSpecialSections() { + NamedRegionTimer T("read special sections", TimerGroupName, opts::TimeRewrite); + bool HasTextRelocations = false; // Process special sections. 
@@ -1893,6 +1910,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { } void RewriteInstance::readDebugInfo() { + NamedRegionTimer T("read debug info", TimerGroupName, opts::TimeRewrite); if (!opts::UpdateDebugSections) return; @@ -1900,6 +1918,7 @@ void RewriteInstance::readDebugInfo() { } void RewriteInstance::readProfileData() { + NamedRegionTimer T("read profile data", TimerGroupName, opts::TimeRewrite); if (BC->DR.getAllFuncsData().empty()) return; @@ -1918,6 +1937,7 @@ void RewriteInstance::readProfileData() { } void RewriteInstance::disassembleFunctions() { + NamedRegionTimer T("disassemble functions", TimerGroupName, opts::TimeRewrite); // Disassemble every function and build it's control flow graph. TotalScore = 0; BC->SumExecutionCount = 0; @@ -2135,6 +2155,7 @@ void RewriteInstance::disassembleFunctions() { } void RewriteInstance::runOptimizationPasses() { + NamedRegionTimer T("run optimization passes", TimerGroupName, opts::TimeRewrite); BinaryFunctionPassManager::runAllPasses(*BC, BinaryFunctions, LargeFunctions); } @@ -2270,6 +2291,7 @@ std::vector singletonSet(T t) { } // anonymous namespace void RewriteInstance::emitFunctions() { + NamedRegionTimer T("emit functions", TimerGroupName, opts::TimeRewrite); std::error_code EC; // This is an object file, which we keep for debugging purposes. diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index a0b8a7bac360..8bc3ad3da294 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -414,6 +414,8 @@ class RewriteInstance { ".gdb_index", }; + static const char TimerGroupName[]; + /// Huge page size used for alignment. static constexpr unsigned PageAlign = 0x200000; From 63b76f02927065d17215173a143aee5c496b1d66 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 19 Nov 2017 11:17:57 -0800 Subject: [PATCH 348/904] [BOLT] Fix icp-top-callsites option, remove icp-always-on. 
Summary: The icp-top-callsites option was using basic block counts to pick the top callsites while the ICP main loop was using branch info from the targets of each call. These numbers do not exactly match up so there was a discrepancy in computing the top calls. I've switched top callsites over to use the same stats as the main loop. The icp-always-on option was redundant with -icp-top-callsites=100, so I removed it. (cherry picked from commit 5ae3450fd99f0f9fb82bdfaf38124a19cfbd8cf6) --- bolt/Passes/IndirectCallPromotion.cpp | 47 ++++++++++++--------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/Passes/IndirectCallPromotion.cpp index 580795911b76..d17bcbd5c045 100644 --- a/bolt/Passes/IndirectCallPromotion.cpp +++ b/bolt/Passes/IndirectCallPromotion.cpp @@ -103,15 +103,6 @@ EliminateLoads( cl::ZeroOrMore, cl::cat(BoltOptCategory)); -static cl::opt -ICPAlwaysOn( - "icp-always-on", - cl::desc("enable ICP for all eligible callsites"), - cl::init(false), - cl::Hidden, - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - static cl::opt ICPTopCallsites( "icp-top-callsites", @@ -917,9 +908,6 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB, } const auto TrialN = std::min(TopN, Targets.size()); - if (opts::ICPAlwaysOn) - return TrialN; - if (opts::ICPTopCallsites > 0) { auto &BC = BB->getFunction()->getBinaryContext(); return BC.MIA->hasAnnotation(Inst, "DoICP") ? TrialN : 0; @@ -1146,7 +1134,7 @@ void IndirectCallPromotion::runOnFunctions( // calls and then optimize the hottest callsites that contribute to that // total. 
if (opts::ICPTopCallsites > 0) { - using IndirectCallsite = std::pair; + using IndirectCallsite = std::pair; std::vector IndirectCalls; size_t TotalIndirectCalls = 0; @@ -1166,23 +1154,28 @@ void IndirectCallPromotion::runOnFunctions( continue; for (auto &Inst : BB) { - if ((BC.MIA->isIndirectCall(Inst) && OptimizeCalls) || - (Function.getJumpTable(Inst) && OptimizeJumpTables)) { - IndirectCalls.push_back(std::make_pair(&BB, &Inst)); - TotalIndirectCalls += BB.getKnownExecutionCount(); + const bool IsJumpTable = Function.getJumpTable(Inst); + const bool HasBranchData = BC.MIA->hasAnnotation(Inst, "Offset"); + const bool IsDirectCall = (BC.MIA->isCall(Inst) && + BC.MIA->getTargetSymbol(Inst, 0)); + + if (!IsDirectCall && + ((HasBranchData && !IsJumpTable && OptimizeCalls) || + (IsJumpTable && OptimizeJumpTables))) { + uint64_t NumCalls = 0; + for (const auto &BInfo : getCallTargets(Function, Inst)) { + NumCalls += BInfo.Branches; + } + + IndirectCalls.push_back(std::make_pair(NumCalls, &Inst)); + TotalIndirectCalls += NumCalls; } } } } // Sort callsites by execution count. - std::sort(IndirectCalls.begin(), - IndirectCalls.end(), - [](const IndirectCallsite &A, const IndirectCallsite &B) { - const auto CountA = A.first->getKnownExecutionCount(); - const auto CountB = B.first->getKnownExecutionCount(); - return CountA > CountB; - }); + std::sort(IndirectCalls.rbegin(), IndirectCalls.rend()); // Find callsites that contribute to the top "opts::ICPTopCallsites"% // number of calls. @@ -1192,7 +1185,7 @@ void IndirectCallPromotion::runOnFunctions( for (auto &IC : IndirectCalls) { if (MaxCalls <= 0) break; - MaxCalls -= IC.first->getKnownExecutionCount(); + MaxCalls -= IC.first; ++Num; } outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls @@ -1201,8 +1194,8 @@ void IndirectCallPromotion::runOnFunctions( // Mark sites to optimize with "DoICP" annotation. 
for (size_t I = 0; I < Num; ++I) { - auto &Inst = *IndirectCalls[I].second; - BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "DoICP", true); + auto *Inst = IndirectCalls[I].second; + BC.MIA->addAnnotation(BC.Ctx.get(), *Inst, "DoICP", true); } } From 61dfad78829221113b1d1dbae4e51331c7a5f3f8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 29 Nov 2017 17:40:14 -0800 Subject: [PATCH 349/904] [BOLT] Fix bug in shortening peephole. Summary: The arithmetic shortening code on x86 was broken. It would sometimes shorten instructions with immediate operands that wouldn't fit into 8 bits. (cherry picked from commit 0235dec24beaf35f14dc5305dd4397bdf622d7f4) --- bolt/Passes/BinaryPasses.cpp | 32 +++++++++++++++++++++++++++----- bolt/Passes/BinaryPasses.h | 3 ++- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 8dbaba51922d..43b514d3e64a 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -974,13 +974,33 @@ void SimplifyConditionalTailCalls::runOnFunctions( << ".\n"; } -void Peepholes::shortenInstructions(BinaryContext &BC, - BinaryFunction &Function) { +uint64_t Peepholes::shortenInstructions(BinaryContext &BC, + BinaryFunction &Function) { + std::string DebugStr; + (void)DebugStr; + uint64_t Count = 0; for (auto &BB : Function) { for (auto &Inst : BB) { - BC.MIA->shortenInstruction(Inst); + DEBUG( + if (opts::Verbosity > 1) { + DebugStr.clear(); + raw_string_ostream OS(DebugStr); + BC.printInstruction(OS, Inst, 0, &Function); + OS.str(); + }); + if (BC.MIA->shortenInstruction(Inst)) { + DEBUG( + if (opts::Verbosity > 1) { + dbgs() << "BOLT-INFO: peephole, shortening:\n" + << "BOLT-INFO: " << DebugStr + << "BOLT-INFO: to:"; + BC.printInstruction(dbgs(), Inst, 0, &Function); + }); + ++Count; + } } } + return Count; } void Peepholes::addTailcallTraps(BinaryContext &BC, @@ -1041,7 +1061,7 @@ void Peepholes::runOnFunctions(BinaryContext &BC, auto &Function = It.second; if 
(shouldOptimize(Function)) { if (Opts & opts::PEEP_SHORTEN) - shortenInstructions(BC, Function); + NumShortened += shortenInstructions(BC, Function); if (Opts & opts::PEEP_DOUBLE_JUMPS) NumDoubleJumps += fixDoubleJumps(BC, Function, false); if (Opts & opts::PEEP_TAILCALL_TRAPS) @@ -1050,7 +1070,9 @@ void Peepholes::runOnFunctions(BinaryContext &BC, removeUselessCondBranches(BC, Function); } } - outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps + outs() << "BOLT-INFO: Peephole: " << NumShortened + << " instructions shortened.\n" + << "BOLT-INFO: Peephole: " << NumDoubleJumps << " double jumps patched.\n" << "BOLT-INFO: Peephole: " << TailCallTraps << " tail call traps inserted.\n" diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index 12eee5cb0ab6..3316afd2c808 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -301,13 +301,14 @@ class SimplifyConditionalTailCalls : public BinaryFunctionPass { /// Perform simple peephole optimizations. class Peepholes : public BinaryFunctionPass { + uint64_t NumShortened{0}; uint64_t NumDoubleJumps{0}; uint64_t TailCallTraps{0}; uint64_t NumUselessCondBranches{0}; /// Attempt to use the minimum operand width for arithmetic, branch and /// move instructions. - void shortenInstructions(BinaryContext &BC, BinaryFunction &Function); + uint64_t shortenInstructions(BinaryContext &BC, BinaryFunction &Function); /// Add trap instructions immediately after indirect tail calls to prevent /// the processor from decoding instructions immediate following the From 4b94df02326000d3e53f1aaa737d26c984421d90 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 30 Nov 2017 13:30:49 -0800 Subject: [PATCH 350/904] [BOLT] Use getNumPrimeOperands in shortenInstruction. 
Summary: Apply maks' review comments (cherry picked from commit 7a7a534660dd3c244b4519e7536870ae70cbb411) --- bolt/Passes/BinaryPasses.cpp | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 43b514d3e64a..53a60e149386 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -976,26 +976,21 @@ void SimplifyConditionalTailCalls::runOnFunctions( uint64_t Peepholes::shortenInstructions(BinaryContext &BC, BinaryFunction &Function) { - std::string DebugStr; - (void)DebugStr; + MCInst DebugInst; uint64_t Count = 0; for (auto &BB : Function) { for (auto &Inst : BB) { - DEBUG( - if (opts::Verbosity > 1) { - DebugStr.clear(); - raw_string_ostream OS(DebugStr); - BC.printInstruction(OS, Inst, 0, &Function); - OS.str(); - }); + if (opts::Verbosity > 1) { + DebugInst = Inst; + } if (BC.MIA->shortenInstruction(Inst)) { - DEBUG( - if (opts::Verbosity > 1) { - dbgs() << "BOLT-INFO: peephole, shortening:\n" - << "BOLT-INFO: " << DebugStr - << "BOLT-INFO: to:"; - BC.printInstruction(dbgs(), Inst, 0, &Function); - }); + if (opts::Verbosity > 1) { + outs() << "BOLT-INFO: peephole, shortening:\n" + << "BOLT-INFO: "; + BC.printInstruction(outs(), DebugInst, 0, &Function); + outs() << "BOLT-INFO: to:"; + BC.printInstruction(outs(), Inst, 0, &Function); + } ++Count; } } From 1c0564c12d9a1209d929ddf4c3c58f23f92d2134 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Thu, 2 Nov 2017 00:30:11 -0700 Subject: [PATCH 351/904] Introduce pass to reduce jump tables footprint Summary: Add a pass to identify indirect jumps to jump tables and reduce their entries size from 8 to 4 bytes. For PIC jump tables, it will convert the PIC code to non-PIC (since BOLT only processes static code, it makes no sense to use expensive PIC-style jumps in static code). Add corresponding improvements to register scavenging pass and add a MCInst matcher machinery. 
(cherry picked from commit ddf0b32d251d34f8e1023a62a246856b41bc37c2) --- bolt/BinaryFunction.cpp | 2 +- bolt/BinaryFunction.h | 17 +- bolt/BinaryPassManager.cpp | 18 ++ bolt/Passes/CMakeLists.txt | 1 + bolt/Passes/DataflowAnalysis.cpp | 13 ++ bolt/Passes/JTFootprintReduction.cpp | 276 +++++++++++++++++++++++++++ bolt/Passes/JTFootprintReduction.h | 85 +++++++++ bolt/Passes/LivenessAnalysis.h | 51 ++++- 8 files changed, 453 insertions(+), 10 deletions(-) create mode 100644 bolt/Passes/JTFootprintReduction.cpp create mode 100644 bolt/Passes/JTFootprintReduction.h diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 10d2dbbbffb7..8afba679f450 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -3826,7 +3826,7 @@ uint64_t BinaryFunction::JumpTable::emit(MCStreamer *Streamer, LastLabel = LI->second; } if (Type == JTT_NORMAL) { - Streamer->EmitSymbolValue(Entry, EntrySize); + Streamer->EmitSymbolValue(Entry, OutputEntrySize); } else { // JTT_PIC auto JT = MCSymbolRefExpr::create(LastLabel, Streamer->getContext()); auto E = MCSymbolRefExpr::create(Entry, Streamer->getContext()); diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 30a5112b8b1a..e0157143c732 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -538,6 +538,9 @@ class BinaryFunction { /// Size of the entry used for storage. std::size_t EntrySize; + /// Size of the entry size we will write (we may use a more compact layout) + std::size_t OutputEntrySize; + /// The type of this jump table. JumpTableType Type; @@ -567,14 +570,11 @@ class BinaryFunction { std::pair getEntriesForAddress(const uint64_t Addr) const; /// Constructor. 
- JumpTable(uint64_t Address, - std::size_t EntrySize, - JumpTableType Type, + JumpTable(uint64_t Address, std::size_t EntrySize, JumpTableType Type, decltype(OffsetEntries) &&OffsetEntries, decltype(Labels) &&Labels) - : Address(Address), EntrySize(EntrySize), Type(Type), - OffsetEntries(OffsetEntries), Labels(Labels) - {} + : Address(Address), EntrySize(EntrySize), OutputEntrySize(EntrySize), + Type(Type), OffsetEntries(OffsetEntries), Labels(Labels) {} /// Dynamic number of times each entry in the table was referenced. /// Identical entries will have a shared count (identical for every @@ -1275,6 +1275,11 @@ class BinaryFunction { return getJumpTableContainingAddress(Address); } + JumpTable *getJumpTable(const MCInst &Inst) { + const auto Address = BC.MIA->getJumpTable(Inst); + return getJumpTableContainingAddress(Address); + } + const MCSymbol *getPersonalityFunction() const { return PersonalityFunction; } diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 2f8cf3e345a9..5bea65a527ea 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -16,6 +16,7 @@ #include "Passes/IndirectCallPromotion.h" #include "Passes/Inliner.h" #include "Passes/LongJmp.h" +#include "Passes/JTFootprintReduction.h" #include "Passes/PLTCall.h" #include "Passes/ReorderFunctions.h" #include "Passes/StokeInfo.h" @@ -62,6 +63,19 @@ InlineSmallFunctions("inline-small-functions", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +static cl::opt +JTFootprintReductionFlag("jt-footprint-reduction", + cl::desc("make jump tables size smaller at the cost of using more " + "instructions at jump sites"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +PrintJTFootprintReduction("print-after-jt-footprint-reduction", + cl::desc("print function after jt-footprint-reduction pass"), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + static cl::opt NeverPrint("never-print", cl::desc("never print"), @@ -328,6 +342,10 @@ void 
BinaryFunctionPassManager::runAllPasses( Manager.registerPass(llvm::make_unique(PrintPeepholes)); + Manager.registerPass( + llvm::make_unique(PrintJTFootprintReduction), + opts::JTFootprintReductionFlag); + Manager.registerPass(llvm::make_unique(PrintInline), opts::InlineSmallFunctions); diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt index 0b733d7ad846..ac446bd725b7 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/Passes/CMakeLists.txt @@ -13,6 +13,7 @@ add_llvm_library(LLVMBOLTPasses HFSortPlus.cpp IndirectCallPromotion.cpp Inliner.cpp + JTFootprintReduction.cpp LivenessAnalysis.cpp LongJmp.cpp MCF.cpp diff --git a/bolt/Passes/DataflowAnalysis.cpp b/bolt/Passes/DataflowAnalysis.cpp index 9a96059ffb7e..5b093d9fdbcb 100644 --- a/bolt/Passes/DataflowAnalysis.cpp +++ b/bolt/Passes/DataflowAnalysis.cpp @@ -37,6 +37,19 @@ void doForAllSuccs(const BinaryBasicBlock &BB, } void RegStatePrinter::print(raw_ostream &OS, const BitVector &State) const { + if (State.all()) { + OS << "(all)"; + return; + } + if (State.count() > (State.size() >> 1)) { + OS << "all, except: "; + auto BV = State; + BV.flip(); + for (auto I = BV.find_first(); I != -1; I = BV.find_next(I)) { + OS << BC.MRI->getName(I) << " "; + } + return; + } for (auto I = State.find_first(); I != -1; I = State.find_next(I)) { OS << BC.MRI->getName(I) << " "; } diff --git a/bolt/Passes/JTFootprintReduction.cpp b/bolt/Passes/JTFootprintReduction.cpp new file mode 100644 index 000000000000..6ac856f74e9f --- /dev/null +++ b/bolt/Passes/JTFootprintReduction.cpp @@ -0,0 +1,276 @@ +//===--- JTFootprintReduction.cpp -----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "JTFootprintReduction.h" +#include "llvm/Support/Options.h" + +#define DEBUG_TYPE "JT" + +using namespace llvm; +using namespace bolt; + +namespace opts { + +extern cl::OptionCategory BoltOptCategory; + +extern cl::opt Verbosity; +extern cl::opt Relocs; +extern bool shouldProcess(const bolt::BinaryFunction &Function); + +extern cl::opt JumpTables; +} // namespace opts + +namespace llvm { +namespace bolt { + +void JTFootprintReduction::checkOpportunities(BinaryContext &BC, + BinaryFunction &Function, + DataflowInfoManager &Info) { + std::map AllJTs; + + for (auto &BB : Function) { + for (auto &Inst : BB) { + auto *JumpTable = Function.getJumpTable(Inst); + if (!JumpTable) + continue; + + AllJTs[JumpTable] += BB.getKnownExecutionCount(); + ++IndJmps; + + if (BlacklistedJTs.count(JumpTable)) + continue; + + uint64_t Scale; + // Try a standard indirect jump matcher + auto IndJmpMatcher = BC.MIA->matchIndJmp( + BC.MIA->matchAnyOperand(), BC.MIA->matchImm(Scale), + BC.MIA->matchReg(), BC.MIA->matchAnyOperand()); + if (IndJmpMatcher->match(*BC.MRI, *BC.MIA, + MutableArrayRef(&*BB.begin(), &Inst + 1), + -1) && + Scale == 8) { + if (Info.getLivenessAnalysis().scavengeRegAfter(&Inst)) + continue; + BlacklistedJTs.insert(JumpTable); + ++IndJmpsDenied; + ++NumJTsNoReg; + continue; + } + + // Try a PIC matcher. 
The pattern we are looking for is a PIC JT ind jmp: + // addq %rdx, %rsi + // addq %rdx, %rdi + // leaq DATAat0x402450(%rip), %r11 + // movslq (%r11,%rdx,4), %rcx + // addq %r11, %rcx + // jmpq *%rcx # JUMPTABLE @0x402450 + MCPhysReg BaseReg1; + MCPhysReg BaseReg2; + uint64_t Offset; + auto PICIndJmpMatcher = BC.MIA->matchIndJmp(BC.MIA->matchAdd( + BC.MIA->matchReg(BaseReg1), + BC.MIA->matchLoad(BC.MIA->matchReg(BaseReg2), BC.MIA->matchImm(Scale), + BC.MIA->matchReg(), BC.MIA->matchImm(Offset)))); + auto PICBaseAddrMatcher = BC.MIA->matchIndJmp( + BC.MIA->matchAdd(BC.MIA->matchLoadAddr(BC.MIA->matchSymbol()), + BC.MIA->matchAnyOperand())); + if (!PICIndJmpMatcher->match( + *BC.MRI, *BC.MIA, + MutableArrayRef(&*BB.begin(), &Inst + 1), -1) || + Scale != 4 || BaseReg1 != BaseReg2 || Offset != 0 || + !PICBaseAddrMatcher->match( + *BC.MRI, *BC.MIA, + MutableArrayRef(&*BB.begin(), &Inst + 1), -1)) { + BlacklistedJTs.insert(JumpTable); + ++IndJmpsDenied; + ++NumJTsBadMatch; + continue; + } + } + } + + // Statistics only + for (const auto &JTFreq : AllJTs) { + auto *JT = JTFreq.first; + uint64_t CurScore = JTFreq.second; + TotalJTScore += CurScore; + if (!BlacklistedJTs.count(JT)) { + OptimizedScore += CurScore; + if (JT->EntrySize == 8) + BytesSaved += JT->getSize() >> 1; + } + } + TotalJTs += AllJTs.size(); + TotalJTsDenied += BlacklistedJTs.size(); +} + +bool JTFootprintReduction::tryOptimizeNonPIC( + BinaryContext &BC, BinaryBasicBlock &BB, MCInst &Inst, uint64_t JTAddr, + BinaryFunction::JumpTable *JumpTable, DataflowInfoManager &Info) { + + MCOperand Base; + uint64_t Scale; + MCPhysReg Index; + MCOperand Offset; + auto IndJmpMatcher = BC.MIA->matchIndJmp( + BC.MIA->matchAnyOperand(Base), BC.MIA->matchImm(Scale), + BC.MIA->matchReg(Index), BC.MIA->matchAnyOperand(Offset)); + if (!IndJmpMatcher->match(*BC.MRI, *BC.MIA, + MutableArrayRef(&*BB.begin(), &Inst + 1), + -1)) { + return false; + } + + assert(Scale == 8 && "Wrong scale"); + + Scale = 4; + 
IndJmpMatcher->annotate(*BC.MIA, *BC.Ctx.get(), "DeleteMe"); + + auto &LA = Info.getLivenessAnalysis(); + MCPhysReg Reg = LA.scavengeRegAfter(&Inst); + assert(Reg != 0 && "Register scavenger failed!"); + auto RegOp = MCOperand::createReg(Reg); + SmallVector NewFrag; + + BC.MIA->createIJmp32Frag(NewFrag, Base, MCOperand::createImm(Scale), + MCOperand::createReg(Index), Offset, RegOp); + BC.MIA->setJumpTable(BC.Ctx.get(), NewFrag.back(), JTAddr, Index); + + JumpTable->OutputEntrySize = 4; + + BB.replaceInstruction(&Inst, NewFrag.begin(), NewFrag.end()); + return true; +} + +bool JTFootprintReduction::tryOptimizePIC( + BinaryContext &BC, BinaryBasicBlock &BB, MCInst &Inst, uint64_t JTAddr, + BinaryFunction::JumpTable *JumpTable, DataflowInfoManager &Info) { + MCPhysReg BaseReg; + uint64_t Scale; + MCPhysReg Index; + MCOperand Offset; + MCOperand JumpTableRef; + auto PICIndJmpMatcher = BC.MIA->matchIndJmp(BC.MIA->matchAdd( + BC.MIA->matchLoadAddr(BC.MIA->matchAnyOperand(JumpTableRef)), + BC.MIA->matchLoad(BC.MIA->matchReg(BaseReg), BC.MIA->matchImm(Scale), + BC.MIA->matchReg(Index), BC.MIA->matchAnyOperand()))); + if (!PICIndJmpMatcher->match(*BC.MRI, *BC.MIA, + MutableArrayRef(&*BB.begin(), &Inst + 1), + -1)) { + return false; + } + + assert(Scale == 4 && "Wrong scale"); + + PICIndJmpMatcher->annotate(*BC.MIA, *BC.Ctx.get(), "DeleteMe"); + + auto RegOp = MCOperand::createReg(BaseReg); + SmallVector NewFrag; + + BC.MIA->createIJmp32Frag(NewFrag, MCOperand::createReg(0), + MCOperand::createImm(Scale), + MCOperand::createReg(Index), JumpTableRef, RegOp); + BC.MIA->setJumpTable(BC.Ctx.get(), NewFrag.back(), JTAddr, Index); + + JumpTable->OutputEntrySize = 4; + // DePICify + JumpTable->Type = BinaryFunction::JumpTable::JTT_NORMAL; + + BB.replaceInstruction(&Inst, NewFrag.begin(), NewFrag.end()); + return true; +} + +void JTFootprintReduction::optimizeFunction(BinaryContext &BC, + BinaryFunction &Function, + DataflowInfoManager &Info) { + for (auto &BB : Function) { + if 
(!BB.getNumNonPseudos()) + continue; + + MCInst &IndJmp = *BB.getLastNonPseudo(); + uint64_t JTAddr = BC.MIA->getJumpTable(IndJmp); + + if (!JTAddr) + continue; + + auto *JumpTable = Function.getJumpTable(IndJmp); + if (BlacklistedJTs.count(JumpTable)) + continue; + + if (tryOptimizeNonPIC(BC, BB, IndJmp, JTAddr, JumpTable, Info) + || tryOptimizePIC(BC, BB, IndJmp, JTAddr, JumpTable, Info)) { + Modified.insert(&Function); + continue; + } + + llvm_unreachable("Should either optimize PIC or NonPIC successfuly"); + } + + if (!Modified.count(&Function)) + return; + + for (auto &BB : Function) { + for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { + if (BC.MIA->hasAnnotation(*I, "DeleteMe")) + BB.eraseInstruction(&*I); + } + } +} + +void JTFootprintReduction::runOnFunctions( + BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions +) { + if (opts::JumpTables == JTS_BASIC && opts::Relocs) + return; + + BinaryFunctionCallGraph CG(buildCallGraph(BC, BFs)); + RegAnalysis RA(BC, BFs, CG); + for (auto &BFIt : BFs) { + auto &Function = BFIt.second; + + if (!Function.isSimple() || !opts::shouldProcess(Function)) + continue; + + if (Function.getKnownExecutionCount() == 0) + continue; + + DataflowInfoManager Info(BC, Function, &RA, nullptr); + BlacklistedJTs.clear(); + checkOpportunities(BC, Function, Info); + optimizeFunction(BC, Function, Info); + } + + if (TotalJTs == TotalJTsDenied) { + outs() << "BOLT-INFO: JT Footprint reduction: no changes were made.\n"; + return; + } + + outs() << "BOLT-INFO: JT Footprint reduction stats (simple funcs only):\n"; + if (OptimizedScore) { + outs() << format("\t %.2lf%%", (OptimizedScore * 100.0 / TotalJTScore)) + << " of dynamic JT entries were reduced.\n"; + } + outs() << "\t " << TotalJTs - TotalJTsDenied << " of " << TotalJTs + << " jump tables affected.\n"; + outs() << "\t " << IndJmps - IndJmpsDenied << " of " << IndJmps + << " indirect jumps to JTs affected.\n"; + outs() << "\t " << NumJTsBadMatch + << " JTs discarded due 
to unsupported jump pattern.\n"; + outs() << "\t " << NumJTsNoReg + << " JTs discarded due to register unavailability.\n"; + outs() << "\t " << BytesSaved + << " bytes saved.\n"; +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/Passes/JTFootprintReduction.h b/bolt/Passes/JTFootprintReduction.h new file mode 100644 index 000000000000..b2717fca3061 --- /dev/null +++ b/bolt/Passes/JTFootprintReduction.h @@ -0,0 +1,85 @@ +//===--- JTFootprintReduction.h -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Jump table footprint reduction pass +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_JT_FOOTPRINT_REDUCTION_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_JT_FOOTPRINT_REDUCTION_H + +#include "BinaryPasses.h" +#include "DataflowInfoManager.h" +#include "DataReader.h" + +namespace llvm { +namespace bolt { + +/// This pass identify indirect jumps to jump tables and reduce their entries +/// size from 8 to 4 bytes. For PIC jump tables, it will remove the PIC code +/// (since BOLT only process static code and it makes no sense to use expensive +/// PIC-style jumps in static code). +class JTFootprintReduction : public BinaryFunctionPass { + uint64_t TotalJTScore{0}; + uint64_t TotalJTs{0}; + uint64_t TotalJTsDenied{0}; + uint64_t OptimizedScore{0}; + uint64_t IndJmps{0}; + uint64_t IndJmpsDenied{0}; + uint64_t NumJTsBadMatch{0}; + uint64_t NumJTsNoReg{0}; + uint64_t BytesSaved{0}; + DenseSet BlacklistedJTs; + DenseSet Modified; + + /// Check if \p Function presents jump tables where all jump locations can + /// be safely changed to use a different code sequence. 
If this is true, we + /// will be able to emit the whole table with a smaller entry size. + void checkOpportunities(BinaryContext &BC, BinaryFunction &Function, + DataflowInfoManager &Info); + + /// The Non-PIC jump table optimization consists of reducing the jump table + /// entry size from 8 to 4 bytes. For that, we need to change the jump code + /// sequence from a single jmp * instruction to a pair of load32zext-jmp + /// instructions that depend on the availability of an extra register. + /// This saves dcache/dTLB at the expense of icache. + bool tryOptimizeNonPIC(BinaryContext &BC, BinaryBasicBlock &BB, MCInst &Inst, + uint64_t JTAddr, BinaryFunction::JumpTable *JumpTable, + DataflowInfoManager &Info); + + /// The PIC jump table optimization consists of "de-pic-ifying" it, since the + /// PIC jump sequence is larger than its non-PIC counterpart, saving icache. + bool tryOptimizePIC(BinaryContext &BC, BinaryBasicBlock &BB, MCInst &Inst, + uint64_t JTAddr, BinaryFunction::JumpTable *JumpTable, + DataflowInfoManager &Info); + + /// Run a pass for \p Function + void optimizeFunction(BinaryContext &BC, BinaryFunction &Function, + DataflowInfoManager &Info); + +public: + explicit JTFootprintReduction(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { } + + /// BinaryPass interface functions + const char *getName() const override { + return "jt-footprint-reduction"; + } + bool shouldPrint(const BinaryFunction &BF) const override { + return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0; + } + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/Passes/LivenessAnalysis.h b/bolt/Passes/LivenessAnalysis.h index 41909503b2dc..54f74d28391a 100644 --- a/bolt/Passes/LivenessAnalysis.h +++ b/bolt/Passes/LivenessAnalysis.h @@ -60,8 +60,13 @@ class LivenessAnalysis BitVector BV = *this->getStateAt(P); BV.flip(); BitVector 
GPRegs(NumRegs, false); - this->BC.MIA->getGPRegs(GPRegs); + this->BC.MIA->getGPRegs(GPRegs, /*IncludeAlias=*/false); + // Ignore the register used for frame pointer even if it is not alive (it + // may be used by CFI which is not represented in our dataflow). + auto FP = BC.MIA->getAliases(BC.MIA->getFramePointer()); + FP.flip(); BV &= GPRegs; + BV &= FP; int Reg = BV.find_first(); return Reg != -1 ? Reg : 0; } @@ -74,6 +79,19 @@ class LivenessAnalysis void preflight() {} BitVector getStartingStateAtBB(const BinaryBasicBlock &BB) { + // Entry points start with default live out (registers used as return + // values). + if (BB.succ_size() == 0) { + BitVector State(NumRegs, false); + if (opts::AssumeABI) { + BC.MIA->getDefaultLiveOut(State); + BC.MIA->getCalleeSavedRegs(State); + } else { + State.set(); + State.reset(BC.MIA->getFlagsReg()); + } + return State; + } return BitVector(NumRegs, false); } @@ -100,7 +118,15 @@ class LivenessAnalysis // because we don't really know what's going on. 
if (RA.isConservative(Written)) { Written.reset(); - BC.MIA->getCalleeSavedRegs(Written); + BC.MIA->getDefaultLiveOut(Written); + // If ABI is respected, everything except CSRs should be dead after a + // call + if (opts::AssumeABI) { + auto CSR = BitVector(NumRegs, false); + BC.MIA->getCalleeSavedRegs(CSR); + CSR.flip(); + Written |= CSR; + } } } Written.flip(); @@ -108,7 +134,26 @@ class LivenessAnalysis // Gen if (!this->BC.MIA->isCFI(Point)) { auto Used = BitVector(NumRegs, false); - RA.getInstUsedRegsList(Point, Used, /*GetClobbers*/false); + if (IsCall) { + RA.getInstUsedRegsList(Point, Used, /*GetClobbers*/true); + if (RA.isConservative(Used)) { + Used = BC.MIA->getRegsUsedAsParams(); + BC.MIA->getDefaultLiveOut(Used); + } + } + const auto InstInfo = BC.MII->get(Point.getOpcode()); + for (unsigned I = 0, E = Point.getNumOperands(); I != E; ++I) { + if (!Point.getOperand(I).isReg() || I < InstInfo.getNumDefs()) + continue; + Used |= BC.MIA->getAliases(Point.getOperand(I).getReg(), + /*OnlySmaller=*/false); + } + for (auto + I = InstInfo.getImplicitUses(), + E = InstInfo.getImplicitUses() + InstInfo.getNumImplicitUses(); + I != E; ++I) { + Used |= BC.MIA->getAliases(*I, false); + } if (IsCall && (!BC.MIA->isTailCall(Point) || !BC.MIA->isConditionalBranch(Point))) { // Never gen FLAGS from a non-conditional call... this is overly From c8a2b0269d2a948520001d2c03801eca85f860e8 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Tue, 14 Nov 2017 16:51:24 -0800 Subject: [PATCH 352/904] a new i-cache metric Summary: The diff introduces two measures for i-cache performance: a TSP measure (currently used for optimization) and an "extended" TSP measure that takes into account jumps between non-consecutive basic blocks. The two measures are computed for estimated addresses/sizes of basic blocks and for the actually omitted addresses/sizes. Intuitively, the Extended-TSP metric quantifies the expected number of i-cache misses for a given ordering of basic blocks. 
It has 5 parameters: - FallthroughWeight is the impact of fallthrough jumps on the score - ForwardWeight is the impact of forward (but not fallthrough) jumps - BackwardWeight is the impact of backward jumps - ForwardDistance is the max distance of a forward jump affecting the score - BackwardDistance is the max distance of a backward jump affecting the score We're still learning the "best" values for the options but default values look reasonable so far. (cherry picked from commit c58f451224beff38e64dfb37de654151bdc1bcdf) --- bolt/CacheMetrics.cpp | 306 +++++++++++++++++++++++++++++++++++---- bolt/CacheMetrics.h | 4 +- bolt/RewriteInstance.cpp | 6 +- 3 files changed, 286 insertions(+), 30 deletions(-) diff --git a/bolt/CacheMetrics.cpp b/bolt/CacheMetrics.cpp index 4e5b08fe2729..bd723b80629d 100644 --- a/bolt/CacheMetrics.cpp +++ b/bolt/CacheMetrics.cpp @@ -13,19 +13,37 @@ using namespace llvm; using namespace bolt; using Traversal = std::vector; +// The weight of fallthrough jumps for ExtTSP metric +constexpr double FallthroughWeight = 1.0; +// The weight of forward jumps for ExtTSP metric +constexpr double ForwardWeight = 1.0; +// The weight of backward jumps for ExtTSP metric +constexpr double BackwardWeight = 1.0; +// The maximum distance (in bytes) of forward jumps for ExtTSP metric +constexpr uint64_t ForwardDistance = 256; +// The maximum distance (in bytes) of backward jumps for ExtTSP metric +constexpr uint64_t BackwardDistance = 256; + +// The size of the i-TLB cache page +constexpr uint64_t ITLBPageSize = 4096; +// Capacity of the i-TLB cache +constexpr uint64_t ITLBEntries = 16; + namespace { -/// Initialize and return a position map for binary basic blocks. 
-std::unordered_map -getPositionMap(const BinaryFunction &Function) { - std::unordered_map DistMap; - double CurrAddress = 0; - for (auto *BB : Function.layout()) { - uint64_t Size = BB->estimateSize(); - DistMap[BB] = CurrAddress + (double)Size / 2; - CurrAddress += Size; +/// Initialize and return a position map for binary basic blocks +void extractBasicBlockInfo( + const std::vector &BinaryFunctions, + std::unordered_map &BBAddr, + std::unordered_map &BBSize) { + + // Use addresses/sizes as in the output binary + for (auto BF : BinaryFunctions) { + for (auto BB : BF->layout()) { + BBAddr[BB] = BB->getOutputAddressRange().first; + BBSize[BB] = BB->getOutputSize(); + } } - return DistMap; } /// Initialize and return a vector of traversals for a given entry block @@ -81,21 +99,27 @@ std::vector getTraversals(BinaryBasicBlock *EntryBB) { } /// Given a traversal, return the sum of block distances along this traversal. -double -getTraversalLength(std::unordered_map &DistMap, - Traversal const &Path) { - double Length = 0.0; - BinaryBasicBlock *PrevBB = Path.front(); - for (auto BBI = std::next(Path.begin()); BBI != Path.end(); ++BBI) { - Length += std::abs(DistMap[*BBI] - DistMap[PrevBB]); - PrevBB = *BBI; +double getTraversalLength( + const std::unordered_map &BBAddr, + const Traversal &Path) { + double Length = 0; + for (size_t I = 0; I + 1 < Path.size(); I++) { + // Ignore calls between hot and cold parts + if (Path[I]->isCold() != Path[I + 1]->isCold()) + continue; + double SrcAddr = BBAddr.at(Path[I]); + double DstAddr = BBAddr.at(Path[I + 1]); + Length += std::abs(SrcAddr - DstAddr); } - return Length; } /// Calculate average number of call distance for every graph traversal -double calcGraphDistance(const std::vector &BinaryFunctions) { +double calcGraphDistance( + const std::vector &BinaryFunctions, + const std::unordered_map &BBAddr, + const std::unordered_map &BBSize) { + double TotalTraversalLength = 0; double NumTraversals = 0; for (auto BF : 
BinaryFunctions) { @@ -106,12 +130,11 @@ double calcGraphDistance(const std::vector &BinaryFunctions) { for (auto BB : BF->layout()) { if (BB->isEntryPoint()) { auto AllTraversals = getTraversals(BB); - auto DistMap = getPositionMap(*BF); for (auto const &Path : AllTraversals) { // Ignore short traversals if (Path.size() <= 1) continue; - TotalTraversalLength += getTraversalLength(DistMap, Path); + TotalTraversalLength += getTraversalLength(BBAddr, Path); NumTraversals++; } } @@ -121,6 +144,196 @@ double calcGraphDistance(const std::vector &BinaryFunctions) { return TotalTraversalLength / NumTraversals; } +/// Calculate TSP metric, which quantifies the number of fallthrough jumps in +/// the ordering of basic blocks +double calcTSPScore( + const std::vector &BinaryFunctions, + const std::unordered_map &BBAddr, + const std::unordered_map &BBSize) { + + double Score = 0; + for (auto BF : BinaryFunctions) { + for (auto SrcBB : BF->layout()) { + auto BI = SrcBB->branch_info_begin(); + for (auto DstBB : SrcBB->successors()) { + if (SrcBB != DstBB && BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + BBAddr.at(SrcBB) + BBSize.at(SrcBB) == BBAddr.at(DstBB)) + Score += BI->Count; + ++BI; + } + } + } + return Score; +} + +/// Calculate Extended-TSP metric, which quantifies the expected number of +/// i-cache misses for a given ordering of basic blocks. 
The parameters are: +/// - FallthroughWeight is the impact of fallthrough jumps on the score +/// - ForwardWeight is the impact of forward (but not fallthrough) jumps +/// - BackwardWeight is the impact of backward jumps +/// - ForwardDistance is the max distance of a forward jump affecting the score +/// - BackwardDistance is the max distance of a backward jump affecting the score +double calcExtTSPScore( + const std::vector &BinaryFunctions, + const std::unordered_map &BBAddr, + const std::unordered_map &BBSize, + double FallthroughWeight, + double ForwardWeight, + double BackwardWeight, + uint64_t ForwardDistance, + uint64_t BackwardDistance) { + + double Score = 0.0; + for (auto BF : BinaryFunctions) { + for (auto SrcBB : BF->layout()) { + auto BI = SrcBB->branch_info_begin(); + for (auto DstBB : SrcBB->successors()) { + if (DstBB != SrcBB) { + double Count = BI->Count == BinaryBasicBlock::COUNT_NO_PROFILE + ? 0.0 + : double(BI->Count); + uint64_t SrcAddr = BBAddr.at(SrcBB); + uint64_t SrcSize = BBSize.at(SrcBB); + uint64_t DstAddr = BBAddr.at(DstBB); + + if (SrcAddr <= DstAddr) { + if (SrcAddr + SrcSize == DstAddr) { + // fallthrough jump + Score += FallthroughWeight * Count; + } else { + // the distance of the forward jump + size_t Dist = DstAddr - (SrcAddr + SrcSize); + if (Dist <= ForwardDistance) { + double Prob = double(ForwardDistance - Dist) / ForwardDistance; + Score += ForwardWeight * Prob * Count; + } + } + } else { + // the distance of the backward jump + size_t Dist = SrcAddr + SrcSize - DstAddr; + if (Dist <= BackwardDistance) { + double Prob = double(BackwardDistance - Dist) / BackwardDistance; + Score += BackwardWeight * Prob * Count; + } + } + } + ++BI; + } + } + } + return Score; +} + +using Predecessors = std::vector>; + +/// Build a simplified version of the call graph: For every function, keep +/// its callers and the frequencies of the calls +std::unordered_map +extractFunctionCalls(const std::vector &BinaryFunctions) { + 
std::unordered_map Calls; + + for (auto SrcFunction : BinaryFunctions) { + const auto &BC = SrcFunction->getBinaryContext(); + for (auto BB : SrcFunction->layout()) { + // Find call instructions and extract target symbols from each one + for (auto &Inst : *BB) { + if (!BC.MIA->isCall(Inst)) + continue; + + // Call info + const MCSymbol* DstSym = BC.MIA->getTargetSymbol(Inst); + auto Count = BB->getKnownExecutionCount(); + // Ignore calls w/o information + if (DstSym == nullptr || Count == 0) + continue; + + auto DstFunction = BC.getFunctionForSymbol(DstSym); + // Ignore recursive calls + if (DstFunction == nullptr || + DstFunction->layout_empty() || + DstFunction == SrcFunction) + continue; + + // Record the call + Calls[DstFunction].push_back(std::make_pair(SrcFunction, Count)); + } + } + } + return Calls; +} + +/// Compute expected hit ratio of the i-TLB cache (optimized by HFSortPlus alg). +/// Given an assignment of functions to the i-TLB pages), we divide all +/// functions calls into two categories: +/// - 'short' ones that have a caller-callee distance less than a page; +/// - 'long' ones where the distance exceeds a page. +/// The short calls are likely to result in a i-TLB cache hit. For the long ones, +/// the hit/miss result depends on the 'hotness' of the page (i.e., how often +/// the page is accessed). Assuming that functions are sent to the i-TLB cache +/// in a random order, the probability that a page is present in the cache is +/// proportional to the number of samples corresponding to the functions on the +/// page. The following procedure detects short and long calls, and estimates +/// the expected number of cache misses for the long ones. 
+double expectedCacheHitRatio( + const std::vector &BinaryFunctions, + const std::unordered_map &BBAddr, + const std::unordered_map &BBSize, + double PageSize, + uint64_t CacheEntries) { + + auto Calls = extractFunctionCalls(BinaryFunctions); + // Compute 'hotness' of the functions + double TotalSamples = 0; + std::unordered_map FunctionSamples; + for (auto BF : BinaryFunctions) { + double Samples = 0; + for (auto Pair : Calls[BF]) { + Samples += Pair.second; + } + Samples = std::max(Samples, (double)BF->getKnownExecutionCount()); + FunctionSamples[BF] = Samples; + TotalSamples += Samples; + } + + // Compute 'hotness' of the pages + std::unordered_map PageSamples; + for (auto BF : BinaryFunctions) { + if (BF->layout_empty()) + continue; + auto Page = BBAddr.at(BF->layout_front()) / PageSize; + PageSamples[Page] += FunctionSamples.at(BF); + } + + // Computing the expected number of misses for every function + double Misses = 0; + for (auto BF : BinaryFunctions) { + // Skip the function if it has no samples + if (BF->layout_empty() || FunctionSamples.at(BF) == 0.0) + continue; + double Samples = FunctionSamples.at(BF); + auto Page = BBAddr.at(BF->layout_front()) / PageSize; + // The probability that the page is not present in the cache + double MissProb = pow(1.0 - PageSamples[Page] / TotalSamples, CacheEntries); + + // Processing all callers of the function + for (auto Pair : Calls[BF]) { + auto SrcFunction = Pair.first; + auto SrcPage = BBAddr.at(SrcFunction->layout_front()) / PageSize; + // Is this a 'long' or a 'short' call? 
+ if (Page != SrcPage) { + // This is a miss + Misses += MissProb * Pair.second; + } + Samples -= Pair.second; + } + assert(Samples >= 0.0 && "Function samples computed incorrectly"); + // The remaining samples likely come from the jitted code + Misses += Samples * MissProb; + } + + return 100.0 * (1.0 - Misses / TotalSamples); +} + } void CacheMetrics::printAll( @@ -149,7 +362,50 @@ void CacheMetrics::printAll( << format(" %zu (%.2lf%%) have non-empty execution count\n", NumHotBlocks, 100.0 * NumHotBlocks / NumBlocks); - const auto GraphDistance = calcGraphDistance(BinaryFunctions); - outs() << " An average length of graph traversal is " - << format("%.2lf\n", GraphDistance); + std::unordered_map BBAddr; + std::unordered_map BBSize; + extractBasicBlockInfo(BinaryFunctions, BBAddr, BBSize); + + size_t TotalCodeSize = 0; + size_t HotCodeSize = 0; + for (auto Pair : BBSize) { + TotalCodeSize += Pair.second; + auto BB = Pair.first; + if (!BB->isCold() && BB->getFunction()->hasValidIndex()) + HotCodeSize += Pair.second; + } + outs() << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu)\n", + 100.0 * HotCodeSize / TotalCodeSize, HotCodeSize, TotalCodeSize); + + outs() << " An average length of graph traversal: " + << format("%.0lf\n", calcGraphDistance(BinaryFunctions, + BBAddr, + BBSize)); + + outs() << " Expected i-TLB cache hit ratio " + << format("(%zu, %zu): ", ITLBPageSize, ITLBEntries) + << format("%.2lf%%\n", expectedCacheHitRatio(BinaryFunctions, + BBAddr, + BBSize, + ITLBPageSize, + ITLBEntries)); + + outs() << " TSP score: " + << format("%.0lf\n", calcTSPScore(BinaryFunctions, BBAddr, BBSize)); + + outs() << " ExtTSP score " + << format("(%.2lf, %.2lf, %.2lf, %zu, %zu): ", FallthroughWeight, + ForwardWeight, + BackwardWeight, + ForwardDistance, + BackwardDistance) + << format("%.0lf\n", calcExtTSPScore(BinaryFunctions, + BBAddr, + BBSize, + FallthroughWeight, + ForwardWeight, + BackwardWeight, + ForwardDistance, + BackwardDistance)); + } diff 
--git a/bolt/CacheMetrics.h b/bolt/CacheMetrics.h index e4ca3abc34f9..1dab4565bc34 100644 --- a/bolt/CacheMetrics.h +++ b/bolt/CacheMetrics.h @@ -11,13 +11,13 @@ #define LLVM_TOOLS_LLVM_BOLT_CACHEMETRICS_H #include "BinaryFunction.h" -#include +#include namespace llvm { namespace bolt { namespace CacheMetrics { -/// Calculate and print various metrics related to instruction cache performance +/// Calculate various metrics related to instruction cache performance. void printAll(const std::vector &BinaryFunctions); } // namespace CacheMetrics diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 815c26d80f02..66b1633605d6 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -675,7 +675,7 @@ void RewriteInstance::aggregateData() { void RewriteInstance::discoverStorage() { NamedRegionTimer T("discover storage", TimerGroupName, opts::TimeRewrite); - + // Stubs are harmful because RuntimeDyld may try to increase the size of // sections accounting for stubs when we need those sections to match the // same size seen in the input binary, in case this section is a copy @@ -1952,7 +1952,7 @@ void RewriteInstance::disassembleFunctions() { } auto FunctionData = BC->getFunctionData(Function); - + if (!FunctionData) { // When could it happen? errs() << "BOLT-ERROR: corresponding section is non-executable or " @@ -2464,7 +2464,7 @@ void RewriteInstance::emitFunctions() { OLT.emitAndFinalize(ObjectsHandle); if (opts::PrintCacheMetrics) { - outs() << "BOLT-INFO: cache metrics after optimization\n"; + outs() << "BOLT-INFO: cache metrics after emitting functions:\n"; CacheMetrics::printAll(SortedFunctions); } From ee66b2a74ecb7961dd2e93c17a872637d8881ad9 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 29 Nov 2017 17:38:39 -0800 Subject: [PATCH 353/904] [BOLT] Fix ICP nested jump table handling and general stats. Summary: Load elimination for ICP wasn't handling nested jump tables correctly. 
It wasn't offseting the indices by the range of the nested table. I also wasn't computing some of the stats ICP correctly in all cases which was leading to weird results in the stats. (cherry picked from commit 153f5d7f16978b70330cb27d355e2ce90022085f) --- bolt/DataReader.cpp | 4 + bolt/DataReader.h | 6 + bolt/Passes/IndirectCallPromotion.cpp | 219 ++++++++++++++++---------- bolt/Passes/IndirectCallPromotion.h | 3 +- 4 files changed, 143 insertions(+), 89 deletions(-) diff --git a/bolt/DataReader.cpp b/bolt/DataReader.cpp index c58919b2a360..eaa70fe4e048 100644 --- a/bolt/DataReader.cpp +++ b/bolt/DataReader.cpp @@ -304,6 +304,10 @@ void MemInfo::print(raw_ostream &OS) const { << Count << "\n"; } +void MemInfo::prettyPrint(raw_ostream &OS) const { + OS << "(PC: " << Offset << ", M: " << Addr << ", C: " << Count << ")"; +} + iterator_range FuncMemData::getMemInfoRange(uint64_t Offset) const { // Commented out because it can be expensive. diff --git a/bolt/DataReader.h b/bolt/DataReader.h index 852e6f177417..b3ba0999a932 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -208,9 +208,15 @@ struct MemInfo { } void print(raw_ostream &OS) const; + void prettyPrint(raw_ostream &OS) const; MemInfo(const Location &Offset, const Location &Addr, uint64_t Count = 0) : Offset(Offset), Addr(Addr), Count(Count) {} + + friend raw_ostream &operator<<(raw_ostream &OS, const MemInfo &MI) { + MI.prettyPrint(OS); + return OS; + } }; /// Helper class to store memory load events recorded in the address space of diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/Passes/IndirectCallPromotion.cpp index d17bcbd5c045..fc0792c7c760 100644 --- a/bolt/Passes/IndirectCallPromotion.cpp +++ b/bolt/Passes/IndirectCallPromotion.cpp @@ -188,6 +188,9 @@ IndirectCallPromotion::getCallTargets( assert(BF.getBasicBlockForLabel(Entry) || Entry == BF.getFunctionEndLabel() || Entry == BF.getFunctionColdEndLabel()); + if (Entry == BF.getFunctionEndLabel() || + Entry == 
BF.getFunctionColdEndLabel()) + continue; const Location To(Entry); Callsite CS{ From, To, JI->Mispreds, JI->Count, BranchHistories(), @@ -298,8 +301,7 @@ IndirectCallPromotion::maybeGetHotJumpTableTargets( BinaryBasicBlock *BB, MCInst &CallInst, MCInst *&TargetFetchInst, - const BinaryFunction::JumpTable *JT, - const std::vector &Targets + const BinaryFunction::JumpTable *JT ) const { const auto *MemData = Function.getMemData(); JumpTableInfoType HotTargets; @@ -388,18 +390,28 @@ IndirectCallPromotion::maybeGetHotJumpTableTargets( ArrayStart += Address + Size; } + // This is a map of [symbol] -> [count, index] and is used to combine indices + // into the jump table since there may be multiple addresses that all have the + // same entry. + std::map> HotTargetMap; + const auto Range = JT->getEntriesForAddress(ArrayStart); + for (const auto &MI : MemData->getMemInfoRange(DataOffset.get())) { size_t Index; - if (MI.Addr.Offset % JT->EntrySize != 0) // ignore bogus data + if (!MI.Addr.Offset) // mem data occasionally includes nulls, ignore them continue; + if (MI.Addr.Offset % JT->EntrySize != 0) // ignore bogus data + return JumpTableInfoType(); + if (MI.Addr.IsSymbol) { // Deal with bad/stale data if (MI.Addr.Name != (std::string("JUMP_TABLEat0x") + Twine::utohexstr(JT->Address).str()) && MI.Addr.Name != (std::string("JUMP_TABLEat0x") + - Twine::utohexstr(ArrayStart).str())) - continue; + Twine::utohexstr(ArrayStart).str())) { + return JumpTableInfoType(); + } Index = MI.Addr.Offset / JT->EntrySize; } else { Index = (MI.Addr.Offset - ArrayStart) / JT->EntrySize; @@ -407,23 +419,47 @@ IndirectCallPromotion::maybeGetHotJumpTableTargets( // If Index is out of range it probably means the memory profiling data is // wrong for this instruction, bail out. 
- if (Index >= JT->getSize()) - continue; + if (Index >= Range.second) { + DEBUG(dbgs() << "BOLT-INFO: Index out of range of " << Range.first + << ", " << Range.second << "\n"); + return JumpTableInfoType(); + } + + // Make sure the hot index points at a legal label corresponding to a BB, + // e.g. not the end of function (unreachable) label. + if (!Function.getBasicBlockForLabel(JT->Entries[Index + Range.first])) { + DEBUG({ + dbgs() << "BOLT-INFO: hot index " << Index << " pointing at bogus " + << "label " << JT->Entries[Index + Range.first]->getName() + << " in jump table:\n"; + JT->print(dbgs()); + dbgs() << "HotTargetMap:\n"; + for (auto &HT : HotTargetMap) { + dbgs() << "BOLT-INFO: " << HT.first->getName() + << " = (count=" << HT.first << ", index=" << HT.second + << ")\n"; + } + dbgs() << "BOLT-INFO: MemData:\n"; + for (auto &MI : MemData->getMemInfoRange(DataOffset.get())) { + dbgs() << "BOLT-INFO: " << MI << "\n"; + } + }); + return JumpTableInfoType(); + } - assert(std::accumulate(Targets.begin(), - Targets.end(), - false, - [Index](bool Found, const Callsite &CS) { - return (Found || - std::find(CS.JTIndex.begin(), - CS.JTIndex.end(), - Index) != CS.JTIndex.end()); - }) && - "hot indices must be referred to by at least one callsite"); - - HotTargets.emplace_back(std::make_pair(MI.Count, Index)); + auto &HotTarget = HotTargetMap[JT->Entries[Index + Range.first]]; + HotTarget.first += MI.Count; + HotTarget.second = Index; } + std::transform( + HotTargetMap.begin(), + HotTargetMap.end(), + std::back_inserter(HotTargets), + [](const std::pair> &A) { + return A.second; + }); + // Sort with highest counts first. 
std::sort(HotTargets.rbegin(), HotTargets.rend()); @@ -458,59 +494,57 @@ IndirectCallPromotion::findCallTargetSymbols( SymTargetsType SymTargets; if (JT) { - std::vector NewTargets; - std::set ToDelete; - - auto findTargetSymbol = - [&](uint64_t Index, const std::vector &Targets) -> MCSymbol * { - size_t Idx = 0; - for (const auto &CS : Targets) { - assert(CS.To.IsSymbol && "All ICP targets must be to known symbols"); - assert(!CS.JTIndex.empty()); - if (std::find(CS.JTIndex.begin(), CS.JTIndex.end(), Index) != - CS.JTIndex.end()) { - ToDelete.insert(Idx); - NewTargets.push_back(CS); - // Since we know the hot index, delete the rest. - NewTargets.back().JTIndex.clear(); - NewTargets.back().JTIndex.push_back(Index); - return CS.To.Sym; - } - ++Idx; - } - return nullptr; - }; - auto HotTargets = maybeGetHotJumpTableTargets(BC, Function, BB, CallInst, TargetFetchInst, - JT, - Targets); + JT); if (!HotTargets.empty()) { - HotTargets.resize(std::min(N, HotTargets.size())); - for (const auto &HT : HotTargets) { - auto *Sym = findTargetSymbol(HT.second, Targets); - assert(Sym); - SymTargets.push_back(std::make_pair(Sym, HT.second)); + auto findTargetsIndex = [&](uint64_t JTIndex) { + for (size_t I = 0; I < Targets.size(); ++I) { + auto &JTIs = Targets[I].JTIndex; + if (std::find(JTIs.begin(), JTIs.end(), JTIndex) != JTIs.end()) + return I; + } + DEBUG(dbgs() << "BOLT-ERROR: Unable to find target index for hot jump " + << " table entry in " << Function << "\n"); + llvm_unreachable("Hot indices must be referred to by at least one " + "callsite"); + }; + + const auto MaxHotTargets = std::min(N, HotTargets.size()); + + if (opts::Verbosity >= 1) { + for (size_t I = 0; I < MaxHotTargets; ++I) { + outs() << "BOLT-INFO: HotTarget[" << I << "] = (" + << HotTargets[I].first << ", " << HotTargets[I].second << ")\n"; + } } - for (size_t I = 0; I < Targets.size(); ++I) { - if (ToDelete.count(I) == 0) - NewTargets.push_back(Targets[I]); + + std::vector NewTargets; + for (size_t I = 0; 
I < MaxHotTargets; ++I) { + const auto JTIndex = HotTargets[I].second; + const auto TargetIndex = findTargetsIndex(JTIndex); + + NewTargets.push_back(Targets[TargetIndex]); + std::vector({JTIndex}).swap(NewTargets.back().JTIndex); + + Targets.erase(Targets.begin() + TargetIndex); } + std::copy(Targets.begin(), Targets.end(), std::back_inserter(NewTargets)); + assert(NewTargets.size() == Targets.size() + MaxHotTargets); std::swap(NewTargets, Targets); - } else { - for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) { - assert(Targets[TgtIdx].To.IsSymbol && - "All ICP targets must be to known symbols"); - assert(!Targets[TgtIdx].JTIndex.empty() && - "Jump tables must have indices"); - for (auto Idx : Targets[TgtIdx].JTIndex) { - SymTargets.push_back(std::make_pair(Targets[TgtIdx].To.Sym, Idx)); - ++I; - } + } + + for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) { + auto &Target = Targets[TgtIdx]; + assert(Target.To.IsSymbol && "All ICP targets must be to known symbols"); + assert(!Target.JTIndex.empty() && "Jump tables must have indices"); + for (auto Idx : Target.JTIndex) { + SymTargets.push_back(std::make_pair(Target.To.Sym, Idx)); + ++I; } } } else { @@ -797,10 +831,13 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( } } } + assert(SymTargets.size() > NewBBs.size() - 1 && + "There must be a target symbol associated with each new BB."); // Fix up successors and execution counts. 
updateCurrentBranchInfo(); auto *Succ = Function.getBasicBlockForLabel(SymTargets[0]); + assert(Succ && "each jump target must be a legal BB label"); IndCallBlock->addSuccessor(Succ, BBI[0]); // cond branch IndCallBlock->addSuccessor(NewBBs[0].get(), TotalCount); // fallthru branch @@ -810,6 +847,7 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( uint64_t ExecCount = BBI[I+1].Count; updateCurrentBranchInfo(); auto *Succ = Function.getBasicBlockForLabel(SymTargets[I+1]); + assert(Succ && "each jump target must be a legal BB label"); NewBBs[I]->addSuccessor(Succ, BBI[I+1]); NewBBs[I]->addSuccessor(NewBBs[I+1].get(), TotalCount); // fallthru ExecCount += TotalCount; @@ -887,7 +925,16 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB, uint64_t NumCalls) { const bool IsJumpTable = BB->getFunction()->getJumpTable(Inst); - // If we have no targets (or no calls), skip this callsite. + auto computeStats = [&](size_t N) { + for (size_t I = 0; I < N; ++I) { + if (!IsJumpTable) + TotalNumFrequentCalls += Targets[I].Branches; + else + TotalNumFrequentJmps += Targets[I].Branches; + } + }; + + // If we have no targets (or no calls), skip this callsite. if (Targets.empty() || !NumCalls) { if (opts::Verbosity >= 1) { const auto InstIdx = &Inst - &(*BB->begin()); @@ -910,7 +957,11 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB, if (opts::ICPTopCallsites > 0) { auto &BC = BB->getFunction()->getBinaryContext(); - return BC.MIA->hasAnnotation(Inst, "DoICP") ? TrialN : 0; + if (BC.MIA->hasAnnotation(Inst, "DoICP")) { + computeStats(TrialN); + return TrialN; + } + return 0; } // Pick the top N targets. @@ -925,14 +976,10 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB, // is exceeded by fewer targets. 
double Threshold = double(opts::IndirectCallPromotionMispredictThreshold); for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) { - const auto Frequency = (100.0 * Targets[I].Mispreds) / NumCalls; + Threshold -= (100.0 * Targets[I].Mispreds) / NumCalls; TotalMispredictsTopN += Targets[I].Mispreds; - if (!IsJumpTable) - TotalNumFrequentCalls += Targets[I].Branches; - else - TotalNumFrequentJmps += Targets[I].Branches; - Threshold -= Frequency; } + computeStats(N); // Compute the misprediction frequency of the top N call targets. If this // frequency is greater than the threshold, we should try ICP on this callsite. @@ -951,24 +998,22 @@ IndirectCallPromotion::canPromoteCallsite(const BinaryBasicBlock *BB, return 0; } } else { + size_t MaxTargets = 0; + // Count total number of calls for (at most) the top N targets. // We may choose a smaller N (TrialN vs. N) if the frequency threshold // is exceeded by fewer targets. double Threshold = double(opts::IndirectCallPromotionThreshold); - for (size_t I = 0; I < TrialN && Threshold > 0; ++I) { + for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++MaxTargets) { if (N + (Targets[I].JTIndex.empty() ? 1 : Targets[I].JTIndex.size()) > TrialN) break; - const auto Frequency = (100.0 * Targets[I].Branches) / NumCalls; TotalCallsTopN += Targets[I].Branches; TotalMispredictsTopN += Targets[I].Mispreds; - if (!IsJumpTable) - TotalNumFrequentCalls += Targets[I].Branches; - else - TotalNumFrequentJmps += Targets[I].Branches; - Threshold -= Frequency; + Threshold -= (100.0 * Targets[I].Branches) / NumCalls; N += Targets[I].JTIndex.empty() ? 1 : Targets[I].JTIndex.size(); } + computeStats(MaxTargets); // Compute the frequency of the top N call targets. If this frequency // is greater than the threshold, we should try ICP on this callsite. 
@@ -1033,18 +1078,17 @@ IndirectCallPromotion::printCallsiteInfo(const BinaryBasicBlock *BB, const bool IsTailCall = BC.MIA->isTailCall(Inst); const bool IsJumpTable = BB->getFunction()->getJumpTable(Inst); const auto InstIdx = &Inst - &(*BB->begin()); - bool Separator = false; outs() << "BOLT-INFO: ICP candidate branch info: " << *BB->getFunction() << " @ " << InstIdx << " in " << BB->getName() << " -> calls = " << NumCalls - << (IsTailCall ? " (tail)" : (IsJumpTable ? " (jump table)" : "")); + << (IsTailCall ? " (tail)" : (IsJumpTable ? " (jump table)" : "")) + << "\n"; for (size_t I = 0; I < N; I++) { const auto Frequency = 100.0 * Targets[I].Branches / NumCalls; const auto MisFrequency = 100.0 * Targets[I].Mispreds / NumCalls; - outs() << (Separator ? " | " : ", "); - Separator = true; + outs() << "BOLT-INFO: "; if (Targets[I].To.IsSymbol) outs() << Targets[I].To.Sym->getName(); else @@ -1058,11 +1102,11 @@ IndirectCallPromotion::printCallsiteInfo(const BinaryBasicBlock *BB, outs() << (First ? ", indices = " : ", ") << JTIndex; First = false; } + outs() << "\n"; } - outs() << "\n"; DEBUG({ - dbgs() << "BOLT-INFO: ICP original call instruction:\n"; + dbgs() << "BOLT-INFO: ICP original call instruction:"; BC.printInstruction(dbgs(), Inst, Targets[0].From.Addr, nullptr, true); }); } @@ -1189,7 +1233,7 @@ void IndirectCallPromotion::runOnFunctions( ++Num; } outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls - << ", " << Num << " calls cover " << opts::ICPTopCallsites << "% " + << ", " << Num << " callsites cover " << opts::ICPTopCallsites << "% " << "of all indirect calls\n"; // Mark sites to optimize with "DoICP" annotation. @@ -1211,9 +1255,6 @@ void IndirectCallPromotion::runOnFunctions( const bool HasLayout = !Function.layout_empty(); - // Note: this is not just counting calls. - TotalCalls += BranchData->ExecutionCount; - // Total number of indirect calls issued from the current Function. 
// (a fraction of TotalIndirectCalls) uint64_t FuncTotalIndirectCalls = 0; @@ -1242,6 +1283,10 @@ void IndirectCallPromotion::runOnFunctions( BC.MIA->hasAnnotation(Inst, "Offset"); const bool IsJumpTable = Function.getJumpTable(Inst); + if (BC.MIA->isCall(Inst)) { + TotalCalls += BB->getKnownExecutionCount(); + } + if (!((HasBranchData && !IsJumpTable && OptimizeCalls) || (IsJumpTable && OptimizeJumpTables))) continue; diff --git a/bolt/Passes/IndirectCallPromotion.h b/bolt/Passes/IndirectCallPromotion.h index cd49933fbe30..e7b4cdc285e9 100644 --- a/bolt/Passes/IndirectCallPromotion.h +++ b/bolt/Passes/IndirectCallPromotion.h @@ -199,8 +199,7 @@ class IndirectCallPromotion : public BinaryFunctionPass { BinaryBasicBlock *BB, MCInst &Inst, MCInst *&TargetFetchInst, - const BinaryFunction::JumpTable *JT, - const std::vector &Targets) const; + const BinaryFunction::JumpTable *JT) const; SymTargetsType findCallTargetSymbols(BinaryContext &BC, std::vector &Targets, From 0adce20fb6b92ecb9629150bcaa5a3e5b6701671 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 14 Nov 2017 18:20:40 -0800 Subject: [PATCH 354/904] [BOLT] Add REX prefix rebalancing pass Summary: Add a pass to rebalance the usage of REX prefixes, moving them from the hot code path to the cold path whenever possible. To do this, we rank the usage frequency of each register and exchange an X86 classic reg with an extended one (which requires a REX prefix) whenever the classic register is being used less times than the extended one. There are two versions of this pass: regular one will only consider RBX as classic and R12-R15 as extended registers because those are callee-saved, which means their scope is local to the function and therefore they can be easily interchanged within the function without further consequences. 
The aggressive version relies on liveness analysis to detect if the value of a register is being used as a caller-saved value (written to without being read first), which also is eligible for reallocation. However, it showed limited results and is not the default option because it is expensive. Currently, this pass does not update debug info. This means that if a substitution is made, the AT_LOCATION of a variable inside a function may be outdated and GDB will display the wrong value if you ask it to print the value of the affected variable. Updating DWARF involves a painful task of writing a new DWARF expression parser/writer similar to the one we already have for CFI expressions. I'll defer the task of writing this until we determine this optimization is enabled in production. So far, it is experimental to be combined with other optimizations to help us find a new set of optimizations that is beneficial. (cherry picked from commit 1a39b103bab189ab89b14096c3356e640ff8aa7b) --- bolt/BinaryPassManager.cpp | 18 ++ bolt/Passes/CMakeLists.txt | 1 + bolt/Passes/JTFootprintReduction.cpp | 28 +- bolt/Passes/LivenessAnalysis.h | 3 + bolt/Passes/RegReAssign.cpp | 431 +++++++++++++++++++++++++++ bolt/Passes/RegReAssign.h | 68 +++++ 6 files changed, 544 insertions(+), 5 deletions(-) create mode 100644 bolt/Passes/RegReAssign.cpp create mode 100644 bolt/Passes/RegReAssign.h diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 5bea65a527ea..facc0b5ddee0 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -18,6 +18,7 @@ #include "Passes/LongJmp.h" #include "Passes/JTFootprintReduction.h" #include "Passes/PLTCall.h" +#include "Passes/RegReAssign.h" #include "Passes/ReorderFunctions.h" #include "Passes/StokeInfo.h" #include "llvm/Support/Timer.h" @@ -136,6 +137,13 @@ PrintICP("print-icp", cl::Hidden, cl::cat(BoltOptCategory)); +static cl::opt +PrintRegReAssign("print-regreassign", + cl::desc("print functions after regreassign pass"), + 
cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + static cl::opt PrintInline("print-inline", cl::desc("print functions after inlining optimization"), @@ -213,6 +221,13 @@ SimplifyRODataLoads("simplify-rodata-loads", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +static cl::opt +RegReAssign("reg-reassign", + cl::desc("reassign registers so as to avoid using REX prefixes in hot code"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + static cl::opt StripRepRet("strip-rep-ret", cl::desc("strip 'repz' prefix from 'repz retq' sequence (on by default)"), @@ -357,6 +372,9 @@ void BinaryFunctionPassManager::runAllPasses( llvm::make_unique(PrintSimplifyROLoads), opts::SimplifyRODataLoads); + Manager.registerPass(llvm::make_unique(PrintRegReAssign), + opts::RegReAssign); + Manager.registerPass(llvm::make_unique(PrintICF), opts::ICF); diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt index ac446bd725b7..f9b8db8703af 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/Passes/CMakeLists.txt @@ -20,6 +20,7 @@ add_llvm_library(LLVMBOLTPasses PettisAndHansen.cpp PLTCall.cpp RegAnalysis.cpp + RegReAssign.cpp ReorderAlgorithm.cpp ReorderFunctions.cpp ShrinkWrapping.cpp diff --git a/bolt/Passes/JTFootprintReduction.cpp b/bolt/Passes/JTFootprintReduction.cpp index 6ac856f74e9f..6eeb2f82710f 100644 --- a/bolt/Passes/JTFootprintReduction.cpp +++ b/bolt/Passes/JTFootprintReduction.cpp @@ -26,6 +26,15 @@ extern cl::opt Relocs; extern bool shouldProcess(const bolt::BinaryFunction &Function); extern cl::opt JumpTables; + +static cl::opt +JTFootprintOnlyPIC("jt-footprint-optimize-for-icache", + cl::desc("with jt-footprint-reduction, only process PIC jumptables and turn" + " off other transformations that increase code size"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + } // namespace opts namespace llvm { @@ -45,15 +54,18 @@ void JTFootprintReduction::checkOpportunities(BinaryContext &BC, AllJTs[JumpTable] += 
BB.getKnownExecutionCount(); ++IndJmps; - if (BlacklistedJTs.count(JumpTable)) + if (BlacklistedJTs.count(JumpTable)) { + ++IndJmpsDenied; continue; + } uint64_t Scale; // Try a standard indirect jump matcher auto IndJmpMatcher = BC.MIA->matchIndJmp( BC.MIA->matchAnyOperand(), BC.MIA->matchImm(Scale), BC.MIA->matchReg(), BC.MIA->matchAnyOperand()); - if (IndJmpMatcher->match(*BC.MRI, *BC.MIA, + if (!opts::JTFootprintOnlyPIC && + IndJmpMatcher->match(*BC.MRI, *BC.MIA, MutableArrayRef(&*BB.begin(), &Inst + 1), -1) && Scale == 8) { @@ -115,6 +127,8 @@ void JTFootprintReduction::checkOpportunities(BinaryContext &BC, bool JTFootprintReduction::tryOptimizeNonPIC( BinaryContext &BC, BinaryBasicBlock &BB, MCInst &Inst, uint64_t JTAddr, BinaryFunction::JumpTable *JumpTable, DataflowInfoManager &Info) { + if (opts::JTFootprintOnlyPIC) + return false; MCOperand Base; uint64_t Scale; @@ -233,8 +247,12 @@ void JTFootprintReduction::runOnFunctions( if (opts::JumpTables == JTS_BASIC && opts::Relocs) return; - BinaryFunctionCallGraph CG(buildCallGraph(BC, BFs)); - RegAnalysis RA(BC, BFs, CG); + std::unique_ptr RA; + std::unique_ptr CG; + if (!opts::JTFootprintOnlyPIC) { + CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs))); + RA.reset(new RegAnalysis(BC, BFs, *CG)); + } for (auto &BFIt : BFs) { auto &Function = BFIt.second; @@ -244,7 +262,7 @@ void JTFootprintReduction::runOnFunctions( if (Function.getKnownExecutionCount() == 0) continue; - DataflowInfoManager Info(BC, Function, &RA, nullptr); + DataflowInfoManager Info(BC, Function, RA.get(), nullptr); BlacklistedJTs.clear(); checkOpportunities(BC, Function, Info); optimizeFunction(BC, Function, Info); diff --git a/bolt/Passes/LivenessAnalysis.h b/bolt/Passes/LivenessAnalysis.h index 54f74d28391a..1b22d0a9d66b 100644 --- a/bolt/Passes/LivenessAnalysis.h +++ b/bolt/Passes/LivenessAnalysis.h @@ -133,6 +133,9 @@ class LivenessAnalysis Next &= Written; // Gen if (!this->BC.MIA->isCFI(Point)) { + if 
(BC.MIA->isCleanRegXOR(Point)) + return Next; + auto Used = BitVector(NumRegs, false); if (IsCall) { RA.getInstUsedRegsList(Point, Used, /*GetClobbers*/true); diff --git a/bolt/Passes/RegReAssign.cpp b/bolt/Passes/RegReAssign.cpp new file mode 100644 index 000000000000..914164a57e59 --- /dev/null +++ b/bolt/Passes/RegReAssign.cpp @@ -0,0 +1,431 @@ +//===--- Passes/RegReAssign.cpp -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "DataflowAnalysis.h" +#include "DataflowInfoManager.h" +#include "RegReAssign.h" +#include + +#define DEBUG_TYPE "regreassign" + +using namespace llvm; + +namespace opts { +extern cl::OptionCategory BoltOptCategory; +extern cl::opt Relocs; +extern cl::opt UpdateDebugSections; +extern bool shouldProcess(const bolt::BinaryFunction &Function); + +static cl::opt +AggressiveReAssign("use-aggr-reg-reassign", + cl::desc("use register liveness analysis to try to find more opportunities " + "for -reg-reassign optimization"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +} + +namespace llvm { +namespace bolt { + +void RegReAssign::swap(BinaryContext &BC, BinaryFunction &Function, MCPhysReg A, + MCPhysReg B) { + const BitVector &AliasA = BC.MIA->getAliases(A, false); + const BitVector &AliasB = BC.MIA->getAliases(B, false); + + // Regular instructions + for (auto &BB : Function) { + for (auto &Inst : BB) { + for (int I = 0, E = Inst.getNumPrimeOperands(); I != E; ++I) { + auto &Operand = Inst.getOperand(I); + if (!Operand.isReg()) + continue; + + auto Reg = Operand.getReg(); + if (AliasA.test(Reg)) { + Operand.setReg(BC.MIA->getAliasSized(B, BC.MIA->getRegSize(Reg))); + --StaticBytesSaved; + 
DynBytesSaved -= BB.getKnownExecutionCount(); + continue; + } + if (!AliasB.test(Reg)) + continue; + Operand.setReg(BC.MIA->getAliasSized(A, BC.MIA->getRegSize(Reg))); + ++StaticBytesSaved; + DynBytesSaved += BB.getKnownExecutionCount(); + } + } + } + + // CFI + DenseSet Changed; + for (auto &BB : Function) { + for (auto &Inst : BB) { + if (!BC.MIA->isCFI(Inst)) + continue; + auto *CFI = Function.getCFIFor(Inst); + if (Changed.count(CFI)) + continue; + Changed.insert(CFI); + + switch (CFI->getOperation()) { + case MCCFIInstruction::OpRegister: { + const auto CFIReg2 = CFI->getRegister2(); + const MCPhysReg Reg2 = BC.MRI->getLLVMRegNum(CFIReg2, /*isEH=*/false); + if (AliasA.test(Reg2)) { + CFI->setRegister2(BC.MRI->getDwarfRegNum( + BC.MIA->getAliasSized(B, BC.MIA->getRegSize(Reg2)), false)); + } else if (AliasB.test(Reg2)) { + CFI->setRegister2(BC.MRI->getDwarfRegNum( + BC.MIA->getAliasSized(A, BC.MIA->getRegSize(Reg2)), false)); + } + } + // Fall-through + case MCCFIInstruction::OpUndefined: + case MCCFIInstruction::OpDefCfa: + case MCCFIInstruction::OpOffset: + case MCCFIInstruction::OpRestore: + case MCCFIInstruction::OpSameValue: + case MCCFIInstruction::OpDefCfaRegister: + case MCCFIInstruction::OpRelOffset: + case MCCFIInstruction::OpExpression: + case MCCFIInstruction::OpValExpression: { + const auto CFIReg = CFI->getRegister(); + const MCPhysReg Reg = BC.MRI->getLLVMRegNum(CFIReg, /*isEH=*/false); + if (AliasA.test(Reg)) { + CFI->setRegister(BC.MRI->getDwarfRegNum( + BC.MIA->getAliasSized(B, BC.MIA->getRegSize(Reg)), false)); + } else if (AliasB.test(Reg)) { + CFI->setRegister(BC.MRI->getDwarfRegNum( + BC.MIA->getAliasSized(A, BC.MIA->getRegSize(Reg)), false)); + } + break; + } + default: + break; + } + } + } +} + +void RegReAssign::rankRegisters(BinaryContext &BC, BinaryFunction &Function) { + std::fill(RegScore.begin(), RegScore.end(), 0); + std::fill(RankedRegs.begin(), RankedRegs.end(), 0); + + for (auto &BB : Function) { + for (auto &Inst : BB) { + 
const bool CannotUseREX = BC.MIA->cannotUseREX(Inst); + const auto &Desc = BC.MII->get(Inst.getOpcode()); + + // Disallow substituitions involving regs in implicit uses lists + const auto *ImplicitUses = Desc.getImplicitUses(); + while (ImplicitUses && *ImplicitUses) { + const size_t RegEC = + BC.MIA->getAliases(*ImplicitUses, false).find_first(); + RegScore[RegEC] = + std::numeric_limits::min(); + ++ImplicitUses; + } + + // Disallow substituitions involving regs in implicit defs lists + const auto *ImplicitDefs = Desc.getImplicitDefs(); + while (ImplicitDefs && *ImplicitDefs) { + const size_t RegEC = + BC.MIA->getAliases(*ImplicitDefs, false).find_first(); + RegScore[RegEC] = + std::numeric_limits::min(); + ++ImplicitDefs; + } + + for (int I = 0, E = Inst.getNumPrimeOperands(); I != E; ++I) { + const auto &Operand = Inst.getOperand(I); + if (!Operand.isReg()) + continue; + + if (Desc.getOperandConstraint(I, MCOI::TIED_TO) != -1) + continue; + + auto Reg = Operand.getReg(); + size_t RegEC = BC.MIA->getAliases(Reg, false).find_first(); + if (RegEC == 0) + continue; + + // Disallow substituitions involving regs in instrs that cannot use REX + if (CannotUseREX) { + RegScore[RegEC] = + std::numeric_limits::min(); + continue; + } + + // Unsupported substitution, cannot swap BH with R* regs, bail + if (BC.MIA->isUpper8BitReg(Reg) && ClassicCSR.test(Reg)) { + RegScore[RegEC] = + std::numeric_limits::min(); + continue; + } + + RegScore[RegEC] += BB.getKnownExecutionCount(); + } + } + } + std::iota(RankedRegs.begin(), RankedRegs.end(), 0); // 0, 1, 2, 3... 
+ std::sort(RankedRegs.begin(), RankedRegs.end(), + [&](size_t A, size_t B) { return RegScore[A] > RegScore[B]; }); + + DEBUG({ + for (auto Reg : RankedRegs) { + if (RegScore[Reg] == 0) + continue; + dbgs() << Reg << " "; + if (RegScore[Reg] > 0) + dbgs() << BC.MRI->getName(Reg) << ": " << RegScore[Reg] << "\n"; + else + dbgs() << BC.MRI->getName(Reg) << ": (blacklisted)\n"; + } + }); +} + +void RegReAssign::aggressivePassOverFunction(BinaryContext &BC, + BinaryFunction &Function) { + rankRegisters(BC, Function); + + // Bail early if our registers are all black listed, before running expensive + // analysis passes + bool Bail = true; + int64_t LowScoreClassic = std::numeric_limits::max(); + for (int J = ClassicRegs.find_first(); J != -1; + J = ClassicRegs.find_next(J)) { + if (RegScore[J] <= 0) + continue; + Bail = false; + if (RegScore[J] < LowScoreClassic) + LowScoreClassic = RegScore[J]; + } + if (Bail) + return; + BitVector Extended = ClassicRegs; + Extended.flip(); + Extended &= GPRegs; + Bail = true; + int64_t HighScoreExtended = 0; + for (int J = Extended.find_first(); J != -1; J = Extended.find_next(J)) { + if (RegScore[J] <= 0) + continue; + Bail = false; + if (RegScore[J] > HighScoreExtended) + HighScoreExtended = RegScore[J]; + } + // Also bail early if there is no profitable substitution even if we assume + // all registers can be exchanged + if (Bail || (LowScoreClassic << 1) >= HighScoreExtended) + return; + + // -- expensive pass -- determine all regs alive during func start + DataflowInfoManager Info(BC, Function, RA.get(), nullptr); + auto AliveAtStart = *Info.getLivenessAnalysis().getStateAt( + ProgramPoint::getFirstPointAt(*Function.begin())); + for (auto &BB : Function) { + if (BB.pred_size() == 0) + AliveAtStart |= *Info.getLivenessAnalysis().getStateAt( + ProgramPoint::getFirstPointAt(BB)); + } + // Mark frame pointer alive because of CFI + AliveAtStart |= BC.MIA->getAliases(BC.MIA->getFramePointer(), false); + // Never touch return registers 
+ BC.MIA->getDefaultLiveOut(AliveAtStart); + + // Try swapping more profitable options first + auto Begin = RankedRegs.begin(); + auto End = std::prev(RankedRegs.end()); + while (Begin != End) { + MCPhysReg ClassicReg = *End; + if (!ClassicRegs[ClassicReg] || RegScore[ClassicReg] <= 0) { + --End; + continue; + } + + MCPhysReg ExtReg = *Begin; + if (!Extended[ExtReg] || RegScore[ExtReg] <= 0) { + ++Begin; + continue; + } + + if (RegScore[ClassicReg] << 1 >= RegScore[ExtReg]) { + DEBUG(dbgs() << " Ending at " << BC.MRI->getName(ClassicReg) << " with " + << BC.MRI->getName(ExtReg) + << " because exchange is not profitable\n"); + break; + } + + BitVector AnyAliasAlive = AliveAtStart; + AnyAliasAlive &= BC.MIA->getAliases(ClassicReg); + if (AnyAliasAlive.any()) { + DEBUG(dbgs() << " Bailed on " << BC.MRI->getName(ClassicReg) << " with " + << BC.MRI->getName(ExtReg) + << " because classic reg is alive\n"); + --End; + continue; + } + AnyAliasAlive = AliveAtStart; + AnyAliasAlive &= BC.MIA->getAliases(ExtReg); + if (AnyAliasAlive.any()) { + DEBUG(dbgs() << " Bailed on " << BC.MRI->getName(ClassicReg) << " with " + << BC.MRI->getName(ExtReg) + << " because extended reg is alive\n"); + ++Begin; + continue; + } + + // Opportunity detected. Swap. 
+ DEBUG(dbgs() << "\n ** Swapping " << BC.MRI->getName(ClassicReg) << " with " + << BC.MRI->getName(ExtReg) << "\n\n"); + swap(BC, Function, ClassicReg, ExtReg); + FuncsChanged.insert(&Function); + ++Begin; + if (Begin == End) + break; + --End; + } +} + +bool RegReAssign::conservativePassOverFunction(BinaryContext &BC, + BinaryFunction &Function) { + rankRegisters(BC, Function); + + // Try swapping R12, R13, R14 or R15 with RBX (we work with all callee-saved + // regs except RBP) + MCPhysReg Candidate = 0; + for (int J = ExtendedCSR.find_first(); J != -1; + J = ExtendedCSR.find_next(J)) { + if (RegScore[J] > RegScore[Candidate]) + Candidate = J; + } + + if (!Candidate || RegScore[Candidate] < 0) + return false; + + // Check if our classic callee-saved reg (RBX is the only one) has lower + // score / utilization rate + MCPhysReg RBX = 0; + for (int I = ClassicCSR.find_first(); I != -1; I = ClassicCSR.find_next(I)) { + auto ScoreRBX = RegScore[I]; + if (ScoreRBX <= 0) + continue; + + if (RegScore[Candidate] > (ScoreRBX + 10)) { + RBX = I; + } + } + + if (!RBX) + return false; + + DEBUG(dbgs() << "\n ** Swapping " << BC.MRI->getName(RBX) << " with " + << BC.MRI->getName(Candidate) << "\n\n"); + swap(BC, Function, RBX, Candidate); + FuncsChanged.insert(&Function); + return true; +} + +void RegReAssign::setupAggressivePass(BinaryContext &BC, + std::map &BFs) { + setupConservativePass(BC, BFs); + CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs))); + RA.reset(new RegAnalysis(BC, BFs, *CG)); + + GPRegs = BitVector(BC.MRI->getNumRegs(), false); + BC.MIA->getGPRegs(GPRegs); +} + +void RegReAssign::setupConservativePass( + BinaryContext &BC, std::map &BFs) { + // Set up constant bitvectors used throughout this analysis + ClassicRegs = BitVector(BC.MRI->getNumRegs(), false); + CalleeSaved = BitVector(BC.MRI->getNumRegs(), false); + ClassicCSR = BitVector(BC.MRI->getNumRegs(), false); + ExtendedCSR = BitVector(BC.MRI->getNumRegs(), false); + // Never consider the 
frame pointer + BC.MIA->getClassicGPRegs(ClassicRegs); + ClassicRegs.flip(); + ClassicRegs |= BC.MIA->getAliases(BC.MIA->getFramePointer(), false); + ClassicRegs.flip(); + BC.MIA->getCalleeSavedRegs(CalleeSaved); + ClassicCSR |= ClassicRegs; + ClassicCSR &= CalleeSaved; + BC.MIA->getClassicGPRegs(ClassicRegs); + ExtendedCSR |= ClassicRegs; + ExtendedCSR.flip(); + ExtendedCSR &= CalleeSaved; + + DEBUG({ + RegStatePrinter P(BC); + dbgs() << "Starting register reassignment\nClassicRegs: "; + P.print(dbgs(), ClassicRegs); + dbgs() << "\nCalleeSaved: "; + P.print(dbgs(), CalleeSaved); + dbgs() << "\nClassicCSR: "; + P.print(dbgs(), ClassicCSR); + dbgs() << "\nExtendedCSR: "; + P.print(dbgs(), ExtendedCSR); + dbgs() << "\n"; + }); +} + +void RegReAssign::runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) { + RegScore = std::vector(BC.MRI->getNumRegs(), 0); + RankedRegs = std::vector(BC.MRI->getNumRegs(), 0); + + if (opts::AggressiveReAssign) + setupAggressivePass(BC, BFs); + else + setupConservativePass(BC, BFs); + + for (auto &I : BFs) { + auto &Function = I.second; + + if (!Function.isSimple() || !opts::shouldProcess(Function)) + continue; + + DEBUG(dbgs() << "====================================\n"); + DEBUG(dbgs() << " - " << Function.getPrintName() << "\n"); + if (!conservativePassOverFunction(BC, Function) && + opts::AggressiveReAssign) { + aggressivePassOverFunction(BC, Function); + DEBUG({ + if (FuncsChanged.count(&Function)) { + dbgs() << "Aggressive pass successful on " << Function.getPrintName() + << "\n"; + } + }); + } + } + + if (FuncsChanged.empty()) { + outs() << "BOLT-INFO: Reg Reassignment Pass: no changes were made.\n"; + return; + } + if (opts::UpdateDebugSections) { + outs() << "BOLT-WARNING: You used -reg-reassign and -update-debug-sections." + << " Some registers were changed but associated AT_LOCATION for " + << "impacted variables were NOT updated! 
This operation is " + << "currently unsupported by BOLT.\n"; + } + outs() << "BOLT-INFO: Reg Reassignment Pass Stats:\n"; + outs() << "\t " << FuncsChanged.size() << " functions affected.\n"; + outs() << "\t " << StaticBytesSaved << " static bytes saved.\n"; + outs() << "\t " << DynBytesSaved << " dynamic bytes saved.\n"; +} + +} +} diff --git a/bolt/Passes/RegReAssign.h b/bolt/Passes/RegReAssign.h new file mode 100644 index 000000000000..602ae12c5d28 --- /dev/null +++ b/bolt/Passes/RegReAssign.h @@ -0,0 +1,68 @@ +//===--- Passes/RegReAssign.h ----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REGREASSIGN_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_REGREASSIGN_H + +#include "BinaryPasses.h" +#include "RegAnalysis.h" + +namespace llvm { +namespace bolt { + +class RegReAssign : public BinaryFunctionPass { + std::vector RegScore; + std::vector RankedRegs; + BitVector ClassicRegs; + BitVector CalleeSaved; + BitVector ClassicCSR; + BitVector ExtendedCSR; + BitVector GPRegs; + + /// Hooks to other passes + std::unique_ptr RA; + std::unique_ptr CG; + + /// Stats + DenseSet FuncsChanged; + int64_t StaticBytesSaved{0}; + int64_t DynBytesSaved{0}; + + void swap(BinaryContext &BC, BinaryFunction &Function, MCPhysReg A, + MCPhysReg B); + void rankRegisters(BinaryContext &BC, BinaryFunction &Function); + void aggressivePassOverFunction(BinaryContext &BC, BinaryFunction &Function); + bool conservativePassOverFunction(BinaryContext &BC, + BinaryFunction &Function); + void setupAggressivePass(BinaryContext &BC, + std::map &BFs); + void setupConservativePass(BinaryContext &BC, + std::map &BFs); + +public: + /// BinaryPass 
public interface + + explicit RegReAssign(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) {} + + const char *getName() const override { return "regreassign"; } + + bool shouldPrint(const BinaryFunction &BF) const override { + return BinaryFunctionPass::shouldPrint(BF) && FuncsChanged.count(&BF) > 0; + } + + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; +} +} + +#endif From 621b5c19f6183350267027dca306b4f0a7ee6a97 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 6 Dec 2017 15:45:57 -0800 Subject: [PATCH 355/904] [BOLT] Options to facilitate debugging Summary: Some helpful options: -print-dyno-stats-only while printing functions output dyno-stats and skip instructions -report-stale print a list of functions with a stale profile (cherry picked from commit be49d623847311ffb4da61fb0ce3943feaa76203) --- bolt/BinaryFunction.cpp | 9 ++++++++- bolt/RewriteInstance.cpp | 19 +++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 8afba679f450..0d0245d421be 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -124,6 +124,13 @@ PrintDynoStats("dyno-stats", cl::desc("print execution info based on profile"), cl::cat(BoltCategory)); +static cl::opt +PrintDynoStatsOnly("print-dyno-stats-only", + cl::desc("while printing functions output dyno-stats and skip instructions"), + cl::init(false), + cl::Hidden, + cl::cat(BoltCategory)); + static cl::opt PrintJumpTables("print-jump-tables", cl::desc("print jump tables"), @@ -441,7 +448,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << "\n}\n"; - if (!PrintInstructions || !BC.InstPrinter) + if (opts::PrintDynoStatsOnly || !PrintInstructions || !BC.InstPrinter) return; // Offset of the instruction in function. 
diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 66b1633605d6..01e6cb13fe28 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -217,6 +217,14 @@ Relocs("relocs", cl::ZeroOrMore, cl::cat(BoltCategory)); +static cl::opt +ReportStaleFuncs("report-stale", + cl::desc("print a list of functions with a stale profile"), + cl::init(false), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); + static cl::list SkipFunctionNames("skip-funcs", cl::CommaSeparated, @@ -2077,6 +2085,7 @@ void RewriteInstance::disassembleFunctions() { if (opts::AggregateOnly) return; + const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; uint64_t NumSimpleFunctions{0}; uint64_t NumStaleProfileFunctions{0}; std::vector ProfiledFunctions; @@ -2087,10 +2096,16 @@ void RewriteInstance::disassembleFunctions() { ++NumSimpleFunctions; if (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE) continue; - if (Function.hasValidProfile()) + if (Function.hasValidProfile()) { ProfiledFunctions.push_back(&Function); - else + } else { + if (opts::ReportStaleFuncs) { + outs() << StaleFuncsHeader + << " " << Function << '\n'; + StaleFuncsHeader = ""; + } ++NumStaleProfileFunctions; + } } BC->NumProfiledFuncs = ProfiledFunctions.size(); From 5eed6ef574e0b92aaa93734ca2d30a9b94bef70a Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 8 Dec 2017 20:27:49 -0800 Subject: [PATCH 356/904] [BOLT] Consistent DFS ordering for landing pads Summary: The list of landing pads in BinaryBasicBlock was sorted by their address in memory. As a result, the DFS order was not always deterministic. The change is to store landing pads in the order they appear in invoke instructions while keeping them unique. Also, add Throwers verification to validateCFG(). 
(cherry picked from commit 18f4d18932e7c94e67cc2c431512344f2038a9a0) --- bolt/BinaryFunction.cpp | 74 ++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 0d0245d421be..4531112f189b 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -486,7 +486,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, uint64_t BBExecCount = BB->getExecutionCount(); if (hasValidProfile()) { - OS << " Exec Count : " << BBExecCount << "\n"; + OS << " Exec Count : " << BBExecCount << '\n'; } if (BB->getCFIState() >= 0) { OS << " CFI State : " << BB->getCFIState() << '\n'; @@ -1422,6 +1422,7 @@ void BinaryFunction::recomputeLandingPads() { } for (auto *BB : BasicBlocks) { + std::unordered_set BBLandingPads; for (auto &Instr : *BB) { if (!BC.MIA->isInvoke(Instr)) continue; @@ -1433,18 +1434,12 @@ void BinaryFunction::recomputeLandingPads() { continue; auto *LPBlock = getBasicBlockForLabel(LPLabel); - BB->LandingPads.emplace_back(LPBlock); - LPBlock->Throwers.emplace_back(BB); + if (!BBLandingPads.count(LPBlock)) { + BBLandingPads.insert(LPBlock); + BB->LandingPads.emplace_back(LPBlock); + LPBlock->Throwers.emplace_back(BB); + } } - std::sort(BB->lp_begin(), BB->lp_end()); - auto NewEnd = std::unique(BB->lp_begin(), BB->lp_end()); - BB->LandingPads.erase(NewEnd, BB->lp_end()); - } - - for (auto *BB : BasicBlocks) { - std::sort(BB->throw_begin(), BB->throw_end()); - auto NewEnd = std::unique(BB->throw_begin(), BB->throw_end()); - BB->Throwers.erase(NewEnd, BB->throw_end()); } } @@ -2973,31 +2968,30 @@ bool BinaryFunction::validateCFG() const { if (!Valid) return Valid; - for (auto *BB : BasicBlocks) { - if (!std::is_sorted(BB->lp_begin(), BB->lp_end())) { - errs() << "BOLT-ERROR: unsorted list of landing pads in " - << BB->getName() << " in function " << *this << '\n'; - return false; - } - if (std::unique(BB->lp_begin(), BB->lp_end()) != 
BB->lp_end()) { - errs() << "BOLT-ERROR: duplicate landing pad detected in" - << BB->getName() << " in function " << *this << '\n'; - return false; - } - if (!std::is_sorted(BB->throw_begin(), BB->throw_end())) { - errs() << "BOLT-ERROR: unsorted list of throwers in " - << BB->getName() << " in function " << *this << '\n'; - return false; + for (const auto *BB : BasicBlocks) { + std::unordered_set BBLandingPads; + for (const auto *LP : BB->landing_pads()) { + if (BBLandingPads.count(LP)) { + errs() << "BOLT-ERROR: duplicate landing pad detected in" + << BB->getName() << " in function " << *this << '\n'; + return false; + } + BBLandingPads.insert(LP); } - if (std::unique(BB->throw_begin(), BB->throw_end()) != BB->throw_end()) { - errs() << "BOLT-ERROR: duplicate thrower detected in" - << BB->getName() << " in function " << *this << '\n'; - return false; + + std::unordered_set BBThrowers; + for (const auto *Thrower : BB->throwers()) { + if (BBThrowers.count(Thrower)) { + errs() << "BOLT-ERROR: duplicate thrower detected in" + << BB->getName() << " in function " << *this << '\n'; + return false; + } + BBThrowers.insert(Thrower); } - for (auto *LPBlock : BB->LandingPads) { - if (!std::binary_search(LPBlock->throw_begin(), - LPBlock->throw_end(), - BB)) { + + for (const auto *LPBlock : BB->landing_pads()) { + if (std::find(LPBlock->throw_begin(), LPBlock->throw_end(), BB) + == LPBlock->throw_end()) { errs() << "BOLT-ERROR: inconsistent landing pad detected in " << *this << ": " << BB->getName() << " is in LandingPads but not in " << LPBlock->getName() @@ -3005,6 +2999,16 @@ bool BinaryFunction::validateCFG() const { return false; } } + for (const auto *Thrower : BB->throwers()) { + if (std::find(Thrower->lp_begin(), Thrower->lp_end(), BB) + == Thrower->lp_end()) { + errs() << "BOLT-ERROR: inconsistent thrower detected in " + << *this << ": " << BB->getName() + << " is in Throwers list but not in " << Thrower->getName() + << " LandingPads\n"; + return false; + } + } } 
return Valid; From 6a42955e664cb8fbb9fad5c39d753b423cb8f57e Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Sat, 9 Dec 2017 21:40:39 -0800 Subject: [PATCH 357/904] [BOLT] Automatically detect and use relocations Summary: If relocations are available in the binary, use them by default. If "-relocs" is specified, then require relocations for further processing. Use "-relocs=0" to forcefully ignore relocations. Instead of `opts::Relocs` use `BinaryContext::HasRelocations` to check for the presence of the relocations. (cherry picked from commit 62b72f62d9e4e3a5b5a027747af15a23d0aef6b1) --- bolt/BinaryContext.cpp | 3 +- bolt/BinaryContext.h | 3 + bolt/BinaryFunction.cpp | 28 ++++---- bolt/DWARFRewriter.cpp | 1 - bolt/Passes/Aligner.cpp | 4 +- bolt/Passes/BinaryPasses.cpp | 7 +- bolt/Passes/JTFootprintReduction.cpp | 3 +- bolt/Passes/LongJmp.cpp | 3 +- bolt/Passes/RegReAssign.cpp | 1 - bolt/Passes/ReorderFunctions.cpp | 3 +- bolt/RewriteInstance.cpp | 98 +++++++++++++++------------- 11 files changed, 78 insertions(+), 76 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index e4bba784961a..82de83bd8e7b 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -27,7 +27,6 @@ namespace opts { extern cl::OptionCategory BoltCategory; -extern cl::opt Relocs; extern cl::opt ReorderFunctions; static cl::opt @@ -132,7 +131,7 @@ void BinaryContext::foldFunction(BinaryFunction &ChildBF, // Merge execution counts of ChildBF into those of ParentBF. ChildBF.mergeProfileDataInto(ParentBF); - if (opts::Relocs) { + if (HasRelocations) { // Remove ChildBF from the global set of functions in relocs mode. 
auto FI = BFs.find(ChildBF.getAddress()); assert(FI != BFs.end() && "function not found"); diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 3cc4f1442738..65804f0b5ab4 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -160,6 +160,9 @@ class BinaryContext { DataReader &DR; + /// Indicates if relocations are availabe for usage. + bool HasRelocations{false}; + /// Sum of execution count of all functions uint64_t SumExecutionCount{0}; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 4531112f189b..04615a4898ce 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -51,7 +51,6 @@ extern cl::OptionCategory BoltRelocCategory; extern bool shouldProcess(const BinaryFunction &); -extern cl::opt Relocs; extern cl::opt UpdateDebugSections; extern cl::opt IndirectCallPromotion; extern cl::opt Verbosity; @@ -654,7 +653,7 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, auto End = Instructions.end(); if (BC.TheTriple->getArch() == llvm::Triple::aarch64) { - PreserveNops = opts::Relocs; + PreserveNops = BC.HasRelocations; // Start at the last label as an approximation of the current basic block. // This is a heuristic, since the full set of labels have yet to be // determined @@ -1110,7 +1109,8 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { goto add_instruction; } BC.InterproceduralReferences.insert(TargetAddress); - if (opts::Verbosity >= 2 && !IsCall && Size == 2 && !opts::Relocs) { + if (opts::Verbosity >= 2 && !IsCall && Size == 2 && + !BC.HasRelocations) { errs() << "BOLT-WARNING: relaxed tail call detected at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << " in function " << *this @@ -1147,7 +1147,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { } } - if (opts::Relocs) { + if (BC.HasRelocations) { // Check if we need to create relocation to move this function's // code without re-assembly. size_t RelSize = (Size < 5) ? 
1 : 4; @@ -1230,7 +1230,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { errs() << "BOLT-ERROR: cannot handle PC-relative operand at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ". Skipping function " << *this << ".\n"; - if (opts::Relocs) + if (BC.HasRelocations) exit(1); IsSimple = false; } @@ -1242,7 +1242,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { errs() << "BOLT-ERROR: cannot handle PC-relative operand at 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ". Skipping function " << *this << ".\n"; - if (opts::Relocs) + if (BC.HasRelocations) exit(1); IsSimple = false; } @@ -1294,7 +1294,7 @@ void BinaryFunction::postProcessJumpTables() { TakenBranches.emplace_back(JTSiteOffset, TargetOffset); // Take ownership of jump table relocations. - if (opts::Relocs) + if (BC.HasRelocations) BC.removeRelocationAt(JT->Address + EntryOffset); EntryOffset += JT->EntrySize; @@ -1448,7 +1448,7 @@ bool BinaryFunction::buildCFG() { auto &MIA = BC.MIA; if (!isSimple()) { - assert(!opts::Relocs && + assert(!BC.HasRelocations && "cannot process file with non-simple function in relocs mode"); return false; } @@ -1909,7 +1909,7 @@ void BinaryFunction::addEntryPoint(uint64_t Address) { auto *BB = getBasicBlockAtOffset(Offset); if (!BB) { // TODO #14762450: split basic block and process function. - if (opts::Verbosity || opts::Relocs) { + if (opts::Verbosity || BC.HasRelocations) { errs() << "BOLT-WARNING: no basic block at offset 0x" << Twine::utohexstr(Offset) << " in function " << *this << ". 
Marking non-simple.\n"; @@ -3701,7 +3701,7 @@ void BinaryFunction::emitJumpTables(MCStreamer *Streamer) { auto &JT = JTI.second; if (opts::PrintJumpTables) JT.print(outs()); - if (opts::JumpTables == JTS_BASIC && opts::Relocs) { + if (opts::JumpTables == JTS_BASIC && BC.HasRelocations) { JT.updateOriginal(BC); } else { MCSection *HotSection, *ColdSection; @@ -3769,7 +3769,7 @@ void BinaryFunction::JumpTable::updateOriginal(BinaryContext &BC) { // In non-relocation mode we have to emit jump tables in local sections. // This way we only overwrite them when a corresponding function is // overwritten. - assert(opts::Relocs && "relocation mode expected"); + assert(BC.HasRelocations && "relocation mode expected"); auto SectionOrError = BC.getSectionForAddress(Address); assert(SectionOrError && "section not found for jump table"); auto Section = SectionOrError.get(); @@ -3961,7 +3961,7 @@ DWARFAddressRangesVector BinaryFunction::getOutputAddressRanges() const { uint64_t BinaryFunction::translateInputToOutputAddress(uint64_t Address) const { // If the function hasn't changed return the same address. - if (!isEmitted() && !opts::Relocs) + if (!isEmitted() && !BC.HasRelocations) return Address; if (Address < getAddress()) @@ -3986,7 +3986,7 @@ uint64_t BinaryFunction::translateInputToOutputAddress(uint64_t Address) const { DWARFAddressRangesVector BinaryFunction::translateInputToOutputRanges( const DWARFAddressRangesVector &InputRanges) const { // If the function hasn't changed return the same ranges. - if (!isEmitted() && !opts::Relocs) + if (!isEmitted() && !BC.HasRelocations) return InputRanges; // Even though we will merge ranges in a post-processing pass, we attempt to @@ -4062,7 +4062,7 @@ DWARFDebugLoc::LocationList BinaryFunction::translateInputToOutputLocationList( const DWARFDebugLoc::LocationList &InputLL, uint64_t BaseAddress) const { // If the function wasn't changed - there's nothing to update. 
- if (!isEmitted() && !opts::Relocs) { + if (!isEmitted() && !BC.HasRelocations) { if (!BaseAddress) { return InputLL; } else { diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp index f7678f9a19c1..565a79dfe519 100644 --- a/bolt/DWARFRewriter.cpp +++ b/bolt/DWARFRewriter.cpp @@ -49,7 +49,6 @@ namespace opts { extern cl::OptionCategory BoltCategory; extern cl::opt Verbosity; -extern cl::opt Relocs; static cl::opt KeepARanges("keep-aranges", diff --git a/bolt/Passes/Aligner.cpp b/bolt/Passes/Aligner.cpp index c55379a19b0e..0d56e9ffcfea 100644 --- a/bolt/Passes/Aligner.cpp +++ b/bolt/Passes/Aligner.cpp @@ -16,8 +16,6 @@ using namespace llvm; namespace opts { extern cl::OptionCategory BoltOptCategory; -extern cl::opt Relocs; - cl::opt UseCompactAligner("use-compact-aligner", cl::desc("Use compact approach for aligning functions"), @@ -85,7 +83,7 @@ void alignCompact(BinaryContext &BC, BinaryFunction &Function) { void AlignerPass::runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) { - if (!opts::Relocs) + if (!BC.HasRelocations) return; for (auto &It : BFs) { diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 53a60e149386..3e98e6300831 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -51,7 +51,6 @@ namespace opts { extern cl::OptionCategory BoltOptCategory; extern cl::opt Verbosity; -extern cl::opt Relocs; extern cl::opt SplitFunctions; extern bool shouldProcess(const bolt::BinaryFunction &Function); @@ -552,7 +551,7 @@ void FixupBranches::runOnFunctions( std::set &) { for (auto &It : BFs) { auto &Function = It.second; - if (opts::Relocs || shouldOptimize(Function)) { + if (BC.HasRelocations || shouldOptimize(Function)) { Function.fixBranches(); } } @@ -575,12 +574,12 @@ void FinalizeFunctions::runOnFunctions( } // Always fix functions in relocation mode. - if (!opts::Relocs && !ShouldOptimize) + if (!BC.HasRelocations && !ShouldOptimize) continue; // Fix the CFI state. 
if (ShouldOptimize && !Function.fixCFIState()) { - if (opts::Relocs) { + if (BC.HasRelocations) { errs() << "BOLT-ERROR: unable to fix CFI state for function " << Function << ". Exiting.\n"; exit(1); diff --git a/bolt/Passes/JTFootprintReduction.cpp b/bolt/Passes/JTFootprintReduction.cpp index 6eeb2f82710f..ec37bd4b836d 100644 --- a/bolt/Passes/JTFootprintReduction.cpp +++ b/bolt/Passes/JTFootprintReduction.cpp @@ -22,7 +22,6 @@ namespace opts { extern cl::OptionCategory BoltOptCategory; extern cl::opt Verbosity; -extern cl::opt Relocs; extern bool shouldProcess(const bolt::BinaryFunction &Function); extern cl::opt JumpTables; @@ -244,7 +243,7 @@ void JTFootprintReduction::runOnFunctions( std::map &BFs, std::set &LargeFunctions ) { - if (opts::JumpTables == JTS_BASIC && opts::Relocs) + if (opts::JumpTables == JTS_BASIC && BC.HasRelocations) return; std::unique_ptr RA; diff --git a/bolt/Passes/LongJmp.cpp b/bolt/Passes/LongJmp.cpp index ffddde6308f1..a3d004649899 100644 --- a/bolt/Passes/LongJmp.cpp +++ b/bolt/Passes/LongJmp.cpp @@ -16,7 +16,6 @@ using namespace llvm; namespace opts { -extern cl::opt Relocs; extern cl::opt UseOldText; extern cl::opt AlignFunctions; extern cl::opt AlignFunctionsMaxBytes; @@ -236,7 +235,7 @@ void LongJmpPass::tentativeLayout( std::vector &SortedFunctions) { uint64_t DotAddress = BC.LayoutStartAddress; - if (!opts::Relocs) { + if (!BC.HasRelocations) { for (auto Func : SortedFunctions) { HotAddresses[Func] = Func->getAddress(); DotAddress = RoundUpToAlignment(DotAddress, 16); diff --git a/bolt/Passes/RegReAssign.cpp b/bolt/Passes/RegReAssign.cpp index 914164a57e59..848147481132 100644 --- a/bolt/Passes/RegReAssign.cpp +++ b/bolt/Passes/RegReAssign.cpp @@ -20,7 +20,6 @@ using namespace llvm; namespace opts { extern cl::OptionCategory BoltOptCategory; -extern cl::opt Relocs; extern cl::opt UpdateDebugSections; extern bool shouldProcess(const bolt::BinaryFunction &Function); diff --git a/bolt/Passes/ReorderFunctions.cpp 
b/bolt/Passes/ReorderFunctions.cpp index bf4f178e2259..f7d4ffa55dd1 100644 --- a/bolt/Passes/ReorderFunctions.cpp +++ b/bolt/Passes/ReorderFunctions.cpp @@ -22,7 +22,6 @@ namespace opts { extern cl::OptionCategory BoltOptCategory; extern cl::opt Verbosity; -extern cl::opt Relocs; extern cl::opt RandomSeed; extern bool shouldProcess(const bolt::BinaryFunction &Function); @@ -281,7 +280,7 @@ std::vector readFunctionOrderFile() { void ReorderFunctions::runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) { - if (!opts::Relocs && opts::ReorderFunctions != BinaryFunction::RT_NONE) { + if (!BC.HasRelocations && opts::ReorderFunctions != BinaryFunction::RT_NONE) { errs() << "BOLT-ERROR: Function reordering only works when " << "relocs are enabled.\n"; exit(1); diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 01e6cb13fe28..733a52a2eb40 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -211,9 +211,9 @@ PrintLoopInfo("print-loops", cl::Hidden, cl::cat(BoltCategory)); -cl::opt -Relocs("relocs", - cl::desc("relocation mode - use relocations to move functions in the binary"), +static cl::opt +RelocationMode("relocs", + cl::desc("use relocations in the binary (default=autodetect)"), cl::ZeroOrMore, cl::cat(BoltCategory)); @@ -887,14 +887,13 @@ void RewriteInstance::run() { opts::BoostMacroops = false; outs() << "BOLT-INFO: disabling -boost-macroops for AArch64\n"; } - if (opts::Relocs && opts::UseOldText) { + if (opts::RelocationMode != cl::BOU_TRUE) { + errs() << "BOLT-WARNING: non-relocation mode for AArch64 is not fully " + "supported\n"; + } else if (opts::UseOldText) { opts::UseOldText = false; outs() << "BOLT-INFO: disabling -use-old-text for AArch64\n"; } - if (!opts::Relocs) { - outs() << "BOLT-WARNING: non-relocation mode for AArch64 is not fully " - "supported\n"; - } } auto executeRewritePass = [&](const std::set &NonSimpleFunctions) { @@ -968,7 +967,8 @@ void RewriteInstance::run() { } void 
RewriteInstance::discoverFileObjects() { - NamedRegionTimer T("discover file objects", TimerGroupName, opts::TimeRewrite); + NamedRegionTimer T("discover file objects", TimerGroupName, + opts::TimeRewrite); FileSymRefs.clear(); BinaryFunctions.clear(); @@ -1201,7 +1201,7 @@ void RewriteInstance::discoverFileObjects() { PreviousFunction-> addEntryPointAtOffset(Address - PreviousFunction->getAddress()); - if (!opts::Relocs) + if (!BC->HasRelocations) PreviousFunction->setSimple(false); // Remove the symbol from FileSymRefs so that we can skip it from @@ -1317,7 +1317,7 @@ void RewriteInstance::discoverFileObjects() { // Now that all the functions were created - adjust their boundaries. adjustFunctionBoundaries(); - if (!opts::Relocs) + if (!BC->HasRelocations) return; // Read all relocations now that we have binary functions mapped. @@ -1437,7 +1437,7 @@ void RewriteInstance::adjustFunctionBoundaries() { // In non-relocation mode there's potentially an external undetectable // reference to the entry point and hence we cannot move this entry // point. Optimizing without moving could be difficult. - if (!opts::Relocs) + if (!BC->HasRelocations) Function.setSimple(false); } @@ -1555,7 +1555,8 @@ BinaryFunction *RewriteInstance::createBinaryFunction( } void RewriteInstance::readSpecialSections() { - NamedRegionTimer T("read special sections", TimerGroupName, opts::TimeRewrite); + NamedRegionTimer T("read special sections", TimerGroupName, + opts::TimeRewrite); bool HasTextRelocations = false; @@ -1602,12 +1603,15 @@ void RewriteInstance::readSpecialSections() { } } - if (opts::Relocs && !HasTextRelocations) { + if (opts::RelocationMode == cl::BOU_TRUE && !HasTextRelocations) { errs() << "BOLT-ERROR: relocations against code are missing from the input " "file. Cannot proceed in relocations mode (-relocs).\n"; exit(1); } + BC->HasRelocations = HasTextRelocations && + (opts::RelocationMode != cl::BOU_FALSE); + // Process debug sections. 
EHFrame = BC->DwCtx->getEHFrame(); if (opts::DumpEHFrame) { @@ -1945,7 +1949,8 @@ void RewriteInstance::readProfileData() { } void RewriteInstance::disassembleFunctions() { - NamedRegionTimer T("disassemble functions", TimerGroupName, opts::TimeRewrite); + NamedRegionTimer T("disassemble functions", TimerGroupName, + opts::TimeRewrite); // Disassemble every function and build it's control flow graph. TotalScore = 0; BC->SumExecutionCount = 0; @@ -1953,7 +1958,7 @@ void RewriteInstance::disassembleFunctions() { BinaryFunction &Function = BFI.second; // If we have to relocate the code we have to disassemble all functions. - if (!opts::Relocs && !opts::shouldProcess(Function)) { + if (!BC->HasRelocations && !opts::shouldProcess(Function)) { DEBUG(dbgs() << "BOLT: skipping processing function " << Function << " per user request.\n"); continue; @@ -1981,10 +1986,10 @@ void RewriteInstance::disassembleFunctions() { Function.disassemble(*FunctionData); - if (!Function.isSimple() && opts::Relocs) { + if (!Function.isSimple() && BC->HasRelocations) { errs() << "BOLT-ERROR: function " << Function << " cannot be properly " << "disassembled. 
Unable to continue in relocation mode.\n"; - abort(); + exit(1); } if (opts::PrintAll || opts::PrintDisasm) @@ -1996,7 +2001,7 @@ void RewriteInstance::disassembleFunctions() { auto *ContainingFunction = getBinaryFunctionContainingAddress(Addr); if (ContainingFunction && ContainingFunction->getAddress() != Addr) { ContainingFunction->addEntryPoint(Addr); - if (!opts::Relocs) { + if (!BC->HasRelocations) { if (opts::Verbosity >= 1) { errs() << "BOLT-WARNING: Function " << *ContainingFunction << " has internal BBs that are target of a reference located" @@ -2019,7 +2024,7 @@ void RewriteInstance::disassembleFunctions() { if (SectionName == ".plt" || SectionName == ".plt.got") continue; - if (opts::Relocs) { + if (BC->HasRelocations) { errs() << "BOLT-ERROR: cannot process binaries with unmarked " << "object in code at address 0x" << Twine::utohexstr(Addr) << " belonging to section " @@ -2052,8 +2057,8 @@ void RewriteInstance::disassembleFunctions() { if (!CFIRdWrt->fillCFIInfoFor(Function)) { errs() << "BOLT-ERROR: unable to fill CFI for function " << Function << ".\n"; - if (opts::Relocs) - abort(); + if (BC->HasRelocations) + exit(1); Function.setSimple(false); continue; } @@ -2170,12 +2175,14 @@ void RewriteInstance::disassembleFunctions() { } void RewriteInstance::runOptimizationPasses() { - NamedRegionTimer T("run optimization passes", TimerGroupName, opts::TimeRewrite); + NamedRegionTimer T("run optimization passes", TimerGroupName, + opts::TimeRewrite); BinaryFunctionPassManager::runAllPasses(*BC, BinaryFunctions, LargeFunctions); } // Helper function to emit the contents of a function via a MCStreamer object. 
-void RewriteInstance::emitFunction(MCStreamer &Streamer, BinaryFunction &Function, +void RewriteInstance::emitFunction(MCStreamer &Streamer, + BinaryFunction &Function, bool EmitColdPart) { if (Function.getSize() == 0) return; @@ -2184,7 +2191,7 @@ void RewriteInstance::emitFunction(MCStreamer &Streamer, BinaryFunction &Functio return; MCSection *Section; - if (opts::Relocs) { + if (BC->HasRelocations) { Section = BC->MOFI->getTextSection(); } else { // Each fuction is emmitted into its own section. @@ -2201,7 +2208,7 @@ void RewriteInstance::emitFunction(MCStreamer &Streamer, BinaryFunction &Functio Streamer.SwitchSection(Section); - if (opts::Relocs) { + if (BC->HasRelocations) { Streamer.EmitCodeAlignment(BinaryFunction::MinAlign); auto MaxAlignBytes = EmitColdPart ? Function.getMaxColdAlignmentBytes() @@ -2227,7 +2234,7 @@ void RewriteInstance::emitFunction(MCStreamer &Streamer, BinaryFunction &Functio } // Emit CFI start - if (Function.hasCFI() && (opts::Relocs || Function.isSimple())) { + if (Function.hasCFI() && (BC->HasRelocations || Function.isSimple())) { Streamer.EmitCFIStartProc(/*IsSimple=*/false); if (Function.getPersonalityFunction() != nullptr) { Streamer.EmitCFIPersonality(Function.getPersonalityFunction(), @@ -2279,7 +2286,7 @@ void RewriteInstance::emitFunction(MCStreamer &Streamer, BinaryFunction &Functio } // Emit CFI end - if (Function.hasCFI() && (opts::Relocs || Function.isSimple())) + if (Function.hasCFI() && (BC->HasRelocations || Function.isSimple())) Streamer.EmitCFIEndProc(); Streamer.EmitLabel(EmitColdPart ? Function.getFunctionColdEndLabel() @@ -2338,7 +2345,7 @@ void RewriteInstance::emitFunctions() { Streamer->InitSections(false); // Mark beginning of "hot text". - if (opts::Relocs && opts::HotText) + if (BC->HasRelocations && opts::HotText) Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("__hot_start")); // Sort functions for the output. 
@@ -2346,7 +2353,7 @@ void RewriteInstance::emitFunctions() { BinaryContext::getSortedFunctions(BinaryFunctions); DEBUG( - if (!opts::Relocs) { + if (!BC->HasRelocations) { auto SortedIt = SortedFunctions.begin(); for (auto &It : BinaryFunctions) { assert(&It.second == *SortedIt); @@ -2375,7 +2382,8 @@ void RewriteInstance::emitFunctions() { // Emit all cold function split parts at the border of hot and // cold functions. - if (opts::Relocs && !ColdFunctionSeen && CurrentIndex >= LastHotIndex) { + if (BC->HasRelocations && !ColdFunctionSeen && + CurrentIndex >= LastHotIndex) { // Mark the end of "hot" stuff. if (opts::HotText) { Streamer->SwitchSection(BC->MOFI->getTextSection()); @@ -2394,7 +2402,7 @@ void RewriteInstance::emitFunctions() { DEBUG(dbgs() << "BOLT-DEBUG: first cold function: " << Function << '\n'); } - if (!opts::Relocs && + if (!BC->HasRelocations && (!Function.isSimple() || !opts::shouldProcess(Function))) { ++CurrentIndex; continue; @@ -2406,7 +2414,7 @@ void RewriteInstance::emitFunctions() { emitFunction(*Streamer, Function, /*EmitColdPart=*/false); - if (!opts::Relocs && Function.isSplit()) + if (!BC->HasRelocations && Function.isSplit()) emitFunction(*Streamer, Function, /*EmitColdPart=*/true); ++CurrentIndex; @@ -2417,7 +2425,7 @@ void RewriteInstance::emitFunctions() { Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("__hot_end")); } - if (!opts::Relocs && opts::UpdateDebugSections) + if (!BC->HasRelocations && opts::UpdateDebugSections) updateDebugLineInfoForNonSimpleFunctions(); emitDataSections(Streamer.get()); @@ -2490,7 +2498,7 @@ void RewriteInstance::emitFunctions() { void RewriteInstance::mapFileSections( orc::ObjectLinkingLayer<>::ObjSetHandleT &ObjectsHandle) { NewTextSectionStartAddress = NextAvailableAddress; - if (opts::Relocs) { + if (BC->HasRelocations) { auto SMII = EFMM->SectionMapInfo.find(".text"); assert(SMII != EFMM->SectionMapInfo.end() && ".text not found in output"); @@ -2693,7 +2701,7 @@ void 
RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { continue; } - if (opts::Relocs) { + if (BC->HasRelocations) { const auto BaseAddress = NewTextSectionStartAddress; const auto StartOffset = Layout.getSymbolOffset(*Function.getSymbol()); const auto EndOffset = @@ -2728,7 +2736,7 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { continue; // Output ranges should match the input if the body hasn't changed. - if (!Function.isSimple() && !opts::Relocs) + if (!Function.isSimple() && !BC->HasRelocations) continue; BinaryBasicBlock *PrevBB = nullptr; @@ -2737,7 +2745,7 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { auto *BB = *BBI; assert(BB->getLabel()->isDefined(false) && "symbol should be defined"); uint64_t BaseAddress; - if (opts::Relocs) { + if (BC->HasRelocations) { BaseAddress = NewTextSectionStartAddress; } else { BaseAddress = BB->isCold() ? Function.cold().getAddress() @@ -2831,7 +2839,7 @@ void RewriteInstance::emitDataSections(MCStreamer *Streamer) { } bool RewriteInstance::checkLargeFunctions() { - if (opts::Relocs) + if (BC->HasRelocations) return false; LargeFunctions.clear(); @@ -3382,7 +3390,7 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { // Fix ELF header. 
auto NewEhdr = *Obj->getHeader(); - if (opts::Relocs) { + if (BC->HasRelocations) { NewEhdr.e_entry = getNewFunctionAddress(NewEhdr.e_entry); assert(NewEhdr.e_entry && "cannot find new address for entry point"); } @@ -3429,7 +3437,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { NewSymbol.getType() != ELF::STT_SECTION) { NewSymbol.st_value = Function->getOutputAddress(); NewSymbol.st_size = Function->getOutputSize(); - if (opts::Relocs) + if (BC->HasRelocations) NewSymbol.st_shndx = NewTextSectionIndex; else NewSymbol.st_shndx = NewSectionIndex[NewSymbol.st_shndx]; @@ -3694,7 +3702,7 @@ void RewriteInstance::patchELFDynamic(ELFObjectFile *File) { break; case ELF::DT_INIT: case ELF::DT_FINI: - if (opts::Relocs) { + if (BC->HasRelocations) { if (auto NewAddress = getNewFunctionAddress(DE->getPtr())) { DEBUG(dbgs() << "BOLT-DEBUG: patching dynamic entry of type " << DE->getTag() << '\n'); @@ -3764,7 +3772,7 @@ void RewriteInstance::rewriteFile() { assert(Offset == getFileOffsetForAddress(NextAvailableAddress) && "error resizing output file"); - if (!opts::Relocs) { + if (!BC->HasRelocations) { // Overwrite functions in the output file. uint64_t CountOverwrittenFunctions = 0; uint64_t OverwrittenScore = 0; @@ -3861,7 +3869,7 @@ void RewriteInstance::rewriteFile() { } } - if (opts::Relocs && opts::TrapOldCode) { + if (BC->HasRelocations && opts::TrapOldCode) { auto SavedPos = OS.tell(); // Overwrite function body to make sure we never execute these instructions. for (auto &BFI : BinaryFunctions) { @@ -3913,7 +3921,7 @@ void RewriteInstance::rewriteFile() { // Patch dynamic section/segment. patchELFDynamic(); - if (opts::Relocs) { + if (BC->HasRelocations) { patchELFRelaPLT(); patchELFGOT(); From ecc6ddd1f968becdaf8de3a3b7994544ee08bb07 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 28 Nov 2017 09:57:21 -0800 Subject: [PATCH 358/904] [BOLT] Major overhaul of profiling in BOLT Summary: Profile reading was tightly coupled with building CFG. 
Since I plan to move to a new profile format that will be associated with CFG it is critical to decouple the two phases. We now have read profile right after the cfg was constructed, but before it is "canonicalized", i.e. CTCs will till be there. After reading the profile, we do a post-processing pass that fixes CFG and does some post-processing for debug info, such as inference of fall-throughs, which is still required with the current format. Another good reason for decoupling is that we can use profile with CFG to more accurately record fall-through branches during aggregation. At the moment we use "Offset" annotations to facilitate location of instructions corresponding to the profile. This might not be super efficient. However, once we switch to the new profile format the offsets would be no longer needed. We might keep them for the aggregator, but if we have to trust LBR data that might not be strictly necessary. I've tried to make changes while keeping backwards compatibly. This makes it easier to verify correctness of the changes, but that also means that we lose accuracy of the profile. Some refactoring is included. Flag "-prof-compat-mode" (on by default) is used for bug-level backwards compatibility. Disable it for more accurate tracing. 
(cherry picked from commit 883d771adb913bf934e75526175a5a652b595106) --- bolt/BinaryBasicBlock.cpp | 24 +- bolt/BinaryBasicBlock.h | 17 +- bolt/BinaryContext.cpp | 20 +- bolt/BinaryContext.h | 3 + bolt/BinaryFunction.cpp | 843 +++---------------------- bolt/BinaryFunction.h | 78 ++- bolt/BinaryFunctionProfile.cpp | 854 ++++++++++++++++++++++++++ bolt/BinaryPassManager.cpp | 2 +- bolt/CMakeLists.txt | 1 + bolt/DataAggregator.cpp | 92 +-- bolt/DataAggregator.h | 25 +- bolt/DataReader.h | 6 + bolt/Passes/BinaryPasses.cpp | 161 ++++- bolt/Passes/BinaryPasses.h | 8 +- bolt/Passes/IndirectCallPromotion.cpp | 3 +- bolt/Passes/ReorderFunctions.cpp | 45 +- bolt/Passes/ReorderFunctions.h | 12 +- bolt/RewriteInstance.cpp | 201 ++---- bolt/RewriteInstance.h | 8 +- bolt/llvm-bolt.cpp | 5 +- 20 files changed, 1340 insertions(+), 1068 deletions(-) create mode 100644 bolt/BinaryFunctionProfile.cpp diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index b3d9328f6a24..8bb3919b18e1 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -97,11 +97,12 @@ bool BinaryBasicBlock::validateSuccessorInvariants() { case 0: Valid = !CondBranch && !UncondBranch; break; - case 1: - Valid = !CondBranch || - (CondBranch && - !Function->getBasicBlockForLabel(BC.MIA->getTargetSymbol(*CondBranch))); + case 1: { + const bool HasCondBlock = CondBranch && + Function->getBasicBlockForLabel(BC.MIA->getTargetSymbol(*CondBranch)); + Valid = !CondBranch || !HasCondBlock; break; + } case 2: Valid = (CondBranch && @@ -121,7 +122,7 @@ bool BinaryBasicBlock::validateSuccessorInvariants() { << Twine::utohexstr(BC.MIA->getJumpTable(*Inst)) << "\n"; JT->print(errs()); } - dump(); + getFunction()->dump(); } return Valid; } @@ -452,5 +453,18 @@ uint64_t BinaryBasicBlock::estimateSize() const { return Function->getBinaryContext().computeCodeSize(begin(), end()); } +BinaryBasicBlock::BinaryBranchInfo & +BinaryBasicBlock::getBranchInfo(const BinaryBasicBlock &Succ) { + auto BI = 
branch_info_begin(); + for (auto BB : successors()) { + if (&Succ == BB) + return *BI; + ++BI; + } + + llvm_unreachable("Invalid successor"); + return *BI; +} + } // namespace bolt } // namespace llvm diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 49949c9263c0..52db09c8a8ed 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -363,15 +363,14 @@ class BinaryBasicBlock { return BranchInfo[Condition == true ? 0 : 1]; }; - BinaryBranchInfo &getBranchInfo(const BinaryBasicBlock &Succ) { - auto BI = branch_info_begin(); - for (auto BB : successors()) { - if (&Succ == BB) - return *BI; - ++BI; - } - llvm_unreachable("Invalid successor"); - return *BI; + BinaryBranchInfo &getBranchInfo(const BinaryBasicBlock &Succ); + + void setSuccessorBranchInfo(const BinaryBasicBlock &Succ, + uint64_t Count, + uint64_t MispredictedCount) { + auto &BI = getBranchInfo(Succ); + BI.Count = Count; + BI.MispredictedCount = MispredictedCount; } /// Try to compute the taken and misprediction frequencies for the given diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 82de83bd8e7b..d1f38138bb09 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -27,8 +27,6 @@ namespace opts { extern cl::OptionCategory BoltCategory; -extern cl::opt ReorderFunctions; - static cl::opt PrintDebugInfo("print-debug-info", cl::desc("print debug info when printing functions"), @@ -215,16 +213,14 @@ std::vector BinaryContext::getSortedFunctions( return &BFI.second; }); - if (opts::ReorderFunctions != BinaryFunction::RT_NONE) { - std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(), - [](const BinaryFunction *A, const BinaryFunction *B) { - if (A->hasValidIndex() && B->hasValidIndex()) { - return A->getIndex() < B->getIndex(); - } else { - return A->hasValidIndex(); - } - }); - } + std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(), + [](const BinaryFunction *A, const BinaryFunction *B) { + if (A->hasValidIndex() && 
B->hasValidIndex()) { + return A->getIndex() < B->getIndex(); + } else { + return A->hasValidIndex(); + } + }); return SortedFunctions; } diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 65804f0b5ab4..ad4909e9f013 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -169,6 +169,9 @@ class BinaryContext { /// Number of functions with profile information uint64_t NumProfiledFuncs{0}; + /// Total hotness score according to profiling data for this binary. + uint64_t TotalScore{0}; + /// Track next available address for new allocatable sections. RewriteInstance /// sets this prior to running BOLT passes, so layout passes are aware of the /// final addresses functions will have. diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 04615a4898ce..a6c75fea32b2 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -13,7 +13,6 @@ #include "BinaryBasicBlock.h" #include "BinaryFunction.h" #include "DataReader.h" -#include "Passes/MCF.h" #include "llvm/ADT/edit_distance.h" #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" @@ -52,7 +51,6 @@ extern cl::OptionCategory BoltRelocCategory; extern bool shouldProcess(const BinaryFunction &); extern cl::opt UpdateDebugSections; -extern cl::opt IndirectCallPromotion; extern cl::opt Verbosity; static cl::opt @@ -61,27 +59,6 @@ AlignBlocks("align-blocks", cl::ZeroOrMore, cl::cat(BoltOptCategory)); -static cl::opt -DoMCF("mcf", - cl::desc("solve a min cost flow problem on the CFG to fix edge counts " - "(default=disable)"), - cl::init(MCF_DISABLE), - cl::values( - clEnumValN(MCF_DISABLE, "none", - "disable MCF"), - clEnumValN(MCF_LINEAR, "linear", - "cost function is inversely proportional to edge count"), - clEnumValN(MCF_QUADRATIC, "quadratic", - "cost function is inversely proportional to edge count squared"), - clEnumValN(MCF_LOG, "log", - "cost function is inversely proportional to log of edge count"), - clEnumValN(MCF_BLAMEFTS, "blamefts", - "tune 
cost to blame fall-through edges for surplus flow"), - clEnumValEnd), - cl::ZeroOrMore, - cl::Hidden, - cl::cat(BoltOptCategory)); - static cl::opt DotToolTipCode("dot-tooltip-code", cl::desc("add basic block instructions as tool tips on nodes"), @@ -1185,21 +1162,13 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { } BC.MIA->replaceBranchTarget(Instruction, TargetSymbol, &*Ctx); - // Record call offset for profile matching. - if (IsCall) { - MIA->addAnnotation(Ctx.get(), Instruction, "Offset", Offset); - } - if (IsCondBranch) { - // Add fallthrough branch info. - FTBranches.emplace_back(Offset, Offset + Size); - if (IsCall) { - MIA->setConditionalTailCall(Instruction, TargetAddress); - } + // Mark CTC. + if (IsCondBranch && IsCall) { + MIA->setConditionalTailCall(Instruction, TargetAddress); } } else { // Could not evaluate branch. Should be an indirect call or an // indirect branch. Bail out on the latter case. - MIA->addAnnotation(Ctx.get(), Instruction, "Offset", Offset); if (MIA->isIndirectBranch(Instruction)) { auto Result = processIndirectBranch(Instruction, Size, Offset); switch (Result) { @@ -1255,6 +1224,9 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { findDebugLineInformationForInstructionAt(AbsoluteInstrAddr, ULT)); } + // Record offset of the instruction for profile matching. + MIA->addAnnotation(Ctx.get(), Instruction, "Offset", Offset); + if (MemData && !emptyRange(MemData->getMemInfoRange(Offset))) { MIA->addAnnotation(Ctx.get(), Instruction, "MemDataOffset", Offset); } @@ -1563,9 +1535,6 @@ bool BinaryFunction::buildCFG() { // e.g. exit(3), etc. Otherwise we'll see a false fall-through // blocks. - // Possibly assign/re-assign branch profile data. 
- matchProfileData(); - for (auto &Branch : TakenBranches) { DEBUG(dbgs() << "registering branch [0x" << Twine::utohexstr(Branch.first) << "] -> [0x" << Twine::utohexstr(Branch.second) << "]\n"); @@ -1574,124 +1543,15 @@ bool BinaryFunction::buildCFG() { auto *ToBB = getBasicBlockAtOffset(Branch.second); assert(ToBB && "cannot find BB containing TO branch"); - if (!BranchData) { - FromBB->addSuccessor(ToBB); - continue; - } - - auto BranchInfoOrErr = BranchData->getBranch(Branch.first, Branch.second); - if (!BranchInfoOrErr) { - FromBB->addSuccessor(ToBB); - continue; - } - - const BranchInfo &BInfo = BranchInfoOrErr.get(); - FromBB->addSuccessor(ToBB, BInfo.Branches, BInfo.Mispreds); - - // Populate profile counts for the jump table. - auto *LastInstr = FromBB->getLastNonPseudoInstr(); - if (!LastInstr) - continue; - auto JTAddress = BC.MIA->getJumpTable(*LastInstr); - if (!JTAddress) - continue; - auto *JT = getJumpTableContainingAddress(JTAddress); - if (!JT) - continue; - JT->Count += BInfo.Branches; - if (opts::IndirectCallPromotion < ICP_JUMP_TABLES && - opts::JumpTables < JTS_AGGRESSIVE) - continue; - if (JT->Counts.empty()) - JT->Counts.resize(JT->Entries.size()); - auto EI = JT->Entries.begin(); - auto Delta = (JTAddress - JT->Address) / JT->EntrySize; - EI += Delta; - while (EI != JT->Entries.end()) { - if (ToBB->getLabel() == *EI) { - assert(Delta < JT->Counts.size()); - JT->Counts[Delta].Mispreds += BInfo.Mispreds; - JT->Counts[Delta].Count += BInfo.Branches; - } - ++Delta; - ++EI; - // A label marks the start of another jump table. 
- if (JT->Labels.count(Delta * JT->EntrySize)) - break; - } + FromBB->addSuccessor(ToBB); } - for (auto &Branch : FTBranches) { - DEBUG(dbgs() << "registering fallthrough [0x" - << Twine::utohexstr(Branch.first) << "] -> [0x" - << Twine::utohexstr(Branch.second) << "]\n"); - auto *FromBB = getBasicBlockContainingOffset(Branch.first); - assert(FromBB && "cannot find BB containing FROM branch"); - // Try to find the destination basic block. If the jump instruction was - // followed by a no-op then the destination offset recorded in FTBranches - // will point to that no-op but the destination basic block will start - // after the no-op due to ignoring no-ops when creating basic blocks. - // So we have to skip any no-ops when trying to find the destination - // basic block. - auto *ToBB = getBasicBlockAtOffset(Branch.second); - if (ToBB == nullptr) { - auto I = Instructions.find(Branch.second), E = Instructions.end(); - while (ToBB == nullptr && I != E && MIA->isNoop(I->second)) { - ++I; - if (I == E) - break; - ToBB = getBasicBlockAtOffset(I->first); - } - if (ToBB == nullptr) { - // We have a fall-through that does not point to another BB, ignore it - // as it may happen in cases where we have a BB finished by two - // branches. - // This can also happen when we delete a branch past the end of a - // function in case of a call to __builtin_unreachable(). 
- continue; - } - } - - // Does not add a successor if we can't find profile data, leave it to the - // inference pass to guess its frequency - if (BranchData) { - auto BranchInfoOrErr = BranchData->getBranch(Branch.first, Branch.second); - if (BranchInfoOrErr) { - const BranchInfo &BInfo = BranchInfoOrErr.get(); - FromBB->addSuccessor(ToBB, BInfo.Branches, BInfo.Mispreds); - } - } - } - - if (BranchData) { - for (auto BB : BasicBlocks) { - auto *CTCInstr = BB->getLastNonPseudoInstr(); - if (!CTCInstr || !MIA->getConditionalTailCall(*CTCInstr)) - continue; - - auto OffsetOrErr = - MIA->tryGetAnnotationAs(*CTCInstr, "Offset"); - assert(OffsetOrErr && "offset not set for conditional tail call"); - - auto BranchInfoOrErr = BranchData->getDirectCallBranch(*OffsetOrErr); - if (!BranchInfoOrErr) - continue; - - MIA->addAnnotation(BC.Ctx.get(), *CTCInstr, "CTCTakenCount", - BranchInfoOrErr->Branches); - MIA->addAnnotation(BC.Ctx.get(), *CTCInstr, "CTCMispredCount", - BranchInfoOrErr->Mispreds); - } - } - - // Add fall-through branches (except for non-taken conditional branches with - // profile data, which were already accounted for in TakenBranches). + // Add fall-through branches. PrevBB = nullptr; bool IsPrevFT = false; // Is previous block a fall-through. for (auto BB : BasicBlocks) { if (IsPrevFT) { - PrevBB->addSuccessor(BB, BinaryBasicBlock::COUNT_NO_PROFILE, - BinaryBasicBlock::COUNT_INFERRED); + PrevBB->addSuccessor(BB); } if (BB->empty()) { IsPrevFT = true; @@ -1703,29 +1563,18 @@ bool BinaryFunction::buildCFG() { assert(LastInstr && "should have non-pseudo instruction in non-empty block"); - const auto IsCondTailCall = MIA->getConditionalTailCall(*LastInstr); if (BB->succ_size() == 0) { - if (IsCondTailCall) { - // Conditional tail call without profile data for non-taken branch. - IsPrevFT = true; - } else { - // Unless the last instruction is a terminator, control will fall - // through to the next basic block. 
- IsPrevFT = !MIA->isTerminator(*LastInstr); - } + // Since there's no existing successors, we know the last instruction is + // not a conditional branch. Thus if it's a terminator, it shouldn't be a + // fall-through. + // + // Conditional tail call is a special case since we don't add a taken + // branch successor for it. + IsPrevFT = !MIA->isTerminator(*LastInstr) || + MIA->getConditionalTailCall(*LastInstr); } else if (BB->succ_size() == 1) { - if (IsCondTailCall) { - // Conditional tail call with data for non-taken branch. A fall-through - // edge has already ben added in the CFG. - IsPrevFT = false; - } else { - // Fall-through should be added if the last instruction is a conditional - // jump, since there was no profile data for the non-taken branch. - IsPrevFT = MIA->isConditionalBranch(*LastInstr); - } + IsPrevFT = MIA->isConditionalBranch(*LastInstr); } else { - // Ends with 2 branches, with an indirect jump or it is a conditional - // branch whose frequency has been inferred from LBR. IsPrevFT = false; } @@ -1734,26 +1583,20 @@ bool BinaryFunction::buildCFG() { if (!IsPrevFT) { // Possibly a call that does not return. - DEBUG(dbgs() << "last block was marked as a fall-through\n"); + DEBUG(dbgs() << "last block was marked as a fall-through in " << *this + << '\n'); } + // Assign landing pads and throwers info. recomputeLandingPads(); - // Infer frequency for non-taken branches - if (hasValidProfile() && opts::DoMCF != MCF_DISABLE) { - // Convert COUNT_NO_PROFILE to 0 - removeTagsFromProfile(); - solveMCF(*this, opts::DoMCF); - } else if (hasValidProfile()) { - inferFallThroughCounts(); - } else { - clearProfile(); - } - // Assign CFI information to each BB entry. annotateCFIState(); - // Set the basic block layout to the original order. + // Annotate invoke instructions with GNU_args_size data. + propagateGnuArgsSizeInfo(); + + // Set the basic block layout to the original order and set end offsets. 
PrevBB = nullptr; for (auto BB : BasicBlocks) { BasicBlocksLayout.emplace_back(BB); @@ -1763,32 +1606,36 @@ bool BinaryFunction::buildCFG() { } PrevBB->setEndOffset(getSize()); - // Convert conditional tail call branches to conditional branches that jump - // to a tail call. - // TODO: make a separate pass - removeConditionalTailCalls(); + updateLayoutIndices(); - // Make any necessary adjustments for indirect branches. - if (!postProcessIndirectBranches()) { - if (opts::Verbosity) { - errs() << "BOLT-WARNING: failed to post-process indirect branches for " - << *this << '\n'; - } - // In relocation mode we want to keep processing the function but avoid - // optimizing it. - setSimple(false); - } + // Update the state. + CurrentState = State::CFG; - // Eliminate inconsistencies between branch instructions and CFG. - postProcessBranches(); + return true; +} - // If our profiling data comes from samples instead of LBR entries, - // now is the time to read this data and attach it to BBs. At this point, - // conditional tail calls are converted into a branch and a new basic block, - // making it slightly different than the original binary where profiled data - // was collected. However, this shouldn't matter for plain sampling events. - if (!BC.DR.hasLBR()) - readSampleData(); +void BinaryFunction::postProcessCFG() { + if (isSimple() && !BasicBlocks.empty()) { + // Convert conditional tail call branches to conditional branches that jump + // to a tail call. + removeConditionalTailCalls(); + + // Make any necessary adjustments for indirect branches. + if (!postProcessIndirectBranches()) { + if (opts::Verbosity) { + errs() << "BOLT-WARNING: failed to post-process indirect branches for " + << *this << '\n'; + } + // In relocation mode we want to keep processing the function but avoid + // optimizing it. + setSimple(false); + } else { + postProcessProfile(); + + // Eliminate inconsistencies between branch instructions and CFG. 
+ postProcessBranches(); + } + } // Clean-up memory taken by instructions and labels. // @@ -1797,19 +1644,20 @@ bool BinaryFunction::buildCFG() { clearList(Instructions); clearList(OffsetToCFI); clearList(TakenBranches); - clearList(FTBranches); clearList(IgnoredBranches); clearList(EntryOffsets); - // Update the state. - CurrentState = State::CFG; - - // Annotate invoke instructions with GNU_args_size data. - propagateGnuArgsSizeInfo(); - - assert(validateCFG() && "Invalid CFG detected after disassembly"); + // Remove "Offset" annotations from instructions that don't need those. + for (auto *BB : layout()) { + for (auto &Inst : *BB) { + if (BC.MIA->isCall(Inst) || BC.MIA->isIndirectBranch(Inst)) + continue; + BC.MIA->removeAnnotation(Inst, "Offset"); + } + } - return true; + assert((!isSimple() || validateCFG()) + && "Invalid CFG detected after post-processing CFG"); } void BinaryFunction::removeTagsFromProfile() { @@ -1826,57 +1674,6 @@ void BinaryFunction::removeTagsFromProfile() { } } -void BinaryFunction::readSampleData() { - auto SampleDataOrErr = BC.DR.getFuncSampleData(getNames()); - - if (!SampleDataOrErr) - return; - - // Non-LBR mode territory - // First step is to assign BB execution count based on samples from perf - ProfileMatchRatio = 1.0f; - removeTagsFromProfile(); - bool NormalizeByInsnCount = - BC.DR.usesEvent("cycles") || BC.DR.usesEvent("instructions"); - bool NormalizeByCalls = BC.DR.usesEvent("branches"); - static bool NagUser{true}; - if (NagUser) { - outs() << "BOLT-INFO: operating with non-LBR profiling data.\n"; - if (NormalizeByInsnCount) { - outs() << "BOLT-INFO: normalizing samples by instruction count.\n"; - } else if (NormalizeByCalls) { - outs() << "BOLT-INFO: normalizing samples by branches.\n"; - } - NagUser = false; - } - uint64_t LastOffset = getSize(); - uint64_t TotalEntryCount{0}; - for (auto I = BasicBlockOffsets.rbegin(), E = BasicBlockOffsets.rend(); - I != E; ++I) { - uint64_t CurOffset = I->first; - // Always work with 
samples multiplied by 1000 to avoid losing them if we - // later need to normalize numbers - uint64_t NumSamples = - SampleDataOrErr->getSamples(CurOffset, LastOffset) * 1000; - if (NormalizeByInsnCount && I->second->getNumNonPseudos()) - NumSamples /= I->second->getNumNonPseudos(); - else if (NormalizeByCalls) { - uint32_t NumCalls = I->second->getNumCalls(); - NumSamples /= NumCalls + 1; - } - I->second->setExecutionCount(NumSamples); - if (I->second->isEntryPoint()) - TotalEntryCount += NumSamples; - LastOffset = CurOffset; - } - ExecutionCount = TotalEntryCount; - - estimateEdgeCounts(BC, *this); - - if (opts::DoMCF != MCF_DISABLE) - solveMCF(*this, opts::DoMCF); -} - void BinaryFunction::addEntryPoint(uint64_t Address) { assert(containsAddress(Address) && "address does not belong to the function"); @@ -1930,377 +1727,7 @@ void BinaryFunction::addEntryPoint(uint64_t Address) { } } -bool BinaryFunction::fetchProfileForOtherEntryPoints() { - if (!BranchData) - return false; - - // Check if we are missing profiling data for secondary entry points - bool First{true}; - bool Updated{false}; - for (auto BB : BasicBlocks) { - if (First) { - First = false; - continue; - } - if (BB->isEntryPoint()) { - uint64_t EntryAddress = BB->getOffset() + getAddress(); - // Look for branch data associated with this entry point - std::vector Names; - std::multimap::iterator I, E; - for (std::tie(I, E) = BC.GlobalAddresses.equal_range(EntryAddress); - I != E; ++I) { - Names.push_back(I->second); - } - if (!Names.empty()) { - if (FuncBranchData *Data = BC.DR.getFuncBranchData(Names)) { - BranchData->appendFrom(*Data, BB->getOffset()); - Data->Used = true; - Updated = true; - } - } - } - } - return Updated; -} - -void BinaryFunction::matchProfileMemData() { - const auto AllMemData = BC.DR.getFuncMemDataRegex(getNames()); - for (auto *NewMemData : AllMemData) { - // Prevent functions from sharing the same profile. 
- if (NewMemData->Used) - continue; - - if (MemData) - MemData->Used = false; - - // Update function profile data with the new set. - MemData = NewMemData; - MemData->Used = true; - break; - } -} - -void BinaryFunction::matchProfileData() { - // This functionality is available for LBR-mode only - // TODO: Implement evaluateProfileData() for samples, checking whether - // sample addresses match instruction addresses in the function - if (!BC.DR.hasLBR()) - return; - - if (BranchData) { - ProfileMatchRatio = evaluateProfileData(*BranchData); - if (ProfileMatchRatio == 1.0f) { - if (fetchProfileForOtherEntryPoints()) { - ProfileMatchRatio = evaluateProfileData(*BranchData); - ExecutionCount = BranchData->ExecutionCount; - } - return; - } - } - - // Check if the function name can fluctuate between several compilations - // possibly triggered by minor unrelated code changes in the source code - // of the input binary. - const auto HasVolatileName = [this]() { - for (const auto Name : getNames()) { - if (getLTOCommonName(Name)) - return true; - } - return false; - }(); - if (!HasVolatileName) - return; - - // Check for a profile that matches with 100% confidence. - const auto AllBranchData = BC.DR.getFuncBranchDataRegex(getNames()); - for (auto *NewBranchData : AllBranchData) { - // Prevent functions from sharing the same profile. - if (NewBranchData->Used) - continue; - - if (evaluateProfileData(*NewBranchData) != 1.0f) - continue; - - if (BranchData) - BranchData->Used = false; - - // Update function profile data with the new set. - BranchData = NewBranchData; - ExecutionCount = NewBranchData->ExecutionCount; - ProfileMatchRatio = 1.0f; - BranchData->Used = true; - break; - } -} - -float BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { - // Until we define a minimal profile, we consider an empty branch data to be - // a valid profile. It could happen to a function without branches when we - // still have an EntryData for execution count. 
- if (BranchData.Data.empty()) { - return 1.0f; - } - - BranchListType ProfileBranches(BranchData.Data.size()); - std::transform(BranchData.Data.begin(), - BranchData.Data.end(), - ProfileBranches.begin(), - [](const BranchInfo &BI) { - return std::make_pair(BI.From.Offset, - BI.To.Name == BI.From.Name ? - BI.To.Offset : -1U); - }); - BranchListType LocalProfileBranches; - std::copy_if(ProfileBranches.begin(), - ProfileBranches.end(), - std::back_inserter(LocalProfileBranches), - [](const std::pair &Branch) { - return Branch.second != -1U; - }); - - // Profile referencing external functions. - BranchListType ExternProfileBranches; - std::copy_if(ProfileBranches.begin(), - ProfileBranches.end(), - std::back_inserter(ExternProfileBranches), - [](const std::pair &Branch) { - return Branch.second == -1U; - }); - - std::sort(LocalProfileBranches.begin(), LocalProfileBranches.end()); - - BranchListType FunctionBranches = TakenBranches; - FunctionBranches.insert(FunctionBranches.end(), - FTBranches.begin(), - FTBranches.end()); - FunctionBranches.insert(FunctionBranches.end(), - IgnoredBranches.begin(), - IgnoredBranches.end()); - std::sort(FunctionBranches.begin(), FunctionBranches.end()); - - BranchListType DiffBranches; // Branches in profile without a match. - std::set_difference(LocalProfileBranches.begin(), - LocalProfileBranches.end(), - FunctionBranches.begin(), - FunctionBranches.end(), - std::back_inserter(DiffBranches)); - - // Branches without a match in CFG. - BranchListType OrphanBranches; - - // Eliminate recursive calls and returns from recursive calls from the list - // of branches that have no match. They are not considered local branches. - auto isRecursiveBranch = [&](std::pair &Branch) { - auto SrcInstrI = Instructions.find(Branch.first); - if (SrcInstrI == Instructions.end()) - return false; - - // Check if it is a recursive call. 
- const auto &SrcInstr = SrcInstrI->second; - if ((BC.MIA->isCall(SrcInstr) || BC.MIA->isIndirectBranch(SrcInstr)) && - Branch.second == 0) - return true; - - auto DstInstrI = Instructions.find(Branch.second); - if (DstInstrI == Instructions.end()) - return false; - - // Check if it is a return from a recursive call. - bool IsSrcReturn = BC.MIA->isReturn(SrcInstr); - // "rep ret" is considered to be 2 different instructions. - if (!IsSrcReturn && BC.MIA->isPrefix(SrcInstr)) { - auto SrcInstrSuccessorI = SrcInstrI; - ++SrcInstrSuccessorI; - assert(SrcInstrSuccessorI != Instructions.end() && - "unexpected prefix instruction at the end of function"); - IsSrcReturn = BC.MIA->isReturn(SrcInstrSuccessorI->second); - } - if (IsSrcReturn && Branch.second != 0) { - // Make sure the destination follows the call instruction. - auto DstInstrPredecessorI = DstInstrI; - --DstInstrPredecessorI; - assert(DstInstrPredecessorI != Instructions.end() && "invalid iterator"); - if (BC.MIA->isCall(DstInstrPredecessorI->second)) - return true; - } - return false; - }; - std::remove_copy_if(DiffBranches.begin(), - DiffBranches.end(), - std::back_inserter(OrphanBranches), - isRecursiveBranch); - - // Check all external branches. - std::copy_if(ExternProfileBranches.begin(), - ExternProfileBranches.end(), - std::back_inserter(OrphanBranches), - [&](const std::pair &Branch) { - auto II = Instructions.find(Branch.first); - if (II == Instructions.end()) - return true; - const auto &Instr = II->second; - // Check for calls, tail calls, rets and indirect branches. - // When matching profiling info, we did not reach the stage - // when we identify tail calls, so they are still represented - // by regular branch instructions and we need isBranch() here. 
- if (BC.MIA->isCall(Instr) || - BC.MIA->isBranch(Instr) || - BC.MIA->isReturn(Instr)) - return false; - // Check for "rep ret" - if (BC.MIA->isPrefix(Instr)) { - ++II; - if (II != Instructions.end() && BC.MIA->isReturn(II->second)) - return false; - } - return true; - }); - - const float MatchRatio = - (float) (ProfileBranches.size() - OrphanBranches.size()) / - (float) ProfileBranches.size(); - - if (opts::Verbosity >= 2 && !OrphanBranches.empty()) { - errs() << "BOLT-WARNING: profile branches match only " - << format("%.1f%%", MatchRatio * 100.0f) << " (" - << (ProfileBranches.size() - OrphanBranches.size()) << '/' - << ProfileBranches.size() << ") for function " - << *this << '\n'; - DEBUG( - for (auto &OBranch : OrphanBranches) - errs() << "\t0x" << Twine::utohexstr(OBranch.first) << " -> 0x" - << Twine::utohexstr(OBranch.second) << " (0x" - << Twine::utohexstr(OBranch.first + getAddress()) << " -> 0x" - << Twine::utohexstr(OBranch.second + getAddress()) << ")\n"; - ); - } - - return MatchRatio; -} - -void BinaryFunction::clearProfile() { - // Keep function execution profile the same. Only clear basic block and edge - // counts. 
- for (auto *BB : BasicBlocks) { - BB->ExecutionCount = 0; - for (auto &BI : BB->branch_info()) { - BI.Count = 0; - BI.MispredictedCount = 0; - } - } -} - - -void BinaryFunction::inferFallThroughCounts() { - assert(!BasicBlocks.empty() && "basic block list should not be empty"); - assert(BranchData && "cannot infer counts without branch data"); - - // Compute preliminary execution count for each basic block - for (auto CurBB : BasicBlocks) { - CurBB->ExecutionCount = 0; - } - - for (auto CurBB : BasicBlocks) { - auto SuccBIIter = CurBB->branch_info_begin(); - for (auto Succ : CurBB->successors()) { - if (SuccBIIter->Count != BinaryBasicBlock::COUNT_NO_PROFILE) - Succ->setExecutionCount(Succ->getExecutionCount() + SuccBIIter->Count); - ++SuccBIIter; - } - } - - // Set entry BBs to zero, we'll update their execution count next with entry - // data (we maintain a separate data structure for branches to function entry - // points) - for (auto BB : BasicBlocks) { - if (BB->isEntryPoint()) - BB->ExecutionCount = 0; - } - - // Update execution counts of landing pad blocks and entry BBs - // There is a slight skew introduced here as branches originated from RETs - // may be accounted for in the execution count of an entry block if the last - // instruction in a predecessor fall-through block is a call. This situation - // should rarely happen because there are few multiple-entry functions. - for (const auto &I : BranchData->EntryData) { - BinaryBasicBlock *BB = getBasicBlockAtOffset(I.To.Offset); - if (BB && (BB->isEntryPoint() || BB->isLandingPad())) { - BB->setExecutionCount(BB->getExecutionCount() + I.Branches); - } - } - - // Work on a basic block at a time, propagating frequency information - // forwards. - // It is important to walk in the layout order. 
- for (auto BB : BasicBlocks) { - uint64_t BBExecCount = BB->getExecutionCount(); - - // Propagate this information to successors, filling in fall-through edges - // with frequency information - if (BB->succ_size() == 0) - continue; - - // Calculate frequency of outgoing branches from this node according to - // LBR data. - uint64_t ReportedBranches = 0; - for (const auto &SuccBI : BB->branch_info()) { - if (SuccBI.Count != BinaryBasicBlock::COUNT_NO_PROFILE) - ReportedBranches += SuccBI.Count; - } - - // Get taken count of conditional tail call if the block ends with one. - uint64_t CTCTakenCount = 0; - const auto CTCInstr = BB->getLastNonPseudoInstr(); - if (CTCInstr && BC.MIA->getConditionalTailCall(*CTCInstr)) { - CTCTakenCount = - BC.MIA->getAnnotationWithDefault(*CTCInstr, "CTCTakenCount"); - } - - // Calculate frequency of throws from this node according to LBR data - // for branching into associated landing pads. Since it is possible - // for a landing pad to be associated with more than one basic blocks, - // we may overestimate the frequency of throws for such blocks. - uint64_t ReportedThrows = 0; - for (const auto *LP: BB->landing_pads()) { - ReportedThrows += LP->getExecutionCount(); - } - - const uint64_t TotalReportedJumps = - ReportedBranches + CTCTakenCount + ReportedThrows; - - // Infer the frequency of the fall-through edge, representing not taking the - // branch. - uint64_t Inferred = 0; - if (BBExecCount > TotalReportedJumps) - Inferred = BBExecCount - TotalReportedJumps; - - DEBUG( - if (opts::Verbosity >= 1 && BBExecCount < TotalReportedJumps) - errs() - << "BOLT-WARNING: Fall-through inference is slightly inconsistent. " - "exec frequency is less than the outgoing edges frequency (" - << BBExecCount << " < " << ReportedBranches - << ") for BB at offset 0x" - << Twine::utohexstr(getAddress() + BB->getOffset()) << '\n'; - ); - - if (BB->succ_size() <= 2) { - // If there is an FT it will be the last successor. 
- auto &SuccBI = *BB->branch_info_rbegin(); - auto &Succ = *BB->succ_rbegin(); - if (SuccBI.Count == BinaryBasicBlock::COUNT_NO_PROFILE) { - SuccBI.Count = Inferred; - Succ->ExecutionCount += Inferred; - } - } - } - - return; -} - void BinaryFunction::removeConditionalTailCalls() { - CurrentState = State::CFG; - // Blocks to be appended at the end. std::vector> NewBlocks; @@ -2373,6 +1800,9 @@ void BinaryFunction::removeConditionalTailCalls() { // Swap edges as the TailCallBB corresponds to the taken branch. BB.swapConditionalSuccessors(); } + + // This branch is no longer a conditional tail call. + BC.MIA->unsetConditionalTailCall(*CTCInstr); } insertBasicBlocks(std::prev(end()), @@ -3068,11 +2498,12 @@ void BinaryFunction::fixBranches() { // terminator) or more than 2 (switch table) don't require branch // instruction adjustments. } - assert(validateCFG() && "Invalid CFG detected after fixing branches"); + assert((!isSimple() || validateCFG()) + && "Invalid CFG detected after fixing branches"); } void BinaryFunction::propagateGnuArgsSizeInfo() { - assert(CurrentState == State::CFG && "unexpected function state"); + assert(CurrentState == State::Disassembled && "unexpected function state"); if (!hasEHRanges() || !usesGnuArgsSize()) return; @@ -3145,68 +2576,6 @@ void BinaryFunction::postProcessBranches() { assert(validateCFG() && "invalid CFG"); } -void BinaryFunction::mergeProfileDataInto(BinaryFunction &BF) const { - // No reason to merge invalid or empty profiles into BF. - if (!hasValidProfile()) - return; - - // Update function execution count. - if (getExecutionCount() != BinaryFunction::COUNT_NO_PROFILE) { - BF.setExecutionCount(BF.getKnownExecutionCount() + getExecutionCount()); - } - - // Since we are merging a valid profile, the new profile should be valid too. - // It has either already been valid, or it has been cleaned up. - BF.ProfileMatchRatio = 1.0f; - - // Update basic block and edge counts. 
- auto BBMergeI = BF.begin(); - for (BinaryBasicBlock *BB : BasicBlocks) { - BinaryBasicBlock *BBMerge = &*BBMergeI; - assert(getIndex(BB) == BF.getIndex(BBMerge)); - - // Update basic block count. - if (BB->getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE) { - BBMerge->setExecutionCount( - BBMerge->getKnownExecutionCount() + BB->getExecutionCount()); - } - - // Update edge count for successors of this basic block. - auto BBMergeSI = BBMerge->succ_begin(); - auto BIMergeI = BBMerge->branch_info_begin(); - auto BII = BB->branch_info_begin(); - for (const auto *BBSucc : BB->successors()) { - (void)BBSucc; - assert(getIndex(BBSucc) == BF.getIndex(*BBMergeSI)); - - // At this point no branch count should be set to COUNT_NO_PROFILE. - assert(BII->Count != BinaryBasicBlock::COUNT_NO_PROFILE && - "unexpected unknown branch profile"); - assert(BIMergeI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && - "unexpected unknown branch profile"); - - BIMergeI->Count += BII->Count; - - // When we merge inferred and real fall-through branch data, the merged - // data is considered inferred. - if (BII->MispredictedCount != BinaryBasicBlock::COUNT_INFERRED && - BIMergeI->MispredictedCount != BinaryBasicBlock::COUNT_INFERRED) { - BIMergeI->MispredictedCount += BII->MispredictedCount; - } else { - BIMergeI->MispredictedCount = BinaryBasicBlock::COUNT_INFERRED; - } - - ++BBMergeSI; - ++BII; - ++BIMergeI; - } - assert(BBMergeSI == BBMerge->succ_end()); - - ++BBMergeI; - } - assert(BBMergeI == BF.end()); -} - BinaryFunction::BasicBlockOrderType BinaryFunction::dfs() const { BasicBlockOrderType DFS; unsigned Index = 0; @@ -4058,6 +3427,28 @@ DWARFAddressRangesVector BinaryFunction::translateInputToOutputRanges( return MergedRanges; } +MCInst *BinaryFunction::getInstructionAtOffset(uint64_t Offset) { + if (CurrentState == State::Disassembled) { + auto II = Instructions.find(Offset); + return (II == Instructions.end()) ? 
nullptr : &II->second; + } else if (CurrentState == State::CFG) { + auto *BB = getBasicBlockContainingOffset(Offset); + if (!BB) + return nullptr; + + for (auto &Inst : *BB) { + constexpr auto InvalidOffset = std::numeric_limits::max(); + if (Offset == BC.MIA->getAnnotationWithDefault(Inst, "Offset", + InvalidOffset)) + return &Inst; + } + + return nullptr; + } else { + llvm_unreachable("invalid CFG state to use getInstructionAtOffset()"); + } +} + DWARFDebugLoc::LocationList BinaryFunction::translateInputToOutputLocationList( const DWARFDebugLoc::LocationList &InputLL, uint64_t BaseAddress) const { @@ -4331,60 +3722,6 @@ DynoStats BinaryFunction::getDynoStats() const { return Stats; } -Optional, 16>> -BinaryFunction::getFallthroughsInTrace(uint64_t From, uint64_t To) const { - SmallVector, 16> Res; - - if (CurrentState != State::Disassembled) - return NoneType(); - - // Get iterators and validate trace start/end - auto FromIter = Instructions.find(From); - if (FromIter == Instructions.end()) - return NoneType(); - - auto ToIter = Instructions.find(To); - if (ToIter == Instructions.end()) - return NoneType(); - - // Trace needs to go forward - if (FromIter->first > ToIter->first) - return NoneType(); - - // Trace needs to finish in a branch - if (!BC.MIA->isBranch(ToIter->second) && !BC.MIA->isCall(ToIter->second) && - !BC.MIA->isReturn(ToIter->second)) { - // Check for "rep ret" - if (!BC.MIA->isPrefix(ToIter->second)) { - return NoneType(); - } else { - ++ToIter; - if (!BC.MIA->isReturn(ToIter->second)) - return NoneType(); - } - } - - // Analyze intermediate instructions - for (; FromIter != ToIter; ++FromIter) { - // This operates under an assumption that we collect all branches in LBR - // No unconditional branches in the middle of the trace - if (BC.MIA->isUnconditionalBranch(FromIter->second) || - BC.MIA->isReturn(FromIter->second) || - BC.MIA->isCall(FromIter->second)) - return NoneType(); - - if (!BC.MIA->isConditionalBranch(FromIter->second)) - continue; 
- - const uint64_t Src = FromIter->first; - auto Next = std::next(FromIter); - const uint64_t Dst = Next->first; - Res.push_back(std::make_pair(Src, Dst)); - } - - return Res; -} - void DynoStats::print(raw_ostream &OS, const DynoStats *Other) const { auto printStatWithDelta = [&](const std::string &Name, uint64_t Stat, uint64_t OtherStat) { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index e0157143c732..472890a6e327 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -172,7 +172,7 @@ class BinaryFunction { enum class State : char { Empty = 0, /// Function body is empty. Disassembled, /// Function have been disassembled. - CFG, /// Control flow graph have been built. + CFG, /// Control flow graph has been built. CFG_Finalized, /// CFG is finalized. No optimizations allowed. Emitted, /// Instructions have been emitted to output. }; @@ -186,16 +186,6 @@ class BinaryFunction { ST_ALL, /// Split all functions }; - enum ReorderType : char { - RT_NONE = 0, - RT_EXEC_COUNT, - RT_HFSORT, - RT_HFSORT_PLUS, - RT_PETTIS_HANSEN, - RT_RANDOM, - RT_USER - }; - /// Branch statistics for jump table entries. struct JumpInfo { uint64_t Mispreds{0}; @@ -447,7 +437,6 @@ class BinaryFunction { using BranchListType = std::vector>; BranchListType TakenBranches; /// All local taken branches. - BranchListType FTBranches; /// All fall-through branches. BranchListType IgnoredBranches; /// Branches ignored by CFG purposes. /// Map offset in the function to a label. @@ -754,13 +743,8 @@ class BinaryFunction { } /// Return instruction at a given offset in the function. Valid before - /// CFG is constructed. - MCInst *getInstructionAtOffset(uint64_t Offset) { - assert(CurrentState == State::Disassembled && - "can only call function in Disassembled state"); - auto II = Instructions.find(Offset); - return (II == Instructions.end()) ? nullptr : &II->second; - } + /// CFG is constructed or while instruction offsets are available in CFG. 
+ MCInst *getInstructionAtOffset(uint64_t Offset); /// Analyze and process indirect branch \p Instruction before it is /// added to Instructions list. @@ -1480,6 +1464,13 @@ class BinaryFunction { ProfileMatchRatio == 1.0f; } + /// Mark this function as having a valid profile. + void markProfiled() { + if (ExecutionCount == COUNT_NO_PROFILE) + ExecutionCount = 0; + ProfileMatchRatio = 1.0f; + } + void addCFIInstruction(uint64_t Offset, MCCFIInstruction &&Inst) { assert(!Instructions.empty()); @@ -1809,6 +1800,12 @@ class BinaryFunction { /// State::CFG. Returns false if CFG cannot be built. bool buildCFG(); + /// Read any kind of profile information available for the function. + void readProfile(); + + /// Perform post-processing of the CFG. + void postProcessCFG(); + /// Verify that any assumptions we've made about indirect branches were /// correct and also make any necessary changes to unknown indirect branches. /// @@ -2022,9 +2019,41 @@ class BinaryFunction { return UnitLineTable; } - /// Scan from - to offsets for conditional jumps + /// Update function execution profile with a recorded trace. + /// A trace is region of code executed between two LBR entries supplied in + /// execution order. + /// + /// Return true if the trace is valid, false otherwise. + bool recordTrace( + const LBREntry &First, + const LBREntry &Second, + uint64_t Count = 1, + SmallVector, 16> *Branches = nullptr); + + /// Update function profile with a taken branch. + /// \p Count could be 0 if verification of the branch is required. + /// + /// Return true if the branch is valid, false otherwise. + bool recordBranch(uint64_t From, uint64_t To, uint64_t Count = 1, + uint64_t Mispreds = 0); + + /// Record external entry into the function. + /// + /// Return true if the entry is valid, false otherwise. + bool recordEntry(uint64_t To, bool Mispred, uint64_t Count = 1); + + /// Record exit from a function via a call or return. 
+ /// + /// Return true if the exit point is valid, false otherwise. + bool recordExit(uint64_t From, bool Mispred, uint64_t Count = 1); + + /// Finalize profile for the function. + void postProcessProfile(); + + /// Return a vector of offsets corresponding to a trace in a function + /// (see recordTrace() above). Optional, 16>> - getFallthroughsInTrace(uint64_t From, uint64_t To) const; + getFallthroughsInTrace(const LBREntry &First, const LBREntry &Second); /// Returns an estimate of the function's hot part after splitting. /// This is a very rough estimate, as with C++ exceptions there are @@ -2181,6 +2210,13 @@ inline raw_ostream &operator<<(raw_ostream &OS, return OS; } +inline raw_ostream &operator<<(raw_ostream &OS, + const LBREntry &LBR) { + OS << "0x" << Twine::utohexstr(LBR.From) + << " -> 0x" << Twine::utohexstr(LBR.To); + return OS; +} + } // namespace bolt diff --git a/bolt/BinaryFunctionProfile.cpp b/bolt/BinaryFunctionProfile.cpp new file mode 100644 index 000000000000..66bf634ef6e9 --- /dev/null +++ b/bolt/BinaryFunctionProfile.cpp @@ -0,0 +1,854 @@ +//===--- BinaryFunctionProfile.cpp --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + + +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" +#include "DataReader.h" +#include "Passes/MCF.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt-prof" + +using namespace llvm; +using namespace bolt; + +namespace opts { + +extern cl::OptionCategory AggregatorCategory; +extern cl::OptionCategory BoltOptCategory; + +extern cl::opt Verbosity; +extern cl::opt IndirectCallPromotion; +extern cl::opt JumpTables; + +static cl::opt +CompatMode("prof-compat-mode", + cl::desc("maintain bug-level compatibility with old profile"), + cl::init(true), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +DoMCF("mcf", + cl::desc("solve a min cost flow problem on the CFG to fix edge counts " + "(default=disable)"), + cl::init(MCF_DISABLE), + cl::values( + clEnumValN(MCF_DISABLE, "none", + "disable MCF"), + clEnumValN(MCF_LINEAR, "linear", + "cost function is inversely proportional to edge count"), + clEnumValN(MCF_QUADRATIC, "quadratic", + "cost function is inversely proportional to edge count squared"), + clEnumValN(MCF_LOG, "log", + "cost function is inversely proportional to log of edge count"), + clEnumValN(MCF_BLAMEFTS, "blamefts", + "tune cost to blame fall-through edges for surplus flow"), + clEnumValEnd), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +FixFuncCounts("fix-func-counts", + cl::desc("adjust function counts based on basic blocks execution count"), + cl::init(false), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +} // namespace opts + +namespace llvm { +namespace bolt { + +bool BinaryFunction::recordTrace( + const LBREntry &First, + const LBREntry &Second, + uint64_t Count, + SmallVector, 16> 
*Branches) { + if (!isSimple()) + return false; + + assert(CurrentState == State::CFG && "can only record traces in CFG state"); + + // Offsets of the trace within this function. + const auto From = First.To - getAddress(); + const auto To = Second.From - getAddress(); + + if (From > To) + return false; + + auto *FromBB = getBasicBlockContainingOffset(From); + auto *ToBB = getBasicBlockContainingOffset(To); + + if (!FromBB || !ToBB) + return false; + + // Fill out information for fall-through edges. The From and To could be + // within the same basic block, e.g. when two call instructions are in the + // same block. In this case we skip the processing. + if (FromBB == ToBB) { + if (opts::CompatMode) + return true; + + // If the previous block ended with a call, the destination of a return + // would be in ToBB basic block. And if the ToBB starts with a control + // transfer instruction, we will have a 0-length trace that we have to + // account for as a fall-through edge. + if (To == ToBB->getOffset()) { + // External entry point. + if (ToBB->isEntryPoint() || ToBB->isLandingPad()) + return true; + + // Check that the origin LBR of a trace starts in another function. + // Otherwise it's an internal branch that was accounted for. + if (containsAddress(First.From)) + return true; + + auto *PrevBB = BasicBlocksLayout[ToBB->getIndex() - 1]; + + // This could be a bad trace. + if (!PrevBB->getSuccessor(ToBB->getLabel())) { + DEBUG(dbgs() << "invalid LBR sequence:\n" + << " " << First << '\n' + << " " << Second << '\n'); + return false; + } + + auto &BI = PrevBB->getBranchInfo(*ToBB); + BI.Count += Count; + if (Branches) { + const auto *Instr = PrevBB->getLastNonPseudoInstr(); + const auto Offset = + BC.MIA->getAnnotationWithDefault(*Instr, "Offset"); + Branches->push_back(std::make_pair(Offset, ToBB->getOffset())); + } + } + + return true; + } + + // Process blocks in the original layout order. 
+ auto *BB = BasicBlocksLayout[FromBB->getIndex()]; + assert(BB == FromBB && "index mismatch"); + while (BB != ToBB) { + auto *NextBB = BasicBlocksLayout[BB->getIndex() + 1]; + assert((NextBB && NextBB->getOffset() > BB->getOffset()) && "bad layout"); + + // Check for bad LBRs. + if (!BB->getSuccessor(NextBB->getLabel())) { + DEBUG(dbgs() << "no fall-through for the trace:\n" + << " " << First << '\n' + << " " << Second << '\n'); + return false; + } + + // To keep backwards compatibility we skip recording fall-throughs that + // are not a result of a conditional jump. + if (!opts::CompatMode || + (BB->succ_size() == 2 && + BB->getConditionalSuccessor(false) == NextBB)) { + auto &BI = BB->getBranchInfo(*NextBB); + BI.Count += Count; + + if (Branches) { + const auto *Instr = BB->getLastNonPseudoInstr(); + // Note: real offset for conditional jump instruction shouldn't be 0. + const auto Offset = + BC.MIA->getAnnotationWithDefault(*Instr, "Offset"); + if (Offset) { + Branches->push_back(std::make_pair(Offset, NextBB->getOffset())); + } + } + } + + BB = NextBB; + } + + return true; +} + +bool BinaryFunction::recordBranch(uint64_t From, uint64_t To, + uint64_t Count, uint64_t Mispreds) { + auto *FromBB = getBasicBlockContainingOffset(From); + auto *ToBB = getBasicBlockContainingOffset(To); + + if (!FromBB || !ToBB) { + DEBUG(dbgs() << "failed to get block for recorded branch\n"); + return false; + } + + // Could be bad LBR data. Ignore, or report as a bad profile for backwards + // compatibility. + if (From == To) { + if (!opts::CompatMode) + return true; + auto *Instr = getInstructionAtOffset(0); + if (Instr && BC.MIA->isCall(*Instr)) + return true; + return false; + } + + if (FromBB->succ_size() == 0) { + // Return from a tail call. + return true; + } + + // Very rarely we will see ignored branches. Do a linear check. 
+ for (auto &Branch : IgnoredBranches) { + if (Branch == std::make_pair(static_cast(From), + static_cast(To))) + return true; + } + + if (To != ToBB->getOffset()) { + // "To" could be referring to nop instructions in between 2 basic blocks. + // While building the CFG we make sure these nops are attributed to the + // previous basic block, thus we check if the destination belongs to the + // gap past the last instruction. + const auto *LastInstr = ToBB->getLastNonPseudoInstr(); + if (LastInstr) { + const auto LastInstrOffset = + BC.MIA->getAnnotationWithDefault(*LastInstr, "Offset"); + + // With old .fdata we are getting FT branches for "jcc,jmp" sequences. + if (To == LastInstrOffset && BC.MIA->isUnconditionalBranch(*LastInstr)) { + return true; + } + + if (To <= LastInstrOffset) { + DEBUG(dbgs() << "branch recorded into the middle of the block" << " in " + << *this << " : " << From << " -> " << To << '\n'); + return false; + } + } + + // The real destination is the layout successor of the detected ToBB. + if (ToBB == BasicBlocksLayout.back()) + return false; + auto *NextBB = BasicBlocksLayout[ToBB->getIndex() + 1]; + assert((NextBB && NextBB->getOffset() > ToBB->getOffset()) && "bad layout"); + ToBB = NextBB; + } + + // If there's no corresponding instruction for 'From', we have probably + // discarded it as a FT from __builtin_unreachable. + auto *FromInstruction = getInstructionAtOffset(From); + if (!FromInstruction) { + DEBUG(dbgs() << "no instruction for offset " << From << " in " + << *this << '\n'); + return false; + } + + if (FromBB == ToBB) { + // Check for a return from a recursive call. + // Otherwise it's a simple loop. + } + + if (!FromBB->getSuccessor(ToBB->getLabel())) { + // Check if this is a recursive call or a return from a recursive call. + if (ToBB->isEntryPoint()) { + // Execution count is already accounted for. 
+ return true; + } + + DEBUG(dbgs() << "invalid branch in " << *this << '\n' + << Twine::utohexstr(From) << " -> " + << Twine::utohexstr(To) << '\n'); + return false; + } + + auto &BI = FromBB->getBranchInfo(*ToBB); + BI.Count += Count; + // Only update mispredicted count if it the count was real. + if (Count) { + BI.MispredictedCount += Mispreds; + } + + return true; +} + +bool BinaryFunction::recordEntry(uint64_t To, bool Mispred, uint64_t Count) { + if (To > getSize()) + return false; + + if (!hasProfile()) + ExecutionCount = 0; + + if (To == 0) + ExecutionCount += Count; + + return true; +} + +bool BinaryFunction::recordExit(uint64_t From, bool Mispred, uint64_t Count) { + if (!isSimple()) + return false; + assert(From <= getSize() && "wrong From address"); + + if (!hasProfile()) + ExecutionCount = 0; + + return true; +} + +void BinaryFunction::postProcessProfile() { + if (!hasValidProfile()) { + clearProfile(); + return; + } + + // Check if MCF post-processing was requested. + if (opts::DoMCF != MCF_DISABLE) { + removeTagsFromProfile(); + solveMCF(*this, opts::DoMCF); + return; + } + + // Is we are using non-LBR sampling there's nothing left to do. + if (!BranchData) + return; + + // Bug compatibility with previous version - double accounting for conditional + // jump into a fall-through block. + if (opts::CompatMode) { + for (auto *BB : BasicBlocks) { + if (BB->succ_size() == 2 && + BB->getConditionalSuccessor(false) == + BB->getConditionalSuccessor(true)) { + auto &TakenBI = *BB->branch_info_begin(); + auto &FallThroughBI = *BB->branch_info_rbegin(); + FallThroughBI.Count = TakenBI.Count; + FallThroughBI.MispredictedCount = 0; + } + } + } + + // Pre-sort branch data. + std::stable_sort(BranchData->Data.begin(), BranchData->Data.end()); + + // If we have at least some branch data for the function indicate that it + // was executed. 
+ if (opts::FixFuncCounts && ExecutionCount == 0) { + ExecutionCount = 1; + } + + // Compute preliminary execution count for each basic block + for (auto *BB : BasicBlocks) { + BB->ExecutionCount = 0; + } + for (auto *BB : BasicBlocks) { + auto SuccBIIter = BB->branch_info_begin(); + for (auto Succ : BB->successors()) { + if (SuccBIIter->Count != BinaryBasicBlock::COUNT_NO_PROFILE) + Succ->setExecutionCount(Succ->getExecutionCount() + SuccBIIter->Count); + ++SuccBIIter; + } + } + + // Set entry BBs to zero, we'll update their execution count next with entry + // data (we maintain a separate data structure for branches to function entry + // points) + for (auto *BB : BasicBlocks) { + if (BB->isEntryPoint()) + BB->ExecutionCount = 0; + } + + // Update execution counts of landing pad blocks and entry BBs + // There is a slight skew introduced here as branches originated from RETs + // may be accounted for in the execution count of an entry block if the last + // instruction in a predecessor fall-through block is a call. This situation + // should rarely happen because there are few multiple-entry functions. + for (const auto &I : BranchData->EntryData) { + BinaryBasicBlock *BB = getBasicBlockAtOffset(I.To.Offset); + if (BB && (BB->isEntryPoint() || BB->isLandingPad())) { + BB->setExecutionCount(BB->getExecutionCount() + I.Branches); + } + } + + inferFallThroughCounts(); + + // Update profile information for jump tables based on CFG branch data. 
+ for (auto *BB : BasicBlocks) { + const auto *LastInstr = BB->getLastNonPseudoInstr(); + if (!LastInstr) + continue; + const auto JTAddress = BC.MIA->getJumpTable(*LastInstr); + if (!JTAddress) + continue; + auto *JT = getJumpTableContainingAddress(JTAddress); + if (!JT) + continue; + + uint64_t TotalBranchCount = 0; + for (const auto &BranchInfo : BB->branch_info()) { + TotalBranchCount += BranchInfo.Count; + } + JT->Count += TotalBranchCount; + + if (opts::IndirectCallPromotion < ICP_JUMP_TABLES && + opts::JumpTables < JTS_AGGRESSIVE) + continue; + + if (JT->Counts.empty()) + JT->Counts.resize(JT->Entries.size()); + auto EI = JT->Entries.begin(); + auto Delta = (JTAddress - JT->Address) / JT->EntrySize; + EI += Delta; + while (EI != JT->Entries.end()) { + const auto *TargetBB = getBasicBlockForLabel(*EI); + if (TargetBB) { + const auto &BranchInfo = BB->getBranchInfo(*TargetBB); + assert(Delta < JT->Counts.size()); + JT->Counts[Delta].Count += BranchInfo.Count; + JT->Counts[Delta].Mispreds += BranchInfo.MispredictedCount; + } + ++Delta; + ++EI; + // A label marks the start of another jump table. + if (JT->Labels.count(Delta * JT->EntrySize)) + break; + } + } +} + +Optional, 16>> +BinaryFunction::getFallthroughsInTrace(const LBREntry &First, + const LBREntry &Second) { + SmallVector, 16> Res; + + if (!recordTrace(First, Second, 1, &Res)) + return NoneType(); + + return Res; +} + +void BinaryFunction::readProfile() { + if (empty()) + return; + + if (!BC.DR.hasLBR()) { + readSampleData(); + return; + } + + // Possibly assign/re-assign branch profile data. 
+ matchProfileData(); + + if (!BranchData) + return; + + uint64_t MismatchedBranches = 0; + for (const auto &BI : BranchData->Data) { + if (BI.From.Name != BI.To.Name) { + continue; + } + + if (!recordBranch(BI.From.Offset, BI.To.Offset, + BI.Branches, BI.Mispreds)) { + DEBUG(dbgs() << "bad branch : " << BI.From.Offset << " -> " + << BI.To.Offset << '\n'); + ++MismatchedBranches; + } + } + + // Special profile data propagation is required for conditional tail calls. + for (auto BB : BasicBlocks) { + auto *CTCInstr = BB->getLastNonPseudoInstr(); + if (!CTCInstr || !BC.MIA->getConditionalTailCall(*CTCInstr)) + continue; + + auto OffsetOrErr = + BC.MIA->tryGetAnnotationAs(*CTCInstr, "Offset"); + assert(OffsetOrErr && "offset not set for conditional tail call"); + + auto BranchInfoOrErr = BranchData->getDirectCallBranch(*OffsetOrErr); + if (!BranchInfoOrErr) + continue; + + BC.MIA->addAnnotation(BC.Ctx.get(), *CTCInstr, "CTCTakenCount", + BranchInfoOrErr->Branches); + BC.MIA->addAnnotation(BC.Ctx.get(), *CTCInstr, "CTCMispredCount", + BranchInfoOrErr->Mispreds); + } +} + +void BinaryFunction::mergeProfileDataInto(BinaryFunction &BF) const { + // No reason to merge invalid or empty profiles into BF. + if (!hasValidProfile()) + return; + + // Update function execution count. + if (getExecutionCount() != BinaryFunction::COUNT_NO_PROFILE) { + BF.setExecutionCount(BF.getKnownExecutionCount() + getExecutionCount()); + } + + // Since we are merging a valid profile, the new profile should be valid too. + // It has either already been valid, or it has been cleaned up. + BF.ProfileMatchRatio = 1.0f; + + // Update basic block and edge counts. + auto BBMergeI = BF.begin(); + for (BinaryBasicBlock *BB : BasicBlocks) { + BinaryBasicBlock *BBMerge = &*BBMergeI; + assert(getIndex(BB) == BF.getIndex(BBMerge)); + + // Update basic block count. 
+ if (BB->getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE) { + BBMerge->setExecutionCount( + BBMerge->getKnownExecutionCount() + BB->getExecutionCount()); + } + + // Update edge count for successors of this basic block. + auto BBMergeSI = BBMerge->succ_begin(); + auto BIMergeI = BBMerge->branch_info_begin(); + auto BII = BB->branch_info_begin(); + for (const auto *BBSucc : BB->successors()) { + (void)BBSucc; + assert(getIndex(BBSucc) == BF.getIndex(*BBMergeSI)); + + // At this point no branch count should be set to COUNT_NO_PROFILE. + assert(BII->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + "unexpected unknown branch profile"); + assert(BIMergeI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + "unexpected unknown branch profile"); + + BIMergeI->Count += BII->Count; + + // When we merge inferred and real fall-through branch data, the merged + // data is considered inferred. + if (BII->MispredictedCount != BinaryBasicBlock::COUNT_INFERRED && + BIMergeI->MispredictedCount != BinaryBasicBlock::COUNT_INFERRED) { + BIMergeI->MispredictedCount += BII->MispredictedCount; + } else { + BIMergeI->MispredictedCount = BinaryBasicBlock::COUNT_INFERRED; + } + + ++BBMergeSI; + ++BII; + ++BIMergeI; + } + assert(BBMergeSI == BBMerge->succ_end()); + + ++BBMergeI; + } + assert(BBMergeI == BF.end()); +} + +void BinaryFunction::readSampleData() { + auto SampleDataOrErr = BC.DR.getFuncSampleData(getNames()); + + if (!SampleDataOrErr) + return; + + // Non-LBR mode territory + // First step is to assign BB execution count based on samples from perf + ProfileMatchRatio = 1.0f; + removeTagsFromProfile(); + bool NormalizeByInsnCount = + BC.DR.usesEvent("cycles") || BC.DR.usesEvent("instructions"); + bool NormalizeByCalls = BC.DR.usesEvent("branches"); + static bool NagUser{true}; + if (NagUser) { + outs() << "BOLT-INFO: operating with non-LBR profiling data.\n"; + if (NormalizeByInsnCount) { + outs() << "BOLT-INFO: normalizing samples by instruction count.\n"; + } else if 
(NormalizeByCalls) { + outs() << "BOLT-INFO: normalizing samples by branches.\n"; + } + NagUser = false; + } + uint64_t LastOffset = getSize(); + uint64_t TotalEntryCount{0}; + for (auto I = BasicBlockOffsets.rbegin(), E = BasicBlockOffsets.rend(); + I != E; ++I) { + uint64_t CurOffset = I->first; + // Always work with samples multiplied by 1000 to avoid losing them if we + // later need to normalize numbers + uint64_t NumSamples = + SampleDataOrErr->getSamples(CurOffset, LastOffset) * 1000; + if (NormalizeByInsnCount && I->second->getNumNonPseudos()) + NumSamples /= I->second->getNumNonPseudos(); + else if (NormalizeByCalls) { + uint32_t NumCalls = I->second->getNumCalls(); + NumSamples /= NumCalls + 1; + } + I->second->setExecutionCount(NumSamples); + if (I->second->isEntryPoint()) + TotalEntryCount += NumSamples; + LastOffset = CurOffset; + } + ExecutionCount = TotalEntryCount; + + estimateEdgeCounts(BC, *this); + + if (opts::DoMCF != MCF_DISABLE) + solveMCF(*this, opts::DoMCF); +} + +void BinaryFunction::inferFallThroughCounts() { + // Work on a basic block at a time, propagating frequency information + // forwards. + // It is important to walk in the layout order. + for (auto *BB : BasicBlocks) { + const uint64_t BBExecCount = BB->getExecutionCount(); + + // Propagate this information to successors, filling in fall-through edges + // with frequency information + if (BB->succ_size() == 0) + continue; + + // Calculate frequency of outgoing branches from this node according to + // LBR data. + uint64_t ReportedBranches = 0; + for (const auto &SuccBI : BB->branch_info()) { + if (SuccBI.Count != BinaryBasicBlock::COUNT_NO_PROFILE) + ReportedBranches += SuccBI.Count; + } + + // Get taken count of conditional tail call if the block ends with one. 
+ uint64_t CTCTakenCount = 0; + const auto CTCInstr = BB->getLastNonPseudoInstr(); + if (CTCInstr && BC.MIA->getConditionalTailCall(*CTCInstr)) { + CTCTakenCount = + BC.MIA->getAnnotationWithDefault(*CTCInstr, "CTCTakenCount"); + } + + // Calculate frequency of throws from this node according to LBR data + // for branching into associated landing pads. Since it is possible + // for a landing pad to be associated with more than one basic block, + // we may overestimate the frequency of throws for such blocks. + uint64_t ReportedThrows = 0; + for (const auto *LP: BB->landing_pads()) { + ReportedThrows += LP->getExecutionCount(); + } + + const uint64_t TotalReportedJumps = + ReportedBranches + CTCTakenCount + ReportedThrows; + + // Infer the frequency of the fall-through edge, representing not taking the + // branch. + uint64_t Inferred = 0; + if (BBExecCount > TotalReportedJumps) + Inferred = BBExecCount - TotalReportedJumps; + + DEBUG( + if (BBExecCount < TotalReportedJumps) + dbgs() + << "Fall-through inference is slightly inconsistent. " + "exec frequency is less than the outgoing edges frequency (" + << BBExecCount << " < " << ReportedBranches + << ") for BB at offset 0x" + << Twine::utohexstr(getAddress() + BB->getOffset()) << '\n'; + ); + + if (BB->succ_size() <= 2) { + // Skip if the last instruction is an unconditional jump. + const auto *LastInstr = BB->getLastNonPseudoInstr(); + if (LastInstr && + (BC.MIA->isUnconditionalBranch(*LastInstr) || + BC.MIA->isIndirectBranch(*LastInstr))) + continue; + // If there is an FT it will be the last successor. 
+ auto &SuccBI = *BB->branch_info_rbegin(); + auto &Succ = *BB->succ_rbegin(); + if (SuccBI.Count == 0) { + SuccBI.Count = Inferred; + SuccBI.MispredictedCount = BinaryBasicBlock::COUNT_INFERRED; + Succ->ExecutionCount += Inferred; + } + } + } + + return; +} + +bool BinaryFunction::fetchProfileForOtherEntryPoints() { + if (!BranchData) + return false; + + // Check if we are missing profiling data for secondary entry points + bool First{true}; + bool Updated{false}; + for (auto BB : BasicBlocks) { + if (First) { + First = false; + continue; + } + if (BB->isEntryPoint()) { + uint64_t EntryAddress = BB->getOffset() + getAddress(); + // Look for branch data associated with this entry point + std::vector Names; + std::multimap::iterator I, E; + for (std::tie(I, E) = BC.GlobalAddresses.equal_range(EntryAddress); + I != E; ++I) { + Names.push_back(I->second); + } + if (!Names.empty()) { + if (FuncBranchData *Data = BC.DR.getFuncBranchData(Names)) { + BranchData->appendFrom(*Data, BB->getOffset()); + Data->Used = true; + Updated = true; + } + } + } + } + return Updated; +} + +void BinaryFunction::matchProfileMemData() { + const auto AllMemData = BC.DR.getFuncMemDataRegex(getNames()); + for (auto *NewMemData : AllMemData) { + // Prevent functions from sharing the same profile. + if (NewMemData->Used) + continue; + + if (MemData) + MemData->Used = false; + + // Update function profile data with the new set. 
+ MemData = NewMemData; + MemData->Used = true; + break; + } +} + +void BinaryFunction::matchProfileData() { + // This functionality is available for LBR-mode only + // TODO: Implement evaluateProfileData() for samples, checking whether + // sample addresses match instruction addresses in the function + if (!BC.DR.hasLBR()) + return; + + if (BranchData) { + ProfileMatchRatio = evaluateProfileData(*BranchData); + if (ProfileMatchRatio == 1.0f) { + if (fetchProfileForOtherEntryPoints()) { + ProfileMatchRatio = evaluateProfileData(*BranchData); + ExecutionCount = BranchData->ExecutionCount; + } + return; + } + } + + // Check if the function name can fluctuate between several compilations + // possibly triggered by minor unrelated code changes in the source code + // of the input binary. + const auto HasVolatileName = [this]() { + for (const auto Name : getNames()) { + if (getLTOCommonName(Name)) + return true; + } + return false; + }(); + if (!HasVolatileName) + return; + + // Check for a profile that matches with 100% confidence. + const auto AllBranchData = BC.DR.getFuncBranchDataRegex(getNames()); + for (auto *NewBranchData : AllBranchData) { + // Prevent functions from sharing the same profile. + if (NewBranchData->Used) + continue; + + if (evaluateProfileData(*NewBranchData) != 1.0f) + continue; + + if (BranchData) + BranchData->Used = false; + + // Update function profile data with the new set. + BranchData = NewBranchData; + ExecutionCount = NewBranchData->ExecutionCount; + ProfileMatchRatio = 1.0f; + BranchData->Used = true; + break; + } +} + +float BinaryFunction::evaluateProfileData(const FuncBranchData &BranchData) { + // Until we define a minimal profile, we consider an empty branch data to be + // a valid profile. It could happen to a function without branches when we + // still have an EntryData for execution count. 
+ if (BranchData.Data.empty()) { + return 1.0f; + } + + uint64_t NumMatchedBranches = 0; + for (const auto &BI : BranchData.Data) { + bool IsValid = false; + if (BI.From.Name == BI.To.Name) { + // Try to record information with 0 count. + IsValid = recordBranch(BI.From.Offset, BI.To.Offset, 0); + } else { + // The branch has to originate from this function. + // Check for calls, tail calls, rets and indirect branches. + // When matching profiling info, we did not reach the stage + // when we identify tail calls, so they are still represented + // by regular branch instructions and we need isBranch() here. + auto *Instr = getInstructionAtOffset(BI.From.Offset); + // If it's a prefix - skip it. + if (Instr && BC.MIA->isPrefix(*Instr)) + Instr = getInstructionAtOffset(BI.From.Offset + 1); + if (Instr && + (BC.MIA->isCall(*Instr) || + BC.MIA->isBranch(*Instr) || + BC.MIA->isReturn(*Instr))) { + IsValid = true; + } + } + + if (IsValid) { + ++NumMatchedBranches; + continue; + } + + DEBUG(dbgs() + << "\tinvalid branch in " << *this << " : 0x" + << Twine::utohexstr(BI.From.Offset) << " -> "; + if (BI.From.Name == BI.To.Name) + dbgs() << "0x" << Twine::utohexstr(BI.To.Offset) << '\n'; + else + dbgs() << "\n"; + ); + } + + const auto MatchRatio = (float) NumMatchedBranches / BranchData.Data.size(); + if (opts::Verbosity >= 2 && NumMatchedBranches < BranchData.Data.size()) { + errs() << "BOLT-WARNING: profile branches match only " + << format("%.1f%%", MatchRatio * 100.0f) << " (" + << NumMatchedBranches << '/' << BranchData.Data.size() + << ") for function " << *this << '\n'; + } + + return MatchRatio; +} + +void BinaryFunction::clearProfile() { + // Keep function execution profile the same. Only clear basic block and edge + // counts. 
+ for (auto *BB : BasicBlocks) { + BB->ExecutionCount = 0; + for (auto &BI : BB->branch_info()) { + BI.Count = 0; + BI.MispredictedCount = 0; + } + } +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index facc0b5ddee0..687c10497765 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -345,7 +345,7 @@ void BinaryFunctionPassManager::runAllPasses( // order they're registered. // Run this pass first to use stats for the original functions. - Manager.registerPass(llvm::make_unique(NeverPrint)); + Manager.registerPass(llvm::make_unique(NeverPrint)); Manager.registerPass(llvm::make_unique(NeverPrint), opts::StripRepRet); diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index dd8a44975134..959b19915f10 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -63,6 +63,7 @@ add_llvm_tool(llvm-bolt BinaryBasicBlock.cpp BinaryContext.cpp BinaryFunction.cpp + BinaryFunctionProfile.cpp BinaryPassManager.cpp CacheMetrics.cpp DataAggregator.cpp diff --git a/bolt/DataAggregator.cpp b/bolt/DataAggregator.cpp index dbce1e4ec465..a964c73069e4 100644 --- a/bolt/DataAggregator.cpp +++ b/bolt/DataAggregator.cpp @@ -414,6 +414,14 @@ bool DataAggregator::aggregate(BinaryContext &BC, outs() << "PERF2BOLT: Failed to parse branch events\n"; } + // Mark all functions with registered events as having a valid profile. + for (auto &BFI : BFs) { + auto &BF = BFI.second; + if (BF.getBranchData()) { + BF.markProfiled(); + } + } + auto PI3 = sys::Wait(MemEventsPI, 0, true, &Error); if (PI3.ReturnCode != 0) { @@ -423,7 +431,8 @@ bool DataAggregator::aggregate(BinaryContext &BC, deleteTempFiles(); - Regex NoData("Samples for '.*' event do not have ADDR attribute set. Cannot print 'addr' field."); + Regex NoData("Samples for '.*' event do not have ADDR attribute set. 
" + "Cannot print 'addr' field."); if (!NoData.match(ErrBuf)) { errs() << "PERF-ERROR: Return code " << PI3.ReturnCode << "\n"; errs() << ErrBuf; @@ -450,7 +459,7 @@ bool DataAggregator::aggregate(BinaryContext &BC, } deleteTempFiles(); - + return true; } @@ -467,8 +476,8 @@ DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) { return &FI->second; } -bool DataAggregator::doIntraBranch(BinaryFunction *Func, uint64_t From, - uint64_t To, bool Mispred) { +bool +DataAggregator::doIntraBranch(BinaryFunction *Func, const LBREntry &Branch) { FuncBranchData *AggrData = Func->getBranchData(); if (!AggrData) { AggrData = &FuncsToBranches[Func->getNames()[0]]; @@ -476,19 +485,21 @@ bool DataAggregator::doIntraBranch(BinaryFunction *Func, uint64_t From, Func->setBranchData(AggrData); } - From -= Func->getAddress(); - To -= Func->getAddress(); - AggrData->bumpBranchCount(From, To, Mispred); + AggrData->bumpBranchCount(Branch.From - Func->getAddress(), + Branch.To - Func->getAddress(), + Branch.Mispred); return true; } bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, - BinaryFunction *ToFunc, uint64_t From, - uint64_t To, bool Mispred) { + BinaryFunction *ToFunc, + const LBREntry &Branch) { FuncBranchData *FromAggrData{nullptr}; FuncBranchData *ToAggrData{nullptr}; StringRef SrcFunc; StringRef DstFunc; + auto From = Branch.From; + auto To = Branch.To; if (FromFunc) { SrcFunc = FromFunc->getNames()[0]; FromAggrData = FromFunc->getBranchData(); @@ -498,6 +509,8 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, FromFunc->setBranchData(FromAggrData); } From -= FromFunc->getAddress(); + + FromFunc->recordExit(From, Branch.Mispred); } if (ToFunc) { DstFunc = ToFunc->getNames()[0]; @@ -508,32 +521,39 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, ToFunc->setBranchData(ToAggrData); } To -= ToFunc->getAddress(); + + ToFunc->recordEntry(To, Branch.Mispred); } if (FromAggrData) FromAggrData->bumpCallCount(From, 
Location(!DstFunc.empty(), DstFunc, To), - Mispred); + Branch.Mispred); if (ToAggrData) ToAggrData->bumpEntryCount(Location(!SrcFunc.empty(), SrcFunc, From), To, - Mispred); + Branch.Mispred); return true; } -bool DataAggregator::doBranch(uint64_t From, uint64_t To, bool Mispred) { - auto *FromFunc = getBinaryFunctionContainingAddress(From); - auto *ToFunc = getBinaryFunctionContainingAddress(To); +bool DataAggregator::doBranch(const LBREntry &Branch) { + auto *FromFunc = getBinaryFunctionContainingAddress(Branch.From); + auto *ToFunc = getBinaryFunctionContainingAddress(Branch.To); if (!FromFunc && !ToFunc) return false; - if (FromFunc == ToFunc) - return doIntraBranch(FromFunc, From, To, Mispred); + if (FromFunc == ToFunc) { + FromFunc->recordBranch(Branch.From - FromFunc->getAddress(), + Branch.To - FromFunc->getAddress(), + 1, + Branch.Mispred); + return doIntraBranch(FromFunc, Branch); + } - return doInterBranch(FromFunc, ToFunc, From, To, Mispred); + return doInterBranch(FromFunc, ToFunc, Branch); } -bool DataAggregator::doTrace(uint64_t From, uint64_t To) { - auto *FromFunc = getBinaryFunctionContainingAddress(From); - auto *ToFunc = getBinaryFunctionContainingAddress(To); +bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second) { + auto *FromFunc = getBinaryFunctionContainingAddress(First.To); + auto *ToFunc = getBinaryFunctionContainingAddress(Second.From); if (!FromFunc || !ToFunc) { ++NumLongRangeTraces; return false; @@ -541,26 +561,25 @@ bool DataAggregator::doTrace(uint64_t From, uint64_t To) { if (FromFunc != ToFunc) { ++NumInvalidTraces; DEBUG(dbgs() << "Trace starting in " << FromFunc->getPrintName() << " @ " - << Twine::utohexstr(From - FromFunc->getAddress()) + << Twine::utohexstr(First.To - FromFunc->getAddress()) << " and ending in " << ToFunc->getPrintName() << " @ " << ToFunc->getPrintName() << " @ " - << Twine::utohexstr(To - ToFunc->getAddress()) << "\n"); + << Twine::utohexstr(Second.From - ToFunc->getAddress()) + << 
'\n'); return false; } - if (FromFunc) { - From -= FromFunc->getAddress(); - To -= ToFunc->getAddress(); - } - auto FTs = FromFunc->getFallthroughsInTrace(From, To); + auto FTs = FromFunc->getFallthroughsInTrace(First, Second); if (!FTs) { ++NumInvalidTraces; return false; } for (const auto &Pair : *FTs) { - doIntraBranch(FromFunc, Pair.first + FromFunc->getAddress(), - Pair.second + FromFunc->getAddress(), false); + doIntraBranch(FromFunc, + LBREntry{Pair.first + FromFunc->getAddress(), + Pair.second + FromFunc->getAddress(), + false}); } return true; @@ -710,7 +729,8 @@ bool DataAggregator::hasData() { std::error_code DataAggregator::parseBranchEvents() { outs() << "PERF2BOLT: Aggregating branch events...\n"; - NamedRegionTimer T("Branch samples parsing", TimerGroupName, opts::TimeAggregator); + NamedRegionTimer T("Branch samples parsing", TimerGroupName, + opts::TimeAggregator); uint64_t NumEntries{0}; uint64_t NumSamples{0}; uint64_t NumTraces{0}; @@ -727,14 +747,16 @@ std::error_code DataAggregator::parseBranchEvents() { NumEntries += Sample.LBR.size(); // Parser semantic actions - uint64_t Last{0}; + // LBRs are stored in reverse execution order. NextLBR refers to next + // executed branch record. 
+ const LBREntry *NextLBR{nullptr}; for (const auto &LBR : Sample.LBR) { - if (Last) { - doTrace(LBR.To, Last); + if (NextLBR) { + doTrace(LBR, *NextLBR); ++NumTraces; } - doBranch(LBR.From, LBR.To, LBR.Mispred); - Last = LBR.From; + doBranch(LBR); + NextLBR = &LBR; } } outs() << "PERF2BOLT: Read " << NumSamples << " samples and " diff --git a/bolt/DataAggregator.h b/bolt/DataAggregator.h index 6dcac3f7daed..7c1b575be664 100644 --- a/bolt/DataAggregator.h +++ b/bolt/DataAggregator.h @@ -28,12 +28,6 @@ namespace bolt { class BinaryFunction; class BinaryContext; -struct LBREntry { - uint64_t From; - uint64_t To; - bool Mispred; -}; - struct PerfBranchSample { SmallVector LBR; }; @@ -125,24 +119,19 @@ class DataAggregator : public DataReader { BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address); /// Semantic actions - parser hooks to interpret parsed perf samples - /// Register an intraprocedural branch in \p Func with offsets \p From and - /// \p To (relative to \p Func start address). - bool doIntraBranch(BinaryFunction *Func, uint64_t From, uint64_t To, - bool Mispred); + /// Register an intraprocedural branch \p Branch. + bool doIntraBranch(BinaryFunction *Func, const LBREntry &Branch); /// Register an interprocedural branch from \p FromFunc to \p ToFunc with /// offsets \p From and \p To, respectively. bool doInterBranch(BinaryFunction *FromFunc, BinaryFunction *ToFunc, - uint64_t From, uint64_t To, bool Mispred); + const LBREntry &Branch); - /// Register a branch with raw addresses \p From and \p To extracted from the - /// LBR - bool doBranch(uint64_t From, uint64_t To, bool Mispred); + /// Register a \p Branch. + bool doBranch(const LBREntry &Branch); - /// Register a trace starting in raw address \p From and ending in \p To - /// This will add all intermediate conditional branches in this trace as not - /// taken. - bool doTrace(uint64_t From, uint64_t To); + /// Register a trace between two LBR entries supplied in execution order. 
+ bool doTrace(const LBREntry &First, const LBREntry &Second); /// Parser helpers /// Return false if we exhausted our parser buffer and finished parsing diff --git a/bolt/DataReader.h b/bolt/DataReader.h index b3ba0999a932..474b1aa3b304 100644 --- a/bolt/DataReader.h +++ b/bolt/DataReader.h @@ -31,6 +31,12 @@ namespace llvm { namespace bolt { +struct LBREntry { + uint64_t From; + uint64_t To; + bool Mispred; +}; + /// LTO-generated function names take a form: /// /// .lto_priv./... diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 3e98e6300831..ddcd87974dac 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -48,6 +48,7 @@ const char* dynoStatsOptDesc(const bolt::DynoStats::Category C) { namespace opts { +extern cl::OptionCategory BoltCategory; extern cl::OptionCategory BoltOptCategory; extern cl::opt Verbosity; @@ -88,6 +89,33 @@ MinBranchClusters("min-branch-clusters", cl::Hidden, cl::cat(BoltOptCategory)); +enum PeepholeOpts : char { + PEEP_NONE = 0x0, + PEEP_SHORTEN = 0x1, + PEEP_DOUBLE_JUMPS = 0x2, + PEEP_TAILCALL_TRAPS = 0x4, + PEEP_USELESS_BRANCHES = 0x8, + PEEP_ALL = 0xf +}; + +static cl::list +Peepholes("peepholes", + cl::CommaSeparated, + cl::desc("enable peephole optimizations"), + cl::value_desc("opt1,opt2,opt3,..."), + cl::values( + clEnumValN(PEEP_NONE, "none", "disable peepholes"), + clEnumValN(PEEP_SHORTEN, "shorten", "perform instruction shortening"), + clEnumValN(PEEP_DOUBLE_JUMPS, "double-jumps", + "remove double jumps when able"), + clEnumValN(PEEP_TAILCALL_TRAPS, "tailcall-traps", "insert tail call traps"), + clEnumValN(PEEP_USELESS_BRANCHES, "useless-branches", + "remove useless conditional branches"), + clEnumValN(PEEP_ALL, "all", "enable all peephole optimizations"), + clEnumValEnd), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + static cl::opt PrintFuncStat("print-function-statistics", cl::desc("print statistics about basic block ordering"), @@ -140,6 +168,14 @@ 
ReorderBlocks("reorder-blocks", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +static cl::opt +ReportStaleFuncs("report-stale", + cl::desc("print the list of functions with stale profile"), + cl::init(false), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + enum SctcModes : char { SctcAlways, SctcPreserveDirection, @@ -178,32 +214,14 @@ TSPThreshold("tsp-threshold", cl::Hidden, cl::cat(BoltOptCategory)); -enum PeepholeOpts : char { - PEEP_NONE = 0x0, - PEEP_SHORTEN = 0x1, - PEEP_DOUBLE_JUMPS = 0x2, - PEEP_TAILCALL_TRAPS = 0x4, - PEEP_USELESS_BRANCHES = 0x8, - PEEP_ALL = 0xf -}; - -static cl::list -Peepholes("peepholes", - cl::CommaSeparated, - cl::desc("enable peephole optimizations"), - cl::value_desc("opt1,opt2,opt3,..."), - cl::values( - clEnumValN(PEEP_NONE, "none", "disable peepholes"), - clEnumValN(PEEP_SHORTEN, "shorten", "perform instruction shortening"), - clEnumValN(PEEP_DOUBLE_JUMPS, "double-jumps", - "remove double jumps when able"), - clEnumValN(PEEP_TAILCALL_TRAPS, "tailcall-traps", "insert tail call traps"), - clEnumValN(PEEP_USELESS_BRANCHES, "useless-branches", - "remove useless conditional branches"), - clEnumValN(PEEP_ALL, "all", "enable all peephole optimizations"), - clEnumValEnd), +static cl::opt +TopCalledLimit("top-called-limit", + cl::desc("maximum number of functions to print in top called " + "functions section"), + cl::init(100), cl::ZeroOrMore, - cl::cat(BoltOptCategory)); + cl::Hidden, + cl::cat(BoltCategory)); } // namespace opts @@ -861,6 +879,7 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, MIA->setConditionalTailCall(*CondBranch); // Add info abount the conditional tail call frequency, otherwise this // info will be lost when we delete the associated BranchInfo entry + BC.MIA->removeAnnotation(*CondBranch, "CTCTakenCount"); BC.MIA->addAnnotation(BC.Ctx.get(), *CondBranch, "CTCTakenCount", CTCTakenFreq); @@ -1315,11 +1334,93 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC, } } 
-void PrintSortedBy::runOnFunctions( - BinaryContext &, - std::map &BFs, - std::set & -) { +void +PrintProgramStats::runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &) { + uint64_t NumSimpleFunctions{0}; + uint64_t NumStaleProfileFunctions{0}; + std::vector ProfiledFunctions; + const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; + for (auto &BFI : BFs) { + auto &Function = BFI.second; + if (!Function.isSimple()) + continue; + ++NumSimpleFunctions; + if (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE) + continue; + if (Function.hasValidProfile()) + ProfiledFunctions.push_back(&Function); + else { + if (opts::ReportStaleFuncs) { + outs() << StaleFuncsHeader; + StaleFuncsHeader = ""; + outs() << " " << Function << '\n'; + } + ++NumStaleProfileFunctions; + } + } + BC.NumProfiledFuncs = ProfiledFunctions.size(); + + const auto NumAllProfiledFunctions = + ProfiledFunctions.size() + NumStaleProfileFunctions; + outs() << "BOLT-INFO: " + << NumAllProfiledFunctions + << " functions out of " << NumSimpleFunctions << " simple functions (" + << format("%.1f", NumAllProfiledFunctions / + (float) NumSimpleFunctions * 100.0f) + << "%) have non-empty execution profile.\n"; + if (NumStaleProfileFunctions) { + outs() << "BOLT-INFO: " << NumStaleProfileFunctions + << format(" (%.1f%% of all profiled)", + NumStaleProfileFunctions / + (float) NumAllProfiledFunctions * 100.0f) + << " function" << (NumStaleProfileFunctions == 1 ? "" : "s") + << " have invalid (possibly stale) profile.\n"; + } + + // Profile is marked as 'Used' if it either matches a function name + // exactly or if it 100% matches any of functions with matching common + // LTO names. 
+ auto getUnusedObjects = [&]() -> Optional> { + std::vector UnusedObjects; + for (const auto &Func : BC.DR.getAllFuncsData()) { + if (!Func.getValue().Used) { + UnusedObjects.emplace_back(Func.getKey()); + } + } + if (UnusedObjects.empty()) + return NoneType(); + return UnusedObjects; + }; + + if (const auto UnusedObjects = getUnusedObjects()) { + outs() << "BOLT-INFO: profile for " << UnusedObjects->size() + << " objects was ignored\n"; + if (opts::Verbosity >= 1) { + for (auto Name : *UnusedObjects) { + outs() << " " << Name << '\n'; + } + } + } + + if (ProfiledFunctions.size() > 10) { + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: top called functions are:\n"; + std::sort(ProfiledFunctions.begin(), ProfiledFunctions.end(), + [](BinaryFunction *A, BinaryFunction *B) { + return B->getExecutionCount() < A->getExecutionCount(); + } + ); + auto SFI = ProfiledFunctions.begin(); + auto SFIend = ProfiledFunctions.end(); + for (auto i = 0u; i < opts::TopCalledLimit && SFI != SFIend; ++SFI, ++i) { + outs() << " " << **SFI << " : " + << (*SFI)->getExecutionCount() << '\n'; + } + } + } + if (!opts::PrintSortedBy.empty() && std::find(opts::PrintSortedBy.begin(), opts::PrintSortedBy.end(), diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index 3316afd2c808..ea7376f7997b 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -382,17 +382,15 @@ class IdenticalCodeFolding : public BinaryFunctionPass { std::set &LargeFunctions) override; }; -/// /// Prints a list of the top 100 functions sorted by a set of /// dyno stats categories. 
-/// -class PrintSortedBy : public BinaryFunctionPass { +class PrintProgramStats : public BinaryFunctionPass { public: - explicit PrintSortedBy(const cl::opt &PrintPass) + explicit PrintProgramStats(const cl::opt &PrintPass) : BinaryFunctionPass(PrintPass) { } const char *getName() const override { - return "print-sorted-by"; + return "print-stats"; } bool shouldPrint(const BinaryFunction &) const override { return false; diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/Passes/IndirectCallPromotion.cpp index fc0792c7c760..f76564355c7b 100644 --- a/bolt/Passes/IndirectCallPromotion.cpp +++ b/bolt/Passes/IndirectCallPromotion.cpp @@ -1295,7 +1295,8 @@ void IndirectCallPromotion::runOnFunctions( if (BC.MIA->isCall(Inst) && BC.MIA->getTargetSymbol(Inst, 0)) continue; - assert(BC.MIA->isCall(Inst) || BC.MIA->isIndirectBranch(Inst)); + assert((BC.MIA->isCall(Inst) || BC.MIA->isIndirectBranch(Inst)) + && "expected a call or an indirect jump instruction"); if (IsJumpTable) ++TotalJumpTableCallsites; diff --git a/bolt/Passes/ReorderFunctions.cpp b/bolt/Passes/ReorderFunctions.cpp index f7d4ffa55dd1..3535588773d5 100644 --- a/bolt/Passes/ReorderFunctions.cpp +++ b/bolt/Passes/ReorderFunctions.cpp @@ -27,29 +27,29 @@ extern cl::opt RandomSeed; extern bool shouldProcess(const bolt::BinaryFunction &Function); extern size_t padFunction(const bolt::BinaryFunction &Function); -cl::opt +cl::opt ReorderFunctions("reorder-functions", cl::desc("reorder and cluster functions (works only with relocations)"), - cl::init(bolt::BinaryFunction::RT_NONE), - cl::values(clEnumValN(bolt::BinaryFunction::RT_NONE, + cl::init(bolt::ReorderFunctions::RT_NONE), + cl::values(clEnumValN(bolt::ReorderFunctions::RT_NONE, "none", "do not reorder functions"), - clEnumValN(bolt::BinaryFunction::RT_EXEC_COUNT, + clEnumValN(bolt::ReorderFunctions::RT_EXEC_COUNT, "exec-count", "order by execution count"), - clEnumValN(bolt::BinaryFunction::RT_HFSORT, + clEnumValN(bolt::ReorderFunctions::RT_HFSORT, 
"hfsort", "use hfsort algorithm"), - clEnumValN(bolt::BinaryFunction::RT_HFSORT_PLUS, + clEnumValN(bolt::ReorderFunctions::RT_HFSORT_PLUS, "hfsort+", "use hfsort+ algorithm"), - clEnumValN(bolt::BinaryFunction::RT_PETTIS_HANSEN, + clEnumValN(bolt::ReorderFunctions::RT_PETTIS_HANSEN, "pettis-hansen", "use Pettis-Hansen algorithm"), - clEnumValN(bolt::BinaryFunction::RT_RANDOM, + clEnumValN(bolt::ReorderFunctions::RT_RANDOM, "random", "reorder functions randomly"), - clEnumValN(bolt::BinaryFunction::RT_USER, + clEnumValN(bolt::ReorderFunctions::RT_USER, "user", "use function order specified by -function-order"), clEnumValEnd), @@ -142,7 +142,7 @@ void ReorderFunctions::reorder(std::vector &&Clusters, } } - if (opts::ReorderFunctions == BinaryFunction::RT_NONE) + if (opts::ReorderFunctions == RT_NONE) return; if (opts::Verbosity == 0) { @@ -280,15 +280,15 @@ std::vector readFunctionOrderFile() { void ReorderFunctions::runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) { - if (!BC.HasRelocations && opts::ReorderFunctions != BinaryFunction::RT_NONE) { + if (!BC.HasRelocations && opts::ReorderFunctions != RT_NONE) { errs() << "BOLT-ERROR: Function reordering only works when " << "relocs are enabled.\n"; exit(1); } - if (opts::ReorderFunctions != BinaryFunction::RT_NONE && - opts::ReorderFunctions != BinaryFunction::RT_EXEC_COUNT && - opts::ReorderFunctions != BinaryFunction::RT_USER) { + if (opts::ReorderFunctions != RT_NONE && + opts::ReorderFunctions != RT_EXEC_COUNT && + opts::ReorderFunctions != RT_USER) { Cg = buildCallGraph(BC, BFs, [this](const BinaryFunction &BF) { @@ -306,9 +306,9 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC, std::vector Clusters; switch(opts::ReorderFunctions) { - case BinaryFunction::RT_NONE: + case RT_NONE: break; - case BinaryFunction::RT_EXEC_COUNT: + case RT_EXEC_COUNT: { std::vector SortedFunctions(BFs.size()); uint32_t Index = 0; @@ -340,20 +340,20 @@ void 
ReorderFunctions::runOnFunctions(BinaryContext &BC, } } break; - case BinaryFunction::RT_HFSORT: + case RT_HFSORT: Clusters = clusterize(Cg); break; - case BinaryFunction::RT_HFSORT_PLUS: + case RT_HFSORT_PLUS: Clusters = hfsortPlus(Cg, opts::UseGainCache); break; - case BinaryFunction::RT_PETTIS_HANSEN: + case RT_PETTIS_HANSEN: Clusters = pettisAndHansen(Cg); break; - case BinaryFunction::RT_RANDOM: + case RT_RANDOM: std::srand(opts::RandomSeed); Clusters = randomClusters(Cg); break; - case BinaryFunction::RT_USER: + case RT_USER: { uint32_t Index = 0; for (const auto &Function : readFunctionOrderFile()) { @@ -394,7 +394,8 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC, if (!BF->hasValidIndex()) { BF->setIndex(Index++); } else if (opts::Verbosity > 0) { - errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function << ".\n"; + errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function + << ".\n"; } } } diff --git a/bolt/Passes/ReorderFunctions.h b/bolt/Passes/ReorderFunctions.h index 57f804ae2290..1b1c58021dfd 100644 --- a/bolt/Passes/ReorderFunctions.h +++ b/bolt/Passes/ReorderFunctions.h @@ -24,7 +24,17 @@ class ReorderFunctions : public BinaryFunctionPass { void reorder(std::vector &&Clusters, std::map &BFs); - public: +public: + enum ReorderType : char { + RT_NONE = 0, + RT_EXEC_COUNT, + RT_HFSORT, + RT_HFSORT_PLUS, + RT_PETTIS_HANSEN, + RT_RANDOM, + RT_USER + }; + explicit ReorderFunctions(const cl::opt &PrintPass) : BinaryFunctionPass(PrintPass) { } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 733a52a2eb40..22ffe9f273c0 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -217,14 +217,6 @@ RelocationMode("relocs", cl::ZeroOrMore, cl::cat(BoltCategory)); -static cl::opt -ReportStaleFuncs("report-stale", - cl::desc("print a list of functions with a stale profile"), - cl::init(false), - cl::ZeroOrMore, - cl::Hidden, - cl::cat(BoltCategory)); - static cl::list SkipFunctionNames("skip-funcs", 
cl::CommaSeparated, @@ -255,15 +247,6 @@ SplitFunctions("split-functions", cl::ZeroOrMore, cl::cat(BoltOptCategory)); -static cl::opt -TopCalledLimit("top-called-limit", - cl::desc("maximum number of functions to print in top called " - "functions section"), - cl::init(100), - cl::ZeroOrMore, - cl::Hidden, - cl::cat(BoltCategory)); - cl::opt TrapOldCode("trap-old-code", cl::desc("insert traps in old function bodies (relocation mode)"), @@ -572,7 +555,8 @@ createBinaryContext(ELFObjectFileBase *File, DataReader &DR, std::unique_ptr MII(TheTarget->createMCInstrInfo()); if (!MII) { - errs() << "BOLT-ERROR: no instruction info for target " << TripleName << "\n"; + errs() << "BOLT-ERROR: no instruction info for target " << TripleName + << "\n"; return nullptr; } @@ -666,19 +650,6 @@ void RewriteInstance::reset() { FailedAddresses.clear(); RangesSectionsWriter.reset(); LocationListWriter.reset(); - TotalScore = 0; -} - -void RewriteInstance::aggregateData() { - NamedRegionTimer T("aggregate data", TimerGroupName, opts::TimeRewrite); - DA.aggregate(*BC.get(), BinaryFunctions); - - if (!opts::AggregateOnly) - return; - - if (std::error_code EC = DA.writeAggregatedFile()) { - check_error(EC, "cannot create output data file"); - } } void RewriteInstance::discoverStorage() { @@ -901,13 +872,11 @@ void RewriteInstance::run() { readSpecialSections(); discoverFileObjects(); readDebugInfo(); - readProfileData(); disassembleFunctions(); - if (DA.started()) { - aggregateData(); - if (opts::AggregateOnly) - return; - } + readProfileData(); + if (opts::AggregateOnly) + return; + postProcessFunctions(); for (uint64_t Address : NonSimpleFunctions) { auto FI = BinaryFunctions.find(Address); assert(FI != BinaryFunctions.end() && "bad non-simple function address"); @@ -1930,30 +1899,44 @@ void RewriteInstance::readDebugInfo() { } void RewriteInstance::readProfileData() { - NamedRegionTimer T("read profile data", TimerGroupName, opts::TimeRewrite); - if (BC->DR.getAllFuncsData().empty()) + 
if (DA.started()) { + NamedRegionTimer T("aggregate data", TimerGroupName, opts::TimeRewrite); + DA.aggregate(*BC.get(), BinaryFunctions); + + if (opts::AggregateOnly) { + if (std::error_code EC = DA.writeAggregatedFile()) { + check_error(EC, "cannot create output data file"); + } + } return; + } + + NamedRegionTimer T("read profile data", TimerGroupName, opts::TimeRewrite); + // Preliminary match profile data to functions. + if (!BC->DR.getAllFuncsData().empty()) { + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + if (auto *MemData = BC->DR.getFuncMemData(Function.getNames())) { + Function.MemData = MemData; + MemData->Used = true; + } + if (auto *FuncData = BC->DR.getFuncBranchData(Function.getNames())) { + Function.BranchData = FuncData; + Function.ExecutionCount = FuncData->ExecutionCount; + FuncData->Used = true; + } + } + } for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; - if (auto *MemData = BC->DR.getFuncMemData(Function.getNames())) { - Function.MemData = MemData; - MemData->Used = true; - } - if (auto *FuncData = BC->DR.getFuncBranchData(Function.getNames())) { - Function.BranchData = FuncData; - Function.ExecutionCount = FuncData->ExecutionCount; - FuncData->Used = true; - } + Function.readProfile(); } } void RewriteInstance::disassembleFunctions() { NamedRegionTimer T("disassemble functions", TimerGroupName, opts::TimeRewrite); - // Disassemble every function and build it's control flow graph. - TotalScore = 0; - BC->SumExecutionCount = 0; for (auto &BFI : BinaryFunctions) { BinaryFunction &Function = BFI.second; @@ -1965,7 +1948,6 @@ void RewriteInstance::disassembleFunctions() { } auto FunctionData = BC->getFunctionData(Function); - if (!FunctionData) { // When could it happen? errs() << "BOLT-ERROR: corresponding section is non-executable or " @@ -1980,7 +1962,7 @@ void RewriteInstance::disassembleFunctions() { } // Offset of the function in the file. 
- auto *FileBegin = + const auto *FileBegin = reinterpret_cast(InputFile->getData().data()); Function.setFileOffset(FunctionData->begin() - FileBegin); @@ -2049,9 +2031,6 @@ void RewriteInstance::disassembleFunctions() { } BC->InterproceduralReferences.clear(); - if (opts::AggregateOnly) - continue; - // Fill in CFI information for this function if (Function.isSimple()) { if (!CFIRdWrt->fillCFIInfoFor(Function)) { @@ -2071,6 +2050,23 @@ void RewriteInstance::disassembleFunctions() { if (!Function.buildCFG()) continue; + if (opts::PrintAll) + Function.print(outs(), "while building cfg", true); + + } // Iterate over all functions +} + +void RewriteInstance::postProcessFunctions() { + BC->TotalScore = 0; + BC->SumExecutionCount = 0; + for (auto &BFI : BinaryFunctions) { + BinaryFunction &Function = BFI.second; + + if (Function.empty()) + continue; + + Function.postProcessCFG(); + if (opts::PrintAll || opts::PrintCFG) Function.print(outs(), "after building cfg", true); @@ -2082,95 +2078,8 @@ void RewriteInstance::disassembleFunctions() { Function.printLoopInfo(outs()); } - TotalScore += Function.getFunctionScore(); + BC->TotalScore += Function.getFunctionScore(); BC->SumExecutionCount += Function.getKnownExecutionCount(); - - } // Iterate over all functions - - if (opts::AggregateOnly) - return; - - const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; - uint64_t NumSimpleFunctions{0}; - uint64_t NumStaleProfileFunctions{0}; - std::vector ProfiledFunctions; - for (auto &BFI : BinaryFunctions) { - auto &Function = BFI.second; - if (!Function.isSimple()) - continue; - ++NumSimpleFunctions; - if (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE) - continue; - if (Function.hasValidProfile()) { - ProfiledFunctions.push_back(&Function); - } else { - if (opts::ReportStaleFuncs) { - outs() << StaleFuncsHeader - << " " << Function << '\n'; - StaleFuncsHeader = ""; - } - ++NumStaleProfileFunctions; - } - } - BC->NumProfiledFuncs = 
ProfiledFunctions.size(); - - const auto NumAllProfiledFunctions = - ProfiledFunctions.size() + NumStaleProfileFunctions; - outs() << "BOLT-INFO: " - << NumAllProfiledFunctions - << " functions out of " << NumSimpleFunctions << " simple functions (" - << format("%.1f", NumAllProfiledFunctions / - (float) NumSimpleFunctions * 100.0f) - << "%) have non-empty execution profile.\n"; - if (NumStaleProfileFunctions) { - outs() << "BOLT-INFO: " << NumStaleProfileFunctions - << format(" (%.1f%% of all profiled)", - NumStaleProfileFunctions / - (float) NumAllProfiledFunctions * 100.0f) - << " function" << (NumStaleProfileFunctions == 1 ? "" : "s") - << " have invalid (possibly stale) profile.\n"; - } - - // Profile is marked as 'Used' if it either matches a function name - // exactly or if it 100% matches any of functions with matching common - // LTO names. - auto getUnusedObjects = [this]() -> Optional> { - std::vector UnusedObjects; - for (const auto &Func : BC->DR.getAllFuncsData()) { - if (!Func.getValue().Used) { - UnusedObjects.emplace_back(Func.getKey()); - } - } - if (UnusedObjects.empty()) - return NoneType(); - return UnusedObjects; - }; - - if (const auto UnusedObjects = getUnusedObjects()) { - outs() << "BOLT-INFO: profile for " << UnusedObjects->size() - << " objects was ignored\n"; - if (opts::Verbosity >= 1) { - for (auto Name : *UnusedObjects) { - outs() << " " << Name << '\n'; - } - } - } - - if (ProfiledFunctions.size() > 10) { - if (opts::Verbosity >= 1) { - outs() << "BOLT-INFO: top called functions are:\n"; - std::sort(ProfiledFunctions.begin(), ProfiledFunctions.end(), - [](BinaryFunction *A, BinaryFunction *B) { - return B->getExecutionCount() < A->getExecutionCount(); - } - ); - auto SFI = ProfiledFunctions.begin(); - auto SFIend = ProfiledFunctions.end(); - for (auto i = 0u; i < opts::TopCalledLimit && SFI != SFIend; ++SFI, ++i) { - outs() << " " << **SFI << " : " - << (*SFI)->getExecutionCount() << '\n'; - } - } } } @@ -3861,8 +3770,8 @@ void 
RewriteInstance::rewriteFile() { outs() << "BOLT: " << CountOverwrittenFunctions << " out of " << BinaryFunctions.size() << " functions were overwritten.\n"; - if (TotalScore != 0) { - double Coverage = OverwrittenScore / (double)TotalScore * 100.0; + if (BC->TotalScore != 0) { + double Coverage = OverwrittenScore / (double) BC->TotalScore * 100.0; outs() << format("BOLT: Rewritten functions cover %.2lf", Coverage) << "% of the execution count of simple functions of " "this binary.\n"; diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 8bc3ad3da294..74c801a27d33 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -186,6 +186,8 @@ class RewriteInstance { /// optimization. void disassembleFunctions(); + void postProcessFunctions(); + /// Run optimizations that operate at the binary, or post-linker, level. void runOptimizationPasses(); @@ -277,9 +279,6 @@ class RewriteInstance { void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, bool EmitColdPart); - /// Perform a perf.data aggregation job instead of a binary rewriting one - void aggregateData(); - /// Detect addresses and offsets available in the binary for allocating /// new sections. void discoverStorage(); @@ -523,9 +522,6 @@ class RewriteInstance { /// last emission, so that we may either decide to split or not optimize them. std::set LargeFunctions; - /// Total hotness score according to profiling data for this binary. - uint64_t TotalScore{0}; - /// Section header string table. 
StringTableBuilder SHStrTab; diff --git a/bolt/llvm-bolt.cpp b/bolt/llvm-bolt.cpp index 55aa6bb920ff..b283e371dfc6 100644 --- a/bolt/llvm-bolt.cpp +++ b/bolt/llvm-bolt.cpp @@ -178,9 +178,8 @@ int main(int argc, char **argv) { if (!opts::PerfData.empty()) { if (!opts::AggregateOnly) { errs() << ToolName - << ": reading perf data directly is unsupported, please use " - "-aggregate-only or perf2bolt\n"; - exit(1); + << ": WARNING: reading perf data directly is unsupported, please use " + "-aggregate-only or perf2bolt.\n!!! Proceed on your own risk. !!!\n"; } DA->start(opts::PerfData); } else if (!opts::InputDataFilename.empty()) { From 008bea7894cac04bae80cd66e7fa01b818fb2b4a Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 11 Dec 2017 11:44:07 -0800 Subject: [PATCH 359/904] debug (cherry picked from commit fd44b996541f56b6fb61143d4710049d1ff0cc9f) --- bolt/Passes/IndirectCallPromotion.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/Passes/IndirectCallPromotion.cpp index f76564355c7b..d8c372931334 100644 --- a/bolt/Passes/IndirectCallPromotion.cpp +++ b/bolt/Passes/IndirectCallPromotion.cpp @@ -1212,6 +1212,8 @@ void IndirectCallPromotion::runOnFunctions( } IndirectCalls.push_back(std::make_pair(NumCalls, &Inst)); + dbgs() << "indirect call in " << Function << " : " + << BB.getName() << " : " << NumCalls << '\n'; TotalIndirectCalls += NumCalls; } } From f2bd857b06ea7a009a35b92bd2f3853efb4a0eae Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 11 Dec 2017 17:22:49 -0800 Subject: [PATCH 360/904] [BOLT] Fix debugging derp (cherry picked from commit b94659a8487bb70e86bdbf182f6b8b1404216048) --- bolt/Passes/IndirectCallPromotion.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/Passes/IndirectCallPromotion.cpp index d8c372931334..f76564355c7b 100644 --- a/bolt/Passes/IndirectCallPromotion.cpp +++ b/bolt/Passes/IndirectCallPromotion.cpp @@ -1212,8 
+1212,6 @@ void IndirectCallPromotion::runOnFunctions( } IndirectCalls.push_back(std::make_pair(NumCalls, &Inst)); - dbgs() << "indirect call in " << Function << " : " - << BB.getName() << " : " << NumCalls << '\n'; TotalIndirectCalls += NumCalls; } } From 8a8a2b23eb18a078746cb21b4b34111c65433308 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 11 Dec 2017 17:07:56 -0800 Subject: [PATCH 361/904] [BOLT] Fix -simplify-rodata-loads wrt data chunks with relocs Summary: The pass was previously copying data that would change after layout because it had a relocation at the copied address. (cherry picked from commit 5272a3cce182984f80141682f8a80d7ddd69ccd8) --- bolt/BinaryContext.cpp | 16 ++++++++++++++++ bolt/BinaryContext.h | 4 ++++ bolt/Passes/BinaryPasses.cpp | 4 ++++ 3 files changed, 24 insertions(+) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index d1f38138bb09..02821658b973 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -525,6 +525,22 @@ void BinaryContext::removeRelocationAt(uint64_t Address) { Relocations.erase(RelocI); } +const Relocation *BinaryContext::getRelocationAt(uint64_t Address) { + auto ContainingSection = getSectionForAddress(Address); + assert(ContainingSection && "cannot find section for address"); + auto RI = SectionRelocations.find(*ContainingSection); + if (RI == SectionRelocations.end()) + return nullptr; + + auto &Relocations = RI->second; + auto RelocI = Relocations.find( + Relocation{Address - ContainingSection->getAddress(), 0, 0, 0, 0}); + if (RelocI == Relocations.end()) + return nullptr; + + return &*RelocI; +} + size_t Relocation::getSizeForType(uint64_t Type) { switch (Type) { default: diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index ad4909e9f013..69b26b1d5c89 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -301,6 +301,10 @@ class BinaryContext { /// Remove registered relocation at a given \p Address. 
void removeRelocationAt(uint64_t Address); + /// Return a relocation registered at a given \p Address, or nullptr if there + /// is no relocation at such address. + const Relocation *getRelocationAt(uint64_t Address); + const BinaryFunction *getFunctionForSymbol(const MCSymbol *Symbol) const { auto BFI = SymbolToFunctionMap.find(Symbol); return BFI == SymbolToFunctionMap.end() ? nullptr : BFI->second; diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index ddcd87974dac..6f78bb856f2c 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -1146,6 +1146,10 @@ bool SimplifyRODataLoads::simplifyRODataLoads( SectionRef DataSection = DataSectionOrErr.get(); if (!DataSection.isReadOnly()) continue; + + if (BC.getRelocationAt(TargetAddress)) + continue; + uint32_t Offset = TargetAddress - DataSection.getAddress(); StringRef ConstantData; if (std::error_code EC = DataSection.getContents(ConstantData)) { From 25ff351492a052b4edbbbc40edd3cf5af0411060 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 13 Dec 2017 19:08:43 -0800 Subject: [PATCH 362/904] [BOLT] Do not assign a LP to tail calls Summary: Do not assign a LP to tail calls. They are not calls in the view of an unwinder, they are just regular branches. We were hitting an assertion in BinaryFunction::removeConditionalTailCalls() complaining about landing pads in a CTC, however it was in fact a builtin_unreachable being conservatively treated as a CTC. 
(cherry picked from commit 9834e60e5ad8944bbaad3a2038e2e9c07a6117b3) --- bolt/Exceptions.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index ac303bfd0207..34568e113a81 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -242,7 +242,8 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, assert(II != IE && "exception range not pointing to an instruction"); do { auto &Instruction = II->second; - if (BC.MIA->isCall(Instruction)) { + if (BC.MIA->isCall(Instruction) && + !BC.MIA->getConditionalTailCall(Instruction)) { assert(!BC.MIA->isInvoke(Instruction) && "overlapping exception ranges detected"); // Add extra operands to a call instruction making it an invoke from @@ -553,7 +554,7 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer, bool EmitColdPart) { if (!LPSymbol) { Streamer->EmitIntValue(0, 4); return; - } + } Streamer->EmitValue(MCBinaryExpr::createSub( MCSymbolRefExpr::create(LPSymbol, *BC.Ctx.get()), LPStartExpr, @@ -566,7 +567,7 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer, bool EmitColdPart) { if (!LPSymbol) { Streamer->EmitIntValue(0, 4); return; - } + } Streamer->emitAbsoluteSymbolDiff(LPSymbol, StartSymbol, 4); }; } From ff4da485f2dec62602337c905895f6908ee72dc8 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Fri, 1 Dec 2017 16:54:08 -0800 Subject: [PATCH 363/904] [BOLT] a new block reordering algorithm Summary: A new block reordering algorithm, cache+, that is designed to optimize i-cache performance. On a high level, this algorithm is a greedy heuristic that merges clusters (ordered sequences) of basic blocks, similarly to how it is done in OptimizeCacheReorderAlgorithm. There are two important differences: (a) the metric that is optimized in the procedure, and (b) how two clusters are merged together. Initially all clusters are isolated basic blocks. 
On every iteration, we pick a pair of clusters whose merging yields the biggest increase in the ExtTSP metric (see CacheMetrics.cpp for exact implementation), which models how i-cache "friendly" a pecific cluster is. A pair of clusters giving the maximum gain is merged to a new clusters. The procedure stops when there is only one cluster left, or when merging does not increase ExtTSP. In the latter case, the remaining clusters are sorted by density. An important aspect is the way two clusters are merged. Unlike earlier algorithms (e.g., OptimizeCacheReorderAlgorithm or Pettis-Hansen), two clusters, X and Y, are first split into three, X1, X2, and Y. Then we consider all possible ways of gluing the three clusters (e.g., X1YX2, X1X2Y, X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score. This improves the quality of the final result (the search space is larger) while keeping the implementation sufficiently fast. (cherry picked from commit 4c22b0cb62fc0fb8e069fee5b3ef1567ec7a46dc) --- bolt/CacheMetrics.cpp | 285 +++++-------- bolt/CacheMetrics.h | 11 + bolt/Passes/BinaryPasses.cpp | 7 + bolt/Passes/BinaryPasses.h | 2 + bolt/Passes/CMakeLists.txt | 1 + bolt/Passes/CachePlusReorderAlgorithm.cpp | 476 ++++++++++++++++++++++ bolt/Passes/HFSortPlus.cpp | 358 +++++----------- bolt/Passes/ReorderAlgorithm.h | 10 + bolt/Passes/ReorderUtils.h | 112 +++++ 9 files changed, 813 insertions(+), 449 deletions(-) create mode 100644 bolt/Passes/CachePlusReorderAlgorithm.cpp create mode 100644 bolt/Passes/ReorderUtils.h diff --git a/bolt/CacheMetrics.cpp b/bolt/CacheMetrics.cpp index bd723b80629d..638872dc67c2 100644 --- a/bolt/CacheMetrics.cpp +++ b/bolt/CacheMetrics.cpp @@ -8,26 +8,65 @@ //===----------------------------------------------------------------------===// #include "CacheMetrics.h" +#include "llvm/Support/Options.h" using namespace llvm; using namespace bolt; -using Traversal = std::vector; - -// The weight of fallthrough jumps for ExtTSP metric 
-constexpr double FallthroughWeight = 1.0; -// The weight of forward jumps for ExtTSP metric -constexpr double ForwardWeight = 1.0; -// The weight of backward jumps for ExtTSP metric -constexpr double BackwardWeight = 1.0; -// The maximum distance (in bytes) of forward jumps for ExtTSP metric -constexpr uint64_t ForwardDistance = 256; -// The maximum distance (in bytes) of backward jumps for ExtTSP metric -constexpr uint64_t BackwardDistance = 256; - -// The size of the i-TLB cache page -constexpr uint64_t ITLBPageSize = 4096; -// Capacity of the i-TLB cache -constexpr uint64_t ITLBEntries = 16; + +namespace opts { + +extern cl::OptionCategory BoltOptCategory; + +cl::opt +FallthroughWeight("fallthrough-weight", + cl::desc("The weight of forward jumps for ExtTSP metric"), + cl::init(1), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +cl::opt +ForwardWeight("forward-weight", + cl::desc("The weight of forward jumps for ExtTSP metric"), + cl::init(0.4), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +cl::opt +BackwardWeight("backward-weight", + cl::desc("The weight of backward jumps for ExtTSP metric"), + cl::init(0.4), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +cl::opt +ForwardDistance("forward-distance", + cl::desc("The maximum distance (in bytes) of forward jumps for ExtTSP metric"), + cl::init(768), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +cl::opt +BackwardDistance("backward-distance", + cl::desc("The maximum distance (in bytes) of backward jumps for ExtTSP metric"), + cl::init(192), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +cl::opt +ITLBPageSize("itlb-page-size", + cl::desc("The size of i-tlb cache page"), + cl::init(4096), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +cl::opt +ITLBEntries("itlb-entries", + cl::desc("The number of entries in i-tlb cache"), + cl::init(16), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +} namespace { @@ -46,104 +85,6 @@ void extractBasicBlockInfo( } } -/// Initialize and return a vector of 
traversals for a given entry block -std::vector getTraversals(BinaryBasicBlock *EntryBB) { - std::vector AllTraversals; - std::stack> Stack; - Stack.push(std::make_pair(EntryBB, Traversal())); - std::unordered_set BBSet; - - while (!Stack.empty()) { - BinaryBasicBlock *CurrentBB = Stack.top().first; - Traversal PrevTraversal(Stack.top().second); - Stack.pop(); - - // Add current basic block into consideration - BBSet.insert(CurrentBB); - PrevTraversal.push_back(CurrentBB); - - if (CurrentBB->succ_empty()) { - AllTraversals.push_back(PrevTraversal); - continue; - } - - bool HaveSuccCount = false; - // Calculate total edges count of successors - for (auto BI = CurrentBB->branch_info_begin(); - BI != CurrentBB->branch_info_end(); ++BI) { - if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) { - HaveSuccCount = true; - break; - } - } - if (!HaveSuccCount) { - AllTraversals.push_back(PrevTraversal); - continue; - } - - auto BI = CurrentBB->branch_info_begin(); - for (auto *SuccBB : CurrentBB->successors()) { - // If we have never seen SuccBB, or SuccBB indicates the - // end of traversal, SuccBB will be added into stack for - // further exploring. - if ((BBSet.find(SuccBB) == BBSet.end() && BI->Count != 0 && - BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) || - SuccBB->succ_empty()) { - Stack.push(std::make_pair(SuccBB, PrevTraversal)); - } - ++BI; - } - } - - return AllTraversals; -} - -/// Given a traversal, return the sum of block distances along this traversal. 
-double getTraversalLength( - const std::unordered_map &BBAddr, - const Traversal &Path) { - double Length = 0; - for (size_t I = 0; I + 1 < Path.size(); I++) { - // Ignore calls between hot and cold parts - if (Path[I]->isCold() != Path[I + 1]->isCold()) - continue; - double SrcAddr = BBAddr.at(Path[I]); - double DstAddr = BBAddr.at(Path[I + 1]); - Length += std::abs(SrcAddr - DstAddr); - } - return Length; -} - -/// Calculate average number of call distance for every graph traversal -double calcGraphDistance( - const std::vector &BinaryFunctions, - const std::unordered_map &BBAddr, - const std::unordered_map &BBSize) { - - double TotalTraversalLength = 0; - double NumTraversals = 0; - for (auto BF : BinaryFunctions) { - // Only consider functions which are known to be executed - if (BF->getKnownExecutionCount() == 0) - continue; - - for (auto BB : BF->layout()) { - if (BB->isEntryPoint()) { - auto AllTraversals = getTraversals(BB); - for (auto const &Path : AllTraversals) { - // Ignore short traversals - if (Path.size() <= 1) - continue; - TotalTraversalLength += getTraversalLength(BBAddr, Path); - NumTraversals++; - } - } - } - } - - return TotalTraversalLength / NumTraversals; -} - /// Calculate TSP metric, which quantifies the number of fallthrough jumps in /// the ordering of basic blocks double calcTSPScore( @@ -166,22 +107,12 @@ double calcTSPScore( return Score; } -/// Calculate Extended-TSP metric, which quantifies the expected number of -/// i-cache misses for a given ordering of basic blocks. 
The parameters are: -/// - FallthroughWeight is the impact of fallthrough jumps on the score -/// - ForwardWeight is the impact of forward (but not fallthrough) jumps -/// - BackwardWeight is the impact of backward jumps -/// - ForwardDistance is the max distance of a forward jump affecting the score -/// - BackwardDistance is the max distance of a backward jump affecting the score +/// Calculate Ext-TSP metric, which quantifies the expected number of i-cache +/// misses for a given ordering of basic blocks double calcExtTSPScore( const std::vector &BinaryFunctions, const std::unordered_map &BBAddr, - const std::unordered_map &BBSize, - double FallthroughWeight, - double ForwardWeight, - double BackwardWeight, - uint64_t ForwardDistance, - uint64_t BackwardDistance) { + const std::unordered_map &BBSize) { double Score = 0.0; for (auto BF : BinaryFunctions) { @@ -189,33 +120,10 @@ double calcExtTSPScore( auto BI = SrcBB->branch_info_begin(); for (auto DstBB : SrcBB->successors()) { if (DstBB != SrcBB) { - double Count = BI->Count == BinaryBasicBlock::COUNT_NO_PROFILE - ? 
0.0 - : double(BI->Count); - uint64_t SrcAddr = BBAddr.at(SrcBB); - uint64_t SrcSize = BBSize.at(SrcBB); - uint64_t DstAddr = BBAddr.at(DstBB); - - if (SrcAddr <= DstAddr) { - if (SrcAddr + SrcSize == DstAddr) { - // fallthrough jump - Score += FallthroughWeight * Count; - } else { - // the distance of the forward jump - size_t Dist = DstAddr - (SrcAddr + SrcSize); - if (Dist <= ForwardDistance) { - double Prob = double(ForwardDistance - Dist) / ForwardDistance; - Score += ForwardWeight * Prob * Count; - } - } - } else { - // the distance of the backward jump - size_t Dist = SrcAddr + SrcSize - DstAddr; - if (Dist <= BackwardDistance) { - double Prob = double(BackwardDistance - Dist) / BackwardDistance; - Score += BackwardWeight * Prob * Count; - } - } + Score += CacheMetrics::extTSPScore(BBAddr.at(SrcBB), + BBSize.at(SrcBB), + BBAddr.at(DstBB), + BI->Count); } ++BI; } @@ -277,10 +185,10 @@ extractFunctionCalls(const std::vector &BinaryFunctions) { double expectedCacheHitRatio( const std::vector &BinaryFunctions, const std::unordered_map &BBAddr, - const std::unordered_map &BBSize, - double PageSize, - uint64_t CacheEntries) { + const std::unordered_map &BBSize) { + const double PageSize = opts::ITLBPageSize; + const uint64_t CacheEntries = opts::ITLBEntries; auto Calls = extractFunctionCalls(BinaryFunctions); // Compute 'hotness' of the functions double TotalSamples = 0; @@ -334,6 +242,34 @@ double expectedCacheHitRatio( return 100.0 * (1.0 - Misses / TotalSamples); } +} // end namespace anonymous + +double CacheMetrics::extTSPScore(uint64_t SrcAddr, + uint64_t SrcSize, + uint64_t DstAddr, + uint64_t Count) { + assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE); + + // Fallthrough + if (SrcAddr + SrcSize == DstAddr) { + return opts::FallthroughWeight * Count; + } + // Forward + if (SrcAddr + SrcSize < DstAddr) { + const auto Dist = DstAddr - (SrcAddr + SrcSize); + if (Dist <= opts::ForwardDistance) { + double Prob = 1.0 - static_cast(Dist) / 
opts::ForwardDistance; + return opts::ForwardWeight * Prob * Count; + } + return 0; + } + // Backward + const auto Dist = SrcAddr + SrcSize - DstAddr; + if (Dist <= opts::BackwardDistance) { + double Prob = 1.0 - static_cast(Dist) / opts::BackwardDistance; + return opts::BackwardWeight * Prob * Count; + } + return 0; } void CacheMetrics::printAll( @@ -356,10 +292,10 @@ void CacheMetrics::printAll( } outs() << format(" There are %zu functions;", NumFunctions) - << format(" %zu (%.2lf%%) have non-empty execution count\n", + << format(" %zu (%.2lf%%) have positive execution count\n", NumHotFunctions, 100.0 * NumHotFunctions / NumFunctions); outs() << format(" There are %zu basic blocks;", NumBlocks) - << format(" %zu (%.2lf%%) have non-empty execution count\n", + << format(" %zu (%.2lf%%) have positive execution count\n", NumHotBlocks, 100.0 * NumHotBlocks / NumBlocks); std::unordered_map BBAddr; @@ -377,35 +313,14 @@ void CacheMetrics::printAll( outs() << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu)\n", 100.0 * HotCodeSize / TotalCodeSize, HotCodeSize, TotalCodeSize); - outs() << " An average length of graph traversal: " - << format("%.0lf\n", calcGraphDistance(BinaryFunctions, - BBAddr, - BBSize)); - - outs() << " Expected i-TLB cache hit ratio " - << format("(%zu, %zu): ", ITLBPageSize, ITLBEntries) + outs() << " Expected i-TLB cache hit ratio: " << format("%.2lf%%\n", expectedCacheHitRatio(BinaryFunctions, BBAddr, - BBSize, - ITLBPageSize, - ITLBEntries)); + BBSize)); outs() << " TSP score: " << format("%.0lf\n", calcTSPScore(BinaryFunctions, BBAddr, BBSize)); - outs() << " ExtTSP score " - << format("(%.2lf, %.2lf, %.2lf, %zu, %zu): ", FallthroughWeight, - ForwardWeight, - BackwardWeight, - ForwardDistance, - BackwardDistance) - << format("%.0lf\n", calcExtTSPScore(BinaryFunctions, - BBAddr, - BBSize, - FallthroughWeight, - ForwardWeight, - BackwardWeight, - ForwardDistance, - BackwardDistance)); - + outs() << " ExtTSP score: " + << 
format("%.0lf\n", calcExtTSPScore(BinaryFunctions, BBAddr, BBSize)); } diff --git a/bolt/CacheMetrics.h b/bolt/CacheMetrics.h index 1dab4565bc34..b512168ebaf3 100644 --- a/bolt/CacheMetrics.h +++ b/bolt/CacheMetrics.h @@ -20,6 +20,17 @@ namespace CacheMetrics { /// Calculate various metrics related to instruction cache performance. void printAll(const std::vector &BinaryFunctions); +/// Calculate Extended-TSP metric, which quantifies the expected number of +/// i-cache misses for a given pair of basic blocks. The parameters are: +/// - SrcAddr is the address of the source block; +/// - SrcSize is the size of the source block; +/// - DstAddr is the address of the destination block; +/// - Count is the number of jumps between the pair of blocks. +double extTSPScore(uint64_t SrcAddr, + uint64_t SrcSize, + uint64_t DstAddr, + uint64_t Count); + } // namespace CacheMetrics } // namespace bolt } // namespace llvm diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 6f78bb856f2c..b90cc0f10c52 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -161,6 +161,9 @@ ReorderBlocks("reorder-blocks", "cache", "perform optimal layout prioritizing I-cache " "behavior"), + clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, + "cache+", + "perform layout optimizing I-cache behavior"), clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE, "cluster-shuffle", "perform random layout of clusters"), @@ -469,6 +472,10 @@ void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF, Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo))); break; + case LT_OPTIMIZE_CACHE_PLUS: + Algo.reset(new CachePlusReorderAlgorithm(std::move(CAlgo))); + break; + case LT_OPTIMIZE_SHUFFLE: Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo))); break; diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index ea7376f7997b..0ef8e9027d55 100644 --- a/bolt/Passes/BinaryPasses.h +++ 
b/bolt/Passes/BinaryPasses.h @@ -169,6 +169,8 @@ class ReorderBasicBlocks : public BinaryFunctionPass { /// LT_OPTIMIZE_CACHE piggybacks on the idea from Ispike paper (CGO '04) /// that suggests putting frequently executed chains first in the layout. LT_OPTIMIZE_CACHE, + /// Block reordering guided by the extended TSP metric. + LT_OPTIMIZE_CACHE_PLUS, /// Create clusters and use random order for them. LT_OPTIMIZE_SHUFFLE, }; diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt index f9b8db8703af..0255e7b40048 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/Passes/CMakeLists.txt @@ -5,6 +5,7 @@ add_llvm_library(LLVMBOLTPasses BinaryFunctionCallGraph.cpp CallGraph.cpp CallGraphWalker.cpp + CachePlusReorderAlgorithm.cpp DataflowAnalysis.cpp DataflowInfoManager.cpp FrameAnalysis.cpp diff --git a/bolt/Passes/CachePlusReorderAlgorithm.cpp b/bolt/Passes/CachePlusReorderAlgorithm.cpp new file mode 100644 index 000000000000..5a717aaec5e2 --- /dev/null +++ b/bolt/Passes/CachePlusReorderAlgorithm.cpp @@ -0,0 +1,476 @@ +//===--- CachePlusReorderAlgorithm.cpp - Order basic blocks ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" +#include "CacheMetrics.h" +#include "ReorderAlgorithm.h" +#include "ReorderUtils.h" + +using namespace llvm; +using namespace bolt; +using EdgeList = std::vector>; + +namespace llvm { +namespace bolt { + +namespace { + +// A cluster (ordered sequence) of basic blocks +class Cluster { +public: + Cluster(BinaryBasicBlock *BB, uint64_t ExecutionCount_, uint64_t Size_) + : Id(BB->getLayoutIndex()), + IsEntry(BB->getLayoutIndex() == 0), + ExecutionCount(ExecutionCount_), + Size(Size_), + Score(0) { + Blocks.push_back(BB); + } + + size_t id() const { + return Id; + } + + uint64_t size() const { + return Size; + } + + double density() const { + return static_cast(ExecutionCount) / Size; + } + + bool isCold() const { + return ExecutionCount == 0; + } + + uint64_t executionCount() const { + return ExecutionCount; + } + + bool isEntryPoint() const { + return IsEntry; + } + + double score() const { + return Score; + } + + const std::vector &blocks() const { + return Blocks; + } + + /// Update the list of basic blocks and meta-info + void merge(const Cluster *Other, + const std::vector &MergedBlocks, + double MergedScore) { + Blocks = MergedBlocks; + IsEntry |= Other->IsEntry; + ExecutionCount += Other->ExecutionCount; + Size += Other->Size; + Score = MergedScore; + } + +private: + std::vector Blocks; + size_t Id; + bool IsEntry; + uint64_t ExecutionCount; + uint64_t Size; + double Score; +}; + +/// Deterministically compare clusters by their density in decreasing order +bool compareClusters(const Cluster *C1, const Cluster *C2) { + // original entry point to the front + if (C1->isEntryPoint()) + return true; + if (C2->isEntryPoint()) + return false; + + const double D1 = C1->density(); + const double D2 = C2->density(); + if (D1 != D2) + return 
D1 > D2; + // Making the order deterministic + return C1->id() < C2->id(); +} + +/// Deterministically compare pairs of clusters +bool compareClusterPairs(const Cluster *A1, const Cluster *B1, + const Cluster *A2, const Cluster *B2) { + const auto Samples1 = A1->executionCount() + B1->executionCount(); + const auto Samples2 = A2->executionCount() + B2->executionCount(); + if (Samples1 != Samples2) + return Samples1 < Samples2; + + if (A1 != A2) + return A1->id() < A2->id(); + return B1->id() < B2->id(); +} + +} // end namespace anonymous + +/// CachePlus - layout of basic blocks with i-cache optimization. +/// +/// Similarly to OptimizeCacheReorderAlgorithm, this algorithm is a greedy +/// heuristic that works with clusters (ordered sequences) of basic blocks. +/// Initially all clusters are isolated basic blocks. On every iteration, +/// we pick a pair of clusters whose merging yields the biggest increase in +/// the ExtTSP metric (see CacheMetrics.cpp for exact implementation), which +/// models how i-cache "friendly" a specific cluster is. A pair of clusters +/// giving the maximum gain is merged into a new cluster. The procedure stops +/// when there is only one cluster left, or when merging does not increase +/// ExtTSP. In the latter case, the remaining clusters are sorted by density. +/// +/// An important aspect is the way two clusters are merged. Unlike earlier +/// algorithms (e.g., OptimizeCacheReorderAlgorithm or Pettis-Hansen), two +/// clusters, X and Y, are first split into three, X1, X2, and Y. Then we +/// consider all possible ways of gluing the three clusters (e.g., X1YX2, X1X2Y, +/// X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score. +/// This improves the quality of the final result (the search space is larger) +/// while keeping the implementation sufficiently fast. 
+class CachePlus { +public: + CachePlus(const BinaryFunction &BF) + : BF(BF), Adjacent(BF.layout_size()), Cache(BF.layout_size()) { + initialize(); + } + + /// Run cache+ algorithm and return a basic block ordering + std::vector run() { + // Merge pairs of clusters while there is an improvement in ExtTSP metric + while (Clusters.size() > 1) { + Cluster *BestClusterPred = nullptr; + Cluster *BestClusterSucc = nullptr; + std::pair BestGain(-1, 0); + for (auto ClusterPred : Clusters) { + // Get candidates for merging with the current cluster + Adjacent.forAllAdjacent( + ClusterPred, + // Find the best candidate + [&](Cluster *ClusterSucc) { + assert(ClusterPred != ClusterSucc && "loop edges are not supported"); + // Do not merge cold blocks + if (ClusterPred->isCold() || ClusterSucc->isCold()) + return; + + // Compute the gain of merging two clusters + auto Gain = mergeGain(ClusterPred, ClusterSucc); + if (Gain.first <= 0.0) + return; + + // Breaking ties by density to make the hottest clusters be merged first + if (Gain.first > BestGain.first || + (std::abs(Gain.first - BestGain.first) < 1e-8 && + compareClusterPairs(ClusterPred, + ClusterSucc, + BestClusterPred, + BestClusterSucc))) { + BestGain = Gain; + BestClusterPred = ClusterPred; + BestClusterSucc = ClusterSucc; + } + }); + } + + // Stop merging when there is no improvement + if (BestGain.first <= 0.0) + break; + + // Merge the best pair of clusters + mergeClusters(BestClusterPred, BestClusterSucc, BestGain.second); + } + + // Sorting clusters by density + std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters); + + // Collect the basic blocks in the order specified by their clusters + std::vector Result; + Result.reserve(BF.layout_size()); + for (auto Cluster : Clusters) { + Result.insert(Result.end(), + Cluster->blocks().begin(), + Cluster->blocks().end()); + } + + return Result; + } + +private: + /// Initialize the set of active clusters, edges between blocks, and + /// adjacency matrix. 
+ void initialize() { + // Initialize indices of basic blocks + size_t LayoutIndex = 0; + for (auto BB : BF.layout()) { + BB->setLayoutIndex(LayoutIndex); + LayoutIndex++; + } + + // Initialize edges for the blocks and compute their total in/out weights + OutEdges = std::vector(BF.layout_size()); + auto InWeight = std::vector(BF.layout_size(), 0); + auto OutWeight = std::vector(BF.layout_size(), 0); + for (auto BB : BF.layout()) { + auto BI = BB->branch_info_begin(); + for (auto I : BB->successors()) { + assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + "missing profile for a jump"); + if (I != BB && BI->Count > 0) { + InWeight[I->getLayoutIndex()] += BI->Count; + OutEdges[BB->getLayoutIndex()].push_back(std::make_pair(I, BI->Count)); + OutWeight[BB->getLayoutIndex()] += BI->Count; + } + ++BI; + } + } + + // Initialize execution count for every basic block, which is the + // maximum over the sums of all in and out edge weights. + // Also execution count of the entry point is set to at least 1 + auto ExecutionCounts = std::vector(BF.layout_size(), 0); + for (auto BB : BF.layout()) { + uint64_t EC = BB->getKnownExecutionCount(); + EC = std::max(EC, InWeight[BB->getLayoutIndex()]); + EC = std::max(EC, OutWeight[BB->getLayoutIndex()]); + if (BB->getLayoutIndex() == 0) + EC = std::max(EC, uint64_t(1)); + ExecutionCounts[BB->getLayoutIndex()] = EC; + } + + // Initialize clusters + Clusters.reserve(BF.layout_size()); + AllClusters.reserve(BF.layout_size()); + Size.reserve(BF.layout_size()); + for (auto BB : BF.layout()) { + size_t Index = BB->getLayoutIndex(); + Size.push_back(std::max(BB->estimateSize(), size_t(1))); + AllClusters.emplace_back(BB, ExecutionCounts[Index], Size[Index]); + Clusters.push_back(&AllClusters[Index]); + } + + // Initialize adjacency matrix + Adjacent.initialize(Clusters); + for (auto BB : BF.layout()) { + for (auto I : BB->successors()) { + if (BB != I) + Adjacent.set(Clusters[BB->getLayoutIndex()], + Clusters[I->getLayoutIndex()]); + 
} + } + } + + /// Compute ExtTSP score for a given order of basic blocks + double score(const std::vector& Blocks) const { + uint64_t NotSet = static_cast(-1); + auto Addr = std::vector(BF.layout_size(), NotSet); + uint64_t CurAddr = 0; + for (auto BB : Blocks) { + size_t Index = BB->getLayoutIndex(); + Addr[Index] = CurAddr; + CurAddr += Size[Index]; + } + + double Score = 0; + for (auto BB : Blocks) { + size_t Index = BB->getLayoutIndex(); + for (auto Edge : OutEdges[Index]) { + auto SuccBB = Edge.first; + size_t SuccIndex = SuccBB->getLayoutIndex(); + + if (Addr[SuccBB->getLayoutIndex()] != NotSet) { + Score += CacheMetrics::extTSPScore(Addr[Index], + Size[Index], + Addr[SuccIndex], + Edge.second); + } + } + } + return Score; + } + + /// The gain of merging two clusters. + /// + /// The function considers all possible ways of merging two clusters and + /// computes the one having the largest increase in ExtTSP metric. The result + /// is a pair with the first element being the gain and the second element being + /// the corresponding merging type (encoded as an integer). + std::pair mergeGain(const Cluster *ClusterPred, + const Cluster *ClusterSucc) const { + if (Cache.contains(ClusterPred, ClusterSucc)) { + return Cache.get(ClusterPred, ClusterSucc); + } + + // The current score of two separate clusters + const auto CurScore = ClusterPred->score() + ClusterSucc->score(); + + // Merge two clusters and update the best Gain + auto computeMergeGain = [&](const std::pair &CurGain, + const Cluster *ClusterPred, + const Cluster *ClusterSucc, + size_t MergeType) { + auto MergedBlocks = mergeBlocks(ClusterPred->blocks(), + ClusterSucc->blocks(), + MergeType); + // Does the new cluster preserve the original entry point? 
+ if ((ClusterPred->isEntryPoint() || ClusterSucc->isEntryPoint()) && + MergedBlocks[0]->getLayoutIndex() != 0) + return CurGain; + + // The score of the new cluster + const auto NewScore = score(MergedBlocks); + if (NewScore > CurScore && NewScore - CurScore > CurGain.first) + return std::make_pair(NewScore - CurScore, MergeType); + else + return CurGain; + }; + + std::pair Gain = std::make_pair(-1, 0); + // Try to simply concatenate two clusters + Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, 0); + // Try to split ClusterPred into two and merge with ClusterSucc + for (size_t Offset = 1; Offset < ClusterPred->blocks().size(); Offset++) { + for (size_t Type = 0; Type < 4; Type++) { + size_t MergeType = 1 + Type + Offset * 4; + Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, MergeType); + } + } + + Cache.set(ClusterPred, ClusterSucc, Gain); + return Gain; + } + + /// Merge two clusters (orders) of blocks according to a given 'merge type'. + /// + /// If MergeType == 0, then the result is a concatenation of two clusters. + /// Otherwise, the first cluster is cut into two and we consider all possible + /// ways of concatenating three clusters. 
+ std::vector mergeBlocks( + const std::vector &X, + const std::vector &Y, + size_t MergeType + ) const { + // Concatenate three clusters of blocks in the given order + auto concat = [&](const std::vector &A, + const std::vector &B, + const std::vector &C) { + std::vector Result; + Result.reserve(A.size() + B.size() + C.size()); + Result.insert(Result.end(), A.begin(), A.end()); + Result.insert(Result.end(), B.begin(), B.end()); + Result.insert(Result.end(), C.begin(), C.end()); + return Result; + }; + + // Merging w/o splitting existing clusters + if (MergeType == 0) { + return concat(X, Y, std::vector()); + } + + MergeType--; + size_t Type = MergeType % 4; + size_t Offset = MergeType / 4; + assert(0 < Offset && Offset < X.size() && + "Invalid offset while merging clusters"); + // Split the first cluster, X, into X1 and X2 + std::vector X1(X.begin(), X.begin() + Offset); + std::vector X2(X.begin() + Offset, X.end()); + + // Construct a new cluster from three existing ones + switch(Type) { + case 0: return concat(X1, Y, X2); + case 1: return concat(Y, X2, X1); + case 2: return concat(X2, Y, X1); + case 3: return concat(X2, X1, Y); + default: + llvm_unreachable("unexpected merge type"); + } + } + + /// Merge cluster From into cluster Into, update the list of active clusters, + /// adjacency information, and the corresponding cache. 
+ void mergeClusters(Cluster *Into, Cluster *From, size_t MergeType) { + assert(Into != From && "Cluster cannot be merged with itself"); + // Merge the clusters + auto MergedBlocks = mergeBlocks(Into->blocks(), From->blocks(), MergeType); + Into->merge(From, MergedBlocks, score(MergedBlocks)); + + // Remove cluster From from the list of active clusters + auto Iter = std::remove(Clusters.begin(), Clusters.end(), From); + Clusters.erase(Iter, Clusters.end()); + + // Invalidate caches + Cache.invalidate(Into); + + // Update the adjacency matrix + Adjacent.merge(Into, From); + } + + // The binary function + const BinaryFunction &BF; + + // All clusters + std::vector AllClusters; + + // Active clusters. The vector gets updated at runtime when clusters are merged + std::vector Clusters; + + // Size of the block + std::vector Size; + + // Outgoing edges of the block + std::vector OutEdges; + + // Cluster adjacency matrix + AdjacencyMatrix Adjacent; + + // A cache that keeps precomputed values of mergeGain for pairs of clusters; + // when a pair of clusters (x,y) gets merged, we invalidate the pairs + // containing both x and y and all clusters adjacent to x and y (and recompute + // them on the next iteration). + mutable ClusterPairCache> Cache; +}; + +void CachePlusReorderAlgorithm::reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const { + if (BF.layout_empty()) + return; + + // Are there jumps with positive execution count? 
+ uint64_t SumCount = 0; + for (auto BB : BF.layout()) { + auto BI = BB->branch_info_begin(); + for (auto I : BB->successors()) { + assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && I != nullptr); + SumCount += BI->Count; + ++BI; + } + } + + // Do not change layout of functions w/o profile information + if (SumCount == 0) { + for (auto BB : BF.layout()) { + Order.push_back(BB); + } + return; + } + + // Apply the algorithm + Order = CachePlus(BF).run(); + + // Verify correctness + assert(Order[0]->isEntryPoint() && "Original entry point is not preserved"); + assert(Order.size() == BF.layout_size() && "Wrong size of reordered layout"); +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/Passes/HFSortPlus.cpp b/bolt/Passes/HFSortPlus.cpp index fb8f2cbcf2c2..4d15572110f4 100644 --- a/bolt/Passes/HFSortPlus.cpp +++ b/bolt/Passes/HFSortPlus.cpp @@ -29,11 +29,8 @@ #include "BinaryFunction.h" #include "HFSort.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/Format.h" +#include "ReorderUtils.h" #include "llvm/Support/Options.h" -#include "llvm/Support/raw_ostream.h" #include #include @@ -48,21 +45,9 @@ using namespace bolt; namespace opts { extern cl::OptionCategory BoltOptCategory; -extern cl::opt Verbosity; -cl::opt -ITLBPageSizeParam("itlb-page-size", - cl::desc("The size of i-tlb cache page"), - cl::init(4096), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - -cl::opt -ITLBEntriesParam("itlb-entries", - cl::desc("The number of entries in i-tlb cache"), - cl::init(16), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); +extern cl::opt ITLBPageSize; +extern cl::opt ITLBEntries; cl::opt MergeProbability("merge-probability", @@ -92,189 +77,46 @@ int32_t ITLBPageSize; // while smaller values result in better i-cache performance int32_t ITLBEntries; -// This class maintains adjacency information for all Clusters being -// processed. 
It is used to invalidate cache entries when merging -// Clusters and for visiting all neighbors of any given Cluster. -class AdjacencyMatrix { - public: - AdjacencyMatrix(const CallGraph &Cg, - std::vector &Clusters, - const std::vector &FuncCluster) - : Clusters(Clusters), - Bits(Cg.numNodes(), BitVector(Cg.numNodes(), false)) { - initialize(Cg, FuncCluster); - } - - template - void forallAdjacent(const Cluster *C, F Func) const { - const_cast(this)->forallAdjacent(C, Func); - } - - template - void forallAdjacent(const Cluster *C, F Func) { - for (auto I = Bits[C->id()].find_first(); I != -1; I = Bits[C->id()].find_next(I)) { - Func(Clusters[I]); - } - } - - // Merge adjacency info from cluster B into cluster A. Info for cluster B is left - // in an undefined state. - void merge(const Cluster *A, const Cluster *B) { - Bits[A->id()] |= Bits[B->id()]; - Bits[A->id()][A->id()] = false; - Bits[A->id()][B->id()] = false; - Bits[B->id()][A->id()] = false; - for (auto I = Bits[B->id()].find_first(); I != -1; I = Bits[B->id()].find_next(I)) { - Bits[I][A->id()] = true; - Bits[I][B->id()] = false; - } - } - - void dump(const Cluster *A) const { - outs() << "Cluster " << A->id() << ":"; - forallAdjacent(A, [](const Cluster *B) { outs() << " " << B->id(); }); - } - - void dump() const { - for (auto *A : Clusters) { - if (!A) continue; - dump(A); - outs() << "\n"; - } - } - private: - void set(const Cluster *A, const Cluster *B, bool Value) { - assert(A != B); - Bits[A->id()][B->id()] = Value; - Bits[B->id()][A->id()] = Value; - } - - void initialize(const CallGraph &Cg, const std::vector &FuncCluster) { - for (auto *A : Clusters) { - for (auto TargetId : A->targets()) { - for (auto Succ : Cg.successors(TargetId)) { - auto *B = FuncCluster[Succ]; - if (!B || B == A) continue; - const auto &Arc = *Cg.findArc(TargetId, Succ); - if (Arc.weight() <= 0.0) continue; - - set(A, B, true); - } - for (auto Pred : Cg.predecessors(TargetId)) { - auto *B = FuncCluster[Pred]; - if (!B || B 
== A) continue; - const auto &Arc = *Cg.findArc(Pred, TargetId); - if (Arc.weight() <= 0.0) continue; - - set(A, B, true); - } - } - } - } - - std::vector Clusters; - std::vector Bits; -}; - -// A cache of precomputed results for a pair of clusters -class PrecomputedResults { - public: - explicit PrecomputedResults(size_t Size) - : Size(Size), - Cache(new double[Size*Size]), - Valid(Size * Size, false) { - memset(Cache, 0, sizeof(double)*Size*Size); - } - ~PrecomputedResults() { - delete[] Cache; - } - - bool contains(const Cluster *First, const Cluster *Second) const { - return Valid[index(First, Second)]; - } - - double get(const Cluster *First, const Cluster *Second) const { - assert(contains(First, Second)); - return Cache[index(First, Second)]; - } - - void set(const Cluster *First, const Cluster *Second, double Value) { - const auto Index = index(First, Second); - Cache[Index] = Value; - Valid[Index] = true; - } - - void invalidate(const Cluster *C) { - Valid.reset(C->id() * Size, (C->id() + 1) * Size); - for (size_t Id = 0; Id < Size; Id++) { - Valid.reset(Id * Size + C->id()); - } - } - - private: - size_t index(const Cluster *First, const Cluster *Second) const { - return First->id() * Size + Second->id(); - } - - size_t Size; - double *Cache; - BitVector Valid; -}; - -/* - * Erase an element from a container if it is present. Otherwise, do nothing. - */ -template -void maybeErase(C &Container, const V& Value) { - auto Itr = Container.find(Value); - if (Itr != Container.end()) - Container.erase(Itr); -} - -/* - * Density of a cluster formed by merging a given pair of clusters - */ +/// Density of a cluster formed by merging a given pair of clusters. 
double density(const Cluster *ClusterPred, const Cluster *ClusterSucc) { const double CombinedSamples = ClusterPred->samples() + ClusterSucc->samples(); const double CombinedSize = ClusterPred->size() + ClusterSucc->size(); return CombinedSamples / CombinedSize; } -/* - * Deterministically compare clusters by their density in decreasing order. - */ +/// Deterministically compare clusters by density in decreasing order. bool compareClusters(const Cluster *C1, const Cluster *C2) { const double D1 = C1->density(); const double D2 = C2->density(); - if (D1 != D2) return D1 > D2; + if (D1 != D2) + return D1 > D2; // making sure the sorting is deterministic - if (C1->size() != C2->size()) return C1->size() < C2->size(); - if (C1->samples() != C2->samples()) return C1->samples() > C2->samples(); + if (C1->size() != C2->size()) + return C1->size() < C2->size(); + if (C1->samples() != C2->samples()) + return C1->samples() > C2->samples(); return C1->target(0) < C2->target(0); } -/* - * Deterministically compare pairs of clusters by their density - * in decreasing order. - */ +/// Deterministically compare pairs of clusters by density in decreasing order. 
bool compareClusterPairs(const Cluster *A1, const Cluster *B1, const Cluster *A2, const Cluster *B2) { const auto D1 = density(A1, B1); const auto D2 = density(A2, B2); - if (D1 != D2) return D1 > D2; - // making sure the sorting is deterministic + if (D1 != D2) + return D1 > D2; const auto Size1 = A1->size() + B1->size(); const auto Size2 = A2->size() + B2->size(); - if (Size1 != Size2) return Size1 < Size2; + if (Size1 != Size2) + return Size1 < Size2; const auto Samples1 = A1->samples() + B1->samples(); const auto Samples2 = A2->samples() + B2->samples(); - if (Samples1 != Samples2) return Samples1 > Samples2; + if (Samples1 != Samples2) + return Samples1 > Samples2; return A1->target(0) < A2->target(0); } -/* - * Sorting clusters by their density in decreasing order - */ +/// Sorting clusters by their density in decreasing order. template std::vector sortByDensity(const C &Clusters_) { std::vector Clusters(Clusters_.begin(), Clusters_.end()); @@ -282,27 +124,23 @@ std::vector sortByDensity(const C &Clusters_) { return Clusters; } -/* - * HFSortPlus - layout of hot functions with iTLB cache optimization - * - * Given an ordering of hot functions (and hence, their assignment to the - * iTLB pages), we can divide all functions calls into two categories: - * - 'short' ones that have a caller-callee distance less than a page; - * - 'long' ones where the distance exceeds a page. - * The short calls are likely to result in a iTLB cache hit. For the long ones, - * the hit/miss result depends on the 'hotness' of the page (i.e., how often - * the page is accessed). Assuming that functions are sent to the iTLB cache - * in a random order, the probability that a page is present in the cache is - * proportional to the number of samples corresponding to the functions on the - * page. The following algorithm detects short and long calls, and optimizes - * the expected number of cache misses for the long ones. 
- */ +/// HFSortPlus - layout of hot functions with iTLB cache optimization +/// +/// Given an ordering of hot functions (and hence, their assignment to the +/// iTLB pages), we can divide all functions calls into two categories: +/// - 'short' ones that have a caller-callee distance less than a page; +/// - 'long' ones where the distance exceeds a page. +/// The short calls are likely to result in a iTLB cache hit. For the long ones, +/// the hit/miss result depends on the 'hotness' of the page (i.e., how often +/// the page is accessed). Assuming that functions are sent to the iTLB cache +/// in a random order, the probability that a page is present in the cache is +/// proportional to the number of samples corresponding to the functions on the +/// page. The following algorithm detects short and long calls, and optimizes +/// the expected number of cache misses for the long ones. class HFSortPlus { public: - /* - * The expected number of calls on different i-TLB pages for an arc of the - * call graph with a specified weight - */ + /// The expected number of calls on different i-TLB pages for an arc of the + /// call graph with a specified weight double expectedCalls(int64_t SrcAddr, int64_t DstAddr, double Weight) const { const auto Dist = std::abs(SrcAddr - DstAddr); if (Dist > ITLBPageSize) @@ -313,15 +151,13 @@ class HFSortPlus { return (1.0 - X * X) * Weight; } - /* - * The probability that a page with a given weight is not present in the cache - * - * Assume that the hot functions are called in a random order; then the - * probability of a i-TLB page being accessed after a function call is - * p=pageSamples/totalSamples. The probability that the page is not accessed - * is (1-p), and the probability that it is not in the cache (i.e. 
not accessed - * during the last ITLBEntries function calls) is (1-p)^ITLBEntries - */ + /// The probability that a page with a given weight is not present in the cache + /// + /// Assume that the hot functions are called in a random order; then the + /// probability of a i-TLB page being accessed after a function call is + /// p=pageSamples/totalSamples. The probability that the page is not accessed + /// is (1-p), and the probability that it is not in the cache (i.e. not accessed + /// during the last ITLBEntries function calls) is (1-p)^ITLBEntries double missProbability(double PageSamples) const { double P = PageSamples / TotalSamples; double X = ITLBEntries; @@ -330,10 +166,8 @@ class HFSortPlus { return pow(1.0 - P, X); } - /* - * The expected number of calls within a given cluster with both endpoints on - * the same cache page - */ + /// The expected number of calls within a given cluster with both endpoints on + /// the same cache page double shortCalls(const Cluster *Cluster) const { double Calls = 0; for (auto TargetId : Cluster->targets()) { @@ -352,10 +186,8 @@ class HFSortPlus { return Calls; } - /* - * The number of calls between the two clusters with both endpoints on - * the same i-TLB page, assuming that a given pair of clusters gets merged - */ + /// The number of calls between the two clusters with both endpoints on + /// the same i-TLB page, assuming that a given pair of clusters gets merged double shortCalls(const Cluster *ClusterPred, const Cluster *ClusterSucc) const { double Calls = 0; @@ -389,18 +221,16 @@ class HFSortPlus { return Calls; } - /* - * The gain of merging two clusters. - * - * We assume that the final clusters are sorted by their density, and hence - * every cluster is likely to be adjacent with clusters of the same density. - * Thus, the 'hotness' of every cluster can be estimated by density*pageSize, - * which is used to compute the probability of cache misses for long calls - * of a given cluster. 
- * The result is also scaled by the size of the resulting cluster in order to - * increse the chance of merging short clusters, which is helpful for - * the i-cache performance. - */ + /// The gain of merging two clusters. + /// + /// We assume that the final clusters are sorted by their density, and hence + /// every cluster is likely to be adjacent with clusters of the same density. + /// Thus, the 'hotness' of every cluster can be estimated by density*pageSize, + /// which is used to compute the probability of cache misses for long calls + /// of a given cluster. + /// The result is also scaled by the size of the resulting cluster in order to + /// increase the chance of merging short clusters, which is helpful for + /// the i-cache performance. double mergeGain(const Cluster *ClusterPred, const Cluster *ClusterSucc) const { if (UseGainCache && GainCache.contains(ClusterPred, ClusterSucc)) { @@ -435,9 +265,7 @@ class HFSortPlus { return Gain; } - /* - * For every active cluster, compute its total weight of outgoing edges - */ + /// For every active cluster, compute its total weight of outgoing edges std::unordered_map computeOutgoingWeight() { std::unordered_map OutWeight; for (auto ClusterPred : Clusters) { @@ -456,9 +284,7 @@ class HFSortPlus { return OutWeight; } - /* - * Find pairs of clusters that call each other with high probability - */ + /// Find pairs of clusters that call each other with high probability std::vector> findClustersToMerge() { // compute total weight of outgoing edges for every cluster auto OutWeight = computeOutgoingWeight(); @@ -503,10 +329,8 @@ class HFSortPlus { return PairsToMerge; } - /* - * Run the first optimization pass of the hfsort+ algorithm: - * Merge clusters that call each other with high probability - */ + /// Run the first optimization pass of the hfsort+ algorithm: + /// Merge clusters that call each other with high probability void runPassOne() { while (Clusters.size() > 1) { // pairs of clusters that will be merged on 
this iteration @@ -523,11 +347,9 @@ class HFSortPlus { } } - /* - * Run the second optimization pass of the hfsort+ algorithm: - * Merge pairs of clusters while there is an improvement in the - * expected cache miss ratio - */ + /// Run the second optimization pass of the hfsort+ algorithm: + /// Merge pairs of clusters while there is an improvement in the + /// expected cache miss ratio void runPassTwo() { while (Clusters.size() > 1) { Cluster *BestClusterPred = nullptr; @@ -535,7 +357,7 @@ class HFSortPlus { double BestGain = -1; for (auto ClusterPred : Clusters) { // get candidates for merging with the current cluster - Adjacent.forallAdjacent( + Adjacent.forAllAdjacent( ClusterPred, // find the best candidate [&](Cluster *ClusterSucc) { @@ -565,9 +387,7 @@ class HFSortPlus { } } - /* - * Run hfsort+ algorithm and return ordered set of function clusters. - */ + /// Run hfsort+ algorithm and return ordered set of function clusters. std::vector run() { DEBUG(dbgs() << "Starting hfsort+ w/" << (UseGainCache ? 
"gain cache" : "no cache") @@ -602,19 +422,37 @@ class HFSortPlus { Addr(Cg.numNodes(), InvalidAddr), TotalSamples(0.0), Clusters(initializeClusters()), - Adjacent(Cg, Clusters, FuncCluster), + Adjacent(Cg.numNodes()), UseGainCache(UseGainCache), GainCache(Clusters.size()) { + // Initialize adjacency matrix + Adjacent.initialize(Clusters); + for (auto *A : Clusters) { + for (auto TargetId : A->targets()) { + for (auto Succ : Cg.successors(TargetId)) { + auto *B = FuncCluster[Succ]; + if (!B || B == A) continue; + const auto &Arc = *Cg.findArc(TargetId, Succ); + if (Arc.weight() > 0.0) + Adjacent.set(A, B); + } + for (auto Pred : Cg.predecessors(TargetId)) { + auto *B = FuncCluster[Pred]; + if (!B || B == A) continue; + const auto &Arc = *Cg.findArc(Pred, TargetId); + if (Arc.weight() > 0.0) + Adjacent.set(A, B); + } + } + } } -private: - /* - * Initialize the set of active clusters, function id to cluster mapping, - * total number of samples and function addresses. - */ +private: + /// Initialize the set of active clusters, function id to cluster mapping, + /// total number of samples and function addresses. std::vector initializeClusters() { - ITLBPageSize = opts::ITLBPageSizeParam; - ITLBEntries = opts::ITLBEntriesParam; + ITLBPageSize = opts::ITLBPageSize; + ITLBEntries = opts::ITLBEntries; // Initialize clusters std::vector Clusters; @@ -632,16 +470,8 @@ class HFSortPlus { return Clusters; } - /* - * Merge cluster From into cluster Into and update the list of active clusters - */ + /// Merge cluster From into cluster Into and update the list of active clusters void mergeClusters(Cluster *Into, Cluster *From) { - DEBUG( - if (opts::Verbosity > 0) { - dbgs() << "Merging cluster " << From->id() - << " into cluster " << Into->id() << "\n"; - }); - // The adjacency merge must happen before the Cluster::merge since that // clobbers the contents of From. 
Adjacent.merge(Into, From); @@ -690,7 +520,7 @@ class HFSortPlus { std::vector Clusters; // Cluster adjacency matrix - AdjacencyMatrix Adjacent; + AdjacencyMatrix Adjacent; // Use cache for mergeGain results bool UseGainCache; @@ -699,10 +529,10 @@ class HFSortPlus { // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs // containing both x and y and all clusters adjacent to x and y (and recompute // them on the next iteration). - mutable PrecomputedResults GainCache; + mutable ClusterPairCache GainCache; }; -} +} // end namespace anonymous std::vector hfsortPlus(CallGraph &Cg, bool UseGainCache) { // It is required that the sum of incoming arc weights is not greater diff --git a/bolt/Passes/ReorderAlgorithm.h b/bolt/Passes/ReorderAlgorithm.h index fd50a6c311e5..5be8a93f6f1f 100644 --- a/bolt/Passes/ReorderAlgorithm.h +++ b/bolt/Passes/ReorderAlgorithm.h @@ -243,6 +243,16 @@ class OptimizeCacheReorderAlgorithm : public ReorderAlgorithm { const BinaryFunction &BF, BasicBlockOrder &Order) const override; }; +/// A new reordering algorithm for basic blocks, cache+ +class CachePlusReorderAlgorithm : public ReorderAlgorithm { +public: + explicit CachePlusReorderAlgorithm( + std::unique_ptr CAlgo) : + ReorderAlgorithm(std::move(CAlgo)) { } + + void reorderBasicBlocks( + const BinaryFunction &BF, BasicBlockOrder &Order) const override; +}; /// Toy example that simply reverses the original basic block order. class ReverseReorderAlgorithm : public ReorderAlgorithm { diff --git a/bolt/Passes/ReorderUtils.h b/bolt/Passes/ReorderUtils.h new file mode 100644 index 000000000000..15c68d65e635 --- /dev/null +++ b/bolt/Passes/ReorderUtils.h @@ -0,0 +1,112 @@ +// Passes/ReorderUtils.h - Helper methods for function and block reordering // +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_UTILS_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_UTILS_H + +#include +#include + +#include "llvm/ADT/BitVector.h" + +namespace llvm { +namespace bolt { + +// This class maintains adjacency information for all Clusters being +// processed. It is used for visiting all neighbors of any given Cluster +// while merging pairs of Clusters. Every Cluster must implement the id() method +template class AdjacencyMatrix { +public: + explicit AdjacencyMatrix(size_t Size) : Bits(Size, BitVector(Size, false)) {} + + void initialize(std::vector &_Clusters) { Clusters = _Clusters; } + + template void forAllAdjacent(const Cluster *C, F Func) const { + const_cast(this)->forallAdjacent(C, Func); + } + + template void forAllAdjacent(const Cluster *C, F Func) { + for (auto I = Bits[C->id()].find_first(); I != -1; + I = Bits[C->id()].find_next(I)) { + Func(Clusters[I]); + } + } + + /// Merge adjacency info from cluster B into cluster A. Info for cluster B is + /// left in an undefined state. + void merge(const Cluster *A, const Cluster *B) { + Bits[A->id()] |= Bits[B->id()]; + Bits[A->id()][A->id()] = false; + Bits[A->id()][B->id()] = false; + Bits[B->id()][A->id()] = false; + for (auto I = Bits[B->id()].find_first(); I != -1; + I = Bits[B->id()].find_next(I)) { + Bits[I][A->id()] = true; + Bits[I][B->id()] = false; + } + } + + void set(const Cluster *A, const Cluster *B) { set(A, B, true); } + +private: + void set(const Cluster *A, const Cluster *B, bool Value) { + assert(A != B); + Bits[A->id()][B->id()] = Value; + Bits[B->id()][A->id()] = Value; + } + + std::vector Clusters; + std::vector Bits; +}; + +// This class holds cached results of specified type for a pair of Clusters. +// It can invalidate all cache entries associated with a given Cluster. 
+template class ClusterPairCache { +public: + explicit ClusterPairCache(size_t Size) + : Size(Size), Cache(Size * Size), Valid(Size * Size, false) {} + + bool contains(const Cluster *First, const Cluster *Second) const { + return Valid[index(First, Second)]; + } + + ValueType get(const Cluster *First, const Cluster *Second) const { + assert(contains(First, Second)); + return Cache[index(First, Second)]; + } + + void set(const Cluster *First, const Cluster *Second, ValueType Value) { + const auto Index = index(First, Second); + Cache[Index] = Value; + Valid[Index] = true; + } + + void invalidate(const Cluster *C) { + Valid.reset(C->id() * Size, (C->id() + 1) * Size); + for (size_t id = 0; id < Size; id++) { + Valid.reset((id * Size) + C->id()); + } + } + +private: + size_t index(const Cluster *First, const Cluster *Second) const { + return (First->id() * Size) + Second->id(); + } + + size_t Size; + std::vector Cache; + BitVector Valid; +}; + +} // namespace bolt +} // namespace llvm + +#endif From 22fdf7d623c8634008189e23e5dd34e2c7daa539 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Thu, 9 Nov 2017 16:59:18 -0800 Subject: [PATCH 364/904] [BOLT-AArch64] Support SPEC17 programs and organize AArch64 tests Summary: Add a few new relocation types to support a wider variety of binaries, add support for constant island duplication (so we can split functions in large binaries) and make LongJmp pass really precise with respect to layout, so we don't miss stubs insertions at the correct places for really large binaries. In LongJmp, introduce "freeze" annotations so fixBranches won't mess the jumps we carefully determined that needed a stub. 
(cherry picked from commit 6352ac688192b5a68301cb2627dc3f7a27bd4579) --- bolt/BinaryContext.cpp | 16 +++++ bolt/BinaryContext.h | 2 +- bolt/BinaryFunction.cpp | 58 ++++++++++++++--- bolt/BinaryFunction.h | 30 ++++++++- bolt/BinaryPassManager.cpp | 2 + bolt/Passes/BinaryPasses.cpp | 23 ++++--- bolt/Passes/BinaryPasses.h | 14 +++++ bolt/Passes/LongJmp.cpp | 117 +++++++++++++++++++++++++++-------- bolt/Passes/LongJmp.h | 7 ++- bolt/RewriteInstance.cpp | 29 ++++++++- 10 files changed, 253 insertions(+), 45 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 02821658b973..6be9c1cca959 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -565,7 +565,11 @@ size_t Relocation::getSizeForType(uint64_t Type) { case ELF::R_AARCH64_LDST16_ABS_LO12_NC: case ELF::R_AARCH64_LDST8_ABS_LO12_NC: case ELF::R_AARCH64_ADR_GOT_PAGE: + case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: case ELF::R_AARCH64_LD64_GOT_LO12_NC: + case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: + case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: + case ELF::R_AARCH64_TLSDESC_CALL: case ELF::R_AARCH64_JUMP26: case ELF::R_AARCH64_PREL32: return 4; @@ -585,12 +589,14 @@ uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents, return Contents; case ELF::R_AARCH64_PREL32: return static_cast(PC) + SignExtend64<32>(Contents & 0xffffffff); + case ELF::R_AARCH64_TLSDESC_CALL: case ELF::R_AARCH64_JUMP26: case ELF::R_AARCH64_CALL26: // Immediate goes in bits 25:0 of B and BL. 
Contents &= ~0xfffffffffc000000ULL; return static_cast(PC) + SignExtend64<28>(Contents << 2); case ELF::R_AARCH64_ADR_GOT_PAGE: + case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: case ELF::R_AARCH64_ADR_PREL_PG_HI21: { // Bits 32:12 of Symbol address goes in bits 30:29 + 23:5 of ADRP // instruction @@ -602,6 +608,7 @@ uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents, Contents &= ~0xfffUll; return Contents; } + case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: case ELF::R_AARCH64_LD64_GOT_LO12_NC: case ELF::R_AARCH64_LDST64_ABS_LO12_NC: { // Immediate goes in bits 21:10 of LD/ST instruction, taken @@ -609,6 +616,7 @@ uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents, Contents &= ~0xffffffffffc003ffU; return Contents >> (10 - 3); } + case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: case ELF::R_AARCH64_ADD_ABS_LO12_NC: { // Immediate goes in bits 21:10 of ADD instruction Contents &= ~0xffffffffffc003ffU; @@ -647,6 +655,10 @@ bool Relocation::isGOT(uint64_t Type) { return false; case ELF::R_AARCH64_ADR_GOT_PAGE: case ELF::R_AARCH64_LD64_GOT_LO12_NC: + case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: + case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: + case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: + case ELF::R_AARCH64_TLSDESC_CALL: return true; } } @@ -668,6 +680,8 @@ bool Relocation::isPCRelative(uint64_t Type) { case ELF::R_AARCH64_LDST16_ABS_LO12_NC: case ELF::R_AARCH64_LDST8_ABS_LO12_NC: case ELF::R_AARCH64_LD64_GOT_LO12_NC: + case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: + case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: return false; case ELF::R_X86_64_PC8: @@ -677,9 +691,11 @@ bool Relocation::isPCRelative(uint64_t Type) { case ELF::R_X86_64_GOTTPOFF: case ELF::R_X86_64_GOTPCRELX: case ELF::R_X86_64_REX_GOTPCRELX: + case ELF::R_AARCH64_TLSDESC_CALL: case ELF::R_AARCH64_CALL26: case ELF::R_AARCH64_ADR_PREL_PG_HI21: case ELF::R_AARCH64_ADR_GOT_PAGE: + case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: case ELF::R_AARCH64_JUMP26: case ELF::R_AARCH64_PREL32: return true; diff --git 
a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 69b26b1d5c89..37266ec6f6fe 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -340,7 +340,7 @@ class BinaryContext { SmallString<256> Code; SmallVector Fixups; raw_svector_ostream VecOS(Code); - if (MIA->isCFI(*Beg)) { + if (MIA->isCFI(*Beg) || MIA->isEHLabel(*Beg)) { ++Beg; continue; } diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index a6c75fea32b2..8f4df24254fd 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -913,6 +913,10 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { isInConstantIsland(TargetAddress)) { TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, "ISLANDat"); IslandSymbols[TargetAddress - getAddress()] = TargetSymbol; + if (!ColdIslandSymbols.count(TargetSymbol)) { + ColdIslandSymbols[TargetSymbol] = + Ctx->getOrCreateSymbol(TargetSymbol->getName() + ".cold"); + } } // Note that the address does not necessarily have to reside inside @@ -2033,6 +2037,9 @@ uint64_t BinaryFunction::getEditDistance() const { } void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { + if (EmitColdPart && hasConstantIsland()) + duplicateConstantIslands(); + int64_t CurrentGnuArgsSize = 0; for (auto BB : layout()) { if (EmitColdPart != BB->isCold()) @@ -2078,8 +2085,7 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { } } - if (!EmitColdPart) - emitConstantIslands(Streamer); + emitConstantIslands(Streamer, EmitColdPart); } void BinaryFunction::emitBodyRaw(MCStreamer *Streamer) { @@ -2140,11 +2146,15 @@ void BinaryFunction::emitBodyRaw(MCStreamer *Streamer) { } } -void BinaryFunction::emitConstantIslands(MCStreamer &Streamer) { +void BinaryFunction::emitConstantIslands(MCStreamer &Streamer, + bool EmitColdPart) { if (DataOffsets.empty()) return; - Streamer.EmitLabel(getFunctionConstantIslandLabel()); + if (!EmitColdPart) + Streamer.EmitLabel(getFunctionConstantIslandLabel()); + else + 
Streamer.EmitLabel(getFunctionColdConstantIslandLabel()); // Raw contents of the function. StringRef SectionContents; Section.getContents(SectionContents); @@ -2196,7 +2206,10 @@ void BinaryFunction::emitConstantIslands(MCStreamer &Streamer) { if (IS != IslandSymbols.end() && FunctionOffset == IS->first) { DEBUG(dbgs() << "BOLT-DEBUG: emitted label " << IS->second->getName() << " at offset 0x" << Twine::utohexstr(IS->first) << '\n'); - Streamer.EmitLabel(IS->second); + if (!EmitColdPart) + Streamer.EmitLabel(IS->second); + else + Streamer.EmitLabel(ColdIslandSymbols[IS->second]); ++IS; } if (RI != MoveRelocations.end() && FunctionOffset == RI->first) { @@ -2218,6 +2231,33 @@ void BinaryFunction::emitConstantIslands(MCStreamer &Streamer) { assert(IS == IslandSymbols.end() && "some symbols were not emitted!"); } +void BinaryFunction::duplicateConstantIslands() { + for (auto BB : layout()) { + if (!BB->isCold()) + continue; + + for (auto &Inst : *BB) { + int OpNum = 0; + for (auto &Operand : Inst) { + if (!Operand.isExpr()) { + ++OpNum; + continue; + } + const auto *Symbol = BC.MIA->getTargetSymbol(Inst, OpNum); + auto ISym = ColdIslandSymbols.find(Symbol); + if (ISym == ColdIslandSymbols.end()) + continue; + Operand = MCOperand::createExpr(BC.MIA->getTargetExprFor( + Inst, + MCSymbolRefExpr::create(ISym->second, MCSymbolRefExpr::VK_None, + *BC.Ctx), + *BC.Ctx, 0)); + ++OpNum; + } + } + } +} + namespace { #ifndef MAX_PATH @@ -2480,7 +2520,8 @@ void BinaryFunction::fixBranches() { assert(CondBranch && "conditional branch expected"); const auto *TSuccessor = BB->getConditionalSuccessor(true); const auto *FSuccessor = BB->getConditionalSuccessor(false); - if (NextBB && NextBB == TSuccessor) { + if (NextBB && NextBB == TSuccessor && + !BC.MIA->hasAnnotation(*CondBranch, "DoNotChangeTarget")) { std::swap(TSuccessor, FSuccessor); MIA->reverseBranchCondition(*CondBranch, TSuccessor->getLabel(), Ctx); BB->swapConditionalSuccessors(); @@ -2490,7 +2531,10 @@ void 
BinaryFunction::fixBranches() { if (TSuccessor == FSuccessor) { BB->removeDuplicateConditionalSuccessor(CondBranch); } - if (!NextBB || (NextBB != TSuccessor && NextBB != FSuccessor)) { + if (!NextBB || + ((NextBB != TSuccessor || + BC.MIA->hasAnnotation(*CondBranch, "DoNotChangeTarget")) && + NextBB != FSuccessor)) { BB->addBranchInstruction(FSuccessor); } } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 472890a6e327..7d755eff2b07 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -431,6 +431,7 @@ class BinaryFunction { /// The address offset where we emitted the constant island, that is, the /// chunk of data in the function code area (AArch only) int64_t OutputDataOffset; + int64_t OutputColdDataOffset; /// Map labels to corresponding basic blocks. std::unordered_map LabelToBB; @@ -639,6 +640,7 @@ class BinaryFunction { /// Offsets in function that are data values in a constant island identified /// after disassembling std::map IslandSymbols; + std::map ColdIslandSymbols; // Blocks are kept sorted in the layout order. If we need to change the // layout (if BasicBlocksLayout stores a different order than BasicBlocks), @@ -677,6 +679,7 @@ class BinaryFunction { mutable MCSymbol *FunctionColdEndLabel{nullptr}; mutable MCSymbol *FunctionConstantIslandLabel{nullptr}; + mutable MCSymbol *FunctionColdConstantIslandLabel{nullptr}; /// Unique number associated with the function. uint64_t FunctionNumber; @@ -1137,6 +1140,14 @@ class BinaryFunction { return FunctionConstantIslandLabel; } + MCSymbol *getFunctionColdConstantIslandLabel() const { + if (!FunctionColdConstantIslandLabel) { + FunctionColdConstantIslandLabel = + BC.Ctx->createTempSymbol("func_cold_const_island", true); + } + return FunctionColdConstantIslandLabel; + } + /// Return true if this is a function representing a PLT entry. 
bool isPLTFunction() const { return PLTSymbol != nullptr; @@ -1168,13 +1179,16 @@ class BinaryFunction { case ELF::R_X86_64_64: case ELF::R_AARCH64_ABS64: case ELF::R_AARCH64_LDST64_ABS_LO12_NC: + case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: case ELF::R_AARCH64_LD64_GOT_LO12_NC: + case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: case ELF::R_AARCH64_ADD_ABS_LO12_NC: case ELF::R_AARCH64_LDST16_ABS_LO12_NC: case ELF::R_AARCH64_LDST32_ABS_LO12_NC: case ELF::R_AARCH64_LDST8_ABS_LO12_NC: case ELF::R_AARCH64_LDST128_ABS_LO12_NC: case ELF::R_AARCH64_ADR_GOT_PAGE: + case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: case ELF::R_AARCH64_ADR_PREL_PG_HI21: Relocations.emplace(Offset, Relocation{Offset, Symbol, RelType, Addend, Value}); @@ -1186,6 +1200,7 @@ class BinaryFunction { case ELF::R_X86_64_REX_GOTPCRELX: case ELF::R_AARCH64_JUMP26: case ELF::R_AARCH64_CALL26: + case ELF::R_AARCH64_TLSDESC_CALL: break; // The following relocations are ignored. @@ -1714,6 +1729,14 @@ class BinaryFunction { return OutputDataOffset; } + void setOutputColdDataAddress(uint64_t Address) { + OutputColdDataOffset = Address; + } + + uint64_t getOutputColdDataAddress() const { + return OutputColdDataOffset; + } + /// Detects whether \p Address is inside a data region in this function /// (constant islands). bool isInConstantIsland(uint64_t Address) const { @@ -1969,7 +1992,12 @@ class BinaryFunction { void emitBodyRaw(MCStreamer *Streamer); /// Helper for emitBody to write data inside a function (used for AArch64) - void emitConstantIslands(MCStreamer &Streamer); + void emitConstantIslands(MCStreamer &Streamer, bool EmitColdPart); + + /// Traverse cold basic blocks and replace references to constants in islands + /// with a proxy symbol for the duplicated constant island that is going to be + /// emitted in the cold region. + void duplicateConstantIslands(); /// Merge profile data of this function into those of the given /// function. 
The functions should have been proven identical with diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 687c10497765..48d3d63dfc7c 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -447,6 +447,8 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass( llvm::make_unique(PrintAfterLowering)); + Manager.registerPass(llvm::make_unique(NeverPrint)); + Manager.runPasses(); } diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index b90cc0f10c52..03727de0f6d2 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -591,13 +591,6 @@ void FinalizeFunctions::runOnFunctions( auto &Function = It.second; const auto ShouldOptimize = shouldOptimize(Function); - // Strip all annotations. - for (auto &BB : Function) { - for (auto &Inst : BB) { - BC.MIA->removeAllAnnotations(Inst); - } - } - // Always fix functions in relocation mode. if (!BC.HasRelocations && !ShouldOptimize) continue; @@ -620,6 +613,22 @@ void FinalizeFunctions::runOnFunctions( } } +void StripAnnotations::runOnFunctions( + BinaryContext &BC, + std::map &BFs, + std::set & +) { + for (auto &It : BFs) { + auto &Function = It.second; + + for (auto &BB : Function) { + for (auto &Inst : BB) { + BC.MIA->removeAllAnnotations(Inst); + } + } + } +} + namespace { // This peephole fixes jump instructions that jump to another basic diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index 0ef8e9027d55..955e9cfe3f50 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -227,6 +227,20 @@ class FinalizeFunctions : public BinaryFunctionPass { std::set &LargeFunctions) override; }; +/// Strip all BOLT-related annotations before LLVM code emission +class StripAnnotations : public BinaryFunctionPass { + public: + explicit StripAnnotations(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { } + + const char *getName() const override { + return "strip-annotations"; + } + void 
runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + /// An optimization to simplify conditional tail calls by removing /// unnecessary branches. /// diff --git a/bolt/Passes/LongJmp.cpp b/bolt/Passes/LongJmp.cpp index a3d004649899..d6a60bbced17 100644 --- a/bolt/Passes/LongJmp.cpp +++ b/bolt/Passes/LongJmp.cpp @@ -82,9 +82,20 @@ LongJmpPass::replaceTargetWithStub(const BinaryContext &BC, BinaryBasicBlock::BinaryBranchInfo BI{0, 0}; auto *TgtBB = BB.getSuccessor(TgtSym, BI); - // Do not issue a long jmp for blocks in the same region - if (TgtBB && TgtBB->isCold() == BB.isCold()) - return nullptr; + // Do not issue a long jmp for blocks in the same region, except if + // the region is too large to fit in this branch + if (TgtBB && TgtBB->isCold() == BB.isCold()) { + // Suppose we have half the available space to account for increase in the + // function size due to extra blocks being inserted (conservative estimate) + auto BitsAvail = BC.MIA->getPCRelEncodingSize(Inst) - 2; + uint64_t Mask = ~((1ULL << BitsAvail) - 1); + if (!(Func.getMaxSize() & Mask)) + return nullptr; + // This is a special case for fixBranches, which is usually free to swap + // targets when a block has two successors. The other successor may not + // fit in this instruction as well. + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "DoNotChangeTarget", true); + } BinaryBasicBlock *StubBB = BB.isCold() ? 
ColdStubs[&Func][TgtSym] : HotStubs[&Func][TgtSym]; @@ -155,8 +166,12 @@ void LongJmpPass::insertStubs(const BinaryContext &BC, BinaryFunction &Func) { // Insert stubs close to the patched BB if call, but far away from the // hot path if a branch, since this branch target is the cold region BinaryBasicBlock *InsertionPoint = &BB; - if (!BC.MIA->isCall(Inst) && Frontier && !BB.isCold()) - InsertionPoint = Frontier; + if (!BC.MIA->isCall(Inst) && Frontier && !BB.isCold()) { + auto BitsAvail = BC.MIA->getPCRelEncodingSize(Inst) - 2; + uint64_t Mask = ~((1ULL << BitsAvail) - 1); + if (!(Func.getMaxSize() & Mask)) + InsertionPoint = Frontier; + } // Create a stub to handle a far-away target Insertions.emplace_back(std::make_pair( InsertionPoint, replaceTargetWithStub(BC, Func, BB, Inst))); @@ -190,43 +205,67 @@ void LongJmpPass::tentativeBBLayout(const BinaryContext &BC, } } -uint64_t LongJmpPass::tentativeLayoutRelocMode( +uint64_t LongJmpPass::tentativeLayoutRelocColdPart( const BinaryContext &BC, std::vector &SortedFunctions, uint64_t DotAddress) { - - // Hot for (auto Func : SortedFunctions) { + if (!Func->isSplit()) + continue; DotAddress = RoundUpToAlignment(DotAddress, BinaryFunction::MinAlign); auto Pad = OffsetToAlignment(DotAddress, opts::AlignFunctions); if (Pad <= opts::AlignFunctionsMaxBytes) DotAddress += Pad; - HotAddresses[Func] = DotAddress; - DEBUG(dbgs() << Func->getPrintName() - << " tentative: " << Twine::utohexstr(DotAddress) << "\n"); - if (!Func->isSimple()) { - DotAddress += Func->getMaxSize(); - } else { - if (!Func->isSplit()) { - DotAddress += Func->estimateSize(); - } else { - DotAddress += Func->estimateHotSize(); - DotAddress += Func->estimateConstantIslandSize(); - } + ColdAddresses[Func] = DotAddress; + DEBUG(dbgs() << Func->getPrintName() << " cold tentative: " + << Twine::utohexstr(DotAddress) << "\n"); + DotAddress += Func->estimateColdSize(); + DotAddress += Func->estimateConstantIslandSize(); + } + return DotAddress; +} + +uint64_t 
LongJmpPass::tentativeLayoutRelocMode( + const BinaryContext &BC, std::vector &SortedFunctions, + uint64_t DotAddress) { + + // Compute hot cold frontier + uint32_t LastHotIndex = -1u; + uint32_t CurrentIndex = 0; + for (auto *BF : SortedFunctions) { + if (!BF->hasValidIndex() && LastHotIndex == -1u) { + LastHotIndex = CurrentIndex; } + ++CurrentIndex; } - // Cold + + // Hot + CurrentIndex = 0; + bool ColdLayoutDone = false; for (auto Func : SortedFunctions) { + if (!ColdLayoutDone && CurrentIndex >= LastHotIndex){ + DotAddress = + tentativeLayoutRelocColdPart(BC, SortedFunctions, DotAddress); + ColdLayoutDone = true; + } + DotAddress = RoundUpToAlignment(DotAddress, BinaryFunction::MinAlign); auto Pad = OffsetToAlignment(DotAddress, opts::AlignFunctions); if (Pad <= opts::AlignFunctionsMaxBytes) DotAddress += Pad; - HotAddresses[Func] = Func->getAddress(); - DotAddress = RoundUpToAlignment(DotAddress, ColdFragAlign); - ColdAddresses[Func] = DotAddress; - if (Func->isSplit()) - DotAddress += Func->estimateColdSize(); - tentativeBBLayout(BC, *Func); + HotAddresses[Func] = DotAddress; + DEBUG(dbgs() << Func->getPrintName() + << " tentative: " << Twine::utohexstr(DotAddress) << "\n"); + if (!Func->isSplit()) + DotAddress += Func->estimateSize(); + else + DotAddress += Func->estimateHotSize(); + DotAddress += Func->estimateConstantIslandSize(); + ++CurrentIndex; } + // BBs + for (auto Func : SortedFunctions) + tentativeBBLayout(BC, *Func); + return DotAddress; } @@ -337,6 +376,30 @@ bool LongJmpPass::removeOrShrinkStubs(const BinaryContext &BC, continue; } + // Compute DoNotChangeTarget annotation, when fixBranches cannot swap + // targets + if (BC.MIA->isConditionalBranch(Inst) && BB.succ_size() == 2) { + auto *SuccBB = BB.getConditionalSuccessor(false); + bool IsStub = false; + auto Iter = Stubs.find(&Func); + if (Iter != Stubs.end()) + IsStub = Iter->second.count(SuccBB); + auto *RealTargetSym = + IsStub ? 
BC.MIA->getTargetSymbol(*SuccBB->begin()) : nullptr; + if (IsStub) + SuccBB = Func.getBasicBlockForLabel(RealTargetSym); + uint64_t Offset = getSymbolAddress(BC, RealTargetSym, SuccBB); + auto BitsAvail = BC.MIA->getPCRelEncodingSize(Inst) - 1; + uint64_t Mask = ~((1ULL << BitsAvail) - 1); + if ((Offset & Mask) && + !BC.MIA->hasAnnotation(Inst, "DoNotChangeTarget")) { + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "DoNotChangeTarget", true); + } else if ((!(Offset & Mask)) && + BC.MIA->hasAnnotation(Inst, "DoNotChangeTarget")) { + BC.MIA->removeAnnotation(Inst, "DoNotChangeTarget"); + } + } + auto StubSym = BC.MIA->getTargetSymbol(Inst); auto *StubBB = Func.getBasicBlockForLabel(StubSym); auto *RealTargetSym = BC.MIA->getTargetSymbol(*StubBB->begin()); diff --git a/bolt/Passes/LongJmp.h b/bolt/Passes/LongJmp.h index e54cc1ccb2b7..e771b6767076 100644 --- a/bolt/Passes/LongJmp.h +++ b/bolt/Passes/LongJmp.h @@ -83,6 +83,10 @@ class LongJmpPass : public BinaryFunctionPass { tentativeLayoutRelocMode(const BinaryContext &BC, std::vector &SortedFunctions, uint64_t DotAddress); + uint64_t + tentativeLayoutRelocColdPart(const BinaryContext &BC, + std::vector &SortedFunctions, + uint64_t DotAddress); void tentativeBBLayout(const BinaryContext &BC, const BinaryFunction &Func); /// Helper to identify whether \p Inst is branching to a stub @@ -92,7 +96,8 @@ class LongJmpPass : public BinaryFunctionPass { /// Helper to resolve a symbol address according to our tentative layout uint64_t getSymbolAddress(const BinaryContext &BC, const MCSymbol *Target, const BinaryBasicBlock *TgtBB) const; - /// Change \p Inst to not use a stub anymore, back to its original form + + /// Change \p Inst to do not use a stub anymore, back to its original form void removeStubRef(const BinaryContext &BC, BinaryBasicBlock *BB, MCInst &Inst, BinaryBasicBlock *StubBB, diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 22ffe9f273c0..518b9e9df5c7 100644 --- a/bolt/RewriteInstance.cpp +++ 
b/bolt/RewriteInstance.cpp @@ -1846,7 +1846,10 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { uint64_t RefFunctionOffset = 0; MCSymbol *ReferencedSymbol = nullptr; if (ForceRelocation) { - ReferencedSymbol = BC->registerNameAtAddress(SymbolName, 0); + if (Relocation::isGOT(Rel.getType())) + ReferencedSymbol = BC->getOrCreateGlobalSymbol(0, "Zero"); + else + ReferencedSymbol = BC->registerNameAtAddress(SymbolName, 0); Addend = Address; DEBUG(dbgs() << "BOLT-DEBUG: creating relocations for huge pages against" " symbol " << SymbolName << " with addend " << Addend @@ -2633,6 +2636,11 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { const auto ColdEndOffset = Layout.getSymbolOffset(*ColdEndSymbol); Function.cold().setAddress(BaseAddress + ColdStartOffset); Function.cold().setImageSize(ColdEndOffset - ColdStartOffset); + if (Function.hasConstantIsland()) { + const auto DataOffset = Layout.getSymbolOffset( + *Function.getFunctionColdConstantIslandLabel()); + Function.setOutputColdDataAddress(BaseAddress + DataOffset); + } } } else { Function.setOutputAddress(Function.getAddress()); @@ -3379,6 +3387,25 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { Write(0, reinterpret_cast(&CodeMarkSym), sizeof(CodeMarkSym)); } + if (!PatchExisting && Function->hasConstantIsland() && + Function->isSplit()) { + auto DataMark = Function->getOutputColdDataAddress(); + auto CISize = Function->estimateConstantIslandSize(); + auto CodeMark = DataMark + CISize; + auto DataMarkSym = NewSymbol; + DataMarkSym.st_name = AddToStrTab("$d"); + DataMarkSym.st_value = DataMark; + DataMarkSym.st_size = 0; + DataMarkSym.setType(ELF::STT_NOTYPE); + DataMarkSym.setBinding(ELF::STB_LOCAL); + auto CodeMarkSym = DataMarkSym; + CodeMarkSym.st_name = AddToStrTab("$x"); + CodeMarkSym.st_value = CodeMark; + Write(0, reinterpret_cast(&DataMarkSym), + sizeof(DataMarkSym)); + Write(0, reinterpret_cast(&CodeMarkSym), + sizeof(CodeMarkSym)); + } } else { if 
(NewSymbol.st_shndx < ELF::SHN_LORESERVE) { NewSymbol.st_shndx = NewSectionIndex[NewSymbol.st_shndx]; From c4a6b969376946ef76bfed413d5fdae21edcb648 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 13 Dec 2017 23:12:01 -0800 Subject: [PATCH 365/904] [BOLT] New profile format Summary: A new profile that is more resilient to minor binary modifications. BranchData is eliminated. For calls, the data is converted into instruction annotations if the profile matches a function. If a profile cannot be matched, AllCallSites data should have call sites profiles. The new profile format is YAML, which is quite verbose. It still takes less space than the older format because we avoid function name repetition. The plan is to get rid of the old profile format eventually. merge-fdata does not work with the new format yet. (cherry picked from commit 19f6fdf87e02e277d0b96f8ab06e5a38d8cbe5bb) --- bolt/BinaryBasicBlock.h | 7 +- bolt/BinaryFunction.cpp | 68 +++++- bolt/BinaryFunction.h | 78 ++++++- bolt/BinaryFunctionProfile.cpp | 130 ++++++++---- bolt/CMakeLists.txt | 2 + bolt/DataAggregator.cpp | 3 +- bolt/Passes/BinaryFunctionCallGraph.cpp | 59 +++--- bolt/Passes/IndirectCallPromotion.cpp | 87 ++++---- bolt/Passes/IndirectCallPromotion.h | 14 +- bolt/Passes/PLTCall.cpp | 2 +- bolt/ProfileReader.cpp | 265 ++++++++++++++++++++++++ bolt/ProfileReader.h | 68 ++++++ bolt/ProfileWriter.cpp | 174 ++++++++++++++++ bolt/ProfileWriter.h | 53 +++++ bolt/ProfileYAMLMapping.h | 147 +++++++++++++ bolt/RewriteInstance.cpp | 67 ++++-- bolt/RewriteInstance.h | 4 +- 17 files changed, 1061 insertions(+), 167 deletions(-) create mode 100644 bolt/ProfileReader.cpp create mode 100644 bolt/ProfileReader.h create mode 100644 bolt/ProfileWriter.cpp create mode 100644 bolt/ProfileWriter.h create mode 100644 bolt/ProfileYAMLMapping.h diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 52db09c8a8ed..90b4f11cdc7f 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ 
-342,12 +342,17 @@ class BinaryBasicBlock { /// an unconditional branch) and thus has 2 successors, return a successor /// corresponding to a jump condition which could be true or false. /// Return nullptr if the basic block does not have a conditional jump. - const BinaryBasicBlock *getConditionalSuccessor(bool Condition) const { + BinaryBasicBlock *getConditionalSuccessor(bool Condition) { if (succ_size() != 2) return nullptr; return Successors[Condition == true ? 0 : 1]; } + const BinaryBasicBlock *getConditionalSuccessor(bool Condition) const { + return + const_cast(this)->getConditionalSuccessor(Condition); + } + /// Find the fallthrough successor for a block, or nullptr if there is /// none. const BinaryBasicBlock* getFallthrough() const { diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 8f4df24254fd..3820f7713f4f 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1466,7 +1466,7 @@ bool BinaryFunction::buildCFG() { for (auto I = Instructions.begin(), E = Instructions.end(); I != E; ++I) { const auto Offset = I->first; - const auto &Instr = I->second; + auto &Instr = I->second; auto LI = Labels.find(Offset); if (LI != Labels.end()) { @@ -1819,6 +1819,11 @@ uint64_t BinaryFunction::getFunctionScore() { if (FunctionScore != -1) return FunctionScore; + if (!isSimple() || !hasValidProfile()) { + FunctionScore = 0; + return FunctionScore; + } + uint64_t TotalScore = 0ULL; for (auto BB : layout()) { uint64_t BBExecCount = BB->getExecutionCount(); @@ -2620,6 +2625,41 @@ void BinaryFunction::postProcessBranches() { assert(validateCFG() && "invalid CFG"); } +const MCSymbol *BinaryFunction::getSymbolForEntry(uint64_t EntryNum) const { + if (EntryNum == 0) + return getSymbol(); + + if (!isMultiEntry()) + return nullptr; + + uint64_t NumEntries = 0; + for (auto *BB : BasicBlocks) { + if (!BB->isEntryPoint()) + continue; + if (NumEntries == EntryNum) + return BB->getLabel(); + ++NumEntries; + } + + return nullptr; +} + +uint64_t 
BinaryFunction::getEntryForSymbol(const MCSymbol *EntrySymbol) const { + if (getSymbol() == EntrySymbol) + return 0; + + uint64_t NumEntries = 0; + for (const auto *BB : BasicBlocks) { + if (!BB->isEntryPoint()) + continue; + if (BB->getLabel() == EntrySymbol) + return NumEntries; + ++NumEntries; + } + + llvm_unreachable("no entry for symbol"); +} + BinaryFunction::BasicBlockOrderType BinaryFunction::dfs() const { BasicBlockOrderType DFS; unsigned Index = 0; @@ -2649,8 +2689,24 @@ BinaryFunction::BasicBlockOrderType BinaryFunction::dfs() const { Stack.push(SuccBB); } - for (auto *SuccBB : BB->successors()) { - Stack.push(SuccBB); + const MCSymbol *TBB = nullptr; + const MCSymbol *FBB = nullptr; + MCInst *CondBranch = nullptr; + MCInst *UncondBranch = nullptr; + if (BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch) && + CondBranch && BB->succ_size() == 2) { + if (BC.MIA->getCanonicalBranchOpcode(CondBranch->getOpcode()) == + CondBranch->getOpcode()) { + Stack.push(BB->getConditionalSuccessor(true)); + Stack.push(BB->getConditionalSuccessor(false)); + } else { + Stack.push(BB->getConditionalSuccessor(false)); + Stack.push(BB->getConditionalSuccessor(true)); + } + } else { + for (auto *SuccBB : BB->successors()) { + Stack.push(SuccBB); + } } } @@ -2826,6 +2882,9 @@ bool BinaryFunction::equalJumpTables(const JumpTable *JumpTableA, } std::size_t BinaryFunction::hash(bool Recompute, bool UseDFS) const { + if (size() == 0) + return 0; + assert(hasCFG() && "function is expected to have CFG"); if (!Recompute) @@ -3687,13 +3746,14 @@ DynoStats BinaryFunction::getDynoStats() const { Stats[DynoStats::INDIRECT_CALLS] += CallFreq; } else if (const auto *CallSymbol = BC.MIA->getTargetSymbol(Instr)) { const auto *BF = BC.getFunctionForSymbol(CallSymbol); - if (BF && BF->isPLTFunction()) + if (BF && BF->isPLTFunction()) { Stats[DynoStats::PLT_CALLS] += CallFreq; // We don't process PLT functions and hence have to adjust // relevant dynostats here. 
Stats[DynoStats::LOADS] += CallFreq; Stats[DynoStats::INDIRECT_CALLS] += CallFreq; + } } } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 7d755eff2b07..5ed98c22da13 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -164,6 +164,40 @@ enum IndirectCallPromotionType : char { ICP_ALL /// Perform ICP on calls and jump tables. }; +/// Information on a single indirect call to a particular callee. +struct IndirectCallProfile { + bool IsFunction; + uint32_t Offset; + StringRef Name; + uint64_t Count; + uint64_t Mispreds; + + IndirectCallProfile(bool IsFunction, StringRef Name, uint64_t Count, + uint64_t Mispreds, uint32_t Offset = 0) + : IsFunction(IsFunction), Offset(Offset), Name(Name), Count(Count), + Mispreds(Mispreds) {} + + bool operator==(const IndirectCallProfile &Other) const { + return IsFunction == Other.IsFunction && + Name == Other.Name && + Offset == Other.Offset; + } +}; + +/// Aggregated information for an indirect call site. +using IndirectCallSiteProfile = SmallVector; + +inline raw_ostream &operator<<(raw_ostream &OS, + const bolt::IndirectCallSiteProfile &ICSP) { + const char *Sep = ""; + for (auto &CSP : ICSP) { + OS << Sep << "{ " << (CSP.IsFunction ? CSP.Name : "") << ": " + << CSP.Count << " (" << CSP.Mispreds << " misses) }"; + Sep = ", "; + } + return OS; +} + /// BinaryFunction is a representation of machine-level function. /// /// We use the term "Binary" as "Machine" was already taken. @@ -294,6 +328,14 @@ class BinaryFunction { /// Profile match ratio for BranchData. float ProfileMatchRatio{0.0f}; + /// Indicates if function profile was collected using LBRs. + bool HasLBRProfile{true}; + + /// For functions with mismatched profile we store all call profile + /// information at a function level (as opposed to tying it to + /// specific call sites). + IndirectCallSiteProfile AllCallSites; + /// Score of the function (estimated number of instructions executed, /// according to profile data). 
-1 if the score has not been calculated yet. int64_t FunctionScore{-1}; @@ -511,11 +553,11 @@ class BinaryFunction { /// function and that apply before the entry basic block). CFIInstrMapType CIEFrameInstructions; +public: /// Representation of a jump table. /// /// The jump table may include other jump tables that are referenced by /// a different label at a different offset in this jump table. -public: struct JumpTable { enum JumpTableType : char { JTT_NORMAL, @@ -745,10 +787,6 @@ class BinaryFunction { Instructions.emplace(Offset, std::forward(Instruction)); } - /// Return instruction at a given offset in the function. Valid before - /// CFG is constructed or while instruction offsets are available in CFG. - MCInst *getInstructionAtOffset(uint64_t Offset); - /// Analyze and process indirect branch \p Instruction before it is /// added to Instructions list. IndirectBranchType processIndirectBranch(MCInst &Instruction, @@ -978,6 +1016,10 @@ class BinaryFunction { return nullptr; } + /// Return instruction at a given offset in the function. Valid before + /// CFG is constructed or while instruction offsets are available in CFG. + MCInst *getInstructionAtOffset(uint64_t Offset); + /// Return the name of the function as extracted from the binary file. /// If the function has multiple names - return the last one /// followed by "(*#)". @@ -1102,6 +1144,13 @@ class BinaryFunction { return OutputSymbol; } + /// Return MC symbol corresponding to an enumerated entry for multiple-entry + /// functions. + const MCSymbol *getSymbolForEntry(uint64_t EntryNum) const; + + /// Return an entry ID corresponding to a symbol. + uint64_t getEntryForSymbol(const MCSymbol *EntrySymbol) const; + MCSymbol *getColdSymbol() { if (ColdSymbol) return ColdSymbol; @@ -1895,6 +1944,15 @@ class BinaryFunction { MemData = Data; } + /// Return all call site profile info for this function. 
+ IndirectCallSiteProfile &getAllCallSites() { + return AllCallSites; + } + + const IndirectCallSiteProfile &getAllCallSites() const { + return AllCallSites; + } + /// Walks the list of basic blocks filling in missing information about /// edge frequency for fall-throughs. /// @@ -2004,6 +2062,9 @@ class BinaryFunction { /// isIdenticalWith. void mergeProfileDataInto(BinaryFunction &BF) const; + /// Convert function-level branch data into instruction annotations. + void convertBranchData(); + /// Returns true if this function has identical code and CFG with /// the given function \p BF. /// @@ -2303,6 +2364,13 @@ template <> struct GraphTraits> : } }; +template <> +class MCAnnotationPrinter { +public: + void print(raw_ostream &OS, const bolt::IndirectCallSiteProfile &ICSP) const { + OS << ICSP; + } +}; } // namespace llvm diff --git a/bolt/BinaryFunctionProfile.cpp b/bolt/BinaryFunctionProfile.cpp index 66bf634ef6e9..30dc96e72ae5 100644 --- a/bolt/BinaryFunctionProfile.cpp +++ b/bolt/BinaryFunctionProfile.cpp @@ -261,7 +261,8 @@ bool BinaryFunction::recordBranch(uint64_t From, uint64_t To, if (!FromBB->getSuccessor(ToBB->getLabel())) { // Check if this is a recursive call or a return from a recursive call. - if (ToBB->isEntryPoint()) { + if (ToBB->isEntryPoint() && (BC.MIA->isCall(*FromInstruction) || + BC.MIA->isIndirectBranch(*FromInstruction))) { // Execution count is already accounted for. 
return true; } @@ -289,8 +290,18 @@ bool BinaryFunction::recordEntry(uint64_t To, bool Mispred, uint64_t Count) { if (!hasProfile()) ExecutionCount = 0; - if (To == 0) + BinaryBasicBlock *EntryBB = nullptr; + if (To == 0) { ExecutionCount += Count; + if (!empty()) + EntryBB = &front(); + } else if (auto *BB = getBasicBlockAtOffset(To)) { + if (BB->isEntryPoint()) + EntryBB = BB; + } + + if (EntryBB) + EntryBB->setExecutionCount(EntryBB->getKnownExecutionCount() + Count); return true; } @@ -319,8 +330,7 @@ void BinaryFunction::postProcessProfile() { return; } - // Is we are using non-LBR sampling there's nothing left to do. - if (!BranchData) + if (!HasLBRProfile) return; // Bug compatibility with previous version - double accounting for conditional @@ -339,7 +349,8 @@ void BinaryFunction::postProcessProfile() { } // Pre-sort branch data. - std::stable_sort(BranchData->Data.begin(), BranchData->Data.end()); + if (BranchData) + std::stable_sort(BranchData->Data.begin(), BranchData->Data.end()); // If we have at least some branch data for the function indicate that it // was executed. @@ -347,39 +358,22 @@ void BinaryFunction::postProcessProfile() { ExecutionCount = 1; } - // Compute preliminary execution count for each basic block + // Compute preliminary execution count for each basic block. 
for (auto *BB : BasicBlocks) { - BB->ExecutionCount = 0; + if ((!BB->isEntryPoint() && !BB->isLandingPad()) || + BB->ExecutionCount == BinaryBasicBlock::COUNT_NO_PROFILE) + BB->ExecutionCount = 0; } for (auto *BB : BasicBlocks) { auto SuccBIIter = BB->branch_info_begin(); for (auto Succ : BB->successors()) { - if (SuccBIIter->Count != BinaryBasicBlock::COUNT_NO_PROFILE) + if (!Succ->isEntryPoint() && + SuccBIIter->Count != BinaryBasicBlock::COUNT_NO_PROFILE) Succ->setExecutionCount(Succ->getExecutionCount() + SuccBIIter->Count); ++SuccBIIter; } } - // Set entry BBs to zero, we'll update their execution count next with entry - // data (we maintain a separate data structure for branches to function entry - // points) - for (auto *BB : BasicBlocks) { - if (BB->isEntryPoint()) - BB->ExecutionCount = 0; - } - - // Update execution counts of landing pad blocks and entry BBs - // There is a slight skew introduced here as branches originated from RETs - // may be accounted for in the execution count of an entry block if the last - // instruction in a predecessor fall-through block is a call. This situation - // should rarely happen because there are few multiple-entry functions. - for (const auto &I : BranchData->EntryData) { - BinaryBasicBlock *BB = getBasicBlockAtOffset(I.To.Offset); - if (BB && (BB->isEntryPoint() || BB->isLandingPad())) { - BB->setExecutionCount(BB->getExecutionCount() + I.Branches); - } - } - inferFallThroughCounts(); // Update profile information for jump tables based on CFG branch data. @@ -442,6 +436,7 @@ void BinaryFunction::readProfile() { return; if (!BC.DR.hasLBR()) { + HasLBRProfile = false; readSampleData(); return; } @@ -452,6 +447,23 @@ void BinaryFunction::readProfile() { if (!BranchData) return; + // Assign basic block counts to function entry points. These only include + // counts for outside entries. 
+ // + // There is a slight skew introduced here as branches originated from RETs + // may be accounted for in the execution count of an entry block if the last + // instruction in a predecessor fall-through block is a call. This situation + // should rarely happen because there are few multiple-entry functions. + for (const auto &BI : BranchData->EntryData) { + BinaryBasicBlock *BB = getBasicBlockAtOffset(BI.To.Offset); + if (BB && (BB->isEntryPoint() || BB->isLandingPad())) { + auto Count = BB->getExecutionCount(); + if (Count == BinaryBasicBlock::COUNT_NO_PROFILE) + Count = 0; + BB->setExecutionCount(Count + BI.Branches); + } + } + uint64_t MismatchedBranches = 0; for (const auto &BI : BranchData->Data) { if (BI.From.Name != BI.To.Name) { @@ -466,25 +478,59 @@ void BinaryFunction::readProfile() { } } - // Special profile data propagation is required for conditional tail calls. - for (auto BB : BasicBlocks) { - auto *CTCInstr = BB->getLastNonPseudoInstr(); - if (!CTCInstr || !BC.MIA->getConditionalTailCall(*CTCInstr)) - continue; + // Convert branch data into annotations. + convertBranchData(); +} - auto OffsetOrErr = - BC.MIA->tryGetAnnotationAs(*CTCInstr, "Offset"); - assert(OffsetOrErr && "offset not set for conditional tail call"); +void BinaryFunction::convertBranchData() { + if (!BranchData || empty()) + return; + + // Profile information for calls. + // + // There are 3 cases that we annotate differently: + // 1) Conditional tail calls that could be mispredicted. + // 2) Indirect calls to multiple destinations with mispredictions. + // Before we validate CFG we have to handle indirect branches here too. + // 3) Regular direct calls. The count could be different from containing + // basic block count. Keep this data in case we find it useful. + // + for (auto &BI : BranchData->Data) { + // Ignore internal branches. 
+ if (BI.To.IsSymbol && BI.To.Name == BI.From.Name && BI.To.Offset != 0) + continue; - auto BranchInfoOrErr = BranchData->getDirectCallBranch(*OffsetOrErr); - if (!BranchInfoOrErr) + auto *Instr = getInstructionAtOffset(BI.From.Offset); + if (!Instr || + (!BC.MIA->isCall(*Instr) && !BC.MIA->isIndirectBranch(*Instr))) continue; - BC.MIA->addAnnotation(BC.Ctx.get(), *CTCInstr, "CTCTakenCount", - BranchInfoOrErr->Branches); - BC.MIA->addAnnotation(BC.Ctx.get(), *CTCInstr, "CTCMispredCount", - BranchInfoOrErr->Mispreds); + auto setOrUpdateAnnotation = [&](StringRef Name, uint64_t Count) { + if (opts::Verbosity >= 1 && BC.MIA->hasAnnotation(*Instr, Name)) { + errs() << "BOLT-WARNING: duplicate " << Name << " info for offset 0x" + << Twine::utohexstr(BI.From.Offset) + << " in function " << *this << '\n'; + } + auto &Value = BC.MIA->getOrCreateAnnotationAs(BC.Ctx.get(), + *Instr, Name); + Value += Count; + }; + + if (BC.MIA->isIndirectCall(*Instr) || BC.MIA->isIndirectBranch(*Instr)) { + IndirectCallSiteProfile &CSP = + BC.MIA->getOrCreateAnnotationAs(BC.Ctx.get(), + *Instr, "CallProfile"); + CSP.emplace_back(BI.To.IsSymbol, BI.To.Name, BI.Branches, + BI.Mispreds); + } else if (BC.MIA->getConditionalTailCall(*Instr)) { + setOrUpdateAnnotation("CTCTakenCount", BI.Branches); + setOrUpdateAnnotation("CTCMispredCount", BI.Mispreds); + } else { + setOrUpdateAnnotation("Count", BI.Branches); + } } + + BranchData = nullptr; } void BinaryFunction::mergeProfileDataInto(BinaryFunction &BF) const { diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 959b19915f10..5e6ce6fe7b8c 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -71,6 +71,8 @@ add_llvm_tool(llvm-bolt DebugData.cpp DWARFRewriter.cpp Exceptions.cpp + ProfileReader.cpp + ProfileWriter.cpp RewriteInstance.cpp ) diff --git a/bolt/DataAggregator.cpp b/bolt/DataAggregator.cpp index a964c73069e4..1b039c44225a 100644 --- a/bolt/DataAggregator.cpp +++ b/bolt/DataAggregator.cpp @@ -746,8 +746,7 @@ 
std::error_code DataAggregator::parseBranchEvents() { ++NumSamples; NumEntries += Sample.LBR.size(); - // Parser semantic actions - // LBRs are stored in reverse execution order. NextLBR refers to next + // LBRs are stored in reverse execution order. NextLBR refers to the next // executed branch record. const LBREntry *NextLBR{nullptr}; for (const auto &LBR : Sample.LBR) { diff --git a/bolt/Passes/BinaryFunctionCallGraph.cpp b/bolt/Passes/BinaryFunctionCallGraph.cpp index 24dc378e1e4c..2b49f323cfab 100644 --- a/bolt/Passes/BinaryFunctionCallGraph.cpp +++ b/bolt/Passes/BinaryFunctionCallGraph.cpp @@ -134,7 +134,6 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, continue; } - const auto *BranchData = Function->getBranchData(); const auto SrcId = lookupNode(Function); // Offset of the current basic block from the beginning of the function uint64_t Offset = 0; @@ -166,25 +165,6 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, return false; }; - auto getCallInfoFromBranchData = [&](const BranchInfo &BI, bool IsStale) { - MCSymbol *DstSym = nullptr; - uint64_t Count; - if (BI.To.IsSymbol && (DstSym = BC.getGlobalSymbolByName(BI.To.Name))) { - Count = BI.Branches; - } else { - Count = COUNT_NO_PROFILE; - } - // If we are using the perf data for a stale function we need to filter - // out data which comes from branches. We'll assume that the To offset - // is non-zero for branches. - if (IsStale && BI.To.Offset != 0 && - (!DstSym || Function == BC.getFunctionForSymbol(DstSym))) { - DstSym = nullptr; - Count = COUNT_NO_PROFILE; - } - return std::make_pair(DstSym, Count); - }; - // Get pairs of (symbol, count) for each target at this callsite. // If the call is to an unknown function the symbol will be nullptr. // If there is no profiling data the count will be COUNT_NO_PROFILE. 
@@ -193,12 +173,15 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, const auto *DstSym = BC.MIA->getTargetSymbol(Inst); // If this is an indirect call use perf data directly. - if (!DstSym && BranchData && - BC.MIA->hasAnnotation(Inst, "Offset")) { - const auto InstrOffset = - BC.MIA->getAnnotationAs(Inst, "Offset"); - for (const auto &BI : BranchData->getBranchRange(InstrOffset)) { - Counts.push_back(getCallInfoFromBranchData(BI, false)); + if (!DstSym && BC.MIA->hasAnnotation(Inst, "CallProfile")) { + const auto &ICSP = + BC.MIA->getAnnotationAs(Inst, "CallProfile"); + for (const auto &CSI : ICSP) { + if (!CSI.IsFunction) + continue; + if (auto DstSym = BC.getGlobalSymbolByName(CSI.Name)) { + Counts.push_back(std::make_pair(DstSym, CSI.Count)); + } } } else { const auto Count = BB->getExecutionCount(); @@ -211,23 +194,29 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, // If the function has an invalid profile, try to use the perf data // directly (if requested). If there is no perf data for this function, // fall back to the CFG walker which attempts to handle missing data. - if (!Function->hasValidProfile() && CgFromPerfData && BranchData) { + if (!Function->hasValidProfile() && CgFromPerfData && + !Function->getAllCallSites().empty()) { DEBUG(dbgs() << "BOLT-DEBUG: buildCallGraph: Falling back to perf data" << " for " << *Function << "\n"); ++NumFallbacks; const auto Size = functionSize(Function); - for (const auto &BI : BranchData->Data) { - Offset = BI.From.Offset; + for (const auto &CSI : Function->getAllCallSites()) { + ++TotalCallsites; + + if (!CSI.IsFunction) + continue; + + auto *DstSym = BC.getGlobalSymbolByName(CSI.Name); + if (!DstSym) + continue; + // The computed offset may exceed the hot part of the function; hence, - // bound it the size + // bound it by the size. 
+ Offset = CSI.Offset; if (Offset > Size) Offset = Size; - const auto CI = getCallInfoFromBranchData(BI, true); - if (!CI.first && CI.second == COUNT_NO_PROFILE) // probably a branch - continue; - ++TotalCallsites; - if (!recordCall(CI.first, CI.second)) { + if (!recordCall(DstSym, CSI.Count)) { ++NotProcessed; } } diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/Passes/IndirectCallPromotion.cpp index f76564355c7b..299a339a69ad 100644 --- a/bolt/Passes/IndirectCallPromotion.cpp +++ b/bolt/Passes/IndirectCallPromotion.cpp @@ -142,21 +142,13 @@ namespace llvm { namespace bolt { IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF, - const BranchInfo &BI) -: From(BF.getSymbol()), - To(uint64_t(BI.To.Offset)), - Mispreds{uint64_t(BI.Mispreds)}, - Branches{uint64_t(BI.Branches)}, - Histories{BI.Histories} { - if (BI.To.IsSymbol) { - auto &BC = BF.getBinaryContext(); - auto Itr = BC.GlobalSymbols.find(BI.To.Name); - if (Itr != BC.GlobalSymbols.end()) { - To.IsSymbol = true; - To.Sym = BC.getOrCreateGlobalSymbol(Itr->second, "FUNCat"); - To.Addr = 0; - assert(To.Sym); - } + const IndirectCallProfile &ICP) + : From(BF.getSymbol()), + To(ICP.Offset), + Mispreds(ICP.Mispreds), + Branches(ICP.Count) { + if (ICP.IsFunction) { + To.Sym = BF.getBinaryContext().getGlobalSymbolByName(ICP.Name); } } @@ -192,20 +184,18 @@ IndirectCallPromotion::getCallTargets( Entry == BF.getFunctionColdEndLabel()) continue; const Location To(Entry); - Callsite CS{ - From, To, JI->Mispreds, JI->Count, BranchHistories(), - I - Range.first}; - Targets.emplace_back(CS); + Targets.emplace_back( + From, To, JI->Mispreds, JI->Count, I - Range.first); } // Sort by symbol then addr. 
std::sort(Targets.begin(), Targets.end(), [](const Callsite &A, const Callsite &B) { - if (A.To.IsSymbol && B.To.IsSymbol) + if (A.To.Sym && B.To.Sym) return A.To.Sym < B.To.Sym; - else if (A.To.IsSymbol && !B.To.IsSymbol) + else if (A.To.Sym && !B.To.Sym) return true; - else if (!A.To.IsSymbol && B.To.IsSymbol) + else if (!A.To.Sym && B.To.Sym) return false; else return A.To.Addr < B.To.Addr; @@ -221,7 +211,7 @@ IndirectCallPromotion::getCallTargets( while (++First != Last) { auto &A = *Result; const auto &B = *First; - if (A.To.IsSymbol && B.To.IsSymbol && A.To.Sym == B.To.Sym) { + if (A.To.Sym && B.To.Sym && A.To.Sym == B.To.Sym) { A.JTIndex.insert(A.JTIndex.end(), B.JTIndex.begin(), B.JTIndex.end()); } else { *(++Result) = *First; @@ -241,13 +231,13 @@ IndirectCallPromotion::getCallTargets( Inst.getOperand(0).getReg() == BC.MRI->getProgramCounter()) { return Targets; } - const auto *BranchData = BF.getBranchData(); - assert(BranchData && "expected initialized branch data"); - auto Offset = BC.MIA->getAnnotationAs(Inst, "Offset"); - for (const auto &BI : BranchData->getBranchRange(Offset)) { - Callsite Site(BF, BI); - if (Site.isValid()) { - Targets.emplace_back(std::move(Site)); + auto ICSP = + BC.MIA->tryGetAnnotationAs(Inst, "CallProfile"); + if (ICSP) { + for (const auto &CSP : ICSP.get()) { + Callsite Site(BF, CSP); + if (Site.isValid()) + Targets.emplace_back(std::move(Site)); } } } @@ -262,7 +252,7 @@ IndirectCallPromotion::getCallTargets( auto Last = std::remove_if(Targets.begin(), Targets.end(), [](const Callsite &CS) { - return !CS.To.IsSymbol; + return !CS.To.Sym; }); Targets.erase(Last, Targets.end()); @@ -540,7 +530,7 @@ IndirectCallPromotion::findCallTargetSymbols( for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) { auto &Target = Targets[TgtIdx]; - assert(Target.To.IsSymbol && "All ICP targets must be to known symbols"); + assert(Target.To.Sym && "All ICP targets must be to known symbols"); assert(!Target.JTIndex.empty() && "Jump tables must have 
indices"); for (auto Idx : Target.JTIndex) { SymTargets.push_back(std::make_pair(Target.To.Sym, Idx)); @@ -549,7 +539,7 @@ IndirectCallPromotion::findCallTargetSymbols( } } else { for (size_t I = 0; I < N; ++I) { - assert(Targets[I].To.IsSymbol && + assert(Targets[I].To.Sym && "All ICP targets must be to known symbols"); assert(Targets[I].JTIndex.empty() && "Can't have jump table indices for non-jump tables"); @@ -725,7 +715,7 @@ IndirectCallPromotion::rewriteCall( auto TBB = Function.createBasicBlock(0, Sym); for (auto &Inst : Insts) { // sanitize new instructions. if (BC.MIA->isCall(Inst)) - BC.MIA->removeAnnotation(Inst, "Offset"); + BC.MIA->removeAnnotation(Inst, "CallProfile"); } TBB->addInstructions(Insts.begin(), Insts.end()); NewBBs.emplace_back(std::move(TBB)); @@ -822,7 +812,7 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( std::vector SymTargets; for (size_t I = 0; I < Targets.size(); ++I) { - assert(Targets[I].To.IsSymbol); + assert(Targets[I].To.Sym); if (Targets[I].JTIndex.empty()) SymTargets.push_back(Targets[I].To.Sym); else { @@ -1089,7 +1079,7 @@ IndirectCallPromotion::printCallsiteInfo(const BinaryBasicBlock *BB, const auto Frequency = 100.0 * Targets[I].Branches / NumCalls; const auto MisFrequency = 100.0 * Targets[I].Mispreds / NumCalls; outs() << "BOLT-INFO: "; - if (Targets[I].To.IsSymbol) + if (Targets[I].To.Sym) outs() << Targets[I].To.Sym->getName(); else outs() << Targets[I].To.Addr; @@ -1188,7 +1178,7 @@ void IndirectCallPromotion::runOnFunctions( if (!Function.isSimple() || !opts::shouldProcess(Function) || - !Function.getBranchData()) + !Function.hasProfile()) continue; const bool HasLayout = !Function.layout_empty(); @@ -1199,12 +1189,13 @@ void IndirectCallPromotion::runOnFunctions( for (auto &Inst : BB) { const bool IsJumpTable = Function.getJumpTable(Inst); - const bool HasBranchData = BC.MIA->hasAnnotation(Inst, "Offset"); + const bool HasIndirectCallProfile = + BC.MIA->hasAnnotation(Inst, "CallProfile"); const bool 
IsDirectCall = (BC.MIA->isCall(Inst) && BC.MIA->getTargetSymbol(Inst, 0)); if (!IsDirectCall && - ((HasBranchData && !IsJumpTable && OptimizeCalls) || + ((HasIndirectCallProfile && !IsJumpTable && OptimizeCalls) || (IsJumpTable && OptimizeJumpTables))) { uint64_t NumCalls = 0; for (const auto &BInfo : getCallTargets(Function, Inst)) { @@ -1233,8 +1224,8 @@ void IndirectCallPromotion::runOnFunctions( ++Num; } outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls - << ", " << Num << " callsites cover " << opts::ICPTopCallsites << "% " - << "of all indirect calls\n"; + << ", " << Num << " callsites cover " << opts::ICPTopCallsites + << "% of all indirect calls\n"; // Mark sites to optimize with "DoICP" annotation. for (size_t I = 0; I < Num; ++I) { @@ -1249,8 +1240,7 @@ void IndirectCallPromotion::runOnFunctions( if (!Function.isSimple() || !opts::shouldProcess(Function)) continue; - const auto *BranchData = Function.getBranchData(); - if (!BranchData) + if (!Function.hasProfile()) continue; const bool HasLayout = !Function.layout_empty(); @@ -1279,15 +1269,15 @@ void IndirectCallPromotion::runOnFunctions( auto &Inst = BB->getInstructionAtIndex(Idx); const auto InstIdx = &Inst - &(*BB->begin()); const bool IsTailCall = BC.MIA->isTailCall(Inst); - const bool HasBranchData = Function.getBranchData() && - BC.MIA->hasAnnotation(Inst, "Offset"); + const bool HasIndirectCallProfile = + BC.MIA->hasAnnotation(Inst, "CallProfile"); const bool IsJumpTable = Function.getJumpTable(Inst); if (BC.MIA->isCall(Inst)) { TotalCalls += BB->getKnownExecutionCount(); } - if (!((HasBranchData && !IsJumpTable && OptimizeCalls) || + if (!((HasIndirectCallProfile && !IsJumpTable && OptimizeCalls) || (IsJumpTable && OptimizeJumpTables))) continue; @@ -1458,7 +1448,7 @@ void IndirectCallPromotion::runOnFunctions( TotalIndirectJmps += FuncTotalIndirectJmps; } - outs() << "BOLT-INFO: ICP total indirect callsites = " + outs() << "BOLT-INFO: ICP total indirect callsites with 
profile = " << TotalIndirectCallsites << "\n" << "BOLT-INFO: ICP total jump table callsites = " @@ -1475,7 +1465,8 @@ void IndirectCallPromotion::runOnFunctions( << format("%.1f", (100.0 * TotalNumFrequentCalls) / std::max(TotalIndirectCalls, 1ul)) << "%\n" - << "BOLT-INFO: ICP percentage of indirect calls that are optimized = " + << "BOLT-INFO: ICP percentage of indirect callsites that are " + "optimized = " << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) / std::max(TotalIndirectCallsites, 1ul)) << "%\n" diff --git a/bolt/Passes/IndirectCallPromotion.h b/bolt/Passes/IndirectCallPromotion.h index e7b4cdc285e9..366fd1e23307 100644 --- a/bolt/Passes/IndirectCallPromotion.h +++ b/bolt/Passes/IndirectCallPromotion.h @@ -22,7 +22,7 @@ namespace bolt { /// Optimize indirect calls. /// The indirect call promotion pass visits each indirect call and -/// examines the BranchData for each. If the most frequent targets +/// examines a branch profile for each. If the most frequent targets /// from that callsite exceed the specified threshold (default 90%), /// the call is promoted. Otherwise, it is ignored. By default, /// only one target is considered at each callsite. 
@@ -103,14 +103,13 @@ class IndirectCallPromotion : public BinaryFunctionPass { using JumpTableInfoType = std::vector>; using SymTargetsType = std::vector>; struct Location { - bool IsSymbol{false}; MCSymbol *Sym{nullptr}; uint64_t Addr{0}; bool isValid() const { - return (IsSymbol && Sym) || (!IsSymbol && Addr != 0); + return Sym || (!Sym && Addr != 0); } Location() { } - explicit Location(MCSymbol *Sym) : IsSymbol(true), Sym(Sym) { } + explicit Location(MCSymbol *Sym) : Sym(Sym) { } explicit Location(uint64_t Addr) : Addr(Addr) { } }; @@ -119,18 +118,17 @@ class IndirectCallPromotion : public BinaryFunctionPass { Location To; uint64_t Mispreds{0}; uint64_t Branches{0}; - BranchHistories Histories; // Indices in the jmp table (jt only) std::vector JTIndex; bool isValid() const { return From.isValid() && To.isValid(); } - Callsite(BinaryFunction &BF, const BranchInfo &BI); + Callsite(BinaryFunction &BF, const IndirectCallProfile &ICP); Callsite(const Location &From, const Location &To, uint64_t Mispreds, uint64_t Branches, - const BranchHistories &Histories, uint64_t JTIndex) + uint64_t JTIndex) : From(From), To(To), Mispreds(Mispreds), Branches(Branches), - Histories(Histories), JTIndex(1, JTIndex) { } + JTIndex(1, JTIndex) { } }; std::unordered_set Modified; diff --git a/bolt/Passes/PLTCall.cpp b/bolt/Passes/PLTCall.cpp index e530dba77137..78eba87dc1e3 100644 --- a/bolt/Passes/PLTCall.cpp +++ b/bolt/Passes/PLTCall.cpp @@ -85,7 +85,7 @@ void PLTCall::runOnFunctions( if (NumCallsOptimized) { BC.RequiresZNow = true; outs() << "BOLT-INFO: " << NumCallsOptimized - << " PLT calls in the binary were opitmized.\n"; + << " PLT calls in the binary were optimized.\n"; } } diff --git a/bolt/ProfileReader.cpp b/bolt/ProfileReader.cpp new file mode 100644 index 000000000000..4f09ab900cb7 --- /dev/null +++ b/bolt/ProfileReader.cpp @@ -0,0 +1,265 @@ +//===-- ProfileReader.cpp - BOLT profile de-serializer ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// 
This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" +#include "ProfileReader.h" +#include "ProfileYAMLMapping.h" +#include "llvm/Support/CommandLine.h" + +namespace opts { +extern llvm::cl::opt Verbosity; +} + +namespace llvm { +namespace bolt { + +void +ProfileReader::buildNameMaps(std::map &Functions) { + for (auto &YamlBF : YamlBFs) { + StringRef Name = YamlBF.Name; + const auto Pos = Name.find("(*"); + if (Pos != StringRef::npos) + Name = Name.substr(0, Pos); + ProfileNameToProfile[Name] = &YamlBF; + if (const auto CommonName = getLTOCommonName(Name)) { + LTOCommonNameMap[*CommonName].push_back(&YamlBF); + } + } + for (auto &BFI : Functions) { + const auto &Function = BFI.second; + for (auto &Name : Function.getNames()) { + if (const auto CommonName = getLTOCommonName(Name)) { + LTOCommonNameFunctionMap[*CommonName].insert(&Function); + } + } + } +} + +bool +ProfileReader::parseFunctionProfile(BinaryFunction &BF, + const yaml::bolt::BinaryFunctionProfile &YamlBF) { + auto &BC = BF.getBinaryContext(); + + bool ProfileMatched = true; + uint64_t MismatchedBlocks = 0; + uint64_t MismatchedCalls = 0; + uint64_t MismatchedEdges = 0; + + BF.setExecutionCount(YamlBF.ExecCount); + + if (YamlBF.Hash != BF.hash(true, true)) { + if (opts::Verbosity >= 1) + errs() << "BOLT-WARNING: hash mismatch\n"; + ProfileMatched = false; + } + + if (YamlBF.NumBasicBlocks != BF.size()) { + if (opts::Verbosity >= 1) + errs() << "BOLT-WARNING: number of basic blocks mismatch\n"; + ProfileMatched = false; + } + + auto DFSOrder = BF.dfs(); + + for (const auto &YamlBB : YamlBF.Blocks) { + if (YamlBB.Index >= DFSOrder.size()) { + if (opts::Verbosity >= 2) + errs() << "BOLT-WARNING: index " << YamlBB.Index 
+ << " is out of bounds\n"; + ++MismatchedBlocks; + continue; + } + + auto &BB = *DFSOrder[YamlBB.Index]; + BB.setExecutionCount(YamlBB.ExecCount); + + for (const auto &YamlCSI: YamlBB.CallSites) { + auto *Callee = YamlCSI.DestId < YamlProfileToFunction.size() ? + YamlProfileToFunction[YamlCSI.DestId] : nullptr; + bool IsFunction = Callee ? true : false; + const MCSymbol *CalleeSymbol = nullptr; + if (IsFunction) { + CalleeSymbol = Callee->getSymbolForEntry(YamlCSI.EntryDiscriminator); + } + StringRef Name = CalleeSymbol ? CalleeSymbol->getName() : ""; + BF.getAllCallSites().emplace_back( + IsFunction, Name, YamlCSI.Count, YamlCSI.Mispreds, YamlCSI.Offset); + + if (YamlCSI.Offset >= BB.getOriginalSize()) { + if (opts::Verbosity >= 2) + errs() << "BOLT-WARNING: offset " << YamlCSI.Offset + << " out of bounds in block " << BB.getName() << '\n'; + ++MismatchedCalls; + continue; + } + + auto *Instr = + BF.getInstructionAtOffset(BB.getInputOffset() + YamlCSI.Offset); + if (!Instr) { + if (opts::Verbosity >= 2) + errs() << "BOLT-WARNING: no instruction at offset " << YamlCSI.Offset + << " in block " << BB.getName() << '\n'; + ++MismatchedCalls; + continue; + } + if (!BC.MIA->isCall(*Instr) && !BC.MIA->isIndirectBranch(*Instr)) { + if (opts::Verbosity >= 2) + errs() << "BOLT-WARNING: expected call at offset " << YamlCSI.Offset + << " in block " << BB.getName() << '\n'; + ++MismatchedCalls; + continue; + } + + auto setAnnotation = [&](StringRef Name, uint64_t Count) { + if (BC.MIA->hasAnnotation(*Instr, Name)) { + if (opts::Verbosity >= 1) + errs() << "BOLT-WARNING: ignoring duplicate " << Name + << " info for offset 0x" << Twine::utohexstr(YamlCSI.Offset) + << " in function " << BF << '\n'; + return; + } + BC.MIA->addAnnotation(BC.Ctx.get(), *Instr, Name, Count); + }; + + if (BC.MIA->isIndirectCall(*Instr) || BC.MIA->isIndirectBranch(*Instr)) { + IndirectCallSiteProfile &CSP = + BC.MIA->getOrCreateAnnotationAs(BC.Ctx.get(), + *Instr, "CallProfile"); + 
CSP.emplace_back(IsFunction, Name, YamlCSI.Count, YamlCSI.Mispreds); + } else if (BC.MIA->getConditionalTailCall(*Instr)) { + setAnnotation("CTCTakenCount", YamlCSI.Count); + setAnnotation("CTCMispredCount", YamlCSI.Mispreds); + } else { + setAnnotation("Count", YamlCSI.Count); + } + } + + for (const auto &YamlSI : YamlBB.Successors) { + if (YamlSI.Index >= DFSOrder.size()) { + if (opts::Verbosity >= 1) + errs() << "BOLT-WARNING: index out of bounds for profiled block\n"; + ++MismatchedEdges; + continue; + } + + auto &SuccessorBB = *DFSOrder[YamlSI.Index]; + if (!BB.getSuccessor(SuccessorBB.getLabel())) { + if (opts::Verbosity >= 1) + errs() << "BOLT-WARNING: no successor for block " << BB.getName() + << " that matches index " << YamlSI.Index << " or block " + << SuccessorBB.getName() << '\n'; + ++MismatchedEdges; + continue; + } + + BB.setSuccessorBranchInfo(SuccessorBB, YamlSI.Count, YamlSI.Mispreds); + } + } + + ProfileMatched &= !MismatchedBlocks && !MismatchedCalls && !MismatchedEdges; + + if (ProfileMatched) + BF.markProfiled(); + + if (!ProfileMatched && opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: " << MismatchedBlocks << " blocks, " + << MismatchedCalls << " calls, and " << MismatchedEdges + << " edges in profile did not match function " << BF << '\n'; + } + + return ProfileMatched; +} + +std::error_code +ProfileReader::readProfile(const std::string &FileName, + std::map &Functions) { + auto MB = MemoryBuffer::getFileOrSTDIN(FileName); + if (std::error_code EC = MB.getError()) { + errs() << "ERROR: cannot open " << FileName << ": " << EC.message() << "\n"; + return EC; + } + + yaml::Input YamlInput(MB.get()->getBuffer()); + YamlInput >> YamlBFs; + if (YamlInput.error()) { + errs() << "BOLT-ERROR: syntax error parsing " << FileName << " : " + << YamlInput.error().message() << '\n'; + return YamlInput.error(); + } + + buildNameMaps(Functions); + + YamlProfileToFunction.resize(YamlBFs.size() + 1); + for (auto &BFI : Functions) { + auto &Function = 
BFI.second; + auto Hash = Function.hash(true, true); + for (auto &FunctionName : Function.getNames()) { + const auto CommonName = getLTOCommonName(FunctionName); + if (CommonName) { + auto I = LTOCommonNameMap.find(*CommonName); + if (I == LTOCommonNameMap.end()) + continue; + + bool ProfileMatched{false}; + auto <OProfiles = I->getValue(); + for (auto *YamlBF : LTOProfiles) { + if (YamlBF->Used) + continue; + if (YamlBF->Hash == Hash) { + matchProfileToFunction(*YamlBF, Function); + break; + } + } + if (ProfileMatched) + break; + + // If there's only one function with a given name, try to + // match it partially. + if (LTOProfiles.size() == 1 && + LTOCommonNameFunctionMap[*CommonName].size() == 1 && + !LTOProfiles.front()->Used) { + matchProfileToFunction(*LTOProfiles.front(), Function); + break; + } + } else { + auto PI = ProfileNameToProfile.find(FunctionName); + if (PI == ProfileNameToProfile.end()) + continue; + + auto &YamlBF = *PI->getValue(); + matchProfileToFunction(YamlBF, Function); + break; + } + } + } + for (auto &YamlBF : YamlBFs) { + if (!YamlBF.Used) { + errs() << "BOLT-WARNING: profile ignored for function " + << YamlBF.Name << '\n'; + } + } + + for (auto &YamlBF : YamlBFs) { + if (YamlBF.Id >= YamlProfileToFunction.size()) { + // Such profile was ignored. + continue; + } + if (auto *BF = YamlProfileToFunction[YamlBF.Id]) { + parseFunctionProfile(*BF, YamlBF); + } + } + + return YamlInput.error(); +} + +} // end namespace bolt +} // end namespace llvm diff --git a/bolt/ProfileReader.h b/bolt/ProfileReader.h new file mode 100644 index 000000000000..1312ab6f3473 --- /dev/null +++ b/bolt/ProfileReader.h @@ -0,0 +1,68 @@ +//===-- ProfileReader.h - BOLT profile deserializer -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PROFILEREADER_H +#define LLVM_TOOLS_LLVM_BOLT_PROFILEREADER_H + +#include "BinaryFunction.h" +#include "ProfileYAMLMapping.h" +#include + +namespace llvm { +namespace bolt { + +class ProfileReader { + /// Number of function profiles that were unused by the reader. + uint64_t NumUnusedProfiles{0}; + + /// Map a function ID from a profile to a BinaryFunction object. + std::vector YamlProfileToFunction; + + void reportError(StringRef Message); + + bool parseFunctionProfile(BinaryFunction &Function, + const yaml::bolt::BinaryFunctionProfile &YamlBF); + + /// Profile for binary functions. + std::vector YamlBFs; + + /// For LTO symbol resolution. + /// Map a common LTO prefix to a list of profiles matching the prefix. + StringMap> LTOCommonNameMap; + + /// Map a common LTO prefix to a set of binary functions. + StringMap> + LTOCommonNameFunctionMap; + + StringMap ProfileNameToProfile; + + void buildNameMaps(std::map &Functions); + + /// Update matched YAML -> BinaryFunction pair. + void matchProfileToFunction(yaml::bolt::BinaryFunctionProfile &YamlBF, + BinaryFunction &BF) { + if (YamlBF.Id >= YamlProfileToFunction.size()) + YamlProfileToFunction.resize(YamlBF.Id + 1); + YamlProfileToFunction[YamlBF.Id] = &BF; + YamlBF.Used = true; + } + +public: + /// Read profile from a file and associate with a set of functions. 
+ std::error_code readProfile(const std::string &FileName, + std::map &Functions); + +}; + +} +} + +#endif diff --git a/bolt/ProfileWriter.cpp b/bolt/ProfileWriter.cpp new file mode 100644 index 000000000000..21883e7074fc --- /dev/null +++ b/bolt/ProfileWriter.cpp @@ -0,0 +1,174 @@ +//===-- ProfileWriter.cpp - Serialize profiling data ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" +#include "ProfileWriter.h" +#include "ProfileYAMLMapping.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/raw_ostream.h" + +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt-prof" + +namespace llvm { +namespace bolt { + +std::error_code +ProfileWriter::writeProfile(std::map &Functions) { + std::error_code EC; + OS = make_unique(FileName, EC, sys::fs::F_None); + if (EC) { + errs() << "BOLT-WARNING: " << EC.message() << " : unable to open " + << FileName << " for output.\n"; + return EC; + } + + printBinaryFunctionsProfile(Functions); + + return std::error_code(); +} + +namespace { +void +convert(const BinaryFunction &BF, yaml::bolt::BinaryFunctionProfile &YamlBF) { + auto &BC = BF.getBinaryContext(); + + YamlBF.Name = BF.getPrintName(); + YamlBF.Id = BF.getFunctionNumber(); + YamlBF.Hash = BF.hash(true, true); + YamlBF.ExecCount = BF.getKnownExecutionCount(); + YamlBF.NumBasicBlocks = BF.size(); + + for (const auto *BB : BF.dfs()) { + yaml::bolt::BinaryBasicBlockProfile YamlBB; + YamlBB.Index = BB->getLayoutIndex(); + YamlBB.NumInstructions = BB->getNumNonPseudos(); + YamlBB.ExecCount = 
BB->getKnownExecutionCount(); + + for (const auto &Instr : *BB) { + if (!BC.MIA->isCall(Instr) && !BC.MIA->isIndirectBranch(Instr)) + continue; + + yaml::bolt::CallSiteInfo CSI; + auto Offset = BC.MIA->tryGetAnnotationAs(Instr, "Offset"); + if (!Offset || Offset.get() < BB->getInputOffset()) + continue; + CSI.Offset = Offset.get() - BB->getInputOffset(); + + if (BC.MIA->isIndirectCall(Instr) || BC.MIA->isIndirectBranch(Instr)) { + auto ICSP = + BC.MIA->tryGetAnnotationAs(Instr, + "CallProfile"); + if (!ICSP) + continue; + for (auto &CSP : ICSP.get()) { + CSI.DestId = 0; // designated for unknown functions + CSI.EntryDiscriminator = 0; + if (CSP.IsFunction) { + const auto *CalleeSymbol = BC.getGlobalSymbolByName(CSP.Name); + if (CalleeSymbol) { + const auto *Callee = BC.getFunctionForSymbol(CalleeSymbol); + if (Callee) { + CSI.DestId = Callee->getFunctionNumber(); + } + } + } + CSI.Count = CSP.Count; + CSI.Mispreds = CSP.Mispreds; + YamlBB.CallSites.push_back(CSI); + } + } else { // direct call or a tail call + const auto *CalleeSymbol = BC.MIA->getTargetSymbol(Instr); + const auto Callee = BC.getFunctionForSymbol(CalleeSymbol); + if (Callee) { + CSI.DestId = Callee->getFunctionNumber(); + CSI.EntryDiscriminator = Callee->getEntryForSymbol(CalleeSymbol); + } + + if (BC.MIA->getConditionalTailCall(Instr)) { + auto CTCCount = + BC.MIA->tryGetAnnotationAs(Instr, "CTCTakenCount"); + if (CTCCount) { + CSI.Count = *CTCCount; + auto CTCMispreds = + BC.MIA->tryGetAnnotationAs(Instr, "CTCMispredCount"); + if (CTCMispreds) + CSI.Mispreds = *CTCMispreds; + } + } else { + auto Count = BC.MIA->tryGetAnnotationAs(Instr, "Count"); + if (Count) + CSI.Count = *Count; + } + + if (CSI.Count) + YamlBB.CallSites.emplace_back(CSI); + } + } + + // Skip printing if there's no profile data for non-entry basic block. 
+ if (YamlBB.CallSites.empty() && !BB->isEntryPoint()) { + uint64_t SuccessorExecCount = 0; + for (auto &BranchInfo : BB->branch_info()) { + SuccessorExecCount += BranchInfo.Count; + } + if (!SuccessorExecCount) + continue; + } + + auto BranchInfo = BB->branch_info_begin(); + for (const auto *Successor : BB->successors()) { + yaml::bolt::SuccessorInfo YamlSI; + YamlSI.Index = Successor->getLayoutIndex(); + YamlSI.Count = BranchInfo->Count; + YamlSI.Mispreds = BranchInfo->MispredictedCount; + + YamlBB.Successors.emplace_back(YamlSI); + + ++BranchInfo; + } + + YamlBF.Blocks.emplace_back(YamlBB); + } +} +} // end anonymous namespace + +void ProfileWriter::printBinaryFunctionProfile(const BinaryFunction &BF) { + yaml::bolt::BinaryFunctionProfile YamlBF; + convert(BF, YamlBF); + + yaml::Output Out(*OS); + Out << YamlBF; +} + +void ProfileWriter::printBinaryFunctionsProfile( + std::map &BFs) { + std::vector YamlBFs; + for (auto &BFI : BFs) { + const auto &BF = BFI.second; + if (BF.hasProfile()) { + yaml::bolt::BinaryFunctionProfile YamlBF; + convert(BF, YamlBF); + YamlBFs.emplace_back(YamlBF); + } + } + + yaml::Output Out(*OS); + Out << YamlBFs; +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/ProfileWriter.h b/bolt/ProfileWriter.h new file mode 100644 index 000000000000..dfbbc9ad7e30 --- /dev/null +++ b/bolt/ProfileWriter.h @@ -0,0 +1,53 @@ +//===-- ProfileWriter.cpp - serialize profiling data ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_TOOLS_LLVM_BOLT_PROFILE_WRITER_H +#define LLVM_TOOLS_LLVM_BOLT_PROFILE_WRITER_H + +#include "BinaryBasicBlock.h" +#include "BinaryContext.h" +#include "BinaryFunction.h" +#include "ProfileYAMLMapping.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/raw_ostream.h" +#include + +namespace llvm { +namespace bolt { + +class ProfileWriter { + ProfileWriter() = delete; + + std::string FileName; + + std::error_code write(BinaryFunction &BF); + + std::unique_ptr OS; + + void printBinaryFunctionProfile(const BinaryFunction &BF); + + void printBinaryFunctionsProfile(std::map &BFs); + +public: + explicit ProfileWriter(const std::string &FileName) + : FileName(FileName) { + } + + /// Write profile for functions. + std::error_code writeProfile(std::map &Functions); +}; + +} // namespace bolt +} // namespace llvm + +#endif // LLVM_TOOLS_LLVM_BOLT_PROFILE_WRITER_H diff --git a/bolt/ProfileYAMLMapping.h b/bolt/ProfileYAMLMapping.h new file mode 100644 index 000000000000..85845ebc012e --- /dev/null +++ b/bolt/ProfileYAMLMapping.h @@ -0,0 +1,147 @@ +//===-- ProfileYAMLMapping.h - mappings for BOLT profile --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implement mapping between binary function profile and YAML representation. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PROFILEYAMLMAPPING_H +#define LLVM_TOOLS_LLVM_BOLT_PROFILEYAMLMAPPING_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/YAMLTraits.h" +#include + +namespace llvm { +namespace yaml { + +namespace bolt { +struct CallSiteInfo { + llvm::yaml::Hex32 Offset{0}; + uint32_t DestId{0}; + uint32_t EntryDiscriminator{0}; // multiple entry discriminator + uint64_t Count{0}; + uint64_t Mispreds{0}; + + bool operator==(const CallSiteInfo &Other) const { + return Offset == Other.Offset && + DestId == Other.DestId && + EntryDiscriminator == Other.EntryDiscriminator; + } +}; +} + +template <> struct MappingTraits { + static void mapping(IO &YamlIO, bolt::CallSiteInfo &CSI) { + YamlIO.mapRequired("off", CSI.Offset); + YamlIO.mapRequired("fid", CSI.DestId); + YamlIO.mapOptional("disc", CSI.EntryDiscriminator, (uint32_t)0); + YamlIO.mapRequired("cnt", CSI.Count); + YamlIO.mapOptional("mis", CSI.Mispreds, (uint64_t)0); + } + + static const bool flow = true; +}; + +namespace bolt { +struct SuccessorInfo { + uint32_t Index{0}; + uint64_t Count{0}; + uint64_t Mispreds{0}; + + bool operator==(const SuccessorInfo &Other) const { + return Index == Other.Index; + } +}; +} + +template <> struct MappingTraits { + static void mapping(IO &YamlIO, bolt::SuccessorInfo &SI) { + YamlIO.mapRequired("bid", SI.Index); + YamlIO.mapRequired("cnt", SI.Count); + YamlIO.mapOptional("mis", SI.Mispreds, (uint64_t)0); + } + + static const bool flow = true; +}; + +} // end namespace yaml +} // end namespace llvm + +LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::yaml::bolt::CallSiteInfo) +LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::yaml::bolt::SuccessorInfo) + +namespace llvm { +namespace yaml { + +namespace bolt { +struct BinaryBasicBlockProfile { + uint32_t Index{0}; + uint32_t NumInstructions{0}; + llvm::yaml::Hex64 Hash{0}; + uint64_t ExecCount{0}; + std::vector CallSites; + std::vector 
Successors; + + bool operator==(const BinaryBasicBlockProfile &Other) const { + return Index == Other.Index; + } +}; +} // namespace bolt + +template <> struct MappingTraits { + static void mapping(IO &YamlIO, bolt::BinaryBasicBlockProfile &BBP) { + YamlIO.mapRequired("bid", BBP.Index); + YamlIO.mapRequired("insns", BBP.NumInstructions); + YamlIO.mapOptional("exec", BBP.ExecCount, (uint64_t)0); + YamlIO.mapOptional("calls", BBP.CallSites, + std::vector()); + YamlIO.mapOptional("succ", BBP.Successors, + std::vector()); + } +}; + +} // end namespace yaml +} // end namespace llvm + +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::bolt::BinaryBasicBlockProfile) + +namespace llvm { +namespace yaml { + +namespace bolt { +struct BinaryFunctionProfile { + std::string Name; + uint32_t NumBasicBlocks; + uint32_t Id; + llvm::yaml::Hex64 Hash; + uint64_t ExecCount; + std::vector Blocks; + bool Used{false}; +}; +} + +template <> struct MappingTraits { + static void mapping(IO &YamlIO, bolt::BinaryFunctionProfile &BFP) { + YamlIO.mapRequired("name", BFP.Name); + YamlIO.mapRequired("fid", BFP.Id); + YamlIO.mapRequired("hash", BFP.Hash); + YamlIO.mapRequired("exec", BFP.ExecCount); + YamlIO.mapRequired("nblocks", BFP.NumBasicBlocks); + YamlIO.mapOptional("blocks", BFP.Blocks, + std::vector()); + } +}; + +} // end namespace yaml +} // end namespace llvm + +LLVM_YAML_IS_DOCUMENT_LIST_VECTOR(llvm::yaml::bolt::BinaryFunctionProfile) + +#endif diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 518b9e9df5c7..ad3cc0253460 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -18,6 +18,8 @@ #include "DataAggregator.h" #include "DataReader.h" #include "Exceptions.h" +#include "ProfileReader.h" +#include "ProfileWriter.h" #include "RewriteInstance.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" @@ -97,6 +99,11 @@ AllowStripped("allow-stripped", cl::Hidden, cl::cat(BoltCategory)); +static cl::opt +BoltProfile("b", + cl::desc(""), + 
cl::cat(BoltCategory)); + cl::opt BoostMacroops("boost-macroops", cl::desc("try to boost macro-op fusions by avoiding the cache-line boundary"), @@ -217,6 +224,11 @@ RelocationMode("relocs", cl::ZeroOrMore, cl::cat(BoltCategory)); +static cl::opt +SaveProfile("w", + cl::desc("save recorded profile to a file"), + cl::cat(BoltOutputCategory)); + static cl::list SkipFunctionNames("skip-funcs", cl::CommaSeparated, @@ -873,7 +885,7 @@ void RewriteInstance::run() { discoverFileObjects(); readDebugInfo(); disassembleFunctions(); - readProfileData(); + processProfileData(); if (opts::AggregateOnly) return; postProcessFunctions(); @@ -1901,39 +1913,56 @@ void RewriteInstance::readDebugInfo() { BC->preprocessDebugInfo(BinaryFunctions); } -void RewriteInstance::readProfileData() { +void RewriteInstance::processProfileData() { if (DA.started()) { NamedRegionTimer T("aggregate data", TimerGroupName, opts::TimeRewrite); DA.aggregate(*BC.get(), BinaryFunctions); + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + Function.convertBranchData(); + } + if (opts::AggregateOnly) { if (std::error_code EC = DA.writeAggregatedFile()) { check_error(EC, "cannot create output data file"); } } - return; - } + } else { + NamedRegionTimer T("read profile data", TimerGroupName, opts::TimeRewrite); + + if (!opts::BoltProfile.empty()) { + ProfileReader PR; + PR.readProfile(opts::BoltProfile, BinaryFunctions); + + return; + } + + // Preliminary match profile data to functions. 
+ if (!BC->DR.getAllFuncsData().empty()) { + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + if (auto *MemData = BC->DR.getFuncMemData(Function.getNames())) { + Function.MemData = MemData; + MemData->Used = true; + } + if (auto *FuncData = BC->DR.getFuncBranchData(Function.getNames())) { + Function.BranchData = FuncData; + Function.ExecutionCount = FuncData->ExecutionCount; + FuncData->Used = true; + } + } + } - NamedRegionTimer T("read profile data", TimerGroupName, opts::TimeRewrite); - // Preliminary match profile data to functions. - if (!BC->DR.getAllFuncsData().empty()) { for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; - if (auto *MemData = BC->DR.getFuncMemData(Function.getNames())) { - Function.MemData = MemData; - MemData->Used = true; - } - if (auto *FuncData = BC->DR.getFuncBranchData(Function.getNames())) { - Function.BranchData = FuncData; - Function.ExecutionCount = FuncData->ExecutionCount; - FuncData->Used = true; - } + Function.readProfile(); } } - for (auto &BFI : BinaryFunctions) { - auto &Function = BFI.second; - Function.readProfile(); + if (!opts::SaveProfile.empty()) { + ProfileWriter PW(opts::SaveProfile); + PW.writeProfile(BinaryFunctions); } } diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 74c801a27d33..368ae2e6d61e 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -178,8 +178,8 @@ class RewriteInstance { /// Read information from debug sections. void readDebugInfo(); - /// Associate profile data with functions. - void readProfileData(); + /// Associate profile data with binary objects. 
+ void processProfileData(); /// Disassemble each function in the binary and associate it with a /// BinaryFunction object, preparing all information necessary for binary From 61262bb735696bce1d89eeccdb2f6511ddd01b26 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 22 Nov 2017 16:17:36 -0800 Subject: [PATCH 366/904] [BOLT-AArch64] Support large test binary Summary: Rewrite how data/code markers are interpreted, so the code can have constant islands essentially anywhere. This is necessary to accommodate custom AArch64 assembly code coming from mozjpeg. Allow any function to refer to the constant island owned by any other function. When this happens, we pull the constant island from the referred function and emit it as our own, so it will live nearby the code that refers to it, allowing us to freely reorder functions and code pieces. Make bolt more strict about not changing anything in non-simple ARM functions, as we need to preserve offsets for those functions whose jump tables we don't interpret (currently any function with jump tables in ARM is non-simple and is left untouched). 
(cherry picked from commit 9e10fdc3eeb1efb0adeaff12eaf38f2e932b13c8) --- bolt/BinaryContext.cpp | 16 +++++ bolt/BinaryContext.h | 3 + bolt/BinaryFunction.cpp | 116 ++++++++++++++++++++++++++++------- bolt/BinaryFunction.h | 80 +++++++++++++++++++++++- bolt/Passes/BinaryPasses.cpp | 3 + bolt/Passes/LongJmp.cpp | 17 +++-- bolt/RewriteInstance.cpp | 88 ++++++++++++++------------ 7 files changed, 256 insertions(+), 67 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 6be9c1cca959..33bb79ed2631 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -566,10 +566,14 @@ size_t Relocation::getSizeForType(uint64_t Type) { case ELF::R_AARCH64_LDST8_ABS_LO12_NC: case ELF::R_AARCH64_ADR_GOT_PAGE: case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: + case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: case ELF::R_AARCH64_LD64_GOT_LO12_NC: case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: case ELF::R_AARCH64_TLSDESC_CALL: + case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: case ELF::R_AARCH64_JUMP26: case ELF::R_AARCH64_PREL32: return 4; @@ -597,6 +601,7 @@ uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents, return static_cast(PC) + SignExtend64<28>(Contents << 2); case ELF::R_AARCH64_ADR_GOT_PAGE: case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: + case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: case ELF::R_AARCH64_ADR_PREL_PG_HI21: { // Bits 32:12 of Symbol address goes in bits 30:29 + 23:5 of ADRP // instruction @@ -608,6 +613,7 @@ uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents, Contents &= ~0xfffUll; return Contents; } + case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: case ELF::R_AARCH64_LD64_GOT_LO12_NC: case ELF::R_AARCH64_LDST64_ABS_LO12_NC: { @@ -616,6 +622,8 @@ uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents, Contents &= 
~0xffffffffffc003ffU; return Contents >> (10 - 3); } + case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: case ELF::R_AARCH64_ADD_ABS_LO12_NC: { // Immediate goes in bits 21:10 of ADD instruction @@ -654,7 +662,11 @@ bool Relocation::isGOT(uint64_t Type) { default: return false; case ELF::R_AARCH64_ADR_GOT_PAGE: + case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: case ELF::R_AARCH64_LD64_GOT_LO12_NC: + case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: @@ -679,6 +691,9 @@ bool Relocation::isPCRelative(uint64_t Type) { case ELF::R_AARCH64_LDST32_ABS_LO12_NC: case ELF::R_AARCH64_LDST16_ABS_LO12_NC: case ELF::R_AARCH64_LDST8_ABS_LO12_NC: + case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: case ELF::R_AARCH64_LD64_GOT_LO12_NC: case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: @@ -695,6 +710,7 @@ bool Relocation::isPCRelative(uint64_t Type) { case ELF::R_AARCH64_CALL26: case ELF::R_AARCH64_ADR_PREL_PG_HI21: case ELF::R_AARCH64_ADR_GOT_PAGE: + case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: case ELF::R_AARCH64_JUMP26: case ELF::R_AARCH64_PREL32: diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 37266ec6f6fe..370d209e49ef 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -115,6 +115,9 @@ class BinaryContext { std::unordered_map SymbolToFunctionMap; + /// Map address to a constant island owner (constant data in code section) + std::map AddressToConstantIslandMap; + /// Map virtual address to a section. 
std::map AllocatableSections; diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 3820f7713f4f..caedd2a5549b 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -909,13 +909,29 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { } } - if (BC.TheTriple->getArch() == llvm::Triple::aarch64 && - isInConstantIsland(TargetAddress)) { - TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, "ISLANDat"); - IslandSymbols[TargetAddress - getAddress()] = TargetSymbol; - if (!ColdIslandSymbols.count(TargetSymbol)) { - ColdIslandSymbols[TargetSymbol] = - Ctx->getOrCreateSymbol(TargetSymbol->getName() + ".cold"); + if (BC.TheTriple->getArch() == llvm::Triple::aarch64) { + // Check if this is an access to a constant island and create bookkeeping + // to keep track of it and emit it later as part of this function + if (MCSymbol *IslandSym = getOrCreateIslandAccess(TargetAddress).first) { + TargetSymbol = IslandSym; + } else { + // Detect custom code written in assembly that refers to arbitrary + // constant islands from other functions. Write this reference so we + // can pull this constant island and emit it as part of this function + // too. + auto IslandIter = + BC.AddressToConstantIslandMap.lower_bound(TargetAddress); + if (IslandIter != BC.AddressToConstantIslandMap.end()) { + MCSymbol *IslandSym, *ColdIslandSym; + std::tie(IslandSym, ColdIslandSym) = + IslandIter->second->getOrCreateProxyIslandAccess(TargetAddress, + this); + if (IslandSym) { + TargetSymbol = IslandSym; + addConstantIslandDependency(IslandIter->second, IslandSym, + ColdIslandSym); + } + } } } @@ -1732,6 +1748,10 @@ void BinaryFunction::addEntryPoint(uint64_t Address) { } void BinaryFunction::removeConditionalTailCalls() { + // Don't touch code if non-simple ARM + if (BC.TheTriple->getArch() == llvm::Triple::aarch64 && !isSimple()) + return; + // Blocks to be appended at the end. 
std::vector> NewBlocks; @@ -1781,6 +1801,8 @@ void BinaryFunction::removeConditionalTailCalls() { // Add execution count for the block. TailCallBB->setExecutionCount(CTCTakenCount); + BC.MIA->convertTailCallToJmp(*CTCInstr); + // In attempt to preserve the direction of the original conditional jump, // we will either create an unconditional jump in a separate basic block // at the end of the function, or reverse a condition of the jump @@ -2151,15 +2173,33 @@ void BinaryFunction::emitBodyRaw(MCStreamer *Streamer) { } } -void BinaryFunction::emitConstantIslands(MCStreamer &Streamer, - bool EmitColdPart) { - if (DataOffsets.empty()) +void BinaryFunction::addConstantIslandDependency(BinaryFunction *OtherBF, + MCSymbol *HotSymbol, + MCSymbol *ColdSymbol) { + IslandDependency.insert(OtherBF); + if (!ColdIslandSymbols.count(HotSymbol)) { + ColdIslandSymbols[HotSymbol] = ColdSymbol; + } + DEBUG(dbgs() << "BOLT-DEBUG: Constant island dependency added! " + << getPrintName() << " refers to " << OtherBF->getPrintName() + << "\n"); +} + +void BinaryFunction::emitConstantIslands( + MCStreamer &Streamer, bool EmitColdPart, + BinaryFunction *OnBehalfOf) { + if (DataOffsets.empty() && IslandDependency.empty()) return; - if (!EmitColdPart) - Streamer.EmitLabel(getFunctionConstantIslandLabel()); - else - Streamer.EmitLabel(getFunctionColdConstantIslandLabel()); + if (!OnBehalfOf) { + if (!EmitColdPart) + Streamer.EmitLabel(getFunctionConstantIslandLabel()); + else + Streamer.EmitLabel(getFunctionColdConstantIslandLabel()); + } + + assert((!OnBehalfOf || IslandProxies[OnBehalfOf].size() > 0) && + "spurious OnBehalfOf constant island emission"); // Raw contents of the function. 
StringRef SectionContents; Section.getContents(SectionContents); @@ -2169,7 +2209,7 @@ void BinaryFunction::emitConstantIslands(MCStreamer &Streamer, SectionContents.substr(getAddress() - Section.getAddress(), getMaxSize()); - if (opts::Verbosity) + if (opts::Verbosity && !OnBehalfOf) outs() << "BOLT-INFO: emitting constant island for function " << *this << "\n"; @@ -2209,12 +2249,36 @@ void BinaryFunction::emitConstantIslands(MCStreamer &Streamer, FunctionOffset = NextStop; } if (IS != IslandSymbols.end() && FunctionOffset == IS->first) { - DEBUG(dbgs() << "BOLT-DEBUG: emitted label " << IS->second->getName() - << " at offset 0x" << Twine::utohexstr(IS->first) << '\n'); - if (!EmitColdPart) - Streamer.EmitLabel(IS->second); - else - Streamer.EmitLabel(ColdIslandSymbols[IS->second]); + // This is a slightly complex code to decide which label to emit. We + // have 4 cases to handle: regular symbol, cold symbol, regular or cold + // symbol being emitted on behalf of an external function. + if (!OnBehalfOf) { + if (!EmitColdPart) { + DEBUG(dbgs() << "BOLT-DEBUG: emitted label " + << IS->second->getName() << " at offset 0x" + << Twine::utohexstr(IS->first) << '\n'); + Streamer.EmitLabel(IS->second); + } else { + DEBUG(dbgs() << "BOLT-DEBUG: emitted label " + << ColdIslandSymbols[IS->second]->getName() << '\n'); + Streamer.EmitLabel(ColdIslandSymbols[IS->second]); + } + } else { + if (!EmitColdPart) { + if (MCSymbol *Sym = IslandProxies[OnBehalfOf][IS->second]) { + DEBUG(dbgs() << "BOLT-DEBUG: emitted label " << Sym->getName() + << '\n'); + Streamer.EmitLabel(Sym); + } + } else { + if (MCSymbol *Sym = + IslandProxies[OnBehalfOf][ColdIslandSymbols[IS->second]]) { + DEBUG(dbgs() << "BOLT-DEBUG: emitted label " << Sym->getName() + << '\n'); + Streamer.EmitLabel(Sym); + } + } + } ++IS; } if (RI != MoveRelocations.end() && FunctionOffset == RI->first) { @@ -2232,8 +2296,16 @@ void BinaryFunction::emitConstantIslands(MCStreamer &Streamer, 
Streamer.EmitBytes(FunctionContents.slice(FunctionOffset, EndOffset)); } } - assert(IS == IslandSymbols.end() && "some symbols were not emitted!"); + + if (OnBehalfOf) + return; + // Now emit constant islands from other functions that we may have used in + // this function. + for (auto *ExternalFunc : IslandDependency) { + ExternalFunc->emitConstantIslands(Streamer, EmitColdPart, this); + } + } void BinaryFunction::duplicateConstantIslands() { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 5ed98c22da13..52cdfd9f4f17 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -683,6 +683,11 @@ class BinaryFunction { /// after disassembling std::map IslandSymbols; std::map ColdIslandSymbols; + /// Keeps track of other functions we depend on because there is a reference + /// to the constant islands in them. + std::map> + IslandProxies; + std::set IslandDependency; // The other way around // Blocks are kept sorted in the layout order. If we need to change the // layout (if BasicBlocksLayout stores a different order than BasicBlocks), @@ -1228,7 +1233,11 @@ class BinaryFunction { case ELF::R_X86_64_64: case ELF::R_AARCH64_ABS64: case ELF::R_AARCH64_LDST64_ABS_LO12_NC: + case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: + case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: case ELF::R_AARCH64_LD64_GOT_LO12_NC: case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: case ELF::R_AARCH64_ADD_ABS_LO12_NC: @@ -1786,6 +1795,66 @@ class BinaryFunction { return OutputColdDataOffset; } + /// If \p Address represents an access to a constant island managed by this + /// function, return a symbol so code can safely refer to it. Otherwise, + /// return nullptr. 
First return value is the symbol for reference in the + /// hot code area while the second return value is the symbol for reference + /// in the cold code area, as when the function is split the islands are + /// duplicated. + std::pair getOrCreateIslandAccess(uint64_t Address) { + MCSymbol *Symbol, *ColdSymbol; + if (!isInConstantIsland(Address)) + return std::make_pair(nullptr, nullptr); + + // Register our island at global namespace + Symbol = BC.getOrCreateGlobalSymbol(Address, "ISLANDat"); + // Internal bookkeeping + const auto Offset = Address - getAddress(); + assert((!IslandSymbols.count(Offset) || IslandSymbols[Offset] == Symbol) && + "Inconsistent island symbol management"); + if (!IslandSymbols.count(Offset)) { + IslandSymbols[Offset] = Symbol; + } + if (!ColdIslandSymbols.count(Symbol)) { + ColdSymbol = BC.Ctx->getOrCreateSymbol(Symbol->getName() + ".cold"); + ColdIslandSymbols[Symbol] = ColdSymbol; + } + return std::make_pair(Symbol, ColdSymbol); + } + + /// Called by an external function which wishes to emit references to constant + /// island symbols of this function. We create a proxy for it, so we emit + /// separate symbols when emitting our constant island on behalf of this other + /// function. + std::pair + getOrCreateProxyIslandAccess(uint64_t Address, BinaryFunction *Referrer) { + auto HotColdSymbols = getOrCreateIslandAccess(Address); + if (!HotColdSymbols.first) + return HotColdSymbols; + + MCSymbol *ProxyHot, *ProxyCold; + if (!IslandProxies[Referrer].count(HotColdSymbols.first)) { + ProxyHot = + BC.Ctx->getOrCreateSymbol(HotColdSymbols.first->getName() + + ".proxy.for." + Referrer->getPrintName()); + ProxyCold = + BC.Ctx->getOrCreateSymbol(HotColdSymbols.second->getName() + + ".proxy.for." 
+ Referrer->getPrintName()); + IslandProxies[Referrer][HotColdSymbols.first] = ProxyHot; + IslandProxies[Referrer][HotColdSymbols.second] = ProxyCold; + } + ProxyHot = IslandProxies[Referrer][HotColdSymbols.first]; + ProxyCold = IslandProxies[Referrer][HotColdSymbols.second]; + return std::make_pair(ProxyHot, ProxyCold); + } + + /// Make this function depend on \p OtherBF because we have a reference to its + /// constant island. When emitting this function, we will also emit OtherBF's + /// constants. This only happens in custom AArch64 assembly code (either + /// poorly written code or over-optimized). + void addConstantIslandDependency(BinaryFunction *OtherBF, MCSymbol *HotSymbol, + MCSymbol *ColdSymbol); + /// Detects whether \p Address is inside a data region in this function /// (constant islands). bool isInConstantIsland(uint64_t Address) const { @@ -1809,7 +1878,8 @@ class BinaryFunction { return *std::prev(CodeIter) <= *DataIter; } - uint64_t estimateConstantIslandSize() const { + uint64_t + estimateConstantIslandSize(const BinaryFunction *OnBehalfOf = nullptr) const { uint64_t Size = 0; for (auto DataIter = DataOffsets.begin(); DataIter != DataOffsets.end(); ++DataIter) { @@ -1831,6 +1901,11 @@ class BinaryFunction { Size += NextMarker - *DataIter; } + + if (!OnBehalfOf) { + for (auto *ExternalFunc : IslandDependency) + Size += ExternalFunc->estimateConstantIslandSize(this); + } return Size; } @@ -2050,7 +2125,8 @@ class BinaryFunction { void emitBodyRaw(MCStreamer *Streamer); /// Helper for emitBody to write data inside a function (used for AArch64) - void emitConstantIslands(MCStreamer &Streamer, bool EmitColdPart); + void emitConstantIslands(MCStreamer &Streamer, bool EmitColdPart, + BinaryFunction *OnBehalfOf = nullptr); /// Traverse cold basic blocks and replace references to constants in islands /// with a proxy symbol for the duplicated constant island that is going to be diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 
03727de0f6d2..f11dd625dfd3 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -577,6 +577,9 @@ void FixupBranches::runOnFunctions( for (auto &It : BFs) { auto &Function = It.second; if (BC.HasRelocations || shouldOptimize(Function)) { + if (BC.TheTriple->getArch() == llvm::Triple::aarch64 && + !Function.isSimple()) + continue; Function.fixBranches(); } } diff --git a/bolt/Passes/LongJmp.cpp b/bolt/Passes/LongJmp.cpp index d6a60bbced17..7a2beaee55c0 100644 --- a/bolt/Passes/LongJmp.cpp +++ b/bolt/Passes/LongJmp.cpp @@ -164,7 +164,7 @@ void LongJmpPass::insertStubs(const BinaryContext &BC, BinaryFunction &Func) { continue; // Insert stubs close to the patched BB if call, but far away from the - // hot path if a branch, since this branch target is the cold region + // hot path if a branch, since this branch target is the cold region. BinaryBasicBlock *InsertionPoint = &BB; if (!BC.MIA->isCall(Inst) && Frontier && !BB.isCold()) { auto BitsAvail = BC.MIA->getPCRelEncodingSize(Inst) - 2; @@ -172,6 +172,11 @@ void LongJmpPass::insertStubs(const BinaryContext &BC, BinaryFunction &Func) { if (!(Func.getMaxSize() & Mask)) InsertionPoint = Frontier; } + // Always put stubs at the end of the function if non-simple. We can't + // change the layout of non-simple functions because it has jump tables + // that we do not control. 
+ if (!Func.isSimple()) + InsertionPoint = &*std::prev(Func.end()); // Create a stub to handle a far-away target Insertions.emplace_back(std::make_pair( InsertionPoint, replaceTargetWithStub(BC, Func, BB, Inst))); @@ -277,7 +282,7 @@ void LongJmpPass::tentativeLayout( if (!BC.HasRelocations) { for (auto Func : SortedFunctions) { HotAddresses[Func] = Func->getAddress(); - DotAddress = RoundUpToAlignment(DotAddress, 16); + DotAddress = RoundUpToAlignment(DotAddress, ColdFragAlign); ColdAddresses[Func] = DotAddress; if (Func->isSplit()) DotAddress += Func->estimateColdSize(); @@ -474,7 +479,10 @@ void LongJmpPass::runOnFunctions(BinaryContext &BC, BB.markValid(true); } insertStubs(BC, *Func); - Func->fixBranches(); + // Don't ruin non-simple functions, they can't afford to have the layout + // changed. + if (Func->isSimple()) + Func->fixBranches(); } bool Modified; @@ -484,7 +492,8 @@ void LongJmpPass::runOnFunctions(BinaryContext &BC, for (auto Func : Sorted) { if (removeOrShrinkStubs(BC, *Func)) { Func->eraseInvalidBBs(); - Func->fixBranches(); + if (Func->isSimple()) + Func->fixBranches(); Modified = true; } } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index ad3cc0253460..9ec8db675f4b 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -873,9 +873,6 @@ void RewriteInstance::run() { if (opts::RelocationMode != cl::BOU_TRUE) { errs() << "BOLT-WARNING: non-relocation mode for AArch64 is not fully " "supported\n"; - } else if (opts::UseOldText) { - opts::UseOldText = false; - outs() << "BOLT-INFO: disabling -use-old-text for AArch64\n"; } } @@ -1010,8 +1007,22 @@ void RewriteInstance::discoverFileObjects() { return *(A.getAddress()) < *(B.getAddress()); }); + // For aarch64, the ABI defines mapping symbols so we identify data in the + // code section (see IHI0056B). $d identifies data contents. 
+ auto MarkersBegin = SortedFileSymbols.end(); + if (BC->TheTriple->getArch() == llvm::Triple::aarch64) { + MarkersBegin = std::stable_partition( + SortedFileSymbols.begin(), SortedFileSymbols.end(), + [](const SymbolRef &Symbol) { + ErrorOr NameOrError = Symbol.getName(); + return !(Symbol.getType() == SymbolRef::ST_Unknown && + (*NameOrError == "$d" || *NameOrError == "$x")); + }); + } + BinaryFunction *PreviousFunction = nullptr; - for (const auto &Symbol : SortedFileSymbols) { + for (auto ISym = SortedFileSymbols.begin(); ISym != MarkersBegin; ++ISym) { + const auto &Symbol = *ISym; // Keep undefined symbols for pretty printing? if (Symbol.getFlags() & SymbolRef::SF_Undefined) continue; @@ -1031,32 +1042,13 @@ void RewriteInstance::discoverFileObjects() { continue; } - // In aarch, make $x symbols be replaceable by a more meaningful one - // whenever possible - if (BC->TheTriple->getArch() != llvm::Triple::aarch64 || - FileSymRefs.find(Address) == FileSymRefs.end()) { - FileSymRefs[Address] = Symbol; - } else { - if (FileSymRefs[Address].getType() == SymbolRef::ST_Unknown && - *FileSymRefs[Address].getName() == "$x") - FileSymRefs[Address] = Symbol; - else if (Symbol.getType() != SymbolRef::ST_Unknown || - *NameOrError != "$x") - FileSymRefs[Address] = Symbol; - } + FileSymRefs[Address] = Symbol; // There's nothing horribly wrong with anonymous symbols, but let's // ignore them for now. if (NameOrError->empty()) continue; - // For aarch64, the ABI defines mapping symbols so we identify data in the - // code section (see IHI0056B). $d identifies data contents. - if (BC->TheTriple->getArch() == llvm::Triple::aarch64 && - Symbol.getType() == SymbolRef::ST_Unknown && - (*NameOrError == "$d" || *NameOrError == "$x")) - continue; - /// It is possible we are seeing a globalized local. LLVM might treat it as /// a local if it has a "private global" prefix, e.g. ".L". Thus we have to /// change the prefix to enforce global scope of the symbol. 
@@ -1298,6 +1290,31 @@ void RewriteInstance::discoverFileObjects() { // Now that all the functions were created - adjust their boundaries. adjustFunctionBoundaries(); + // Annotate functions with code/data markers in AArch64 + for (auto ISym = MarkersBegin; ISym != SortedFileSymbols.end(); ++ISym) { + const auto &Symbol = *ISym; + ErrorOr AddressOrErr = Symbol.getAddress(); + check_error(AddressOrErr.getError(), "cannot get symbol address"); + auto SymbolSize = ELFSymbolRef(Symbol).getSize(); + uint64_t Address = *AddressOrErr; + auto *BF = getBinaryFunctionContainingAddress(Address, true, true); + if (!BF) { + // Stray marker + continue; + } + const auto EntryOffset = Address - BF->getAddress(); + if (BF->isCodeMarker(Symbol, SymbolSize)) { + BF->markCodeAtOffset(EntryOffset); + continue; + } + if (BF->isDataMarker(Symbol, SymbolSize)) { + BF->markDataAtOffset(EntryOffset); + BC->AddressToConstantIslandMap[Address] = BF; + continue; + } + llvm_unreachable("Unknown marker"); + } + if (!BC->HasRelocations) return; @@ -1406,21 +1423,14 @@ void RewriteInstance::adjustFunctionBoundaries() { // This is potentially another entry point into the function. auto EntryOffset = NextSymRefI->first - Function.getAddress(); - if (Function.isDataMarker(Symbol, SymbolSize)) { - Function.markDataAtOffset(EntryOffset); - } else if (Function.isCodeMarker(Symbol, SymbolSize)) { - Function.markCodeAtOffset(EntryOffset); - } else { - DEBUG(dbgs() << "BOLT-DEBUG: adding entry point to function " - << Function << " at offset 0x" - << Twine::utohexstr(EntryOffset) << '\n'); - Function.addEntryPointAtOffset(EntryOffset); - // In non-relocation mode there's potentially an external undetectable - // reference to the entry point and hence we cannot move this entry - // point. Optimizing without moving could be difficult. 
- if (!BC->HasRelocations) - Function.setSimple(false); - } + DEBUG(dbgs() << "BOLT-DEBUG: adding entry point to function " << Function + << " at offset 0x" << Twine::utohexstr(EntryOffset) << '\n'); + Function.addEntryPointAtOffset(EntryOffset); + // In non-relocation mode there's potentially an external undetectable + // reference to the entry point and hence we cannot move this entry + // point. Optimizing without moving could be difficult. + if (!BC->HasRelocations) + Function.setSimple(false); ++NextSymRefI; } From 732c6b411e5149756e8f30340c7b9c43de173e31 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 23 Jan 2018 15:10:24 -0800 Subject: [PATCH 367/904] [BOLT] Refactoring - add BinarySection class Summary: Add BinarySection class that is a wrapper around SectionRef. This is refactoring work for static data reordering. (cherry picked from commit f19c84e7e370365a1820158f0bb3ca541cf5b0d9) --- bolt/BinaryContext.cpp | 322 +++----------------------- bolt/BinaryContext.h | 93 ++++---- bolt/BinaryFunction.cpp | 39 ++-- bolt/BinaryFunction.h | 6 +- bolt/BinarySection.cpp | 227 ++++++++++++++++++ bolt/BinarySection.h | 148 ++++++++++++ bolt/CMakeLists.txt | 1 + bolt/DWARFRewriter.cpp | 7 +- bolt/Passes/BinaryPasses.cpp | 17 +- bolt/Passes/IndirectCallPromotion.cpp | 2 +- bolt/RewriteInstance.cpp | 137 +++++------ bolt/RewriteInstance.h | 17 +- 12 files changed, 553 insertions(+), 463 deletions(-) create mode 100644 bolt/BinarySection.cpp create mode 100644 bolt/BinarySection.h diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 33bb79ed2631..10f557603b68 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -19,7 +19,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" - using namespace llvm; using namespace bolt; @@ -47,14 +46,6 @@ PrintMemData("print-mem-data", } // namespace opts -namespace llvm { -namespace bolt { -extern void check_error(std::error_code EC, StringRef Message); -} -} - -Triple::ArchType 
Relocation::Arch; - BinaryContext::~BinaryContext() { } MCObjectWriter *BinaryContext::createObjectWriter(raw_pwrite_stream &OS) { @@ -438,19 +429,15 @@ void BinaryContext::printInstruction(raw_ostream &OS, ErrorOr> BinaryContext::getFunctionData(const BinaryFunction &Function) const { - auto Section = Function.getSection(); - assert(Section.getAddress() <= Function.getAddress() && - Section.getAddress() + Section.getSize() - >= Function.getAddress() + Function.getSize() && + auto &Section = Function.getSection(); + assert(Section.containsRange(Function.getAddress(), Function.getSize()) && "wrong section for function"); if (!Section.isText() || Section.isVirtual() || !Section.getSize()) { return std::make_error_code(std::errc::bad_address); } - StringRef SectionContents; - check_error(Section.getContents(SectionContents), - "cannot get section contents"); + StringRef SectionContents = Section.getContents(); assert(SectionContents.size() == Section.getSize() && "section size mismatch"); @@ -461,7 +448,18 @@ BinaryContext::getFunctionData(const BinaryFunction &Function) const { return ArrayRef(Bytes + FunctionOffset, Function.getSize()); } -ErrorOr BinaryContext::getSectionForAddress(uint64_t Address) const{ +ErrorOr BinaryContext::getSectionForAddress(uint64_t Address) { + auto SI = AllocatableSections.upper_bound(Address); + if (SI != AllocatableSections.begin()) { + --SI; + if (SI->first + SI->second.getSize() > Address) + return SI->second; + } + return std::make_error_code(std::errc::bad_address); +} + +ErrorOr +BinaryContext::getSectionForAddress(uint64_t Address) const { auto SI = AllocatableSections.upper_bound(Address); if (SI != AllocatableSections.begin()) { --SI; @@ -475,10 +473,9 @@ ErrorOr BinaryContext::extractPointerAtAddress(uint64_t Address) const { auto Section = getSectionForAddress(Address); if (!Section) - return Section.getError(); + return std::make_error_code(std::errc::bad_address); - StringRef SectionContents; - 
Section->getContents(SectionContents); + StringRef SectionContents = Section->getContents(); DataExtractor DE(SectionContents, AsmInfo->isLittleEndian(), AsmInfo->getPointerSize()); @@ -486,280 +483,31 @@ BinaryContext::extractPointerAtAddress(uint64_t Address) const { return DE.getAddress(&SectionOffset); } -void BinaryContext::addSectionRelocation(SectionRef Section, uint64_t Offset, - MCSymbol *Symbol, uint64_t Type, +void BinaryContext::addSectionRelocation(BinarySection &Section, + uint64_t Offset, + MCSymbol *Symbol, + uint64_t Type, uint64_t Addend) { - auto RI = SectionRelocations.find(Section); - if (RI == SectionRelocations.end()) { - auto Result = - SectionRelocations.emplace(Section, std::set()); - RI = Result.first; - } - RI->second.emplace(Relocation{Offset, Symbol, Type, Addend, 0}); + Section.addRelocation(Offset, Symbol, Type, Addend); } -void BinaryContext::addRelocation(uint64_t Address, MCSymbol *Symbol, - uint64_t Type, uint64_t Addend) { - auto ContainingSection = getSectionForAddress(Address); - assert(ContainingSection && "cannot find section for address"); - addSectionRelocation(*ContainingSection, - Address - ContainingSection->getAddress(), - Symbol, - Type, - Addend); +void BinaryContext::addRelocation(uint64_t Address, + MCSymbol *Symbol, + uint64_t Type, + uint64_t Addend) { + auto Section = getSectionForAddress(Address); + assert(Section && "cannot find section for address"); + Section->addRelocation(Address - Section->getAddress(), Symbol, Type, Addend); } void BinaryContext::removeRelocationAt(uint64_t Address) { - auto ContainingSection = getSectionForAddress(Address); - assert(ContainingSection && "cannot find section for address"); - auto RI = SectionRelocations.find(*ContainingSection); - if (RI == SectionRelocations.end()) - return; - - auto &Relocations = RI->second; - auto RelocI = Relocations.find( - Relocation{Address - ContainingSection->getAddress(), 0, 0, 0, 0}); - if (RelocI == Relocations.end()) - return; - - 
Relocations.erase(RelocI); + auto Section = getSectionForAddress(Address); + assert(Section && "cannot find section for address"); + Section->removeRelocationAt(Address - Section->getAddress()); } const Relocation *BinaryContext::getRelocationAt(uint64_t Address) { - auto ContainingSection = getSectionForAddress(Address); - assert(ContainingSection && "cannot find section for address"); - auto RI = SectionRelocations.find(*ContainingSection); - if (RI == SectionRelocations.end()) - return nullptr; - - auto &Relocations = RI->second; - auto RelocI = Relocations.find( - Relocation{Address - ContainingSection->getAddress(), 0, 0, 0, 0}); - if (RelocI == Relocations.end()) - return nullptr; - - return &*RelocI; -} - -size_t Relocation::getSizeForType(uint64_t Type) { - switch (Type) { - default: - llvm_unreachable("unsupported relocation type"); - case ELF::R_X86_64_PC8: - return 1; - case ELF::R_X86_64_PLT32: - case ELF::R_X86_64_PC32: - case ELF::R_X86_64_32S: - case ELF::R_X86_64_32: - case ELF::R_X86_64_GOTPCREL: - case ELF::R_X86_64_GOTTPOFF: - case ELF::R_X86_64_TPOFF32: - case ELF::R_X86_64_GOTPCRELX: - case ELF::R_X86_64_REX_GOTPCRELX: - case ELF::R_AARCH64_CALL26: - case ELF::R_AARCH64_ADR_PREL_PG_HI21: - case ELF::R_AARCH64_LDST64_ABS_LO12_NC: - case ELF::R_AARCH64_ADD_ABS_LO12_NC: - case ELF::R_AARCH64_LDST128_ABS_LO12_NC: - case ELF::R_AARCH64_LDST32_ABS_LO12_NC: - case ELF::R_AARCH64_LDST16_ABS_LO12_NC: - case ELF::R_AARCH64_LDST8_ABS_LO12_NC: - case ELF::R_AARCH64_ADR_GOT_PAGE: - case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: - case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: - case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: - case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: - case ELF::R_AARCH64_LD64_GOT_LO12_NC: - case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: - case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: - case ELF::R_AARCH64_TLSDESC_CALL: - case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: - case ELF::R_AARCH64_JUMP26: - case ELF::R_AARCH64_PREL32: - return 4; - case 
ELF::R_X86_64_PC64: - case ELF::R_X86_64_64: - case ELF::R_AARCH64_ABS64: - return 8; - } -} - -uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents, - uint64_t PC) { - switch (Type) { - default: - llvm_unreachable("unsupported relocation type"); - case ELF::R_AARCH64_ABS64: - return Contents; - case ELF::R_AARCH64_PREL32: - return static_cast(PC) + SignExtend64<32>(Contents & 0xffffffff); - case ELF::R_AARCH64_TLSDESC_CALL: - case ELF::R_AARCH64_JUMP26: - case ELF::R_AARCH64_CALL26: - // Immediate goes in bits 25:0 of B and BL. - Contents &= ~0xfffffffffc000000ULL; - return static_cast(PC) + SignExtend64<28>(Contents << 2); - case ELF::R_AARCH64_ADR_GOT_PAGE: - case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: - case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: - case ELF::R_AARCH64_ADR_PREL_PG_HI21: { - // Bits 32:12 of Symbol address goes in bits 30:29 + 23:5 of ADRP - // instruction - Contents &= ~0xffffffff9f00001fUll; - auto LowBits = (Contents >> 29) & 0x3; - auto HighBits = (Contents >> 5) & 0x7ffff; - Contents = LowBits | (HighBits << 2); - Contents = static_cast(PC) + SignExtend64<32>(Contents << 12); - Contents &= ~0xfffUll; - return Contents; - } - case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: - case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: - case ELF::R_AARCH64_LD64_GOT_LO12_NC: - case ELF::R_AARCH64_LDST64_ABS_LO12_NC: { - // Immediate goes in bits 21:10 of LD/ST instruction, taken - // from bits 11:3 of Symbol address - Contents &= ~0xffffffffffc003ffU; - return Contents >> (10 - 3); - } - case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: - case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: - case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: - case ELF::R_AARCH64_ADD_ABS_LO12_NC: { - // Immediate goes in bits 21:10 of ADD instruction - Contents &= ~0xffffffffffc003ffU; - return Contents >> (10 - 0); - } - case ELF::R_AARCH64_LDST128_ABS_LO12_NC: { - // Immediate goes in bits 21:10 of ADD instruction, taken - // from bits 11:4 of Symbol address - Contents &= 
~0xffffffffffc003ffU; - return Contents >> (10 - 4); - } - case ELF::R_AARCH64_LDST32_ABS_LO12_NC: { - // Immediate goes in bits 21:10 of ADD instruction, taken - // from bits 11:2 of Symbol address - Contents &= ~0xffffffffffc003ffU; - return Contents >> (10 - 2); - } - case ELF::R_AARCH64_LDST16_ABS_LO12_NC: { - // Immediate goes in bits 21:10 of ADD instruction, taken - // from bits 11:1 of Symbol address - Contents &= ~0xffffffffffc003ffU; - return Contents >> (10 - 1); - } - case ELF::R_AARCH64_LDST8_ABS_LO12_NC: { - // Immediate goes in bits 21:10 of ADD instruction, taken - // from bits 11:0 of Symbol address - Contents &= ~0xffffffffffc003ffU; - return Contents >> (10 - 0); - } - } -} - -bool Relocation::isGOT(uint64_t Type) { - switch (Type) { - default: - return false; - case ELF::R_AARCH64_ADR_GOT_PAGE: - case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: - case ELF::R_AARCH64_LD64_GOT_LO12_NC: - case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: - case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: - case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: - case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: - case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: - case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: - case ELF::R_AARCH64_TLSDESC_CALL: - return true; - } -} - -bool Relocation::isPCRelative(uint64_t Type) { - switch (Type) { - default: - llvm_unreachable("Unknown relocation type"); - - case ELF::R_X86_64_64: - case ELF::R_X86_64_32: - case ELF::R_X86_64_32S: - case ELF::R_X86_64_TPOFF32: - case ELF::R_AARCH64_ABS64: - case ELF::R_AARCH64_LDST64_ABS_LO12_NC: - case ELF::R_AARCH64_ADD_ABS_LO12_NC: - case ELF::R_AARCH64_LDST128_ABS_LO12_NC: - case ELF::R_AARCH64_LDST32_ABS_LO12_NC: - case ELF::R_AARCH64_LDST16_ABS_LO12_NC: - case ELF::R_AARCH64_LDST8_ABS_LO12_NC: - case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: - case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: - case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: - case ELF::R_AARCH64_LD64_GOT_LO12_NC: - case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: - case 
ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: - return false; - - case ELF::R_X86_64_PC8: - case ELF::R_X86_64_PC32: - case ELF::R_X86_64_GOTPCREL: - case ELF::R_X86_64_PLT32: - case ELF::R_X86_64_GOTTPOFF: - case ELF::R_X86_64_GOTPCRELX: - case ELF::R_X86_64_REX_GOTPCRELX: - case ELF::R_AARCH64_TLSDESC_CALL: - case ELF::R_AARCH64_CALL26: - case ELF::R_AARCH64_ADR_PREL_PG_HI21: - case ELF::R_AARCH64_ADR_GOT_PAGE: - case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: - case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: - case ELF::R_AARCH64_JUMP26: - case ELF::R_AARCH64_PREL32: - return true; - } -} - -size_t Relocation::emit(MCStreamer *Streamer) const { - const auto Size = getSizeForType(Type); - auto &Ctx = Streamer->getContext(); - if (isPCRelative(Type)) { - auto *TempLabel = Ctx.createTempSymbol(); - Streamer->EmitLabel(TempLabel); - auto Value = - MCBinaryExpr::createSub(MCSymbolRefExpr::create(Symbol, Ctx), - MCSymbolRefExpr::create(TempLabel, Ctx), - Ctx); - if (Addend) { - Value = MCBinaryExpr::createAdd(Value, - MCConstantExpr::create(Addend, Ctx), - Ctx); - } - Streamer->EmitValue(Value, Size); - } else { - Streamer->EmitSymbolValue(Symbol, Size); - } - return Size; -} - -#define ELF_RELOC(name, value) #name, - -void Relocation::print(raw_ostream &OS) const { - static const char *X86RelocNames[] = { -#include "llvm/Support/ELFRelocs/x86_64.def" - }; - static const char *AArch64RelocNames[] = { -#include "llvm/Support/ELFRelocs/AArch64.def" - }; - if (Arch == Triple::aarch64) - OS << AArch64RelocNames[Type]; - else - OS << X86RelocNames[Type]; - OS << ", 0x" << Twine::utohexstr(Offset); - if (Symbol) { - OS << ", " << Symbol->getName(); - } - if (int64_t(Addend) < 0) - OS << ", -0x" << Twine::utohexstr(-int64_t(Addend)); - else - OS << ", 0x" << Twine::utohexstr(Addend); - OS << ", 0x" << Twine::utohexstr(Value); + auto Section = getSectionForAddress(Address); + assert(Section && "cannot find section for address"); + return Section->getRelocationAt(Address - 
Section->getAddress()); } diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 370d209e49ef..8c9722b50435 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -14,6 +14,7 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H #define LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H +#include "BinarySection.h" #include "DebugData.h" #include "llvm/ADT/Triple.h" #include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" @@ -53,51 +54,16 @@ namespace bolt { class BinaryFunction; class DataReader; -/// Relocation class. -struct Relocation { - static Triple::ArchType Arch; /// for printing, set by BinaryContext ctor. - uint64_t Offset; - mutable MCSymbol *Symbol; /// mutable to allow modification by emitter. - uint64_t Type; - uint64_t Addend; - uint64_t Value; - - /// Return size of the given relocation \p Type. - static size_t getSizeForType(uint64_t Type); - - /// Extract current relocated value from binary contents. This is used for - /// RISC architectures where values are encoded in specific bits depending - /// on the relocation value. - static uint64_t extractValue(uint64_t Type, uint64_t Contents, uint64_t PC); - - /// Return true if relocation type is PC-relative. Return false otherwise. - static bool isPCRelative(uint64_t Type); - - /// Return true if relocation type implies the creation of a GOT entry - static bool isGOT(uint64_t Type); - - /// Emit relocation at a current \p Streamer' position. The caller is - /// responsible for setting the position correctly. - size_t emit(MCStreamer *Streamer) const; - - /// Print a relocation to \p OS. - void print(raw_ostream &OS) const; -}; - -/// Relocation ordering by offset. -inline bool operator<(const Relocation &A, const Relocation &B) { - return A.Offset < B.Offset; -} - -inline raw_ostream &operator<<(raw_ostream &OS, const Relocation &Rel) { - Rel.print(OS); - return OS; -} - class BinaryContext { BinaryContext() = delete; + /// Map virtual address to a section. 
+ using SectionMapType = std::map; + SectionMapType AllocatableSections; + + /// Map of section name to BinarySection object. + std::map NameToSection; public: /// [name] -> [address] map used for global symbol resolution. @@ -118,17 +84,11 @@ class BinaryContext { /// Map address to a constant island owner (constant data in code section) std::map AddressToConstantIslandMap; - /// Map virtual address to a section. - std::map AllocatableSections; - /// Set of addresses in the code that are not a function start, and are /// referenced from outside of containing function. E.g. this could happen /// when a function has more than a single entry point. std::set InterproceduralReferences; - /// Section relocations. - std::map> SectionRelocations; - std::unique_ptr Ctx; std::unique_ptr DwCtx; @@ -255,8 +215,43 @@ class BinaryContext { ErrorOr> getFunctionData(const BinaryFunction &Function) const; + BinarySection ®isterSection(SectionRef Section) { + assert(!AllocatableSections.count(Section.getAddress()) && + "can't register section twice"); + StringRef Name; + Section.getName(Name); + assert(!NameToSection.count(Name) && "can't register section name twice"); + auto Res = AllocatableSections.emplace(Section.getAddress(), + BinarySection(Section)); + NameToSection[Name] = &Res.first->second; + return Res.first->second; + } + + iterator_range sections() { + return make_range(AllocatableSections.begin(), AllocatableSections.end()); + } + + iterator_range sections() const { + return make_range(AllocatableSections.begin(), AllocatableSections.end()); + } + /// Return (allocatable) section containing the given \p Address. - ErrorOr getSectionForAddress(uint64_t Address) const; + ErrorOr getSectionForAddress(uint64_t Address); + ErrorOr getSectionForAddress(uint64_t Address) const; + + /// Return (allocatable) section associated with given \p Name. 
+ ErrorOr getSectionByName(StringRef Name) { + auto Itr = NameToSection.find(Name); + if (Itr != NameToSection.end()) + return *Itr->second; + return std::make_error_code(std::errc::bad_address); + } + ErrorOr getSectionByName(StringRef Name) const { + auto Itr = NameToSection.find(Name); + if (Itr != NameToSection.end()) + return *Itr->second; + return std::make_error_code(std::errc::bad_address); + } /// Given \p Address in the binary, extract and return a pointer value at that /// address. The address has to be a valid statically allocated address for @@ -293,7 +288,7 @@ class BinaryContext { std::map &BFs); /// Add relocation for \p Section at a given \p Offset. - void addSectionRelocation(SectionRef Section, uint64_t Offset, + void addSectionRelocation(BinarySection &Section, uint64_t Offset, MCSymbol *Symbol, uint64_t Type, uint64_t Addend = 0); diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index caedd2a5549b..4c249b0ab9ab 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -371,8 +371,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, if (!opts::shouldProcess(*this) || !opts::shouldPrint(*this)) return; - StringRef SectionName; - Section.getName(SectionName); + StringRef SectionName = Section.getName(); OS << "Binary Function \"" << *this << "\" " << Annotation << " {"; if (Names.size() > 1) { OS << "\n Other names : "; @@ -755,8 +754,8 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, } } - auto SectionOrError = BC.getSectionForAddress(ArrayStart); - if (!SectionOrError) { + auto Section = BC.getSectionForAddress(ArrayStart); + if (!Section) { // No section - possibly an absolute address. Since we don't allow // internal function addresses to escape the function scope - we // consider it a tail call. 
@@ -767,25 +766,23 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, } return IndirectBranchType::POSSIBLE_TAIL_CALL; } - auto &Section = *SectionOrError; - if (Section.isVirtual()) { + if (Section->isVirtual()) { // The contents are filled at runtime. return IndirectBranchType::POSSIBLE_TAIL_CALL; } // Extract the value at the start of the array. - StringRef SectionContents; - Section.getContents(SectionContents); + StringRef SectionContents = Section->getContents(); const auto EntrySize = Type == IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE ? 4 : PtrSize; DataExtractor DE(SectionContents, BC.AsmInfo->isLittleEndian(), EntrySize); - auto ValueOffset = static_cast(ArrayStart - Section.getAddress()); + auto ValueOffset = static_cast(ArrayStart - Section->getAddress()); uint64_t Value = 0; std::vector JTOffsetCandidates; - while (ValueOffset <= Section.getSize() - EntrySize) { + while (ValueOffset <= Section->getSize() - EntrySize) { DEBUG(dbgs() << "BOLT-DEBUG: indirect jmp at 0x" << Twine::utohexstr(getAddress() + Offset) << " is referencing address 0x" - << Twine::utohexstr(Section.getAddress() + ValueOffset)); + << Twine::utohexstr(Section->getAddress() + ValueOffset)); // Extract the value and increment the offset. if (BC.TheTriple->getArch() == llvm::Triple::aarch64) { Value = PCRelAddr + DE.getSigned(&ValueOffset, EntrySize); @@ -2122,8 +2119,7 @@ void BinaryFunction::emitBodyRaw(MCStreamer *Streamer) { "cannot emit raw body unless relocation accuracy is guaranteed"); // Raw contents of the function. - StringRef SectionContents; - Section.getContents(SectionContents); + StringRef SectionContents = Section.getContents(); // Raw contents of the function. StringRef FunctionContents = @@ -2201,8 +2197,7 @@ void BinaryFunction::emitConstantIslands( assert((!OnBehalfOf || IslandProxies[OnBehalfOf].size() > 0) && "spurious OnBehalfOf constant island emission"); // Raw contents of the function. 
- StringRef SectionContents; - Section.getContents(SectionContents); + StringRef SectionContents = Section.getContents(); // Raw contents of the function. StringRef FunctionContents = @@ -3314,22 +3309,20 @@ void BinaryFunction::JumpTable::updateOriginal(BinaryContext &BC) { // This way we only overwrite them when a corresponding function is // overwritten. assert(BC.HasRelocations && "relocation mode expected"); - auto SectionOrError = BC.getSectionForAddress(Address); - assert(SectionOrError && "section not found for jump table"); - auto Section = SectionOrError.get(); - uint64_t Offset = Address - Section.getAddress(); - StringRef SectionName; - Section.getName(SectionName); + auto Section = BC.getSectionForAddress(Address); + assert(Section && "section not found for jump table"); + uint64_t Offset = Address - Section->getAddress(); + StringRef SectionName = Section->getName(); for (auto *Entry : Entries) { const auto RelType = (Type == JTT_NORMAL) ? ELF::R_X86_64_64 : ELF::R_X86_64_PC32; const uint64_t RelAddend = (Type == JTT_NORMAL) - ? 0 : Offset - (Address - Section.getAddress()); + ? 0 : Offset - (Address - Section->getAddress()); DEBUG(dbgs() << "adding relocation to section " << SectionName << " at offset " << Twine::utohexstr(Offset) << " for symbol " << Entry->getName() << " with addend " << Twine::utohexstr(RelAddend) << '\n'); - BC.addSectionRelocation(Section, Offset, Entry, RelType, RelAddend); + BC.addSectionRelocation(*Section, Offset, Entry, RelType, RelAddend); Offset += EntrySize; } } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 52cdfd9f4f17..88546feb950d 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -245,7 +245,7 @@ class BinaryFunction { std::vector Names; /// Containing section - SectionRef Section; + BinarySection &Section; /// Address of the function in memory. Also could be an offset from /// base address for position independent binaries. 
@@ -814,7 +814,7 @@ class BinaryFunction { friend class BinaryContext; /// Creation should be handled by RewriteInstance::createBinaryFunction(). - BinaryFunction(const std::string &Name, SectionRef Section, uint64_t Address, + BinaryFunction(const std::string &Name, BinarySection &Section, uint64_t Address, uint64_t Size, BinaryContext &BC, bool IsSimple) : Names({Name}), Section(Section), Address(Address), Size(Size), BC(BC), IsSimple(IsSimple), @@ -1075,7 +1075,7 @@ class BinaryFunction { } /// Return containing file section. - SectionRef getSection() const { + BinarySection &getSection() const { return Section; } diff --git a/bolt/BinarySection.cpp b/bolt/BinarySection.cpp new file mode 100644 index 000000000000..886727ae6af4 --- /dev/null +++ b/bolt/BinarySection.cpp @@ -0,0 +1,227 @@ +//===--- BinarySection.cpp - Interface for object file section -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "BinarySection.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCStreamer.h" + +using namespace llvm; +using namespace bolt; + +Triple::ArchType Relocation::Arch; + +size_t Relocation::getSizeForType(uint64_t Type) { + switch (Type) { + default: + llvm_unreachable("unsupported relocation type"); + case ELF::R_X86_64_PC8: + return 1; + case ELF::R_X86_64_PLT32: + case ELF::R_X86_64_PC32: + case ELF::R_X86_64_32S: + case ELF::R_X86_64_32: + case ELF::R_X86_64_GOTPCREL: + case ELF::R_X86_64_GOTTPOFF: + case ELF::R_X86_64_TPOFF32: + case ELF::R_X86_64_GOTPCRELX: + case ELF::R_X86_64_REX_GOTPCRELX: + case ELF::R_AARCH64_CALL26: + case ELF::R_AARCH64_ADR_PREL_PG_HI21: + case ELF::R_AARCH64_LDST64_ABS_LO12_NC: + case ELF::R_AARCH64_ADD_ABS_LO12_NC: + case ELF::R_AARCH64_LDST128_ABS_LO12_NC: + case ELF::R_AARCH64_LDST32_ABS_LO12_NC: + case ELF::R_AARCH64_LDST16_ABS_LO12_NC: + case ELF::R_AARCH64_LDST8_ABS_LO12_NC: + case ELF::R_AARCH64_ADR_GOT_PAGE: + case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: + case ELF::R_AARCH64_LD64_GOT_LO12_NC: + case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: + case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: + case ELF::R_AARCH64_TLSDESC_CALL: + case ELF::R_AARCH64_JUMP26: + case ELF::R_AARCH64_PREL32: + return 4; + case ELF::R_X86_64_PC64: + case ELF::R_X86_64_64: + case ELF::R_AARCH64_ABS64: + return 8; + } +} + +uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents, + uint64_t PC) { + switch (Type) { + default: + llvm_unreachable("unsupported relocation type"); + case ELF::R_AARCH64_ABS64: + return Contents; + case ELF::R_AARCH64_PREL32: + return static_cast(PC) + SignExtend64<32>(Contents & 0xffffffff); + case ELF::R_AARCH64_TLSDESC_CALL: + case ELF::R_AARCH64_JUMP26: + case ELF::R_AARCH64_CALL26: + // Immediate goes in bits 25:0 of B and BL. 
+ Contents &= ~0xfffffffffc000000ULL; + return static_cast(PC) + SignExtend64<28>(Contents << 2); + case ELF::R_AARCH64_ADR_GOT_PAGE: + case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: + case ELF::R_AARCH64_ADR_PREL_PG_HI21: { + // Bits 32:12 of Symbol address goes in bits 30:29 + 23:5 of ADRP + // instruction + Contents &= ~0xffffffff9f00001fUll; + auto LowBits = (Contents >> 29) & 0x3; + auto HighBits = (Contents >> 5) & 0x7ffff; + Contents = LowBits | (HighBits << 2); + Contents = static_cast(PC) + SignExtend64<32>(Contents << 12); + Contents &= ~0xfffUll; + return Contents; + } + case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: + case ELF::R_AARCH64_LD64_GOT_LO12_NC: + case ELF::R_AARCH64_LDST64_ABS_LO12_NC: { + // Immediate goes in bits 21:10 of LD/ST instruction, taken + // from bits 11:3 of Symbol address + Contents &= ~0xffffffffffc003ffU; + return Contents >> (10 - 3); + } + case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: + case ELF::R_AARCH64_ADD_ABS_LO12_NC: { + // Immediate goes in bits 21:10 of ADD instruction + Contents &= ~0xffffffffffc003ffU; + return Contents >> (10 - 0); + } + case ELF::R_AARCH64_LDST128_ABS_LO12_NC: { + // Immediate goes in bits 21:10 of ADD instruction, taken + // from bits 11:4 of Symbol address + Contents &= ~0xffffffffffc003ffU; + return Contents >> (10 - 4); + } + case ELF::R_AARCH64_LDST32_ABS_LO12_NC: { + // Immediate goes in bits 21:10 of ADD instruction, taken + // from bits 11:2 of Symbol address + Contents &= ~0xffffffffffc003ffU; + return Contents >> (10 - 2); + } + case ELF::R_AARCH64_LDST16_ABS_LO12_NC: { + // Immediate goes in bits 21:10 of ADD instruction, taken + // from bits 11:1 of Symbol address + Contents &= ~0xffffffffffc003ffU; + return Contents >> (10 - 1); + } + case ELF::R_AARCH64_LDST8_ABS_LO12_NC: { + // Immediate goes in bits 21:10 of ADD instruction, taken + // from bits 11:0 of Symbol address + Contents &= ~0xffffffffffc003ffU; + return Contents >> (10 - 0); + } + } +} + +bool Relocation::isGOT(uint64_t Type) { + switch 
(Type) { + default: + return false; + case ELF::R_AARCH64_ADR_GOT_PAGE: + case ELF::R_AARCH64_LD64_GOT_LO12_NC: + case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: + case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: + case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: + case ELF::R_AARCH64_TLSDESC_CALL: + return true; + } +} + +bool Relocation::isPCRelative(uint64_t Type) { + switch (Type) { + default: + llvm_unreachable("Unknown relocation type"); + + case ELF::R_X86_64_64: + case ELF::R_X86_64_32: + case ELF::R_X86_64_32S: + case ELF::R_X86_64_TPOFF32: + case ELF::R_AARCH64_ABS64: + case ELF::R_AARCH64_LDST64_ABS_LO12_NC: + case ELF::R_AARCH64_ADD_ABS_LO12_NC: + case ELF::R_AARCH64_LDST128_ABS_LO12_NC: + case ELF::R_AARCH64_LDST32_ABS_LO12_NC: + case ELF::R_AARCH64_LDST16_ABS_LO12_NC: + case ELF::R_AARCH64_LDST8_ABS_LO12_NC: + case ELF::R_AARCH64_LD64_GOT_LO12_NC: + case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: + case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: + return false; + + case ELF::R_X86_64_PC8: + case ELF::R_X86_64_PC32: + case ELF::R_X86_64_GOTPCREL: + case ELF::R_X86_64_PLT32: + case ELF::R_X86_64_GOTTPOFF: + case ELF::R_X86_64_GOTPCRELX: + case ELF::R_X86_64_REX_GOTPCRELX: + case ELF::R_AARCH64_TLSDESC_CALL: + case ELF::R_AARCH64_CALL26: + case ELF::R_AARCH64_ADR_PREL_PG_HI21: + case ELF::R_AARCH64_ADR_GOT_PAGE: + case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: + case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: + case ELF::R_AARCH64_JUMP26: + case ELF::R_AARCH64_PREL32: + return true; + } +} + +size_t Relocation::emit(MCStreamer *Streamer) const { + const auto Size = getSizeForType(Type); + auto &Ctx = Streamer->getContext(); + if (isPCRelative(Type)) { + auto *TempLabel = Ctx.createTempSymbol(); + Streamer->EmitLabel(TempLabel); + auto Value = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(Symbol, Ctx), + MCSymbolRefExpr::create(TempLabel, Ctx), + Ctx); + if (Addend) { + Value = MCBinaryExpr::createAdd(Value, + MCConstantExpr::create(Addend, Ctx), + Ctx); + } + Streamer->EmitValue(Value, Size); 
+ } else { + Streamer->EmitSymbolValue(Symbol, Size); + } + return Size; +} + +#define ELF_RELOC(name, value) #name, + +void Relocation::print(raw_ostream &OS) const { + static const char *X86RelocNames[] = { +#include "llvm/Support/ELFRelocs/x86_64.def" + }; + static const char *AArch64RelocNames[] = { +#include "llvm/Support/ELFRelocs/AArch64.def" + }; + if (Arch == Triple::aarch64) + OS << AArch64RelocNames[Type]; + else + OS << X86RelocNames[Type]; + OS << ", 0x" << Twine::utohexstr(Offset); + if (Symbol) { + OS << ", " << Symbol->getName(); + } + if (int64_t(Addend) < 0) + OS << ", -0x" << Twine::utohexstr(-int64_t(Addend)); + else + OS << ", 0x" << Twine::utohexstr(Addend); + OS << ", 0x" << Twine::utohexstr(Value); +} diff --git a/bolt/BinarySection.h b/bolt/BinarySection.h new file mode 100644 index 000000000000..84b4ce4c2f0e --- /dev/null +++ b/bolt/BinarySection.h @@ -0,0 +1,148 @@ +//===--- BinarySection.h - Interface for object file section -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_SECTION_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_SECTION_H + +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/raw_ostream.h" +#include + +namespace llvm { + +using namespace object; + +namespace bolt { + +/// Relocation class. +struct Relocation { + static Triple::ArchType Arch; /// for printing, set by BinaryContext ctor. + uint64_t Offset; + mutable MCSymbol *Symbol; /// mutable to allow modification by emitter. 
+ uint64_t Type; + uint64_t Addend; + uint64_t Value; + + /// Return size of the given relocation \p Type. + static size_t getSizeForType(uint64_t Type); + + /// Extract current relocated value from binary contents. This is used for + /// RISC architectures where values are encoded in specific bits depending + /// on the relocation value. + static uint64_t extractValue(uint64_t Type, uint64_t Contents, uint64_t PC); + + /// Return true if relocation type is PC-relative. Return false otherwise. + static bool isPCRelative(uint64_t Type); + + /// Return true if relocation type implies the creation of a GOT entry + static bool isGOT(uint64_t Type); + + /// Emit relocation at a current \p Streamer' position. The caller is + /// responsible for setting the position correctly. + size_t emit(MCStreamer *Streamer) const; + + /// Print a relocation to \p OS. + void print(raw_ostream &OS) const; +}; + +/// Relocation ordering by offset. +inline bool operator<(const Relocation &A, const Relocation &B) { + return A.Offset < B.Offset; +} + +inline raw_ostream &operator<<(raw_ostream &OS, const Relocation &Rel) { + Rel.print(OS); + return OS; +} + +/// A wrapper around SectionRef that also manages related relocations +class BinarySection { + SectionRef Section; + std::set Relocations; +public: + explicit BinarySection(SectionRef Section) : Section(Section) { } + + StringRef getName() const { + StringRef Name; + Section.getName(Name); + return Name; + } + uint64_t getAddress() const { return Section.getAddress(); } + uint64_t getEndAddress() const { return getAddress() + getSize(); } + uint64_t getSize() const { return Section.getSize(); } + uint64_t getAlignment() const { return Section.getAlignment(); } + bool containsAddress(uint64_t Address) const { + return getAddress() <= Address && Address < getEndAddress(); + } + bool containsRange(uint64_t Address, uint64_t Size) const { + return getAddress() <= Address && Address + Size <= getEndAddress(); + } + bool isReadOnly() const { 
return Section.isReadOnly(); } + bool isVirtual() const { return Section.isVirtual(); } + bool isText() const { return Section.isText(); } + bool isAllocatable() const { return getFlags() & ELF::SHF_ALLOC; } + StringRef getContents() const { + StringRef Contents; + if (auto EC = Section.getContents(Contents)) { + errs() << "BOLT-ERROR: cannot get section contents for " + << getName() << ": " << EC.message() << ".\n"; + exit(1); + } + return Contents; + } + unsigned getFlags() const { return ELFSectionRef(Section).getFlags(); } + unsigned getType() const { return ELFSectionRef(Section).getType(); } + SectionRef getSectionRef() const { return Section; } + + iterator_range::iterator> relocations() { + return make_range(Relocations.begin(), Relocations.end()); + } + + iterator_range::const_iterator> relocations() const { + return make_range(Relocations.begin(), Relocations.end()); + } + + bool hasRelocations() const { + return !Relocations.empty(); + } + + void removeRelocationAt(uint64_t Offset) { + Relocation Key{Offset, 0, 0, 0, 0}; + auto Itr = Relocations.find(Key); + if (Itr != Relocations.end()) + Relocations.erase(Itr); + } + + void addRelocation(uint64_t Offset, + MCSymbol *Symbol, + uint64_t Type, + uint64_t Addend, + uint64_t Value = 0) { + assert(Offset < getSize()); + Relocations.emplace(Relocation{Offset, Symbol, Type, Addend, Value}); + } + + const Relocation *getRelocationAt(uint64_t Offset) const { + Relocation Key{Offset, 0, 0, 0, 0}; + auto Itr = Relocations.find(Key); + return Itr != Relocations.end() ? 
&*Itr : nullptr; + } +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 5e6ce6fe7b8c..bdcdb051f5b6 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -65,6 +65,7 @@ add_llvm_tool(llvm-bolt BinaryFunction.cpp BinaryFunctionProfile.cpp BinaryPassManager.cpp + BinarySection.cpp CacheMetrics.cpp DataAggregator.cpp DataReader.cpp diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp index 565a79dfe519..e6025b2ddf45 100644 --- a/bolt/DWARFRewriter.cpp +++ b/bolt/DWARFRewriter.cpp @@ -456,7 +456,7 @@ void RewriteInstance::updateLineTableOffsets() { void RewriteInstance::finalizeDebugSections() { // Skip .debug_aranges if we are re-generating .gdb_index. - if (opts::KeepARanges || !GdbIndexSection.getObject()) { + if (opts::KeepARanges || !GdbIndexSection) { SmallVector ARangesBuffer; raw_svector_ostream OS(ARangesBuffer); @@ -505,11 +505,10 @@ void RewriteInstance::finalizeDebugSections() { } void RewriteInstance::updateGdbIndexSection() { - if (!GdbIndexSection.getObject()) + if (!GdbIndexSection) return; - StringRef GdbIndexContents; - GdbIndexSection.getContents(GdbIndexContents); + StringRef GdbIndexContents = GdbIndexSection->getContents(); const auto *Data = GdbIndexContents.data(); diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index f11dd625dfd3..b7c356f5a3de 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -1158,24 +1158,15 @@ bool SimplifyRODataLoads::simplifyRODataLoads( // Get the contents of the section containing the target address of the // memory operand. We are only interested in read-only sections. 
- ErrorOr DataSectionOrErr = - BC.getSectionForAddress(TargetAddress); - if (!DataSectionOrErr) - continue; - SectionRef DataSection = DataSectionOrErr.get(); - if (!DataSection.isReadOnly()) + auto DataSection = BC.getSectionForAddress(TargetAddress); + if (!DataSection || !DataSection->isReadOnly()) continue; if (BC.getRelocationAt(TargetAddress)) continue; - uint32_t Offset = TargetAddress - DataSection.getAddress(); - StringRef ConstantData; - if (std::error_code EC = DataSection.getContents(ConstantData)) { - errs() << "BOLT-ERROR: 'cannot get section contents': " - << EC.message() << ".\n"; - exit(1); - } + uint32_t Offset = TargetAddress - DataSection->getAddress(); + StringRef ConstantData = DataSection->getContents(); ++NumLocalLoadsFound; if (BB->hasProfile()) diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/Passes/IndirectCallPromotion.cpp index 299a339a69ad..5bb0044e0e47 100644 --- a/bolt/Passes/IndirectCallPromotion.cpp +++ b/bolt/Passes/IndirectCallPromotion.cpp @@ -1143,7 +1143,7 @@ void IndirectCallPromotion::runOnFunctions( PrintBB = true; break; } - if (auto Section = BC.getSectionForAddress(MI.Addr.Offset)) { + if (BC.getSectionForAddress(MI.Addr.Offset)) { PrintBB = true; break; } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 9ec8db675f4b..865e7ac0e088 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -639,11 +639,11 @@ createBinaryContext(ELFObjectFileBase *File, DataReader &DR, RewriteInstance::RewriteInstance(ELFObjectFileBase *File, DataReader &DR, DataAggregator &DA, const int Argc, const char *const *Argv) - : InputFile(File), Argc(Argc), Argv(Argv), DA(DA), - BC(createBinaryContext( - File, DR, - std::unique_ptr( - new DWARFContextInMemory(*InputFile, nullptr, true)))) {} + : InputFile(File), Argc(Argc), Argv(Argv), DA(DA), + BC(createBinaryContext( + File, DR, + std::unique_ptr( + new DWARFContextInMemory(*InputFile, nullptr, true)))) {} RewriteInstance::~RewriteInstance() {} @@ 
-1239,8 +1239,9 @@ void RewriteInstance::discoverFileObjects() { } BF->addAlternativeName(UniqueName); } else { - BF = createBinaryFunction(UniqueName, *Section, Address, SymbolSize, - IsSimple); + auto BS = BC->getSectionForAddress(Address); + assert(BS && "section for functions must be registered."); + BF = createBinaryFunction(UniqueName, *BS, Address, SymbolSize, IsSimple); } if (!AlternativeName.empty()) BF->addAlternativeName(AlternativeName); @@ -1327,20 +1328,19 @@ void RewriteInstance::discoverFileObjects() { } void RewriteInstance::disassemblePLT() { - if (!PLTSection.getObject()) + if (!PLTSection) return; - const auto PLTAddress = PLTSection.getAddress(); - StringRef PLTContents; - PLTSection.getContents(PLTContents); + const auto PLTAddress = PLTSection->getAddress(); + StringRef PLTContents = PLTSection->getContents(); ArrayRef PLTData( reinterpret_cast(PLTContents.data()), - PLTSection.getSize()); + PLTSection->getSize()); // Pseudo function for the start of PLT. The table could have a matching // FDE that we want to match to pseudo function. - createBinaryFunction("__BOLT_PLT_PSEUDO" , PLTSection, PLTAddress, 0, false); - for (uint64_t Offset = 0; Offset < PLTSection.getSize(); Offset += 0x10) { + createBinaryFunction("__BOLT_PLT_PSEUDO", *PLTSection, PLTAddress, 0, false); + for (uint64_t Offset = 0; Offset < PLTSection->getSize(); Offset += 0x10) { uint64_t InstrSize; MCInst Instruction; const uint64_t InstrAddr = PLTAddress + Offset; @@ -1369,7 +1369,7 @@ void RewriteInstance::disassemblePLT() { } // To get the name we have to read a relocation against the address. 
- for (const auto &Rel : RelaPLTSection.relocations()) { + for (const auto &Rel : RelaPLTSection->getSectionRef().relocations()) { if (Rel.getType() != ELF::R_X86_64_JUMP_SLOT) continue; if (Rel.getOffset() == TargetAddress) { @@ -1379,7 +1379,7 @@ void RewriteInstance::disassemblePLT() { const auto SymbolName = *(*SymbolIter).getName(); std::string Name = SymbolName.str() + "@PLT"; auto *BF = createBinaryFunction(Name, - PLTSection, + *PLTSection, InstrAddr, 0, /*IsSimple=*/false); @@ -1391,12 +1391,15 @@ void RewriteInstance::disassemblePLT() { } } - if (PLTGOTSection.getObject()) { + if (PLTGOTSection) { // Check if we need to create a function for .plt.got. Some linkers // (depending on the version) would mark it with FDE while others wouldn't. - if (!getBinaryFunctionAtAddress(PLTGOTSection.getAddress())) { - createBinaryFunction("__BOLT_PLT_GOT_PSEUDO" , PLTGOTSection, - PLTGOTSection.getAddress(), 0, false); + if (!getBinaryFunctionAtAddress(PLTGOTSection->getAddress())) { + createBinaryFunction("__BOLT_PLT_GOT_PSEUDO", + *PLTGOTSection, + PLTGOTSection->getAddress(), + 0, + false); } } } @@ -1439,8 +1442,7 @@ void RewriteInstance::adjustFunctionBoundaries() { : NextSymRefI->second.getSection(); // Function runs at most till the end of the containing section. - uint64_t NextObjectAddress = Function.getSection().getAddress() + - Function.getSection().getSize(); + uint64_t NextObjectAddress = Function.getSection().getEndAddress(); // Or till the next object marked by a symbol. 
if (NextSymRefI != FileSymRefs.end()) { NextObjectAddress = std::min(NextSymRefI->first, NextObjectAddress); @@ -1474,12 +1476,10 @@ void RewriteInstance::adjustFunctionBoundaries() { } void RewriteInstance::relocateEHFrameSection() { - assert(EHFrameSection.getObject() != nullptr && - "non-empty .eh_frame section expected"); + assert(EHFrameSection && "non-empty .eh_frame section expected"); - DWARFFrame EHFrame(EHFrameSection.getAddress()); - StringRef EHFrameSectionContents; - EHFrameSection.getContents(EHFrameSectionContents); + DWARFFrame EHFrame(EHFrameSection->getAddress()); + StringRef EHFrameSectionContents = EHFrameSection->getContents(); DataExtractor DE(EHFrameSectionContents, BC->AsmInfo->isLittleEndian(), BC->AsmInfo->getPointerSize()); @@ -1521,7 +1521,7 @@ void RewriteInstance::relocateEHFrameSection() { DEBUG(dbgs() << "BOLT-DEBUG: adding DWARF reference against symbol " << Symbol->getName() << '\n'); - BC->addSectionRelocation(EHFrameSection, Offset, Symbol, RelType); + BC->addSectionRelocation(*EHFrameSection, Offset, Symbol, RelType); }; EHFrame.parse(DE, createReloc); @@ -1534,7 +1534,7 @@ void RewriteInstance::relocateEHFrameSection() { } BinaryFunction *RewriteInstance::createBinaryFunction( - const std::string &Name, SectionRef Section, uint64_t Address, + const std::string &Name, BinarySection &Section, uint64_t Address, uint64_t Size, bool IsSimple) { auto Result = BinaryFunctions.emplace( Address, BinaryFunction(Name, Section, Address, Size, *BC, IsSimple)); @@ -1567,20 +1567,8 @@ void RewriteInstance::readSpecialSections() { LSDAAddress = Section.getAddress(); } else if (SectionName == ".debug_loc") { DebugLocSize = Section.getSize(); - } else if (SectionName == ".eh_frame") { - EHFrameSection = Section; } else if (SectionName == ".rela.text") { HasTextRelocations = true; - } else if (SectionName == ".gdb_index") { - GdbIndexSection = Section; - } else if (SectionName == ".plt") { - PLTSection = Section; - } else if (SectionName == 
".got.plt") { - GOTPLTSection = Section; - } else if (SectionName == ".plt.got") { - PLTGOTSection = Section; - } else if (SectionName == ".rela.plt") { - RelaPLTSection = Section; } // Ignore zero-size allocatable sections as they present no interest to us. @@ -1589,11 +1577,17 @@ void RewriteInstance::readSpecialSections() { if ((ELFSectionRef(Section).getFlags() & ELF::SHF_ALLOC) && Section.getSize() > 0 && SectionName != ".tbss") { - BC->AllocatableSections.emplace(std::make_pair(Section.getAddress(), - Section)); + BC->registerSection(Section); } } + EHFrameSection = BC->getSectionByName(".eh_frame"); + GdbIndexSection = BC->getSectionByName(".gdb_index"); + PLTSection = BC->getSectionByName(".plt"); + GOTPLTSection = BC->getSectionByName(".got.plt"); + PLTGOTSection = BC->getSectionByName(".plt.got"); + RelaPLTSection = BC->getSectionByName(".rela.plt"); + if (opts::RelocationMode == cl::BOU_TRUE && !HasTextRelocations) { errs() << "BOLT-ERROR: relocations against code are missing from the input " "file. Cannot proceed in relocations mode (-relocs).\n"; @@ -2043,8 +2037,7 @@ void RewriteInstance::disassembleFunctions() { continue; // PLT requires special handling and could be ignored in this context. - StringRef SectionName; - Section->getName(SectionName); + StringRef SectionName = Section->getName(); if (SectionName == ".plt" || SectionName == ".plt.got") continue; @@ -2382,9 +2375,9 @@ void RewriteInstance::emitFunctions() { emitDataSections(Streamer.get()); // Relocate .eh_frame to .eh_frame_old. - if (EHFrameSection.getObject() != nullptr) { + if (EHFrameSection) { relocateEHFrameSection(); - emitDataSection(Streamer.get(), EHFrameSection, ".eh_frame_old"); + emitDataSection(Streamer.get(), *EHFrameSection, ".eh_frame_old"); } Streamer->Finish(); @@ -2610,10 +2603,12 @@ void RewriteInstance::mapFileSections( } // Handling for sections with relocations. 
- for (auto &SRI : BC->SectionRelocations) { - auto &Section = SRI.first; - StringRef SectionName; - Section.getName(SectionName); + for (auto &SRI : BC->sections()) { + auto &Section = SRI.second; + if (!Section.hasRelocations()) + continue; + + StringRef SectionName = Section.getName(); auto SMII = EFMM->SectionMapInfo.find(OrgSecPrefix + std::string(SectionName)); if (SMII == EFMM->SectionMapInfo.end()) @@ -2636,8 +2631,7 @@ void RewriteInstance::mapFileSections( Section.getAddress()); SI.FileAddress = Section.getAddress(); - StringRef SectionContents; - Section.getContents(SectionContents); + StringRef SectionContents = Section.getContents(); SI.FileOffset = SectionContents.data() - InputFile->getData().data(); } } @@ -2726,23 +2720,17 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { } } -void RewriteInstance::emitDataSection(MCStreamer *Streamer, SectionRef Section, +void RewriteInstance::emitDataSection(MCStreamer *Streamer, + const BinarySection &Section, std::string Name) { - StringRef SectionName; - if (!Name.empty()) - SectionName = Name; - else - Section.getName(SectionName); - - const auto SectionFlags = ELFSectionRef(Section).getFlags(); - const auto SectionType = ELFSectionRef(Section).getType(); + StringRef SectionName = !Name.empty() ? StringRef(Name) : Section.getName(); + const auto SectionFlags = Section.getFlags(); + const auto SectionType = Section.getType(); + StringRef SectionContents = Section.getContents(); auto *ELFSection = BC->Ctx->getELFSection(SectionName, SectionType, SectionFlags); - StringRef SectionContents; - Section.getContents(SectionContents); - Streamer->SwitchSection(ELFSection); Streamer->EmitValueToAlignment(Section.getAlignment()); @@ -2750,15 +2738,13 @@ void RewriteInstance::emitDataSection(MCStreamer *Streamer, SectionRef Section, << (SectionFlags & ELF::SHF_ALLOC ? 
"" : "non-") << "allocatable data section " << SectionName << '\n'); - auto SRI = BC->SectionRelocations.find(Section); - if (SRI == BC->SectionRelocations.end()) { + if (!Section.hasRelocations()) { Streamer->EmitBytes(SectionContents); return; } - auto &Relocations = SRI->second; uint64_t SectionOffset = 0; - for (auto &Relocation : Relocations) { + for (auto &Relocation : Section.relocations()) { assert(Relocation.Offset < Section.getSize() && "overflow detected"); if (SectionOffset < Relocation.Offset) { Streamer->EmitBytes( @@ -2781,11 +2767,12 @@ void RewriteInstance::emitDataSection(MCStreamer *Streamer, SectionRef Section, } void RewriteInstance::emitDataSections(MCStreamer *Streamer) { - for (auto &SRI : BC->SectionRelocations) { - auto &Section = SRI.first; + for (auto &SRI : BC->sections()) { + auto &Section = SRI.second; + if (!Section.hasRelocations()) + continue; - StringRef SectionName; - Section.getName(SectionName); + StringRef SectionName = Section.getName(); assert(SectionName != ".eh_frame" && "should not emit .eh_frame as data"); @@ -3583,12 +3570,12 @@ template void RewriteInstance::patchELFRelaPLT(ELFObjectFile *File) { auto &OS = Out->os(); - if (!RelaPLTSection.getObject()) { + if (!RelaPLTSection) { errs() << "BOLT-INFO: no .rela.plt section found\n"; return; } - for (const auto &Rel : RelaPLTSection.relocations()) { + for (const auto &Rel : RelaPLTSection->getSectionRef().relocations()) { if (Rel.getType() == ELF::R_X86_64_IRELATIVE) { DataRefImpl DRI = Rel.getRawDataRefImpl(); const auto *RelA = File->getRela(DRI); diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 368ae2e6d61e..a1d55fed3c57 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -198,7 +198,8 @@ class RewriteInstance { /// Emit data \p Section, possibly with relocations. Use name \p Name if /// non-empty. 
- void emitDataSection(MCStreamer *Streamer, SectionRef Section, + void emitDataSection(MCStreamer *Streamer, + const BinarySection &Section, std::string Name = ""); /// Emit data sections that have code references in them. @@ -395,7 +396,7 @@ class RewriteInstance { /// Construct BinaryFunction object and add it to internal maps. BinaryFunction *createBinaryFunction(const std::string &Name, - object::SectionRef Section, + BinarySection &Section, uint64_t Address, uint64_t Size, bool IsSimple); @@ -485,29 +486,29 @@ class RewriteInstance { ArrayRef LSDAData; uint64_t LSDAAddress{0}; const llvm::DWARFFrame *EHFrame{nullptr}; - SectionRef EHFrameSection; + ErrorOr EHFrameSection{std::errc::bad_address}; /// .plt section. - SectionRef PLTSection; + ErrorOr PLTSection{std::errc::bad_address}; /// .got.plt sections. /// /// Contains jump slots (addresses) indirectly referenced by /// instructions in .plt section. - SectionRef GOTPLTSection; + ErrorOr GOTPLTSection{std::errc::bad_address}; /// .plt.got section (#clowntown). /// /// A section sometimes generated by BFD linker. - SectionRef PLTGOTSection; + ErrorOr PLTGOTSection{std::errc::bad_address}; /// .rela.plt section. /// /// Contains relocations against .got.plt. - SectionRef RelaPLTSection; + ErrorOr RelaPLTSection{std::errc::bad_address}; /// .gdb_index section. - SectionRef GdbIndexSection; + ErrorOr GdbIndexSection{std::errc::bad_address}; uint64_t NewSymTabOffset{0}; From 70b62585f7770cc6b2db855778966288cb6acbce Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 24 Jan 2018 05:42:11 -0800 Subject: [PATCH 368/904] [BOLT] Refactor relocation analysis code. Summary: Refactor the relocation anaylsis code. It should be a little better at validating that the relocation value matches up with the symbol address + addend stored in the relocation (except on aarch64). It is also a little better at finding the symbol address used to do the lookup in BinaryContext, rather than just using symbol address + addend. 
(cherry picked from commit 1acdaeb3859d73355a0bf2569cfbe2a9d013b4ab) --- bolt/BinaryContext.cpp | 5 +- bolt/BinarySection.cpp | 99 +++++++++++ bolt/BinarySection.h | 25 ++- bolt/RewriteInstance.cpp | 356 ++++++++++++++++++++++++--------------- bolt/RewriteInstance.h | 11 ++ 5 files changed, 354 insertions(+), 142 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 10f557603b68..1a475736b79e 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -137,8 +137,9 @@ void BinaryContext::foldFunction(BinaryFunction &ChildBF, } void BinaryContext::printGlobalSymbols(raw_ostream& OS) const { - for (auto &entry : GlobalSymbols) { - OS << "(" << entry.first << " -> " << entry.second << ")\n"; + for (auto &Entry : GlobalSymbols) { + OS << "(" << Entry.first << " -> 0x" + << Twine::utohexstr(Entry.second) << ")\n"; } } diff --git a/bolt/BinarySection.cpp b/bolt/BinarySection.cpp index 886727ae6af4..92417bc387b5 100644 --- a/bolt/BinarySection.cpp +++ b/bolt/BinarySection.cpp @@ -18,12 +18,58 @@ using namespace bolt; Triple::ArchType Relocation::Arch; +bool Relocation::isSupported(uint64_t Type) { + switch (Type) { + default: + return false; + case ELF::R_X86_64_8: + case ELF::R_X86_64_16: + case ELF::R_X86_64_32: + case ELF::R_X86_64_32S: + case ELF::R_X86_64_64: + case ELF::R_X86_64_PC8: + case ELF::R_X86_64_PC32: + case ELF::R_X86_64_PC64: + case ELF::R_X86_64_PLT32: + case ELF::R_X86_64_GOTPCREL: + case ELF::R_X86_64_GOTTPOFF: + case ELF::R_X86_64_TPOFF32: + case ELF::R_X86_64_GOTPCRELX: + case ELF::R_X86_64_REX_GOTPCRELX: + case ELF::R_AARCH64_CALL26: + case ELF::R_AARCH64_ADR_PREL_PG_HI21: + case ELF::R_AARCH64_LDST64_ABS_LO12_NC: + case ELF::R_AARCH64_ADD_ABS_LO12_NC: + case ELF::R_AARCH64_LDST128_ABS_LO12_NC: + case ELF::R_AARCH64_LDST32_ABS_LO12_NC: + case ELF::R_AARCH64_LDST16_ABS_LO12_NC: + case ELF::R_AARCH64_LDST8_ABS_LO12_NC: + case ELF::R_AARCH64_ADR_GOT_PAGE: + case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: + case 
ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: + case ELF::R_AARCH64_LD64_GOT_LO12_NC: + case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: + case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: + case ELF::R_AARCH64_TLSDESC_CALL: + case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: + case ELF::R_AARCH64_JUMP26: + case ELF::R_AARCH64_PREL32: + case ELF::R_AARCH64_ABS64: + return true; + } +} + size_t Relocation::getSizeForType(uint64_t Type) { switch (Type) { default: llvm_unreachable("unsupported relocation type"); + case ELF::R_X86_64_8: case ELF::R_X86_64_PC8: return 1; + case ELF::R_X86_64_16: + return 2; case ELF::R_X86_64_PLT32: case ELF::R_X86_64_PC32: case ELF::R_X86_64_32S: @@ -43,10 +89,14 @@ size_t Relocation::getSizeForType(uint64_t Type) { case ELF::R_AARCH64_LDST8_ABS_LO12_NC: case ELF::R_AARCH64_ADR_GOT_PAGE: case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: + case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: case ELF::R_AARCH64_LD64_GOT_LO12_NC: case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: case ELF::R_AARCH64_TLSDESC_CALL: + case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: case ELF::R_AARCH64_JUMP26: case ELF::R_AARCH64_PREL32: return 4; @@ -74,6 +124,7 @@ uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents, return static_cast(PC) + SignExtend64<28>(Contents << 2); case ELF::R_AARCH64_ADR_GOT_PAGE: case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: + case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: case ELF::R_AARCH64_ADR_PREL_PG_HI21: { // Bits 32:12 of Symbol address goes in bits 30:29 + 23:5 of ADRP // instruction @@ -85,6 +136,7 @@ uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents, Contents &= ~0xfffUll; return Contents; } + case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: case ELF::R_AARCH64_LD64_GOT_LO12_NC: 
case ELF::R_AARCH64_LDST64_ABS_LO12_NC: { @@ -93,6 +145,8 @@ uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents, Contents &= ~0xffffffffffc003ffU; return Contents >> (10 - 3); } + case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: case ELF::R_AARCH64_ADD_ABS_LO12_NC: { // Immediate goes in bits 21:10 of ADD instruction @@ -130,8 +184,24 @@ bool Relocation::isGOT(uint64_t Type) { switch (Type) { default: return false; + case ELF::R_X86_64_GOT32: + case ELF::R_X86_64_GOTPCREL: + case ELF::R_X86_64_GOTTPOFF: + case ELF::R_X86_64_GOTOFF64: + case ELF::R_X86_64_GOTPC32: + case ELF::R_X86_64_GOT64: + case ELF::R_X86_64_GOTPCREL64: + case ELF::R_X86_64_GOTPC64: + case ELF::R_X86_64_GOTPLT64: + case ELF::R_X86_64_GOTPC32_TLSDESC: + case ELF::R_X86_64_GOTPCRELX: + case ELF::R_X86_64_REX_GOTPCRELX: case ELF::R_AARCH64_ADR_GOT_PAGE: + case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: case ELF::R_AARCH64_LD64_GOT_LO12_NC: + case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: @@ -140,6 +210,25 @@ bool Relocation::isGOT(uint64_t Type) { } } +bool Relocation::isTLS(uint64_t Type) { + switch (Type) { + default: + return false; + case ELF::R_X86_64_TPOFF32: + case ELF::R_X86_64_TPOFF64: + case ELF::R_X86_64_GOTTPOFF: + case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: + case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: + case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: + case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: + case ELF::R_AARCH64_TLSDESC_CALL: + case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: + return true; + } +} + bool Relocation::isPCRelative(uint64_t Type) { switch (Type) { default: @@ -148,6 +237,8 @@ bool 
Relocation::isPCRelative(uint64_t Type) { case ELF::R_X86_64_64: case ELF::R_X86_64_32: case ELF::R_X86_64_32S: + case ELF::R_X86_64_16: + case ELF::R_X86_64_8: case ELF::R_X86_64_TPOFF32: case ELF::R_AARCH64_ABS64: case ELF::R_AARCH64_LDST64_ABS_LO12_NC: @@ -156,6 +247,9 @@ bool Relocation::isPCRelative(uint64_t Type) { case ELF::R_AARCH64_LDST32_ABS_LO12_NC: case ELF::R_AARCH64_LDST16_ABS_LO12_NC: case ELF::R_AARCH64_LDST8_ABS_LO12_NC: + case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: + case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: case ELF::R_AARCH64_LD64_GOT_LO12_NC: case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: @@ -196,6 +290,11 @@ size_t Relocation::emit(MCStreamer *Streamer) const { Ctx); } Streamer->EmitValue(Value, Size); + } else if (Addend) { + auto Value = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Symbol, Ctx), + MCConstantExpr::create(Addend, Ctx), + Ctx); + Streamer->EmitValue(Value, Size); } else { Streamer->EmitSymbolValue(Symbol, Size); } diff --git a/bolt/BinarySection.h b/bolt/BinarySection.h index 84b4ce4c2f0e..32544f09b4cd 100644 --- a/bolt/BinarySection.h +++ b/bolt/BinarySection.h @@ -30,10 +30,22 @@ namespace bolt { /// Relocation class. struct Relocation { static Triple::ArchType Arch; /// for printing, set by BinaryContext ctor. + + /// The offset of this relocation in the object it is contained in. uint64_t Offset; - mutable MCSymbol *Symbol; /// mutable to allow modification by emitter. + + /// The symbol this relocation is referring to. + MCSymbol *Symbol; + + /// Relocation type. uint64_t Type; + + /// The offset from the \p Symbol base used to compute the final + /// value of this relocation. uint64_t Addend; + + /// The computed relocation value extracted from the binary file. + /// Used to validate relocation correctness. uint64_t Value; /// Return size of the given relocation \p Type. 
@@ -47,9 +59,20 @@ struct Relocation { /// Return true if relocation type is PC-relative. Return false otherwise. static bool isPCRelative(uint64_t Type); + /// Check if \p Type is a supported relocation type. + static bool isSupported(uint64_t Type); + /// Return true if relocation type implies the creation of a GOT entry static bool isGOT(uint64_t Type); + /// Return true if relocation type is for thread local storage. + static bool isTLS(uint64_t Type); + + /// Return true if this relocation is PC-relative. Return false otherwise. + bool isPCRelative() const { + return isPCRelative(Type); + } + /// Emit relocation at a current \p Streamer' position. The caller is /// responsible for setting the position correctly. size_t emit(MCStreamer *Streamer) const; diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 865e7ac0e088..51d787b0bb4c 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -80,6 +80,14 @@ extern cl::OptionCategory AggregatorCategory; extern cl::opt JumpTables; +static cl::opt +ForceToDataRelocations("force-data-relocations", + cl::desc("force relocations to data sections to always be processed"), + cl::init(false), + cl::Hidden, + cl::ZeroOrMore, + cl::cat(BoltCategory)); + static cl::opt PrintCacheMetrics("print-cache-metrics", cl::desc("calculate and print various metrics for instruction cache"), @@ -514,6 +522,12 @@ ExecutableFileMemoryManager::~ExecutableFileMemoryManager() { namespace { +StringRef getSectionName(SectionRef Section) { + StringRef SectionName; + Section.getName(SectionName); + return SectionName; +} + /// Create BinaryContext for a given architecture \p ArchName and /// triple \p TripleName. 
std::unique_ptr @@ -1578,6 +1592,10 @@ void RewriteInstance::readSpecialSections() { Section.getSize() > 0 && SectionName != ".tbss") { BC->registerSection(Section); + DEBUG(dbgs() << "BOLT-DEBUG: registering section " << SectionName + << " @ 0x" << Twine::utohexstr(Section.getAddress()) << ":0x" + << Twine::utohexstr(Section.getAddress() + Section.getSize()) + << "\n"); } } @@ -1646,6 +1664,151 @@ int64_t getRelocationAddend(const ELFObjectFileBase *Obj, } } // anonymous namespace +bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, + SectionRef RelocatedSection, + std::string &SymbolName, + uint64_t &SymbolAddress, + int64_t &Addend, + uint64_t &ExtractedValue) const { + if (!Relocation::isSupported(Rel.getType())) + return false; + + const bool IsAArch64 = BC->TheTriple->getArch() == llvm::Triple::aarch64; + const bool IsFromCode = RelocatedSection.isText(); + + // For value extraction. + StringRef RelocatedSectionContents; + RelocatedSection.getContents(RelocatedSectionContents); + DataExtractor DE(RelocatedSectionContents, + BC->AsmInfo->isLittleEndian(), + BC->AsmInfo->getPointerSize()); + + const bool IsPCRelative = Relocation::isPCRelative(Rel.getType()); + auto SymbolIter = Rel.getSymbol(); + assert(SymbolIter != InputFile->symbol_end() && + "relocation symbol must exist"); + auto Symbol = *SymbolIter; + SymbolName = *(Symbol.getName()); + SymbolAddress = *(Symbol.getAddress()); + Addend = getRelocationAddend(InputFile, Rel); + + uint32_t RelocationOffset = + Rel.getOffset() - RelocatedSection.getAddress(); + const auto RelSize = Relocation::getSizeForType(Rel.getType()); + ExtractedValue = + static_cast(DE.getSigned(&RelocationOffset, RelSize)); + + if (IsAArch64) { + ExtractedValue = Relocation::extractValue(Rel.getType(), + ExtractedValue, + Rel.getOffset()); + } + + // Weird stuff - section symbols are marked as ST_Debug. 
+ const bool SymbolIsSection = (Symbol.getType() == SymbolRef::ST_Debug); + const auto PCRelOffset = + IsPCRelative && !IsAArch64 ? Rel.getOffset() : 0; + + // If no symbol has been found or if it is a relocation requiring the + // creation of a GOT entry, do not link against the symbol but against + // whatever address was extracted from the instruction itself. We are + // not creating a GOT entry as this was already processed by the linker. + if (!SymbolAddress || Relocation::isGOT(Rel.getType())) { + assert(!SymbolIsSection); + if (ExtractedValue) { + SymbolAddress = ExtractedValue - Addend + PCRelOffset; + } else { + // This is weird case. The extracted value is zero but the addend is + // non-zero and the relocation is not pc-rel. Using the previous logic, + // the SymbolAddress would end up as a huge number. Seen in + // exceptions_pic.test. + DEBUG(dbgs() << "BOLT-DEBUG: relocation @ " + << Twine::utohexstr(Rel.getOffset()) + << " value does not match addend for " + << "relocation to undefined symbol."); + SymbolAddress += PCRelOffset; + return true; + } + } else if (SymbolIsSection) { + auto Section = Symbol.getSection(); + if (Section && *Section != InputFile->section_end()) { + SymbolName = "section " + std::string(getSectionName(**Section)); + if (!IsAArch64) { + assert(SymbolAddress == (*Section)->getAddress() && + "section symbol address must be the same as section address"); + // Convert section symbol relocations to regular relocations inside + // non-section symbols. + if (IsPCRelative) { + Addend = ExtractedValue - (SymbolAddress - PCRelOffset); + } else { + SymbolAddress = ExtractedValue; + Addend = 0; + } + } + } + } + + if (!IsPCRelative && Addend != 0 && IsFromCode && !SymbolIsSection) { + // TODO: RefSection should be the same as **(Symbol.getSection()). 
+ auto RefSection = BC->getSectionForAddress(SymbolAddress); + if (RefSection && RefSection->isText()) { + if (opts::Verbosity > 1) { + SmallString<16> TypeName; + Rel.getTypeName(TypeName); + errs() << "BOLT-WARNING: detected absolute reference from code into " + << "a middle of a function:\n" + << " offset = 0x" << Twine::utohexstr(Rel.getOffset()) + << "; type = " << Rel.getType() + << "; type name = " << TypeName + << "; value = 0x" << Twine::utohexstr(ExtractedValue) + << "; symbol = " << SymbolName + << "; symbol address = 0x" << Twine::utohexstr(SymbolAddress) + << "; symbol section = " << RefSection->getName() + << "; addend = 0x" << Twine::utohexstr(Addend) + << "; address = 0x" << Twine::utohexstr(SymbolAddress + Addend) + << '\n'; + } + assert(ExtractedValue == SymbolAddress + Addend && "value mismatch"); + } + } + + DEBUG( + if (!Relocation::isTLS(Rel.getType()) && + SymbolName != "__hot_start" && + SymbolName != "__hot_end" && + ExtractedValue != SymbolAddress + Addend - PCRelOffset) { + auto Section = Symbol.getSection(); + SmallString<16> TypeName; + Rel.getTypeName(TypeName); + dbgs() << "BOLT-DEBUG: Mismatch between extracted value and relocation " + << "data:\n" + << "BOLT-DEBUG: offset = 0x" + << Twine::utohexstr(Rel.getOffset()) + << "; type = " << Rel.getType() + << "; type name = " << TypeName + << "; value = 0x" << Twine::utohexstr(ExtractedValue) + << "; symbol = " << SymbolName + << "; symbol type = " << Symbol.getType() + << "; symbol address = 0x" << Twine::utohexstr(SymbolAddress) + << "; orig symbol address = 0x" + << Twine::utohexstr(*(Symbol.getAddress())) + << "; symbol section = " << getSectionName(**Section) + << "; addend = 0x" << Twine::utohexstr(Addend) + << "; original addend = 0x" + << Twine::utohexstr(getRelocationAddend(InputFile, Rel)) + << '\n'; + }); + + assert((IsAArch64 || + Relocation::isTLS(Rel.getType()) || + SymbolName == "__hot_start" || + SymbolName == "__hot_end" || + ExtractedValue == SymbolAddress + Addend - 
PCRelOffset) && + "extracted relocation value should match relocation components"); + + return true; +} + void RewriteInstance::readRelocations(const SectionRef &Section) { StringRef SectionName; Section.getName(SectionName); @@ -1677,153 +1840,54 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { return; } - // For value extraction. - StringRef RelocatedSectionContents; - RelocatedSection.getContents(RelocatedSectionContents); - DataExtractor DE(RelocatedSectionContents, - BC->AsmInfo->isLittleEndian(), - BC->AsmInfo->getPointerSize()); + const bool IsAArch64 = BC->TheTriple->getArch() == llvm::Triple::aarch64; + const bool IsFromCode = RelocatedSection.isText(); - bool IsFromCode = RelocatedSection.isText(); for (const auto &Rel : Section.relocations()) { SmallString<16> TypeName; Rel.getTypeName(TypeName); - DEBUG(dbgs() << "BOLT-DEBUG: offset = 0x" + + std::string SymbolName; + uint64_t SymbolAddress; + int64_t Addend; + uint64_t ExtractedValue; + + if (!analyzeRelocation(Rel, + RelocatedSection, + SymbolName, + SymbolAddress, + Addend, + ExtractedValue)) { + DEBUG(dbgs() << "BOLT-DEBUG: skipping relocation @ offset = 0x" << Twine::utohexstr(Rel.getOffset()) << "; type name = " << TypeName << '\n'); - - if (Rel.getType() == ELF::R_X86_64_TLSGD || - Rel.getType() == ELF::R_X86_64_TLSLD || - Rel.getType() == ELF::R_X86_64_DTPOFF32) { - DEBUG(dbgs() << "skipping relocation\n"); continue; } - // Extract value. 
- uint32_t RelocationOffset = - Rel.getOffset() - RelocatedSection.getAddress(); - auto ExtractedValue = static_cast( - DE.getSigned(&RelocationOffset, - Relocation::getSizeForType(Rel.getType()))); - - if (BC->TheTriple->getArch() == llvm::Triple::aarch64) - ExtractedValue = Relocation::extractValue(Rel.getType(), ExtractedValue, - Rel.getOffset()); - - bool IsPCRelative = Relocation::isPCRelative(Rel.getType()); - auto Addend = getRelocationAddend(InputFile, Rel); - uint64_t Address = 0; - uint64_t SymbolAddress = 0; - auto SymbolIter = Rel.getSymbol(); - std::string SymbolName = ""; - SymbolAddress = *SymbolIter->getAddress(); - // If no symbol has been found or if it is a relocation requiring the - // creation of a GOT entry, do not link against the symbol but against - // whatever address was extracted from the instruction itself. We are - // not creating a GOT entry as this was already processed by the linker. - if (!SymbolAddress || Relocation::isGOT(Rel.getType())) { - Address = ExtractedValue; - // For aarch, pc address has already been added in extractValue - if (IsPCRelative && BC->TheTriple->getArch() != llvm::Triple::aarch64) { - Address += Rel.getOffset(); - } - } else { - Address = SymbolAddress + Addend; - } - bool SymbolIsSection = false; - if (SymbolIter != InputFile->symbol_end()) { - SymbolName = (*(*SymbolIter).getName()); - if (SymbolIter->getType() == SymbolRef::ST_Debug) { - // Weird stuff - section symbols are marked as ST_Debug. 
- SymbolIsSection = true; - auto SymbolSection = SymbolIter->getSection(); - if (SymbolSection && *SymbolSection != InputFile->section_end()) { - StringRef SymbolSectionName; - (*SymbolSection)->getName(SymbolSectionName); - SymbolName = "section " + std::string(SymbolSectionName); - if (BC->TheTriple->getArch() != llvm::Triple::aarch64) - Address = Addend; - } - } - } - - bool ForceRelocation = false; - if (opts::HotText && - (SymbolName == "__hot_start" || SymbolName == "__hot_end")) { - ForceRelocation = true; - } - - bool IsAbsoluteCodeRefWithAddend = false; - if (!IsPCRelative && Addend != 0 && IsFromCode && !SymbolIsSection) { - auto RefSection = BC->getSectionForAddress(SymbolAddress); - if (RefSection && RefSection->isText()) { - if (opts::Verbosity > 1) { - errs() << "BOLT-WARNING: detected absolute reference from code into " - << "a middle of a function:\n" - << " offset = 0x" << Twine::utohexstr(Rel.getOffset()) - << "; symbol = " << SymbolName - << "; symbol address = 0x" << Twine::utohexstr(SymbolAddress) - << "; addend = 0x" << Twine::utohexstr(Addend) - << "; address = 0x" << Twine::utohexstr(Address) - << "; type = " << Rel.getType() - << "; type name = " << TypeName - << '\n'; - } - assert(ExtractedValue == SymbolAddress + Addend && "value mismatch"); - Address = SymbolAddress; - IsAbsoluteCodeRefWithAddend = true; - } else if (BC->TheTriple->getArch() == llvm::Triple::aarch64) { - Addend = 0; // TODO: check if should apply for x86 as well - } - } else if (Addend < 0 && IsPCRelative) { - Address -= Addend; - } else { - Addend = 0; - } + const auto Address = SymbolAddress + Addend; + const bool ForceRelocation = + (opts::HotText && (SymbolName == "__hot_start" || + SymbolName == "__hot_end")) + || Rel.getType() == ELF::R_AARCH64_ADR_GOT_PAGE; DEBUG(dbgs() << "BOLT-DEBUG: offset = 0x" << Twine::utohexstr(Rel.getOffset()) + << "; type = " << Rel.getType() + << "; type name = " << TypeName + << "; value = 0x" << Twine::utohexstr(ExtractedValue) << "; 
symbol = " << SymbolName << "; symbol address = 0x" << Twine::utohexstr(SymbolAddress) << "; addend = 0x" << Twine::utohexstr(Addend) << "; address = 0x" << Twine::utohexstr(Address) - << "; type = " << Rel.getType() - << "; type name = " << TypeName << '\n'); - if (Rel.getType() == ELF::R_AARCH64_ADR_GOT_PAGE) - ForceRelocation = true; - - if (Rel.getType() != ELF::R_X86_64_TPOFF32 && - Rel.getType() != ELF::R_X86_64_GOTTPOFF && - Rel.getType() != ELF::R_X86_64_GOTPCREL && - BC->TheTriple->getArch() != llvm::Triple::aarch64) { - if (!IsPCRelative) { - if (!IsAbsoluteCodeRefWithAddend) { - if (opts::Verbosity > 2 && - ExtractedValue != Address) { - errs() << "BOLT-WARNING: mismatch ExtractedValue = 0x" - << Twine::utohexstr(ExtractedValue) << '\n'; - } - Address = ExtractedValue; - } - } else { - if (opts::Verbosity > 2 && - ExtractedValue != Address - Rel.getOffset() + Addend) { - errs() << "BOLT-WARNING: PC-relative mismatch ExtractedValue = 0x" - << Twine::utohexstr(ExtractedValue) << '\n'; - } - Address = ExtractedValue - Addend; - } - } - BinaryFunction *ContainingBF = nullptr; if (IsFromCode) { - ContainingBF = getBinaryFunctionContainingAddress( - Rel.getOffset(), - /*CheckPastEnd*/ false, - /*UseMaxSize*/ BC->TheTriple->getArch() == llvm::Triple::aarch64); + ContainingBF = + getBinaryFunctionContainingAddress(Rel.getOffset(), + /*CheckPastEnd*/ false, + /*UseMaxSize*/ IsAArch64); assert(ContainingBF && "cannot find function for address in code"); DEBUG(dbgs() << "BOLT-DEBUG: relocation belongs to " << *ContainingBF << '\n'); @@ -1836,24 +1900,27 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { // between the two. If we blindly apply the relocation it will appear // that it references an arbitrary location in the code, possibly even // in a different function from that containing the jump table. 
- if (BC->TheTriple->getArch() != llvm::Triple::aarch64 && IsPCRelative) { + if (!IsAArch64 && Relocation::isPCRelative(Rel.getType())) { // Just register the fact that we have PC-relative relocation at a given // address. The actual referenced label/address cannot be determined // from linker data alone. if (IsFromCode) { ContainingBF->addPCRelativeRelocationAddress(Rel.getOffset()); } - DEBUG(dbgs() << "BOLT-DEBUG: not creating PC-relative relocation\n"); + DEBUG(dbgs() << "BOLT-DEBUG: not creating PC-relative relocation at 0x" + << Twine::utohexstr(Rel.getOffset()) + << "\n"); continue; } - auto RefSection = BC->getSectionForAddress(Address); + // TODO: RefSection should be the same as **Rel.getSymbol().getSection() + auto RefSection = BC->getSectionForAddress(SymbolAddress); if (!RefSection && !ForceRelocation) { DEBUG(dbgs() << "BOLT-DEBUG: cannot determine referenced section.\n"); continue; } - bool ToCode = RefSection && RefSection->isText(); + const bool IsToCode = RefSection && RefSection->isText(); // Occasionally we may see a reference past the last byte of the function // typically as a result of __builtin_unreachable(). Check it here. 
@@ -1866,6 +1933,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { ReferencedSymbol = BC->getOrCreateGlobalSymbol(0, "Zero"); else ReferencedSymbol = BC->registerNameAtAddress(SymbolName, 0); + SymbolAddress = 0; Addend = Address; DEBUG(dbgs() << "BOLT-DEBUG: creating relocations for huge pages against" " symbol " << SymbolName << " with addend " << Addend @@ -1880,6 +1948,8 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { if (RefFunctionOffset) { ReferencedSymbol = ReferencedBF->getOrCreateLocalLabel(Address, /*CreatePastEnd*/ true); + SymbolAddress = Address; + Addend = 0; } else { ReferencedSymbol = ReferencedBF->getSymbol(); } @@ -1889,20 +1959,28 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { DEBUG(dbgs() << "BOLT-DEBUG: no corresponding function for " "relocation against code\n"); } - ReferencedSymbol = BC->getOrCreateGlobalSymbol(Address, "SYMBOLat"); + ReferencedSymbol = BC->getOrCreateGlobalSymbol(SymbolAddress, "SYMBOLat"); } if (IsFromCode) { - if (ReferencedBF || ForceRelocation || - BC->TheTriple->getArch() == llvm::Triple::aarch64) { - ContainingBF->addRelocation(Rel.getOffset(), ReferencedSymbol, - Rel.getType(), Addend, ExtractedValue); + if (ReferencedBF || ForceRelocation || opts::ForceToDataRelocations || + IsAArch64) { + ContainingBF->addRelocation(Rel.getOffset(), + ReferencedSymbol, + Rel.getType(), + Addend, + ExtractedValue); } else { - DEBUG(dbgs() << "BOLT-DEBUG: ignoring relocation from code to data\n"); + DEBUG(dbgs() << "BOLT-DEBUG: ignoring relocation from code to data " + << ReferencedSymbol->getName() << "\n"); } - } else if (ToCode) { - assert(Addend == 0 && "did not expect addend"); - BC->addRelocation(Rel.getOffset(), ReferencedSymbol, Rel.getType()); + } else if (IsToCode) { + BC->addRelocation(Rel.getOffset(), ReferencedSymbol, Rel.getType(), Addend); + } else if (opts::ForceToDataRelocations) { + BC->addRelocation(Rel.getOffset(), + ReferencedSymbol, + 
Rel.getType(), + Addend); } else { DEBUG(dbgs() << "BOLT-DEBUG: ignoring relocation from data to data\n"); } diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index a1d55fed3c57..fbe4af2bc330 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -294,6 +294,17 @@ class RewriteInstance { /// Make .eh_frame section relocatable. void relocateEHFrameSection(); + /// Analyze relocation \p Rel contained in section \p RelocatedSection. + /// Return true if the relocation was successfully processed, false otherwise. + /// The \p SymbolName, \p SymbolAddress, \p Addend and \p ExtractedValue + /// parameters will be set on success. + bool analyzeRelocation(const RelocationRef &Rel, + SectionRef RelocatedSection, + std::string &SymbolName, + uint64_t &SymbolAddress, + int64_t &Addend, + uint64_t &ExtractedValue) const; + /// Rewrite non-allocatable sections with modifications. void rewriteNoteSections(); From 6c9c9c0c5a5a149c33492906b43e7459c52b6595 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Wed, 24 Jan 2018 12:29:38 -0800 Subject: [PATCH 369/904] [BOLT] faster cache+ implementation Summary: Speeding up cache+ algorithm. The idea is to find and merge "fallthrough" successors before main optimization. For a pair of blocks, A and B, block B is the fallthrough successor of A, if (i) all jumps (based on profile) from A goes to B and (ii) all jumps to B are from A. Such blocks should be adjacent in an optimal ordering, and should not be considered for splitting. (This gives the speed up). The gap between cache and cache+ reduced from ~2m to ~1m. 
(cherry picked from commit 7c345e582abf44bde6be1219853e049362eccd48) --- bolt/Passes/CachePlusReorderAlgorithm.cpp | 94 +++++++++++++++++++++-- 1 file changed, 88 insertions(+), 6 deletions(-) diff --git a/bolt/Passes/CachePlusReorderAlgorithm.cpp b/bolt/Passes/CachePlusReorderAlgorithm.cpp index 5a717aaec5e2..d8a46912389a 100644 --- a/bolt/Passes/CachePlusReorderAlgorithm.cpp +++ b/bolt/Passes/CachePlusReorderAlgorithm.cpp @@ -147,21 +147,38 @@ class CachePlus { /// Run cache+ algorithm and return a basic block ordering std::vector run() { + // Merge blocks with their fallthrough successors + for (auto BB : BF.layout()) { + if (FallthroughPred[BB->getLayoutIndex()] == nullptr && + FallthroughSucc[BB->getLayoutIndex()] != nullptr) { + auto CurBB = BB; + while (FallthroughSucc[CurBB->getLayoutIndex()] != nullptr) { + const auto NextBB = FallthroughSucc[CurBB->getLayoutIndex()]; + mergeClusters(&AllClusters[BB->getLayoutIndex()], + &AllClusters[NextBB->getLayoutIndex()], + 0); + CurBB = NextBB; + } + } + } + // Merge pairs of clusters while there is an improvement in ExtTSP metric while (Clusters.size() > 1) { Cluster *BestClusterPred = nullptr; Cluster *BestClusterSucc = nullptr; std::pair BestGain(-1, 0); for (auto ClusterPred : Clusters) { + // Do not merge cold blocks + if (ClusterPred->isCold()) + continue; + // Get candidates for merging with the current cluster Adjacent.forAllAdjacent( ClusterPred, // Find the best candidate [&](Cluster *ClusterSucc) { assert(ClusterPred != ClusterSucc && "loop edges are not supported"); - // Do not merge cold blocks - if (ClusterPred->isCold() || ClusterSucc->isCold()) - return; + assert(!ClusterSucc->isCold() && "cannot merge cold clusters"); // Compute the gain of merging two clusters auto Gain = mergeGain(ClusterPred, ClusterSucc); @@ -261,12 +278,63 @@ class CachePlus { // Initialize adjacency matrix Adjacent.initialize(Clusters); for (auto BB : BF.layout()) { + auto BI = BB->branch_info_begin(); for (auto I : 
BB->successors()) { - if (BB != I) + if (BB != I && BI->Count > 0) { Adjacent.set(Clusters[BB->getLayoutIndex()], Clusters[I->getLayoutIndex()]); + } + ++BI; + } + } + + // Initialize fallthrough successors + findFallthroughBlocks(InWeight, OutWeight); + } + + /// For a pair of blocks, A and B, block B is the fallthrough successor of A, + /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps + /// to B are from A. Such blocks should be adjacent in an optimal ordering, + /// and the method finds such pairs of blocks. + void findFallthroughBlocks(const std::vector &InWeight, + const std::vector &OutWeight) { + FallthroughSucc = std::vector(BF.size(), nullptr); + FallthroughPred = std::vector(BF.size(), nullptr); + // Find fallthroughs based on edge weights + for (auto BB : BF.layout()) { + if (OutWeight[BB->getLayoutIndex()] == 0) + continue; + for (auto Edge : OutEdges[BB->getLayoutIndex()]) { + const auto SuccBB = Edge.first; + // Successor cannot be the first BB, which is pinned + if (OutWeight[BB->getLayoutIndex()] == Edge.second && + InWeight[SuccBB->getLayoutIndex()] == Edge.second && + SuccBB->getLayoutIndex() != 0) { + FallthroughSucc[BB->getLayoutIndex()] = SuccBB; + FallthroughPred[SuccBB->getLayoutIndex()] = BB; + break; + } } } + + // There might be 'cycles' in the fallthrough dependencies (since profile + // data isn't 100% accurate). 
+ // Break the cycles by choosing the block with smallest index as the tail + for (auto BB : BF.layout()) { + const auto Idx = BB->getLayoutIndex(); + if (FallthroughSucc[Idx] == nullptr || FallthroughPred[Idx] == nullptr) + continue; + + auto SuccBB = FallthroughSucc[Idx]; + while (SuccBB != nullptr && SuccBB != BB) { + SuccBB = FallthroughSucc[SuccBB->getLayoutIndex()]; + } + if (SuccBB == nullptr) + continue; + // break the cycle + FallthroughSucc[FallthroughPred[Idx]->getLayoutIndex()] = nullptr; + FallthroughPred[Idx] = nullptr; + } } /// Compute ExtTSP score for a given order of basic blocks @@ -335,10 +403,17 @@ class CachePlus { }; std::pair Gain = std::make_pair(-1, 0); - // Try to simply concatenate two clusters + // Try to concatenate two clusters w/o splitting Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, 0); // Try to split ClusterPred into two and merge with ClusterSucc for (size_t Offset = 1; Offset < ClusterPred->blocks().size(); Offset++) { + // Make sure the splitting does not break FT successors + auto BB = ClusterPred->blocks()[Offset - 1]; + if (FallthroughSucc[BB->getLayoutIndex()] != nullptr) { + assert(FallthroughSucc[BB->getLayoutIndex()] == ClusterPred->blocks()[Offset]); + continue; + } + for (size_t Type = 0; Type < 4; Type++) { size_t MergeType = 1 + Type + Offset * 4; Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, MergeType); @@ -400,7 +475,9 @@ class CachePlus { /// adjacency information, and the corresponding cache. 
void mergeClusters(Cluster *Into, Cluster *From, size_t MergeType) { assert(Into != From && "Cluster cannot be merged with itself"); - // Merge the clusters + assert(!Into->isCold() && !From->isCold() && "Merging cold clusters"); + + // Merge the blocks of clusters auto MergedBlocks = mergeBlocks(Into->blocks(), From->blocks(), MergeType); Into->merge(From, MergedBlocks, score(MergedBlocks)); @@ -433,6 +510,11 @@ class CachePlus { // Cluster adjacency matrix AdjacencyMatrix Adjacent; + // Fallthrough successor of the block + std::vector FallthroughSucc; + // Fallthrough predecessor of the block + std::vector FallthroughPred; + // A cache that keeps precomputed values of mergeGain for pairs of clusters; // when a pair of clusters (x,y) gets merged, we invalidate the pairs // containing both x and y and all clusters adjacent to x and y (and recompute From ae973aa707d777801dd1132f606a7e758b66ff15 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 29 Jan 2018 10:37:30 -0800 Subject: [PATCH 370/904] [BOLT] Do not assert on bad data Summary: A test is asserting on impossible addresses coming from perf.data, instead of just reporting it as bad data. Fix this behavior. 
(cherry picked from commit 0374d7ccba358368e10e885a3e360c3bf10bdced) --- bolt/BinaryFunctionProfile.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bolt/BinaryFunctionProfile.cpp b/bolt/BinaryFunctionProfile.cpp index 30dc96e72ae5..78ce9444c3a6 100644 --- a/bolt/BinaryFunctionProfile.cpp +++ b/bolt/BinaryFunctionProfile.cpp @@ -307,9 +307,8 @@ bool BinaryFunction::recordEntry(uint64_t To, bool Mispred, uint64_t Count) { } bool BinaryFunction::recordExit(uint64_t From, bool Mispred, uint64_t Count) { - if (!isSimple()) + if (!isSimple() || From > getSize()) return false; - assert(From <= getSize() && "wrong From address"); if (!hasProfile()) ExecutionCount = 0; From fab6a301a6dbb2e787c43cd7a9cee4f8f0762ea8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 30 Jan 2018 13:18:40 -0800 Subject: [PATCH 371/904] [BOLT] Handle multiple sections with the same name Summary: Multiple sections can have the same name, so we need to make the NameToSectionMap into a multimap. (cherry picked from commit afe9496e1ea9a1d1b5fca78c89ba709148c4b489) --- bolt/BinaryContext.h | 42 +++++++++++++++++++++++++++------------- bolt/RewriteInstance.cpp | 12 ++++++------ 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 8c9722b50435..a80ddb22ac6f 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -62,8 +62,10 @@ class BinaryContext { using SectionMapType = std::map; SectionMapType AllocatableSections; - /// Map of section name to BinarySection object. - std::map NameToSection; + /// multimap of section name to BinarySection object. Some binaries + /// have multiple sections with the same name. + using NameToSectionMapType = std::multimap; + NameToSectionMapType NameToSection; public: /// [name] -> [address] map used for global symbol resolution. 
@@ -220,10 +222,9 @@ class BinaryContext { "can't register section twice"); StringRef Name; Section.getName(Name); - assert(!NameToSection.count(Name) && "can't register section name twice"); auto Res = AllocatableSections.emplace(Section.getAddress(), BinarySection(Section)); - NameToSection[Name] = &Res.first->second; + NameToSection.insert(std::make_pair(Name, &Res.first->second)); return Res.first->second; } @@ -239,17 +240,32 @@ class BinaryContext { ErrorOr getSectionForAddress(uint64_t Address); ErrorOr getSectionForAddress(uint64_t Address) const; - /// Return (allocatable) section associated with given \p Name. - ErrorOr getSectionByName(StringRef Name) { - auto Itr = NameToSection.find(Name); - if (Itr != NameToSection.end()) - return *Itr->second; + /// Return (allocatable) section(s) associated with given \p Name. + iterator_range + getSectionByName(StringRef Name) { + return make_range(NameToSection.equal_range(Name)); + } + iterator_range + getSectionByName(StringRef Name) const { + return make_range(NameToSection.equal_range(Name)); + } + + /// Return the unique (allocatable) section associated with given \p Name. + /// If there is more than one section with the same name, return an error + /// object. 
+ ErrorOr getUniqueSectionByName(StringRef SectionName) { + auto Sections = getSectionByName(SectionName); + if (Sections.begin() != Sections.end() && + std::next(Sections.begin()) == Sections.end()) + return *Sections.begin()->second; return std::make_error_code(std::errc::bad_address); } - ErrorOr getSectionByName(StringRef Name) const { - auto Itr = NameToSection.find(Name); - if (Itr != NameToSection.end()) - return *Itr->second; + ErrorOr + getUniqueSectionByName(StringRef SectionName) const { + auto Sections = getSectionByName(SectionName); + if (Sections.begin() != Sections.end() && + std::next(Sections.begin()) == Sections.end()) + return *Sections.begin()->second; return std::make_error_code(std::errc::bad_address); } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 51d787b0bb4c..ca5d0f2d680b 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1599,12 +1599,12 @@ void RewriteInstance::readSpecialSections() { } } - EHFrameSection = BC->getSectionByName(".eh_frame"); - GdbIndexSection = BC->getSectionByName(".gdb_index"); - PLTSection = BC->getSectionByName(".plt"); - GOTPLTSection = BC->getSectionByName(".got.plt"); - PLTGOTSection = BC->getSectionByName(".plt.got"); - RelaPLTSection = BC->getSectionByName(".rela.plt"); + EHFrameSection = BC->getUniqueSectionByName(".eh_frame"); + GdbIndexSection = BC->getUniqueSectionByName(".gdb_index"); + PLTSection = BC->getUniqueSectionByName(".plt"); + GOTPLTSection = BC->getUniqueSectionByName(".got.plt"); + PLTGOTSection = BC->getUniqueSectionByName(".plt.got"); + RelaPLTSection = BC->getUniqueSectionByName(".rela.plt"); if (opts::RelocationMode == cl::BOU_TRUE && !HasTextRelocations) { errs() << "BOLT-ERROR: relocations against code are missing from the input " From 99fa22620eb97e00477fb0d2e857f75d7495d8bc Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 23 Jan 2018 15:18:41 -0800 Subject: [PATCH 372/904] [BOLT] Fix profile for multi-entry functions Summary: When 
we read profile for functions, we initialize counts for entry blocks first, and then populate counts for all blocks based on incoming edges. During the second phase we ignore the entry blocks because we expect them to be already initialized. For the primary entry at offset 0 it's the correct thing to do, since we treat all incoming branches as calls or tail calls. However, for secondary entries we only consider external edges to be from calls and don't increase entry count if an edge originates from inside the function. Thus we need to update the secondary entry basic block counts with internal edges too. (cherry picked from commit 8bea5a9de602109c13c54d7077060807005c679f) --- bolt/BinaryFunctionProfile.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bolt/BinaryFunctionProfile.cpp b/bolt/BinaryFunctionProfile.cpp index 78ce9444c3a6..c341faa72b11 100644 --- a/bolt/BinaryFunctionProfile.cpp +++ b/bolt/BinaryFunctionProfile.cpp @@ -366,8 +366,10 @@ void BinaryFunction::postProcessProfile() { for (auto *BB : BasicBlocks) { auto SuccBIIter = BB->branch_info_begin(); for (auto Succ : BB->successors()) { - if (!Succ->isEntryPoint() && - SuccBIIter->Count != BinaryBasicBlock::COUNT_NO_PROFILE) + // All incoming edges to the primary entry have been accounted for, thus + // we skip the update here. + if (SuccBIIter->Count != BinaryBasicBlock::COUNT_NO_PROFILE && + Succ != BasicBlocks.front()) Succ->setExecutionCount(Succ->getExecutionCount() + SuccBIIter->Count); ++SuccBIIter; } From 6d6d5134bd481ee75ea975569110ee395cc2f965 Mon Sep 17 00:00:00 2001 From: Qinfan Wu Date: Wed, 31 Jan 2018 11:52:39 -0800 Subject: [PATCH 373/904] Handle types CU list in updateGdbIndexSection Summary: Handle types CU list in `updateGdbIndexSection`. It looks like the types part of `.gdb_index` isn't empty when `-fdebug-types-section` is used. So instead of aborting, we copy the part to new `.gdb_index` section. 
(cherry picked from commit 7d9eeb363a177a3951f16968b21bf63b69e4b9ca) --- bolt/DWARFRewriter.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp index e6025b2ddf45..5e89156477f6 100644 --- a/bolt/DWARFRewriter.cpp +++ b/bolt/DWARFRewriter.cpp @@ -508,6 +508,9 @@ void RewriteInstance::updateGdbIndexSection() { if (!GdbIndexSection) return; + // See https://sourceware.org/gdb/onlinedocs/gdb/Index-Section-Format.html for + // .gdb_index section format. + StringRef GdbIndexContents = GdbIndexSection->getContents(); const auto *Data = GdbIndexContents.data(); @@ -529,9 +532,6 @@ void RewriteInstance::updateGdbIndexSection() { const auto ConstantPoolOffset = read32le(Data + 20); Data += 24; - assert(CUTypesOffset == AddressTableOffset && - "CU types in .gdb_index should be empty"); - // Map CUs offsets to indices and verify existing index table. std::map OffsetToIndexMap; const auto CUListSize = CUTypesOffset - CUListOffset; @@ -553,7 +553,8 @@ void RewriteInstance::updateGdbIndexSection() { // Ignore old address table. const auto OldAddressTableSize = SymbolTableOffset - AddressTableOffset; - Data += OldAddressTableSize; + // Move Data to the beginning of symbol table. + Data += SymbolTableOffset - CUTypesOffset; // Calculate the size of the new address table. uint32_t NewAddressTableSize = 0; @@ -580,9 +581,10 @@ void RewriteInstance::updateGdbIndexSection() { write32le(Buffer + 20, ConstantPoolOffset + Delta); Buffer += 24; - // Copy over CU list. - memcpy(Buffer, GdbIndexContents.data() + 24, CUListSize); - Buffer += CUListSize; + // Copy over CU list and types CU list. + memcpy(Buffer, GdbIndexContents.data() + 24, + AddressTableOffset - CUListOffset); + Buffer += AddressTableOffset - CUListOffset; // Generate new address table. 
for (const auto &CURangesPair : RangesSectionsWriter->getCUAddressRanges()) { From ef20eb6fa6c613b6a43b509c99555445b1238e60 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 31 Jan 2018 12:12:59 -0800 Subject: [PATCH 374/904] [BOLT] Fix lookup of non-allocatable sections in RewriteInstance Summary: Register all sections with BinaryContext. Store all sections in a set ordered by (address, size, name). Add two separate maps to lookup sections by address or by name. Non-allocatable sections are not stored in the address->section map since they all "start" at 0. (cherry picked from commit b0a277eaa2be14fbdc2b9ae962093a01049c2af5) --- bolt/BinaryContext.cpp | 32 ++++++++++++++++++++++++-------- bolt/BinaryContext.h | 39 +++++++++++++++++++-------------------- bolt/BinarySection.h | 8 ++++++++ bolt/RewriteInstance.cpp | 23 +++++++---------------- 4 files changed, 58 insertions(+), 44 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 1a475736b79e..337f439b665b 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -450,26 +450,42 @@ BinaryContext::getFunctionData(const BinaryFunction &Function) const { } ErrorOr BinaryContext::getSectionForAddress(uint64_t Address) { - auto SI = AllocatableSections.upper_bound(Address); - if (SI != AllocatableSections.begin()) { + auto SI = AddressToSection.upper_bound(Address); + if (SI != AddressToSection.begin()) { --SI; - if (SI->first + SI->second.getSize() > Address) - return SI->second; + if (SI->first + SI->second->getSize() > Address) + return *SI->second; } return std::make_error_code(std::errc::bad_address); } ErrorOr BinaryContext::getSectionForAddress(uint64_t Address) const { - auto SI = AllocatableSections.upper_bound(Address); - if (SI != AllocatableSections.begin()) { + auto SI = AddressToSection.upper_bound(Address); + if (SI != AddressToSection.begin()) { --SI; - if (SI->first + SI->second.getSize() > Address) - return SI->second; + if (SI->first + SI->second->getSize() > 
Address) + return *SI->second; } return std::make_error_code(std::errc::bad_address); } +BinarySection &BinaryContext::registerSection(SectionRef Section) { + StringRef Name; + Section.getName(Name); + auto Res = Sections.insert(BinarySection(Section)); + assert(Res.second && "can't register the same section twice."); + // Cast away const here because std::set always stores values by + // const. It's ok to do this because we can never change the + // BinarySection properties that affect set ordering. + auto *BS = const_cast(&*Res.first); + // Only register sections with addresses in the AddressToSection map. + if (Section.getAddress()) + AddressToSection.insert(std::make_pair(Section.getAddress(), BS)); + NameToSection.insert(std::make_pair(Name, BS)); + return *BS; +} + ErrorOr BinaryContext::extractPointerAtAddress(uint64_t Address) const { auto Section = getSectionForAddress(Address); diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index a80ddb22ac6f..14cb834a2bdf 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -55,12 +55,16 @@ class BinaryFunction; class DataReader; class BinaryContext { - BinaryContext() = delete; - /// Map virtual address to a section. - using SectionMapType = std::map; - SectionMapType AllocatableSections; + /// Set of all sections. + using SectionSetType = std::set; + SectionSetType Sections; + + /// Map virtual address to a section. It is possible to have more than one + /// section mapped to the same address, e.g. non-allocatable sections. + using AddressToSectionMapType = std::multimap; + AddressToSectionMapType AddressToSection; /// multimap of section name to BinarySection object. Some binaries /// have multiple sections with the same name. 
@@ -217,30 +221,25 @@ class BinaryContext { ErrorOr> getFunctionData(const BinaryFunction &Function) const; - BinarySection ®isterSection(SectionRef Section) { - assert(!AllocatableSections.count(Section.getAddress()) && - "can't register section twice"); - StringRef Name; - Section.getName(Name); - auto Res = AllocatableSections.emplace(Section.getAddress(), - BinarySection(Section)); - NameToSection.insert(std::make_pair(Name, &Res.first->second)); - return Res.first->second; - } + /// Register information about the given section so we can look up + /// sections for addresses. + BinarySection ®isterSection(SectionRef Section); - iterator_range sections() { - return make_range(AllocatableSections.begin(), AllocatableSections.end()); + iterator_range sections() { + return make_range(Sections.begin(), Sections.end()); } - iterator_range sections() const { - return make_range(AllocatableSections.begin(), AllocatableSections.end()); + iterator_range sections() const { + return make_range(Sections.begin(), Sections.end()); } - /// Return (allocatable) section containing the given \p Address. + /// Return largest section containing the given \p Address. These + /// functions only work for allocatable sections, i.e. ones with non-zero + /// addresses. ErrorOr getSectionForAddress(uint64_t Address); ErrorOr getSectionForAddress(uint64_t Address) const; - /// Return (allocatable) section(s) associated with given \p Name. + /// Return section(s) associated with given \p Name. iterator_range getSectionByName(StringRef Name) { return make_range(NameToSection.equal_range(Name)); diff --git a/bolt/BinarySection.h b/bolt/BinarySection.h index 32544f09b4cd..4f0eb8013370 100644 --- a/bolt/BinarySection.h +++ b/bolt/BinarySection.h @@ -163,6 +163,14 @@ class BinarySection { auto Itr = Relocations.find(Key); return Itr != Relocations.end() ? 
&*Itr : nullptr; } + + bool operator<(const BinarySection &Other) const { + return (getAddress() < Other.getAddress() || + (getAddress() == Other.getAddress() && + (getSize() < Other.getSize() || + (getSize() == Other.getSize() && + getName() < Other.getName())))); + } }; } // namespace bolt diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index ca5d0f2d680b..40871dab30ab 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -1585,18 +1585,11 @@ void RewriteInstance::readSpecialSections() { HasTextRelocations = true; } - // Ignore zero-size allocatable sections as they present no interest to us. - // Note that .tbss is marked as having a positive size while in reality it - // is not taking any allocatable space. - if ((ELFSectionRef(Section).getFlags() & ELF::SHF_ALLOC) && - Section.getSize() > 0 && - SectionName != ".tbss") { - BC->registerSection(Section); - DEBUG(dbgs() << "BOLT-DEBUG: registering section " << SectionName - << " @ 0x" << Twine::utohexstr(Section.getAddress()) << ":0x" - << Twine::utohexstr(Section.getAddress() + Section.getSize()) - << "\n"); - } + BC->registerSection(Section); + DEBUG(dbgs() << "BOLT-DEBUG: registering section " << SectionName + << " @ 0x" << Twine::utohexstr(Section.getAddress()) << ":0x" + << Twine::utohexstr(Section.getAddress() + Section.getSize()) + << "\n"); } EHFrameSection = BC->getUniqueSectionByName(".eh_frame"); @@ -2681,8 +2674,7 @@ void RewriteInstance::mapFileSections( } // Handling for sections with relocations. 
- for (auto &SRI : BC->sections()) { - auto &Section = SRI.second; + for (const auto &Section : BC->sections()) { if (!Section.hasRelocations()) continue; @@ -2845,8 +2837,7 @@ void RewriteInstance::emitDataSection(MCStreamer *Streamer, } void RewriteInstance::emitDataSections(MCStreamer *Streamer) { - for (auto &SRI : BC->sections()) { - auto &Section = SRI.second; + for (const auto &Section : BC->sections()) { if (!Section.hasRelocations()) continue; From 129d983a3acbfdb4d384dcb6fb1a741cb4f7707f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 1 Feb 2018 14:24:26 -0800 Subject: [PATCH 375/904] [BOLT] Fix branch info stats after SCTC Summary: SCTC was incorrectly swapping BranchInfo when reversing the branch condition. This was wrong because when we remove the successor BB later, it removes the BranchInfo for that BB. In this case the successor would be the BB with the stats we had just swapped. Instead leave BranchInfo as it is and read the branch count from the false or true branch depending on whether we reverse or replace the branch, respectively. The call to removeSuccessor later will remove the unused BranchInfo we no longer care about. (cherry picked from commit b622ca6469939998023ea1193e34fb26bb70f600) --- bolt/Passes/BinaryPasses.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index b7c356f5a3de..f27625133092 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -875,6 +875,7 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, // Record this block so that we don't try to optimize it twice. BeenOptimized.insert(PredBB); + bool BranchForStats; if (CondSucc != BB) { // Patch the new target address into the conditional branch. 
MIA->reverseBranchCondition(*CondBranch, CalleeSymbol, BC.Ctx.get()); @@ -883,17 +884,16 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, // branch to the old target. This has to be done manually since // fixupBranches is not called after SCTC. NeedsUncondBranch.emplace_back(std::make_pair(PredBB, CondSucc)); - // Swap branch statistics after swapping the branch targets. - auto BI = PredBB->branch_info_begin(); - std::swap(*BI, *(BI + 1)); + BranchForStats = false; } else { // Change destination of the conditional branch. MIA->replaceBranchTarget(*CondBranch, CalleeSymbol, BC.Ctx.get()); + BranchForStats = true; } - const uint64_t CTCTakenFreq = PredBB->getBranchInfo(true).Count == - BinaryBasicBlock::COUNT_NO_PROFILE - ? 0 - : PredBB->getBranchInfo(true).Count; + const auto Count = PredBB->getBranchInfo(BranchForStats).Count; + const uint64_t CTCTakenFreq = + Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count; + // Annotate it, so "isCall" returns true for this jcc MIA->setConditionalTailCall(*CondBranch); // Add info abount the conditional tail call frequency, otherwise this From 94e3cdf5510552d7cee3db7270c753457bd38808 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 1 Feb 2018 14:36:29 -0800 Subject: [PATCH 376/904] [BOLT] Reduce the usage of "Offset" annotation Summary: Limiting "Offset" annotation only to instructions that actually need it, improves the memory consumption on HHVM binary by 1GB. 
(cherry picked from commit 7fd438a1ef227a6243f3ee5aa09fcf28ab1f5ef5) --- bolt/BinaryContext.h | 12 +++++++++++- bolt/BinaryFunction.cpp | 4 +++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 14cb834a2bdf..ad8150ce45c4 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -363,7 +363,7 @@ class BinaryContext { return Size; } - /// Return a function execution count threshold for determining whether the + /// Return a function execution count threshold for determining whether /// the function is 'hot'. Consider it hot if count is above the average exec /// count of profiled functions. uint64_t getHotThreshold() const { @@ -375,6 +375,16 @@ class BinaryContext { return Threshold; } + /// Return true if instruction \p Inst requires an offset for further + /// processing (e.g. assigning a profile). + bool keepOffsetForInstruction(const MCInst &Inst) const { + if (MIA->isCall(Inst) || MIA->isBranch(Inst) || MIA->isReturn(Inst) || + MIA->isPrefix(Inst) || MIA->isIndirectBranch(Inst)) { + return true; + } + return false; + } + /// Print the string name for a CFI operation. static void printCFI(raw_ostream &OS, const MCCFIInstruction &Inst); diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 4c249b0ab9ab..980802aa6580 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1242,7 +1242,9 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { } // Record offset of the instruction for profile matching. 
- MIA->addAnnotation(Ctx.get(), Instruction, "Offset", Offset); + if (BC.keepOffsetForInstruction(Instruction)) { + MIA->addAnnotation(Ctx.get(), Instruction, "Offset", Offset); + } if (MemData && !emptyRange(MemData->getMemInfoRange(Offset))) { MIA->addAnnotation(Ctx.get(), Instruction, "MemDataOffset", Offset); From e5349ead7e12f769971a43ddce38a06aa55beaa8 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 2 Feb 2018 14:46:21 -0800 Subject: [PATCH 377/904] [BOLT] Fix memory regression Summary: This fixes the increased memory consumption introduced in an earlier diff while I was working on new profiling infra. The increase came from a delayed release of memory allocated to intermediate structures used to build CFG. In this diff we release them ASAP, and don't keep them for all functions at the same time. (cherry picked from commit abd1e331a52ade66ba7146cf26b876328619611d) --- bolt/BinaryFunction.cpp | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 980802aa6580..1e20509e962d 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1627,6 +1627,14 @@ bool BinaryFunction::buildCFG() { updateLayoutIndices(); + // Clean-up memory taken by intermediate structures. + // + // NB: don't clear Labels list as we may need them if we mark the function + // as non-simple later in the process of discovering extra entry points. + clearList(Instructions); + clearList(OffsetToCFI); + clearList(TakenBranches); + // Update the state. CurrentState = State::CFG; @@ -1656,24 +1664,14 @@ void BinaryFunction::postProcessCFG() { } } - // Clean-up memory taken by instructions and labels. - // - // NB: don't clear Labels list as we may need them if we mark the function - // as non-simple later in the process of discovering extra entry points. 
- clearList(Instructions); - clearList(OffsetToCFI); - clearList(TakenBranches); + // The final cleanup of intermediate structures. clearList(IgnoredBranches); clearList(EntryOffsets); - // Remove "Offset" annotations from instructions that don't need those. - for (auto *BB : layout()) { - for (auto &Inst : *BB) { - if (BC.MIA->isCall(Inst) || BC.MIA->isIndirectBranch(Inst)) - continue; + // Remove "Offset" annotations. + for (auto *BB : layout()) + for (auto &Inst : *BB) BC.MIA->removeAnnotation(Inst, "Offset"); - } - } assert((!isSimple() || validateCFG()) && "Invalid CFG detected after post-processing CFG"); From 525b650a85509d5bb0e35be04feb4e530b82931c Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 6 Feb 2018 15:00:23 -0800 Subject: [PATCH 378/904] [BOLT rebase] Rebase fixes on top of LLVM Feb2018 Summary: This commit includes all code necessary to make BOLT working again after the rebase. This includes a redesign of the EHFrame work, cherry-pick of the 3dnow disassembly work, compilation error fixes, and port of the debug_info work. The macroop fusion feature is not ported yet. The rebased version has minor changes to the "executed instructions" dynostats counter because REP prefixes are considered a part of the instruction it applies to. Also, some X86 instructions had the "mayLoad" tablegen property removed, which BOLT uses to identify and account for loads, thus reducing the total number of loads reported by dynostats. This was observed in X86::MOVDQUmr. TRAP instructions are not terminators anymore, changing our CFG. This commit adds compensation to preserve this old behavior and minimize tests changes. debug_info sections are now slightly larger. The discriminator field in the line table is slightly different due to a change upstream. New profiles generated with the other bolt are incompatible with this version because of different hash values calculated for functions, so they will be considered 100% stale. 
This commit changes the corresponding test to XFAIL so it can be updated. The hash function changes because it relies on raw opcode values, which change according to the opcodes described in the X86 tablegen files. When processing HHVM, bolt was observed to be using about 800MB more memory in the rebased version and being about 5% slower. (cherry picked from commit 0522539c6956146706a0c3538a93d10ac116cbc3) --- bolt/BinaryBasicBlock.h | 37 +- bolt/BinaryContext.cpp | 58 +-- bolt/BinaryContext.h | 4 +- bolt/BinaryFunction.cpp | 84 ++-- bolt/BinaryFunction.h | 51 ++- bolt/BinaryFunctionProfile.cpp | 3 +- bolt/BinaryLoop.h | 6 +- bolt/BinaryPassManager.cpp | 5 +- bolt/BinaryPassManager.h | 1 + bolt/BinarySection.cpp | 28 +- bolt/BinarySection.h | 2 +- bolt/CMakeLists.txt | 3 + bolt/DWARFRewriter.cpp | 127 +++--- bolt/DataAggregator.cpp | 70 ++- bolt/DebugData.cpp | 18 +- bolt/DebugData.h | 4 +- bolt/Exceptions.cpp | 144 +++--- bolt/Exceptions.h | 12 +- bolt/Passes/BinaryFunctionCallGraph.cpp | 9 +- bolt/Passes/BinaryPasses.cpp | 22 +- bolt/Passes/CMakeLists.txt | 3 + bolt/Passes/CallGraphWalker.cpp | 3 +- bolt/Passes/DominatorAnalysis.h | 3 +- bolt/Passes/FrameAnalysis.cpp | 7 +- bolt/Passes/FrameOptimizer.cpp | 12 +- bolt/Passes/IndirectCallPromotion.cpp | 22 +- bolt/Passes/LivenessAnalysis.h | 3 +- bolt/Passes/LongJmp.cpp | 10 +- bolt/Passes/PLTCall.cpp | 3 +- bolt/Passes/ReachingDefOrUse.h | 3 +- bolt/Passes/ReachingInsns.h | 3 +- bolt/Passes/ReorderFunctions.cpp | 3 +- bolt/Passes/ShrinkWrapping.cpp | 3 +- bolt/Passes/StackAllocationAnalysis.h | 3 +- bolt/Passes/StackAvailableExpressions.h | 3 +- bolt/Passes/StackPointerTracking.h | 3 +- bolt/Passes/StackReachingUses.h | 3 +- bolt/RewriteInstance.cpp | 565 +++++++++++++----------- bolt/RewriteInstance.h | 38 +- bolt/llvm-bolt.cpp | 20 +- bolt/merge-fdata/merge-fdata.cpp | 6 +- 41 files changed, 723 insertions(+), 684 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 
90b4f11cdc7f..431a8bd19e60 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -804,6 +804,11 @@ class BinaryBasicBlock { LayoutIndex = Index; } + /// FIXME + BinaryFunction *getParent() const { + return nullptr; + } + private: void adjustNumPseudos(const MCInst &Inst, int Sign); @@ -850,57 +855,57 @@ bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS); // GraphTraits specializations for basic block graphs (CFGs) template <> struct GraphTraits { - using NodeType = bolt::BinaryBasicBlock; + using NodeRef = bolt::BinaryBasicBlock *; using ChildIteratorType = bolt::BinaryBasicBlock::succ_iterator; - static NodeType *getEntryNode(bolt::BinaryBasicBlock *BB) { return BB; } - static inline ChildIteratorType child_begin(NodeType *N) { + static NodeRef getEntryNode(bolt::BinaryBasicBlock *BB) { return BB; } + static inline ChildIteratorType child_begin(NodeRef N) { return N->succ_begin(); } - static inline ChildIteratorType child_end(NodeType *N) { + static inline ChildIteratorType child_end(NodeRef N) { return N->succ_end(); } }; template <> struct GraphTraits { - using NodeType = const bolt::BinaryBasicBlock; + using NodeRef = const bolt::BinaryBasicBlock *; using ChildIteratorType = bolt::BinaryBasicBlock::const_succ_iterator; - static NodeType *getEntryNode(const bolt::BinaryBasicBlock *BB) { + static NodeRef getEntryNode(const bolt::BinaryBasicBlock *BB) { return BB; } - static inline ChildIteratorType child_begin(NodeType *N) { + static inline ChildIteratorType child_begin(NodeRef N) { return N->succ_begin(); } - static inline ChildIteratorType child_end(NodeType *N) { + static inline ChildIteratorType child_end(NodeRef N) { return N->succ_end(); } }; template <> struct GraphTraits> { - using NodeType = bolt::BinaryBasicBlock; + using NodeRef = bolt::BinaryBasicBlock *; using ChildIteratorType = bolt::BinaryBasicBlock::pred_iterator; - static NodeType *getEntryNode(Inverse G) { + static NodeRef getEntryNode(Inverse G) { return 
G.Graph; } - static inline ChildIteratorType child_begin(NodeType *N) { + static inline ChildIteratorType child_begin(NodeRef N) { return N->pred_begin(); } - static inline ChildIteratorType child_end(NodeType *N) { + static inline ChildIteratorType child_end(NodeRef N) { return N->pred_end(); } }; template <> struct GraphTraits> { - using NodeType = const bolt::BinaryBasicBlock; + using NodeRef = const bolt::BinaryBasicBlock *; using ChildIteratorType = bolt::BinaryBasicBlock::const_pred_iterator; - static NodeType *getEntryNode(Inverse G) { + static NodeRef getEntryNode(Inverse G) { return G.Graph; } - static inline ChildIteratorType child_begin(NodeType *N) { + static inline ChildIteratorType child_begin(NodeRef N) { return N->pred_begin(); } - static inline ChildIteratorType child_end(NodeType *N) { + static inline ChildIteratorType child_end(NodeRef N) { return N->pred_end(); } }; diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 337f439b665b..363139c1d1b3 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -15,6 +15,7 @@ #include "llvm/DebugInfo/DWARF/DWARFFormValue.h" #include "llvm/DebugInfo/DWARF/DWARFUnit.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" @@ -48,10 +49,11 @@ PrintMemData("print-mem-data", BinaryContext::~BinaryContext() { } -MCObjectWriter *BinaryContext::createObjectWriter(raw_pwrite_stream &OS) { +std::unique_ptr +BinaryContext::createObjectWriter(raw_pwrite_stream &OS) { if (!MAB) { MAB = std::unique_ptr( - TheTarget->createMCAsmBackend(*MRI, TripleName, "")); + TheTarget->createMCAsmBackend(*STI, *MRI, MCTargetOptions())); } return MAB->createObjectWriter(OS); @@ -148,21 +150,20 @@ namespace { /// Recursively finds DWARF DW_TAG_subprogram DIEs and match them with /// BinaryFunctions. 
Record DIEs for unknown subprograms (mostly functions that /// are never called and removed from the binary) in Unknown. -void findSubprograms(DWARFCompileUnit *Unit, - const DWARFDebugInfoEntryMinimal *DIE, +void findSubprograms(const DWARFDie DIE, std::map &BinaryFunctions) { - if (DIE->isSubprogramDIE()) { + if (DIE.isSubprogramDIE()) { // TODO: handle DW_AT_ranges. - uint64_t LowPC, HighPC; - if (DIE->getLowAndHighPC(Unit, LowPC, HighPC)) { + uint64_t LowPC, HighPC, SectionIndex; + if (DIE.getLowAndHighPC(LowPC, HighPC, SectionIndex)) { auto It = BinaryFunctions.find(LowPC); if (It != BinaryFunctions.end()) { - It->second.addSubprogramDIE(Unit, DIE); + It->second.addSubprogramDIE(DIE); } else { // The function must have been optimized away by GC. } } else { - const auto RangesVector = DIE->getAddressRanges(Unit); + const auto RangesVector = DIE.getAddressRanges(); if (!RangesVector.empty()) { errs() << "BOLT-ERROR: split function detected in .debug_info. " "Split functions are not supported.\n"; @@ -171,10 +172,9 @@ void findSubprograms(DWARFCompileUnit *Unit, } } - for (auto ChildDIE = DIE->getFirstChild(); - ChildDIE != nullptr && !ChildDIE->isNULL(); - ChildDIE = ChildDIE->getSibling()) { - findSubprograms(Unit, ChildDIE, BinaryFunctions); + for (auto ChildDIE = DIE.getFirstChild(); ChildDIE && !ChildDIE.isNULL(); + ChildDIE = ChildDIE.getSibling()) { + findSubprograms(ChildDIE, BinaryFunctions); } } @@ -190,10 +190,13 @@ unsigned BinaryContext::addDebugFilenameToUnit(const uint32_t DestCUID, // means empty dir. assert(FileIndex > 0 && FileIndex <= FileNames.size() && "FileIndex out of range for the compilation unit."); - const char *Dir = FileNames[FileIndex - 1].DirIdx ? - LineTable->Prologue.IncludeDirectories[FileNames[FileIndex - 1].DirIdx - 1] : - ""; - return Ctx->getDwarfFile(Dir, FileNames[FileIndex - 1].Name, 0, DestCUID); + StringRef Dir = + FileNames[FileIndex - 1].DirIdx + ? 
LineTable->Prologue + .IncludeDirectories[FileNames[FileIndex - 1].DirIdx - 1] + : ""; + return Ctx->getDwarfFile(Dir, FileNames[FileIndex - 1].Name, 0, nullptr, + DestCUID); } std::vector BinaryContext::getSortedFunctions( @@ -221,22 +224,23 @@ void BinaryContext::preprocessDebugInfo( // Populate MCContext with DWARF files. for (const auto &CU : DwCtx->compile_units()) { const auto CUID = CU->getOffset(); - auto LineTable = DwCtx->getLineTableForUnit(CU.get()); + auto *LineTable = DwCtx->getLineTableForUnit(CU.get()); const auto &FileNames = LineTable->Prologue.FileNames; for (size_t I = 0, Size = FileNames.size(); I != Size; ++I) { // Dir indexes start at 1, as DWARF file numbers, and a dir index 0 // means empty dir. - const char *Dir = FileNames[I].DirIdx ? - LineTable->Prologue.IncludeDirectories[FileNames[I].DirIdx - 1] : - ""; - Ctx->getDwarfFile(Dir, FileNames[I].Name, 0, CUID); + StringRef Dir = + FileNames[I].DirIdx + ? LineTable->Prologue.IncludeDirectories[FileNames[I].DirIdx - 1] + : ""; + Ctx->getDwarfFile(Dir, FileNames[I].Name, 0, nullptr, CUID); } } // For each CU, iterate over its children DIEs and match subprogram DIEs to // BinaryFunctions. 
for (auto &CU : DwCtx->compile_units()) { - findSubprograms(CU.get(), CU->getUnitDIE(false), BinaryFunctions); + findSubprograms(CU->getUnitDIE(false), BinaryFunctions); } // Some functions may not have a corresponding subprogram DIE @@ -250,8 +254,8 @@ void BinaryContext::preprocessDebugInfo( if (auto DebugAranges = DwCtx->getDebugAranges()) { auto CUOffset = DebugAranges->findAddress(FunctionAddress); if (CUOffset != -1U) { - Function.addSubprogramDIE(DwCtx->getCompileUnitForOffset(CUOffset), - nullptr); + Function.addSubprogramDIE( + DWARFDie(DwCtx->getCompileUnitForOffset(CUOffset), nullptr)); continue; } } @@ -266,7 +270,7 @@ void BinaryContext::preprocessDebugInfo( for (const auto &Range : CUDie->getAddressRanges(CU.get())) { if (FunctionAddress >= Range.first && FunctionAddress < Range.second) { - Function.addSubprogramDIE(CU.get(), nullptr); + Function.addSubprogramDIE(DWARFDie(CU.get(), nullptr)); break; } } @@ -495,7 +499,7 @@ BinaryContext::extractPointerAtAddress(uint64_t Address) const { StringRef SectionContents = Section->getContents(); DataExtractor DE(SectionContents, AsmInfo->isLittleEndian(), - AsmInfo->getPointerSize()); + AsmInfo->getCodePointerSize()); uint32_t SectionOffset = Address - Section->getAddress(); return DE.getAddress(&SectionOffset); } diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index ad8150ce45c4..c9067168c7f1 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -23,7 +23,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCInstPrinter.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" @@ -189,7 +189,7 @@ class BinaryContext { ~BinaryContext(); - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS); + std::unique_ptr createObjectWriter(raw_pwrite_stream &OS); /// Return a global symbol registered at a given \p Address. 
If no symbol /// exists, create one with unique name using \p Prefix. diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 1e20509e962d..3e9c7febbbdd 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -90,8 +90,7 @@ JumpTables("jump-tables", "function execution frequency"), clEnumValN(JTS_AGGRESSIVE, "aggressive", "aggressively split jump tables section based on usage " - "of the tables"), - clEnumValEnd), + "of the tables")), cl::ZeroOrMore, cl::cat(BoltOptCategory)); @@ -163,7 +162,8 @@ namespace bolt { constexpr const char *DynoStats::Desc[]; constexpr unsigned BinaryFunction::MinAlign; -const char BinaryFunction::TimerGroupName[] = "Build binary functions"; +const char BinaryFunction::TimerGroupName[] = "buildfuncs"; +const char BinaryFunction::TimerGroupDesc[] = "Build Binary Functions"; namespace { @@ -485,7 +485,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, OS << '\n'; } - Offset = RoundUpToAlignment(Offset, BB->getAlignment()); + Offset = alignTo(Offset, BB->getAlignment()); // Note: offsets are imprecise since this is happening prior to relaxation. Offset = BC.printInstructions(OS, BB->begin(), BB->end(), Offset, this); @@ -605,7 +605,7 @@ void BinaryFunction::printRelocations(raw_ostream &OS, IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, unsigned Size, uint64_t Offset) { - const auto PtrSize = BC.AsmInfo->getPointerSize(); + const auto PtrSize = BC.AsmInfo->getCodePointerSize(); // An instruction referencing memory used by jump instruction (directly or // via register). 
This location could be an array of function pointers @@ -869,7 +869,8 @@ MCSymbol *BinaryFunction::getOrCreateLocalLabel(uint64_t Address, } void BinaryFunction::disassemble(ArrayRef FunctionData) { - NamedRegionTimer T("disassemble", TimerGroupName, opts::TimeBuild); + NamedRegionTimer T("disassemble", "Disassemble function", TimerGroupName, + TimerGroupDesc, opts::TimeBuild); assert(FunctionData.size() == getSize() && "function size does not match raw data size"); @@ -1435,7 +1436,8 @@ void BinaryFunction::recomputeLandingPads() { } bool BinaryFunction::buildCFG() { - NamedRegionTimer T("build cfg", TimerGroupName, opts::TimeBuild); + NamedRegionTimer T("buildcfg", "Build CFG", TimerGroupName, TimerGroupDesc, + opts::TimeBuild); auto &MIA = BC.MIA; if (!isSimple()) { @@ -2073,8 +2075,6 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { Streamer.EmitCodeAlignment(BB->getAlignment()); Streamer.EmitLabel(BB->getLabel()); - // Remember if last instruction emitted was a prefix - bool LastIsPrefix = false; SMLoc LastLocSeen; for (auto I = BB->begin(), E = BB->end(); I != E; ++I) { auto &Instr = *I; @@ -2105,7 +2105,6 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { } Streamer.EmitInstruction(Instr, *BC.STI); - LastIsPrefix = BC.MIA->isPrefix(Instr); } } @@ -3129,10 +3128,9 @@ bool BinaryFunction::isDataMarker(const SymbolRef &Symbol, uint64_t SymbolSize) const { // For aarch64, the ABI defines mapping symbols so we identify data in the // code section (see IHI0056B). $d identifies a symbol starting data contents. 
- if (BC.TheTriple->getArch() == llvm::Triple::aarch64 && - Symbol.getType() == SymbolRef::ST_Unknown && - SymbolSize == 0 && - (!Symbol.getName().getError() && *Symbol.getName() == "$d")) + if (BC.TheTriple->getArch() == llvm::Triple::aarch64 && Symbol.getType() && + cantFail(Symbol.getType()) == SymbolRef::ST_Unknown && SymbolSize == 0 && + Symbol.getName() && cantFail(Symbol.getName()) == "$d") return true; return false; } @@ -3142,10 +3140,9 @@ bool BinaryFunction::isCodeMarker(const SymbolRef &Symbol, // For aarch64, the ABI defines mapping symbols so we identify data in the // code section (see IHI0056B). $x identifies a symbol starting code or the // end of a data chunk inside code. - if (BC.TheTriple->getArch() == llvm::Triple::aarch64 && - Symbol.getType() == SymbolRef::ST_Unknown && - SymbolSize == 0 && - (!Symbol.getName().getError() && *Symbol.getName() == "$x")) + if (BC.TheTriple->getArch() == llvm::Triple::aarch64 && Symbol.getType() && + cantFail(Symbol.getType()) == SymbolRef::ST_Unknown && SymbolSize == 0 && + Symbol.getName() && cantFail(Symbol.getName()) == "$x") return true; return false; } @@ -3159,10 +3156,10 @@ bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol, // It's okay to have a zero-sized symbol in the middle of non-zero-sized // function. - if (SymbolSize == 0 && containsAddress(*Symbol.getAddress())) + if (SymbolSize == 0 && containsAddress(cantFail(Symbol.getAddress()))) return true; - if (Symbol.getType() != SymbolRef::ST_Unknown) + if (cantFail(Symbol.getType()) != SymbolRef::ST_Unknown) return false; if (Symbol.getFlags() & SymbolRef::SF_Global) @@ -3413,8 +3410,8 @@ void BinaryFunction::JumpTable::print(raw_ostream &OS) const { void BinaryFunction::calculateLoopInfo() { // Discover loops. 
- BinaryDominatorTree DomTree(false); - DomTree.recalculate(*this); + BinaryDominatorTree DomTree; + DomTree.recalculate(*this); BLI.reset(new BinaryLoopInfo()); BLI->analyze(DomTree); @@ -3531,15 +3528,15 @@ DWARFAddressRangesVector BinaryFunction::translateInputToOutputRanges( uint64_t PrevEndAddress = 0; DWARFAddressRangesVector OutputRanges; for (const auto &Range : InputRanges) { - if (!containsAddress(Range.first)) { + if (!containsAddress(Range.LowPC)) { DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected for " - << *this << " : [0x" << Twine::utohexstr(Range.first) - << ", 0x" << Twine::utohexstr(Range.second) << "]\n"); + << *this << " : [0x" << Twine::utohexstr(Range.LowPC) + << ", 0x" << Twine::utohexstr(Range.HighPC) << "]\n"); PrevEndAddress = 0; continue; } - auto InputOffset = Range.first - getAddress(); - const auto InputEndOffset = std::min(Range.second - getAddress(), getSize()); + auto InputOffset = Range.LowPC - getAddress(); + const auto InputEndOffset = std::min(Range.HighPC - getAddress(), getSize()); auto BBI = std::upper_bound(BasicBlockOffsets.begin(), BasicBlockOffsets.end(), @@ -3550,8 +3547,8 @@ DWARFAddressRangesVector BinaryFunction::translateInputToOutputRanges( const auto *BB = BBI->second; if (InputOffset < BB->getOffset() || InputOffset >= BB->getEndOffset()) { DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected for " - << *this << " : [0x" << Twine::utohexstr(Range.first) - << ", 0x" << Twine::utohexstr(Range.second) << "]\n"); + << *this << " : [0x" << Twine::utohexstr(Range.LowPC) + << ", 0x" << Twine::utohexstr(Range.HighPC) << "]\n"); PrevEndAddress = 0; break; } @@ -3564,13 +3561,13 @@ DWARFAddressRangesVector BinaryFunction::translateInputToOutputRanges( EndAddress = StartAddress + InputEndOffset - InputOffset; if (StartAddress == PrevEndAddress) { - OutputRanges.back().second = std::max(OutputRanges.back().second, + OutputRanges.back().HighPC = std::max(OutputRanges.back().HighPC, EndAddress); } 
else { OutputRanges.emplace_back(StartAddress, std::max(StartAddress, EndAddress)); } - PrevEndAddress = OutputRanges.back().second; + PrevEndAddress = OutputRanges.back().HighPC; } InputOffset = BB->getEndOffset(); @@ -3583,13 +3580,13 @@ DWARFAddressRangesVector BinaryFunction::translateInputToOutputRanges( DWARFAddressRangesVector MergedRanges; PrevEndAddress = 0; for(const auto &Range : OutputRanges) { - if (Range.first <= PrevEndAddress) { - MergedRanges.back().second = std::max(MergedRanges.back().second, - Range.second); + if (Range.LowPC <= PrevEndAddress) { + MergedRanges.back().HighPC = std::max(MergedRanges.back().HighPC, + Range.HighPC); } else { - MergedRanges.emplace_back(Range.first, Range.second); + MergedRanges.emplace_back(Range.LowPC, Range.HighPC); } - PrevEndAddress = MergedRanges.back().second; + PrevEndAddress = MergedRanges.back().HighPC; } return MergedRanges; @@ -3619,27 +3616,28 @@ MCInst *BinaryFunction::getInstructionAtOffset(uint64_t Offset) { DWARFDebugLoc::LocationList BinaryFunction::translateInputToOutputLocationList( const DWARFDebugLoc::LocationList &InputLL, - uint64_t BaseAddress) const { + BaseAddress BaseAddr) const { + uint64_t BAddr = BaseAddr.Address; // If the function wasn't changed - there's nothing to update. 
if (!isEmitted() && !BC.HasRelocations) { - if (!BaseAddress) { + if (!BAddr) { return InputLL; } else { auto OutputLL = std::move(InputLL); for (auto &Entry : OutputLL.Entries) { - Entry.Begin += BaseAddress; - Entry.End += BaseAddress; + Entry.Begin += BAddr; + Entry.End += BAddr; } return OutputLL; } } uint64_t PrevEndAddress = 0; - SmallVectorImpl *PrevLoc = nullptr; + SmallVectorImpl *PrevLoc = nullptr; DWARFDebugLoc::LocationList OutputLL; for (auto &Entry : InputLL.Entries) { - const auto Start = Entry.Begin + BaseAddress; - const auto End = Entry.End + BaseAddress; + const auto Start = Entry.Begin + BAddr; + const auto End = Entry.End + BAddr; if (!containsAddress(Start)) { DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected for " << *this << " : [0x" << Twine::utohexstr(Start) diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 88546feb950d..43985de00949 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -24,9 +24,10 @@ #include "DebugData.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/ilist.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrAnalysis.h" @@ -34,7 +35,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Dwarf.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -45,14 +45,14 @@ using namespace llvm::object; namespace llvm { -class DWARFCompileUnit; +class DWARFUnit; class DWARFDebugInfoEntryMinimal; namespace bolt { struct SectionInfo; -using DWARFUnitLineTable = std::pair; /// Class encapsulating runtime statistics about an execution unit. 
@@ -233,6 +233,7 @@ class BinaryFunction { static constexpr unsigned MinAlign = 2; static const char TimerGroupName[]; + static const char TimerGroupDesc[]; using BasicBlockOrderType = std::vector; @@ -345,8 +346,7 @@ class BinaryFunction { /// Associated DIEs in the .debug_info section with their respective CUs. /// There can be multiple because of identical code folding. - std::vector> SubprogramDIEs; + std::vector SubprogramDIEs; /// Line table for the function with containing compilation unit. /// Because of identical code folding the function could have multiple @@ -1235,11 +1235,11 @@ class BinaryFunction { case ELF::R_AARCH64_LDST64_ABS_LO12_NC: case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: - case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: + case ELF::R_AARCH64_TLSDESC_LD64_LO12: case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: case ELF::R_AARCH64_LD64_GOT_LO12_NC: - case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: + case ELF::R_AARCH64_TLSDESC_ADD_LO12: case ELF::R_AARCH64_ADD_ABS_LO12_NC: case ELF::R_AARCH64_LDST16_ABS_LO12_NC: case ELF::R_AARCH64_LDST32_ABS_LO12_NC: @@ -2163,12 +2163,12 @@ class BinaryFunction { std::size_t hash(bool Recompute = true, bool UseDFS = false) const; /// Sets the associated .debug_info entry. - void addSubprogramDIE(DWARFCompileUnit *Unit, - const DWARFDebugInfoEntryMinimal *DIE) { - SubprogramDIEs.emplace_back(DIE, Unit); + void addSubprogramDIE(const DWARFDie DIE) { + SubprogramDIEs.emplace_back(DIE); if (!UnitLineTable.first) { - if (const auto *LineTable = BC.DwCtx->getLineTableForUnit(Unit)) { - UnitLineTable = std::make_pair(Unit, LineTable); + if (const auto *LineTable = + BC.DwCtx->getLineTableForUnit(DIE.getDwarfUnit())) { + UnitLineTable = std::make_pair(DIE.getDwarfUnit(), LineTable); } } } @@ -2285,7 +2285,7 @@ class BinaryFunction { /// \p BaseAddress is applied to all addresses in \pInputLL. 
DWARFDebugLoc::LocationList translateInputToOutputLocationList( const DWARFDebugLoc::LocationList &InputLL, - uint64_t BaseAddress) const; + BaseAddress BaseAddr) const; virtual ~BinaryFunction(); @@ -2388,18 +2388,20 @@ inline raw_ostream &operator<<(raw_ostream &OS, // GraphTraits specializations for function basic block graphs (CFGs) template <> struct GraphTraits : public GraphTraits { - static NodeType *getEntryNode(bolt::BinaryFunction *F) { + static NodeRef getEntryNode(bolt::BinaryFunction *F) { return *F->layout_begin(); } - typedef bolt::BinaryBasicBlock * nodes_iterator; + using nodes_iterator = pointer_iterator; + +// typedef bolt::BinaryBasicBlock * nodes_iterator; static nodes_iterator nodes_begin(bolt::BinaryFunction *F) { llvm_unreachable("Not implemented"); - return &(*F->begin()); + return nodes_iterator(F->begin()); } static nodes_iterator nodes_end(bolt::BinaryFunction *F) { llvm_unreachable("Not implemented"); - return &(*F->end()); + return nodes_iterator(F->end()); } static size_t size(bolt::BinaryFunction *F) { return F->size(); @@ -2408,18 +2410,19 @@ template <> struct GraphTraits : template <> struct GraphTraits : public GraphTraits { - static NodeType *getEntryNode(const bolt::BinaryFunction *F) { + static NodeRef getEntryNode(const bolt::BinaryFunction *F) { return *F->layout_begin(); } - typedef const bolt::BinaryBasicBlock * nodes_iterator; + using nodes_iterator = pointer_iterator; + static nodes_iterator nodes_begin(const bolt::BinaryFunction *F) { llvm_unreachable("Not implemented"); - return &(*F->begin()); + return nodes_iterator(F->begin()); } static nodes_iterator nodes_end(const bolt::BinaryFunction *F) { llvm_unreachable("Not implemented"); - return &(*F->end()); + return nodes_iterator(F->end()); } static size_t size(const bolt::BinaryFunction *F) { return F->size(); @@ -2428,14 +2431,14 @@ template <> struct GraphTraits : template <> struct GraphTraits> : public GraphTraits> { - static NodeType *getEntryNode(Inverse G) { + 
static NodeRef getEntryNode(Inverse G) { return *G.Graph->layout_begin(); } }; template <> struct GraphTraits> : public GraphTraits> { - static NodeType *getEntryNode(Inverse G) { + static NodeRef getEntryNode(Inverse G) { return *G.Graph->layout_begin(); } }; diff --git a/bolt/BinaryFunctionProfile.cpp b/bolt/BinaryFunctionProfile.cpp index c341faa72b11..6b84ec559a8c 100644 --- a/bolt/BinaryFunctionProfile.cpp +++ b/bolt/BinaryFunctionProfile.cpp @@ -56,8 +56,7 @@ DoMCF("mcf", clEnumValN(MCF_LOG, "log", "cost function is inversely proportional to log of edge count"), clEnumValN(MCF_BLAMEFTS, "blamefts", - "tune cost to blame fall-through edges for surplus flow"), - clEnumValEnd), + "tune cost to blame fall-through edges for surplus flow")), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); diff --git a/bolt/BinaryLoop.h b/bolt/BinaryLoop.h index dc396d54b0b0..4406f073abd1 100644 --- a/bolt/BinaryLoop.h +++ b/bolt/BinaryLoop.h @@ -25,8 +25,8 @@ namespace bolt { class BinaryBasicBlock; -typedef DomTreeNodeBase BinaryDomTreeNode; -typedef DominatorTreeBase BinaryDominatorTree; +using BinaryDomTreeNode = DomTreeNodeBase; +using BinaryDominatorTree = DomTreeBase; class BinaryLoop : public LoopBase { public: @@ -76,7 +76,7 @@ template <> struct GraphTraits template <> struct GraphTraits : public GraphTraits { - static NodeType *getEntryNode(bolt::BinaryDominatorTree *DT) { + static NodeRef getEntryNode(bolt::BinaryDominatorTree *DT) { return DT->getRootNode(); } diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp index 48d3d63dfc7c..bbdba97246c5 100644 --- a/bolt/BinaryPassManager.cpp +++ b/bolt/BinaryPassManager.cpp @@ -272,6 +272,8 @@ namespace bolt { using namespace opts; const char BinaryFunctionPassManager::TimerGroupName[] = + "passman"; +const char BinaryFunctionPassManager::TimerGroupDesc[] = "Binary Function Pass Manager"; void BinaryFunctionPassManager::runPasses() { @@ -285,7 +287,8 @@ void BinaryFunctionPassManager::runPasses() { outs() 
<< "BOLT-INFO: Starting pass: " << Pass->getName() << "\n"; } - NamedRegionTimer T(Pass->getName(), TimerGroupName, TimeOpts); + NamedRegionTimer T(Pass->getName(), Pass->getName(), TimerGroupName, + TimerGroupDesc, TimeOpts); callWithDynoStats( [this,&Pass] { diff --git a/bolt/BinaryPassManager.h b/bolt/BinaryPassManager.h index ceacc33cdca2..08fa9c1af2f5 100644 --- a/bolt/BinaryPassManager.h +++ b/bolt/BinaryPassManager.h @@ -34,6 +34,7 @@ class BinaryFunctionPassManager { public: static const char TimerGroupName[]; + static const char TimerGroupDesc[]; BinaryFunctionPassManager(BinaryContext &BC, std::map &BFs, diff --git a/bolt/BinarySection.cpp b/bolt/BinarySection.cpp index 92417bc387b5..80c038e6ea0b 100644 --- a/bolt/BinarySection.cpp +++ b/bolt/BinarySection.cpp @@ -50,8 +50,8 @@ bool Relocation::isSupported(uint64_t Type) { case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: case ELF::R_AARCH64_LD64_GOT_LO12_NC: - case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: - case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: + case ELF::R_AARCH64_TLSDESC_LD64_LO12: + case ELF::R_AARCH64_TLSDESC_ADD_LO12: case ELF::R_AARCH64_TLSDESC_CALL: case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: case ELF::R_AARCH64_JUMP26: @@ -93,8 +93,8 @@ size_t Relocation::getSizeForType(uint64_t Type) { case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: case ELF::R_AARCH64_LD64_GOT_LO12_NC: - case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: - case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: + case ELF::R_AARCH64_TLSDESC_LD64_LO12: + case ELF::R_AARCH64_TLSDESC_ADD_LO12: case ELF::R_AARCH64_TLSDESC_CALL: case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: case ELF::R_AARCH64_JUMP26: @@ -137,7 +137,7 @@ uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents, return Contents; } case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: - case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: + case ELF::R_AARCH64_TLSDESC_LD64_LO12: case ELF::R_AARCH64_LD64_GOT_LO12_NC: 
case ELF::R_AARCH64_LDST64_ABS_LO12_NC: { // Immediate goes in bits 21:10 of LD/ST instruction, taken @@ -147,7 +147,7 @@ uint64_t Relocation::extractValue(uint64_t Type, uint64_t Contents, } case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: - case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: + case ELF::R_AARCH64_TLSDESC_ADD_LO12: case ELF::R_AARCH64_ADD_ABS_LO12_NC: { // Immediate goes in bits 21:10 of ADD instruction Contents &= ~0xffffffffffc003ffU; @@ -203,8 +203,8 @@ bool Relocation::isGOT(uint64_t Type) { case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: - case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: - case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: + case ELF::R_AARCH64_TLSDESC_LD64_LO12: + case ELF::R_AARCH64_TLSDESC_ADD_LO12: case ELF::R_AARCH64_TLSDESC_CALL: return true; } @@ -221,8 +221,8 @@ bool Relocation::isTLS(uint64_t Type) { case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: - case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: - case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: + case ELF::R_AARCH64_TLSDESC_LD64_LO12: + case ELF::R_AARCH64_TLSDESC_ADD_LO12: case ELF::R_AARCH64_TLSDESC_CALL: case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: return true; @@ -251,8 +251,8 @@ bool Relocation::isPCRelative(uint64_t Type) { case ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12: case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: case ELF::R_AARCH64_LD64_GOT_LO12_NC: - case ELF::R_AARCH64_TLSDESC_LD64_LO12_NC: - case ELF::R_AARCH64_TLSDESC_ADD_LO12_NC: + case ELF::R_AARCH64_TLSDESC_LD64_LO12: + case ELF::R_AARCH64_TLSDESC_ADD_LO12: return false; case ELF::R_X86_64_PC8: @@ -305,10 +305,10 @@ size_t Relocation::emit(MCStreamer *Streamer) const { void Relocation::print(raw_ostream &OS) const { static const char *X86RelocNames[] = { -#include "llvm/Support/ELFRelocs/x86_64.def" +#include 
"llvm/BinaryFormat/ELFRelocs/x86_64.def" }; static const char *AArch64RelocNames[] = { -#include "llvm/Support/ELFRelocs/AArch64.def" +#include "llvm/BinaryFormat/ELFRelocs/AArch64.def" }; if (Arch == Triple::aarch64) OS << AArch64RelocNames[Type]; diff --git a/bolt/BinarySection.h b/bolt/BinarySection.h index 4f0eb8013370..80a3072cf15e 100644 --- a/bolt/BinarySection.h +++ b/bolt/BinarySection.h @@ -13,10 +13,10 @@ #define LLVM_TOOLS_LLVM_BOLT_BINARY_SECTION_H #include "llvm/ADT/Triple.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Object/ELFObjectFile.h" #include "llvm/Object/ObjectFile.h" -#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/raw_ostream.h" #include diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index bdcdb051f5b6..742405334e3e 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -75,6 +75,9 @@ add_llvm_tool(llvm-bolt ProfileReader.cpp ProfileWriter.cpp RewriteInstance.cpp + + DEPENDS + intrinsics_gen ) add_llvm_tool_symlink(perf2bolt llvm-bolt) diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp index 5e89156477f6..67f31d6b2071 100644 --- a/bolt/DWARFRewriter.cpp +++ b/bolt/DWARFRewriter.cpp @@ -15,12 +15,14 @@ #include "BinaryFunction.h" #include "RewriteInstance.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/DWARF/DWARFDebugLine.h" #include "llvm/DebugInfo/DWARF/DWARFFormValue.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDwarf.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" @@ -29,11 +31,9 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Dwarf.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Errc.h" #include 
"llvm/Support/ManagedStatic.h" -#include "llvm/Support/TimeValue.h" #include "llvm/Support/Timer.h" #include @@ -67,8 +67,7 @@ void RewriteInstance::updateDebugInfo() { LocationListWriter = llvm::make_unique(BC.get()); for (auto &CU : BC->DwCtx->compile_units()) { - updateUnitDebugInfo(CU.get(), - CU->getUnitDIE(false), + updateUnitDebugInfo(CU->getUnitDIE(false), std::vector{}); } @@ -78,28 +77,27 @@ void RewriteInstance::updateDebugInfo() { } void RewriteInstance::updateUnitDebugInfo( - DWARFCompileUnit *Unit, - const DWARFDebugInfoEntryMinimal *DIE, + const DWARFDie DIE, std::vector FunctionStack) { bool IsFunctionDef = false; - switch (DIE->getTag()) { + switch (DIE.getTag()) { case dwarf::DW_TAG_compile_unit: { - const auto ModuleRanges = DIE->getAddressRanges(Unit); + const auto ModuleRanges = DIE.getAddressRanges(); auto OutputRanges = translateModuleAddressRanges(ModuleRanges); const auto RangesSectionOffset = - RangesSectionsWriter->addCURanges(Unit->getOffset(), + RangesSectionsWriter->addCURanges(DIE.getDwarfUnit()->getOffset(), std::move(OutputRanges)); - updateDWARFObjectAddressRanges(Unit, DIE, RangesSectionOffset); + updateDWARFObjectAddressRanges(DIE, RangesSectionOffset); } break; case dwarf::DW_TAG_subprogram: { // The function cannot have multiple ranges on the input. - uint64_t LowPC, HighPC; - if (DIE->getLowAndHighPC(Unit, LowPC, HighPC)) { + uint64_t SectionIndex, LowPC, HighPC; + if (DIE.getLowAndHighPC(LowPC, HighPC, SectionIndex)) { IsFunctionDef = true; const auto *Function = getBinaryFunctionAtAddress(LowPC); if (Function && Function->isFolded()) { @@ -114,7 +112,7 @@ void RewriteInstance::updateUnitDebugInfo( RangesSectionsWriter->addRanges(Function, std::move(FunctionRanges)); } - updateDWARFObjectAddressRanges(Unit, DIE, RangesSectionOffset); + updateDWARFObjectAddressRanges(DIE, RangesSectionOffset); } } break; @@ -129,19 +127,19 @@ void RewriteInstance::updateUnitDebugInfo( const BinaryFunction *Function = FunctionStack.empty() ? 
nullptr : FunctionStack.back(); if (Function) { - const auto Ranges = DIE->getAddressRanges(Unit); + const auto Ranges = DIE.getAddressRanges(); auto OutputRanges = Function->translateInputToOutputRanges(Ranges); DEBUG( if (OutputRanges.empty() != Ranges.empty()) { dbgs() << "BOLT-DEBUG: problem with DIE at 0x" - << Twine::utohexstr(DIE->getOffset()) << " in CU at 0x" - << Twine::utohexstr(Unit->getOffset()) << '\n'; + << Twine::utohexstr(DIE.getOffset()) << " in CU at 0x" + << Twine::utohexstr(DIE.getDwarfUnit()->getOffset()) << '\n'; } ); RangesSectionOffset = RangesSectionsWriter->addRanges(Function, std::move(OutputRanges)); } - updateDWARFObjectAddressRanges(Unit, DIE, RangesSectionOffset); + updateDWARFObjectAddressRanges(DIE, RangesSectionOffset); } break; @@ -152,8 +150,8 @@ void RewriteInstance::updateUnitDebugInfo( uint32_t AttrOffset; const BinaryFunction *Function = FunctionStack.empty() ? nullptr : FunctionStack.back(); - if (DIE->getAttributeValue(Unit, dwarf::DW_AT_location, Value, - &AttrOffset)) { + if (auto V = DIE.find(dwarf::DW_AT_location, &AttrOffset)) { + Value = *V; if (Value.isFormClass(DWARFFormValue::FC_Constant) || Value.isFormClass(DWARFFormValue::FC_SectionOffset)) { auto LocListSectionOffset = LocationListWriter->getEmptyListOffset(); @@ -164,20 +162,25 @@ void RewriteInstance::updateUnitDebugInfo( Value.getAsUnsignedConstant().getValue() : Value.getAsSectionOffset().getValue(); - Unit->getContext().getOneDebugLocList(LL); - if (LL.Entries.empty()) { + uint32_t LLOff = LL.Offset; + auto OptLL = + DIE.getDwarfUnit()->getContext().getOneDebugLocList(&LLOff); + if (!OptLL || OptLL->Entries.empty()) { errs() << "BOLT-WARNING: empty location list detected at 0x" - << Twine::utohexstr(LL.Offset) << " for DIE at 0x" - << Twine::utohexstr(DIE->getOffset()) << " in CU at 0x" - << Twine::utohexstr(Unit->getOffset()) << '\n'; + << Twine::utohexstr(LLOff) << " for DIE at 0x" + << Twine::utohexstr(DIE.getOffset()) << " in CU at 0x" + << 
Twine::utohexstr(DIE.getDwarfUnit()->getOffset()) + << '\n'; } else { - const auto OutputLL = Function-> - translateInputToOutputLocationList(LL, Unit->getBaseAddress()); + const auto OutputLL = + Function->translateInputToOutputLocationList( + *OptLL, *DIE.getDwarfUnit()->getBaseAddress()); DEBUG(if (OutputLL.Entries.empty()) { dbgs() << "BOLT-DEBUG: location list translated to an empty " "one at 0x" - << Twine::utohexstr(DIE->getOffset()) << " in CU at 0x" - << Twine::utohexstr(Unit->getOffset()) << '\n'; + << Twine::utohexstr(DIE.getOffset()) << " in CU at 0x" + << Twine::utohexstr(DIE.getDwarfUnit()->getOffset()) + << '\n'; }); LocListSectionOffset = LocationListWriter->addList(OutputLL); } @@ -192,9 +195,9 @@ void RewriteInstance::updateUnitDebugInfo( Value.isFormClass(DWARFFormValue::FC_Block)) && "unexpected DW_AT_location form"); } - } else if (DIE->getAttributeValue(Unit, dwarf::DW_AT_low_pc, Value, - &AttrOffset)) { - const auto Result = Value.getAsAddress(Unit); + } else if (auto V = DIE.find(dwarf::DW_AT_low_pc, &AttrOffset)) { + Value = *V; + const auto Result = Value.getAsAddress(); if (Result.hasValue()) { uint64_t NewAddress = 0; if (Function) { @@ -202,7 +205,7 @@ void RewriteInstance::updateUnitDebugInfo( NewAddress = Function->translateInputToOutputAddress(Address); DEBUG(dbgs() << "BOLT-DEBUG: Fixing low_pc 0x" << Twine::utohexstr(Address) - << " for DIE with tag " << DIE->getTag() + << " for DIE with tag " << DIE.getTag() << " to 0x" << Twine::utohexstr(NewAddress) << '\n'); } auto DebugInfoPatcher = @@ -218,19 +221,16 @@ void RewriteInstance::updateUnitDebugInfo( } // Recursively update each child. 
- for (auto Child = DIE->getFirstChild(); Child; Child = Child->getSibling()) { - updateUnitDebugInfo(Unit, Child, FunctionStack); + for (auto Child = DIE.getFirstChild(); Child; Child = Child.getSibling()) { + updateUnitDebugInfo(Child, FunctionStack); } if (IsFunctionDef) FunctionStack.pop_back(); } - void RewriteInstance::updateDWARFObjectAddressRanges( - const DWARFUnit *Unit, - const DWARFDebugInfoEntryMinimal *DIE, - uint64_t DebugRangesOffset) { + const DWARFDie DIE, uint64_t DebugRangesOffset) { // Some objects don't have an associated DIE and cannot be updated (such as // compiler-generated functions). @@ -240,7 +240,7 @@ void RewriteInstance::updateDWARFObjectAddressRanges( if (opts::Verbosity >= 2 && DebugRangesOffset == -1U) { errs() << "BOLT-WARNING: using invalid DW_AT_range for DIE at offset 0x" - << Twine::utohexstr(DIE->getOffset()) << '\n'; + << Twine::utohexstr(DIE.getOffset()) << '\n'; } auto DebugInfoPatcher = @@ -250,24 +250,24 @@ void RewriteInstance::updateDWARFObjectAddressRanges( assert(DebugInfoPatcher && AbbrevPatcher && "Patchers not initialized."); - const auto *AbbreviationDecl = DIE->getAbbreviationDeclarationPtr(); + const auto *AbbreviationDecl = DIE.getAbbreviationDeclarationPtr(); if (!AbbreviationDecl) { if (opts::Verbosity >= 1) { errs() << "BOLT-WARNING: object's DIE doesn't have an abbreviation: " << "skipping update. DIE at offset 0x" - << Twine::utohexstr(DIE->getOffset()) << '\n'; + << Twine::utohexstr(DIE.getOffset()) << '\n'; } return; } auto AbbrevCode = AbbreviationDecl->getCode(); - if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_ranges) != -1U) { + if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_ranges)) { // Case 1: The object was already non-contiguous and had DW_AT_ranges. // In this case we simply need to update the value of DW_AT_ranges. 
- DWARFFormValue FormValue; uint32_t AttrOffset = -1U; - DIE->getAttributeValue(Unit, dwarf::DW_AT_ranges, FormValue, &AttrOffset); + DIE.find(dwarf::DW_AT_ranges, &AttrOffset); + assert(AttrOffset != -1U && "failed to locate DWARF attribute"); DebugInfoPatcher->addLE32Patch(AttrOffset, DebugRangesOffset); } else { // Case 2: The object has both DW_AT_low_pc and DW_AT_high_pc emitted back @@ -282,38 +282,37 @@ void RewriteInstance::updateDWARFObjectAddressRanges( // To fill in the gap we use a variable length DW_FORM_udata encoding for // DW_AT_low_pc. We exploit the fact that the encoding can take an arbitrary // large size. - if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_low_pc) != -1U && - AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_high_pc) != -1U) { + if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_low_pc) && + AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_high_pc)) { uint32_t LowPCOffset = -1U; uint32_t HighPCOffset = -1U; - DWARFFormValue LowPCFormValue; - DWARFFormValue HighPCFormValue; - DIE->getAttributeValue(Unit, dwarf::DW_AT_low_pc, LowPCFormValue, - &LowPCOffset); - DIE->getAttributeValue(Unit, dwarf::DW_AT_high_pc, HighPCFormValue, - &HighPCOffset); + DWARFFormValue LowPCFormValue = + *DIE.find(dwarf::DW_AT_low_pc, &LowPCOffset); + DWARFFormValue HighPCFormValue = + *DIE.find(dwarf::DW_AT_high_pc, &HighPCOffset); + if (LowPCFormValue.getForm() != dwarf::DW_FORM_addr || (HighPCFormValue.getForm() != dwarf::DW_FORM_addr && HighPCFormValue.getForm() != dwarf::DW_FORM_data8 && HighPCFormValue.getForm() != dwarf::DW_FORM_data4)) { errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE " - << "at offset 0x" << Twine::utohexstr(DIE->getOffset()) + << "at offset 0x" << Twine::utohexstr(DIE.getOffset()) << "\n"; return; } if (LowPCOffset == -1U || (LowPCOffset + 8 != HighPCOffset)) { errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. 
" << "Cannot update DIE at offset 0x" - << Twine::utohexstr(DIE->getOffset()) << '\n'; + << Twine::utohexstr(DIE.getOffset()) << '\n'; return; } - AbbrevPatcher->addAttributePatch(Unit, + AbbrevPatcher->addAttributePatch(DIE.getDwarfUnit(), AbbrevCode, dwarf::DW_AT_low_pc, dwarf::DW_AT_ranges, dwarf::DW_FORM_sec_offset); - AbbrevPatcher->addAttributePatch(Unit, + AbbrevPatcher->addAttributePatch(DIE.getDwarfUnit(), AbbrevCode, dwarf::DW_AT_high_pc, dwarf::DW_AT_low_pc, @@ -332,7 +331,7 @@ void RewriteInstance::updateDWARFObjectAddressRanges( } else { if (opts::Verbosity >= 1) { errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x" - << Twine::utohexstr(DIE->getOffset()) << '\n'; + << Twine::utohexstr(DIE.getOffset()) << '\n'; } } } @@ -378,7 +377,7 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { Row.Address); auto Loc = BC->Ctx->getCurrentDwarfLoc(); BC->Ctx->clearDwarfLocSeen(); - OutputLineTable.addLineEntry(MCLineEntry{nullptr, Loc}, + OutputLineTable.addLineEntry(MCDwarfLineEntry{nullptr, Loc}, FunctionSection); } // Add an empty entry past the end of the function @@ -387,7 +386,7 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { Address + Function.getMaxSize()); auto Loc = BC->Ctx->getCurrentDwarfLoc(); BC->Ctx->clearDwarfLocSeen(); - OutputLineTable.addLineEntry(MCLineEntry{nullptr, Loc}, + OutputLineTable.addLineEntry(MCDwarfLineEntry{nullptr, Loc}, FunctionSection); } else { DEBUG(dbgs() << "BOLT-DEBUG: Function " << Function @@ -397,7 +396,7 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { } void RewriteInstance::updateLineTableOffsets() { - const auto LineSection = + const auto *LineSection = BC->Ctx->getObjectFileInfo()->getDwarfLineSection(); auto CurrentFragment = LineSection->begin(); uint32_t CurrentOffset = 0; @@ -460,8 +459,8 @@ void RewriteInstance::finalizeDebugSections() { SmallVector ARangesBuffer; raw_svector_ostream OS(ARangesBuffer); - auto MAB = std::unique_ptr( - 
BC->TheTarget->createMCAsmBackend(*BC->MRI, BC->TripleName, "")); + auto MAB = std::unique_ptr(BC->TheTarget->createMCAsmBackend( + *BC->STI, *BC->MRI, MCTargetOptions())); auto Writer = std::unique_ptr(MAB->createObjectWriter(OS)); RangesSectionsWriter->writeArangesSection(Writer.get()); @@ -591,8 +590,8 @@ void RewriteInstance::updateGdbIndexSection() { const auto CUIndex = OffsetToIndexMap[CURangesPair.first]; const auto &Ranges = CURangesPair.second; for (const auto &Range : Ranges) { - write64le(Buffer, Range.first); - write64le(Buffer + 8, Range.second); + write64le(Buffer, Range.LowPC); + write64le(Buffer + 8, Range.HighPC); write32le(Buffer + 16, CUIndex); Buffer += 20; } diff --git a/bolt/DataAggregator.cpp b/bolt/DataAggregator.cpp index 1b039c44225a..32145fd32c73 100644 --- a/bolt/DataAggregator.cpp +++ b/bolt/DataAggregator.cpp @@ -46,7 +46,8 @@ TimeAggregator("time-aggr", namespace { -const char TimerGroupName[] = "Aggregator"; +const char TimerGroupName[] = "aggregator"; +const char TimerGroupDesc[] = "Aggregator"; } @@ -83,8 +84,6 @@ void DataAggregator::abort() { bool DataAggregator::launchPerfBranchEventsNoWait() { SmallVector Argv; - SmallVector Redirects; - SmallVector RedirectPtrs; outs() << "PERF2BOLT: Spawning perf-script job to read branch events\n"; Argv.push_back(PerfPath.data()); @@ -108,28 +107,23 @@ bool DataAggregator::launchPerfBranchEventsNoWait() { << PerfBranchEventsErrPath << " with error " << Errc.message() << "\n"; exit(1); } - - Redirects.push_back(""); // Stdin - Redirects.push_back(StringRef(PerfBranchEventsOutputPath.data())); // Stdout - Redirects.push_back(StringRef(PerfBranchEventsErrPath.data())); // Stderr - RedirectPtrs.push_back(&Redirects[0]); - RedirectPtrs.push_back(&Redirects[1]); - RedirectPtrs.push_back(&Redirects[2]); + Optional Redirects[] = { + llvm::None, // Stdin + StringRef(PerfBranchEventsOutputPath.data()), // Stdout + StringRef(PerfBranchEventsErrPath.data())}; // Stderr DEBUG(dbgs() << "Launching perf: 
" << PerfPath.data() << " 1> " << PerfBranchEventsOutputPath.data() << " 2> " << PerfBranchEventsErrPath.data() << "\n"); BranchEventsPI = sys::ExecuteNoWait(PerfPath.data(), Argv.data(), - /*envp*/ nullptr, &RedirectPtrs[0]); + /*envp*/ nullptr, Redirects); return true; } bool DataAggregator::launchPerfMemEventsNoWait() { SmallVector Argv; - SmallVector Redirects; - SmallVector RedirectPtrs; outs() << "PERF2BOLT: Spawning perf-script job to read mem events\n"; Argv.push_back(PerfPath.data()); @@ -154,27 +148,23 @@ bool DataAggregator::launchPerfMemEventsNoWait() { exit(1); } - Redirects.push_back(""); // Stdin - Redirects.push_back(StringRef(PerfMemEventsOutputPath.data())); // Stdout - Redirects.push_back(StringRef(PerfMemEventsErrPath.data())); // Stderr - RedirectPtrs.push_back(&Redirects[0]); - RedirectPtrs.push_back(&Redirects[1]); - RedirectPtrs.push_back(&Redirects[2]); + Optional Redirects[] = { + llvm::None, // Stdin + StringRef(PerfMemEventsOutputPath.data()), // Stdout + StringRef(PerfMemEventsErrPath.data())}; // Stderr DEBUG(dbgs() << "Launching perf: " << PerfPath.data() << " 1> " << PerfMemEventsOutputPath.data() << " 2> " << PerfMemEventsErrPath.data() << "\n"); MemEventsPI = sys::ExecuteNoWait(PerfPath.data(), Argv.data(), - /*envp*/ nullptr, &RedirectPtrs[0]); + /*envp*/ nullptr, Redirects); return true; } bool DataAggregator::launchPerfTasksNoWait() { SmallVector Argv; - SmallVector Redirects; - SmallVector RedirectPtrs; outs() << "PERF2BOLT: Spawning perf-script job to read tasks\n"; Argv.push_back(PerfPath.data()); @@ -198,27 +188,23 @@ bool DataAggregator::launchPerfTasksNoWait() { exit(1); } - Redirects.push_back(""); // Stdin - Redirects.push_back(StringRef(PerfTasksOutputPath.data())); // Stdout - Redirects.push_back(StringRef(PerfTasksErrPath.data())); // Stderr - RedirectPtrs.push_back(&Redirects[0]); - RedirectPtrs.push_back(&Redirects[1]); - RedirectPtrs.push_back(&Redirects[2]); + Optional Redirects[] = { + llvm::None, // Stdin + 
StringRef(PerfTasksOutputPath.data()), // Stdout + StringRef(PerfTasksErrPath.data())}; // Stderr DEBUG(dbgs() << "Launching perf: " << PerfPath.data() << " 1> " << PerfTasksOutputPath.data() << " 2> " << PerfTasksErrPath.data() << "\n"); TasksPI = sys::ExecuteNoWait(PerfPath.data(), Argv.data(), - /*envp*/ nullptr, &RedirectPtrs[0]); + /*envp*/ nullptr, Redirects); return true; } Optional DataAggregator::getPerfBuildID() { SmallVector Argv; - SmallVector Redirects; - SmallVector RedirectPtrs; SmallVector OutputPath; SmallVector ErrPath; @@ -242,19 +228,17 @@ Optional DataAggregator::getPerfBuildID() { exit(1); } - Redirects.push_back(""); // Stdin - Redirects.push_back(StringRef(OutputPath.data())); // Stdout - Redirects.push_back(StringRef(ErrPath.data())); // Stderr - RedirectPtrs.push_back(&Redirects[0]); - RedirectPtrs.push_back(&Redirects[1]); - RedirectPtrs.push_back(&Redirects[2]); + Optional Redirects[] = { + llvm::None, // Stdin + StringRef(OutputPath.data()), // Stdout + StringRef(ErrPath.data())}; // Stderr DEBUG(dbgs() << "Launching perf: " << PerfPath.data() << " 1> " << OutputPath.data() << " 2> " << ErrPath.data() << "\n"); auto RetCode = sys::ExecuteAndWait(PerfPath.data(), Argv.data(), - /*envp*/ nullptr, &RedirectPtrs[0]); + /*envp*/ nullptr, Redirects); if (RetCode != 0) { ErrorOr> MB = @@ -729,8 +713,8 @@ bool DataAggregator::hasData() { std::error_code DataAggregator::parseBranchEvents() { outs() << "PERF2BOLT: Aggregating branch events...\n"; - NamedRegionTimer T("Branch samples parsing", TimerGroupName, - opts::TimeAggregator); + NamedRegionTimer T("parseBranch", "Branch samples parsing", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); uint64_t NumEntries{0}; uint64_t NumSamples{0}; uint64_t NumTraces{0}; @@ -800,7 +784,8 @@ std::error_code DataAggregator::parseBranchEvents() { std::error_code DataAggregator::parseMemEvents() { outs() << "PERF2BOLT: Aggregating memory events...\n"; - NamedRegionTimer T("Mem samples parsing", 
TimerGroupName, opts::TimeAggregator); + NamedRegionTimer T("memevents", "Mem samples parsing", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); while (hasData()) { auto SampleRes = parseMemSample(); @@ -889,7 +874,8 @@ ErrorOr DataAggregator::parseTaskPID() { std::error_code DataAggregator::parseTasks() { outs() << "PERF2BOLT: Parsing perf-script tasks output\n"; - NamedRegionTimer T("Tasks parsing", TimerGroupName, opts::TimeAggregator); + NamedRegionTimer T("parseTasks", "Tasks parsing", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); while (hasData()) { auto PIDRes = parseTaskPID(); diff --git a/bolt/DebugData.cpp b/bolt/DebugData.cpp index 9b9600656809..3bc981d55e8f 100644 --- a/bolt/DebugData.cpp +++ b/bolt/DebugData.cpp @@ -43,9 +43,9 @@ uint64_t writeAddressRanges( const DWARFAddressRangesVector &AddressRanges, const bool WriteRelativeRanges = false) { for (auto &Range : AddressRanges) { - Writer->writeLE64(Range.first); - Writer->writeLE64(WriteRelativeRanges ? Range.second - Range.first - : Range.second); + Writer->writeLE64(Range.LowPC); + Writer->writeLE64(WriteRelativeRanges ? Range.HighPC - Range.LowPC + : Range.HighPC); } // Finish with 0 entries. 
Writer->writeLE64(0); @@ -202,13 +202,9 @@ void SimpleBinaryPatcher::addLEPatch(uint32_t Offset, uint64_t NewValue, } void SimpleBinaryPatcher::addUDataPatch(uint32_t Offset, uint64_t Value, uint64_t Size) { - const auto EncodedSize = getULEB128Size(Value); - assert(EncodedSize <= Size && "value did not fit"); - - const auto Padding = Size - EncodedSize; std::string Buff; raw_string_ostream OS(Buff); - encodeULEB128(Value, OS, Padding); + encodeULEB128(Value, OS, Size); Patches.emplace_back(Offset, OS.str()); } @@ -235,7 +231,7 @@ void SimpleBinaryPatcher::patchBinary(std::string &BinaryContents) { void DebugAbbrevPatcher::addAttributePatch(const DWARFUnit *Unit, uint32_t AbbrevCode, - uint16_t AttrTag, + dwarf::Attribute AttrTag, uint8_t NewAttrTag, uint8_t NewAttrForm) { assert(Unit && "No compile unit specified."); @@ -256,8 +252,8 @@ void DebugAbbrevPatcher::patchBinary(std::string &Contents) { const auto *AbbreviationDeclaration = UnitAbbreviations->getAbbreviationDeclaration(AttrPatch.Code); assert(AbbreviationDeclaration && "No abbreviation with given code."); - const auto *Attribute = AbbreviationDeclaration->findAttribute( - AttrPatch.Attr); + const auto Attribute = + AbbreviationDeclaration->findAttribute(AttrPatch.Attr); assert(Attribute && "Specified attribute doesn't occur in abbreviation."); // Because we're only handling standard values (i.e. no DW_FORM_GNU_* or diff --git a/bolt/DebugData.h b/bolt/DebugData.h index 6b46ef071635..9b8c18c663af 100644 --- a/bolt/DebugData.h +++ b/bolt/DebugData.h @@ -220,7 +220,7 @@ class DebugAbbrevPatcher : public BinaryPatcher { /// Patch of changing one attribute to another. struct AbbrevAttrPatch { uint32_t Code; // Code of abbreviation to be modified. - uint16_t Attr; // ID of attribute to be replaced. + dwarf::Attribute Attr; // ID of attribute to be replaced. uint8_t NewAttr; // ID of the new attribute. uint8_t NewForm; // Form of the new attribute. 
}; @@ -238,7 +238,7 @@ class DebugAbbrevPatcher : public BinaryPatcher { /// We only handle standard forms, that are encoded in a single byte. void addAttributePatch(const DWARFUnit *Unit, uint32_t AbbrevCode, - uint16_t AttrTag, + dwarf::Attribute AttrTag, uint8_t NewAttrTag, uint8_t NewAttrForm); diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 34568e113a81..358641baf9a0 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -19,12 +19,12 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" -#include "llvm/DebugInfo/DWARF/DWARFFrame.h" +#include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Dwarf.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -61,7 +61,7 @@ unsigned getEncodingSize(unsigned Encoding, BinaryContext &BC) { default: llvm_unreachable("unknown encoding"); case dwarf::DW_EH_PE_absptr: case dwarf::DW_EH_PE_signed: - return BC.AsmInfo->getPointerSize(); + return BC.AsmInfo->getCodePointerSize(); case dwarf::DW_EH_PE_udata2: case dwarf::DW_EH_PE_sdata2: return 2; @@ -133,30 +133,24 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, if (!getLSDAAddress()) return; - assert(getLSDAAddress() < LSDASectionAddress + LSDASectionData.size() && - "wrong LSDA address"); + DWARFDataExtractor Data( + StringRef(reinterpret_cast(LSDASectionData.data()), + LSDASectionData.size()), + BC.DwCtx->getDWARFObj().isLittleEndian(), 8); + uint32_t Offset = getLSDAAddress() - LSDASectionAddress; + assert(Data.isValidOffset(Offset) && "wrong LSDA address"); - // Given an address in memory corresponding to some entity in mapped - // LSDA section return address of this entity in a binary file. 
- auto getFileAddress = [&](const uint8_t *InMemAddress) { - return InMemAddress - LSDASectionData.data() + LSDASectionAddress; - }; - const uint8_t *Ptr = - LSDASectionData.data() + getLSDAAddress() - LSDASectionAddress; - - uint8_t LPStartEncoding = *Ptr++; - uintptr_t LPStart = 0; - if (LPStartEncoding != DW_EH_PE_omit) { - LPStart = readEncodedPointer(Ptr, LPStartEncoding, getFileAddress(Ptr)); - } + uint8_t LPStartEncoding = Data.getU8(&Offset); + uint64_t LPStart = Data.getEncodedPointer(&Offset, LPStartEncoding, + Offset + LSDASectionAddress); assert(LPStart == 0 && "support for split functions not implemented"); - const auto TTypeEncoding = *Ptr++; + const auto TTypeEncoding = Data.getU8(&Offset); size_t TTypeEncodingSize = 0; uintptr_t TTypeEnd = 0; if (TTypeEncoding != DW_EH_PE_omit) { - TTypeEnd = readULEB128(Ptr); + TTypeEnd = Data.getULEB128(&Offset); TTypeEncodingSize = getEncodingSize(TTypeEncoding, BC); } @@ -171,24 +165,24 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, } // Table to store list of indices in type table. Entries are uleb128 values. - auto TypeIndexTableStart = Ptr + TTypeEnd; + const uint32_t TypeIndexTableStart = Offset + TTypeEnd; // Offset past the last decoded index. - intptr_t MaxTypeIndexTableOffset = 0; + uint32_t MaxTypeIndexTableOffset = 0; // Max positive index used in type table. unsigned MaxTypeIndex = 0; // The actual type info table starts at the same location, but grows in // opposite direction. TTypeEncoding is used to encode stored values. 
- const auto TypeTableStart = Ptr + TTypeEnd; + const auto TypeTableStart = Offset + TTypeEnd; - uint8_t CallSiteEncoding = *Ptr++; - uint32_t CallSiteTableLength = readULEB128(Ptr); - const uint8_t *CallSiteTableStart = Ptr; - const uint8_t *CallSiteTableEnd = CallSiteTableStart + CallSiteTableLength; - const uint8_t *CallSitePtr = CallSiteTableStart; - const uint8_t *ActionTableStart = CallSiteTableEnd; + uint8_t CallSiteEncoding = Data.getU8(&Offset); + uint32_t CallSiteTableLength = Data.getULEB128(&Offset); + auto CallSiteTableStart = Offset; + auto CallSiteTableEnd = CallSiteTableStart + CallSiteTableLength; + auto CallSitePtr = CallSiteTableStart; + auto ActionTableStart = CallSiteTableEnd; if (opts::PrintExceptions) { outs() << "CallSite Encoding = " << (unsigned)CallSiteEncoding << '\n'; @@ -199,13 +193,13 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, HasEHRanges = CallSitePtr < CallSiteTableEnd; uint64_t RangeBase = getAddress(); while (CallSitePtr < CallSiteTableEnd) { - uintptr_t Start = readEncodedPointer(CallSitePtr, CallSiteEncoding, - getFileAddress(CallSitePtr)); - uintptr_t Length = readEncodedPointer(CallSitePtr, CallSiteEncoding, - getFileAddress(CallSitePtr)); - uintptr_t LandingPad = readEncodedPointer(CallSitePtr, CallSiteEncoding, - getFileAddress(CallSitePtr)); - uintptr_t ActionEntry = readULEB128(CallSitePtr); + uintptr_t Start = Data.getEncodedPointer(&CallSitePtr, CallSiteEncoding, + CallSitePtr + LSDASectionAddress); + uintptr_t Length = Data.getEncodedPointer(&CallSitePtr, CallSiteEncoding, + CallSitePtr + LSDASectionAddress); + uintptr_t LandingPad = Data.getEncodedPointer( + &CallSitePtr, CallSiteEncoding, CallSitePtr + LSDASectionAddress); + uintptr_t ActionEntry = Data.getULEB128(&CallSitePtr); if (opts::PrintExceptions) { outs() << "Call Site: [0x" << Twine::utohexstr(RangeBase + Start) @@ -258,11 +252,10 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, if (ActionEntry != 0) { auto printType = [&] (int 
Index, raw_ostream &OS) { assert(Index > 0 && "only positive indices are valid"); - const uint8_t *TTEntry = TypeTableStart - Index * TTypeEncodingSize; - const auto TTEntryAddress = getFileAddress(TTEntry); - auto TypeAddress = readEncodedPointer(TTEntry, - TTypeEncoding, - TTEntryAddress); + uint32_t TTEntry = TypeTableStart - Index * TTypeEncodingSize; + const auto TTEntryAddress = TTEntry + LSDASectionAddress; + auto TypeAddress = + Data.getEncodedPointer(&TTEntry, TTypeEncoding, TTEntryAddress); if ((TTypeEncoding & DW_EH_PE_pcrel) && (TypeAddress == TTEntryAddress)) { TypeAddress = 0; @@ -285,14 +278,14 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, }; if (opts::PrintExceptions) outs() << " actions: "; - const uint8_t *ActionPtr = ActionTableStart + ActionEntry - 1; + uint32_t ActionPtr = ActionTableStart + ActionEntry - 1; long long ActionType; long long ActionNext; auto Sep = ""; do { - ActionType = readSLEB128(ActionPtr); + ActionType = Data.getSLEB128(&ActionPtr); auto Self = ActionPtr; - ActionNext = readSLEB128(ActionPtr); + ActionNext = Data.getSLEB128(&ActionPtr); if (opts::PrintExceptions) outs() << Sep << "(" << ActionType << ", " << ActionNext << ") "; if (ActionType == 0) { @@ -314,8 +307,8 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, // of indices with base 1. // E.g. -1 means offset 0, -2 is offset 1, etc. The indices are // encoded using uleb128 thus we cannot directly dereference them. 
- auto TypeIndexTablePtr = TypeIndexTableStart - ActionType - 1; - while (auto Index = readULEB128(TypeIndexTablePtr)) { + uint32_t TypeIndexTablePtr = TypeIndexTableStart - ActionType - 1; + while (auto Index = Data.getULEB128(&TypeIndexTablePtr)) { MaxTypeIndex = std::max(MaxTypeIndex, static_cast(Index)); if (opts::PrintExceptions) { outs() << TSep; @@ -340,22 +333,20 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, outs() << '\n'; assert(TypeIndexTableStart + MaxTypeIndexTableOffset <= - LSDASectionData.data() + LSDASectionData.size() && + Data.getData().size() && "LSDA entry has crossed section boundary"); if (TTypeEnd) { - // TypeIndexTableStart is a alias for TypeTableStart. - LSDAActionTable = - ArrayRef(ActionTableStart, TypeIndexTableStart - - MaxTypeIndex * TTypeEncodingSize - ActionTableStart); + LSDAActionTable = LSDASectionData.slice( + ActionTableStart, TypeIndexTableStart - + MaxTypeIndex * TTypeEncodingSize - + ActionTableStart); for (unsigned Index = 1; Index <= MaxTypeIndex; ++Index) { - const uint8_t *TTEntry = TypeTableStart - Index * TTypeEncodingSize; - const auto TTEntryAddress = getFileAddress(TTEntry); - auto TypeAddress = readEncodedPointer(TTEntry, - TTypeEncoding, - TTEntryAddress); - if ((TTypeEncoding & DW_EH_PE_pcrel) && - (TypeAddress == TTEntryAddress)) { + uint32_t TTEntry = TypeTableStart - Index * TTypeEncodingSize; + const auto TTEntryAddress = TTEntry + LSDASectionAddress; + auto TypeAddress = + Data.getEncodedPointer(&TTEntry, TTypeEncoding, TTEntryAddress); + if ((TTypeEncoding & DW_EH_PE_pcrel) && (TypeAddress == TTEntryAddress)) { TypeAddress = 0; } if (TypeAddress && @@ -367,7 +358,7 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, LSDATypeTable.emplace_back(TypeAddress); } LSDATypeIndexTable = - ArrayRef(TypeIndexTableStart, MaxTypeIndexTableOffset); + LSDASectionData.slice(TypeIndexTableStart, MaxTypeIndexTableOffset); } } @@ -595,7 +586,7 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer, bool 
EmitColdPart) { // Account for any extra padding that will be added to the call site table // length. - Streamer->EmitULEB128IntValue(TTypeBaseOffset, SizeAlign); + Streamer->EmitPaddedULEB128IntValue(TTypeBaseOffset, SizeAlign); // Emit the landing pad call site table. We use signed data4 since we can emit // a landing pad in a different part of the split function that could appear @@ -684,21 +675,22 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { return false; } - Function.setLSDAAddress(CurFDE.getLSDAAddress()); + auto LSDA = CurFDE.getLSDAAddress(); + Function.setLSDAAddress(LSDA ? *LSDA : 0); uint64_t Offset = 0; uint64_t CodeAlignment = CurFDE.getLinkedCIE()->getCodeAlignmentFactor(); uint64_t DataAlignment = CurFDE.getLinkedCIE()->getDataAlignmentFactor(); - if (CurFDE.getLinkedCIE()->getPersonalityAddress() != 0) { + if (CurFDE.getLinkedCIE()->getPersonalityAddress()) { Function.setPersonalityFunction( - CurFDE.getLinkedCIE()->getPersonalityAddress()); + *CurFDE.getLinkedCIE()->getPersonalityAddress()); Function.setPersonalityEncoding( - CurFDE.getLinkedCIE()->getPersonalityEncoding()); + *CurFDE.getLinkedCIE()->getPersonalityEncoding()); } auto decodeFrameInstruction = [&Function, &Offset, Address, CodeAlignment, DataAlignment]( - const FrameEntry::Instruction &Instr) { + const CFIProgram::Instruction &Instr) { uint8_t Opcode = Instr.Opcode; if (Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK) Opcode &= DWARF_CFI_PRIMARY_OPCODE_MASK; @@ -854,12 +846,12 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { return true; }; - for (const FrameEntry::Instruction &Instr : *(CurFDE.getLinkedCIE())) { + for (const CFIProgram::Instruction &Instr : CurFDE.getLinkedCIE()->cfis()) { if (!decodeFrameInstruction(Instr)) return false; } - for (const FrameEntry::Instruction &Instr : CurFDE) { + for (const CFIProgram::Instruction &Instr : CurFDE.cfis()) { if (!decodeFrameInstruction(Instr)) return false; } @@ -868,8 +860,8 @@ bool 
CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { } std::vector CFIReaderWriter::generateEHFrameHeader( - const DWARFFrame &OldEHFrame, - const DWARFFrame &NewEHFrame, + const DWARFDebugFrame &OldEHFrame, + const DWARFDebugFrame &NewEHFrame, uint64_t EHFrameHeaderAddress, std::vector &FailedAddresses) const { // Common PC -> FDE map to be written into .eh_frame_hdr. @@ -881,7 +873,7 @@ std::vector CFIReaderWriter::generateEHFrameHeader( // Initialize PCToFDE using NewEHFrame. NewEHFrame.for_each_FDE([&](const dwarf::FDE *FDE) { const auto FuncAddress = FDE->getInitialLocation(); - const auto FDEAddress = NewEHFrame.EHFrameAddress + FDE->getOffset(); + const auto FDEAddress = NewEHFrame.getEHFrameAddress() + FDE->getOffset(); // Ignore unused FDEs. if (FuncAddress == 0) @@ -898,13 +890,15 @@ std::vector CFIReaderWriter::generateEHFrameHeader( }); DEBUG(dbgs() << "BOLT-DEBUG: new .eh_frame contains " - << NewEHFrame.Entries.size() << " entries\n"); + << std::distance(NewEHFrame.entries().begin(), + NewEHFrame.entries().end()) + << " entries\n"); // Add entries from the original .eh_frame corresponding to the functions // that we did not update. OldEHFrame.for_each_FDE([&](const dwarf::FDE *FDE) { const auto FuncAddress = FDE->getInitialLocation(); - const auto FDEAddress = OldEHFrame.EHFrameAddress + FDE->getOffset(); + const auto FDEAddress = OldEHFrame.getEHFrameAddress() + FDE->getOffset(); // Add the address if we failed to write it. if (PCToFDE.count(FuncAddress) == 0) { @@ -916,7 +910,9 @@ std::vector CFIReaderWriter::generateEHFrameHeader( }); DEBUG(dbgs() << "BOLT-DEBUG: old .eh_frame contains " - << OldEHFrame.Entries.size() << " entries\n"); + << std::distance(OldEHFrame.entries().begin(), + OldEHFrame.entries().end()) + << " entries\n"); // Generate a new .eh_frame_hdr based on the new map. @@ -934,7 +930,7 @@ std::vector CFIReaderWriter::generateEHFrameHeader( // Address of eh_frame. Use the new one. 
support::ulittle32_t::ref(EHFrameHeader.data() + 4) = - NewEHFrame.EHFrameAddress - (EHFrameHeaderAddress + 4); + NewEHFrame.getEHFrameAddress() - (EHFrameHeaderAddress + 4); // Number of entries in the table (FDE count). support::ulittle32_t::ref(EHFrameHeader.data() + 8) = PCToFDE.size(); diff --git a/bolt/Exceptions.h b/bolt/Exceptions.h index b38cb5c5975f..fe0862f8fe84 100644 --- a/bolt/Exceptions.h +++ b/bolt/Exceptions.h @@ -14,7 +14,7 @@ #include "BinaryContext.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/DebugInfo/DWARF/DWARFFrame.h" +#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h" #include "llvm/Support/Casting.h" #include @@ -28,10 +28,10 @@ class RewriteInstance; /// BinaryFunction, as well as rewriting CFI sections. class CFIReaderWriter { public: - explicit CFIReaderWriter(const DWARFFrame &EHFrame) { + explicit CFIReaderWriter(const DWARFDebugFrame &EHFrame) { // Prepare FDEs for fast lookup - for (const auto &Entry : EHFrame.Entries) { - const auto *CurFDE = dyn_cast(Entry.get()); + for (const auto &Entry : EHFrame.entries()) { + const auto *CurFDE = dyn_cast(&Entry); // Skip CIEs. if (!CurFDE) continue; @@ -69,8 +69,8 @@ class CFIReaderWriter { /// \p EHFrameHeaderAddress specifies location of .eh_frame_hdr, /// and is required for relative addressing used in the section. 
std::vector generateEHFrameHeader( - const DWARFFrame &OldEHFrame, - const DWARFFrame &NewEHFrame, + const DWARFDebugFrame &OldEHFrame, + const DWARFDebugFrame &NewEHFrame, uint64_t EHFrameHeaderAddress, std::vector &FailedAddresses) const; diff --git a/bolt/Passes/BinaryFunctionCallGraph.cpp b/bolt/Passes/BinaryFunctionCallGraph.cpp index 2b49f323cfab..706caf9337ed 100644 --- a/bolt/Passes/BinaryFunctionCallGraph.cpp +++ b/bolt/Passes/BinaryFunctionCallGraph.cpp @@ -37,8 +37,8 @@ CallGraph::NodeId BinaryFunctionCallGraph::addNode(BinaryFunction *BF, } std::deque BinaryFunctionCallGraph::buildTraversalOrder() { - NamedRegionTimer T1("Build cg traversal order", "CG breakdown", - opts::TimeOpts); + NamedRegionTimer T1("buildcgorder", "Build cg traversal order", + "CG breakdown", "CG breakdown", opts::TimeOpts); std::deque TopologicalOrder; enum NodeStatus { NEW, VISITING, VISITED }; std::vector NodeStatus(Funcs.size()); @@ -85,7 +85,8 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, bool UseSplitHotSize, bool UseEdgeCounts, bool IgnoreRecursiveCalls) { - NamedRegionTimer T1("Callgraph construction", "CG breakdown", opts::TimeOpts); + NamedRegionTimer T1("buildcg", "Callgraph construction", "CG breakdown", + "CG breakdown", opts::TimeOpts); BinaryFunctionCallGraph Cg; static constexpr auto COUNT_NO_PROFILE = BinaryBasicBlock::COUNT_NO_PROFILE; @@ -161,7 +162,7 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, }); return true; } - + return false; }; diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index f27625133092..d2c5402c285d 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -111,8 +111,7 @@ Peepholes("peepholes", clEnumValN(PEEP_TAILCALL_TRAPS, "tailcall-traps", "insert tail call traps"), clEnumValN(PEEP_USELESS_BRANCHES, "useless-branches", "remove useless conditional branches"), - clEnumValN(PEEP_ALL, "all", "enable all peephole optimizations"), - clEnumValEnd), + clEnumValN(PEEP_ALL, 
"all", "enable all peephole optimizations")), cl::ZeroOrMore, cl::cat(BoltOptCategory)); @@ -135,7 +134,8 @@ PrintSortedBy("print-sorted-by", dynoStatsOptDesc(bolt::DynoStats::name)), DYNO_STATS #undef D - clEnumValEnd), + clEnumValN(0xffff, ".", ".") + ), cl::ZeroOrMore, cl::cat(BoltOptCategory)); @@ -166,8 +166,7 @@ ReorderBlocks("reorder-blocks", "perform layout optimizing I-cache behavior"), clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE, "cluster-shuffle", - "perform random layout of clusters"), - clEnumValEnd), + "perform random layout of clusters")), cl::ZeroOrMore, cl::cat(BoltOptCategory)); @@ -196,8 +195,7 @@ SctcMode("sctc-mode", "preserved"), clEnumValN(SctcHeuristic, "heuristic", - "use branch prediction data to control sctc"), - clEnumValEnd), + "use branch prediction data to control sctc")), cl::ZeroOrMore, cl::cat(BoltOptCategory)); @@ -1539,16 +1537,10 @@ void StripRepRet::runOnFunctions( for (auto &BB : BFI.second) { auto LastInstRIter = BB.getLastNonPseudo(); if (LastInstRIter == BB.rend() || - !BC.MIA->isReturn(*LastInstRIter)) + !BC.MIA->isReturn(*LastInstRIter) || + !BC.MIA->deleteREPPrefix(*LastInstRIter)) continue; - auto NextToLastInstRIter = std::next(LastInstRIter); - if (NextToLastInstRIter == BB.rend() || - !BC.MIA->isPrefix(*NextToLastInstRIter)) - continue; - - BB.eraseInstruction(std::next(NextToLastInstRIter).base()); - NumPrefixesRemoved += BB.getKnownExecutionCount(); ++NumBytesSaved; } diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt index 0255e7b40048..147395627aed 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/Passes/CMakeLists.txt @@ -30,6 +30,9 @@ add_llvm_library(LLVMBOLTPasses StackPointerTracking.cpp StackReachingUses.cpp StokeInfo.cpp + + DEPENDS + intrinsics_gen ) include_directories( ${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt ) diff --git a/bolt/Passes/CallGraphWalker.cpp b/bolt/Passes/CallGraphWalker.cpp index 720dc6c9d9a4..d3ebd6b8166d 100644 --- a/bolt/Passes/CallGraphWalker.cpp +++ 
b/bolt/Passes/CallGraphWalker.cpp @@ -10,7 +10,8 @@ namespace llvm { namespace bolt { void CallGraphWalker::traverseCG() { - NamedRegionTimer T1("CG Traversal", "CG breakdown", opts::TimeOpts); + NamedRegionTimer T1("CG Traversal", "CG Traversal", "CG breakdown", + "CG breakdown", opts::TimeOpts); std::queue Queue; std::set InQueue; diff --git a/bolt/Passes/DominatorAnalysis.h b/bolt/Passes/DominatorAnalysis.h index 5586dac3043b..f807c577eb52 100644 --- a/bolt/Passes/DominatorAnalysis.h +++ b/bolt/Passes/DominatorAnalysis.h @@ -104,7 +104,8 @@ class DominatorAnalysis } void run() { - NamedRegionTimer T1("DA", "Dataflow", opts::TimeOpts); + NamedRegionTimer T1("DA", "Dominator Analysis", "Dataflow", "Dataflow", + opts::TimeOpts); InstrsDataflowAnalysis, Backward>::run(); } diff --git a/bolt/Passes/FrameAnalysis.cpp b/bolt/Passes/FrameAnalysis.cpp index a4e157192775..21223b38bca3 100644 --- a/bolt/Passes/FrameAnalysis.cpp +++ b/bolt/Passes/FrameAnalysis.cpp @@ -152,7 +152,8 @@ class FrameAccessAnalysis { FrameAccessAnalysis(const BinaryContext &BC, BinaryFunction &BF) : SPT(BC, BF), BC(BC), BF(BF) { { - NamedRegionTimer T1("SPT", "Dataflow", opts::TimeOpts); + NamedRegionTimer T1("SPT", "Stack Pointer Tracking", "Dataflow", + "Dataflow", opts::TimeOpts); SPT.run(); } } @@ -519,8 +520,8 @@ FrameAnalysis::FrameAnalysis(BinaryContext &BC, } { - NamedRegionTimer T1("restore frame index", "FOP breakdown", - opts::TimeOpts); + NamedRegionTimer T1("restorefi", "restore frame index", "FOP", + "FOP breakdown", opts::TimeOpts); if (!restoreFrameIndex(I.second)) { ++NumFunctionsFailedRestoreFI; auto Count = I.second.getExecutionCount(); diff --git a/bolt/Passes/FrameOptimizer.cpp b/bolt/Passes/FrameOptimizer.cpp index 30b3c8410e9e..7f0e5215e695 100644 --- a/bolt/Passes/FrameOptimizer.cpp +++ b/bolt/Passes/FrameOptimizer.cpp @@ -35,8 +35,7 @@ FrameOptimization("frame-opt", cl::values( clEnumValN(FOP_NONE, "none", "do not perform frame optimization"), clEnumValN(FOP_HOT, "hot", 
"perform FOP on hot functions"), - clEnumValN(FOP_ALL, "all", "perform FOP on all functions"), - clEnumValEnd), + clEnumValN(FOP_ALL, "all", "perform FOP on all functions")), cl::ZeroOrMore, cl::cat(BoltOptCategory)); @@ -249,18 +248,21 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC, << BC.getHotThreshold() << " )\n"); } { - NamedRegionTimer T1("remove loads", "FOP breakdown", opts::TimeOpts); + NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown", + opts::TimeOpts); removeUnnecessaryLoads(RA, FA, BC, I.second); } if (opts::RemoveStores) { - NamedRegionTimer T1("remove stores", "FOP breakdown", opts::TimeOpts); + NamedRegionTimer T1("removestores", "remove stores", "FOP", + "FOP breakdown", opts::TimeOpts); removeUnusedStores(FA, BC, I.second); } // Don't even start shrink wrapping if no profiling info is available if (I.second.getKnownExecutionCount() == 0) continue; { - NamedRegionTimer T1("move spills", "FOP breakdown", opts::TimeOpts); + NamedRegionTimer T1("movespills", "move spills", "FOP", "FOP breakdown", + opts::TimeOpts); DataflowInfoManager Info(BC, I.second, &RA, &FA); ShrinkWrapping SW(FA, BC, I.second, Info); SW.perform(); diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/Passes/IndirectCallPromotion.cpp index 5bb0044e0e47..47a8cd6fe53d 100644 --- a/bolt/Passes/IndirectCallPromotion.cpp +++ b/bolt/Passes/IndirectCallPromotion.cpp @@ -35,8 +35,7 @@ IndirectCallPromotion("indirect-call-promotion", clEnumValN(ICP_NONE, "none", "do not perform indirect call promotion"), clEnumValN(ICP_CALLS, "calls", "perform ICP on indirect calls"), clEnumValN(ICP_JUMP_TABLES, "jump-tables", "perform ICP on jump tables"), - clEnumValN(ICP_ALL, "all", "perform ICP on calls and jump tables"), - clEnumValEnd), + clEnumValN(ICP_ALL, "all", "perform ICP on calls and jump tables")), cl::ZeroOrMore, cl::cat(BoltOptCategory)); @@ -174,7 +173,7 @@ IndirectCallPromotion::getCallTargets( const auto *JI = JT->Counts.empty() ? 
&DefaultJI : &JT->Counts[Range.first]; const size_t JIAdj = JT->Counts.empty() ? 0 : 1; assert(JT->Type == BinaryFunction::JumpTable::JTT_PIC || - JT->EntrySize == BC.AsmInfo->getPointerSize()); + JT->EntrySize == BC.AsmInfo->getCodePointerSize()); for (size_t I = Range.first; I < Range.second; ++I, JI += JIAdj) { auto *Entry = JT->Entries[I]; assert(BF.getBasicBlockForLabel(Entry) || @@ -307,16 +306,9 @@ IndirectCallPromotion::maybeGetHotJumpTableTargets( int64_t DispValue; const MCExpr *DispExpr; MutableArrayRef Insts(&BB->front(), &CallInst); - const auto Type = BC.MIA->analyzeIndirectBranch(CallInst, - Insts.begin(), - Insts.end(), - BC.AsmInfo->getPointerSize(), - MemLocInstr, - BaseReg, - IndexReg, - DispValue, - DispExpr, - PCRelBaseOut); + const auto Type = BC.MIA->analyzeIndirectBranch( + CallInst, Insts.begin(), Insts.end(), BC.AsmInfo->getCodePointerSize(), + MemLocInstr, BaseReg, IndexReg, DispValue, DispExpr, PCRelBaseOut); assert(MemLocInstr && "There should always be a load for jump tables"); if (!MemLocInstr) @@ -665,7 +657,7 @@ IndirectCallPromotion::maybeGetVtableAddrs( return MethodInfoType(VtableAddrs, MethodFetchInsns); } - + std::vector> IndirectCallPromotion::rewriteCall( BinaryContext &BC, @@ -1201,7 +1193,7 @@ void IndirectCallPromotion::runOnFunctions( for (const auto &BInfo : getCallTargets(Function, Inst)) { NumCalls += BInfo.Branches; } - + IndirectCalls.push_back(std::make_pair(NumCalls, &Inst)); TotalIndirectCalls += NumCalls; } diff --git a/bolt/Passes/LivenessAnalysis.h b/bolt/Passes/LivenessAnalysis.h index 1b22d0a9d66b..fa8347e44d5b 100644 --- a/bolt/Passes/LivenessAnalysis.h +++ b/bolt/Passes/LivenessAnalysis.h @@ -50,7 +50,8 @@ class LivenessAnalysis } void run() { - NamedRegionTimer T1("LA", "Dataflow", opts::TimeOpts); + NamedRegionTimer T1("LA", "Liveness Analysis", "Dataflow", "Dataflow", + opts::TimeOpts); Parent::run(); } diff --git a/bolt/Passes/LongJmp.cpp b/bolt/Passes/LongJmp.cpp index 7a2beaee55c0..5b2e81ff476b 100644 
--- a/bolt/Passes/LongJmp.cpp +++ b/bolt/Passes/LongJmp.cpp @@ -130,7 +130,7 @@ LongJmpPass::replaceTargetWithStub(const BinaryContext &BC, } BC.MIA->replaceBranchTarget(Inst, StubSymbol, BC.Ctx.get()); ++StubRefCount[StubBB]; - StubBits[StubBB] = BC.AsmInfo->getPointerSize() * 8; + StubBits[StubBB] = BC.AsmInfo->getCodePointerSize() * 8; if (NewBB) { if (BB.isCold()) @@ -216,7 +216,7 @@ uint64_t LongJmpPass::tentativeLayoutRelocColdPart( for (auto Func : SortedFunctions) { if (!Func->isSplit()) continue; - DotAddress = RoundUpToAlignment(DotAddress, BinaryFunction::MinAlign); + DotAddress = alignTo(DotAddress, BinaryFunction::MinAlign); auto Pad = OffsetToAlignment(DotAddress, opts::AlignFunctions); if (Pad <= opts::AlignFunctionsMaxBytes) DotAddress += Pad; @@ -253,7 +253,7 @@ uint64_t LongJmpPass::tentativeLayoutRelocMode( ColdLayoutDone = true; } - DotAddress = RoundUpToAlignment(DotAddress, BinaryFunction::MinAlign); + DotAddress = alignTo(DotAddress, BinaryFunction::MinAlign); auto Pad = OffsetToAlignment(DotAddress, opts::AlignFunctions); if (Pad <= opts::AlignFunctionsMaxBytes) DotAddress += Pad; @@ -282,7 +282,7 @@ void LongJmpPass::tentativeLayout( if (!BC.HasRelocations) { for (auto Func : SortedFunctions) { HotAddresses[Func] = Func->getAddress(); - DotAddress = RoundUpToAlignment(DotAddress, ColdFragAlign); + DotAddress = alignTo(DotAddress, ColdFragAlign); ColdAddresses[Func] = DotAddress; if (Func->isSplit()) DotAddress += Func->estimateColdSize(); @@ -303,7 +303,7 @@ void LongJmpPass::tentativeLayout( DotAddress += Pad; } } else { - DotAddress = RoundUpToAlignment(BC.LayoutStartAddress, PageAlign); + DotAddress = alignTo(BC.LayoutStartAddress, PageAlign); } tentativeLayoutRelocMode(BC, SortedFunctions, DotAddress); diff --git a/bolt/Passes/PLTCall.cpp b/bolt/Passes/PLTCall.cpp index 78eba87dc1e3..219ecc10f363 100644 --- a/bolt/Passes/PLTCall.cpp +++ b/bolt/Passes/PLTCall.cpp @@ -34,8 +34,7 @@ PLT("plt", "optimize executed (hot) PLT calls"), 
clEnumValN(bolt::PLTCall::OT_ALL, "all", - "optimize all PLT calls"), - clEnumValEnd), + "optimize all PLT calls")), cl::ZeroOrMore, cl::cat(BoltOptCategory)); diff --git a/bolt/Passes/ReachingDefOrUse.h b/bolt/Passes/ReachingDefOrUse.h index 2113eb4590b6..01ed496106fb 100644 --- a/bolt/Passes/ReachingDefOrUse.h +++ b/bolt/Passes/ReachingDefOrUse.h @@ -58,7 +58,8 @@ class ReachingDefOrUse } void run() { - NamedRegionTimer T1("RD", "Dataflow", opts::TimeOpts); + NamedRegionTimer T1("RD", "Reaching Defs", "Dataflow", "Dataflow", + opts::TimeOpts); InstrsDataflowAnalysis, !Def>::run(); } diff --git a/bolt/Passes/ReachingInsns.h b/bolt/Passes/ReachingInsns.h index 047c6e3154ce..3706380042b5 100644 --- a/bolt/Passes/ReachingInsns.h +++ b/bolt/Passes/ReachingInsns.h @@ -46,7 +46,8 @@ class ReachingInsns } void run() { - NamedRegionTimer T1("RI", "Dataflow", opts::TimeOpts); + NamedRegionTimer T1("RI", "Reaching Insns", "Dataflow", "Dataflow", + opts::TimeOpts); InstrsDataflowAnalysis, Backward>::run(); } diff --git a/bolt/Passes/ReorderFunctions.cpp b/bolt/Passes/ReorderFunctions.cpp index 3535588773d5..fe4b14466290 100644 --- a/bolt/Passes/ReorderFunctions.cpp +++ b/bolt/Passes/ReorderFunctions.cpp @@ -51,8 +51,7 @@ ReorderFunctions("reorder-functions", "reorder functions randomly"), clEnumValN(bolt::ReorderFunctions::RT_USER, "user", - "use function order specified by -function-order"), - clEnumValEnd), + "use function order specified by -function-order")), cl::cat(BoltOptCategory)); static cl::opt diff --git a/bolt/Passes/ShrinkWrapping.cpp b/bolt/Passes/ShrinkWrapping.cpp index 56c57b5507a9..a3bed4802b0c 100644 --- a/bolt/Passes/ShrinkWrapping.cpp +++ b/bolt/Passes/ShrinkWrapping.cpp @@ -1366,7 +1366,8 @@ class PredictiveStackPointerTracking TodoMap(TodoMap), Info(Info) {} void run() { - NamedRegionTimer T1("PSPT", "Dataflow", opts::TimeOpts); + NamedRegionTimer T1("PSPT", "Predictive Stack Pointer Tracking", "Dataflow", + "Dataflow", opts::TimeOpts); 
StackPointerTrackingBase::run(); } }; diff --git a/bolt/Passes/StackAllocationAnalysis.h b/bolt/Passes/StackAllocationAnalysis.h index 22291448763d..19df8115132b 100644 --- a/bolt/Passes/StackAllocationAnalysis.h +++ b/bolt/Passes/StackAllocationAnalysis.h @@ -41,7 +41,8 @@ class StackAllocationAnalysis virtual ~StackAllocationAnalysis() {} void run() { - NamedRegionTimer T1("SAA", "Dataflow", opts::TimeOpts); + NamedRegionTimer T1("SAA", "Stack Allocation Analysis", "Dataflow", + "Dataflow", opts::TimeOpts); InstrsDataflowAnalysis::run(); } diff --git a/bolt/Passes/StackAvailableExpressions.h b/bolt/Passes/StackAvailableExpressions.h index 8291ce52100a..c5b19d342180 100644 --- a/bolt/Passes/StackAvailableExpressions.h +++ b/bolt/Passes/StackAvailableExpressions.h @@ -36,7 +36,8 @@ class StackAvailableExpressions virtual ~StackAvailableExpressions() {} void run() { - NamedRegionTimer T1("SAE", "Dataflow", opts::TimeOpts); + NamedRegionTimer T1("SAE", "Stack Available Expressions", "Dataflow", + "Dataflow", opts::TimeOpts); InstrsDataflowAnalysis::run(); } diff --git a/bolt/Passes/StackPointerTracking.h b/bolt/Passes/StackPointerTracking.h index cec3244f298f..3438a07a3225 100644 --- a/bolt/Passes/StackPointerTracking.h +++ b/bolt/Passes/StackPointerTracking.h @@ -204,7 +204,8 @@ class StackPointerTracking virtual ~StackPointerTracking() {} void run() { - NamedRegionTimer T1("SPT", "Dataflow", opts::TimeOpts); + NamedRegionTimer T1("SPT", "Stack Pointer Tracking", "Dataflow", "Dataflow", + opts::TimeOpts); StackPointerTrackingBase::run(); } }; diff --git a/bolt/Passes/StackReachingUses.h b/bolt/Passes/StackReachingUses.h index bc34db3961c9..3d8fa9d79ed0 100644 --- a/bolt/Passes/StackReachingUses.h +++ b/bolt/Passes/StackReachingUses.h @@ -51,7 +51,8 @@ class StackReachingUses bool IncludeLocalAccesses = true) const; void run() { - NamedRegionTimer T1("SRU", "Dataflow", opts::TimeOpts); + NamedRegionTimer T1("SRU", "Stack Reaching Uses", "Dataflow", "Dataflow", + 
opts::TimeOpts); InstrsDataflowAnalysis::run(); } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index 40871dab30ab..be8fcbc0a06e 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -23,22 +23,24 @@ #include "RewriteInstance.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/DWARF/DWARFDebugLine.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" -#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" +#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/RTDyldMemoryManager.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInstPrinter.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSectionELF.h" @@ -49,7 +51,6 @@ #include "llvm/Object/SymbolicFile.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Dwarf.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Errc.h" #include "llvm/Support/ManagedStatic.h" @@ -262,8 +263,7 @@ SplitFunctions("split-functions", clEnumValN(BinaryFunction::ST_LARGE, "2", "also split if function too large to fit"), clEnumValN(BinaryFunction::ST_ALL, "3", - "split all functions"), - clEnumValEnd), + "split all functions")), cl::ZeroOrMore, cl::cat(BoltOptCategory)); @@ -420,7 +420,8 @@ const std::string RewriteInstance::OrgSecPrefix = ".bolt.org"; const std::string RewriteInstance::BOLTSecPrefix = ".bolt"; -const char 
RewriteInstance::TimerGroupName[] = "Rewrite passes"; +const char RewriteInstance::TimerGroupName[] = "rewrite"; +const char RewriteInstance::TimerGroupDesc[] = "Rewrite passes"; namespace llvm { namespace bolt { @@ -432,6 +433,13 @@ void report_error(StringRef Message, std::error_code EC) { exit(1); } +void report_error(StringRef Message, Error E) { + assert(E); + errs() << "BOLT-ERROR: '" << Message << "': " << toString(std::move(E)) + << ".\n"; + exit(1); +} + void check_error(std::error_code EC, StringRef Message) { if (!EC) return; @@ -447,6 +455,28 @@ uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size, StringRef SectionName, bool IsCode, bool IsReadOnly) { + // Register as note section (non-allocatable) if we recognize it as so + for (auto &OverwriteName : RewriteInstance::SectionsToOverwrite) { + if (SectionName == OverwriteName) { + uint8_t *DataCopy = new uint8_t[Size]; + DEBUG(dbgs() << "BOLT: note section " << SectionName << " with size " + << Size << ", alignment " << Alignment << " at 0x" + << Twine::utohexstr(reinterpret_cast(DataCopy)) + << '\n'); + NoteSectionInfo[SectionName] = + SectionInfo(reinterpret_cast(DataCopy), + Size, + Alignment, + /*IsCode=*/false, + /*IsReadOnly=*/true, + /*IsLocal=*/false, + 0, + 0, + SectionID); + return DataCopy; + } + } + uint8_t *ret; if (IsCode) { ret = SectionMemoryManager::allocateCodeSection(Size, Alignment, @@ -590,8 +620,7 @@ createBinaryContext(ELFObjectFileBase *File, DataReader &DR, llvm::make_unique(); std::unique_ptr Ctx = llvm::make_unique(AsmInfo.get(), MRI.get(), MOFI.get()); - MOFI->InitMCObjectFileInfo(*TheTriple, Reloc::Default, - CodeModel::Small, *Ctx); + MOFI->InitMCObjectFileInfo(*TheTriple, /*PIC=*/false, *Ctx); std::unique_ptr DisAsm( TheTarget->createMCDisassembler(*STI, *Ctx)); @@ -653,11 +682,12 @@ createBinaryContext(ELFObjectFileBase *File, DataReader &DR, RewriteInstance::RewriteInstance(ELFObjectFileBase *File, DataReader &DR, DataAggregator &DA, const int Argc, const 
char *const *Argv) - : InputFile(File), Argc(Argc), Argv(Argv), DA(DA), - BC(createBinaryContext( - File, DR, - std::unique_ptr( - new DWARFContextInMemory(*InputFile, nullptr, true)))) {} + : InputFile(File), Argc(Argc), Argv(Argv), DA(DA), + BC(createBinaryContext( + File, DR, + DWARFContext::create(*File, nullptr, + DWARFContext::defaultErrorHandler, "", false))), + SHStrTab(StringTableBuilder::ELF) {} RewriteInstance::~RewriteInstance() {} @@ -667,10 +697,11 @@ void RewriteInstance::reset() { auto &DR = BC->DR; BC = createBinaryContext( InputFile, DR, - std::unique_ptr( - new DWARFContextInMemory(*InputFile, nullptr, true))); + DWARFContext::create(*InputFile, nullptr, + DWARFContext::defaultErrorHandler, "", false)); CFIRdWrt.reset(nullptr); - EFMM.reset(nullptr); + OLT.reset(nullptr); + EFMM.reset(); Out.reset(nullptr); EHFrame = nullptr; FailedAddresses.clear(); @@ -679,7 +710,8 @@ void RewriteInstance::reset() { } void RewriteInstance::discoverStorage() { - NamedRegionTimer T("discover storage", TimerGroupName, opts::TimeRewrite); + NamedRegionTimer T("discoverStorage", "discover storage", TimerGroupName, + TimerGroupDesc, opts::TimeRewrite); // Stubs are harmful because RuntimeDyld may try to increase the size of // sections accounting for stubs when we need those sections to match the @@ -702,7 +734,8 @@ void RewriteInstance::discoverStorage() { NextAvailableAddress = 0; uint64_t NextAvailableOffset = 0; - for (const auto &Phdr : Obj->program_headers()) { + auto PHs = cantFail(Obj->program_headers(), "program_headers() failed"); + for (const auto &Phdr : PHs) { if (Phdr.p_type == ELF::PT_LOAD) { FirstAllocAddress = std::min(FirstAllocAddress, static_cast(Phdr.p_vaddr)); @@ -746,8 +779,8 @@ void RewriteInstance::discoverStorage() { FirstNonAllocatableOffset = NextAvailableOffset; - NextAvailableAddress = RoundUpToAlignment(NextAvailableAddress, PageAlign); - NextAvailableOffset = RoundUpToAlignment(NextAvailableOffset, PageAlign); + NextAvailableAddress = 
alignTo(NextAvailableAddress, PageAlign); + NextAvailableOffset = alignTo(NextAvailableOffset, PageAlign); if (!opts::UseGnuStack) { // This is where the black magic happens. Creating PHDR table in a segment @@ -784,8 +817,8 @@ void RewriteInstance::discoverStorage() { } // Align at cache line. - NextAvailableAddress = RoundUpToAlignment(NextAvailableAddress, 64); - NextAvailableOffset = RoundUpToAlignment(NextAvailableOffset, 64); + NextAvailableAddress = alignTo(NextAvailableAddress, 64); + NextAvailableOffset = alignTo(NextAvailableOffset, 64); NewTextSegmentAddress = NextAvailableAddress; NewTextSegmentOffset = NextAvailableOffset; @@ -826,7 +859,7 @@ RewriteInstance::getBuildID() { return NoneType(); StringRef Name = SectionContents.slice(Offset, Offset + NameSz); - Offset = RoundUpToAlignment(Offset + NameSz, 4); + Offset = alignTo(Offset + NameSz, 4); StringRef BinaryBuildID = SectionContents.slice(Offset, Offset + DescSz); if (Name.substr(0, 3) != "GNU") return NoneType(); @@ -949,8 +982,8 @@ void RewriteInstance::run() { // Copy allocatable part of the input. 
std::error_code EC; - Out = llvm::make_unique(opts::OutputFilename, EC, - sys::fs::F_None, 0777); + Out = llvm::make_unique(opts::OutputFilename, EC, + sys::fs::F_None, 0777); check_error(EC, "cannot create output executable file"); Out->os() << InputFile->getData().substr(0, FirstNonAllocatableOffset); @@ -959,8 +992,8 @@ void RewriteInstance::run() { } void RewriteInstance::discoverFileObjects() { - NamedRegionTimer T("discover file objects", TimerGroupName, - opts::TimeRewrite); + NamedRegionTimer T("discoverFileObjects", "discover file objects", + TimerGroupName, TimerGroupDesc, opts::TimeRewrite); FileSymRefs.clear(); BinaryFunctions.clear(); @@ -977,7 +1010,7 @@ void RewriteInstance::discoverFileObjects() { }; std::unordered_map SymbolToFileName; for (const auto &Symbol : InputFile->symbols()) { - ErrorOr NameOrError = Symbol.getName(); + auto NameOrError = Symbol.getName(); if (NameOrError && NameOrError->startswith("__asan_init")) { errs() << "BOLT-ERROR: input file was compiled or linked with sanitizer " "support. Cannot optimize.\n"; @@ -992,13 +1025,14 @@ void RewriteInstance::discoverFileObjects() { if (Symbol.getFlags() & SymbolRef::SF_Undefined) continue; - if (Symbol.getType() == SymbolRef::ST_File) { - check_error(NameOrError.getError(), "cannot get symbol name for file"); + if (cantFail(Symbol.getType()) == SymbolRef::ST_File) { + auto Name = + cantFail(std::move(NameOrError), "cannot get symbol name for file"); // Ignore Clang LTO artificial FILE symbol as it is not always generated, // and this uncertainty is causing havoc in function name matching. - if (*NameOrError == "ld-temp.o") + if (Name == "ld-temp.o") continue; - FileSymbolName = *NameOrError; + FileSymbolName = Name; SeenFileName = true; continue; } @@ -1014,11 +1048,13 @@ void RewriteInstance::discoverFileObjects() { std::stable_sort(SortedFileSymbols.begin(), SortedFileSymbols.end(), [](const SymbolRef &A, const SymbolRef &B) { // FUNC symbols have higher precedence. 
- if (*(A.getAddress()) == *(B.getAddress())) { - return A.getType() == SymbolRef::ST_Function && - B.getType() != SymbolRef::ST_Function; + auto AddressA = cantFail(A.getAddress()); + auto AddressB = cantFail(B.getAddress()); + if (AddressA == AddressB) { + return cantFail(A.getType()) == SymbolRef::ST_Function && + cantFail(B.getType()) != SymbolRef::ST_Function; } - return *(A.getAddress()) < *(B.getAddress()); + return AddressA < AddressB; }); // For aarch64, the ABI defines mapping symbols so we identify data in the @@ -1028,9 +1064,9 @@ void RewriteInstance::discoverFileObjects() { MarkersBegin = std::stable_partition( SortedFileSymbols.begin(), SortedFileSymbols.end(), [](const SymbolRef &Symbol) { - ErrorOr NameOrError = Symbol.getName(); - return !(Symbol.getType() == SymbolRef::ST_Unknown && - (*NameOrError == "$d" || *NameOrError == "$x")); + StringRef Name = cantFail(Symbol.getName()); + return !(cantFail(Symbol.getType()) == SymbolRef::ST_Unknown && + (Name == "$d" || Name == "$x")); }); } @@ -1041,17 +1077,15 @@ void RewriteInstance::discoverFileObjects() { if (Symbol.getFlags() & SymbolRef::SF_Undefined) continue; - if (Symbol.getType() == SymbolRef::ST_File) + if (cantFail(Symbol.getType()) == SymbolRef::ST_File) continue; - ErrorOr NameOrError = Symbol.getName(); - check_error(NameOrError.getError(), "cannot get symbol name"); - - ErrorOr AddressOrErr = Symbol.getAddress(); - check_error(AddressOrErr.getError(), "cannot get symbol address"); - uint64_t Address = *AddressOrErr; + StringRef SymName = cantFail(Symbol.getName(), "cannot get symbol name"); + uint64_t Address = + cantFail(Symbol.getAddress(), "cannot get symbol address"); if (Address == 0) { - if (opts::Verbosity >= 1 && Symbol.getType() == SymbolRef::ST_Function) + if (opts::Verbosity >= 1 && + cantFail(Symbol.getType()) == SymbolRef::ST_Function) errs() << "BOLT-WARNING: function with 0 address seen\n"; continue; } @@ -1060,16 +1094,15 @@ void RewriteInstance::discoverFileObjects() { 
// There's nothing horribly wrong with anonymous symbols, but let's // ignore them for now. - if (NameOrError->empty()) + if (SymName.empty()) continue; /// It is possible we are seeing a globalized local. LLVM might treat it as /// a local if it has a "private global" prefix, e.g. ".L". Thus we have to /// change the prefix to enforce global scope of the symbol. - std::string Name = - NameOrError->startswith(BC->AsmInfo->getPrivateGlobalPrefix()) - ? "PG" + std::string(*NameOrError) - : std::string(*NameOrError); + std::string Name = SymName.startswith(BC->AsmInfo->getPrivateGlobalPrefix()) + ? "PG" + std::string(SymName) + : std::string(SymName); // Disambiguate all local symbols before adding to symbol table. // Since we don't know if we will see a global with the same name, @@ -1119,9 +1152,8 @@ void RewriteInstance::discoverFileObjects() { if (!AlternativeName.empty()) BC->registerNameAtAddress(AlternativeName, Address); - ErrorOr SectionOrErr = Symbol.getSection(); - check_error(SectionOrErr.getError(), "cannot get symbol section"); - section_iterator Section = *SectionOrErr; + section_iterator Section = + cantFail(Symbol.getSection(), "cannot get symbol section"); if (Section == InputFile->section_end()) { // Could be an absolute symbol. Could record for pretty printing. continue; @@ -1131,7 +1163,7 @@ void RewriteInstance::discoverFileObjects() { << " for function\n"); if (!Section->isText()) { - assert(Symbol.getType() != SymbolRef::ST_Function && + assert(cantFail(Symbol.getType()) != SymbolRef::ST_Function && "unexpected function inside non-code section"); DEBUG(dbgs() << "BOLT-DEBUG: rejecting as symbol is not in code\n"); continue; @@ -1146,7 +1178,7 @@ void RewriteInstance::discoverFileObjects() { // Sometimes assembly functions are not marked as functions and neither are // their local labels. The only way to tell them apart is to look at // symbol scope - global vs local. 
- if (Symbol.getType() != SymbolRef::ST_Function) { + if (cantFail(Symbol.getType()) != SymbolRef::ST_Function) { if (PreviousFunction) { if (PreviousFunction->getSize() == 0) { if (PreviousFunction->isSymbolValidInScope(Symbol, SymbolSize)) { @@ -1308,10 +1340,9 @@ void RewriteInstance::discoverFileObjects() { // Annotate functions with code/data markers in AArch64 for (auto ISym = MarkersBegin; ISym != SortedFileSymbols.end(); ++ISym) { const auto &Symbol = *ISym; - ErrorOr AddressOrErr = Symbol.getAddress(); - check_error(AddressOrErr.getError(), "cannot get symbol address"); + uint64_t Address = + cantFail(Symbol.getAddress(), "cannot get symbol address"); auto SymbolSize = ELFSymbolRef(Symbol).getSize(); - uint64_t Address = *AddressOrErr; auto *BF = getBinaryFunctionContainingAddress(Address, true, true); if (!BF) { // Stray marker @@ -1390,7 +1421,7 @@ void RewriteInstance::disassemblePLT() { const auto SymbolIter = Rel.getSymbol(); assert(SymbolIter != InputFile->symbol_end() && "non-null symbol expected"); - const auto SymbolName = *(*SymbolIter).getName(); + const auto SymbolName = cantFail((*SymbolIter).getName()); std::string Name = SymbolName.str() + "@PLT"; auto *BF = createBinaryFunction(Name, *PLTSection, @@ -1451,9 +1482,6 @@ void RewriteInstance::adjustFunctionBoundaries() { ++NextSymRefI; } - auto NextSymRefSectionI = (NextSymRefI == FileSymRefs.end()) - ? InputFile->section_end() - : NextSymRefI->second.getSection(); // Function runs at most till the end of the containing section. 
uint64_t NextObjectAddress = Function.getSection().getEndAddress(); @@ -1492,11 +1520,10 @@ void RewriteInstance::adjustFunctionBoundaries() { void RewriteInstance::relocateEHFrameSection() { assert(EHFrameSection && "non-empty .eh_frame section expected"); - DWARFFrame EHFrame(EHFrameSection->getAddress()); + DWARFDebugFrame EHFrame(true, EHFrameSection->getAddress()); StringRef EHFrameSectionContents = EHFrameSection->getContents(); - DataExtractor DE(EHFrameSectionContents, - BC->AsmInfo->isLittleEndian(), - BC->AsmInfo->getPointerSize()); + DWARFDataExtractor DE(EHFrameSectionContents, BC->AsmInfo->isLittleEndian(), + BC->AsmInfo->getCodePointerSize()); auto createReloc = [&](uint64_t Value, uint64_t Offset, uint64_t DwarfType) { if (DwarfType == dwarf::DW_EH_PE_omit) return; @@ -1518,10 +1545,12 @@ void RewriteInstance::relocateEHFrameSection() { case dwarf::DW_EH_PE_sdata4: case dwarf::DW_EH_PE_udata4: RelType = ELF::R_X86_64_PC32; + Offset -= 4; break; case dwarf::DW_EH_PE_sdata8: case dwarf::DW_EH_PE_udata8: RelType = ELF::R_X86_64_PC64; + Offset -= 8; break; } @@ -1539,12 +1568,6 @@ void RewriteInstance::relocateEHFrameSection() { }; EHFrame.parse(DE, createReloc); - - if (!EHFrame.ParseError.empty()) { - errs() << "BOLT-ERROR: EHFrame reader failed with message \"" - << EHFrame.ParseError << '\n'; - exit(1); - } } BinaryFunction *RewriteInstance::createBinaryFunction( @@ -1560,8 +1583,8 @@ BinaryFunction *RewriteInstance::createBinaryFunction( } void RewriteInstance::readSpecialSections() { - NamedRegionTimer T("read special sections", TimerGroupName, - opts::TimeRewrite); + NamedRegionTimer T("readSpecialSections", "read special sections", + TimerGroupName, TimerGroupDesc, opts::TimeRewrite); bool HasTextRelocations = false; @@ -1570,11 +1593,14 @@ void RewriteInstance::readSpecialSections() { StringRef SectionName; check_error(Section.getName(SectionName), "cannot get section name"); StringRef SectionContents; - 
check_error(Section.getContents(SectionContents), - "cannot get section contents"); - ArrayRef SectionData( - reinterpret_cast(SectionContents.data()), - Section.getSize()); + ArrayRef SectionData; + if (!(ELFSectionRef(Section).getType() & ELF::SHT_NOBITS)) { + check_error(Section.getContents(SectionContents), + "cannot get section contents"); + SectionData = ArrayRef( + reinterpret_cast(SectionContents.data()), + Section.getSize()); + } if (SectionName == ".gcc_except_table") { LSDAData = SectionData; @@ -1612,14 +1638,9 @@ void RewriteInstance::readSpecialSections() { EHFrame = BC->DwCtx->getEHFrame(); if (opts::DumpEHFrame) { outs() << "BOLT-INFO: Dumping original binary .eh_frame\n"; - EHFrame->dump(outs()); + EHFrame->dump(outs(), NoneType()); } CFIRdWrt.reset(new CFIReaderWriter(*EHFrame)); - if (!EHFrame->ParseError.empty()) { - errs() << "BOLT-ERROR: EHFrame reader failed with message \"" - << EHFrame->ParseError << '\n'; - exit(1); - } } namespace { @@ -1629,7 +1650,7 @@ int64_t getRelocationAddend(const ELFObjectFile *Obj, int64_t Addend = 0; const ELFFile &EF = *Obj->getELFFile(); DataRefImpl Rel = RelRef.getRawDataRefImpl(); - const auto *RelocationSection = *(EF.getSection(Rel.d.a)); + const auto *RelocationSection = cantFail(EF.getSection(Rel.d.a)); switch (RelocationSection->sh_type) { default: llvm_unreachable("unexpected relocation section type"); case ELF::SHT_REL: @@ -1668,21 +1689,21 @@ bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, const bool IsAArch64 = BC->TheTriple->getArch() == llvm::Triple::aarch64; const bool IsFromCode = RelocatedSection.isText(); - + // For value extraction. 
StringRef RelocatedSectionContents; RelocatedSection.getContents(RelocatedSectionContents); DataExtractor DE(RelocatedSectionContents, BC->AsmInfo->isLittleEndian(), - BC->AsmInfo->getPointerSize()); + BC->AsmInfo->getCodePointerSize()); const bool IsPCRelative = Relocation::isPCRelative(Rel.getType()); auto SymbolIter = Rel.getSymbol(); assert(SymbolIter != InputFile->symbol_end() && "relocation symbol must exist"); auto Symbol = *SymbolIter; - SymbolName = *(Symbol.getName()); - SymbolAddress = *(Symbol.getAddress()); + SymbolName = cantFail(Symbol.getName()); + SymbolAddress = cantFail(Symbol.getAddress()); Addend = getRelocationAddend(InputFile, Rel); uint32_t RelocationOffset = @@ -1698,9 +1719,9 @@ bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, } // Weird stuff - section symbols are marked as ST_Debug. - const bool SymbolIsSection = (Symbol.getType() == SymbolRef::ST_Debug); - const auto PCRelOffset = - IsPCRelative && !IsAArch64 ? Rel.getOffset() : 0; + const bool SymbolIsSection = + (cantFail(Symbol.getType()) == SymbolRef::ST_Debug); + const auto PCRelOffset = IsPCRelative && !IsAArch64 ? 
Rel.getOffset() : 0; // If no symbol has been found or if it is a relocation requiring the // creation of a GOT entry, do not link against the symbol but against @@ -1781,10 +1802,10 @@ bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, << "; type name = " << TypeName << "; value = 0x" << Twine::utohexstr(ExtractedValue) << "; symbol = " << SymbolName - << "; symbol type = " << Symbol.getType() + << "; symbol type = " << cantFail(Symbol.getType()) << "; symbol address = 0x" << Twine::utohexstr(SymbolAddress) << "; orig symbol address = 0x" - << Twine::utohexstr(*(Symbol.getAddress())) + << Twine::utohexstr(cantFail(Symbol.getAddress())) << "; symbol section = " << getSectionName(**Section) << "; addend = 0x" << Twine::utohexstr(Addend) << "; original addend = 0x" @@ -1981,7 +2002,8 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { } void RewriteInstance::readDebugInfo() { - NamedRegionTimer T("read debug info", TimerGroupName, opts::TimeRewrite); + NamedRegionTimer T("readDebugInfo", "read debug info", TimerGroupName, + TimerGroupDesc, opts::TimeRewrite); if (!opts::UpdateDebugSections) return; @@ -1990,7 +2012,8 @@ void RewriteInstance::readDebugInfo() { void RewriteInstance::processProfileData() { if (DA.started()) { - NamedRegionTimer T("aggregate data", TimerGroupName, opts::TimeRewrite); + NamedRegionTimer T("aggregate", "aggregate data", TimerGroupName, + TimerGroupDesc, opts::TimeRewrite); DA.aggregate(*BC.get(), BinaryFunctions); for (auto &BFI : BinaryFunctions) { @@ -2004,7 +2027,8 @@ void RewriteInstance::processProfileData() { } } } else { - NamedRegionTimer T("read profile data", TimerGroupName, opts::TimeRewrite); + NamedRegionTimer T("readprofile", "read profile data", TimerGroupName, + TimerGroupDesc, opts::TimeRewrite); if (!opts::BoltProfile.empty()) { ProfileReader PR; @@ -2042,8 +2066,8 @@ void RewriteInstance::processProfileData() { } void RewriteInstance::disassembleFunctions() { - NamedRegionTimer 
T("disassemble functions", TimerGroupName, - opts::TimeRewrite); + NamedRegionTimer T("disassembleFunctions", "disassemble functions", + TimerGroupName, TimerGroupDesc, opts::TimeRewrite); for (auto &BFI : BinaryFunctions) { BinaryFunction &Function = BFI.second; @@ -2190,8 +2214,8 @@ void RewriteInstance::postProcessFunctions() { } void RewriteInstance::runOptimizationPasses() { - NamedRegionTimer T("run optimization passes", TimerGroupName, - opts::TimeRewrite); + NamedRegionTimer T("runOptimizationPasses", "run optimization passes", + TimerGroupName, TimerGroupDesc, opts::TimeRewrite); BinaryFunctionPassManager::runAllPasses(*BC, BinaryFunctions, LargeFunctions); } @@ -2232,7 +2256,6 @@ void RewriteInstance::emitFunction(MCStreamer &Streamer, Streamer.EmitCodeAlignment(Function.getAlignment(), MaxAlignBytes); } else { Streamer.EmitCodeAlignment(Function.getAlignment()); - Streamer.setCodeSkew(EmitColdPart ? 0 : Function.getAddress()); } MCContext &Context = Streamer.getContext(); @@ -2293,7 +2316,7 @@ void RewriteInstance::emitFunction(MCStreamer &Streamer, if (auto Padding = opts::padFunction(Function)) { DEBUG(dbgs() << "BOLT-DEBUG: padding function " << Function << " with " << Padding << " bytes\n"); - Streamer.EmitFill(Padding, MAI->getTextAlignFillValue()); + Streamer.emitFill(Padding, MAI->getTextAlignFillValue()); } if (opts::MarkFuncs) { @@ -2328,14 +2351,15 @@ std::vector singletonSet(T t) { } // anonymous namespace void RewriteInstance::emitFunctions() { - NamedRegionTimer T("emit functions", TimerGroupName, opts::TimeRewrite); + NamedRegionTimer T("emitFunctions", "emit functions", TimerGroupName, + TimerGroupDesc, opts::TimeRewrite); std::error_code EC; // This is an object file, which we keep for debugging purposes. // Once we decide it's useless, we should create it in memory. 
- std::unique_ptr TempOut = - llvm::make_unique(opts::OutputFilename + ".bolt.o", - EC, sys::fs::F_None); + std::unique_ptr TempOut = + llvm::make_unique(opts::OutputFilename + ".bolt.o", + EC, sys::fs::F_None); check_error(EC, "cannot create output object file"); std::unique_ptr BOS = @@ -2346,16 +2370,14 @@ void RewriteInstance::emitFunctions() { // and MCCodeEmitter (MCE). ~MCObjectStreamer() will delete these // two instances. auto MCE = BC->TheTarget->createMCCodeEmitter(*BC->MII, *BC->MRI, *BC->Ctx); - auto MAB = BC->TheTarget->createMCAsmBackend(*BC->MRI, BC->TripleName, ""); - std::unique_ptr Streamer( - BC->TheTarget->createMCObjectStreamer(*BC->TheTriple, - *BC->Ctx, - *MAB, - *OS, - MCE, - *BC->STI, - /* RelaxAll */ false, - /* DWARFMustBeAtTheEnd */ false)); + auto MAB = + BC->TheTarget->createMCAsmBackend(*BC->STI, *BC->MRI, MCTargetOptions()); + std::unique_ptr Streamer(BC->TheTarget->createMCObjectStreamer( + *BC->TheTriple, *BC->Ctx, std::unique_ptr(MAB), *OS, + std::unique_ptr(MCE), *BC->STI, + /* RelaxAll */ false, + /* IncrementalLinkerCompatible */ false, + /* DWARFMustBeAtTheEnd */ false)); Streamer->InitSections(false); @@ -2465,18 +2487,17 @@ void RewriteInstance::emitFunctions() { // Get output object as ObjectFile. 
std::unique_ptr ObjectMemBuffer = MemoryBuffer::getMemBuffer(BOS->str(), "in-memory object file", false); - ErrorOr> ObjOrErr = - object::ObjectFile::createObjectFile(ObjectMemBuffer->getMemBufferRef()); - check_error(ObjOrErr.getError(), "error creating in-memory object"); + std::unique_ptr Obj = cantFail( + object::ObjectFile::createObjectFile(ObjectMemBuffer->getMemBufferRef()), + "error creating in-memory object"); auto Resolver = orc::createLambdaResolver( - [&](const std::string &Name) { + [&](const std::string &Name) -> JITSymbol { DEBUG(dbgs() << "BOLT: looking for " << Name << "\n"); auto I = BC->GlobalSymbols.find(Name); if (I == BC->GlobalSymbols.end()) - return RuntimeDyld::SymbolInfo(nullptr); - return RuntimeDyld::SymbolInfo(I->second, - JITSymbolFlags::None); + return JITSymbol(nullptr); + return JITSymbol(I->second, JITSymbolFlags()); }, [](const std::string &S) { DEBUG(dbgs() << "BOLT: resolving " << S << "\n"); @@ -2485,21 +2506,35 @@ void RewriteInstance::emitFunctions() { ); Resolver->setAllowsZeroSymbols(true); - auto ObjectsHandle = OLT.addObjectSet( - singletonSet(std::move(ObjOrErr.get())), - EFMM.get(), - std::move(Resolver), - /* ProcessAllSections = */true); - - // Assign addresses to all sections. - mapFileSections(ObjectsHandle); - - // Update output addresses based on the new section map and layout. MCAsmLayout FinalLayout( static_cast(Streamer.get())->getAssembler()); - updateOutputValues(FinalLayout); - OLT.emitAndFinalize(ObjectsHandle); + OLT.reset(new decltype(OLT)::element_type( + [this]() { + // Get memory manager + return EFMM; + }, + [&](orc::RTDyldObjectLinkingLayerBase::ObjHandleT Handle, + const orc::RTDyldObjectLinkingLayer::ObjectPtr &Obj, + const RuntimeDyld::LoadedObjectInfo &) { + // On NotifyLoaded... + // Assign addresses to all sections. + mapFileSections(Handle); + + // Update output addresses based on the new section map and + // layout. 
+ updateOutputValues(FinalLayout); + })); + + OLT->setProcessAllSections(true); + auto ObjectsHandle = cantFail( + OLT->addObject(std::unique_ptr>( + new OwningBinary( + std::move(Obj), std::move(ObjectMemBuffer))), + std::move(Resolver)), + "failed in addObject()"); + + cantFail(OLT->emitAndFinalize(ObjectsHandle)); if (opts::PrintCacheMetrics) { outs() << "BOLT-INFO: cache metrics after emitting functions:\n"; @@ -2511,7 +2546,7 @@ void RewriteInstance::emitFunctions() { } void RewriteInstance::mapFileSections( - orc::ObjectLinkingLayer<>::ObjSetHandleT &ObjectsHandle) { + orc::RTDyldObjectLinkingLayer::ObjHandleT &ObjectsHandle) { NewTextSectionStartAddress = NextAvailableAddress; if (BC->HasRelocations) { auto SMII = EFMM->SectionMapInfo.find(".text"); @@ -2550,9 +2585,9 @@ void RewriteInstance::mapFileSections( << Twine::utohexstr(SMII->second.AllocAddress) << " to 0x" << Twine::utohexstr(NewTextSectionStartAddress) << '\n'); - OLT.mapSectionAddress(ObjectsHandle, - SI.SectionID, - NewTextSectionStartAddress); + OLT->mapSectionAddress(ObjectsHandle, + SI.SectionID, + NewTextSectionStartAddress); } else { for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; @@ -2567,9 +2602,9 @@ void RewriteInstance::mapFileSections( << Twine::utohexstr(SMII->second.AllocAddress) << " to 0x" << Twine::utohexstr(Function.getAddress()) << '\n'); - OLT.mapSectionAddress(ObjectsHandle, - SMII->second.SectionID, - Function.getAddress()); + OLT->mapSectionAddress(ObjectsHandle, + SMII->second.SectionID, + Function.getAddress()); Function.setImageAddress(SMII->second.AllocAddress); Function.setImageSize(SMII->second.Size); if (Function.getImageSize() > Function.getMaxSize()) { @@ -2588,9 +2623,9 @@ void RewriteInstance::mapFileSections( JT.SecInfo->FileAddress = JT.Address; DEBUG(dbgs() << "BOLT-DEBUG: mapping " << JT.SectionName << " to 0x" << Twine::utohexstr(JT.Address) << '\n'); - OLT.mapSectionAddress(ObjectsHandle, - JT.SecInfo->SectionID, - JT.Address); + 
OLT->mapSectionAddress(ObjectsHandle, + JT.SecInfo->SectionID, + JT.Address); } } @@ -2601,7 +2636,7 @@ void RewriteInstance::mapFileSections( assert(SMII != EFMM->SectionMapInfo.end() && "cannot find section for cold part"); // Cold fragments are aligned at 16 bytes. - NextAvailableAddress = RoundUpToAlignment(NextAvailableAddress, 16); + NextAvailableAddress = alignTo(NextAvailableAddress, 16); auto &ColdPart = Function.cold(); if (TooLarge) { // The corresponding FDE will refer to address 0. @@ -2622,9 +2657,9 @@ void RewriteInstance::mapFileSections( << Twine::utohexstr(ColdPart.getAddress()) << " with size " << Twine::utohexstr(ColdPart.getImageSize()) << '\n'); - OLT.mapSectionAddress(ObjectsHandle, - SMII->second.SectionID, - ColdPart.getAddress()); + OLT->mapSectionAddress(ObjectsHandle, + SMII->second.SectionID, + ColdPart.getAddress()); NextAvailableAddress += ColdPart.getImageSize(); } @@ -2657,16 +2692,15 @@ void RewriteInstance::mapFileSections( if (SMII == EFMM->SectionMapInfo.end()) continue; SectionInfo &SI = SMII->second; - NextAvailableAddress = RoundUpToAlignment(NextAvailableAddress, - SI.Alignment); + NextAvailableAddress = alignTo(NextAvailableAddress, SI.Alignment); DEBUG(dbgs() << "BOLT: mapping section " << SectionName << " (0x" << Twine::utohexstr(SI.AllocAddress) << ") to 0x" << Twine::utohexstr(NextAvailableAddress) << '\n'); - OLT.mapSectionAddress(ObjectsHandle, - SI.SectionID, - NextAvailableAddress); + OLT->mapSectionAddress(ObjectsHandle, + SI.SectionID, + NextAvailableAddress); SI.FileAddress = NextAvailableAddress; SI.FileOffset = getFileOffsetForAddress(NextAvailableAddress); @@ -2696,9 +2730,9 @@ void RewriteInstance::mapFileSections( << ") to 0x" << Twine::utohexstr(Section.getAddress()) << '\n'); - OLT.mapSectionAddress(ObjectsHandle, - SI.SectionID, - Section.getAddress()); + OLT->mapSectionAddress(ObjectsHandle, + SI.SectionID, + Section.getAddress()); SI.FileAddress = Section.getAddress(); StringRef SectionContents = 
Section.getContents(); @@ -2730,10 +2764,10 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { Function.setOutputSize(EndOffset - StartOffset); if (Function.isSplit()) { const auto *ColdStartSymbol = Function.getColdSymbol(); - assert(ColdStartSymbol && ColdStartSymbol->isDefined(false) && + assert(ColdStartSymbol && ColdStartSymbol->isDefined() && "split function should have defined cold symbol"); const auto *ColdEndSymbol = Function.getFunctionColdEndLabel(); - assert(ColdEndSymbol && ColdEndSymbol->isDefined(false) && + assert(ColdEndSymbol && ColdEndSymbol->isDefined() && "split function should have defined cold end symbol"); const auto ColdStartOffset = Layout.getSymbolOffset(*ColdStartSymbol); const auto ColdEndOffset = Layout.getSymbolOffset(*ColdEndSymbol); @@ -2763,7 +2797,7 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { for (auto BBI = Function.layout_begin(), BBE = Function.layout_end(); BBI != BBE; ++BBI) { auto *BB = *BBI; - assert(BB->getLabel()->isDefined(false) && "symbol should be defined"); + assert(BB->getLabel()->isDefined() && "symbol should be defined"); uint64_t BaseAddress; if (BC->HasRelocations) { BaseAddress = NewTextSectionStartAddress; @@ -2900,7 +2934,7 @@ void RewriteInstance::patchELFPHDRTable() { (void)AddedSegment; // Copy existing program headers with modifications. - for (auto &Phdr : Obj->program_headers()) { + for (auto &Phdr : cantFail(Obj->program_headers())) { auto NewPhdr = Phdr; if (PHDRTableAddress && Phdr.p_type == ELF::PT_PHDR) { NewPhdr.p_offset = PHDRTableOffset; @@ -2986,7 +3020,7 @@ void RewriteInstance::rewriteNoteSections() { OS.seek(NextAvailableOffset); // Copy over non-allocatable section contents and update file offsets. 
- for (auto &Section : Obj->sections()) { + for (auto &Section : cantFail(Obj->sections())) { if (Section.sh_type == ELF::SHT_NULL) continue; if (Section.sh_flags & ELF::SHF_ALLOC) @@ -2999,17 +3033,17 @@ void RewriteInstance::rewriteNoteSections() { NextAvailableOffset = appendPadding(OS, NextAvailableOffset, Section.sh_addralign); - ErrorOr SectionName = Obj->getSectionName(&Section); - check_error(SectionName.getError(), "cannot get section name"); + StringRef SectionName = + cantFail(Obj->getSectionName(&Section), "cannot get section name"); // New section size. uint64_t Size = 0; // Copy over section contents unless it's one of the sections we overwrite. - if (!willOverwriteSection(*SectionName)) { + if (!willOverwriteSection(SectionName)) { Size = Section.sh_size; std::string Data = InputFile->getData().substr(Section.sh_offset, Size); - auto SectionPatchersIt = SectionPatchers.find(*SectionName); + auto SectionPatchersIt = SectionPatchers.find(SectionName); if (SectionPatchersIt != SectionPatchers.end()) { (*SectionPatchersIt->second).patchBinary(Data); } @@ -3023,8 +3057,7 @@ void RewriteInstance::rewriteNoteSections() { uint64_t Address{0}; // Perform section post-processing. - - auto SII = EFMM->NoteSectionInfo.find(*SectionName); + auto SII = EFMM->NoteSectionInfo.find(SectionName); if (SII != EFMM->NoteSectionInfo.end()) { auto &SI = SII->second; assert(SI.Alignment <= Section.sh_addralign && @@ -3035,14 +3068,14 @@ void RewriteInstance::rewriteNoteSections() { if (Address) { DEBUG(dbgs() << "BOLT-DEBUG: " << (Size ? 
"appending" : "writing") << " contents to section " - << *SectionName << '\n'); + << SectionName << '\n'); OS.write(reinterpret_cast(Address), SI.Size); Size += SI.Size; } if (!SI.PendingRelocs.empty()) { DEBUG(dbgs() << "BOLT-DEBUG: processing relocs for section " - << *SectionName << '\n'); + << SectionName << '\n'); for (auto &Reloc : SI.PendingRelocs) { DEBUG(dbgs() << "BOLT-DEBUG: writing value " << Twine::utohexstr(Reloc.Value) @@ -3059,7 +3092,7 @@ void RewriteInstance::rewriteNoteSections() { } // Set/modify section info. - EFMM->NoteSectionInfo[*SectionName] = + EFMM->NoteSectionInfo[SectionName] = SectionInfo(Address, Size, Section.sh_addralign, @@ -3097,12 +3130,15 @@ void RewriteInstance::finalizeSectionStringTable(ELFObjectFile *File) { auto *Obj = File->getELFFile(); // Pre-populate section header string table. - for (auto &Section : Obj->sections()) { - ErrorOr SectionName = Obj->getSectionName(&Section); - check_error(SectionName.getError(), "cannot get section name"); - SHStrTab.add(*SectionName); - if (willOverwriteSection(*SectionName)) - SHStrTab.add(OrgSecPrefix + SectionName->str()); + for (auto &Section : cantFail(Obj->sections())) { + StringRef SectionName = + cantFail(Obj->getSectionName(&Section), "cannot get section name"); + SHStrTab.add(SectionName); + if (willOverwriteSection(SectionName)) { + AllSHStrTabStrings.emplace_back( + SHStrTabPool.intern(OrgSecPrefix + SectionName.str())); + SHStrTab.add(*AllSHStrTabStrings.back()); + } } for (auto &SMII : EFMM->SectionMapInfo) { SHStrTab.add(SMII.first); @@ -3110,11 +3146,12 @@ void RewriteInstance::finalizeSectionStringTable(ELFObjectFile *File) { for (auto &SMII : EFMM->NoteSectionInfo) { SHStrTab.add(SMII.first); } - SHStrTab.finalize(StringTableBuilder::ELF); + SHStrTab.finalize(); - const auto SHStrTabSize = SHStrTab.data().size(); + const auto SHStrTabSize = SHStrTab.getSize(); uint8_t *DataCopy = new uint8_t[SHStrTabSize]; - memcpy(DataCopy, SHStrTab.data().data(), SHStrTabSize); + 
memset(DataCopy, 0, SHStrTabSize); + SHStrTab.write(DataCopy); EFMM->NoteSectionInfo[".shstrtab"] = SectionInfo(reinterpret_cast(DataCopy), SHStrTabSize, @@ -3147,12 +3184,12 @@ void RewriteInstance::addBoltInfoSection() { OS.write(reinterpret_cast(&(Type)), 4); OS << NameStr; for (uint64_t I = NameStr.size(); - I < RoundUpToAlignment(NameStr.size(), 4); ++I) { + I < alignTo(NameStr.size(), 4); ++I) { OS << '\0'; } OS << DescStr; for (uint64_t I = DescStr.size(); - I < RoundUpToAlignment(DescStr.size(), 4); ++I) { + I < alignTo(DescStr.size(), 4); ++I) { OS << '\0'; } @@ -3181,14 +3218,16 @@ std::vector RewriteInstance::getOutputSections(ELFObjectFile *File, std::vector *OutputSections) { auto *Obj = File->getELFFile(); + auto Sections = cantFail(Obj->sections()); - std::vector NewSectionIndex(Obj->getNumSections(), 0); + std::vector NewSectionIndex( + std::distance(Sections.begin(), Sections.end()), 0); NewTextSectionIndex = 0; uint32_t CurIndex{0}; // Copy over entries for original allocatable sections with minor // modifications (e.g. name). - for (auto &Section : Obj->sections()) { + for (auto &Section : Sections) { // Always ignore this section. if (Section.sh_type == ELF::SHT_NULL) { NewSectionIndex[0] = CurIndex++; @@ -3209,27 +3248,27 @@ RewriteInstance::getOutputSections(ELFObjectFile *File, if (!(Section.sh_flags & ELF::SHF_ALLOC)) continue; - NewSectionIndex[std::distance(Obj->section_begin(), &Section)] = + NewSectionIndex[std::distance(Sections.begin(), &Section)] = CurIndex++; // If only computing the map, we're done with this iteration if (!OutputSections) continue; - ErrorOr SectionName = Obj->getSectionName(&Section); - check_error(SectionName.getError(), "cannot get section name"); + StringRef SectionName = + cantFail(Obj->getSectionName(&Section), "cannot get section name"); auto NewSection = Section; - if (*SectionName == ".bss") { + if (SectionName == ".bss") { // .bss section offset matches that of the next section. 
NewSection.sh_offset = NewTextSegmentOffset; } - if (willOverwriteSection(*SectionName)) { + if (willOverwriteSection(SectionName)) { NewSection.sh_name = SHStrTab.getOffset(OrgSecPrefix + - SectionName->str()); + SectionName.str()); } else { - NewSection.sh_name = SHStrTab.getOffset(*SectionName); + NewSection.sh_name = SHStrTab.getOffset(SectionName); } OutputSections->emplace_back(NewSection); @@ -3278,7 +3317,7 @@ RewriteInstance::getOutputSections(ELFObjectFile *File, // Copy over entries for non-allocatable sections performing necessary // adjustments. - for (auto &Section : Obj->sections()) { + for (auto &Section : Sections) { if (Section.sh_type == ELF::SHT_NULL) continue; if (Section.sh_flags & ELF::SHF_ALLOC) @@ -3287,17 +3326,17 @@ RewriteInstance::getOutputSections(ELFObjectFile *File, if (Section.sh_type == ELF::SHT_RELA) continue; - NewSectionIndex[std::distance(Obj->section_begin(), &Section)] = + NewSectionIndex[std::distance(Sections.begin(), &Section)] = CurIndex++; // If only computing the map, we're done with this iteration if (!OutputSections) continue; - ErrorOr SectionName = Obj->getSectionName(&Section); - check_error(SectionName.getError(), "cannot get section name"); + StringRef SectionName = + cantFail(Obj->getSectionName(&Section), "cannot get section name"); - auto SII = EFMM->NoteSectionInfo.find(*SectionName); + auto SII = EFMM->NoteSectionInfo.find(SectionName); assert(SII != EFMM->NoteSectionInfo.end() && "missing section info for non-allocatable section"); @@ -3305,7 +3344,7 @@ RewriteInstance::getOutputSections(ELFObjectFile *File, auto NewSection = Section; NewSection.sh_offset = SI.FileOffset; NewSection.sh_size = SI.Size; - NewSection.sh_name = SHStrTab.getOffset(*SectionName); + NewSection.sh_name = SHStrTab.getOffset(SectionName); OutputSections->emplace_back(NewSection); @@ -3436,10 +3475,10 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { std::function Write, std::function AddToStrTab) { - auto StringSection 
= *Obj->getStringTableForSymtab(*Section); + auto StringSection = cantFail(Obj->getStringTableForSymtab(*Section)); unsigned IsHotTextUpdated = 0; - for (const Elf_Sym &Symbol : Obj->symbols(Section)) { + for (const Elf_Sym &Symbol : cantFail(Obj->symbols(Section))) { auto NewSymbol = Symbol; const auto *Function = getBinaryFunctionAtAddress(Symbol.st_value); // Some section symbols may be mistakenly associated with the first @@ -3456,9 +3495,10 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { if (!PatchExisting && Function->isSplit()) { auto NewColdSym = NewSymbol; SmallVector Buf; - NewColdSym.st_name = AddToStrTab(Twine(*Symbol.getName(StringSection)) - .concat(".cold.0") - .toStringRef(Buf)); + NewColdSym.st_name = + AddToStrTab(Twine(cantFail(Symbol.getName(StringSection))) + .concat(".cold.0") + .toStringRef(Buf)); NewColdSym.st_value = Function->cold().getAddress(); NewColdSym.st_size = Function->cold().getImageSize(); Write(0, reinterpret_cast(&NewColdSym), @@ -3511,9 +3551,9 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { if (NewSymbol.getType() == ELF::STT_NOTYPE && NewSymbol.getBinding() == ELF::STB_LOCAL && NewSymbol.st_size == 0) { - if (auto SecOrErr = - File->getELFFile()->getSection(NewSymbol.st_shndx)) { - auto Section = *SecOrErr; + auto ExpectedSec = File->getELFFile()->getSection(NewSymbol.st_shndx); + if (ExpectedSec) { + auto Section = *ExpectedSec; if (Section->sh_type == ELF::SHT_PROGBITS && Section->sh_flags & ELF::SHF_ALLOC && Section->sh_flags & ELF::SHF_EXECINSTR) { @@ -3524,6 +3564,8 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { // If patching an existing symtab, patch this value to zero. 
NewSymbol.st_value = 0; } + } else { + consumeError(ExpectedSec.takeError()); } } } @@ -3538,13 +3580,14 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { return true; }; - auto SymbolName = Symbol.getName(StringSection); - assert(SymbolName && "cannot get symbol name"); - if (*SymbolName == "__hot_start" || *SymbolName == "__hot_end") - updateSymbolValue(*SymbolName); + auto SymbolName = + cantFail(Symbol.getName(StringSection), "cannot get symbol name"); + if (SymbolName == "__hot_start" || SymbolName == "__hot_end") + updateSymbolValue(SymbolName); } - Write((&Symbol - Obj->symbol_begin(Section)) * sizeof(Elf_Sym), + Write((&Symbol - cantFail(Obj->symbols(Section)).begin()) * + sizeof(Elf_Sym), reinterpret_cast(&NewSymbol), sizeof(NewSymbol)); } @@ -3572,7 +3615,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { // Update dynamic symbol table. const Elf_Shdr *DynSymSection = nullptr; - for (const Elf_Shdr &Section : Obj->sections()) { + for (const Elf_Shdr &Section : cantFail(Obj->sections())) { if (Section.sh_type == ELF::SHT_DYNSYM) { DynSymSection = &Section; break; @@ -3588,7 +3631,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { // (re)create regular symbol table. 
const Elf_Shdr *SymTabSection = nullptr; - for (const auto &Section : Obj->sections()) { + for (const auto &Section : cantFail(Obj->sections())) { if (Section.sh_type == ELF::SHT_SYMTAB) { SymTabSection = &Section; break; @@ -3599,12 +3642,13 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { return; } - const Elf_Shdr *StrTabSection = *Obj->getSection(SymTabSection->sh_link); + const Elf_Shdr *StrTabSection = + cantFail(Obj->getSection(SymTabSection->sh_link)); std::string NewContents; std::string NewStrTab = File->getData().substr(StrTabSection->sh_offset, StrTabSection->sh_size); - auto SecName = *Obj->getSectionName(SymTabSection); - auto StrSecName = *Obj->getSectionName(StrTabSection); + auto SecName = cantFail(Obj->getSectionName(SymTabSection)); + auto StrSecName = cantFail(Obj->getSectionName(StrTabSection)); updateSymbolTable(/*patch existing table?*/false, SymTabSection, [&](size_t Offset, const char *Buf, size_t Size) { @@ -3707,7 +3751,7 @@ void RewriteInstance::patchELFDynamic(ELFObjectFile *File) { // Locate DYNAMIC by looking through program headers. uint64_t DynamicOffset = 0; const Elf_Phdr *DynamicPhdr = 0; - for (auto &Phdr : Obj->program_headers()) { + for (auto &Phdr : cantFail(Obj->program_headers())) { if (Phdr.p_type == ELF::PT_DYNAMIC) { DynamicOffset = Phdr.p_offset; DynamicPhdr = &Phdr; @@ -3721,10 +3765,11 @@ void RewriteInstance::patchELFDynamic(ELFObjectFile *File) { // Go through all dynamic entries and patch functions addresses with // new ones. 
- ErrorOr DTB = Obj->dynamic_table_begin(DynamicPhdr); - ErrorOr DTE = Obj->dynamic_table_end(DynamicPhdr); - assert(DTB && DTE && "error accessing dynamic table"); - for (auto *DE = *DTB; DE != *DTE; ++DE) { + const Elf_Dyn *DTB = cantFail(Obj->dynamic_table_begin(DynamicPhdr), + "error accessing dynamic table"); + const Elf_Dyn *DTE = cantFail(Obj->dynamic_table_end(DynamicPhdr), + "error accessing dynamic table"); + for (auto *DE = DTB; DE != DTE; ++DE) { auto NewDE = *DE; bool ShouldPatch = true; switch (DE->getTag()) { @@ -3756,7 +3801,7 @@ void RewriteInstance::patchELFDynamic(ELFObjectFile *File) { } if (ShouldPatch) { OS.pwrite(reinterpret_cast(&NewDE), sizeof(NewDE), - DynamicOffset + (DE - *DTB) * sizeof(*DE)); + DynamicOffset + (DE - DTB) * sizeof(*DE)); } } @@ -3781,16 +3826,14 @@ void RewriteInstance::rewriteFile() { // We obtain an asm-specific writer so that we can emit nops in an // architecture-specific way at the end of the function. auto MCE = BC->TheTarget->createMCCodeEmitter(*BC->MII, *BC->MRI, *BC->Ctx); - auto MAB = BC->TheTarget->createMCAsmBackend(*BC->MRI, BC->TripleName, ""); - std::unique_ptr Streamer( - BC->TheTarget->createMCObjectStreamer(*BC->TheTriple, - *BC->Ctx, - *MAB, - OS, - MCE, - *BC->STI, - /* RelaxAll */ false, - /* DWARFMustBeAtTheEnd */ false)); + auto MAB = + BC->TheTarget->createMCAsmBackend(*BC->STI, *BC->MRI, MCTargetOptions()); + std::unique_ptr Streamer(BC->TheTarget->createMCObjectStreamer( + *BC->TheTriple, *BC->Ctx, std::unique_ptr(MAB), OS, + std::unique_ptr(MCE), *BC->STI, + /* RelaxAll */ false, + /*IncrementalLinkerCompatible */ false, + /* DWARFMustBeAtTheEnd */ false)); auto &Writer = static_cast(Streamer.get()) ->getAssembler() @@ -3929,6 +3972,8 @@ void RewriteInstance::rewriteFile() { OS.pwrite(reinterpret_cast(SI.AllocAddress), SI.Size, SI.FileOffset); + assert(SI.AllocAddress && + "writing section that was not assigned an address"); } // If .eh_frame is present create .eh_frame_hdr. 
@@ -3965,51 +4010,37 @@ void RewriteInstance::rewriteFile() { // If requested, open again the binary we just wrote to dump its EH Frame if (opts::DumpEHFrame) { - ErrorOr> BinaryOrErr = + Expected> BinaryOrErr = createBinary(opts::OutputFilename); - if (std::error_code EC = BinaryOrErr.getError()) - report_error(opts::OutputFilename, EC); + if (auto E = BinaryOrErr.takeError()) + report_error(opts::OutputFilename, std::move(E)); Binary &Binary = *BinaryOrErr.get().getBinary(); if (auto *E = dyn_cast(&Binary)) { - DWARFContextInMemory DwCtx(*E, nullptr, true); - const auto &EHFrame = DwCtx.getEHFrame(); + auto DwCtx = DWARFContext::create(*E); + const auto &EHFrame = DwCtx->getEHFrame(); outs() << "BOLT-INFO: Dumping rewritten .eh_frame\n"; - EHFrame->dump(outs()); + EHFrame->dump(outs(), NoneType()); } } } void RewriteInstance::writeEHFrameHeader(SectionInfo &EHFrameSecInfo) { - DWARFFrame NewEHFrame(EHFrameSecInfo.FileAddress); - NewEHFrame.parse( - DataExtractor(StringRef(reinterpret_cast( - EHFrameSecInfo.AllocAddress), - EHFrameSecInfo.Size), - BC->AsmInfo->isLittleEndian(), - BC->AsmInfo->getPointerSize())); - if (!NewEHFrame.ParseError.empty()) { - errs() << "BOLT-ERROR: EHFrame reader failed with message \"" - << NewEHFrame.ParseError << '\n'; - exit(1); - } + DWARFDebugFrame NewEHFrame(true, EHFrameSecInfo.FileAddress); + NewEHFrame.parse(DWARFDataExtractor( + StringRef(reinterpret_cast(EHFrameSecInfo.AllocAddress), + EHFrameSecInfo.Size), + BC->AsmInfo->isLittleEndian(), BC->AsmInfo->getCodePointerSize())); auto OldSMII = EFMM->SectionMapInfo.find(".eh_frame_old"); assert(OldSMII != EFMM->SectionMapInfo.end() && "expected .eh_frame_old to be present"); auto &OldEHFrameSecInfo = OldSMII->second; - DWARFFrame OldEHFrame(OldEHFrameSecInfo.FileAddress); - OldEHFrame.parse( - DataExtractor(StringRef(reinterpret_cast( - OldEHFrameSecInfo.AllocAddress), - OldEHFrameSecInfo.Size), - BC->AsmInfo->isLittleEndian(), - BC->AsmInfo->getPointerSize())); - if 
(!OldEHFrame.ParseError.empty()) { - errs() << "BOLT-ERROR: EHFrame reader failed with message \"" - << OldEHFrame.ParseError << '\n'; - exit(1); - } + DWARFDebugFrame OldEHFrame(true, OldEHFrameSecInfo.FileAddress); + OldEHFrame.parse(DWARFDataExtractor( + StringRef(reinterpret_cast(OldEHFrameSecInfo.AllocAddress), + OldEHFrameSecInfo.Size), + BC->AsmInfo->isLittleEndian(), BC->AsmInfo->getCodePointerSize())); DEBUG(dbgs() << "BOLT: writing a new .eh_frame_hdr\n"); @@ -4107,10 +4138,10 @@ DWARFAddressRangesVector RewriteInstance::translateModuleAddressRanges( DWARFAddressRangesVector OutputRanges; for (const auto Range : InputRanges) { - auto BFI = BinaryFunctions.lower_bound(Range.first); + auto BFI = BinaryFunctions.lower_bound(Range.LowPC); while (BFI != BinaryFunctions.end()) { const auto &Function = BFI->second; - if (Function.getAddress() >= Range.second) + if (Function.getAddress() >= Range.HighPC) break; const auto FunctionRanges = Function.getOutputAddressRanges(); std::move(std::begin(FunctionRanges), diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index fbe4af2bc330..80107f223982 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -17,11 +17,12 @@ #include "BinaryFunction.h" #include "DebugData.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" +#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/MC/StringTableBuilder.h" #include "llvm/Object/ELFObjectFile.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Support/StringPool.h" #include #include @@ -29,7 +30,7 @@ namespace llvm { class DWARFContext; class DWARFFrame; -class tool_output_file; +class ToolOutputFile; namespace bolt { @@ -67,6 +68,7 @@ struct SectionInfo { bool IsReadOnly, bool IsLocal, uint64_t FileAddress = 0, uint64_t FileOffset = 0, unsigned SectionID = 0, bool IsELFNote = false) + : AllocAddress(Address), Size(Size), 
Alignment(Alignment), IsCode(IsCode), IsReadOnly(IsReadOnly), IsLocal(IsLocal), FileAddress(FileAddress), FileOffset(FileOffset), SectionID(SectionID), IsELFNote(IsELFNote) {} @@ -211,12 +213,12 @@ class RewriteInstance { /// Recursively update debug info for all DIEs in \p Unit. /// If \p Function is not empty, it points to a function corresponding /// to a parent DW_TAG_subprogram node of the current \p DIE. - void updateUnitDebugInfo(DWARFCompileUnit *Unit, - const DWARFDebugInfoEntryMinimal *DIE, + void updateUnitDebugInfo(const DWARFDie DIE, std::vector FunctionStack); /// Map all sections to their final addresses. - void mapFileSections(orc::ObjectLinkingLayer<>::ObjSetHandleT &ObjectsHandle); + void + mapFileSections(orc::RTDyldObjectLinkingLayer::ObjHandleT &ObjectsHandle); /// Update output object's values based on the final \p Layout. void updateOutputValues(const MCAsmLayout &Layout); @@ -245,7 +247,8 @@ class RewriteInstance { /// Return value for the symbol \p Name in the output. uint64_t getNewValueForSymbol(const StringRef Name) { - return OLT.findSymbol(Name, false).getAddress(); + return cantFail(OLT->findSymbol(Name, false).getAddress(), + "findSymbol failed"); } /// Return BinaryFunction containing a given \p Address or nullptr if @@ -311,9 +314,6 @@ class RewriteInstance { /// Write .eh_frame_hdr. void writeEHFrameHeader(SectionInfo &EHFrameSecInfo); - // Run ObjectLinkingLayer() with custom memory manager and symbol resolver. - orc::ObjectLinkingLayer<> OLT; - /// Disassemble and create function entries for PLT. void disassemblePLT(); @@ -385,8 +385,7 @@ class RewriteInstance { /// new address ranges in the output binary. /// \p Unit Compile uniit the object belongs to. /// \p DIE is the object's DIE in the input binary. 
- void updateDWARFObjectAddressRanges(const DWARFUnit *Unit, - const DWARFDebugInfoEntryMinimal *DIE, + void updateDWARFObjectAddressRanges(const DWARFDie DIE, uint64_t DebugRangesOffset); /// Return file offset corresponding to a given virtual address. @@ -411,8 +410,8 @@ class RewriteInstance { uint64_t Address, uint64_t Size, bool IsSimple); -private: +public: /// When updating debug info, these are the sections we overwrite. static constexpr const char *SectionsToOverwrite[] = { ".shstrtab", @@ -425,8 +424,12 @@ class RewriteInstance { ".gdb_index", }; +private: + static const char TimerGroupName[]; + static const char TimerGroupDesc[]; + /// Huge page size used for alignment. static constexpr unsigned PageAlign = 0x200000; @@ -448,11 +451,14 @@ class RewriteInstance { /// Memory manager for sections and segments. Used to communicate with ORC /// among other things. - std::unique_ptr EFMM; + std::shared_ptr EFMM; + + // Run ObjectLinkingLayer() with custom memory manager and symbol resolver. + std::unique_ptr OLT; /// Output file where we mix original code from the input binary and /// optimized code for selected functions. - std::unique_ptr Out; + std::unique_ptr Out; /// Offset in the input file where non-allocatable sections start. uint64_t FirstNonAllocatableOffset{0}; @@ -496,7 +502,7 @@ class RewriteInstance { /// Exception handling and stack unwinding information in this binary. ArrayRef LSDAData; uint64_t LSDAAddress{0}; - const llvm::DWARFFrame *EHFrame{nullptr}; + const llvm::DWARFDebugFrame *EHFrame{nullptr}; ErrorOr EHFrameSection{std::errc::bad_address}; /// .plt section. @@ -536,6 +542,8 @@ class RewriteInstance { /// Section header string table. 
StringTableBuilder SHStrTab; + StringPool SHStrTabPool; + std::vector AllSHStrTabStrings; /// A rewrite of strtab std::string NewStrTab; diff --git a/bolt/llvm-bolt.cpp b/bolt/llvm-bolt.cpp index b283e371dfc6..42040ee1442f 100644 --- a/bolt/llvm-bolt.cpp +++ b/bolt/llvm-bolt.cpp @@ -91,6 +91,13 @@ static void report_error(StringRef Message, std::error_code EC) { exit(1); } +static void report_error(StringRef Message, Error E) { + assert(E); + errs() << ToolName << ": '" << Message << "': " << toString(std::move(E)) + << ".\n"; + exit(1); +} + namespace llvm { namespace bolt { const char *BoltRevision = @@ -99,8 +106,8 @@ const char *BoltRevision = } } -static void printBoltRevision() { - errs() << "BOLT revision " << BoltRevision << "\n"; +static void printBoltRevision(llvm::raw_ostream &OS) { + OS << "BOLT revision " << BoltRevision << "\n"; } void perf2boltMode(int argc, char **argv) { @@ -139,7 +146,7 @@ void boltMode(int argc, char **argv) { int main(int argc, char **argv) { // Print a stack trace if we signal out. - sys::PrintStackTraceOnErrorSignal(); + sys::PrintStackTraceOnErrorSignal(argv[0]); PrettyStackTraceProgram X(argc, argv); llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. @@ -198,9 +205,10 @@ int main(int argc, char **argv) { } // Attempt to open the binary. 
- ErrorOr> BinaryOrErr = createBinary(opts::InputFilename); - if (std::error_code EC = BinaryOrErr.getError()) - report_error(opts::InputFilename, EC); + Expected> BinaryOrErr = + createBinary(opts::InputFilename); + if (auto E = BinaryOrErr.takeError()) + report_error(opts::InputFilename, std::move(E)); Binary &Binary = *BinaryOrErr.get().getBinary(); if (auto *e = dyn_cast(&Binary)) { diff --git a/bolt/merge-fdata/merge-fdata.cpp b/bolt/merge-fdata/merge-fdata.cpp index 7aca64561819..9c6d2a745442 100644 --- a/bolt/merge-fdata/merge-fdata.cpp +++ b/bolt/merge-fdata/merge-fdata.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/StringSet.h" #include "llvm/Object/Binary.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Signals.h" @@ -56,8 +57,7 @@ PrintFunctionList("print", "print functions sorted by execution count"), clEnumValN(ST_TOTAL_BRANCHES, "branches", - "print functions sorted by total branch count"), - clEnumValEnd), + "print functions sorted by total branch count")), cl::cat(MergeFdataCategory)); static cl::opt @@ -79,7 +79,7 @@ static void report_error(StringRef Message, std::error_code EC) { int main(int argc, char **argv) { // Print a stack trace if we signal out. - sys::PrintStackTraceOnErrorSignal(); + sys::PrintStackTraceOnErrorSignal(argv[0]); PrettyStackTraceProgram X(argc, argv); llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. From a00c2d49abee47797e2bd9b258d7bfcd6655b791 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 2 Feb 2018 16:07:11 -0800 Subject: [PATCH 379/904] [BOLT] Limited "support" for AVX-512 Summary: In relocation mode trap on entry to any function that has AVX-512 instructions. This is controlled by "-trap-avx512" option which is on by default. If the option is disabled and AVX-512 instruction is seen in relocation mode, then we abort while re-writing the binary. 
(cherry picked from commit 713004c0c1c286736725ed0849f77af71d7ff2bc) --- bolt/BinaryContext.h | 3 +++ bolt/BinaryFunction.cpp | 40 ++++++++++++++++++++++++++++++++++-- bolt/BinaryFunction.h | 17 +++++++++++++-- bolt/Passes/BinaryPasses.cpp | 12 +++++++++++ bolt/RewriteInstance.cpp | 2 +- 5 files changed, 69 insertions(+), 5 deletions(-) diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index c9067168c7f1..952524224817 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -154,6 +154,9 @@ class BinaryContext { /// True if the binary requires immediate relocation processing. bool RequiresZNow{false}; + /// List of functions that always trap. + std::vector TrappedFunctions; + BinaryContext(std::unique_ptr Ctx, std::unique_ptr DwCtx, std::unique_ptr TheTriple, diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 3e9c7febbbdd..612e5d126951 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -136,6 +136,15 @@ TimeBuild("time-build", cl::Hidden, cl::cat(BoltCategory)); +cl::opt +TrapOnAVX512("trap-avx512", + cl::desc("in relocation mode trap upon entry to any function that uses " + "AVX-512 instructions (on by default)"), + cl::init(true), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); + bool shouldPrint(const BinaryFunction &Function) { if (PrintOnly.empty() && PrintOnlyRegex.empty()) return true; @@ -1005,7 +1014,14 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { << Twine::utohexstr(Offset) << " (address 0x" << Twine::utohexstr(AbsoluteInstrAddr) << ") in function " << *this << '\n'; - IsSimple = false; + // Some AVX-512 instructions could not be disassembled at all. + if (BC.HasRelocations && opts::TrapOnAVX512 && + BC.TheTriple->getArch() == llvm::Triple::x86_64) { + setTrapOnEntry(); + BC.TrappedFunctions.push_back(this); + } else { + IsSimple = false; + } } break; } @@ -1018,7 +1034,13 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { << Twine::utohexstr(Offset) << ". 
Disassembly could be wrong." " Skipping further processing.\n"; } - IsSimple = false; + + if (BC.HasRelocations && opts::TrapOnAVX512) { + setTrapOnEntry(); + BC.TrappedFunctions.push_back(this); + } else { + IsSimple = false; + } break; } @@ -2168,6 +2190,20 @@ void BinaryFunction::emitBodyRaw(MCStreamer *Streamer) { } } +void BinaryFunction::setTrapOnEntry() { + clearList(Instructions); + clearList(IgnoredBranches); + clearList(TakenBranches); + + for (const auto EntryOffset : EntryOffsets) { + MCInst TrapInstr; + BC.MIA->createTrap(TrapInstr); + addInstruction(EntryOffset, std::move(TrapInstr)); + } + + TrapsOnEntry = true; +} + void BinaryFunction::addConstantIslandDependency(BinaryFunction *OtherBF, MCSymbol *HotSymbol, MCSymbol *ColdSymbol) { diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 43985de00949..4d6ef949291b 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -305,6 +305,9 @@ class BinaryFunction { /// for ICF optimization without relocations. bool IsFolded{false}; + /// Execution halts whenever this function is entered. + bool TrapsOnEntry{false}; + /// The address for the code for this function in codegen memory. uint64_t ImageAddress{0}; @@ -814,8 +817,9 @@ class BinaryFunction { friend class BinaryContext; /// Creation should be handled by RewriteInstance::createBinaryFunction(). - BinaryFunction(const std::string &Name, BinarySection &Section, uint64_t Address, - uint64_t Size, BinaryContext &BC, bool IsSimple) : + BinaryFunction(const std::string &Name, BinarySection &Section, + uint64_t Address, uint64_t Size, BinaryContext &BC, + bool IsSimple) : Names({Name}), Section(Section), Address(Address), Size(Size), BC(BC), IsSimple(IsSimple), CodeSectionName(".local.text." + Name), @@ -1292,6 +1296,15 @@ class BinaryFunction { return StringRef(ColdCodeSectionName); } + /// Return true iif the function will halt execution on entry. 
+ bool trapsOnEntry() const { + return TrapsOnEntry; + } + + /// Make the function always trap on entry. Other than the trap instruction, + /// the function body will be empty. + void setTrapOnEntry(); + /// Return true if the function could be correctly processed. bool isSimple() const { return IsSimple; diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index d2c5402c285d..28c42b54dcf2 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -1512,6 +1512,18 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC, outs() << "\n"; } } + + if (!BC.TrappedFunctions.empty()) { + errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() + << " functions will trap on entry"; + if (opts::Verbosity >= 1) { + errs() << ".\n"; + for (const auto *Function : BC.TrappedFunctions) + errs() << " " << *Function << '\n'; + } else { + errs() << " (use -v=1 to see the list).\n"; + } + } } void InstructionLowering::runOnFunctions( diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index be8fcbc0a06e..ebd823e6b3a1 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -2162,7 +2162,7 @@ void RewriteInstance::disassembleFunctions() { BC->InterproceduralReferences.clear(); // Fill in CFI information for this function - if (Function.isSimple()) { + if (Function.isSimple() && !Function.trapsOnEntry()) { if (!CFIRdWrt->fillCFIInfoFor(Function)) { errs() << "BOLT-ERROR: unable to fill CFI for function " << Function << ".\n"; From 7f90684b3499ecf50ba24f7232401d9d19d41566 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 14 Feb 2018 12:30:27 -0800 Subject: [PATCH 380/904] [BOLT] Improved function profile matching Summary: Prioritize functions with 100% name match when doing LTO "fuzzy" name matching. Avoid re-assigning profile to a function. 
(cherry picked from commit f9a791f40645db3d8bd01df8798011d73e0c58aa) --- bolt/Passes/BinaryPasses.cpp | 20 +++++++++++++++----- bolt/ProfileReader.cpp | 28 ++++++++++++++++++++++++++-- bolt/ProfileReader.h | 20 +++++++++++++------- 3 files changed, 54 insertions(+), 14 deletions(-) diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index 28c42b54dcf2..1fcf746fba2c 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -1352,18 +1352,23 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC, std::set &) { uint64_t NumSimpleFunctions{0}; uint64_t NumStaleProfileFunctions{0}; + uint64_t NumNonSimpleProfiledFunctions{0}; std::vector ProfiledFunctions; const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; for (auto &BFI : BFs) { auto &Function = BFI.second; - if (!Function.isSimple()) + if (!Function.isSimple()) { + if (Function.hasProfile()) { + ++NumNonSimpleProfiledFunctions; + } continue; + } ++NumSimpleFunctions; - if (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE) + if (!Function.hasProfile()) continue; - if (Function.hasValidProfile()) + if (Function.hasValidProfile()) { ProfiledFunctions.push_back(&Function); - else { + } else { if (opts::ReportStaleFuncs) { outs() << StaleFuncsHeader; StaleFuncsHeader = ""; @@ -1382,13 +1387,18 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC, << format("%.1f", NumAllProfiledFunctions / (float) NumSimpleFunctions * 100.0f) << "%) have non-empty execution profile.\n"; + if (NumNonSimpleProfiledFunctions) { + outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions + << " non-simple function(s) have profile.\n"; + } if (NumStaleProfileFunctions) { outs() << "BOLT-INFO: " << NumStaleProfileFunctions << format(" (%.1f%% of all profiled)", NumStaleProfileFunctions / (float) NumAllProfiledFunctions * 100.0f) << " function" << (NumStaleProfileFunctions == 1 ? 
"" : "s") - << " have invalid (possibly stale) profile.\n"; + << " have invalid (possibly stale) profile." + " Use -report-stale to see the list.\n"; } // Profile is marked as 'Used' if it either matches a function name diff --git a/bolt/ProfileReader.cpp b/bolt/ProfileReader.cpp index 4f09ab900cb7..c3933f8a0b0e 100644 --- a/bolt/ProfileReader.cpp +++ b/bolt/ProfileReader.cpp @@ -199,9 +199,31 @@ ProfileReader::readProfile(const std::string &FileName, buildNameMaps(Functions); YamlProfileToFunction.resize(YamlBFs.size() + 1); + + // We have to do 2 passes since LTO introduces an ambiguity in function + // names. The first pass assigns profiles that match 100% by name and + // by hash. The second pass allows name ambiguity for LTO private functions. for (auto &BFI : Functions) { auto &Function = BFI.second; auto Hash = Function.hash(true, true); + for (auto &FunctionName : Function.getNames()) { + auto PI = ProfileNameToProfile.find(FunctionName); + if (PI == ProfileNameToProfile.end()) + continue; + auto &YamlBF = *PI->getValue(); + if (YamlBF.Hash == Hash) { + matchProfileToFunction(YamlBF, Function); + } + } + } + + for (auto &BFI : Functions) { + auto &Function = BFI.second; + + if (ProfiledFunctions.count(&Function)) + continue; + + auto Hash = Function.hash(/*Recompute = */false); // was just recomputed for (auto &FunctionName : Function.getNames()) { const auto CommonName = getLTOCommonName(FunctionName); if (CommonName) { @@ -236,8 +258,10 @@ ProfileReader::readProfile(const std::string &FileName, continue; auto &YamlBF = *PI->getValue(); - matchProfileToFunction(YamlBF, Function); - break; + if (!YamlBF.Used) { + matchProfileToFunction(YamlBF, Function); + break; + } } } } diff --git a/bolt/ProfileReader.h b/bolt/ProfileReader.h index 1312ab6f3473..5268a8bb2990 100644 --- a/bolt/ProfileReader.h +++ b/bolt/ProfileReader.h @@ -20,30 +20,32 @@ namespace llvm { namespace bolt { class ProfileReader { - /// Number of function profiles that were unused by the 
reader. - uint64_t NumUnusedProfiles{0}; - - /// Map a function ID from a profile to a BinaryFunction object. + /// Map a function ID from a YAML profile to a BinaryFunction object. std::vector YamlProfileToFunction; - void reportError(StringRef Message); + /// To keep track of functions that have a matched profile before the profile + /// is attributed. + std::unordered_set ProfiledFunctions; + /// Populate \p Function profile with the one supplied in YAML format. bool parseFunctionProfile(BinaryFunction &Function, const yaml::bolt::BinaryFunctionProfile &YamlBF); - /// Profile for binary functions. + /// All function profiles in YAML format. std::vector YamlBFs; /// For LTO symbol resolution. - /// Map a common LTO prefix to a list of profiles matching the prefix. + /// Map a common LTO prefix to a list of YAML profiles matching the prefix. StringMap> LTOCommonNameMap; /// Map a common LTO prefix to a set of binary functions. StringMap> LTOCommonNameFunctionMap; + /// Strict matching of a name in a profile to its contents. StringMap ProfileNameToProfile; + /// Initialize maps for profile matching. void buildNameMaps(std::map &Functions); /// Update matched YAML -> BinaryFunction pair. @@ -53,6 +55,10 @@ class ProfileReader { YamlProfileToFunction.resize(YamlBF.Id + 1); YamlProfileToFunction[YamlBF.Id] = &BF; YamlBF.Used = true; + + assert(!ProfiledFunctions.count(&BF) && + "function already has an assigned profile"); + ProfiledFunctions.emplace(&BF); } public: From 26ac096f292df3c4d113b1dc4653a7edbba31195 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 13 Feb 2018 11:21:59 -0800 Subject: [PATCH 381/904] [BOLT] Fixes for new profile Summary: Do a better job of recording fall-through branches in new profile mode (-prof-compat-mode=0). For this we need to record offsets for all instructions that are last in the containing basic block. Change the way we convert conditional tail calls. Now we never reverse the condition. 
This is required for better profile matching. The original approach of preserving the direction was controversial to start with. Add "-infer-fall-throughs" option (on by default) to allow disabling inference of fall-through edge counts. (cherry picked from commit 255be160b0e628109c059914d9690ee1328b300f) --- bolt/BinaryBasicBlock.h | 8 +-- bolt/BinaryFunction.cpp | 66 ++++++++++++------------ bolt/BinaryFunction.h | 16 ++++-- bolt/BinaryFunctionProfile.cpp | 94 +++++++++++++++------------------- bolt/ProfileReader.cpp | 4 +- 5 files changed, 96 insertions(+), 92 deletions(-) diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index 431a8bd19e60..5627a940dd76 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -411,18 +411,18 @@ class BinaryBasicBlock { /// Add instruction at the end of this basic block. /// Returns the index of the instruction in the Instructions vector of the BB. - uint32_t addInstruction(MCInst &&Inst) { + iterator addInstruction(MCInst &&Inst) { adjustNumPseudos(Inst, 1); Instructions.emplace_back(Inst); - return Instructions.size() - 1; + return std::prev(Instructions.end()); } /// Add instruction at the end of this basic block. /// Returns the index of the instruction in the Instructions vector of the BB. - uint32_t addInstruction(const MCInst &Inst) { + iterator addInstruction(const MCInst &Inst) { adjustNumPseudos(Inst, 1); Instructions.push_back(Inst); - return Instructions.size() - 1; + return std::prev(Instructions.end()); } /// Add a range of instructions to the end of this basic block. 
diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 612e5d126951..78bab9a4ba2d 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -470,7 +470,11 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, uint64_t BBExecCount = BB->getExecutionCount(); if (hasValidProfile()) { - OS << " Exec Count : " << BBExecCount << '\n'; + OS << " Exec Count : "; + if (BB->getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE) + OS << BBExecCount << '\n'; + else + OS << "\n"; } if (BB->getCFIState() >= 0) { OS << " CFI State : " << BB->getCFIState() << '\n'; @@ -1492,7 +1496,7 @@ bool BinaryFunction::buildCFG() { BinaryBasicBlock *InsertBB{nullptr}; BinaryBasicBlock *PrevBB{nullptr}; bool IsLastInstrNop{false}; - const MCInst *PrevInstr{nullptr}; + uint64_t LastInstrOffset{0}; auto addCFIPlaceholders = [this](uint64_t CFIOffset, BinaryBasicBlock *InsertBB) { @@ -1503,6 +1507,16 @@ bool BinaryFunction::buildCFG() { } }; + // For profiling purposes we need to save the offset of the last instruction + // in the basic block. But in certain cases we don't if the instruction was + // the last one, and we have to go back and update its offset. + auto updateOffset = [&](uint64_t Offset) { + assert(PrevBB && PrevBB != InsertBB && "invalid previous block"); + auto *PrevInstr = PrevBB->getLastNonPseudoInstr(); + if (PrevInstr && !MIA->hasAnnotation(*PrevInstr, "Offset")) + MIA->addAnnotation(BC.Ctx.get(), *PrevInstr, "Offset", Offset); + }; + for (auto I = Instructions.begin(), E = Instructions.end(); I != E; ++I) { const auto Offset = I->first; auto &Instr = I->second; @@ -1515,6 +1529,8 @@ bool BinaryFunction::buildCFG() { /* DeriveAlignment = */ IsLastInstrNop); if (hasEntryPointAtOffset(Offset)) InsertBB->setEntryPoint(); + if (PrevBB) + updateOffset(LastInstrOffset); } // Ignore nops. We use nops to derive alignment of the next basic block. 
// It will not always work, as some blocks are naturally aligned, but @@ -1528,6 +1544,7 @@ bool BinaryFunction::buildCFG() { // we see an unconditional branch following a conditional one. The latter // should not be a conditional tail call. assert(PrevBB && "no previous basic block for a fall through"); + auto *PrevInstr = PrevBB->getLastNonPseudoInstr(); assert(PrevInstr && "no previous instruction for a fall through"); if (MIA->isUnconditionalBranch(Instr) && !MIA->isUnconditionalBranch(*PrevInstr) && @@ -1538,6 +1555,7 @@ bool BinaryFunction::buildCFG() { InsertBB = addBasicBlock(Offset, BC.Ctx->createTempSymbol("FT", true), /* DeriveAlignment = */ IsLastInstrNop); + updateOffset(LastInstrOffset); } } if (Offset == 0) { @@ -1545,9 +1563,10 @@ bool BinaryFunction::buildCFG() { addCFIPlaceholders(0, InsertBB); } - IsLastInstrNop = false; - InsertBB->addInstruction(Instr); - PrevInstr = &Instr; + const auto IsBlockEnd = MIA->isTerminator(Instr); + IsLastInstrNop = MIA->isNoop(Instr); + LastInstrOffset = Offset; + InsertBB->addInstruction(std::move(Instr)); // Add associated CFI instrs. We always add the CFI instruction that is // located immediately after this instruction, since the next CFI @@ -1558,9 +1577,11 @@ bool BinaryFunction::buildCFG() { CFIOffset = NextInstr->first; else CFIOffset = getSize(); + + // Note: this potentially invalidates instruction pointers/iterators. addCFIPlaceholders(CFIOffset, InsertBB); - if (MIA->isTerminator(Instr)) { + if (IsBlockEnd) { PrevBB = InsertBB; InsertBB = nullptr; } @@ -1769,10 +1790,6 @@ void BinaryFunction::addEntryPoint(uint64_t Address) { } void BinaryFunction::removeConditionalTailCalls() { - // Don't touch code if non-simple ARM - if (BC.TheTriple->getArch() == llvm::Triple::aarch64 && !isSimple()) - return; - // Blocks to be appended at the end. 
std::vector> NewBlocks; @@ -1824,29 +1841,14 @@ void BinaryFunction::removeConditionalTailCalls() { BC.MIA->convertTailCallToJmp(*CTCInstr); - // In attempt to preserve the direction of the original conditional jump, - // we will either create an unconditional jump in a separate basic block - // at the end of the function, or reverse a condition of the jump - // and create a fall-through block right after the original tail call. - if (getAddress() >= *TargetAddressOrNone) { - // Insert the basic block right after the current one. - std::vector> TCBB; - TCBB.emplace_back(std::move(TailCallBB)); - BBI = insertBasicBlocks(BBI, - std::move(TCBB), - /* UpdateLayout */ true, - /* UpdateCFIState */ false); - BC.MIA->reverseBranchCondition( - *CTCInstr, (*std::next(BBI)).getLabel(), BC.Ctx.get()); + BC.MIA->replaceBranchTarget(*CTCInstr, TailCallBB->getLabel(), + BC.Ctx.get()); - } else { - BC.MIA->replaceBranchTarget(*CTCInstr, TailCallBB->getLabel(), - BC.Ctx.get()); - // Add basic block to the list that will be added to the end. - NewBlocks.emplace_back(std::move(TailCallBB)); - // Swap edges as the TailCallBB corresponds to the taken branch. - BB.swapConditionalSuccessors(); - } + // Add basic block to the list that will be added to the end. + NewBlocks.emplace_back(std::move(TailCallBB)); + + // Swap edges as the TailCallBB corresponds to the taken branch. + BB.swapConditionalSuccessors(); // This branch is no longer a conditional tail call. 
BC.MIA->unsetConditionalTailCall(*CTCInstr); diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 4d6ef949291b..7d71488e4e3b 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -189,12 +189,22 @@ using IndirectCallSiteProfile = SmallVector; inline raw_ostream &operator<<(raw_ostream &OS, const bolt::IndirectCallSiteProfile &ICSP) { - const char *Sep = ""; + std::string TempString; + raw_string_ostream SS(TempString); + + const char *Sep = "\n "; + uint64_t TotalCount = 0; + uint64_t TotalMispreds = 0; for (auto &CSP : ICSP) { - OS << Sep << "{ " << (CSP.IsFunction ? CSP.Name : "") << ": " + SS << Sep << "{ " << (CSP.IsFunction ? CSP.Name : "") << ": " << CSP.Count << " (" << CSP.Mispreds << " misses) }"; - Sep = ", "; + Sep = ",\n "; + TotalCount += CSP.Count; + TotalMispreds += CSP.Mispreds; } + SS.flush(); + + OS << TotalCount << " (" << TotalMispreds << " misses) :" << TempString; return OS; } diff --git a/bolt/BinaryFunctionProfile.cpp b/bolt/BinaryFunctionProfile.cpp index 6b84ec559a8c..55098f6bf1a0 100644 --- a/bolt/BinaryFunctionProfile.cpp +++ b/bolt/BinaryFunctionProfile.cpp @@ -69,14 +69,22 @@ FixFuncCounts("fix-func-counts", cl::Hidden, cl::cat(BoltOptCategory)); +static cl::opt +InferFallThroughs("infer-fall-throughs", + cl::desc("infer execution count for fall-through blocks"), + cl::init(true), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + } // namespace opts namespace llvm { namespace bolt { bool BinaryFunction::recordTrace( - const LBREntry &First, - const LBREntry &Second, + const LBREntry &FirstLBR, + const LBREntry &SecondLBR, uint64_t Count, SmallVector, 16> *Branches) { if (!isSimple()) @@ -85,8 +93,8 @@ bool BinaryFunction::recordTrace( assert(CurrentState == State::CFG && "can only record traces in CFG state"); // Offsets of the trace within this function. 
- const auto From = First.To - getAddress(); - const auto To = Second.From - getAddress(); + const auto From = FirstLBR.To - getAddress(); + const auto To = SecondLBR.From - getAddress(); if (From > To) return false; @@ -97,47 +105,27 @@ bool BinaryFunction::recordTrace( if (!FromBB || !ToBB) return false; + // Adjust FromBB if the first LBR is a return from the last instruction in + // the previous block (that instruction should be a call). + if (From == FromBB->getOffset() && !containsAddress(FirstLBR.From) && + !FromBB->isEntryPoint() && !FromBB->isLandingPad()) { + auto *PrevBB = BasicBlocksLayout[FromBB->getIndex() - 1]; + if (PrevBB->getSuccessor(FromBB->getLabel())) { + const auto *Instr = PrevBB->getLastNonPseudoInstr(); + if (Instr && BC.MIA->isCall(*Instr)) { + FromBB = PrevBB; + } else { + DEBUG(dbgs() << "invalid incoming LBR (no call): " << FirstLBR << '\n'); + } + } else { + DEBUG(dbgs() << "invalid incoming LBR: " << FirstLBR << '\n'); + } + } + // Fill out information for fall-through edges. The From and To could be // within the same basic block, e.g. when two call instructions are in the // same block. In this case we skip the processing. if (FromBB == ToBB) { - if (opts::CompatMode) - return true; - - // If the previous block ended with a call, the destination of a return - // would be in ToBB basic block. And if the ToBB starts with a control - // transfer instruction, we will have a 0-length trace that we have to - // account for as a fall-through edge. - if (To == ToBB->getOffset()) { - // External entry point. - if (ToBB->isEntryPoint() || ToBB->isLandingPad()) - return true; - - // Check that the origin LBR of a trace starts in another function. - // Otherwise it's an internal branch that was accounted for. - if (containsAddress(First.From)) - return true; - - auto *PrevBB = BasicBlocksLayout[ToBB->getIndex() - 1]; - - // This could be a bad trace. 
- if (!PrevBB->getSuccessor(ToBB->getLabel())) { - DEBUG(dbgs() << "invalid LBR sequence:\n" - << " " << First << '\n' - << " " << Second << '\n'); - return false; - } - - auto &BI = PrevBB->getBranchInfo(*ToBB); - BI.Count += Count; - if (Branches) { - const auto *Instr = PrevBB->getLastNonPseudoInstr(); - const auto Offset = - BC.MIA->getAnnotationWithDefault(*Instr, "Offset"); - Branches->push_back(std::make_pair(Offset, ToBB->getOffset())); - } - } - return true; } @@ -151,8 +139,8 @@ bool BinaryFunction::recordTrace( // Check for bad LBRs. if (!BB->getSuccessor(NextBB->getLabel())) { DEBUG(dbgs() << "no fall-through for the trace:\n" - << " " << First << '\n' - << " " << Second << '\n'); + << " " << FirstLBR << '\n' + << " " << SecondLBR << '\n'); return false; } @@ -166,12 +154,13 @@ bool BinaryFunction::recordTrace( if (Branches) { const auto *Instr = BB->getLastNonPseudoInstr(); - // Note: real offset for conditional jump instruction shouldn't be 0. - const auto Offset = - BC.MIA->getAnnotationWithDefault(*Instr, "Offset"); - if (Offset) { - Branches->push_back(std::make_pair(Offset, NextBB->getOffset())); + uint64_t Offset{0}; + if (Instr) { + Offset = BC.MIA->getAnnotationWithDefault(*Instr, "Offset"); + } else { + Offset = BB->getOffset(); } + Branches->emplace_back(std::make_pair(Offset, NextBB->getOffset())); } } @@ -374,7 +363,8 @@ void BinaryFunction::postProcessProfile() { } } - inferFallThroughCounts(); + if (opts::InferFallThroughs) + inferFallThroughCounts(); // Update profile information for jump tables based on CFG branch data. 
for (auto *BB : BasicBlocks) { @@ -421,11 +411,11 @@ void BinaryFunction::postProcessProfile() { } Optional, 16>> -BinaryFunction::getFallthroughsInTrace(const LBREntry &First, - const LBREntry &Second) { +BinaryFunction::getFallthroughsInTrace(const LBREntry &FirstLBR, + const LBREntry &SecondLBR) { SmallVector, 16> Res; - if (!recordTrace(First, Second, 1, &Res)) + if (!recordTrace(FirstLBR, SecondLBR, 1, &Res)) return NoneType(); return Res; diff --git a/bolt/ProfileReader.cpp b/bolt/ProfileReader.cpp index c3933f8a0b0e..ffac6b614bf3 100644 --- a/bolt/ProfileReader.cpp +++ b/bolt/ProfileReader.cpp @@ -161,7 +161,9 @@ ProfileReader::parseFunctionProfile(BinaryFunction &BF, continue; } - BB.setSuccessorBranchInfo(SuccessorBB, YamlSI.Count, YamlSI.Mispreds); + auto &BI = BB.getBranchInfo(SuccessorBB); + BI.Count += YamlSI.Count; + BI.MispredictedCount += YamlSI.Mispreds; } } From 21e6fe5fc15f7115800840eca480a90028a18a63 Mon Sep 17 00:00:00 2001 From: Andy Newell Date: Fri, 9 Feb 2018 09:58:19 -0800 Subject: [PATCH 382/904] Cache+ speed, reduce mallocs Summary: Speed of cache+ by skipping mallocs on vectors. Although this change speeds up the algorithm by 2x, this is still not enough for some binaries where some functions have ~2500 hot basic blocks. Hence, introduce a threshold for expensive optimizations in CachePlusReorderAlgorithm. If the number of hot basic blocks exceeds the threshold (2048 by default), we use a cheaper version, which is quite fast. 
(cherry picked from commit 1c8ba8b025d34162b2c8bee10f2c41b26da9fc59) --- bolt/CacheMetrics.cpp | 2 + bolt/Passes/CachePlusReorderAlgorithm.cpp | 207 +++++++++++++++------- 2 files changed, 141 insertions(+), 68 deletions(-) diff --git a/bolt/CacheMetrics.cpp b/bolt/CacheMetrics.cpp index 638872dc67c2..795706135a45 100644 --- a/bolt/CacheMetrics.cpp +++ b/bolt/CacheMetrics.cpp @@ -116,6 +116,8 @@ double calcExtTSPScore( double Score = 0.0; for (auto BF : BinaryFunctions) { + if (!BF->hasProfile()) + continue; for (auto SrcBB : BF->layout()) { auto BI = SrcBB->branch_info_begin(); for (auto DstBB : SrcBB->successors()) { diff --git a/bolt/Passes/CachePlusReorderAlgorithm.cpp b/bolt/Passes/CachePlusReorderAlgorithm.cpp index d8a46912389a..cc25643e4619 100644 --- a/bolt/Passes/CachePlusReorderAlgorithm.cpp +++ b/bolt/Passes/CachePlusReorderAlgorithm.cpp @@ -14,11 +14,25 @@ #include "CacheMetrics.h" #include "ReorderAlgorithm.h" #include "ReorderUtils.h" +#include "llvm/Support/Options.h" using namespace llvm; using namespace bolt; using EdgeList = std::vector>; +namespace opts { + +extern cl::OptionCategory BoltOptCategory; + +cl::opt +ClusterSplitThreshold("cluster-split-threshold", + cl::desc("The maximum size of a function to apply splitting of clusters"), + cl::init(2048), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +} + namespace llvm { namespace bolt { @@ -88,6 +102,59 @@ class Cluster { double Score; }; +using ClusterIter = std::vector::const_iterator; + +// A wrapper around three clusters of basic blocks; it is used to avoid extra +// instantiation of the vectors. 
+class MergedCluster { +public: + MergedCluster(ClusterIter Begin1, + ClusterIter End1, + ClusterIter Begin2, + ClusterIter End2, + ClusterIter Begin3, + ClusterIter End3) + : Begin1(Begin1), + End1(End1), + Begin2(Begin2), + End2(End2), + Begin3(Begin3), + End3(End3) {} + + template + void forEach(const F &Func) const { + for (auto It = Begin1; It != End1; It++) + Func(*It); + for (auto It = Begin2; It != End2; It++) + Func(*It); + for (auto It = Begin3; It != End3; It++) + Func(*It); + } + + std::vector getBlocks() const { + std::vector Result; + Result.reserve(std::distance(Begin1, End1) + + std::distance(Begin2, End2) + + std::distance(Begin3, End3)); + Result.insert(Result.end(), Begin1, End1); + Result.insert(Result.end(), Begin2, End2); + Result.insert(Result.end(), Begin3, End3); + return Result; + } + + const BinaryBasicBlock *getFirstBlock() const { + return *Begin1; + } + +private: + ClusterIter Begin1; + ClusterIter End1; + ClusterIter Begin2; + ClusterIter End2; + ClusterIter Begin3; + ClusterIter End3; +}; + /// Deterministically compare clusters by their density in decreasing order bool compareClusters(const Cluster *C1, const Cluster *C2) { // original entry point to the front @@ -140,8 +207,11 @@ bool compareClusterPairs(const Cluster *A1, const Cluster *B1, /// while keeping the implementation sufficiently fast. 
class CachePlus { public: - CachePlus(const BinaryFunction &BF) - : BF(BF), Adjacent(BF.layout_size()), Cache(BF.layout_size()) { + CachePlus(const BinaryFunction &BF, bool UseClusterSplitting) + : BF(BF), + UseClusterSplitting(UseClusterSplitting), + Adjacent(BF.layout_size()), + Cache(BF.layout_size()) { initialize(); } @@ -338,31 +408,37 @@ class CachePlus { } /// Compute ExtTSP score for a given order of basic blocks - double score(const std::vector& Blocks) const { + double score(const MergedCluster& MergedBlocks) const { uint64_t NotSet = static_cast(-1); - auto Addr = std::vector(BF.layout_size(), NotSet); + EstimatedAddr.assign(BF.layout_size(), NotSet); + uint64_t CurAddr = 0; - for (auto BB : Blocks) { - size_t Index = BB->getLayoutIndex(); - Addr[Index] = CurAddr; - CurAddr += Size[Index]; - } + MergedBlocks.forEach( + [&](const BinaryBasicBlock *BB) { + size_t Index = BB->getLayoutIndex(); + EstimatedAddr[Index] = CurAddr; + CurAddr += Size[Index]; + } + ); double Score = 0; - for (auto BB : Blocks) { - size_t Index = BB->getLayoutIndex(); - for (auto Edge : OutEdges[Index]) { - auto SuccBB = Edge.first; - size_t SuccIndex = SuccBB->getLayoutIndex(); - - if (Addr[SuccBB->getLayoutIndex()] != NotSet) { - Score += CacheMetrics::extTSPScore(Addr[Index], - Size[Index], - Addr[SuccIndex], - Edge.second); + MergedBlocks.forEach( + [&](const BinaryBasicBlock *BB) { + size_t Index = BB->getLayoutIndex(); + for (auto Edge : OutEdges[Index]) { + auto SuccBB = Edge.first; + size_t SuccIndex = SuccBB->getLayoutIndex(); + + if (EstimatedAddr[SuccIndex] != NotSet) { + Score += CacheMetrics::extTSPScore(EstimatedAddr[Index], + Size[Index], + EstimatedAddr[SuccIndex], + Edge.second); + } } } - } + ); + return Score; } @@ -391,7 +467,7 @@ class CachePlus { MergeType); // Does the new cluster preserve the original entry point? 
if ((ClusterPred->isEntryPoint() || ClusterSucc->isEntryPoint()) && - MergedBlocks[0]->getLayoutIndex() != 0) + MergedBlocks.getFirstBlock()->getLayoutIndex() != 0) return CurGain; // The score of the new cluster @@ -405,18 +481,20 @@ class CachePlus { std::pair Gain = std::make_pair(-1, 0); // Try to concatenate two clusters w/o splitting Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, 0); - // Try to split ClusterPred into two and merge with ClusterSucc - for (size_t Offset = 1; Offset < ClusterPred->blocks().size(); Offset++) { - // Make sure the splitting does not break FT successors - auto BB = ClusterPred->blocks()[Offset - 1]; - if (FallthroughSucc[BB->getLayoutIndex()] != nullptr) { - assert(FallthroughSucc[BB->getLayoutIndex()] == ClusterPred->blocks()[Offset]); - continue; - } + if (UseClusterSplitting) { + // Try to split ClusterPred into two and merge with ClusterSucc + for (size_t Offset = 1; Offset < ClusterPred->blocks().size(); Offset++) { + // Make sure the splitting does not break FT successors + auto BB = ClusterPred->blocks()[Offset - 1]; + if (FallthroughSucc[BB->getLayoutIndex()] != nullptr) { + assert(FallthroughSucc[BB->getLayoutIndex()] == ClusterPred->blocks()[Offset]); + continue; + } - for (size_t Type = 0; Type < 4; Type++) { - size_t MergeType = 1 + Type + Offset * 4; - Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, MergeType); + for (size_t Type = 0; Type < 4; Type++) { + size_t MergeType = 1 + Type + Offset * 4; + Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, MergeType); + } } } @@ -426,29 +504,16 @@ class CachePlus { /// Merge two clusters (orders) of blocks according to a given 'merge type'. /// - /// If MergeType == 0, then the results is a concatentation of two clusters. + /// If MergeType == 0, then the result is a concatentation of two clusters. /// Otherwise, the first cluster is cut into two and we consider all possible /// ways of concatenating three clusters. 
- std::vector mergeBlocks( - const std::vector &X, - const std::vector &Y, - size_t MergeType - ) const { - // Concatenate three clusters of blocks in the given order - auto concat = [&](const std::vector &A, - const std::vector &B, - const std::vector &C) { - std::vector Result; - Result.reserve(A.size() + B.size() + C.size()); - Result.insert(Result.end(), A.begin(), A.end()); - Result.insert(Result.end(), B.begin(), B.end()); - Result.insert(Result.end(), C.begin(), C.end()); - return Result; - }; - + MergedCluster mergeBlocks(const std::vector &X, + const std::vector &Y, + size_t MergeType) const { // Merging w/o splitting existing clusters if (MergeType == 0) { - return concat(X, Y, std::vector()); + ClusterIter Empty; + return MergedCluster(X.begin(), X.end(), Y.begin(), Y.end(), Empty, Empty); } MergeType--; @@ -457,15 +522,19 @@ class CachePlus { assert(0 < Offset && Offset < X.size() && "Invalid offset while merging clusters"); // Split the first cluster, X, into X1 and X2 - std::vector X1(X.begin(), X.begin() + Offset); - std::vector X2(X.begin() + Offset, X.end()); + ClusterIter BeginX1 = X.begin(); + ClusterIter EndX1 = X.begin() + Offset; + ClusterIter BeginX2 = X.begin() + Offset; + ClusterIter EndX2 = X.end(); + ClusterIter BeginY = Y.begin(); + ClusterIter EndY = Y.end(); // Construct a new cluster from three existing ones switch(Type) { - case 0: return concat(X1, Y, X2); - case 1: return concat(Y, X2, X1); - case 2: return concat(X2, Y, X1); - case 3: return concat(X2, X1, Y); + case 0: return MergedCluster(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2); + case 1: return MergedCluster(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1); + case 2: return MergedCluster(BeginX2, EndX2, BeginY, EndY, BeginX1, EndX1); + case 3: return MergedCluster(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY); default: llvm_unreachable("unexpected merge type"); } @@ -479,7 +548,7 @@ class CachePlus { // Merge the blocks of clusters auto MergedBlocks = 
mergeBlocks(Into->blocks(), From->blocks(), MergeType); - Into->merge(From, MergedBlocks, score(MergedBlocks)); + Into->merge(From, MergedBlocks.getBlocks(), score(MergedBlocks)); // Remove cluster From from the list of active clusters auto Iter = std::remove(Clusters.begin(), Clusters.end(), From); @@ -495,6 +564,9 @@ class CachePlus { // The binary function const BinaryFunction &BF; + // Indicates whether to use cluster splitting for optimization + bool UseClusterSplitting; + // All clusters std::vector AllClusters; @@ -520,6 +592,9 @@ class CachePlus { // containing both x and y and all clusters adjacent to x and y (and recompute // them on the next iteration). mutable ClusterPairCache> Cache; + + // A reusable vector used within score() method + mutable std::vector EstimatedAddr; }; void CachePlusReorderAlgorithm::reorderBasicBlocks( @@ -528,18 +603,14 @@ void CachePlusReorderAlgorithm::reorderBasicBlocks( return; // Are there jumps with positive execution count? - uint64_t SumCount = 0; + size_t NumHotBlocks = 0; for (auto BB : BF.layout()) { - auto BI = BB->branch_info_begin(); - for (auto I : BB->successors()) { - assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && I != nullptr); - SumCount += BI->Count; - ++BI; - } + if (BB->getKnownExecutionCount() > 0) + NumHotBlocks++; } // Do not change layout of functions w/o profile information - if (SumCount == 0) { + if (NumHotBlocks == 0) { for (auto BB : BF.layout()) { Order.push_back(BB); } @@ -547,7 +618,7 @@ void CachePlusReorderAlgorithm::reorderBasicBlocks( } // Apply the algorithm - Order = CachePlus(BF).run(); + Order = CachePlus(BF, NumHotBlocks <= opts::ClusterSplitThreshold).run(); // Verify correctness assert(Order[0]->isEntryPoint() && "Original entry point is not preserved"); From 927ce0168cdb386ddff943e79f0023da07d0cc0e Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 22 Feb 2018 11:20:46 -0800 Subject: [PATCH 383/904] [BOLT] Fix jump table placement for non-simple functions Summary: 
When we move a jump table to either hot or cold new section (-jump-tables=move), we rely on a number of taken branches from the table to decide if it's hot or cold. However, if the function is non-simple, we always get 0 count, and always move the table to the cold section. Instead, we should make a conservative decision based on the execution count of the function. (cherry picked from commit fee9e6d92f5ec92c98a8b34d4fced523044a8166) --- bolt/BinaryFunction.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 78bab9a4ba2d..5c382da35447 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -3287,8 +3287,14 @@ void BinaryFunction::emitJumpTables(MCStreamer *Streamer) { ELF::SHF_ALLOC); ColdSection = HotSection; } else { - HotSection = BC.MOFI->getReadOnlySection(); - ColdSection = BC.MOFI->getReadOnlyColdSection(); + if (isSimple()) { + HotSection = BC.MOFI->getReadOnlySection(); + ColdSection = BC.MOFI->getReadOnlyColdSection(); + } else { + HotSection = hasProfile() ? BC.MOFI->getReadOnlySection() + : BC.MOFI->getReadOnlyColdSection(); + ColdSection = HotSection; + } } JT.emit(Streamer, HotSection, ColdSection); } From e7687fa0fc483025cee03f7ad5ef932b29e501e7 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 1 Feb 2018 16:33:43 -0800 Subject: [PATCH 384/904] [BOLT] Refactoring of section handling code Summary: This is a big refactoring of the section handling code. I've removed the SectionInfoMap and NoteSectionInfo and stored all the associated info about sections in BinaryContext and BinarySection classes. BinarySections should now hold all the info we care about for each section. They can be initialized from SectionRefs but don't necessarily require one to be created. There are only one or two spots that needed access to the original SectionRef to work properly. 
The trickiest part was making sure RewriteInstance.cpp iterated over the proper sets of sections for each of it's different types of processing. The different sets are broken down roughly as allocatable and non-alloctable and "registered" (I couldn't think up a better name). "Registered" means that the section has been updated to include output information, i.e. contents, file offset/address, new size, etc. It may help to have special iterators on BinaryContext to iterate over the different classes to make things easier. I can do that if you guys think it is worthwhile. I found pointee_iterator in the llvm ADT code. Use that for iterating over BBs in BinaryFunction rather than the custom iterator class. (cherry picked from commit 9a5ef399fee0114f85e5f22f2f7b1c950ee9632f) --- bolt/BinaryContext.cpp | 123 ++++++-- bolt/BinaryContext.h | 70 ++++- bolt/BinaryFunction.cpp | 16 +- bolt/BinaryFunction.h | 31 +- bolt/BinarySection.cpp | 34 +++ bolt/BinarySection.h | 335 +++++++++++++++++--- bolt/DWARFRewriter.cpp | 67 ++-- bolt/RewriteInstance.cpp | 645 ++++++++++++++++++++------------------- bolt/RewriteInstance.h | 52 +--- 9 files changed, 853 insertions(+), 520 deletions(-) diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 363139c1d1b3..e6000c01dce9 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -23,6 +23,9 @@ using namespace llvm; using namespace bolt; +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt" + namespace opts { extern cl::OptionCategory BoltCategory; @@ -31,23 +34,30 @@ static cl::opt PrintDebugInfo("print-debug-info", cl::desc("print debug info when printing functions"), cl::Hidden, + cl::ZeroOrMore, cl::cat(BoltCategory)); -static cl::opt +cl::opt PrintRelocations("print-relocations", - cl::desc("print relocations when printing functions"), + cl::desc("print relocations when printing functions/objects"), cl::Hidden, + cl::ZeroOrMore, cl::cat(BoltCategory)); static cl::opt PrintMemData("print-mem-data", cl::desc("print memory 
data annotations when printing functions"), cl::Hidden, + cl::ZeroOrMore, cl::cat(BoltCategory)); } // namespace opts -BinaryContext::~BinaryContext() { } +BinaryContext::~BinaryContext() { + for (auto *Section : Sections) { + delete Section; + } +} std::unique_ptr BinaryContext::createObjectWriter(raw_pwrite_stream &OS) { @@ -474,20 +484,82 @@ BinaryContext::getSectionForAddress(uint64_t Address) const { return std::make_error_code(std::errc::bad_address); } -BinarySection &BinaryContext::registerSection(SectionRef Section) { - StringRef Name; - Section.getName(Name); - auto Res = Sections.insert(BinarySection(Section)); +BinarySection &BinaryContext::registerSection(BinarySection *Section) { + assert(!Section->getName().empty() && + "can't register sections without a name"); + auto Res = Sections.insert(Section); assert(Res.second && "can't register the same section twice."); - // Cast away const here because std::set always stores values by - // const. It's ok to do this because we can never change the - // BinarySection properties that affect set ordering. - auto *BS = const_cast(&*Res.first); // Only register sections with addresses in the AddressToSection map. 
- if (Section.getAddress()) - AddressToSection.insert(std::make_pair(Section.getAddress(), BS)); - NameToSection.insert(std::make_pair(Name, BS)); - return *BS; + if (Section->getAddress()) + AddressToSection.insert(std::make_pair(Section->getAddress(), Section)); + NameToSection.insert(std::make_pair(Section->getName(), Section)); + DEBUG(dbgs() << "BOLT-DEBUG: registering " << *Section << "\n"); + return *Section; +} + +BinarySection &BinaryContext::registerSection(SectionRef Section) { + return registerSection(new BinarySection(Section)); +} + +BinarySection &BinaryContext::registerOrUpdateSection(StringRef Name, + unsigned ELFType, + unsigned ELFFlags, + uint8_t *Data, + uint64_t Size, + unsigned Alignment, + bool IsLocal) { + auto NamedSections = getSectionByName(Name); + if (NamedSections.begin() != NamedSections.end()) { + assert(std::next(NamedSections.begin()) == NamedSections.end() && + "can only update unique sections"); + auto *Section = NamedSections.begin()->second; + + DEBUG(dbgs() << "BOLT-DEBUG: updating " << *Section << " -> "); + const auto Flag = Section->isAllocatable(); + Section->update(Data, Size, Alignment, ELFType, ELFFlags, IsLocal); + DEBUG(dbgs() << *Section << "\n"); + assert(Flag == Section->isAllocatable() && + "can't change section allocation status"); + return *Section; + } + + return registerSection(new BinarySection(Name, Data, Size, Alignment, + ELFType, ELFFlags, IsLocal)); +} + +bool BinaryContext::deregisterSection(BinarySection &Section) { + auto *SectionPtr = &Section; + auto Itr = Sections.find(SectionPtr); + if (Itr != Sections.end()) { + auto Range = AddressToSection.equal_range(SectionPtr->getAddress()); + while (Range.first != Range.second) { + if (Range.first->second == SectionPtr) { + AddressToSection.erase(Range.first); + break; + } + ++Range.first; + } + + auto NameRange = NameToSection.equal_range(SectionPtr->getName()); + while (NameRange.first != NameRange.second) { + if (NameRange.first->second == SectionPtr) { 
+ NameToSection.erase(NameRange.first); + break; + } + ++NameRange.first; + } + + Sections.erase(Itr); + delete SectionPtr; + return true; + } + return false; +} + +void BinaryContext::printSections(raw_ostream &OS) const { + for (auto &Section : Sections) { + OS << "BOLT-INFO: " << *Section << "\n"; + } } ErrorOr @@ -504,27 +576,24 @@ BinaryContext::extractPointerAtAddress(uint64_t Address) const { return DE.getAddress(&SectionOffset); } -void BinaryContext::addSectionRelocation(BinarySection &Section, - uint64_t Offset, - MCSymbol *Symbol, - uint64_t Type, - uint64_t Addend) { - Section.addRelocation(Offset, Symbol, Type, Addend); -} - void BinaryContext::addRelocation(uint64_t Address, MCSymbol *Symbol, uint64_t Type, - uint64_t Addend) { + uint64_t Addend, + uint64_t Value) { auto Section = getSectionForAddress(Address); assert(Section && "cannot find section for address"); - Section->addRelocation(Address - Section->getAddress(), Symbol, Type, Addend); + Section->addRelocation(Address - Section->getAddress(), + Symbol, + Type, + Addend, + Value); } -void BinaryContext::removeRelocationAt(uint64_t Address) { +bool BinaryContext::removeRelocationAt(uint64_t Address) { auto Section = getSectionForAddress(Address); assert(Section && "cannot find section for address"); - Section->removeRelocationAt(Address - Section->getAddress()); + return Section->removeRelocationAt(Address - Section->getAddress()); } const Relocation *BinaryContext::getRelocationAt(uint64_t Address) { diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 952524224817..bea5ec4a4f21 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -16,6 +16,7 @@ #include "BinarySection.h" #include "DebugData.h" +#include "llvm/ADT/iterator.h" #include "llvm/ADT/Triple.h" #include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" @@ -58,9 +59,17 @@ class BinaryContext { BinaryContext() = delete; /// Set of all sections. 
- using SectionSetType = std::set; + struct CompareSections { + bool operator()(const BinarySection *A, const BinarySection *B) const { + return *A < *B; + } + }; + using SectionSetType = std::set; SectionSetType Sections; + using SectionIterator = pointee_iterator; + using SectionConstIterator = pointee_iterator; + /// Map virtual address to a section. It is possible to have more than one /// section mapped to the same address, e.g. non-allocatable sections. using AddressToSectionMapType = std::multimap; @@ -70,6 +79,9 @@ class BinaryContext { /// have multiple sections with the same name. using NameToSectionMapType = std::multimap; NameToSectionMapType NameToSection; + + /// Low level section registration. + BinarySection ®isterSection(BinarySection *Section); public: /// [name] -> [address] map used for global symbol resolution. @@ -125,8 +137,6 @@ class BinaryContext { std::unique_ptr MAB; - std::function ErrorCheck; - DataReader &DR; /// Indicates if relocations are availabe for usage. @@ -224,18 +234,53 @@ class BinaryContext { ErrorOr> getFunctionData(const BinaryFunction &Function) const; - /// Register information about the given section so we can look up - /// sections for addresses. + /// Register information about the given \p Section so we can look up + /// sections by address. BinarySection ®isterSection(SectionRef Section); - iterator_range sections() { + /// Register or update the information for the section with the given + /// /p Name. If the section already exists, the information in the + /// section will be updated with the new data. + BinarySection ®isterOrUpdateSection(StringRef Name, + unsigned ELFType, + unsigned ELFFlags, + uint8_t *Data = nullptr, + uint64_t Size = 0, + unsigned Alignment = 1, + bool IsLocal = false); + + /// Register the information for the note (non-allocatable) section + /// with the given /p Name. If the section already exists, the + /// information in the section will be updated with the new data. 
+ BinarySection ®isterOrUpdateNoteSection(StringRef Name, + uint8_t *Data = nullptr, + uint64_t Size = 0, + unsigned Alignment = 1, + bool IsReadOnly = true, + unsigned ELFType = ELF::SHT_PROGBITS, + bool IsLocal = false) { + return registerOrUpdateSection(Name, ELFType, + BinarySection::getFlags(IsReadOnly), + Data, Size, Alignment, IsLocal); + } + + /// Remove the given /p Section from the set of all sections. Return + /// true if the section was removed (and deleted), otherwise false. + bool deregisterSection(BinarySection &Section); + + /// Iterate over all registered sections. + iterator_range sections() { return make_range(Sections.begin(), Sections.end()); } - iterator_range sections() const { + /// Iterate over all registered sections. + iterator_range sections() const { return make_range(Sections.begin(), Sections.end()); } + /// Print all sections. + void printSections(raw_ostream& OS) const; + /// Return largest section containing the given \p Address. These /// functions only work for allocatable sections, i.e. ones with non-zero /// addresses. @@ -305,17 +350,12 @@ class BinaryContext { BinaryFunction &ParentBF, std::map &BFs); - /// Add relocation for \p Section at a given \p Offset. - void addSectionRelocation(BinarySection &Section, uint64_t Offset, - MCSymbol *Symbol, uint64_t Type, - uint64_t Addend = 0); - - /// Add a relocation at a given \p Address. + /// Add a Section relocation at a given \p Address. void addRelocation(uint64_t Address, MCSymbol *Symbol, uint64_t Type, - uint64_t Addend = 0); + uint64_t Addend = 0, uint64_t Value = 0); /// Remove registered relocation at a given \p Address. - void removeRelocationAt(uint64_t Address); + bool removeRelocationAt(uint64_t Address); /// Return a relocation registered at a given \p Address, or nullptr if there /// is no relocation at such address. 
diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 5c382da35447..0f59195919a5 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -1312,8 +1312,18 @@ void BinaryFunction::postProcessJumpTables() { TakenBranches.emplace_back(JTSiteOffset, TargetOffset); // Take ownership of jump table relocations. - if (BC.HasRelocations) - BC.removeRelocationAt(JT->Address + EntryOffset); + if (BC.HasRelocations) { + auto EntryAddress = JT->Address + EntryOffset; + auto Res = BC.removeRelocationAt(EntryAddress); + (void)Res; + DEBUG( + auto Section = BC.getSectionForAddress(EntryAddress); + auto Offset = EntryAddress - Section->getAddress(); + dbgs() << "BOLT-DEBUG: removing relocation from section " + << Section->getName() << " at offset 0x" + << Twine::utohexstr(Offset) << " = " + << Res << '\n'); + } EntryOffset += JT->EntrySize; @@ -3363,7 +3373,7 @@ void BinaryFunction::JumpTable::updateOriginal(BinaryContext &BC) { << " at offset " << Twine::utohexstr(Offset) << " for symbol " << Entry->getName() << " with addend " << Twine::utohexstr(RelAddend) << '\n'); - BC.addSectionRelocation(*Section, Offset, Entry, RelType, RelAddend); + Section->addRelocation(Offset, Entry, RelType, RelAddend); Offset += EntrySize; } } diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 7d71488e4e3b..87bfb15ad654 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -24,6 +24,7 @@ #include "DebugData.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/ilist.h" +#include "llvm/ADT/iterator.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" @@ -600,7 +601,7 @@ class BinaryFunction { std::map Labels; /// Corresponding section if any. - SectionInfo *SecInfo{nullptr}; + ErrorOr Section{std::errc::bad_address}; /// Corresponding section name if any. std::string SectionName; @@ -747,21 +748,6 @@ class BinaryFunction { /// Count the number of functions created. 
static uint64_t Count; - template - class Iterator : public std::iterator { - public: - Iterator &operator++() { ++itr; return *this; } - Iterator &operator--() { --itr; return *this; } - Iterator operator++(int) { auto tmp(itr); itr++; return tmp; } - Iterator operator--(int) { auto tmp(itr); itr--; return tmp; } - bool operator==(const Iterator& other) const { return itr == other.itr; } - bool operator!=(const Iterator& other) const { return itr != other.itr; } - T& operator*() { return **itr; } - Iterator(Itr itr) : itr(itr) { } - private: - Itr itr; - }; - /// Register alternative function name. void addAlternativeName(std::string NewName) { Names.emplace_back(NewName); @@ -842,13 +828,12 @@ class BinaryFunction { BinaryFunction(BinaryFunction &&) = default; - typedef Iterator iterator; - typedef Iterator const_iterator; - typedef Iterator reverse_iterator; - typedef Iterator const_reverse_iterator; + using iterator = pointee_iterator; + using const_iterator = pointee_iterator; + using reverse_iterator = + pointee_iterator; + using const_reverse_iterator = + pointee_iterator; typedef BasicBlockOrderType::iterator order_iterator; typedef BasicBlockOrderType::const_iterator const_order_iterator; diff --git a/bolt/BinarySection.cpp b/bolt/BinarySection.cpp index 80c038e6ea0b..0e3df9962255 100644 --- a/bolt/BinarySection.cpp +++ b/bolt/BinarySection.cpp @@ -12,10 +12,18 @@ #include "BinarySection.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/Support/CommandLine.h" + +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt" using namespace llvm; using namespace bolt; +namespace opts { +extern cl::opt PrintRelocations; +} + Triple::ArchType Relocation::Arch; bool Relocation::isSupported(uint64_t Type) { @@ -324,3 +332,29 @@ void Relocation::print(raw_ostream &OS) const { OS << ", 0x" << Twine::utohexstr(Addend); OS << ", 0x" << Twine::utohexstr(Value); } + +BinarySection::~BinarySection() { + if (!isAllocatable() && + (!hasSectionRef() || + 
OutputContents.data() != getContents(Section).data())) { + delete[] getOutputData(); + } +} + +void BinarySection::print(raw_ostream &OS) const { + OS << getName() << ", " + << "0x" << Twine::utohexstr(getAddress()) << ", " + << getSize() + << " (0x" << Twine::utohexstr(getFileAddress()) << ", " + << getOutputSize() << ")" + << ", data = " << getData() + << ", output data = " << getOutputData(); + + if (isAllocatable()) + OS << " (allocatable)"; + + if (opts::PrintRelocations) { + for (auto &R : relocations()) + OS << "\n " << R; + } +} diff --git a/bolt/BinarySection.h b/bolt/BinarySection.h index 80a3072cf15e..fbd8ad9059f2 100644 --- a/bolt/BinarySection.h +++ b/bolt/BinarySection.h @@ -1,4 +1,4 @@ -//===--- BinarySection.h - Interface for object file section -------------===// +//===--- BinarySection.h - Interface for object file section --------------===// // // The LLVM Compiler Infrastructure // @@ -12,6 +12,7 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_SECTION_H #define LLVM_TOOLS_LLVM_BOLT_BINARY_SECTION_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Triple.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCSymbol.h" @@ -91,88 +92,338 @@ inline raw_ostream &operator<<(raw_ostream &OS, const Relocation &Rel) { return OS; } -/// A wrapper around SectionRef that also manages related relocations -class BinarySection { - SectionRef Section; - std::set Relocations; -public: - explicit BinarySection(SectionRef Section) : Section(Section) { } +inline uint8_t *copyByteArray(const uint8_t *Data, uint64_t Size) { + auto Array = new uint8_t[Size]; + memcpy(Array, Data, Size); + return Array; +} + +inline uint8_t *copyByteArray(StringRef Buffer) { + return copyByteArray(reinterpret_cast(Buffer.data()), + Buffer.size()); +} - StringRef getName() const { +inline uint8_t *copyByteArray(ArrayRef Buffer) { + return copyByteArray(reinterpret_cast(Buffer.data()), + Buffer.size()); +} + +/// A class to manage binary sections that also manages related relocations +class 
BinarySection { + friend class BinaryContext; + + const std::string Name; // Section name + const SectionRef Section; // SectionRef (may be null) + StringRef Contents; // input section contents + const uint64_t Address; // address of section in input binary (may be 0) + const uint64_t Size; // input section size + unsigned Alignment; // alignment in bytes (must be > 0) + unsigned ELFType; // ELF section type + unsigned ELFFlags; // ELF section flags + bool IsLocal; // Is this a local section? + + // Relocations associated with this section. Relocation offsets are + // wrt. to the original section address and size. + using RelocationSetType = std::set; + RelocationSetType Relocations; + + // Pending relocations for this section. For the moment, just used by + // the .debug_info section. TODO: it would be nice to get rid of this. + RelocationSetType PendingRelocations; + + // Output info + bool IsFinalized{false}; // Has this section had output information + // finalized? + uint64_t FileAddress{0}; // Section address for the rewritten binary. + uint64_t OutputSize{0}; // Section size in the rewritten binary. + uint64_t FileOffset{0}; // File offset in the rewritten binary file. + StringRef OutputContents; // Rewritten section contents. + unsigned SectionID{-1u}; // Unique ID used for address mapping. + // Set by ExecutableFileMemoryManager. 
+ + // non-copyable + BinarySection(const BinarySection &) = delete; + BinarySection(BinarySection &&) = delete; + BinarySection &operator=(const BinarySection &) = delete; + BinarySection &operator=(BinarySection &&) = delete; + + static StringRef getName(SectionRef Section) { StringRef Name; Section.getName(Name); return Name; } - uint64_t getAddress() const { return Section.getAddress(); } - uint64_t getEndAddress() const { return getAddress() + getSize(); } - uint64_t getSize() const { return Section.getSize(); } - uint64_t getAlignment() const { return Section.getAlignment(); } - bool containsAddress(uint64_t Address) const { - return getAddress() <= Address && Address < getEndAddress(); - } - bool containsRange(uint64_t Address, uint64_t Size) const { - return getAddress() <= Address && Address + Size <= getEndAddress(); - } - bool isReadOnly() const { return Section.isReadOnly(); } - bool isVirtual() const { return Section.isVirtual(); } - bool isText() const { return Section.isText(); } - bool isAllocatable() const { return getFlags() & ELF::SHF_ALLOC; } - StringRef getContents() const { + static StringRef getContents(SectionRef Section) { StringRef Contents; - if (auto EC = Section.getContents(Contents)) { - errs() << "BOLT-ERROR: cannot get section contents for " - << getName() << ": " << EC.message() << ".\n"; - exit(1); + if (ELFSectionRef(Section).getType() != ELF::SHT_NOBITS) { + if (auto EC = Section.getContents(Contents)) { + errs() << "BOLT-ERROR: cannot get section contents for " + << getName(Section) << ": " << EC.message() << ".\n"; + exit(1); + } } return Contents; } - unsigned getFlags() const { return ELFSectionRef(Section).getFlags(); } - unsigned getType() const { return ELFSectionRef(Section).getType(); } + + // Set output info for this section. 
+ void update(uint8_t *NewData, + uint64_t NewSize, + unsigned NewAlignment, + unsigned NewELFType, + unsigned NewELFFlags, + bool NewIsLocal) { + assert(NewAlignment > 0 && "section alignment must be > 0"); + OutputSize = NewSize; + Alignment = NewAlignment; + ELFType = NewELFType; + ELFFlags = NewELFFlags; + IsLocal = NewIsLocal || StringRef(Name).startswith(".local."); + OutputContents = StringRef(reinterpret_cast(NewData), + NewData ? NewSize : 0); + IsFinalized = true; + } +public: + explicit BinarySection(SectionRef Section, bool IsLocal = false) + : Name(getName(Section)), + Section(Section), + Contents(getContents(Section)), + Address(Section.getAddress()), + Size(Section.getSize()), + Alignment(Section.getAlignment()), + ELFType(ELFSectionRef(Section).getType()), + ELFFlags(ELFSectionRef(Section).getFlags()), + IsLocal(IsLocal || StringRef(Name).startswith(".local.")), + OutputSize(0) { + } + + // TODO: pass Data as StringRef/ArrayRef? use StringRef::copy method. + BinarySection(StringRef Name, + uint8_t *Data, + uint64_t Size, + unsigned Alignment, + unsigned ELFType, + unsigned ELFFlags, + bool IsLocal) + : Name(Name), + Contents(reinterpret_cast(Data), Data ? Size : 0), + Address(0), + Size(Size), + Alignment(Alignment), + ELFType(ELFType), + ELFFlags(ELFFlags), + IsLocal(IsLocal || Name.startswith(".local.")), + IsFinalized(true), + OutputSize(Size), + OutputContents(Contents) { + assert(Alignment > 0 && "section alignment must be > 0"); + } + + ~BinarySection(); + + /// Helper function to generate the proper ELF flags from section properties. 
+ static unsigned getFlags(bool IsReadOnly = true, + bool IsText = false, + bool IsAllocatable = false) { + unsigned Flags = 0; + if (IsAllocatable) + Flags |= ELF::SHF_ALLOC; + if (!IsReadOnly) + Flags |= ELF::SHF_WRITE; + if (IsText) + Flags |= ELF::SHF_EXECINSTR; + return Flags; + } + + operator bool() const { + return ELFType != ELF::SHT_NULL; + } + + bool operator==(const BinarySection &Other) const { + return (Name == Other.Name && + Address == Other.Address && + Size == Other.Size && + getData() == Other.getData() && + Alignment == Other.Alignment && + ELFType == Other.ELFType && + ELFFlags == Other.ELFFlags && + IsLocal == Other.IsLocal); + } + + bool operator!=(const BinarySection &Other) const { + return !operator==(Other); + } + + // Order sections by their immutable properties. + bool operator<(const BinarySection &Other) const { + return (getAddress() < Other.getAddress() || + (getAddress() == Other.getAddress() && + (getSize() < Other.getSize() || + (getSize() == Other.getSize() && + getName() < Other.getName())))); + } + + /// + /// Basic proprety access. 
+ /// + StringRef getName() const { return Name; } + uint64_t getAddress() const { return Address; } + uint64_t getEndAddress() const { return Address + Size; } + uint64_t getSize() const { return Size; } + uint64_t getAlignment() const { return Alignment; } + bool isText() const { + return (ELFFlags & ELF::SHF_EXECINSTR); + } + bool isData() const { + return (ELFType == ELF::SHT_PROGBITS && + (ELFFlags & (ELF::SHF_ALLOC | ELF::SHF_WRITE))); + } + bool isBSS() const { + return (ELFType == ELF::SHT_NOBITS && + (ELFFlags & (ELF::SHF_ALLOC | ELF::SHF_WRITE))); + } + bool isNote() const { return ELFType == ELF::SHT_NOTE; } + bool isStrTab() const { return ELFType == ELF::SHT_STRTAB; } + bool isSymTab() const { return ELFType == ELF::SHT_SYMTAB; } + bool isVirtual() const { return ELFType == ELF::SHT_NOBITS; } + bool isRela() const { return ELFType == ELF::SHT_RELA; } + bool isReadOnly() const { + return ((ELFFlags & ELF::SHF_ALLOC) && + !(ELFFlags & ELF::SHF_WRITE) && + ELFType == ELF::SHT_PROGBITS); + } + bool isAllocatable() const { + return (ELFFlags & ELF::SHF_ALLOC); + } + bool isLocal() const { return IsLocal; } + unsigned getELFType() const { return ELFType; } + unsigned getELFFlags() const { return ELFFlags; } + + uint8_t *getData() { + return reinterpret_cast(const_cast(getContents().data())); + } + const uint8_t *getData() const { + return reinterpret_cast(getContents().data()); + } + StringRef getContents() const { return Contents; } + bool hasSectionRef() const { return Section != SectionRef(); } SectionRef getSectionRef() const { return Section; } - iterator_range::iterator> relocations() { + /// Does this section contain the given /p Addr? + /// Note: this is in terms of the original mapped binary addresses. + bool containsAddress(uint64_t Addr) const { + return getAddress() <= Addr && Addr < getEndAddress(); + } + /// Does this section contain the range given by /p Addr and /p Sz? + /// Note: this is in terms of the original mapped binary addresses. 
+ bool containsRange(uint64_t Addr, uint64_t Sz) const { + return getAddress() <= Addr && Addr + Sz <= getEndAddress(); + } + + /// Iterate over all non-pending relocations for this section. + iterator_range relocations() { return make_range(Relocations.begin(), Relocations.end()); } - iterator_range::const_iterator> relocations() const { + /// Iterate over all non-pending relocations for this section. + iterator_range relocations() const { return make_range(Relocations.begin(), Relocations.end()); } + /// Does this section have any non-pending relocations? bool hasRelocations() const { return !Relocations.empty(); } - void removeRelocationAt(uint64_t Offset) { + /// Iterate over all pending relocations in this section. + iterator_range pendingRelocations() const { + return make_range(PendingRelocations.begin(), PendingRelocations.end()); + } + + /// Does this section have any pending relocations? + bool hasPendingRelocations() const { + return !PendingRelocations.empty(); + } + + /// Remove non-pending relocation with the given /p Offset. + bool removeRelocationAt(uint64_t Offset) { Relocation Key{Offset, 0, 0, 0, 0}; auto Itr = Relocations.find(Key); - if (Itr != Relocations.end()) + if (Itr != Relocations.end()) { Relocations.erase(Itr); + return true; + } + return false; } + /// Add a new relocation at the given /p Offset. Note: pending relocations + /// are only used by .debug_info and should eventually go away. 
void addRelocation(uint64_t Offset, MCSymbol *Symbol, uint64_t Type, uint64_t Addend, - uint64_t Value = 0) { - assert(Offset < getSize()); - Relocations.emplace(Relocation{Offset, Symbol, Type, Addend, Value}); + uint64_t Value = 0, + bool Pending = false) { + assert(Offset < getSize() && "offset not within section bounds"); + if (!Pending) { + Relocations.emplace(Relocation{Offset, Symbol, Type, Addend, Value}); + } else { + PendingRelocations.emplace(Relocation{Offset, Symbol, Type, Addend, Value}); + } } + /// Lookup the relocation (if any) at the given /p Offset. const Relocation *getRelocationAt(uint64_t Offset) const { Relocation Key{Offset, 0, 0, 0, 0}; auto Itr = Relocations.find(Key); return Itr != Relocations.end() ? &*Itr : nullptr; } - bool operator<(const BinarySection &Other) const { - return (getAddress() < Other.getAddress() || - (getAddress() == Other.getAddress() && - (getSize() < Other.getSize() || - (getSize() == Other.getSize() && - getName() < Other.getName())))); + /// + /// Property accessors related to output data. 
+ /// + + bool isFinalized() const { return IsFinalized; } + void setIsFinalized() { IsFinalized = true; } + uint64_t getOutputSize() const { return OutputSize; } + uint8_t *getOutputData() { + return reinterpret_cast(const_cast(getOutputContents().data())); + } + const uint8_t *getOutputData() const { + return reinterpret_cast(getOutputContents().data()); } + StringRef getOutputContents() const { return OutputContents; } + uint64_t getAllocAddress() const { + return reinterpret_cast(getOutputData()); + } + uint64_t getFileAddress() const { return FileAddress; } + uint64_t getFileOffset() const { return FileOffset; } + unsigned getSectionID() const { + assert(hasValidSectionID() && "trying to use uninitialized section id"); + return SectionID; + } + bool hasValidSectionID() const { + return SectionID != -1u; + } + + // mutation + void setFileAddress(uint64_t Address) { + FileAddress = Address; + } + void setFileOffset(uint64_t Offset) { + FileOffset = Offset; + } + void setSectionID(unsigned ID) { + assert(!hasValidSectionID() && "trying to set section id twice"); + SectionID = ID; + } + + void print(raw_ostream &OS) const; }; +inline raw_ostream &operator<<(raw_ostream &OS, const BinarySection &Section) { + Section.print(OS); + return OS; +} + } // namespace bolt } // namespace llvm diff --git a/bolt/DWARFRewriter.cpp b/bolt/DWARFRewriter.cpp index 67f31d6b2071..cb1b37b05c17 100644 --- a/bolt/DWARFRewriter.cpp +++ b/bolt/DWARFRewriter.cpp @@ -444,9 +444,19 @@ void RewriteInstance::updateLineTableOffsets() { Offset += Label->getOffset() - CurrentOffset; CurrentOffset = Label->getOffset(); - auto &SI = EFMM->NoteSectionInfo[".debug_info"]; - SI.PendingRelocs.emplace_back( - SectionInfo::Reloc{LTOffset, 4, 0, Offset}); + auto DbgInfoSection = BC->getUniqueSectionByName(".debug_info"); + assert(DbgInfoSection && ".debug_info section must exist"); + auto *Zero = BC->registerNameAtAddress("Zero", 0); + DbgInfoSection->addRelocation(LTOffset, + Zero, + ELF::R_X86_64_32, 
+ Offset, + 0, + /*Pending=*/true); + // Set .debug_info as finalized so it won't be skipped over when + // we process sections while writing out the new binary. This ensures + // that the pending relocations will be processed and not ignored. + DbgInfoSection->setIsFinalized(); DEBUG(dbgs() << "BOLT-DEBUG: CU " << CUIDLineTablePair.first << " has line table at " << Offset << "\n"); @@ -466,41 +476,20 @@ void RewriteInstance::finalizeDebugSections() { RangesSectionsWriter->writeArangesSection(Writer.get()); const auto &ARangesContents = OS.str(); - // Freed by ExecutableFileMemoryManager. - uint8_t *SectionData = new uint8_t[ARangesContents.size()]; - memcpy(SectionData, ARangesContents.data(), ARangesContents.size()); - EFMM->NoteSectionInfo[".debug_aranges"] = SectionInfo( - reinterpret_cast(SectionData), - ARangesContents.size(), - /*Alignment=*/0, - /*IsCode=*/false, - /*IsReadOnly=*/true, - /*IsLocal=*/false); + BC->registerOrUpdateNoteSection(".debug_aranges", + copyByteArray(ARangesContents), + ARangesContents.size()); } auto RangesSectionContents = RangesSectionsWriter->finalize(); - auto SectionSize = RangesSectionContents->size(); - uint8_t *SectionData = new uint8_t[SectionSize]; - memcpy(SectionData, RangesSectionContents->data(), SectionSize); - EFMM->NoteSectionInfo[".debug_ranges"] = SectionInfo( - reinterpret_cast(SectionData), - SectionSize, - /*Alignment=*/1, - /*IsCode=*/false, - /*IsReadOnly=*/true, - /*IsLocal=*/false); + BC->registerOrUpdateNoteSection(".debug_ranges", + copyByteArray(*RangesSectionContents), + RangesSectionContents->size()); auto LocationListSectionContents = LocationListWriter->finalize(); - SectionSize = LocationListSectionContents->size(); - SectionData = new uint8_t[SectionSize]; - memcpy(SectionData, LocationListSectionContents->data(), SectionSize); - EFMM->NoteSectionInfo[".debug_loc"] = SectionInfo( - reinterpret_cast(SectionData), - SectionSize, - /*Alignment=*/1, - /*IsCode=*/false, - /*IsReadOnly=*/true, - 
/*IsLocal=*/false); + BC->registerOrUpdateNoteSection(".debug_loc", + copyByteArray(*LocationListSectionContents), + LocationListSectionContents->size()); } void RewriteInstance::updateGdbIndexSection() { @@ -569,7 +558,7 @@ void RewriteInstance::updateGdbIndexSection() { size_t NewGdbIndexSize = GdbIndexContents.size() + Delta; // Free'd by ExecutableFileMemoryManager. - auto * const NewGdbIndexContents = new uint8_t[NewGdbIndexSize]; + auto *NewGdbIndexContents = new uint8_t[NewGdbIndexSize]; auto *Buffer = NewGdbIndexContents; write32le(Buffer, Version); @@ -606,11 +595,7 @@ void RewriteInstance::updateGdbIndexSection() { memcpy(Buffer, Data, TrailingSize); // Register the new section. - EFMM->NoteSectionInfo[".gdb_index"] = SectionInfo( - reinterpret_cast(NewGdbIndexContents), - NewGdbIndexSize, - /*Alignment=*/0, - /*IsCode=*/false, - /*IsReadOnly=*/true, - /*IsLocal=*/false); + BC->registerOrUpdateNoteSection(".gdb_index", + NewGdbIndexContents, + NewGdbIndexSize); } diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp index ebd823e6b3a1..8af59b379964 100644 --- a/bolt/RewriteInstance.cpp +++ b/bolt/RewriteInstance.cpp @@ -220,6 +220,13 @@ PrintDisasm("print-disasm", cl::Hidden, cl::cat(BoltCategory)); +static cl::opt +PrintSections("print-sections", + cl::desc("print all registered sections"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); + static cl::opt PrintLoopInfo("print-loops", cl::desc("print loop related information"), @@ -459,55 +466,44 @@ uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size, for (auto &OverwriteName : RewriteInstance::SectionsToOverwrite) { if (SectionName == OverwriteName) { uint8_t *DataCopy = new uint8_t[Size]; - DEBUG(dbgs() << "BOLT: note section " << SectionName << " with size " - << Size << ", alignment " << Alignment << " at 0x" - << Twine::utohexstr(reinterpret_cast(DataCopy)) - << '\n'); - NoteSectionInfo[SectionName] = - SectionInfo(reinterpret_cast(DataCopy), - Size, - Alignment, - 
/*IsCode=*/false, - /*IsReadOnly=*/true, - /*IsLocal=*/false, - 0, - 0, - SectionID); + auto &Section = BC.registerOrUpdateNoteSection(SectionName, + DataCopy, + Size, + Alignment); + Section.setSectionID(SectionID); + assert(!Section.isAllocatable() && "note sections cannot be allocatable"); return DataCopy; } } - uint8_t *ret; + uint8_t *Ret; if (IsCode) { - ret = SectionMemoryManager::allocateCodeSection(Size, Alignment, + Ret = SectionMemoryManager::allocateCodeSection(Size, Alignment, SectionID, SectionName); } else { - ret = SectionMemoryManager::allocateDataSection(Size, Alignment, + Ret = SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID, SectionName, IsReadOnly); } - bool IsLocal = false; - if (SectionName.startswith(".local.")) - IsLocal = true; - - DEBUG(dbgs() << "BOLT: allocating " << (IsLocal ? "local " : "") + const auto Flags = BinarySection::getFlags(IsReadOnly, IsCode, true); + auto &Section = BC.registerOrUpdateSection(SectionName, + ELF::SHT_PROGBITS, + Flags, + Ret, + Size, + Alignment); + Section.setSectionID(SectionID); + assert(Section.isAllocatable() && + "verify that allocatable is marked as allocatable"); + + DEBUG(dbgs() << "BOLT: allocating " << (Section.isLocal() ? "local " : "") << (IsCode ? "code" : (IsReadOnly ? "read-only data" : "data")) << " section : " << SectionName << " with size " << Size << ", alignment " << Alignment - << " at 0x" << ret << "\n"); - - SectionMapInfo[SectionName] = SectionInfo(reinterpret_cast(ret), - Size, - Alignment, - IsCode, - IsReadOnly, - IsLocal, - 0, - 0, - SectionID); - - return ret; + << " at 0x" << Ret << ", ID = " << SectionID << "\n"); + + return Ret; } /// Notifier for non-allocatable (note) section. 
@@ -522,21 +518,13 @@ uint8_t *ExecutableFileMemoryManager::recordNoteSection( << " with size " << Size << ", alignment " << Alignment << " at 0x" << Twine::utohexstr(reinterpret_cast(Data)) << '\n'); - // We need to make a copy of the section contents if we'll need it for - // a future reference. RuntimeDyld will not allocate the space forus. - uint8_t *DataCopy = new uint8_t[Size]; - memcpy(DataCopy, Data, Size); - NoteSectionInfo[SectionName] = - SectionInfo(reinterpret_cast(DataCopy), - Size, - Alignment, - /*IsCode=*/false, - /*IsReadOnly=*/true, - /*IsLocal=*/false, - 0, - 0, - SectionID); - return DataCopy; + auto &Section = BC.registerOrUpdateNoteSection(SectionName, + copyByteArray(Data, Size), + Size, + Alignment); + Section.setSectionID(SectionID); + assert(!Section.isAllocatable() && "note sections cannot be allocatable"); + return Section.getOutputData(); } bool ExecutableFileMemoryManager::finalizeMemory(std::string *ErrMsg) { @@ -544,11 +532,7 @@ bool ExecutableFileMemoryManager::finalizeMemory(std::string *ErrMsg) { return SectionMemoryManager::finalizeMemory(ErrMsg); } -ExecutableFileMemoryManager::~ExecutableFileMemoryManager() { - for (auto &SII : NoteSectionInfo) { - delete[] reinterpret_cast(SII.second.AllocAddress); - } -} +ExecutableFileMemoryManager::~ExecutableFileMemoryManager() { } namespace { @@ -717,8 +701,7 @@ void RewriteInstance::discoverStorage() { // sections accounting for stubs when we need those sections to match the // same size seen in the input binary, in case this section is a copy // of the original one seen in the binary. 
- EFMM.reset(new ExecutableFileMemoryManager( - /*AllowStubs*/ false)); + EFMM.reset(new ExecutableFileMemoryManager(*BC, /*AllowStubs*/ false)); auto ELF64LEFile = dyn_cast(InputFile); if (!ELF64LEFile) { @@ -1285,9 +1268,10 @@ void RewriteInstance::discoverFileObjects() { } BF->addAlternativeName(UniqueName); } else { - auto BS = BC->getSectionForAddress(Address); - assert(BS && "section for functions must be registered."); - BF = createBinaryFunction(UniqueName, *BS, Address, SymbolSize, IsSimple); + auto Section = BC->getSectionForAddress(Address); + assert(Section && "section for functions must be registered."); + BF = createBinaryFunction(UniqueName, *Section, Address, + SymbolSize, IsSimple); } if (!AlternativeName.empty()) BF->addAlternativeName(AlternativeName); @@ -1521,8 +1505,8 @@ void RewriteInstance::relocateEHFrameSection() { assert(EHFrameSection && "non-empty .eh_frame section expected"); DWARFDebugFrame EHFrame(true, EHFrameSection->getAddress()); - StringRef EHFrameSectionContents = EHFrameSection->getContents(); - DWARFDataExtractor DE(EHFrameSectionContents, BC->AsmInfo->isLittleEndian(), + DWARFDataExtractor DE(EHFrameSection->getContents(), + BC->AsmInfo->isLittleEndian(), BC->AsmInfo->getCodePointerSize()); auto createReloc = [&](uint64_t Value, uint64_t Offset, uint64_t DwarfType) { if (DwarfType == dwarf::DW_EH_PE_omit) @@ -1564,7 +1548,7 @@ void RewriteInstance::relocateEHFrameSection() { DEBUG(dbgs() << "BOLT-DEBUG: adding DWARF reference against symbol " << Symbol->getName() << '\n'); - BC->addSectionRelocation(*EHFrameSection, Offset, Symbol, RelType); + EHFrameSection->addRelocation(Offset, Symbol, RelType, 0); }; EHFrame.parse(DE, createReloc); @@ -1594,7 +1578,7 @@ void RewriteInstance::readSpecialSections() { check_error(Section.getName(SectionName), "cannot get section name"); StringRef SectionContents; ArrayRef SectionData; - if (!(ELFSectionRef(Section).getType() & ELF::SHT_NOBITS)) { + if (ELFSectionRef(Section).getType() != 
ELF::SHT_NOBITS) { check_error(Section.getContents(SectionContents), "cannot get section contents"); SectionData = ArrayRef( @@ -1611,11 +1595,19 @@ void RewriteInstance::readSpecialSections() { HasTextRelocations = true; } - BC->registerSection(Section); - DEBUG(dbgs() << "BOLT-DEBUG: registering section " << SectionName - << " @ 0x" << Twine::utohexstr(Section.getAddress()) << ":0x" - << Twine::utohexstr(Section.getAddress() + Section.getSize()) - << "\n"); + // Only register sections with names. + if (!getSectionName(Section).empty()) { + BC->registerSection(Section); + DEBUG(dbgs() << "BOLT-DEBUG: registering section " << SectionName + << " @ 0x" << Twine::utohexstr(Section.getAddress()) << ":0x" + << Twine::utohexstr(Section.getAddress() + Section.getSize()) + << "\n"); + } + } + + if (opts::PrintSections) { + outs() << "BOLT-INFO: Sections from original binary:\n"; + BC->printSections(outs()); } EHFrameSection = BC->getUniqueSectionByName(".eh_frame"); @@ -2549,19 +2541,18 @@ void RewriteInstance::mapFileSections( orc::RTDyldObjectLinkingLayer::ObjHandleT &ObjectsHandle) { NewTextSectionStartAddress = NextAvailableAddress; if (BC->HasRelocations) { - auto SMII = EFMM->SectionMapInfo.find(".text"); - assert(SMII != EFMM->SectionMapInfo.end() && - ".text not found in output"); - auto &SI = SMII->second; + auto TextSection = BC->getUniqueSectionByName(".text"); + assert(TextSection && ".text not found in output"); uint64_t NewTextSectionOffset = 0; - if (opts::UseOldText && SI.Size <= BC->OldTextSectionSize) { + if (opts::UseOldText && + TextSection->getOutputSize() <= BC->OldTextSectionSize) { outs() << "BOLT-INFO: using original .text for new code\n"; // Utilize the original .text for storage. 
NewTextSectionStartAddress = BC->OldTextSectionAddress; NewTextSectionOffset = BC->OldTextSectionOffset; auto Padding = OffsetToAlignment(NewTextSectionStartAddress, PageAlign); - if (Padding + SI.Size <= BC->OldTextSectionSize) { + if (Padding + TextSection->getOutputSize() <= BC->OldTextSectionSize) { outs() << "BOLT-INFO: using 0x200000 alignment\n"; NewTextSectionStartAddress += Padding; NewTextSectionOffset += Padding; @@ -2569,24 +2560,24 @@ void RewriteInstance::mapFileSections( } else { if (opts::UseOldText) { errs() << "BOLT-ERROR: original .text too small to fit the new code. " - << SI.Size << " bytes needed, have " << BC->OldTextSectionSize - << " bytes available.\n"; + << TextSection->getOutputSize() << " bytes needed, have " + << BC->OldTextSectionSize << " bytes available.\n"; } auto Padding = OffsetToAlignment(NewTextSectionStartAddress, PageAlign); NextAvailableAddress += Padding; NewTextSectionStartAddress = NextAvailableAddress; NewTextSectionOffset = getFileOffsetForAddress(NextAvailableAddress); - NextAvailableAddress += Padding + SI.Size; + NextAvailableAddress += Padding + TextSection->getOutputSize(); } - SI.FileAddress = NewTextSectionStartAddress; - SI.FileOffset = NewTextSectionOffset; + TextSection->setFileAddress(NewTextSectionStartAddress); + TextSection->setFileOffset(NewTextSectionOffset); DEBUG(dbgs() << "BOLT: mapping .text 0x" - << Twine::utohexstr(SMII->second.AllocAddress) + << Twine::utohexstr(TextSection->getAllocAddress()) << " to 0x" << Twine::utohexstr(NewTextSectionStartAddress) << '\n'); OLT->mapSectionAddress(ObjectsHandle, - SI.SectionID, + TextSection->getSectionID(), NewTextSectionStartAddress); } else { for (auto &BFI : BinaryFunctions) { @@ -2595,18 +2586,17 @@ void RewriteInstance::mapFileSections( continue; auto TooLarge = false; - auto SMII = EFMM->SectionMapInfo.find(Function.getCodeSectionName()); - assert(SMII != EFMM->SectionMapInfo.end() && - "cannot find section for function"); + auto FuncSection = 
BC->getUniqueSectionByName(Function.getCodeSectionName()); + assert(FuncSection && "cannot find section for function"); DEBUG(dbgs() << "BOLT: mapping 0x" - << Twine::utohexstr(SMII->second.AllocAddress) + << Twine::utohexstr(FuncSection->getAllocAddress()) << " to 0x" << Twine::utohexstr(Function.getAddress()) << '\n'); OLT->mapSectionAddress(ObjectsHandle, - SMII->second.SectionID, + FuncSection->getSectionID(), Function.getAddress()); - Function.setImageAddress(SMII->second.AllocAddress); - Function.setImageSize(SMII->second.Size); + Function.setImageAddress(FuncSection->getAllocAddress()); + Function.setImageSize(FuncSection->getOutputSize()); if (Function.getImageSize() > Function.getMaxSize()) { TooLarge = true; FailedAddresses.emplace_back(Function.getAddress()); @@ -2616,15 +2606,13 @@ void RewriteInstance::mapFileSections( if (opts::JumpTables == JTS_BASIC) { for (auto &JTI : Function.JumpTables) { auto &JT = JTI.second; - auto SMII = EFMM->SectionMapInfo.find(JT.SectionName); - assert(SMII != EFMM->SectionMapInfo.end() && - "cannot find section for jump table"); - JT.SecInfo = &SMII->second; - JT.SecInfo->FileAddress = JT.Address; + JT.Section = BC->getUniqueSectionByName(JT.SectionName); + assert(JT.Section && "cannot find section for jump table"); + JT.Section->setFileAddress(JT.Address); DEBUG(dbgs() << "BOLT-DEBUG: mapping " << JT.SectionName << " to 0x" << Twine::utohexstr(JT.Address) << '\n'); OLT->mapSectionAddress(ObjectsHandle, - JT.SecInfo->SectionID, + JT.Section->getSectionID(), JT.Address); } } @@ -2632,9 +2620,9 @@ void RewriteInstance::mapFileSections( if (!Function.isSplit()) continue; - SMII = EFMM->SectionMapInfo.find(Function.getColdCodeSectionName()); - assert(SMII != EFMM->SectionMapInfo.end() && - "cannot find section for cold part"); + auto ColdSection = + BC->getUniqueSectionByName(Function.getColdCodeSectionName()); + assert(ColdSection && "cannot find section for cold part"); // Cold fragments are aligned at 16 bytes. 
NextAvailableAddress = alignTo(NextAvailableAddress, 16); auto &ColdPart = Function.cold(); @@ -2646,8 +2634,8 @@ void RewriteInstance::mapFileSections( ColdPart.setFileOffset(0); } else { ColdPart.setAddress(NextAvailableAddress); - ColdPart.setImageAddress(SMII->second.AllocAddress); - ColdPart.setImageSize(SMII->second.Size); + ColdPart.setImageAddress(ColdSection->getAllocAddress()); + ColdPart.setImageSize(ColdSection->getOutputSize()); ColdPart.setFileOffset(getFileOffsetForAddress(NextAvailableAddress)); } @@ -2658,7 +2646,7 @@ void RewriteInstance::mapFileSections( << " with size " << Twine::utohexstr(ColdPart.getImageSize()) << '\n'); OLT->mapSectionAddress(ObjectsHandle, - SMII->second.SectionID, + ColdSection->getSectionID(), ColdPart.getAddress()); NextAvailableAddress += ColdPart.getImageSize(); @@ -2669,15 +2657,19 @@ void RewriteInstance::mapFileSections( // entry in section header table. auto NewTextSectionSize = NextAvailableAddress - NewTextSectionStartAddress; if (NewTextSectionSize) { - EFMM->SectionMapInfo[BOLTSecPrefix + ".text"] = - SectionInfo(0, - NewTextSectionSize, - 16, - true /*IsCode*/, - true /*IsReadOnly*/, - true /*IsLocal*/, - NewTextSectionStartAddress, - getFileOffsetForAddress(NewTextSectionStartAddress)); + const auto Flags = BinarySection::getFlags(/*IsReadOnly=*/true, + /*IsText=*/true, + /*IsAllocatable=*/true); + auto &Section = BC->registerOrUpdateSection(BOLTSecPrefix + ".text", + ELF::SHT_PROGBITS, + Flags, + nullptr, + NewTextSectionSize, + 16, + true /*IsLocal*/); + Section.setFileAddress(NewTextSectionStartAddress); + Section.setFileOffset( + getFileOffsetForAddress(NewTextSectionStartAddress)); } } @@ -2688,55 +2680,58 @@ void RewriteInstance::mapFileSections( ".gcc_except_table", ".rodata", ".rodata.cold" }; for (auto &SectionName : Sections) { - auto SMII = EFMM->SectionMapInfo.find(SectionName); - if (SMII == EFMM->SectionMapInfo.end()) + auto Section = BC->getUniqueSectionByName(SectionName); + if (!Section || 
!Section->isAllocatable() || !Section->isFinalized()) continue; - SectionInfo &SI = SMII->second; - NextAvailableAddress = alignTo(NextAvailableAddress, SI.Alignment); + NextAvailableAddress = alignTo(NextAvailableAddress, + Section->getAlignment()); DEBUG(dbgs() << "BOLT: mapping section " << SectionName << " (0x" - << Twine::utohexstr(SI.AllocAddress) + << Twine::utohexstr(Section->getAllocAddress()) << ") to 0x" << Twine::utohexstr(NextAvailableAddress) << '\n'); OLT->mapSectionAddress(ObjectsHandle, - SI.SectionID, + Section->getSectionID(), NextAvailableAddress); - SI.FileAddress = NextAvailableAddress; - SI.FileOffset = getFileOffsetForAddress(NextAvailableAddress); + Section->setFileAddress(NextAvailableAddress); + Section->setFileOffset(getFileOffsetForAddress(NextAvailableAddress)); - NextAvailableAddress += SI.Size; + NextAvailableAddress += Section->getOutputSize(); } // Handling for sections with relocations. for (const auto &Section : BC->sections()) { - if (!Section.hasRelocations()) + if (!Section || + !Section.hasRelocations() || + !Section.hasSectionRef()) continue; StringRef SectionName = Section.getName(); - auto SMII = EFMM->SectionMapInfo.find(OrgSecPrefix + - std::string(SectionName)); - if (SMII == EFMM->SectionMapInfo.end()) + auto OrgSection = + BC->getUniqueSectionByName(OrgSecPrefix + std::string(SectionName)); + if (!OrgSection || + !OrgSection->isAllocatable() || + !OrgSection->isFinalized()) continue; - SectionInfo &SI = SMII->second; - if (SI.FileAddress) { + if (OrgSection->getFileAddress()) { DEBUG(dbgs() << "BOLT-DEBUG: section " << SectionName << " is already mapped at 0x" - << Twine::utohexstr(SI.FileAddress) << '\n'); + << Twine::utohexstr(OrgSection->getFileAddress()) << '\n'); continue; } DEBUG(dbgs() << "BOLT: mapping original section " << SectionName << " (0x" - << Twine::utohexstr(SI.AllocAddress) + << Twine::utohexstr(OrgSection->getAllocAddress()) << ") to 0x" << Twine::utohexstr(Section.getAddress()) << '\n'); 
OLT->mapSectionAddress(ObjectsHandle, - SI.SectionID, + OrgSection->getSectionID(), Section.getAddress()); - SI.FileAddress = Section.getAddress(); - StringRef SectionContents = Section.getContents(); - SI.FileOffset = SectionContents.data() - InputFile->getData().data(); + OrgSection->setFileAddress(Section.getAddress()); + OrgSection->setFileOffset(Section.getContents().data() - + InputFile->getData().data()); } } @@ -2826,20 +2821,18 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { void RewriteInstance::emitDataSection(MCStreamer *Streamer, const BinarySection &Section, - std::string Name) { - StringRef SectionName = !Name.empty() ? StringRef(Name) : Section.getName(); - const auto SectionFlags = Section.getFlags(); - const auto SectionType = Section.getType(); + StringRef Name) { + StringRef SectionName = !Name.empty() ? Name : Section.getName(); StringRef SectionContents = Section.getContents(); auto *ELFSection = BC->Ctx->getELFSection(SectionName, - SectionType, - SectionFlags); + Section.getELFType(), + Section.getELFFlags()); Streamer->SwitchSection(ELFSection); Streamer->EmitValueToAlignment(Section.getAlignment()); DEBUG(dbgs() << "BOLT-DEBUG: emitting " - << (SectionFlags & ELF::SHF_ALLOC ? "" : "non-") + << (Section.isAllocatable() ? 
"" : "non-") << "allocatable data section " << SectionName << '\n'); if (!Section.hasRelocations()) { @@ -2872,7 +2865,7 @@ void RewriteInstance::emitDataSection(MCStreamer *Streamer, void RewriteInstance::emitDataSections(MCStreamer *Streamer) { for (const auto &Section : BC->sections()) { - if (!Section.hasRelocations()) + if (!Section || !Section.hasRelocations() || !Section.hasSectionRef()) continue; StringRef SectionName = Section.getName(); @@ -2943,14 +2936,15 @@ void RewriteInstance::patchELFPHDRTable() { NewPhdr.p_filesz = sizeof(NewPhdr) * Phnum; NewPhdr.p_memsz = sizeof(NewPhdr) * Phnum; } else if (Phdr.p_type == ELF::PT_GNU_EH_FRAME) { - auto SMII = EFMM->SectionMapInfo.find(".eh_frame_hdr"); - if (SMII != EFMM->SectionMapInfo.end()) { - auto &EHFrameHdrSecInfo = SMII->second; - NewPhdr.p_offset = EHFrameHdrSecInfo.FileOffset; - NewPhdr.p_vaddr = EHFrameHdrSecInfo.FileAddress; - NewPhdr.p_paddr = EHFrameHdrSecInfo.FileAddress; - NewPhdr.p_filesz = EHFrameHdrSecInfo.Size; - NewPhdr.p_memsz = EHFrameHdrSecInfo.Size; + auto EHFrameHdrSec = BC->getUniqueSectionByName(".eh_frame_hdr"); + if (EHFrameHdrSec && + EHFrameHdrSec->isAllocatable() && + EHFrameHdrSec->isFinalized()) { + NewPhdr.p_offset = EHFrameHdrSec->getFileOffset(); + NewPhdr.p_vaddr = EHFrameHdrSec->getFileAddress(); + NewPhdr.p_paddr = EHFrameHdrSec->getFileAddress(); + NewPhdr.p_filesz = EHFrameHdrSec->getOutputSize(); + NewPhdr.p_memsz = EHFrameHdrSec->getOutputSize(); } } else if (opts::UseGnuStack && Phdr.p_type == ELF::PT_GNU_STACK) { NewPhdr.p_type = ELF::PT_LOAD; @@ -3053,75 +3047,79 @@ void RewriteInstance::rewriteNoteSections() { Size = appendPadding(OS, Size, Section.sh_addralign); } - // Address of extension to the section. - uint64_t Address{0}; - // Perform section post-processing. 
- auto SII = EFMM->NoteSectionInfo.find(SectionName); - if (SII != EFMM->NoteSectionInfo.end()) { - auto &SI = SII->second; - assert(SI.Alignment <= Section.sh_addralign && + auto BSec = BC->getUniqueSectionByName(SectionName); + uint8_t *SectionData = nullptr; + if (BSec && !BSec->isAllocatable()) { + assert(BSec->getAlignment() <= Section.sh_addralign && "alignment exceeds value in file"); - // Write section extension. - Address = SI.AllocAddress; - if (Address) { + if (BSec->getAllocAddress()) { + SectionData = BSec->getOutputData(); DEBUG(dbgs() << "BOLT-DEBUG: " << (Size ? "appending" : "writing") << " contents to section " << SectionName << '\n'); - OS.write(reinterpret_cast(Address), SI.Size); - Size += SI.Size; + OS.write(reinterpret_cast(SectionData), + BSec->getOutputSize()); + Size += BSec->getOutputSize(); } - if (!SI.PendingRelocs.empty()) { + if (BSec->hasPendingRelocations()) { DEBUG(dbgs() << "BOLT-DEBUG: processing relocs for section " << SectionName << '\n'); - for (auto &Reloc : SI.PendingRelocs) { - DEBUG(dbgs() << "BOLT-DEBUG: writing value " - << Twine::utohexstr(Reloc.Value) - << " of size " << (unsigned)Reloc.Size - << " at offset " + for (auto &Reloc : BSec->pendingRelocations()) { + DEBUG(dbgs() << "BOLT-DEBUG: writing value 0x" + << Twine::utohexstr(Reloc.Addend) + << " of size " << Relocation::getSizeForType(Reloc.Type) + << " at offset 0x" << Twine::utohexstr(Reloc.Offset) << '\n'); - assert(Reloc.Size == 4 && - "only relocations of size 4 are supported at the moment"); - OS.pwrite(reinterpret_cast(&Reloc.Value), - Reloc.Size, + assert(Reloc.Type == ELF::R_X86_64_32 && + "only R_X86_64_32 relocations are supported at the moment"); + uint32_t Value = Reloc.Addend; + OS.pwrite(reinterpret_cast(&Value), + Relocation::getSizeForType(Reloc.Type), NextAvailableOffset + Reloc.Offset); } } } // Set/modify section info. 
- EFMM->NoteSectionInfo[SectionName] = - SectionInfo(Address, - Size, - Section.sh_addralign, - /*IsCode=*/false, - /*IsReadOnly=*/false, - /*IsLocal=*/false, - /*FileAddress=*/0, - NextAvailableOffset); + auto &NewSection = + BC->registerOrUpdateNoteSection(SectionName, + SectionData, + Size, + Section.sh_addralign, + BSec ? BSec->isReadOnly() : false, + BSec ? BSec->getELFType() + : ELF::SHT_PROGBITS, + BSec ? BSec->isLocal() : false); + NewSection.setFileAddress(0); + NewSection.setFileOffset(NextAvailableOffset); NextAvailableOffset += Size; } // Write new note sections. - for (auto &SII : EFMM->NoteSectionInfo) { - auto &SI = SII.second; - if (SI.FileOffset || !SI.AllocAddress) + for (auto &Section : BC->sections()) { + if (!Section || + Section.getFileOffset() || + !Section.getAllocAddress() || + Section.isAllocatable()) continue; - assert(SI.PendingRelocs.empty() && "cannot have pending relocs"); + assert(!Section.hasPendingRelocations() && "cannot have pending relocs"); - NextAvailableOffset = appendPadding(OS, NextAvailableOffset, SI.Alignment); - SI.FileOffset = NextAvailableOffset; + NextAvailableOffset = appendPadding(OS, NextAvailableOffset, + Section.getAlignment()); + Section.setFileOffset(NextAvailableOffset); - DEBUG(dbgs() << "BOLT-DEBUG: writing out new section " << SII.first - << " of size " << SI.Size << " at offset 0x" - << Twine::utohexstr(SI.FileOffset) << '\n'); + DEBUG(dbgs() << "BOLT-DEBUG: writing out new section " + << Section.getName() << " of size " << Section.getOutputSize() + << " at offset 0x" << Twine::utohexstr(Section.getFileOffset()) + << '\n'); - OS.write(reinterpret_cast(SI.AllocAddress), SI.Size); - NextAvailableOffset += SI.Size; + OS.write(Section.getOutputContents().data(), Section.getOutputSize()); + NextAvailableOffset += Section.getOutputSize(); } } @@ -3140,11 +3138,10 @@ void RewriteInstance::finalizeSectionStringTable(ELFObjectFile *File) { SHStrTab.add(*AllSHStrTabStrings.back()); } } - for (auto &SMII : 
EFMM->SectionMapInfo) { - SHStrTab.add(SMII.first); - } - for (auto &SMII : EFMM->NoteSectionInfo) { - SHStrTab.add(SMII.first); + for (auto &Section : BC->sections()) { + if (Section) { + SHStrTab.add(Section.getName()); + } } SHStrTab.finalize(); @@ -3152,14 +3149,12 @@ void RewriteInstance::finalizeSectionStringTable(ELFObjectFile *File) { uint8_t *DataCopy = new uint8_t[SHStrTabSize]; memset(DataCopy, 0, SHStrTabSize); SHStrTab.write(DataCopy); - EFMM->NoteSectionInfo[".shstrtab"] = - SectionInfo(reinterpret_cast(DataCopy), - SHStrTabSize, - /*Alignment*/1, - /*IsCode=*/false, - /*IsReadOnly=*/false, - /*IsLocal=*/false); - EFMM->NoteSectionInfo[".shstrtab"].IsStrTab = true; + BC->registerOrUpdateNoteSection(".shstrtab", + DataCopy, + SHStrTabSize, + /*Alignment=*/1, + /*IsReadOnly=*/true, + ELF::SHT_STRTAB); } void RewriteInstance::addBoltInfoSection() { @@ -3194,16 +3189,12 @@ void RewriteInstance::addBoltInfoSection() { } const auto BoltInfo = OS.str(); - const auto SectionSize = BoltInfo.size(); - uint8_t *SectionData = new uint8_t[SectionSize]; - memcpy(SectionData, BoltInfo.data(), SectionSize); - EFMM->NoteSectionInfo[".note.bolt_info"] = - SectionInfo(reinterpret_cast(SectionData), SectionSize, - /*Alignment=*/1, - /*IsCode=*/false, - /*IsReadOnly=*/true, - /*IsLocal=*/false, 0, 0, 0, - /*IsELFNote=*/true); + BC->registerOrUpdateNoteSection(".note.bolt_info", + copyByteArray(BoltInfo), + BoltInfo.size(), + /*Alignment=*/1, + /*IsReadOnly=*/true, + ELF::SHT_NOTE); } } @@ -3275,20 +3266,21 @@ RewriteInstance::getOutputSections(ELFObjectFile *File, } // If we are creating our own .text section, it should be the first section - // we created in EFMM->SectionMapInfo, so this is the correct index. + // we created in BinaryContext, so this is the correct index. if (!opts::UseOldText) { NewTextSectionIndex = CurIndex; } // Process entries for all new allocatable sections. 
- for (auto &SMII : EFMM->SectionMapInfo) { - const auto &SectionName = SMII.first; - const auto &SI = SMII.second; + for (auto &Section : BC->sections()) { + if (!Section || !Section.isAllocatable() || !Section.isFinalized()) + continue; + // Ignore function sections. - if (SI.FileAddress < NewTextSegmentAddress) { + if (Section.getFileAddress() < NewTextSegmentAddress) { if (opts::Verbosity) outs() << "BOLT-INFO: not writing section header for existing section " - << SMII.first << '\n'; + << Section.getName() << '\n'; continue; } @@ -3298,18 +3290,19 @@ RewriteInstance::getOutputSections(ELFObjectFile *File, continue; if (opts::Verbosity >= 1) - outs() << "BOLT-INFO: writing section header for " << SectionName << '\n'; + outs() << "BOLT-INFO: writing section header for " + << Section.getName() << '\n'; ELFShdrTy NewSection; - NewSection.sh_name = SHStrTab.getOffset(SectionName); + NewSection.sh_name = SHStrTab.getOffset(Section.getName()); NewSection.sh_type = ELF::SHT_PROGBITS; - NewSection.sh_addr = SI.FileAddress; - NewSection.sh_offset = SI.FileOffset; - NewSection.sh_size = SI.Size; + NewSection.sh_addr = Section.getFileAddress(); + NewSection.sh_offset = Section.getFileOffset(); + NewSection.sh_size = Section.getOutputSize(); NewSection.sh_entsize = 0; - NewSection.sh_flags = ELF::SHF_ALLOC | ELF::SHF_EXECINSTR; + NewSection.sh_flags = Section.getELFFlags(); NewSection.sh_link = 0; NewSection.sh_info = 0; - NewSection.sh_addralign = SI.Alignment; + NewSection.sh_addralign = Section.getAlignment(); OutputSections->emplace_back(NewSection); } @@ -3336,19 +3329,17 @@ RewriteInstance::getOutputSections(ELFObjectFile *File, StringRef SectionName = cantFail(Obj->getSectionName(&Section), "cannot get section name"); - auto SII = EFMM->NoteSectionInfo.find(SectionName); - assert(SII != EFMM->NoteSectionInfo.end() && - "missing section info for non-allocatable section"); + auto BSec = BC->getUniqueSectionByName(SectionName); + assert(BSec && "missing section info 
for non-allocatable section"); - const auto &SI = SII->second; auto NewSection = Section; - NewSection.sh_offset = SI.FileOffset; - NewSection.sh_size = SI.Size; + NewSection.sh_offset = BSec->getFileOffset(); + NewSection.sh_size = BSec->getOutputSize(); NewSection.sh_name = SHStrTab.getOffset(SectionName); OutputSections->emplace_back(NewSection); - LastFileOffset = SI.FileOffset; + LastFileOffset = BSec->getFileOffset(); } // Map input -> output is ready. Early return if that's all we need. @@ -3356,28 +3347,27 @@ RewriteInstance::getOutputSections(ELFObjectFile *File, return NewSectionIndex; // Create entries for new non-allocatable sections. - for (auto &SII : EFMM->NoteSectionInfo) { - const auto &SectionName = SII.first; - const auto &SI = SII.second; - - if (SI.FileOffset <= LastFileOffset) + for (auto &Section : BC->sections()) { + if (!Section || + Section.isAllocatable() || + Section.getFileOffset() <= LastFileOffset) continue; - if (opts::Verbosity >= 1) - outs() << "BOLT-INFO: writing section header for " << SectionName << '\n'; + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: writing section header for " + << Section.getName() << '\n'; + } ELFShdrTy NewSection; - NewSection.sh_name = SHStrTab.getOffset(SectionName); - NewSection.sh_type = - (SI.IsStrTab ? ELF::SHT_STRTAB - : SI.IsELFNote ? ELF::SHT_NOTE : ELF::SHT_PROGBITS); + NewSection.sh_name = SHStrTab.getOffset(Section.getName()); + NewSection.sh_type = Section.getELFType(); NewSection.sh_addr = 0; - NewSection.sh_offset = SI.FileOffset; - NewSection.sh_size = SI.Size; + NewSection.sh_offset = Section.getFileOffset(); + NewSection.sh_size = Section.getOutputSize(); NewSection.sh_entsize = 0; - NewSection.sh_flags = 0; + NewSection.sh_flags = Section.getELFFlags(); NewSection.sh_link = 0; NewSection.sh_info = 0; - NewSection.sh_addralign = SI.Alignment ? 
SI.Alignment : 1; + NewSection.sh_addralign = Section.getAlignment(); OutputSections->emplace_back(NewSection); } @@ -3660,23 +3650,19 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { return Idx; }); - uint8_t *DataCopy = new uint8_t[NewContents.size()]; - memcpy(DataCopy, NewContents.data(), NewContents.size()); - EFMM->NoteSectionInfo[SecName] = - SectionInfo(reinterpret_cast(DataCopy), NewContents.size(), - /*Alignment*/ 1, - /*IsCode=*/false, - /*IsReadOnly=*/false, - /*IsLocal=*/false); - DataCopy = new uint8_t[NewStrTab.size()]; - memcpy(DataCopy, NewStrTab.data(), NewStrTab.size()); - EFMM->NoteSectionInfo[StrSecName] = - SectionInfo(reinterpret_cast(DataCopy), NewStrTab.size(), - /*Alignment*/ 1, - /*IsCode=*/false, - /*IsReadOnly=*/false, - /*IsLocal=*/false); - EFMM->NoteSectionInfo[StrSecName].IsStrTab = true; + BC->registerOrUpdateNoteSection(SecName, + copyByteArray(NewContents), + NewContents.size(), + /*Alignment=*/1, + /*IsReadOnly=*/true, + ELF::SHT_SYMTAB); + + BC->registerOrUpdateNoteSection(StrSecName, + copyByteArray(NewStrTab), + NewStrTab.size(), + /*Alignment=*/1, + /*IsReadOnly=*/true, + ELF::SHT_STRTAB); } template @@ -3892,13 +3878,12 @@ void RewriteInstance::rewriteFile() { if (opts::JumpTables == JTS_BASIC) { for (auto &JTI : Function.JumpTables) { auto &JT = JTI.second; - assert(JT.SecInfo && "section info for jump table expected"); - JT.SecInfo->FileOffset = - getFileOffsetForAddress(JT.Address); - assert(JT.SecInfo->FileOffset && "no matching offset in file"); - Out->os().pwrite(reinterpret_cast(JT.SecInfo->AllocAddress), - JT.SecInfo->Size, - JT.SecInfo->FileOffset); + assert(JT.Section && "section for jump table expected"); + JT.Section->setFileOffset(getFileOffsetForAddress(JT.Address)); + assert(JT.Section->getFileOffset() && "no matching offset in file"); + OS.pwrite(reinterpret_cast(JT.Section->getOutputData()), + JT.Section->getOutputSize(), + JT.Section->getFileOffset()); } } @@ -3959,27 +3944,26 @@ void 
RewriteInstance::rewriteFile() { } // Write all non-local sections, i.e. those not emitted with the function. - for (auto &SMII : EFMM->SectionMapInfo) { - SectionInfo &SI = SMII.second; - if (SI.IsLocal) + for (auto &Section : BC->sections()) { + if (!Section || + !Section.isAllocatable() || + !Section.isFinalized() || + Section.isLocal()) continue; if (opts::Verbosity >= 1) { - outs() << "BOLT: writing new section " << SMII.first << '\n'; - outs() << " data at 0x" << Twine::utohexstr(SI.AllocAddress) << '\n'; - outs() << " of size " << SI.Size << '\n'; - outs() << " at offset " << SI.FileOffset << '\n'; + outs() << "BOLT: writing new section " << Section.getName() << '\n'; + outs() << " data at 0x" << Twine::utohexstr(Section.getAllocAddress()) << '\n'; + outs() << " of size " << Section.getOutputSize() << '\n'; + outs() << " at offset " << Section.getFileOffset() << '\n'; } - OS.pwrite(reinterpret_cast(SI.AllocAddress), - SI.Size, - SI.FileOffset); - assert(SI.AllocAddress && - "writing section that was not assigned an address"); + OS.pwrite(reinterpret_cast(Section.getOutputData()), + Section.getOutputSize(), + Section.getFileOffset()); } // If .eh_frame is present create .eh_frame_hdr. - auto SMII = EFMM->SectionMapInfo.find(".eh_frame"); - if (SMII != EFMM->SectionMapInfo.end()) { - writeEHFrameHeader(SMII->second); + if (EHFrameSection && EHFrameSection->isFinalized()) { + writeEHFrameHeader(); } // Patch program header table. @@ -4006,6 +3990,11 @@ void RewriteInstance::rewriteFile() { // Update ELF book-keeping info. 
patchELFSectionHeaderTable(); + if (opts::PrintSections) { + outs() << "BOLT-INFO: Sections after processing:\n"; + BC->printSections(outs()); + } + Out->keep(); // If requested, open again the binary we just wrote to dump its EH Frame @@ -4025,54 +4014,69 @@ void RewriteInstance::rewriteFile() { } } -void RewriteInstance::writeEHFrameHeader(SectionInfo &EHFrameSecInfo) { - DWARFDebugFrame NewEHFrame(true, EHFrameSecInfo.FileAddress); - NewEHFrame.parse(DWARFDataExtractor( - StringRef(reinterpret_cast(EHFrameSecInfo.AllocAddress), - EHFrameSecInfo.Size), - BC->AsmInfo->isLittleEndian(), BC->AsmInfo->getCodePointerSize())); - - auto OldSMII = EFMM->SectionMapInfo.find(".eh_frame_old"); - assert(OldSMII != EFMM->SectionMapInfo.end() && - "expected .eh_frame_old to be present"); - auto &OldEHFrameSecInfo = OldSMII->second; - DWARFDebugFrame OldEHFrame(true, OldEHFrameSecInfo.FileAddress); - OldEHFrame.parse(DWARFDataExtractor( - StringRef(reinterpret_cast(OldEHFrameSecInfo.AllocAddress), - OldEHFrameSecInfo.Size), - BC->AsmInfo->isLittleEndian(), BC->AsmInfo->getCodePointerSize())); +void RewriteInstance::writeEHFrameHeader() { + DWARFDebugFrame NewEHFrame(true, EHFrameSection->getFileAddress()); + NewEHFrame.parse(DWARFDataExtractor(EHFrameSection->getOutputContents(), + BC->AsmInfo->isLittleEndian(), + BC->AsmInfo->getCodePointerSize())); + + auto OldEHFrameSection = BC->getUniqueSectionByName(".eh_frame_old"); + assert(OldEHFrameSection && "expected .eh_frame_old to be present"); + DWARFDebugFrame OldEHFrame(true, OldEHFrameSection->getFileAddress()); + OldEHFrame.parse(DWARFDataExtractor(OldEHFrameSection->getOutputContents(), + BC->AsmInfo->isLittleEndian(), + BC->AsmInfo->getCodePointerSize())); DEBUG(dbgs() << "BOLT: writing a new .eh_frame_hdr\n"); NextAvailableAddress = appendPadding(Out->os(), NextAvailableAddress, EHFrameHdrAlign); - SectionInfo EHFrameHdrSecInfo; - EHFrameHdrSecInfo.FileAddress = NextAvailableAddress; - EHFrameHdrSecInfo.FileOffset = 
getFileOffsetForAddress(NextAvailableAddress); + const auto EHFrameHdrFileAddress = NextAvailableAddress; + const auto EHFrameHdrFileOffset = + getFileOffsetForAddress(NextAvailableAddress); auto NewEHFrameHdr = CFIRdWrt->generateEHFrameHeader(OldEHFrame, NewEHFrame, - EHFrameHdrSecInfo.FileAddress, + EHFrameHdrFileAddress, FailedAddresses); - EHFrameHdrSecInfo.Size = NewEHFrameHdr.size(); - - assert(Out->os().tell() == EHFrameHdrSecInfo.FileOffset && - "offset mismatch"); - Out->os().write(NewEHFrameHdr.data(), EHFrameHdrSecInfo.Size); + assert(Out->os().tell() == EHFrameHdrFileOffset && "offset mismatch"); + Out->os().write(NewEHFrameHdr.data(), NewEHFrameHdr.size()); - EFMM->SectionMapInfo[".eh_frame_hdr"] = EHFrameHdrSecInfo; + const auto Flags = BinarySection::getFlags(/*IsReadOnly=*/true, + /*IsText=*/false, + /*IsAllocatable=*/true); + auto &EHFrameHdrSec = BC->registerOrUpdateSection(".eh_frame_hdr", + ELF::SHT_PROGBITS, + Flags, + nullptr, + NewEHFrameHdr.size(), + /*Alignment=*/1); + EHFrameHdrSec.setFileOffset(EHFrameHdrFileOffset); + EHFrameHdrSec.setFileAddress(EHFrameHdrFileAddress); - NextAvailableAddress += EHFrameHdrSecInfo.Size; + NextAvailableAddress += EHFrameHdrSec.getOutputSize(); // Merge .eh_frame and .eh_frame_old so that gdb can locate all FDEs. 
- EHFrameSecInfo.Size = OldEHFrameSecInfo.FileAddress + OldEHFrameSecInfo.Size - - EHFrameSecInfo.FileAddress; - EFMM->SectionMapInfo.erase(OldSMII); + const auto EHFrameSectionSize = (OldEHFrameSection->getFileAddress() + + OldEHFrameSection->getOutputSize() - + EHFrameSection->getFileAddress()); + + EHFrameSection = + BC->registerOrUpdateSection(".eh_frame", + EHFrameSection->getELFType(), + EHFrameSection->getELFFlags(), + EHFrameSection->getOutputData(), + EHFrameSectionSize, + EHFrameSection->getAlignment(), + EHFrameSection->isLocal()); + + BC->deregisterSection(*OldEHFrameSection); + DEBUG(dbgs() << "BOLT-DEBUG: size of .eh_frame after merge is " - << EHFrameSecInfo.Size << '\n'); + << EHFrameSection->getOutputSize() << '\n'); } uint64_t RewriteInstance::getFileOffsetForAddress(uint64_t Address) const { @@ -4100,11 +4104,8 @@ bool RewriteInstance::willOverwriteSection(StringRef SectionName) { return true; } - auto SMII = EFMM->SectionMapInfo.find(SectionName); - if (SMII != EFMM->SectionMapInfo.end()) - return true; - - return false; + auto Section = BC->getUniqueSectionByName(SectionName); + return Section && Section->isAllocatable() && Section->isFinalized(); } BinaryFunction * diff --git a/bolt/RewriteInstance.h b/bolt/RewriteInstance.h index 80107f223982..f7fc8c9a0530 100644 --- a/bolt/RewriteInstance.h +++ b/bolt/RewriteInstance.h @@ -39,43 +39,6 @@ class CFIReaderWriter; class DataAggregator; class DataReader; -/// Section information for mapping and re-writing. -struct SectionInfo { - uint64_t AllocAddress{0}; /// Current location of the section in memory. - uint64_t Size{0}; /// Section size. - unsigned Alignment{0}; /// Alignment of the section. - bool IsCode{false}; /// Does this section contain code? - bool IsReadOnly{false}; /// Is the section read-only? - bool IsLocal{false}; /// Is this section local to a function, and - /// should only be emitted with the function? - bool IsStrTab{false}; /// Is this a string table section. 
- uint64_t FileAddress{0}; /// Address for the output file (final address). - uint64_t FileOffset{0}; /// Offset in the output file. - unsigned SectionID{0}; /// Unique ID used for address mapping. - bool IsELFNote{false}; /// Is ELF note section? - - struct Reloc { - uint32_t Offset; - uint8_t Size; - uint8_t Type; // unused atm - uint32_t Value; - }; - - /// Pending relocations for the section. - std::vector PendingRelocs; - - SectionInfo(uint64_t Address, uint64_t Size, unsigned Alignment, bool IsCode, - bool IsReadOnly, bool IsLocal, uint64_t FileAddress = 0, - uint64_t FileOffset = 0, unsigned SectionID = 0, - bool IsELFNote = false) - - : AllocAddress(Address), Size(Size), Alignment(Alignment), IsCode(IsCode), - IsReadOnly(IsReadOnly), IsLocal(IsLocal), FileAddress(FileAddress), - FileOffset(FileOffset), SectionID(SectionID), IsELFNote(IsELFNote) {} - - SectionInfo() {} -}; - struct SegmentInfo { uint64_t Address; /// Address of the segment in memory. uint64_t Size; /// Size of the segment in memory. @@ -105,20 +68,15 @@ class ExecutableFileMemoryManager : public SectionMemoryManager { StringRef SectionName, bool IsCode, bool IsReadOnly); - + BinaryContext &BC; bool AllowStubs; public: /// [start memory address] -> [segment info] mapping. std::map SegmentMapInfo; - /// Keep [section name] -> [section info] map for later remapping. - std::map SectionMapInfo; - - /// Information about non-allocatable sections. - std::map NoteSectionInfo; - - ExecutableFileMemoryManager(bool AllowStubs) : AllowStubs(AllowStubs) {} + ExecutableFileMemoryManager(BinaryContext &BC, bool AllowStubs) + : BC(BC), AllowStubs(AllowStubs) {} ~ExecutableFileMemoryManager(); @@ -202,7 +160,7 @@ class RewriteInstance { /// non-empty. void emitDataSection(MCStreamer *Streamer, const BinarySection &Section, - std::string Name = ""); + StringRef Name = StringRef()); /// Emit data sections that have code references in them. 
void emitDataSections(MCStreamer *Streamer); @@ -312,7 +270,7 @@ class RewriteInstance { void rewriteNoteSections(); /// Write .eh_frame_hdr. - void writeEHFrameHeader(SectionInfo &EHFrameSecInfo); + void writeEHFrameHeader(); /// Disassemble and create function entries for PLT. void disassemblePLT(); From c9a0d15267f7b952fb1daf1455751546f37f651c Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 26 Feb 2018 20:09:14 -0800 Subject: [PATCH 385/904] [BOLT/LSDA] Fix alignment Summary: Fix a bug introduced by rebasing with respect to aligned ULEBs. This wasn't breaking anything but it is good to keep LDSA aligned. (cherry picked from commit 37ffbc27339a896553f783ce793f056618ba517f) --- bolt/Exceptions.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bolt/Exceptions.cpp b/bolt/Exceptions.cpp index 358641baf9a0..bc13810ee6c4 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/Exceptions.cpp @@ -586,7 +586,8 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer, bool EmitColdPart) { // Account for any extra padding that will be added to the call site table // length. - Streamer->EmitPaddedULEB128IntValue(TTypeBaseOffset, SizeAlign); + Streamer->EmitPaddedULEB128IntValue(TTypeBaseOffset, + TTypeBaseOffsetSize + SizeAlign); // Emit the landing pad call site table. We use signed data4 since we can emit // a landing pad in a different part of the split function that could appear From 00e9c1da0b1df638f6ba3e32f316f6c0d78f9587 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Thu, 14 Dec 2017 17:26:19 -0800 Subject: [PATCH 386/904] [BOLT] Fix ShrinkWrapping bugs and enable testing Summary: Fix a few ShrinkWrapping bugs: - Using push-pop mode in a function that required aligned stack - Correctly update the edges in jump tables after splitting critical edges - Fix stack pointer restores based on RBP + offset, when we change the stack layout in push-pop mode. 
(cherry picked from commit 1c95638e43c99e9cf8e2b01e7346b24d49b652bc) --- bolt/Passes/AllocCombiner.cpp | 3 +- bolt/Passes/AllocCombiner.h | 6 ++- bolt/Passes/FrameAnalysis.cpp | 42 ++++++++-------- bolt/Passes/FrameOptimizer.cpp | 3 +- bolt/Passes/FrameOptimizer.h | 6 +++ bolt/Passes/ShrinkWrapping.cpp | 92 +++++++++++++++++++++++++++++++--- bolt/Passes/ShrinkWrapping.h | 7 ++- 7 files changed, 128 insertions(+), 31 deletions(-) diff --git a/bolt/Passes/AllocCombiner.cpp b/bolt/Passes/AllocCombiner.cpp index 6d9c82732012..0a1208a7fba5 100644 --- a/bolt/Passes/AllocCombiner.cpp +++ b/bolt/Passes/AllocCombiner.cpp @@ -94,6 +94,7 @@ void AllocCombinerPass::combineAdjustments(BinaryContext &BC, BB.eraseInstruction(Prev); ++NumCombined; + FuncsChanged.insert(&BF); Prev = &Inst; } } @@ -108,7 +109,7 @@ void AllocCombinerPass::runOnFunctions(BinaryContext &BC, runForAllWeCare( BFs, [&](BinaryFunction &Function) { combineAdjustments(BC, Function); }); - outs() << "BOLT-INFO: Allocation combiner: " << NumCoalesced + outs() << "BOLT-INFO: Allocation combiner: " << NumCombined << " empty spaces coalesced.\n"; } diff --git a/bolt/Passes/AllocCombiner.h b/bolt/Passes/AllocCombiner.h index 1be39974be3c..0e816048956d 100644 --- a/bolt/Passes/AllocCombiner.h +++ b/bolt/Passes/AllocCombiner.h @@ -21,7 +21,7 @@ namespace bolt { class AllocCombinerPass : public BinaryFunctionPass { /// Stats aggregating variables uint64_t NumCombined{0}; - uint64_t NumCoalesced{0}; + DenseSet FuncsChanged; void combineAdjustments(BinaryContext &BC, BinaryFunction &BF); void coalesceEmptySpace(BinaryContext &BC, BinaryFunction &BF, @@ -35,6 +35,10 @@ class AllocCombinerPass : public BinaryFunctionPass { return "alloc-combiner"; } + bool shouldPrint(const BinaryFunction &BF) const override { + return BinaryFunctionPass::shouldPrint(BF) && FuncsChanged.count(&BF) > 0; + } + /// Pass entry point void runOnFunctions(BinaryContext &BC, std::map &BFs, diff --git a/bolt/Passes/FrameAnalysis.cpp 
b/bolt/Passes/FrameAnalysis.cpp index 21223b38bca3..5d7e20478b55 100644 --- a/bolt/Passes/FrameAnalysis.cpp +++ b/bolt/Passes/FrameAnalysis.cpp @@ -316,12 +316,11 @@ bool FrameAnalysis::updateArgsTouchedFor(const BinaryFunction &BF, MCInst &Inst, // If indirect call, we conservatively assume it accesses all stack positions if (TargetSymbol == nullptr) { addArgAccessesFor(Inst, ArgAccesses(/*AssumeEverything=*/true)); - bool Updated{false}; if (!FunctionsRequireAlignment.count(&BF)) { - Updated = true; FunctionsRequireAlignment.insert(&BF); + return true; } - return Updated; + return false; } const auto *Function = BC.getFunctionForSymbol(TargetSymbol); @@ -329,20 +328,17 @@ bool FrameAnalysis::updateArgsTouchedFor(const BinaryFunction &BF, MCInst &Inst, // it accesses all stack positions if (Function == nullptr) { addArgAccessesFor(Inst, ArgAccesses(/*AssumeEverything=*/true)); - bool Updated{false}; if (!FunctionsRequireAlignment.count(&BF)) { - Updated = true; FunctionsRequireAlignment.insert(&BF); + return true; } - return Updated; + return false; } auto Iter = ArgsTouchedMap.find(Function); - if (Iter == ArgsTouchedMap.end()) - return false; bool Changed = false; - if (BC.MIA->isTailCall(Inst)) { + if (BC.MIA->isTailCall(Inst) && Iter != ArgsTouchedMap.end()) { // Ignore checking CurOffset because we can't always reliably determine the // offset specially after an epilogue, where tailcalls happen. It should be // -8. 
@@ -358,6 +354,8 @@ bool FrameAnalysis::updateArgsTouchedFor(const BinaryFunction &BF, MCInst &Inst, Changed = true; FunctionsRequireAlignment.insert(&BF); } + if (Iter == ArgsTouchedMap.end()) + return false; if (CurOffset == StackPointerTracking::EMPTY || CurOffset == StackPointerTracking::SUPERPOSITION) { @@ -382,18 +380,18 @@ bool FrameAnalysis::updateArgsTouchedFor(const BinaryFunction &BF, MCInst &Inst, bool FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) { if (!BF.isSimple() || !BF.hasCFG()) { DEBUG(dbgs() << "Treating " << BF.getPrintName() << " conservatively.\n"); - bool Updated = false; ArgsTouchedMap[&BF].emplace(std::make_pair(-1, 0)); if (!FunctionsRequireAlignment.count(&BF)) { - Updated = true; FunctionsRequireAlignment.insert(&BF); + return true; } - return Updated; + return false; } DEBUG(dbgs() << "Now computing args accessed for: " << BF.getPrintName() << "\n"); bool UpdatedArgsTouched = false; + bool NoInfo = false; FrameAccessAnalysis FAA(BC, BF); for (auto BB : BF.layout()) { @@ -402,6 +400,7 @@ bool FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) { for (auto &Inst : *BB) { if (!FAA.doNext(*BB, Inst)) { ArgsTouchedMap[&BF].emplace(std::make_pair(-1, 0)); + NoInfo = true; break; } @@ -429,25 +428,26 @@ bool FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) { BC.printInstruction(dbgs(), Inst, 0, &BF, true); }); } + if (NoInfo) + break; } if (FunctionsRequireAlignment.count(&BF)) return UpdatedArgsTouched; - bool UpdatedAlignedStatus = false; + if (NoInfo) { + FunctionsRequireAlignment.insert(&BF); + return true; + } + for (auto &BB : BF) { - if (UpdatedAlignedStatus) - break; for (auto &Inst : BB) { if (BC.MIA->requiresAlignedAddress(Inst)) { - if (!FunctionsRequireAlignment.count(&BF)) { - UpdatedAlignedStatus = true; - FunctionsRequireAlignment.insert(&BF); - break; - } + FunctionsRequireAlignment.insert(&BF); + return true; } } } - return UpdatedArgsTouched || UpdatedAlignedStatus; + return UpdatedArgsTouched; } bool 
FrameAnalysis::restoreFrameIndex(BinaryFunction &BF) { diff --git a/bolt/Passes/FrameOptimizer.cpp b/bolt/Passes/FrameOptimizer.cpp index 7f0e5215e695..94cbb09bca8d 100644 --- a/bolt/Passes/FrameOptimizer.cpp +++ b/bolt/Passes/FrameOptimizer.cpp @@ -265,7 +265,8 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC, opts::TimeOpts); DataflowInfoManager Info(BC, I.second, &RA, &FA); ShrinkWrapping SW(FA, BC, I.second, Info); - SW.perform(); + if (SW.perform()) + FuncsChanged.insert(&I.second); } } diff --git a/bolt/Passes/FrameOptimizer.h b/bolt/Passes/FrameOptimizer.h index 3c6e3bee168a..b868e65fb974 100644 --- a/bolt/Passes/FrameOptimizer.h +++ b/bolt/Passes/FrameOptimizer.h @@ -84,6 +84,8 @@ class FrameOptimizerPass : public BinaryFunctionPass { uint64_t NumLoadsChangedToImm{0}; uint64_t NumLoadsDeleted{0}; + DenseSet FuncsChanged; + /// Perform a dataflow analysis in \p BF to reveal unnecessary reloads from /// the frame. Use the analysis to convert memory loads to register moves or /// immediate loads. Delete redundant register moves. 
@@ -109,6 +111,10 @@ class FrameOptimizerPass : public BinaryFunctionPass { void runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) override; + + bool shouldPrint(const BinaryFunction &BF) const override { + return BinaryFunctionPass::shouldPrint(BF) && FuncsChanged.count(&BF) > 0; + } }; } // namespace bolt diff --git a/bolt/Passes/ShrinkWrapping.cpp b/bolt/Passes/ShrinkWrapping.cpp index a3bed4802b0c..e0f5dabb44f5 100644 --- a/bolt/Passes/ShrinkWrapping.cpp +++ b/bolt/Passes/ShrinkWrapping.cpp @@ -256,6 +256,63 @@ void StackLayoutModifier::checkFramePointerInitialization(MCInst &Point) { blacklistRegion(0, 0); } +void StackLayoutModifier::checkStackPointerRestore(MCInst &Point) { + auto &SPT = Info.getStackPointerTracking(); + if (!BC.MII->get(Point.getOpcode()) + .hasDefOfPhysReg(Point, BC.MIA->getStackPointer(), *BC.MRI)) + return; + // Check if the definition of SP comes from FP -- in this case, this + // value may need to be updated depending on our stack layout changes + const auto InstInfo = BC.MII->get(Point.getOpcode()); + auto NumDefs = InstInfo.getNumDefs(); + bool UsesFP{false}; + for (unsigned I = NumDefs, E = Point.getNumPrimeOperands(); I < E; ++I) { + auto &Operand = Point.getOperand(I); + if (!Operand.isReg()) + continue; + if (Operand.getReg() == BC.MIA->getFramePointer()) { + UsesFP = true; + break; + } + } + if (!UsesFP) + return; + + // Setting up evaluation + int SPVal, FPVal; + std::tie(SPVal, FPVal) = *SPT.getStateBefore(Point); + std::pair FP; + + if (FPVal != SPT.EMPTY && FPVal != SPT.SUPERPOSITION) + FP = std::make_pair(BC.MIA->getFramePointer(), FPVal); + else + FP = std::make_pair(0, 0); + std::pair SP; + + if (SPVal != SPT.EMPTY && SPVal != SPT.SUPERPOSITION) + SP = std::make_pair(BC.MIA->getStackPointer(), SPVal); + else + SP = std::make_pair(0, 0); + + int64_t Output; + if (!BC.MIA->evaluateSimple(Point, Output, SP, FP)) + return; + + // If the value is the same of FP, no need to adjust it + if (Output == 
FPVal) + return; + + // If an allocation happened through FP, bail + if (Output <= SPVal) { + blacklistRegion(0, 0); + return; + } + + // We are restoring SP to an old value based on FP. Mark it as a stack + // access to be fixed later. + BC.MIA->addAnnotation(BC.Ctx.get(), Point, getSlotTagName(), Output); +} + void StackLayoutModifier::classifyStackAccesses() { // Understand when stack slots are being used non-locally auto &SRU = Info.getStackReachingUses(); @@ -265,6 +322,7 @@ void StackLayoutModifier::classifyStackAccesses() { for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { auto &Inst = *I; checkFramePointerInitialization(Inst); + checkStackPointerRestore(Inst); auto FIEX = FA.getFIEFor(Inst); if (!FIEX) { Prev = &Inst; @@ -441,6 +499,15 @@ bool StackLayoutModifier::collapseRegion(MCInst *Alloc, int64_t RegionAddr, scheduleChange(Inst, WorklistItem(WorklistItem::AdjustCFI, RegionSz)); continue; } + auto FIE = FA.getFIEFor(Inst); + if (!FIE) { + if (Slot > RegionAddr) + continue; + // SP update based on frame pointer + scheduleChange( + Inst, WorklistItem(WorklistItem::AdjustLoadStoreOffset, RegionSz)); + continue; + } if (Slot == RegionAddr) { BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "AccessesDeletedPos", 0U); @@ -450,8 +517,7 @@ bool StackLayoutModifier::collapseRegion(MCInst *Alloc, int64_t RegionAddr, continue; } - auto FIE = FA.getFIEFor(Inst); - assert(FIE); + if (FIE->StackPtrReg == BC.MIA->getStackPointer() && Slot < RegionAddr) continue; @@ -534,9 +600,15 @@ bool StackLayoutModifier::insertRegion(ProgramPoint P, int64_t RegionSz) { scheduleChange(Inst, WorklistItem(WorklistItem::AdjustCFI, -RegionSz)); continue; } - auto FIE = FA.getFIEFor(Inst); - assert(FIE); + if (!FIE) { + if (Slot >= RegionAddr) + continue; + scheduleChange( + Inst, WorklistItem(WorklistItem::AdjustLoadStoreOffset, -RegionSz)); + continue; + } + if (FIE->StackPtrReg == BC.MIA->getStackPointer() && Slot < RegionAddr) continue; if (FIE->StackPtrReg == 
BC.MIA->getFramePointer() && Slot >= RegionAddr) @@ -606,6 +678,12 @@ void StackLayoutModifier::performChanges() { Success = BC.MIA->isStackAccess(Inst, IsLoad, IsStore, IsStoreFromReg, Reg, SrcImm, StackPtrReg, StackOffset, Size, IsSimple, IsIndexed); + if (!Success) { + // SP update based on FP value + Success = BC.MIA->addToImm(Inst, Adjustment, &*BC.Ctx); + assert(Success); + continue; + } assert(Success && IsSimple && !IsIndexed && (!IsStore || IsStoreFromReg)); if (StackPtrReg != BC.MIA->getFramePointer()) Adjustment = -Adjustment; @@ -1282,6 +1360,7 @@ bool ShrinkWrapping::foldIdenticalSplitEdges() { BinaryBasicBlock *Pred = *RBB.pred_begin(); uint64_t OrigCount{Pred->branch_info_begin()->Count}; uint64_t OrigMispreds{Pred->branch_info_begin()->MispredictedCount}; + BF.replaceJumpTableEntryIn(Pred, &RBB, &BB); Pred->replaceSuccessor(&RBB, &BB, OrigCount, OrigMispreds); Changed = true; // Remove the block from CFG @@ -1807,7 +1886,7 @@ void ShrinkWrapping::rebuildCFI() { } } -void ShrinkWrapping::perform() { +bool ShrinkWrapping::perform() { HasDeletedOffsetCFIs = std::vector(BC.MRI->getNumRegs(), false); PushOffsetByReg = std::vector(BC.MRI->getNumRegs(), 0LL); PopOffsetByReg = std::vector(BC.MRI->getNumRegs(), 0LL); @@ -1827,7 +1906,7 @@ void ShrinkWrapping::perform() { SLM.performChanges(); // Early exit if processInsertions doesn't detect any todo items if (!processInsertions()) - return; + return false; processDeletions(); if (foldIdenticalSplitEdges()) { const auto Stats = BF.eraseInvalidBBs(); @@ -1842,6 +1921,7 @@ void ShrinkWrapping::perform() { dbgs() << "Func after shrink-wrapping: \n"; BF.dump(); }); + return true; } void ShrinkWrapping::printStats() { diff --git a/bolt/Passes/ShrinkWrapping.h b/bolt/Passes/ShrinkWrapping.h index 91549124148b..f22b188c5d97 100644 --- a/bolt/Passes/ShrinkWrapping.h +++ b/bolt/Passes/ShrinkWrapping.h @@ -161,6 +161,11 @@ class StackLayoutModifier { /// regions. 
void checkFramePointerInitialization(MCInst &Point); + /// If \p Point is restoring the value with SP with FP plus offset, + /// add a slottag to this instruction as it needs to be updated when we + /// change the stack layout. + void checkStackPointerRestore(MCInst &Point); + /// Make sense of each stack offsets we can freely change void classifyStackAccesses(); void classifyCFIs(); @@ -470,7 +475,7 @@ class ShrinkWrapping { } } - void perform(); + bool perform(); static void printStats(); }; From fad42ce002d9b87ace85041fb3904d5e608422b0 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 14 Nov 2017 20:05:11 -0800 Subject: [PATCH 387/904] [BOLT] Refactor global symbol handling code. Summary: This is preparation work for static data reordering. I've created a new class called BinaryData which represents a symbol contained in a section. It records almost all the information relevant for dealing with data, e.g. names, address, size, alignment, profiling data, etc. BinaryContext still stores and manages BinaryData objects similar to how it managed symbols and global addresses before. The interfaces are not changed too drastically from before either. There is a bit of overlap between BinaryData and BinaryFunction. I would have liked to do some more refactoring to make a BinaryFunctionFragment that subclassed from BinaryData and then have BinaryFunction be composed or associated with BinaryFunctionFragments. I've also attempted to use (symbol + offset) for when addresses are pointing into the middle of symbols with known sizes. This changes the simplify rodata loads optimization slightly since the expression on an instruction can now also be a (symbol + offset) rather than just a symbol. One of the overall goals for this refactoring is to make sure every relocation is associated with a BinaryData object. This requires adding "hole" BinaryData's wherever there are gaps in a section's address space. Most of the holes seem to be data that has no associated symbol info. 
In this case we can't do any better than lumping all the adjacent hole symbols into one big symbol (there may be more than one actual data object that contributes to a hole). At least the combined holes should be moveable. Jump tables have similar issues. They appear to mostly be sub-objects for top level local symbols. The main problem is that we can't recognize jump tables at the time we scan the symbol table, we have to wait til disassembly. When a jump table is discovered we add it as a sub-object to the existing local symbol. If there are one or more existing BinaryData's that appear in the address range of a newly created jump table, those are added as sub-objects as well. (cherry picked from commit 5643274ab5dbafe84417360192e2ee0310c5acc8) --- bolt/BinaryBasicBlock.cpp | 15 +- bolt/BinaryContext.cpp | 386 ++++++++++++++++++++++-- bolt/BinaryContext.h | 308 ++++++++++++++++--- bolt/BinaryData.cpp | 132 ++++++++ bolt/BinaryData.h | 207 +++++++++++++ bolt/BinaryFunction.cpp | 376 +++++++++-------------- bolt/BinaryFunction.h | 134 ++------ bolt/BinaryFunctionProfile.cpp | 12 +- bolt/BinarySection.cpp | 317 +------------------ bolt/BinarySection.h | 102 ++----- bolt/CMakeLists.txt | 3 + bolt/DWARFRewriter.cpp | 2 +- bolt/DataAggregator.cpp | 10 +- bolt/Exceptions.cpp | 14 +- bolt/JumpTable.cpp | 191 ++++++++++++ bolt/JumpTable.h | 123 ++++++++ bolt/Passes/BinaryFunctionCallGraph.cpp | 10 +- bolt/Passes/BinaryPasses.cpp | 16 +- bolt/Passes/IndirectCallPromotion.cpp | 49 +-- bolt/Passes/IndirectCallPromotion.h | 2 +- bolt/Passes/JTFootprintReduction.cpp | 8 +- bolt/Passes/JTFootprintReduction.h | 6 +- bolt/Passes/LongJmp.cpp | 6 +- bolt/Passes/ReorderFunctions.cpp | 19 +- bolt/ProfileWriter.cpp | 7 +- bolt/Relocation.cpp | 326 ++++++++++++++++++++ bolt/Relocation.h | 90 ++++++ bolt/RewriteInstance.cpp | 386 +++++++++++++++++------- bolt/RewriteInstance.h | 12 +- 29 files changed, 2268 insertions(+), 1001 deletions(-) create mode 100644 bolt/BinaryData.cpp create 
mode 100644 bolt/BinaryData.h create mode 100644 bolt/JumpTable.cpp create mode 100644 bolt/JumpTable.h create mode 100644 bolt/Relocation.cpp create mode 100644 bolt/Relocation.h diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index 8bb3919b18e1..f64a1b4ee338 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -78,13 +78,22 @@ bool BinaryBasicBlock::validateSuccessorInvariants() { // Work on the assumption that jump table blocks don't // have a conditional successor. Valid = false; + errs() << "BOLT-WARNING: Jump table successor " + << Succ->getName() + << " not contained in the jump table.\n"; } } // If there are any leftover entries in the jump table, they // must be one of the function end labels. - for (auto *Sym : UniqueSyms) { - Valid &= (Sym == Function->getFunctionEndLabel() || - Sym == Function->getFunctionColdEndLabel()); + if (Valid) { + for (auto *Sym : UniqueSyms) { + Valid &= (Sym == Function->getFunctionEndLabel() || + Sym == Function->getFunctionColdEndLabel()); + if (!Valid) { + errs() << "BOLT-WARNING: Jump table contains illegal entry: " + << Sym->getName() << "\n"; + } + } } } else { const MCSymbol *TBB = nullptr; diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index e6000c01dce9..09ebe6b7d58c 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -11,6 +11,7 @@ #include "BinaryContext.h" #include "BinaryFunction.h" +#include "DataReader.h" #include "llvm/ADT/Twine.h" #include "llvm/DebugInfo/DWARF/DWARFFormValue.h" #include "llvm/DebugInfo/DWARF/DWARFUnit.h" @@ -19,6 +20,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" +#include using namespace llvm; using namespace bolt; @@ -57,6 +59,7 @@ BinaryContext::~BinaryContext() { for (auto *Section : Sections) { delete Section; } + clearBinaryData(); } std::unique_ptr @@ -69,47 +72,224 @@ BinaryContext::createObjectWriter(raw_pwrite_stream &OS) { return 
MAB->createObjectWriter(OS); } +bool BinaryContext::validateObjectNesting() const { + auto Itr = BinaryDataMap.begin(); + auto End = BinaryDataMap.end(); + bool Valid = true; + while (Itr != End) { + auto Next = std::next(Itr); + while (Next != End && + Itr->second->getSection() == Next->second->getSection() && + Itr->second->containsRange(Next->second->getAddress(), + Next->second->getSize())) { + if (Next->second->Parent != Itr->second) { + errs() << "BOLT-WARNING: object nesting incorrect for:\n" + << "BOLT-WARNING: " << *Itr->second << "\n" + << "BOLT-WARNING: " << *Next->second << "\n"; + Valid = false; + } + ++Next; + } + Itr = Next; + } + return Valid; +} + +bool BinaryContext::validateHoles() const { + bool Valid = true; + for (auto &Section : sections()) { + for (const auto &Rel : Section.relocations()) { + auto RelAddr = Rel.Offset + Section.getAddress(); + auto *BD = getBinaryDataContainingAddress(RelAddr); + if (!BD) { + errs() << "BOLT-WARNING: no BinaryData found for relocation at address" + << " 0x" << Twine::utohexstr(RelAddr) << " in " + << Section.getName() << "\n"; + Valid = false; + } else if (!BD->getAtomicRoot()) { + errs() << "BOLT-WARNING: no atomic BinaryData found for relocation at " + << "address 0x" << Twine::utohexstr(RelAddr) << " in " + << Section.getName() << "\n"; + Valid = false; + } + } + } + return Valid; +} + +void BinaryContext::updateObjectNesting(BinaryDataMapType::iterator GAI) { + const auto Address = GAI->second->getAddress(); + const auto Size = GAI->second->getSize(); + + auto fixParents = + [&](BinaryDataMapType::iterator Itr, BinaryData *NewParent) { + auto *OldParent = Itr->second->Parent; + Itr->second->Parent = NewParent; + ++Itr; + while (Itr != BinaryDataMap.end() && OldParent && + Itr->second->Parent == OldParent) { + Itr->second->Parent = NewParent; + ++Itr; + } + }; + + // Check if the previous symbol contains the newly added symbol. 
+ if (GAI != BinaryDataMap.begin()) { + auto *Prev = std::prev(GAI)->second; + while (Prev) { + if (Prev->getSection() == GAI->second->getSection() && + Prev->containsRange(Address, Size)) { + fixParents(GAI, Prev); + } else { + fixParents(GAI, nullptr); + } + Prev = Prev->Parent; + } + } + + // Check if the newly added symbol contains any subsequent symbols. + if (Size != 0) { + auto *BD = GAI->second->Parent ? GAI->second->Parent : GAI->second; + auto Itr = std::next(GAI); + while (Itr != BinaryDataMap.end() && + BD->containsRange(Itr->second->getAddress(), + Itr->second->getSize())) { + Itr->second->Parent = BD; + ++Itr; + } + } +} + MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, + uint64_t Size, + uint16_t Alignment, Twine Prefix) { - MCSymbol *Symbol{nullptr}; - std::string Name; - auto NI = GlobalAddresses.find(Address); - if (NI != GlobalAddresses.end()) { - // Even though there could be multiple names registered at the address, - // we only use the first one. - Name = NI->second; - } else { - Name = (Prefix + "0x" + Twine::utohexstr(Address)).str(); - assert(GlobalSymbols.find(Name) == GlobalSymbols.end() && - "created name is not unique"); - GlobalAddresses.emplace(std::make_pair(Address, Name)); + auto Itr = BinaryDataMap.find(Address); + if (Itr != BinaryDataMap.end()) { + assert(Itr->second->getSize() == Size || !Size); + return Itr->second->getSymbol(); } - Symbol = Ctx->lookupSymbol(Name); - if (Symbol) - return Symbol; + std::string Name = (Prefix + "0x" + Twine::utohexstr(Address)).str(); + assert(!GlobalSymbols.count(Name) && "created name is not unique"); + return registerNameAtAddress(Name, Address, Size, Alignment); +} - Symbol = Ctx->getOrCreateSymbol(Name); - GlobalSymbols[Name] = Address; +MCSymbol *BinaryContext::registerNameAtAddress(StringRef Name, + uint64_t Address, + uint64_t Size, + uint16_t Alignment) { + auto SectionOrErr = getSectionForAddress(Address); + auto &Section = SectionOrErr ? 
SectionOrErr.get() : absoluteSection(); + auto GAI = BinaryDataMap.find(Address); + BinaryData *BD; + if (GAI == BinaryDataMap.end()) { + BD = new BinaryData(Name, + Address, + Size, + Alignment ? Alignment : 1, + Section); + } else { + BD = GAI->second; + } + return registerNameAtAddress(Name, Address, BD); +} +MCSymbol *BinaryContext::registerNameAtAddress(StringRef Name, + uint64_t Address, + BinaryData *BD) { + auto GAI = BinaryDataMap.find(Address); + if (GAI != BinaryDataMap.end()) { + if (BD != GAI->second) { + // Note: this could be a source of bugs if client code holds + // on to BinaryData*'s in data structures for any length of time. + auto *OldBD = GAI->second; + BD->merge(GAI->second); + delete OldBD; + GAI->second = BD; + for (auto &Name : BD->names()) { + GlobalSymbols[Name] = BD; + } + updateObjectNesting(GAI); + } else if (!GAI->second->hasName(Name)) { + GAI->second->Names.push_back(Name); + GlobalSymbols[Name] = GAI->second; + } + BD = nullptr; + } else { + GAI = BinaryDataMap.emplace(Address, BD).first; + GlobalSymbols[Name] = BD; + updateObjectNesting(GAI); + } + + // Register the name with MCContext. 
+ auto *Symbol = Ctx->getOrCreateSymbol(Name); + if (BD) { + BD->Symbols.push_back(Symbol); + assert(BD->Symbols.size() == BD->Names.size()); + } return Symbol; } -MCSymbol *BinaryContext::getGlobalSymbolAtAddress(uint64_t Address) const { - auto NI = GlobalAddresses.find(Address); - if (NI == GlobalAddresses.end()) - return nullptr; +const BinaryData * +BinaryContext::getBinaryDataContainingAddressImpl(uint64_t Address, + bool IncludeEnd, + bool BestFit) const { + auto NI = BinaryDataMap.lower_bound(Address); + auto End = BinaryDataMap.end(); + if ((NI != End && Address == NI->first) || + (NI-- != BinaryDataMap.begin())) { + if (NI->second->containsAddress(Address) || + (IncludeEnd && NI->second->getEndAddress() == Address)) { + while (BestFit && + std::next(NI) != End && + (std::next(NI)->second->containsAddress(Address) || + (IncludeEnd && std::next(NI)->second->getEndAddress() == Address))) { + ++NI; + } + return NI->second; + } - auto *Symbol = Ctx->lookupSymbol(NI->second); - assert(Symbol && "symbol cannot be NULL at this point"); + // If this is a sub-symbol, see if a parent data contains the address. + auto *BD = NI->second->getParent(); + while (BD) { + if (BD->containsAddress(Address) || + (IncludeEnd && NI->second->getEndAddress() == Address)) + return BD; + BD = BD->getParent(); + } + } + return nullptr; +} - return Symbol; +bool BinaryContext::setBinaryDataSize(uint64_t Address, uint64_t Size) { + auto NI = BinaryDataMap.find(Address); + assert(NI != BinaryDataMap.end()); + if (NI == BinaryDataMap.end()) + return false; + assert(!NI->second->Size || NI->second->Size == Size); + NI->second->Size = Size; + updateObjectNesting(NI); + return true; } -MCSymbol *BinaryContext::getGlobalSymbolByName(const std::string &Name) const { - auto Itr = GlobalSymbols.find(Name); - return Itr == GlobalSymbols.end() - ? 
nullptr : getGlobalSymbolAtAddress(Itr->second); +void BinaryContext::postProcessSymbolTable() { + fixBinaryDataHoles(); + bool Valid = true; + for (auto &Entry : BinaryDataMap) { + auto *BD = Entry.second; + if ((BD->getName().startswith("SYMBOLat") || + BD->getName().startswith("DATAat")) && + !BD->getParent() && + !BD->getSize() && + !BD->isAbsolute() && + BD->getSection()) { + outs() << "BOLT-WARNING: zero sized top level symbol: " << *BD << "\n"; + Valid = false; + } + } + assert(Valid); + assignMemData(); } void BinaryContext::foldFunction(BinaryFunction &ChildBF, @@ -126,7 +306,7 @@ void BinaryContext::foldFunction(BinaryFunction &ChildBF, assert(Symbol && "symbol cannot be NULL at this point"); SymbolToFunctionMap[Symbol] = &ParentBF; - // NB: there's no need to update GlobalAddresses and GlobalSymbols. + // NB: there's no need to update BinaryDataMap and GlobalSymbols. } // Merge execution counts of ChildBF into those of ParentBF. @@ -148,10 +328,138 @@ void BinaryContext::foldFunction(BinaryFunction &ChildBF, } } +void BinaryContext::fixBinaryDataHoles() { + assert(validateObjectNesting() && "object nesting inconsitency detected"); + + for (auto &Section : allocatableSections()) { + std::vector> Holes; + + auto isNotHole = [&Section](const binary_data_iterator &Itr) { + auto *BD = Itr->second; + bool isHole = (!BD->getParent() && + !BD->getSize() && + BD->isObject() && + (BD->getName().startswith("SYMBOLat0x") || + BD->getName().startswith("DATAat0x") || + BD->getName().startswith("ANONYMOUS"))); + return !isHole && BD->getSection() == Section && !BD->getParent(); + }; + + auto BDStart = BinaryDataMap.begin(); + auto BDEnd = BinaryDataMap.end(); + auto Itr = FilteredBinaryDataIterator(isNotHole, BDStart, BDEnd); + auto End = FilteredBinaryDataIterator(isNotHole, BDEnd, BDEnd); + + uint64_t EndAddress = Section.getAddress(); + + while (Itr != End) { + auto Gap = Itr->second->getAddress() - EndAddress; + if (Gap > 0) { + assert(EndAddress < 
Itr->second->getAddress()); + Holes.push_back(std::make_pair(EndAddress, Gap)); + } + EndAddress = Itr->second->getEndAddress(); + ++Itr; + } + + if (EndAddress < Section.getEndAddress()) { + Holes.push_back(std::make_pair(EndAddress, + Section.getEndAddress() - EndAddress)); + } + + // If there is already a symbol at the start of the hole, grow that symbol + // to cover the rest. Otherwise, create a new symbol to cover the hole. + for (auto &Hole : Holes) { + auto *BD = getBinaryDataAtAddress(Hole.first); + if (BD) { + // BD->getSection() can be != Section if there are sections that + // overlap. In this case it is probably safe to just skip the holes + // since the overlapping section will not(?) have any symbols in it. + if (BD->getSection() == Section) + setBinaryDataSize(Hole.first, Hole.second); + } else { + getOrCreateGlobalSymbol(Hole.first, Hole.second, 1, "HOLEat"); + } + } + } + + assert(validateObjectNesting() && "object nesting inconsitency detected"); + assert(validateHoles() && "top level hole detected in object map"); +} + void BinaryContext::printGlobalSymbols(raw_ostream& OS) const { - for (auto &Entry : GlobalSymbols) { - OS << "(" << Entry.first << " -> 0x" - << Twine::utohexstr(Entry.second) << ")\n"; + const BinarySection* CurrentSection = nullptr; + bool FirstSection = true; + + for (auto &Entry : BinaryDataMap) { + const auto *BD = Entry.second; + const auto &Section = BD->getSection(); + if (FirstSection || Section != *CurrentSection) { + uint64_t Address, Size; + StringRef Name = Section.getName(); + if (Section) { + Address = Section.getAddress(); + Size = Section.getSize(); + } else { + Address = BD->getAddress(); + Size = BD->getSize(); + } + OS << "BOLT-INFO: Section " << Name << ", " + << "0x" + Twine::utohexstr(Address) << ":" + << "0x" + Twine::utohexstr(Address + Size) << "/" + << Size << "\n"; + CurrentSection = &Section; + FirstSection = false; + } + + OS << "BOLT-INFO: "; + auto *P = BD->getParent(); + while (P) { + OS << " "; + 
P = P->getParent(); + } + OS << *BD << "\n"; + } +} + +void BinaryContext::assignMemData() { + auto getAddress = [&](const MemInfo &MI) { + if (!MI.Addr.IsSymbol) + return MI.Addr.Offset; + + if (auto *BD = getBinaryDataByName(MI.Addr.Name)) + return BD->getAddress() + MI.Addr.Offset; + + return 0ul; + }; + + // Map of sections (or heap/stack) to count/size. + std::map Counts; + + uint64_t TotalCount = 0; + for (auto &Entry : DR.getAllFuncsMemData()) { + for (auto &MI : Entry.second.Data) { + const auto Addr = getAddress(MI); + auto *BD = getBinaryDataContainingAddress(Addr); + if (BD) { + BD->getAtomicRoot()->addMemData(MI); + Counts[BD->getSectionName()] += MI.Count; + } else { + Counts["Heap/stack"] += MI.Count; + } + TotalCount += MI.Count; + } + } + + if (!Counts.empty()) { + outs() << "BOLT-INFO: Memory stats breakdown:\n"; + for (auto &Entry : Counts) { + const auto Section = Entry.first; + const auto Count = Entry.second; + outs() << "BOLT-INFO: " << Section << " = " << Count + << format(" (%.1f%%)\n", 100.0*Count/TotalCount); + } + outs() << "BOLT-INFO: Total memory events: " << TotalCount << "\n"; } } @@ -484,6 +792,14 @@ BinaryContext::getSectionForAddress(uint64_t Address) const { return std::make_error_code(std::errc::bad_address); } +ErrorOr +BinaryContext::getSectionNameForAddress(uint64_t Address) const { + if (auto Section = getSectionForAddress(Address)) { + return Section->getName(); + } + return std::make_error_code(std::errc::bad_address); +} + BinarySection &BinaryContext::registerSection(BinarySection *Section) { assert(!Section->getName().empty() && "can't register sections without a name"); @@ -562,6 +878,12 @@ void BinaryContext::printSections(raw_ostream &OS) const { } } +BinarySection &BinaryContext::absoluteSection() { + if (auto Section = getUniqueSectionByName("")) + return *Section; + return registerOrUpdateSection("", ELF::SHT_NULL, 0u); +} + ErrorOr BinaryContext::extractPointerAtAddress(uint64_t Address) const { auto Section = 
getSectionForAddress(Address); diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index bea5ec4a4f21..5cb67ad4fe16 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -14,6 +14,7 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H #define LLVM_TOOLS_LLVM_BOLT_BINARY_CONTEXT_H +#include "BinaryData.h" #include "BinarySection.h" #include "DebugData.h" #include "llvm/ADT/iterator.h" @@ -55,6 +56,50 @@ namespace bolt { class BinaryFunction; class DataReader; +/// Filter iterator. +template > +class FilterIterator + : public std::iterator::value_type> { + using Iterator = FilterIterator; + using T = typename std::iterator_traits::reference; + using PointerT = typename std::iterator_traits::pointer; + + PredType Pred; + ItrType Itr, End; + + void prev() { + while (!Pred(--Itr)) + ; + } + void next() { + ++Itr; + nextMatching(); + } + void nextMatching() { + while (Itr != End && !Pred(Itr)) + ++Itr; + } +public: + Iterator &operator++() { next(); return *this; } + Iterator &operator--() { prev(); return *this; } + Iterator operator++(int) { auto Tmp(Itr); next(); return Tmp; } + Iterator operator--(int) { auto Tmp(Itr); prev(); return Tmp; } + bool operator==(const Iterator& Other) const { + return Itr == Other.Itr; + } + bool operator!=(const Iterator& Other) const { + return !operator==(Other); + } + T operator*() { return *Itr; } + PointerT operator->() { return &operator*(); } + FilterIterator(PredType Pred, ItrType Itr, ItrType End) + : Pred(Pred), Itr(Itr), End(End) { + nextMatching(); + } +}; + class BinaryContext { BinaryContext() = delete; @@ -70,6 +115,9 @@ class BinaryContext { using SectionIterator = pointee_iterator; using SectionConstIterator = pointee_iterator; + using FilteredSectionIterator = FilterIterator; + using FilteredSectionConstIterator = FilterIterator; + /// Map virtual address to a section. It is possible to have more than one /// section mapped to the same address, e.g. non-allocatable sections. 
using AddressToSectionMapType = std::multimap; @@ -84,13 +132,24 @@ class BinaryContext { BinarySection ®isterSection(BinarySection *Section); public: - /// [name] -> [address] map used for global symbol resolution. - typedef std::map SymbolMapType; + /// [name] -> [BinaryData*] map used for global symbol resolution. + using SymbolMapType = std::map; SymbolMapType GlobalSymbols; - /// [address] -> [name1], [name2], ... - /// Global addresses never change. - std::multimap GlobalAddresses; + /// [address] -> [BinaryData], ... + /// Addresses never change. + /// Note: it is important that clients do not hold on to instances of + /// BinaryData* while the map is still being modified during BinaryFunction + /// disassembly. This is because of the possibility that a regular + /// BinaryData is later discovered to be a JumpTable. + using BinaryDataMapType = std::map; + using binary_data_iterator = BinaryDataMapType::iterator; + using binary_data_const_iterator = BinaryDataMapType::const_iterator; + BinaryDataMapType BinaryDataMap; + + using FilteredBinaryDataConstIterator = + FilterIterator; + using FilteredBinaryDataIterator = FilterIterator; /// [MCSymbol] -> [BinaryFunction] /// @@ -99,6 +158,38 @@ class BinaryContext { std::unordered_map SymbolToFunctionMap; + /// Look up the symbol entry that contains the given \p Address (based on + /// the start address and size for each symbol). Returns a pointer to + /// the BinaryData for that symbol. If no data is found, nullptr is returned. + const BinaryData *getBinaryDataContainingAddressImpl(uint64_t Address, + bool IncludeEnd, + bool BestFit) const; + + /// Update the Parent fields in BinaryDatas after adding a new entry into + /// \p BinaryDataMap. + void updateObjectNesting(BinaryDataMapType::iterator GAI); + + /// Validate that if object address ranges overlap that the object with + /// the larger range is a parent of the object with the smaller range. 
+ bool validateObjectNesting() const; + + /// Validate that there are no top level "holes" in each section + /// and that all relocations with a section are mapped to a valid + /// top level BinaryData. + bool validateHoles() const; + + /// Get a bogus "absolute" section that will be associated with all + /// absolute BinaryDatas. + BinarySection &absoluteSection(); + + /// Process "holes" in between known BinaryData objects. For now, + /// symbols are padded with the space before the next BinaryData object. + void fixBinaryDataHoles(); + + /// Populate \p GlobalMemData. This should be done after all symbol discovery + /// is complete, e.g. after building CFGs for all functions. + void assignMemData(); +public: /// Map address to a constant island owner (constant data in code section) std::map AddressToConstantIslandMap; @@ -204,28 +295,122 @@ class BinaryContext { std::unique_ptr createObjectWriter(raw_pwrite_stream &OS); - /// Return a global symbol registered at a given \p Address. If no symbol - /// exists, create one with unique name using \p Prefix. + /// Iterate over all BinaryData. + iterator_range getBinaryData() const { + return make_range(BinaryDataMap.begin(), BinaryDataMap.end()); + } + + /// Iterate over all BinaryData. + iterator_range getBinaryData() { + return make_range(BinaryDataMap.begin(), BinaryDataMap.end()); + } + + /// Iterate over all BinaryData associated with the given \p Section. + iterator_range + getBinaryDataForSection(StringRef SectionName) const { + auto Begin = BinaryDataMap.begin(); + auto End = BinaryDataMap.end(); + auto pred = + [&SectionName](const binary_data_const_iterator &Itr) -> bool { + return Itr->second->getSection().getName() == SectionName; + }; + return make_range(FilteredBinaryDataConstIterator(pred, Begin, End), + FilteredBinaryDataConstIterator(pred, End, End)); + } + + /// Iterate over all BinaryData associated with the given \p Section. 
+ iterator_range + getBinaryDataForSection(StringRef SectionName) { + auto Begin = BinaryDataMap.begin(); + auto End = BinaryDataMap.end(); + auto pred = [&SectionName](const binary_data_iterator &Itr) -> bool { + return Itr->second->getSection().getName() == SectionName; + }; + return make_range(FilteredBinaryDataIterator(pred, Begin, End), + FilteredBinaryDataIterator(pred, End, End)); + } + + /// Clear the global symbol address -> name(s) map. + void clearBinaryData() { + GlobalSymbols.clear(); + for (auto &Entry : BinaryDataMap) { + delete Entry.second; + } + BinaryDataMap.clear(); + } + + + /// Return a global symbol registered at a given \p Address and \p Size. + /// If no symbol exists, create one with unique name using \p Prefix. /// If there are multiple symbols registered at the \p Address, then /// return the first one. - MCSymbol *getOrCreateGlobalSymbol(uint64_t Address, Twine Prefix); - - /// Return MCSymbol registered at a given \p Address or nullptr if no + MCSymbol *getOrCreateGlobalSymbol(uint64_t Address, + uint64_t Size, + uint16_t Alignment, + Twine Prefix); + + /// Register a symbol with \p Name at a given \p Address and \p Size. + MCSymbol *registerNameAtAddress(StringRef Name, + uint64_t Address, + BinaryData* BD); + + /// Register a symbol with \p Name at a given \p Address and \p Size. + MCSymbol *registerNameAtAddress(StringRef Name, + uint64_t Address, + uint64_t Size, + uint16_t Alignment); + + /// Return BinaryData registered at a given \p Address or nullptr if no /// global symbol was registered at the location. - MCSymbol *getGlobalSymbolAtAddress(uint64_t Address) const; + const BinaryData *getBinaryDataAtAddress(uint64_t Address) const { + auto NI = BinaryDataMap.find(Address); + return NI != BinaryDataMap.end() ? NI->second : nullptr; + } - /// Find the address of the global symbol with the given \p Name. - /// return an error if no such symbol exists. 
- ErrorOr getAddressForGlobalSymbol(StringRef Name) const { - auto Itr = GlobalSymbols.find(Name); - if (Itr != GlobalSymbols.end()) - return Itr->second; - return std::make_error_code(std::errc::bad_address); + BinaryData *getBinaryDataAtAddress(uint64_t Address) { + auto NI = BinaryDataMap.find(Address); + return NI != BinaryDataMap.end() ? NI->second : nullptr; + } + + /// Look up the symbol entry that contains the given \p Address (based on + /// the start address and size for each symbol). Returns a pointer to + /// the BinaryData for that symbol. If no data is found, nullptr is returned. + const BinaryData *getBinaryDataContainingAddress(uint64_t Address, + bool IncludeEnd = false, + bool BestFit = false) const { + return getBinaryDataContainingAddressImpl(Address, IncludeEnd, BestFit); + } + + BinaryData *getBinaryDataContainingAddress(uint64_t Address, + bool IncludeEnd = false, + bool BestFit = false) { + return const_cast(getBinaryDataContainingAddressImpl(Address, + IncludeEnd, + BestFit)); } - /// Return MCSymbol for the given \p Name or nullptr if no + /// Return BinaryData for the given \p Name or nullptr if no /// global symbol with that name exists. - MCSymbol *getGlobalSymbolByName(const std::string &Name) const; + const BinaryData *getBinaryDataByName(StringRef Name) const { + auto Itr = GlobalSymbols.find(Name); + return Itr != GlobalSymbols.end() ? Itr->second : nullptr; + } + + BinaryData *getBinaryDataByName(StringRef Name) { + auto Itr = GlobalSymbols.find(Name); + return Itr != GlobalSymbols.end() ? Itr->second : nullptr; + } + + /// Perform any necessary post processing on the symbol table after + /// function disassembly is complete. This processing fixes top + /// level data holes and makes sure the symbol table is valid. + /// It also assigns all memory profiling info to the appropriate + /// BinaryData objects. + void postProcessSymbolTable(); + + /// Set the size of the global symbol located at \p Address. 
Return + /// false if no symbol exists, true otherwise. + bool setBinaryDataSize(uint64_t Address, uint64_t Size); /// Print the global symbol table. void printGlobalSymbols(raw_ostream& OS) const; @@ -269,15 +454,62 @@ class BinaryContext { bool deregisterSection(BinarySection &Section); /// Iterate over all registered sections. - iterator_range sections() { - return make_range(Sections.begin(), Sections.end()); + iterator_range sections() { + auto notNull = [](const SectionIterator &Itr) { + return (bool)*Itr; + }; + return make_range(FilteredSectionIterator(notNull, + Sections.begin(), + Sections.end()), + FilteredSectionIterator(notNull, + Sections.end(), + Sections.end())); } /// Iterate over all registered sections. - iterator_range sections() const { - return make_range(Sections.begin(), Sections.end()); + iterator_range sections() const { + return const_cast(this)->sections(); + } + + /// Iterate over all registered allocatable sections. + iterator_range allocatableSections() { + auto isAllocatable = [](const SectionIterator &Itr) { + return *Itr && Itr->isAllocatable(); + }; + return make_range(FilteredSectionIterator(isAllocatable, + Sections.begin(), + Sections.end()), + FilteredSectionIterator(isAllocatable, + Sections.end(), + Sections.end())); + } + + /// Iterate over all registered allocatable sections. + iterator_range allocatableSections() const { + return const_cast(this)->allocatableSections(); } + /// Iterate over all registered non-allocatable sections. + iterator_range nonAllocatableSections() { + auto notAllocated = [](const SectionIterator &Itr) { + return *Itr && !Itr->isAllocatable(); + }; + return make_range(FilteredSectionIterator(notAllocated, + Sections.begin(), + Sections.end()), + FilteredSectionIterator(notAllocated, + Sections.end(), + Sections.end())); + } + + /// Iterate over all registered non-allocatable sections. 
+ iterator_range nonAllocatableSections() const { + return const_cast(this)->nonAllocatableSections(); + } + + /// Return section name containing the given \p Address. + ErrorOr getSectionNameForAddress(uint64_t Address) const; + /// Print all sections. void printSections(raw_ostream& OS) const; @@ -321,28 +553,6 @@ class BinaryContext { /// the binary. ErrorOr extractPointerAtAddress(uint64_t Address) const; - /// Register a symbol with \p Name at a given \p Address. - MCSymbol *registerNameAtAddress(const std::string &Name, uint64_t Address) { - // Check if the Name was already registered. - const auto GSI = GlobalSymbols.find(Name); - if (GSI != GlobalSymbols.end()) { - assert(GSI->second == Address && "addresses do not match"); - auto *Symbol = Ctx->lookupSymbol(Name); - assert(Symbol && "symbol should be registered with MCContext"); - - return Symbol; - } - - // Add the name to global symbols map. - GlobalSymbols[Name] = Address; - - // Add to the reverse map. There could multiple names at the same address. - GlobalAddresses.emplace(std::make_pair(Address, Name)); - - // Register the name with MCContext. - return Ctx->getOrCreateSymbol(Name); - } - /// Replaces all references to \p ChildBF with \p ParentBF. \p ChildBF is then /// removed from the list of functions \p BFs. The profile data of \p ChildBF /// is merged into that of \p ParentBF. @@ -371,6 +581,12 @@ class BinaryContext { return BFI == SymbolToFunctionMap.end() ? nullptr : BFI->second; } + /// Associate the symbol \p Sym with the function \p BF for lookups with + /// getFunctionForSymbol(). + void setSymbolToFunctionMap(const MCSymbol *Sym, BinaryFunction *BF) { + SymbolToFunctionMap[Sym] = BF; + } + /// Populate some internal data structures with debug info. 
void preprocessDebugInfo( std::map &BinaryFunctions); diff --git a/bolt/BinaryData.cpp b/bolt/BinaryData.cpp new file mode 100644 index 000000000000..ea27bead6f9c --- /dev/null +++ b/bolt/BinaryData.cpp @@ -0,0 +1,132 @@ +//===--- BinaryData.cpp - Representation of section data objects ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "BinaryData.h" +#include "BinarySection.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Regex.h" + +using namespace llvm; +using namespace bolt; + +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt" + +namespace opts { +extern cl::OptionCategory BoltCategory; +extern cl::opt Verbosity; + +cl::opt +PrintSymbolAliases("print-aliases", + cl::desc("print aliases when printing objects"), + cl::Hidden, + cl::ZeroOrMore, + cl::cat(BoltCategory)); +} + +bool BinaryData::isMoveable() const { + return (!isAbsolute() && + (IsMoveable && + (!Parent || isTopLevelJumpTable()))); +} + +void BinaryData::merge(const BinaryData *Other) { + assert(!Size || !Other->Size || Size == Other->Size); + assert(Address == Other->Address); + assert(*Section == *Other->Section); + assert(OutputOffset == Other->OutputOffset); + assert(OutputSection == Other->OutputSection); + Names.insert(Names.end(), Other->Names.begin(), Other->Names.end()); + Symbols.insert(Symbols.end(), Other->Symbols.begin(), Other->Symbols.end()); + MemData.insert(MemData.end(), Other->MemData.begin(), Other->MemData.end()); + if (!Size) + Size = Other->Size; +} + +bool BinaryData::hasNameRegex(StringRef NameRegex) const { + Regex MatchName(NameRegex); + for (auto &Name : Names) + if (MatchName.match(Name)) + return true; + return false; +} + +StringRef 
BinaryData::getSectionName() const { + return getSection().getName(); +} + +uint64_t BinaryData::computeOutputOffset() const { + return Address - getSection().getAddress(); +} + +void BinaryData::setSection(BinarySection &NewSection) { + Section = &NewSection; + if (OutputSection.empty()) + OutputSection = getSection().getName(); +} + +bool BinaryData::isMoved() const { + return (computeOutputOffset() != OutputOffset || + OutputSection != getSectionName()); +} + +void BinaryData::print(raw_ostream &OS) const { + printBrief(OS); +} + +void BinaryData::printBrief(raw_ostream &OS) const { + OS << "("; + + if (isJumpTable()) + OS << "jump-table: "; + else + OS << "object: "; + + OS << getName(); + + if ((opts::PrintSymbolAliases || opts::Verbosity > 1) && Names.size() > 1) { + OS << ", aliases:"; + for (unsigned I = 1u; I < Names.size(); ++I) { + OS << (I == 1 ? " (" : ", ") << Names[I]; + } + OS << ")"; + } + + if (opts::Verbosity > 1 && Parent) { + OS << " (" << Parent->getName() << "/" << Parent->getSize() << ")"; + } + + OS << ", 0x" << Twine::utohexstr(getAddress()) + << ":0x" << Twine::utohexstr(getEndAddress()) + << "/" << getSize(); + + if (opts::Verbosity > 1) { + for (auto &MI : memData()) { + OS << ", " << MI; + } + } + + OS << ")"; +} + +BinaryData::BinaryData(StringRef Name, + uint64_t Address, + uint64_t Size, + uint16_t Alignment, + BinarySection &Section) +: Names({Name}), + Section(&Section), + Address(Address), + Size(Size), + Alignment(Alignment), + OutputSection(Section.getName()), + OutputOffset(computeOutputOffset()) +{ } diff --git a/bolt/BinaryData.h b/bolt/BinaryData.h new file mode 100644 index 000000000000..0acace0ca7ae --- /dev/null +++ b/bolt/BinaryData.h @@ -0,0 +1,207 @@ +//===--- BinaryData.h - Representation of section data objects -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_BINARY_DATA_H +#define LLVM_TOOLS_LLVM_BOLT_BINARY_DATA_H + +#include "DataReader.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include +#include + +namespace llvm { +namespace bolt { + +struct BinarySection; + +/// \p BinaryData represents an indivisible part of a data section section. +/// BinaryData's may contain sub-components, e.g. jump tables but they are +/// considered to be part of the parent symbol in terms of divisibility and +/// reordering. +class BinaryData { + friend class BinaryContext; + /// Non-null if this BinaryData is contained in a larger BinaryData object, + /// i.e. the start and end addresses are contained within another object. + BinaryData *Parent{nullptr}; + + // non-copyable + BinaryData() = delete; + BinaryData(const BinaryData &) = delete; + BinaryData &operator=(const BinaryData &) = delete; + +protected: + /// All names associated with this data. The first name is the primary one. + std::vector Names; + /// All symbols associated with this data. This vector should have one entry + /// corresponding to every entry in \p Names. + std::vector Symbols; + + /// Section this data belongs to. + BinarySection *Section; + /// Start address of this symbol. + uint64_t Address{0}; + /// Size of this data (can be 0). + uint64_t Size{0}; + /// Alignment of this data. + uint16_t Alignment{1}; + + /// Output section for this data if it has been moved from the original + /// section. + std::string OutputSection; + /// The offset of this symbol in the output section. This is different + /// from \p Address - Section.getAddress() when the data has been reordered. + uint64_t OutputOffset{0}; + + /// Memory profiling data associated with this object. 
+ std::vector MemData; + + bool IsMoveable{true}; + + void addMemData(const MemInfo &MI) { + MemData.push_back(MI); + } + + BinaryData *getRootData() { + auto *BD = this; + while (BD->Parent) + BD = BD->Parent; + return BD; + } + + BinaryData *getAtomicRoot() { + auto *BD = this; + while (!BD->isAtomic() && BD->Parent) + BD = BD->Parent; + return BD; + } + + uint64_t computeOutputOffset() const; + +public: + BinaryData(BinaryData &&) = default; + BinaryData(StringRef Name, + uint64_t Address, + uint64_t Size, + uint16_t Alignment, + BinarySection &Section); + virtual ~BinaryData() { } + + virtual bool isJumpTable() const { return false; } + virtual bool isObject() const { return !isJumpTable(); } + virtual void merge(const BinaryData *Other); + + bool isTopLevelJumpTable() const { + return (isJumpTable() && + (!Parent || (!Parent->Parent && Parent->isObject()))); + } + + // BinaryData that is considered atomic and potentially moveable. All + // MemInfo data and relocations should be wrt. to atomic data. 
+ bool isAtomic() const { + return isTopLevelJumpTable() || !Parent; + } + + iterator_range::const_iterator> names() const { + return make_range(Names.begin(), Names.end()); + } + + iterator_range::const_iterator> symbols() const { + return make_range(Symbols.begin(), Symbols.end()); + } + + iterator_range::const_iterator> memData() const { + return make_range(MemData.begin(), MemData.end()); + } + + StringRef getName() const { return Names.front(); } + const std::vector &getNames() const { return Names; } + MCSymbol *getSymbol() { return Symbols.front(); } + const MCSymbol *getSymbol() const { return Symbols.front(); } + + bool hasName(StringRef Name) const { + return std::find(Names.begin(), Names.end(), Name) != Names.end(); + } + bool hasNameRegex(StringRef Name) const; + bool nameStartsWith(StringRef Prefix) const { + for (const auto &Name : Names) { + if (StringRef(Name).startswith(Prefix)) + return true; + } + return false; + } + + bool hasSymbol(const MCSymbol *Symbol) const { + return std::find(Symbols.begin(), Symbols.end(), Symbol) != Symbols.end(); + } + + bool isAbsolute() const { return getSymbol()->isAbsolute(); } + bool isMoveable() const; + + uint64_t getAddress() const { return Address; } + uint64_t getEndAddress() const { return Address + Size; } + uint64_t getSize() const { return Size; } + uint16_t getAlignment() const { return Alignment; } + uint64_t getOutputOffset() const { return OutputOffset; } + uint64_t getOutputSize() const { return Size; } + + BinarySection &getSection() { return *Section; } + const BinarySection &getSection() const { return *Section; } + StringRef getSectionName() const; + StringRef getOutputSection() const { return OutputSection; } + + bool isMoved() const; + bool containsAddress(uint64_t Address) const { + return ((getAddress() <= Address && Address < getEndAddress()) || + (getAddress() == Address && !getSize())); + } + bool containsRange(uint64_t Address, uint64_t Size) const { + return (getAddress() <= Address && 
Address + Size <= getEndAddress()); + } + + const BinaryData *getParent() const { + return Parent; + } + + const BinaryData *getRootData() const { + auto *BD = this; + while (BD->Parent) + BD = BD->Parent; + return BD; + } + + const BinaryData *getAtomicRoot() const { + auto *BD = this; + while (!BD->isAtomic() && BD->Parent) + BD = BD->Parent; + return BD; + } + + void setIsMoveable(bool Flag) { IsMoveable = Flag; } + void setOutputOffset(uint64_t Offset) { OutputOffset = Offset; } + void setOutputSection(StringRef Name) { OutputSection = Name; } + void setSection(BinarySection &NewSection); + + virtual void printBrief(raw_ostream &OS) const; + virtual void print(raw_ostream &OS) const; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const BinaryData &BD) { + BD.printBrief(OS); + return OS; +} + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp index 0f59195919a5..7a12d8eeaf7b 100644 --- a/bolt/BinaryFunction.cpp +++ b/bolt/BinaryFunction.cpp @@ -129,7 +129,7 @@ PrintOnlyRegex("print-only-regex", cl::Hidden, cl::cat(BoltCategory)); -cl::opt +static cl::opt TimeBuild("time-build", cl::desc("print time spent constructing binary functions"), cl::ZeroOrMore, @@ -364,9 +364,9 @@ bool BinaryFunction::isForwardCall(const MCSymbol *CalleeSymbol) const { } } else { // Absolute symbol. - auto const CalleeSI = BC.GlobalSymbols.find(CalleeSymbol->getName()); - assert(CalleeSI != BC.GlobalSymbols.end() && "unregistered symbol found"); - return CalleeSI->second > getAddress(); + auto *CalleeSI = BC.getBinaryDataByName(CalleeSymbol->getName()); + assert(CalleeSI && "unregistered symbol found"); + return CalleeSI->getAddress() > getAddress(); } } @@ -563,7 +563,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, // Print all jump tables. 
for (auto &JTI : JumpTables) { - JTI.second.print(OS); + JTI.second->print(OS); } OS << "DWARF CFI Instructions:\n"; @@ -675,9 +675,8 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, if (BC.TheTriple->getArch() == llvm::Triple::aarch64) { const auto *Sym = BC.MIA->getTargetSymbol(*PCRelBaseInstr, 1); assert (Sym && "Symbol extraction failed"); - auto SI = BC.GlobalSymbols.find(Sym->getName()); - if (SI != BC.GlobalSymbols.end()) { - PCRelAddr = SI->second; + if (auto *BD = BC.getBinaryDataByName(Sym->getName())) { + PCRelAddr = BD->getAddress(); } else { for (auto &Elmt : Labels) { if (Elmt.second == Sym) { @@ -708,10 +707,12 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, // RIP-relative addressing should be converted to symbol form by now // in processed instructions (but not in jump). if (DispExpr) { - auto SI = - BC.GlobalSymbols.find(BC.MIA->getTargetSymbol(DispExpr)->getName()); - assert(SI != BC.GlobalSymbols.end() && "global symbol needs a value"); - ArrayStart = SI->second; + const MCSymbol *TargetSym; + uint64_t TargetOffset; + std::tie(TargetSym, TargetOffset) = BC.MIA->getTargetSymbolInfo(DispExpr); + auto *BD = BC.getBinaryDataByName(TargetSym->getName()); + assert(BD && "global symbol needs a value"); + ArrayStart = BD->getAddress() + TargetOffset; BaseRegNum = 0; if (BC.TheTriple->getArch() == llvm::Triple::aarch64) { ArrayStart &= ~0xFFFULL; @@ -729,13 +730,13 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, // Check if there's already a jump table registered at this address. if (auto *JT = getJumpTableContainingAddress(ArrayStart)) { - auto JTOffset = ArrayStart - JT->Address; + auto JTOffset = ArrayStart - JT->getAddress(); if (Type == IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE && JTOffset != 0) { // Adjust the size of this jump table and create a new one if necessary. 
// We cannot re-use the entries since the offsets are relative to the // table start. DEBUG(dbgs() << "BOLT-DEBUG: adjusting size of jump table at 0x" - << Twine::utohexstr(JT->Address) << '\n'); + << Twine::utohexstr(JT->getAddress()) << '\n'); JT->OffsetEntries.resize(JTOffset / JT->EntrySize); } else { // Re-use an existing jump table. Perhaps parts of it. @@ -750,8 +751,10 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, // Get or create a new label for the table. auto LI = JT->Labels.find(JTOffset); if (LI == JT->Labels.end()) { - auto *JTStartLabel = BC.getOrCreateGlobalSymbol(ArrayStart, - "JUMP_TABLEat"); + auto *JTStartLabel = BC.registerNameAtAddress(generateJumpTableName(ArrayStart), + ArrayStart, + 0, + JT->EntrySize); auto Result = JT->Labels.emplace(JTOffset, JTStartLabel); assert(Result.second && "error adding jump table label"); LI = Result.first; @@ -827,20 +830,33 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, Type == IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE) { assert(JTOffsetCandidates.size() > 2 && "expected more than 2 jump table entries"); - auto *JTStartLabel = BC.getOrCreateGlobalSymbol(ArrayStart, "JUMP_TABLEat"); - DEBUG(dbgs() << "BOLT-DEBUG: creating jump table " - << JTStartLabel->getName() - << " in function " << *this << " with " - << JTOffsetCandidates.size() << " entries.\n"); + + auto JumpTableName = generateJumpTableName(ArrayStart); auto JumpTableType = Type == IndirectBranchType::POSSIBLE_JUMP_TABLE ? 
JumpTable::JTT_NORMAL : JumpTable::JTT_PIC; - JumpTables.emplace(ArrayStart, JumpTable{ArrayStart, - EntrySize, - JumpTableType, - std::move(JTOffsetCandidates), - {{0, JTStartLabel}}}); + + auto *JTStartLabel = BC.Ctx->getOrCreateSymbol(JumpTableName); + + auto JT = llvm::make_unique(JumpTableName, + ArrayStart, + EntrySize, + JumpTableType, + std::move(JTOffsetCandidates), + JumpTable::LabelMapType{{0, JTStartLabel}}, + *BC.getSectionForAddress(ArrayStart)); + + auto *JTLabel = BC.registerNameAtAddress(JumpTableName, + ArrayStart, + JT.get()); + assert(JTLabel == JTStartLabel); + + DEBUG(dbgs() << "BOLT-DEBUG: creating jump table " + << JTStartLabel->getName() + << " in function " << *this << " with " + << JTOffsetCandidates.size() << " entries.\n"); + JumpTables.emplace(ArrayStart, JT.release()); BC.MIA->replaceMemOperandDisp(const_cast(*MemLocInstr), JTStartLabel, BC.Ctx.get()); BC.MIA->setJumpTable(BC.Ctx.get(), Instruction, ArrayStart, IndexRegNum); @@ -849,6 +865,7 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, return Type; } + assert(!Value || BC.getSectionForAddress(Value)); BC.InterproceduralReferences.insert(Value); return IndirectBranchType::POSSIBLE_TAIL_CALL; } @@ -865,9 +882,9 @@ MCSymbol *BinaryFunction::getOrCreateLocalLabel(uint64_t Address, // Check if there's a global symbol registered at given address. // If so - reuse it since we want to keep the symbol value updated. 
if (Offset != 0) { - if (auto *Symbol = BC.getGlobalSymbolAtAddress(Address)) { - Labels[Offset] = Symbol; - return Symbol; + if (auto *BD = BC.getBinaryDataAtAddress(Address)) { + Labels[Offset] = BD->getSymbol(); + return BD->getSymbol(); } } @@ -903,6 +920,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { auto handlePCRelOperand = [&](MCInst &Instruction, uint64_t Address, uint64_t Size) { uint64_t TargetAddress{0}; + uint64_t TargetOffset{0}; MCSymbol *TargetSymbol{nullptr}; if (!MIA->evaluateMemOperandTarget(Instruction, TargetAddress, Address, Size)) { @@ -970,13 +988,31 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { BC.InterproceduralReferences.insert(TargetAddress); } } - if (!TargetSymbol) - TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, "DATAat"); + if (!TargetSymbol) { + auto *BD = BC.getBinaryDataContainingAddress(TargetAddress); + if (BD) { + TargetSymbol = BD->getSymbol(); + TargetOffset = TargetAddress - BD->getAddress(); + } else { + // TODO: use DWARF info to get size/alignment here? + TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, 0, 0, "DATAat"); + DEBUG(if (opts::Verbosity >= 2) { + dbgs() << "Created DATAat sym: " << TargetSymbol->getName() + << " in section " << BD->getSectionName() << "\n"; + }); + } + } + const MCExpr *Expr = MCSymbolRefExpr::create(TargetSymbol, + MCSymbolRefExpr::VK_None, + *BC.Ctx); + if (TargetOffset) { + auto *Offset = MCConstantExpr::create(TargetOffset, *BC.Ctx); + Expr = MCBinaryExpr::createAdd(Expr, Offset, *BC.Ctx); + } MIA->replaceMemOperandDisp( Instruction, MCOperand::createExpr(BC.MIA->getTargetExprFor( Instruction, - MCSymbolRefExpr::create( - TargetSymbol, MCSymbolRefExpr::VK_None, *BC.Ctx), + Expr, *BC.Ctx, 0))); return true; }; @@ -1050,33 +1086,39 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { // Check if there's a relocation associated with this instruction. 
bool UsedReloc{false}; - if (!Relocations.empty()) { - auto RI = Relocations.lower_bound(Offset); - if (RI != Relocations.end() && RI->first < Offset + Size) { - const auto &Relocation = RI->second; - DEBUG(dbgs() << "BOLT-DEBUG: replacing immediate with relocation" - " against " << Relocation.Symbol->getName() - << " in function " << *this - << " for instruction at offset 0x" - << Twine::utohexstr(Offset) << '\n'); - int64_t Value; - const auto Result = BC.MIA->replaceImmWithSymbol( - Instruction, Relocation.Symbol, Relocation.Addend, Ctx.get(), Value, - Relocation.Type); - (void)Result; - assert(Result && "cannot replace immediate with relocation"); - // For aarch, if we replaced an immediate with a symbol from a - // relocation, we mark it so we do not try to further process a - // pc-relative operand. All we need is the symbol. - if (BC.TheTriple->getArch() == llvm::Triple::aarch64) - UsedReloc = true; - - // Make sure we replaced the correct immediate (instruction - // can have multiple immediate operands). 
- assert((BC.TheTriple->getArch() == llvm::Triple::aarch64 || - static_cast(Value) == Relocation.Value) && - "immediate value mismatch in function"); - } + for (auto Itr = Relocations.lower_bound(Offset); + Itr != Relocations.upper_bound(Offset + Size); + ++Itr) { + const auto &Relocation = Itr->second; + if (Relocation.Offset >= Offset + Size) + continue; + + DEBUG(dbgs() << "BOLT-DEBUG: replacing immediate with relocation" + " against " << Relocation.Symbol->getName() + << "+" << Relocation.Addend + << " in function " << *this + << " for instruction at offset 0x" + << Twine::utohexstr(Offset) << '\n'); + int64_t Value = Relocation.Value; + const auto Result = BC.MIA->replaceImmWithSymbol(Instruction, + Relocation.Symbol, + Relocation.Addend, + Ctx.get(), + Value, + Relocation.Type); + (void)Result; + assert(Result && "cannot replace immediate with relocation"); + // For aarch, if we replaced an immediate with a symbol from a + // relocation, we mark it so we do not try to further process a + // pc-relative operand. All we need is the symbol. + if (BC.TheTriple->getArch() == llvm::Triple::aarch64) + UsedReloc = true; + + // Make sure we replaced the correct immediate (instruction + // can have multiple immediate operands). + assert((BC.TheTriple->getArch() == llvm::Triple::aarch64 || + static_cast(Value) == Relocation.Value) && + "immediate value mismatch in function"); } // Convert instruction to a shorter version that could be relaxed if needed. @@ -1157,6 +1199,8 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { } TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, + 0, + 0, "FUNCat"); if (TargetAddress == 0) { // We actually see calls to address 0 in presence of weak symbols @@ -1288,12 +1332,13 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { void BinaryFunction::postProcessJumpTables() { // Create labels for all entries. 
for (auto &JTI : JumpTables) { - auto &JT = JTI.second; + auto &JT = *JTI.second; for (auto Offset : JT.OffsetEntries) { auto *Label = getOrCreateLocalLabel(getAddress() + Offset, /*CreatePastEnd*/ true); JT.Entries.push_back(Label); } + BC.setBinaryDataSize(JT.getAddress(), JT.getSize()); } // Add TakenBranches from JumpTables. @@ -1305,7 +1350,7 @@ void BinaryFunction::postProcessJumpTables() { const auto JTAddress = JTSite.second; const auto *JT = getJumpTableContainingAddress(JTAddress); assert(JT && "cannot find jump table for address"); - auto EntryOffset = JTAddress - JT->Address; + auto EntryOffset = JTAddress - JT->getAddress(); while (EntryOffset < JT->getSize()) { auto TargetOffset = JT->OffsetEntries[EntryOffset / JT->EntrySize]; if (TargetOffset < getSize()) @@ -1313,7 +1358,7 @@ void BinaryFunction::postProcessJumpTables() { // Take ownership of jump table relocations. if (BC.HasRelocations) { - auto EntryAddress = JT->Address + EntryOffset; + auto EntryAddress = JT->getAddress() + EntryOffset; auto Res = BC.removeRelocationAt(EntryAddress); (void)Res; DEBUG( @@ -1335,7 +1380,7 @@ void BinaryFunction::postProcessJumpTables() { // Free memory used by jump table offsets. for (auto &JTI : JumpTables) { - auto &JT = JTI.second; + auto &JT = *JTI.second; clearList(JT.OffsetEntries); } @@ -1755,7 +1800,8 @@ void BinaryFunction::addEntryPoint(uint64_t Address) { << " at offset 0x" << Twine::utohexstr(Address - getAddress()) << '\n'); - auto *EntrySymbol = BC.getGlobalSymbolAtAddress(Address); + auto *EntryBD = BC.getBinaryDataAtAddress(Address); + auto *EntrySymbol = EntryBD ? EntryBD->getSymbol() : nullptr; // If we haven't disassembled the function yet we can add a new entry point // even if it doesn't have an associated entry in the symbol table. @@ -2905,26 +2951,28 @@ bool BinaryFunction::isIdenticalWith(const BinaryFunction &OtherBF, } // Check if symbols are jump tables. 
- auto SIA = BC.GlobalSymbols.find(A->getName()); - if (SIA == BC.GlobalSymbols.end()) + auto *SIA = BC.getBinaryDataByName(A->getName()); + if (!SIA) return false; - auto SIB = BC.GlobalSymbols.find(B->getName()); - if (SIB == BC.GlobalSymbols.end()) + auto *SIB = BC.getBinaryDataByName(B->getName()); + if (!SIB) return false; - assert((SIA->second != SIB->second) && + assert((SIA->getAddress() != SIB->getAddress()) && "different symbols should not have the same value"); - const auto *JumpTableA = getJumpTableContainingAddress(SIA->second); + const auto *JumpTableA = + getJumpTableContainingAddress(SIA->getAddress()); if (!JumpTableA) return false; + const auto *JumpTableB = - OtherBF.getJumpTableContainingAddress(SIB->second); + OtherBF.getJumpTableContainingAddress(SIB->getAddress()); if (!JumpTableB) return false; - if ((SIA->second - JumpTableA->Address) != - (SIB->second - JumpTableB->Address)) + if ((SIA->getAddress() - JumpTableA->getAddress()) != + (SIB->getAddress() - JumpTableB->getAddress())) return false; return equalJumpTables(JumpTableA, JumpTableB, OtherBF); @@ -2955,6 +3003,24 @@ bool BinaryFunction::isIdenticalWith(const BinaryFunction &OtherBF, return true; } +std::string BinaryFunction::generateJumpTableName(uint64_t Address) const { + auto *JT = getJumpTableContainingAddress(Address); + size_t Id; + uint64_t Offset = 0; + if (JT) { + Offset = Address - JT->getAddress(); + auto Itr = JT->Labels.find(Offset); + if (Itr != JT->Labels.end()) { + return Itr->second->getName(); + } + Id = JumpTableIds.at(JT->getAddress()); + } else { + Id = JumpTableIds[Address] = JumpTables.size(); + } + return ("JUMP_TABLE/" + Names[0] + "." + std::to_string(Id) + + (Offset ? ("." 
+ std::to_string(Offset)) : "")); +} + bool BinaryFunction::equalJumpTables(const JumpTable *JumpTableA, const JumpTable *JumpTableB, const BinaryFunction &BFB) const { @@ -3282,17 +3348,18 @@ void BinaryFunction::emitJumpTables(MCStreamer *Streamer) { outs() << "BOLT-INFO: jump tables for function " << *this << ":\n"; } for (auto &JTI : JumpTables) { - auto &JT = JTI.second; + auto &JT = *JTI.second; if (opts::PrintJumpTables) JT.print(outs()); if (opts::JumpTables == JTS_BASIC && BC.HasRelocations) { - JT.updateOriginal(BC); + JT.updateOriginal(); } else { MCSection *HotSection, *ColdSection; if (opts::JumpTables == JTS_BASIC) { - JT.SectionName = - ".local.JUMP_TABLEat0x" + Twine::utohexstr(JT.Address).str(); - HotSection = BC.Ctx->getELFSection(JT.SectionName, + std::string Name = JT.Labels[0]->getName().str(); + std::replace(Name.begin(), Name.end(), '/', '.'); + JT.setOutputSection(".local." + Name); + HotSection = BC.Ctx->getELFSection(JT.getOutputSection(), ELF::SHT_PROGBITS, ELF::SHF_ALLOC); ColdSection = HotSection; @@ -3311,157 +3378,6 @@ void BinaryFunction::emitJumpTables(MCStreamer *Streamer) { } } -std::pair -BinaryFunction::JumpTable::getEntriesForAddress(const uint64_t Addr) const { - const uint64_t InstOffset = Addr - Address; - size_t StartIndex = 0, EndIndex = 0; - uint64_t Offset = 0; - - for (size_t I = 0; I < Entries.size(); ++I) { - auto LI = Labels.find(Offset); - if (LI != Labels.end()) { - const auto NextLI = std::next(LI); - const auto NextOffset = - NextLI == Labels.end() ? 
getSize() : NextLI->first; - if (InstOffset >= LI->first && InstOffset < NextOffset) { - StartIndex = I; - EndIndex = I; - while (Offset < NextOffset) { - ++EndIndex; - Offset += EntrySize; - } - break; - } - } - Offset += EntrySize; - } - - return std::make_pair(StartIndex, EndIndex); -} - -bool BinaryFunction::JumpTable::replaceDestination(uint64_t JTAddress, - const MCSymbol *OldDest, - MCSymbol *NewDest) { - bool Patched{false}; - const auto Range = getEntriesForAddress(JTAddress); - for (auto I = &Entries[Range.first], E = &Entries[Range.second]; - I != E; ++I) { - auto &Entry = *I; - if (Entry == OldDest) { - Patched = true; - Entry = NewDest; - } - } - return Patched; -} - -void BinaryFunction::JumpTable::updateOriginal(BinaryContext &BC) { - // In non-relocation mode we have to emit jump tables in local sections. - // This way we only overwrite them when a corresponding function is - // overwritten. - assert(BC.HasRelocations && "relocation mode expected"); - auto Section = BC.getSectionForAddress(Address); - assert(Section && "section not found for jump table"); - uint64_t Offset = Address - Section->getAddress(); - StringRef SectionName = Section->getName(); - for (auto *Entry : Entries) { - const auto RelType = (Type == JTT_NORMAL) ? ELF::R_X86_64_64 - : ELF::R_X86_64_PC32; - const uint64_t RelAddend = (Type == JTT_NORMAL) - ? 0 : Offset - (Address - Section->getAddress()); - DEBUG(dbgs() << "adding relocation to section " << SectionName - << " at offset " << Twine::utohexstr(Offset) << " for symbol " - << Entry->getName() << " with addend " - << Twine::utohexstr(RelAddend) << '\n'); - Section->addRelocation(Offset, Entry, RelType, RelAddend); - Offset += EntrySize; - } -} - -uint64_t BinaryFunction::JumpTable::emit(MCStreamer *Streamer, - MCSection *HotSection, - MCSection *ColdSection) { - // Pre-process entries for aggressive splitting. - // Each label represents a separate switch table and gets its own count - // determining its destination. 
- std::map LabelCounts; - if (opts::JumpTables > JTS_SPLIT && !Counts.empty()) { - MCSymbol *CurrentLabel = Labels[0]; - uint64_t CurrentLabelCount = 0; - for (unsigned Index = 0; Index < Entries.size(); ++Index) { - auto LI = Labels.find(Index * EntrySize); - if (LI != Labels.end()) { - LabelCounts[CurrentLabel] = CurrentLabelCount; - CurrentLabel = LI->second; - CurrentLabelCount = 0; - } - CurrentLabelCount += Counts[Index].Count; - } - LabelCounts[CurrentLabel] = CurrentLabelCount; - } else { - Streamer->SwitchSection(Count > 0 ? HotSection : ColdSection); - Streamer->EmitValueToAlignment(EntrySize); - } - MCSymbol *LastLabel = nullptr; - uint64_t Offset = 0; - for (auto *Entry : Entries) { - auto LI = Labels.find(Offset); - if (LI != Labels.end()) { - DEBUG(dbgs() << "BOLT-DEBUG: emitting jump table " - << LI->second->getName() << " (originally was at address 0x" - << Twine::utohexstr(Address + Offset) - << (Offset ? "as part of larger jump table\n" : "\n")); - if (!LabelCounts.empty()) { - DEBUG(dbgs() << "BOLT-DEBUG: jump table count: " - << LabelCounts[LI->second] << '\n'); - if (LabelCounts[LI->second] > 0) { - Streamer->SwitchSection(HotSection); - } else { - Streamer->SwitchSection(ColdSection); - } - Streamer->EmitValueToAlignment(EntrySize); - } - Streamer->EmitLabel(LI->second); - LastLabel = LI->second; - } - if (Type == JTT_NORMAL) { - Streamer->EmitSymbolValue(Entry, OutputEntrySize); - } else { // JTT_PIC - auto JT = MCSymbolRefExpr::create(LastLabel, Streamer->getContext()); - auto E = MCSymbolRefExpr::create(Entry, Streamer->getContext()); - auto Value = MCBinaryExpr::createSub(E, JT, Streamer->getContext()); - Streamer->EmitValue(Value, EntrySize); - } - Offset += EntrySize; - } - - return Offset; -} - -void BinaryFunction::JumpTable::print(raw_ostream &OS) const { - uint64_t Offset = 0; - for (const auto *Entry : Entries) { - auto LI = Labels.find(Offset); - if (LI != Labels.end()) { - OS << "Jump Table " << LI->second->getName() << " at @0x" 
- << Twine::utohexstr(Address+Offset); - if (Offset) { - OS << " (possibly part of larger jump table):\n"; - } else { - OS << " with total count of " << Count << ":\n"; - } - } - OS << format(" 0x%04" PRIx64 " : ", Offset) << Entry->getName(); - if (!Counts.empty()) { - OS << " : " << Counts[Offset / EntrySize].Mispreds - << "/" << Counts[Offset / EntrySize].Count; - } - OS << '\n'; - Offset += EntrySize; - } - OS << "\n\n"; -} - void BinaryFunction::calculateLoopInfo() { // Discover loops. BinaryDominatorTree DomTree; diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h index 87bfb15ad654..99e9cb7bfb52 100644 --- a/bolt/BinaryFunction.h +++ b/bolt/BinaryFunction.h @@ -22,6 +22,7 @@ #include "BinaryLoop.h" #include "DataReader.h" #include "DebugData.h" +#include "JumpTable.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/ilist.h" #include "llvm/ADT/iterator.h" @@ -51,8 +52,6 @@ class DWARFDebugInfoEntryMinimal; namespace bolt { -struct SectionInfo; - using DWARFUnitLineTable = std::pair; @@ -150,14 +149,6 @@ inline raw_ostream &operator<<(raw_ostream &OS, const DynoStats &Stats) { DynoStats operator+(const DynoStats &A, const DynoStats &B); -enum JumpTableSupportLevel : char { - JTS_NONE = 0, /// Disable jump tables support. - JTS_BASIC = 1, /// Enable basic jump tables support (in-place). - JTS_MOVE = 2, /// Move jump tables to a separate section. - JTS_SPLIT = 3, /// Enable hot/cold splitting of jump tables. - JTS_AGGRESSIVE = 4, /// Aggressive splitting of jump tables. -}; - enum IndirectCallPromotionType : char { ICP_NONE, /// Don't perform ICP. ICP_CALLS, /// Perform ICP on indirect calls. @@ -231,12 +222,6 @@ class BinaryFunction { ST_ALL, /// Split all functions }; - /// Branch statistics for jump table entries. 
- struct JumpInfo { - uint64_t Mispreds{0}; - uint64_t Count{0}; - }; - static constexpr uint64_t COUNT_NO_PROFILE = BinaryBasicBlock::COUNT_NO_PROFILE; @@ -567,90 +552,17 @@ class BinaryFunction { /// function and that apply before the entry basic block). CFIInstrMapType CIEFrameInstructions; -public: - /// Representation of a jump table. - /// - /// The jump table may include other jump tables that are referenced by - /// a different label at a different offset in this jump table. - struct JumpTable { - enum JumpTableType : char { - JTT_NORMAL, - JTT_PIC, - }; - - /// Original address. - uint64_t Address; - - /// Size of the entry used for storage. - std::size_t EntrySize; - - /// Size of the entry size we will write (we may use a more compact layout) - std::size_t OutputEntrySize; - - /// The type of this jump table. - JumpTableType Type; - - /// All the entries as labels. - std::vector Entries; - - /// All the entries as offsets into a function. Invalid after CFG is built. - std::vector OffsetEntries; - - /// Map -> Hasher; - size_t Seed = Hasher(Val.first); - hashCombine(Seed, Val.second); - return Seed; - } -}; - -} - -void ClusterAlgorithm::computeClusterAverageFrequency() { - AvgFreq.resize(Clusters.size(), 0.0); - for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { - double Freq = 0.0; - for (auto BB : Clusters[I]) { - if (BB->getNumNonPseudos() > 0) - Freq += ((double) BB->getExecutionCount()) / BB->getNumNonPseudos(); - } - AvgFreq[I] = Freq; - } -} - -void ClusterAlgorithm::printClusters() const { - for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { - errs() << "Cluster number " << I; - if (AvgFreq.size() == Clusters.size()) - errs() << " (frequency: " << AvgFreq[I] << ")"; - errs() << " : "; - auto Sep = ""; - for (auto BB : Clusters[I]) { - errs() << Sep << BB->getName(); - Sep = ", "; - } - errs() << "\n"; - } -} - -void ClusterAlgorithm::reset() { - Clusters.clear(); - ClusterEdges.clear(); - AvgFreq.clear(); -} - -void 
GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const { - OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count; -} - -size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const { - HashPair Hasher; - return Hasher(std::make_pair(E.Src, E.Dst)); -} - -bool GreedyClusterAlgorithm::EdgeEqual::operator()( - const EdgeTy &A, const EdgeTy &B) const { - return A.Src == B.Src && A.Dst == B.Dst; -} - -void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF, - bool ComputeEdges) { - reset(); - - // Greedy heuristic implementation for the TSP, applied to BB layout. Try to - // maximize weight during a path traversing all BBs. In this way, we will - // convert the hottest branches into fall-throughs. - - // This is the queue of edges from which we will pop edges and use them to - // cluster basic blocks in a greedy fashion. - std::vector Queue; - - // Initialize inter-cluster weights. - if (ComputeEdges) - ClusterEdges.resize(BF.layout_size()); - - // Initialize clusters and edge queue. - for (auto BB : BF.layout()) { - // Create a cluster for this BB. - uint32_t I = Clusters.size(); - Clusters.emplace_back(); - auto &Cluster = Clusters.back(); - Cluster.push_back(BB); - BBToClusterMap[BB] = I; - // Populate priority queue with edges. - auto BI = BB->branch_info_begin(); - for (auto &I : BB->successors()) { - assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && - "attempted reordering blocks of function with no profile data"); - Queue.emplace_back(EdgeTy(BB, I, BI->Count)); - ++BI; - } - } - // Sort and adjust the edge queue. - initQueue(Queue, BF); - - // Grow clusters in a greedy fashion. - while (!Queue.empty()) { - auto E = Queue.back(); - Queue.pop_back(); - - const auto *SrcBB = E.Src; - const auto *DstBB = E.Dst; - - DEBUG(dbgs() << "Popped edge "; - E.print(dbgs()); - dbgs() << "\n"); - - // Case 1: BBSrc and BBDst are the same. 
Ignore this edge - if (SrcBB == DstBB || DstBB == *BF.layout_begin()) { - DEBUG(dbgs() << "\tIgnored (same src, dst)\n"); - continue; - } - - int I = BBToClusterMap[SrcBB]; - int J = BBToClusterMap[DstBB]; - - // Case 2: If they are already allocated at the same cluster, just increase - // the weight of this cluster - if (I == J) { - if (ComputeEdges) - ClusterEdges[I][I] += E.Count; - DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n"); - continue; - } - - auto &ClusterA = Clusters[I]; - auto &ClusterB = Clusters[J]; - if (areClustersCompatible(ClusterA, ClusterB, E)) { - // Case 3: SrcBB is at the end of a cluster and DstBB is at the start, - // allowing us to merge two clusters. - for (auto BB : ClusterB) - BBToClusterMap[BB] = I; - ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end()); - ClusterB.clear(); - if (ComputeEdges) { - // Increase the intra-cluster edge count of cluster A with the count of - // this edge as well as with the total count of previously visited edges - // from cluster B cluster A. - ClusterEdges[I][I] += E.Count; - ClusterEdges[I][I] += ClusterEdges[J][I]; - // Iterate through all inter-cluster edges and transfer edges targeting - // cluster B to cluster A. - for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K) - ClusterEdges[K][I] += ClusterEdges[K][J]; - } - // Adjust the weights of the remaining edges and re-sort the queue. - adjustQueue(Queue, BF); - DEBUG(dbgs() << "\tMerged clusters of src, dst\n"); - } else { - // Case 4: Both SrcBB and DstBB are allocated in positions we cannot - // merge them. Add the count of this edge to the inter-cluster edge count - // between clusters A and B to help us decide ordering between these - // clusters. 
- if (ComputeEdges) - ClusterEdges[I][J] += E.Count; - DEBUG(dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n"); - } - } -} - -void GreedyClusterAlgorithm::reset() { - ClusterAlgorithm::reset(); - BBToClusterMap.clear(); -} - -void PHGreedyClusterAlgorithm::initQueue( - std::vector &Queue, const BinaryFunction &BF) { - // Define a comparison function to establish SWO between edges. - auto Comp = [&BF] (const EdgeTy &A, const EdgeTy &B) { - // With equal weights, prioritize branches with lower index - // source/destination. This helps to keep original block order for blocks - // when optimal order cannot be deducted from a profile. - if (A.Count == B.Count) { - const auto SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src); - return (SrcOrder != 0) - ? SrcOrder > 0 - : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0; - } - return A.Count < B.Count; - }; - - // Sort edges in increasing profile count order. - std::sort(Queue.begin(), Queue.end(), Comp); -} - -void PHGreedyClusterAlgorithm::adjustQueue( - std::vector &Queue, const BinaryFunction &BF) { - // Nothing to do. - return; -} - -bool PHGreedyClusterAlgorithm::areClustersCompatible( - const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const { - return Front.back() == E.Src && Back.front() == E.Dst; -} - -int64_t MinBranchGreedyClusterAlgorithm::calculateWeight( - const EdgeTy &E, const BinaryFunction &BF) const { - const BinaryBasicBlock *SrcBB = E.Src; - const BinaryBasicBlock *DstBB = E.Dst; - - // Initial weight value. - int64_t W = (int64_t)E.Count; - - // Adjust the weight by taking into account other edges with the same source. 
- auto BI = SrcBB->branch_info_begin(); - for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) { - assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && - "attempted reordering blocks of function with no profile data"); - assert(BI->Count <= std::numeric_limits::max() && - "overflow detected"); - // Ignore edges with same source and destination, edges that target the - // entry block as well as the edge E itself. - if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB) - W -= (int64_t)BI->Count; - ++BI; - } - - // Adjust the weight by taking into account other edges with the same - // destination. - for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) { - // Ignore edges with same source and destination as well as the edge E - // itself. - if (PredBB == DstBB || PredBB == SrcBB) - continue; - auto BI = PredBB->branch_info_begin(); - for (const BinaryBasicBlock *SuccBB : PredBB->successors()) { - if (SuccBB == DstBB) - break; - ++BI; - } - assert(BI != PredBB->branch_info_end() && "invalid control flow graph"); - assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && - "attempted reordering blocks of function with no profile data"); - assert(BI->Count <= std::numeric_limits::max() && - "overflow detected"); - W -= (int64_t)BI->Count; - } - - return W; -} - -void MinBranchGreedyClusterAlgorithm::initQueue( - std::vector &Queue, const BinaryFunction &BF) { - // Initialize edge weights. - for (const EdgeTy &E : Queue) - Weight.emplace(std::make_pair(E, calculateWeight(E, BF))); - - // Sort edges in increasing weight order. - adjustQueue(Queue, BF); -} - -void MinBranchGreedyClusterAlgorithm::adjustQueue( - std::vector &Queue, const BinaryFunction &BF) { - // Define a comparison function to establish SWO between edges. - auto Comp = [&] (const EdgeTy &A, const EdgeTy &B) { - // With equal weights, prioritize branches with lower index - // source/destination. 
This helps to keep original block order for blocks - // when optimal order cannot be deduced from a profile. - if (Weight[A] == Weight[B]) { - const auto SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src); - return (SrcOrder != 0) - ? SrcOrder > 0 - : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0; - } - return Weight[A] < Weight[B]; - }; - - // Iterate through all remaining edges to find edges that have their - // source and destination in the same cluster. - std::vector NewQueue; - for (const EdgeTy &E : Queue) { - const auto *SrcBB = E.Src; - const auto *DstBB = E.Dst; - - // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore - // this edge. - if (SrcBB == DstBB || DstBB == *BF.layout_begin()) { - DEBUG(dbgs() << "\tAdjustment: Ignored edge "; - E.print(dbgs()); - dbgs() << " (same src, dst)\n"); - continue; - } - - int I = BBToClusterMap[SrcBB]; - int J = BBToClusterMap[DstBB]; - auto &ClusterA = Clusters[I]; - auto &ClusterB = Clusters[J]; - - // Case 2: They are already allocated at the same cluster or incompatible - // clusters. Adjust the weights of edges with the same source or - // destination, so that this edge has no effect on them any more, and ignore - // this edge. Also increase the intra- (or inter-) cluster edge count. 
- if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) { - if (!ClusterEdges.empty()) - ClusterEdges[I][J] += E.Count; - DEBUG(dbgs() << "\tAdjustment: Ignored edge "; - E.print(dbgs()); - dbgs() << " (src, dst belong to same cluster or incompatible " - "clusters)\n"); - for (const auto *SuccBB : SrcBB->successors()) { - if (SuccBB == DstBB) - continue; - auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0)); - assert(WI != Weight.end() && "CFG edge not found in Weight map"); - WI->second += (int64_t)E.Count; - } - for (const auto *PredBB : DstBB->predecessors()) { - if (PredBB == SrcBB) - continue; - auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0)); - assert(WI != Weight.end() && "CFG edge not found in Weight map"); - WI->second += (int64_t)E.Count; - } - continue; - } - - // Case 3: None of the previous cases is true, so just keep this edge in - // the queue. - NewQueue.emplace_back(E); - } - - // Sort remaining edges in increasing weight order. - Queue.swap(NewQueue); - std::sort(Queue.begin(), Queue.end(), Comp); -} - -bool MinBranchGreedyClusterAlgorithm::areClustersCompatible( - const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const { - return Front.back() == E.Src && Back.front() == E.Dst; -} - -void MinBranchGreedyClusterAlgorithm::reset() { - GreedyClusterAlgorithm::reset(); - Weight.clear(); -} - -void OptimalReorderAlgorithm::reorderBasicBlocks( - const BinaryFunction &BF, BasicBlockOrder &Order) const { - std::vector> Weight; - std::unordered_map BBToIndex; - std::vector IndexToBB; - - unsigned N = BF.layout_size(); - // Populating weight map and index map - for (auto BB : BF.layout()) { - BBToIndex[BB] = IndexToBB.size(); - IndexToBB.push_back(BB); - } - Weight.resize(N); - for (auto BB : BF.layout()) { - auto BI = BB->branch_info_begin(); - Weight[BBToIndex[BB]].resize(N); - for (auto I : BB->successors()) { - if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) - Weight[BBToIndex[BB]][BBToIndex[I]] = BI->Count; - ++BI; - } - } - - 
std::vector> DP; - DP.resize(1 << N); - for (auto &Elmt : DP) { - Elmt.resize(N, -1); - } - // Start with the entry basic block being allocated with cost zero - DP[1][0] = 0; - // Walk through TSP solutions using a bitmask to represent state (current set - // of BBs in the layout) - unsigned BestSet = 1; - unsigned BestLast = 0; - int64_t BestWeight = 0; - for (unsigned Set = 1; Set < (1U << N); ++Set) { - // Traverse each possibility of Last BB visited in this layout - for (unsigned Last = 0; Last < N; ++Last) { - // Case 1: There is no possible layout with this BB as Last - if (DP[Set][Last] == -1) - continue; - - // Case 2: There is a layout with this Set and this Last, and we try - // to expand this set with New - for (unsigned New = 1; New < N; ++New) { - // Case 2a: BB "New" is already in this Set - if ((Set & (1 << New)) != 0) - continue; - - // Case 2b: BB "New" is not in this set and we add it to this Set and - // record total weight of this layout with "New" as the last BB. - unsigned NewSet = (Set | (1 << New)); - if (DP[NewSet][New] == -1) - DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New]; - DP[NewSet][New] = std::max(DP[NewSet][New], - DP[Set][Last] + (int64_t)Weight[Last][New]); - - if (DP[NewSet][New] > BestWeight) { - BestWeight = DP[NewSet][New]; - BestSet = NewSet; - BestLast = New; - } - } - } - } - - // Define final function layout based on layout that maximizes weight - unsigned Last = BestLast; - unsigned Set = BestSet; - std::vector Visited; - Visited.resize(N); - Visited[Last] = true; - Order.push_back(IndexToBB[Last]); - Set = Set & ~(1U << Last); - while (Set != 0) { - int64_t Best = -1; - for (unsigned I = 0; I < N; ++I) { - if (DP[Set][I] == -1) - continue; - if (DP[Set][I] > Best) { - Last = I; - Best = DP[Set][I]; - } - } - Visited[Last] = true; - Order.push_back(IndexToBB[Last]); - Set = Set & ~(1U << Last); - } - std::reverse(Order.begin(), Order.end()); - - // Finalize layout with BBs that weren't assigned to the layout 
- for (auto BB : BF.layout()) { - if (Visited[BBToIndex[BB]] == false) - Order.push_back(BB); - } -} - -void OptimizeReorderAlgorithm::reorderBasicBlocks( - const BinaryFunction &BF, BasicBlockOrder &Order) const { - if (BF.layout_empty()) - return; - - // Cluster basic blocks. - CAlgo->clusterBasicBlocks(BF); - - if (opts::PrintClusters) - CAlgo->printClusters(); - - // Arrange basic blocks according to clusters. - for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters) - Order.insert(Order.end(), Cluster.begin(), Cluster.end()); -} - -void OptimizeBranchReorderAlgorithm::reorderBasicBlocks( - const BinaryFunction &BF, BasicBlockOrder &Order) const { - if (BF.layout_empty()) - return; - - // Cluster basic blocks. - CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */true); - std::vector &Clusters = CAlgo->Clusters; - auto &ClusterEdges = CAlgo->ClusterEdges; - - // Compute clusters' average frequencies. - CAlgo->computeClusterAverageFrequency(); - std::vector &AvgFreq = CAlgo->AvgFreq; - - if (opts::PrintClusters) - CAlgo->printClusters(); - - // Cluster layout order - std::vector ClusterOrder; - - // Do a topological sort for clusters, prioritizing frequently-executed BBs - // during the traversal. 
- std::stack Stack; - std::vector Status; - std::vector Parent; - Status.resize(Clusters.size(), 0); - Parent.resize(Clusters.size(), 0); - constexpr uint32_t STACKED = 1; - constexpr uint32_t VISITED = 2; - Status[0] = STACKED; - Stack.push(0); - while (!Stack.empty()) { - uint32_t I = Stack.top(); - if (!(Status[I] & VISITED)) { - Status[I] |= VISITED; - // Order successors by weight - auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) { - return ClusterEdges[I][A] > ClusterEdges[I][B]; - }; - std::priority_queue, - decltype(ClusterComp)> SuccQueue(ClusterComp); - for (auto &Target: ClusterEdges[I]) { - if (Target.second > 0 && !(Status[Target.first] & STACKED) && - !Clusters[Target.first].empty()) { - Parent[Target.first] = I; - Status[Target.first] = STACKED; - SuccQueue.push(Target.first); - } - } - while (!SuccQueue.empty()) { - Stack.push(SuccQueue.top()); - SuccQueue.pop(); - } - continue; - } - // Already visited this node - Stack.pop(); - ClusterOrder.push_back(I); - } - std::reverse(ClusterOrder.begin(), ClusterOrder.end()); - // Put unreachable clusters at the end - for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) - if (!(Status[I] & VISITED) && !Clusters[I].empty()) - ClusterOrder.push_back(I); - - // Sort nodes with equal precedence - auto Beg = ClusterOrder.begin(); - // Don't reorder the first cluster, which contains the function entry point - ++Beg; - std::stable_sort(Beg, ClusterOrder.end(), - [&AvgFreq, &Parent](uint32_t A, uint32_t B) { - uint32_t P = Parent[A]; - while (Parent[P] != 0) { - if (Parent[P] == B) - return false; - P = Parent[P]; - } - P = Parent[B]; - while (Parent[P] != 0) { - if (Parent[P] == A) - return true; - P = Parent[P]; - } - return AvgFreq[A] > AvgFreq[B]; - }); - - if (opts::PrintClusters) { - errs() << "New cluster order: "; - auto Sep = ""; - for (auto O : ClusterOrder) { - errs() << Sep << O; - Sep = ", "; - } - errs() << '\n'; - } - - // Arrange basic blocks according to cluster order. 
- for (uint32_t ClusterIndex : ClusterOrder) { - ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; - Order.insert(Order.end(), Cluster.begin(), Cluster.end()); - } -} - -void OptimizeCacheReorderAlgorithm::reorderBasicBlocks( - const BinaryFunction &BF, BasicBlockOrder &Order) const { - if (BF.layout_empty()) - return; - - // Cluster basic blocks. - CAlgo->clusterBasicBlocks(BF); - std::vector &Clusters = CAlgo->Clusters; - - // Compute clusters' average frequencies. - CAlgo->computeClusterAverageFrequency(); - std::vector &AvgFreq = CAlgo->AvgFreq; - - if (opts::PrintClusters) - CAlgo->printClusters(); - - // Cluster layout order - std::vector ClusterOrder; - - // Order clusters based on average instruction execution frequency - for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) - if (!Clusters[I].empty()) - ClusterOrder.push_back(I); - // Don't reorder the first cluster, which contains the function entry point - std::stable_sort(std::next(ClusterOrder.begin()), - ClusterOrder.end(), - [&AvgFreq](uint32_t A, uint32_t B) { - return AvgFreq[A] > AvgFreq[B]; - }); - - if (opts::PrintClusters) { - errs() << "New cluster order: "; - auto Sep = ""; - for (auto O : ClusterOrder) { - errs() << Sep << O; - Sep = ", "; - } - errs() << '\n'; - } - - // Arrange basic blocks according to cluster order. - for (uint32_t ClusterIndex : ClusterOrder) { - ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; - Order.insert(Order.end(), Cluster.begin(), Cluster.end()); - } -} - -void ReverseReorderAlgorithm::reorderBasicBlocks( - const BinaryFunction &BF, BasicBlockOrder &Order) const { - if (BF.layout_empty()) - return; - - auto FirstBB = *BF.layout_begin(); - Order.push_back(FirstBB); - for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI) - Order.push_back(*RLI); -} - - -void RandomClusterReorderAlgorithm::reorderBasicBlocks( - const BinaryFunction &BF, BasicBlockOrder &Order) const { - if (BF.layout_empty()) - return; - - // Cluster basic blocks. 
- CAlgo->clusterBasicBlocks(BF); - std::vector &Clusters = CAlgo->Clusters; - - if (opts::PrintClusters) - CAlgo->printClusters(); - - // Cluster layout order - std::vector ClusterOrder; - - // Order clusters based on average instruction execution frequency - for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) - if (!Clusters[I].empty()) - ClusterOrder.push_back(I); - - std::srand(opts::RandomSeed); - std::random_shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end()); - - if (opts::PrintClusters) { - errs() << "New cluster order: "; - auto Sep = ""; - for (auto O : ClusterOrder) { - errs() << Sep << O; - Sep = ", "; - } - errs() << '\n'; - } - - // Arrange basic blocks according to cluster order. - for (uint32_t ClusterIndex : ClusterOrder) { - ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; - Order.insert(Order.end(), Cluster.begin(), Cluster.end()); - } -} diff --git a/bolt/ReorderAlgorithm.h b/bolt/ReorderAlgorithm.h deleted file mode 100644 index ff190191da0e..000000000000 --- a/bolt/ReorderAlgorithm.h +++ /dev/null @@ -1,269 +0,0 @@ -//===- ReorderAlgorithm.h - Interface for basic block reorderng algorithms ===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Interface to different basic block reordering algorithms. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLVM_BOLT_REORDER_ALGORITHM_H -#define LLVM_TOOLS_LLVM_BOLT_REORDER_ALGORITHM_H - -#include "BinaryFunction.h" -#include "llvm/Support/ErrorHandling.h" -#include -#include -#include - - -namespace llvm { - -class raw_ostream; - - -namespace bolt { - -class BinaryBasicBlock; -class BinaryFunction; - -/// Objects of this class implement various basic block clustering algorithms. 
-/// Basic block clusters are chains of basic blocks that should be laid out -/// in this order to maximize performace. These algorithms group basic blocks -/// into clusters using execution profile data and various heuristics. -class ClusterAlgorithm { -public: - using ClusterTy = std::vector; - std::vector Clusters; - std::vector> ClusterEdges; - std::vector AvgFreq; - - /// Group the basic blocks in the given function into clusters stored in the - /// Clusters vector. Also encode relative weights between two clusters in - /// the ClusterEdges vector if requested. This vector is indexed by - /// the clusters indices in the Clusters vector. - virtual void clusterBasicBlocks(const BinaryFunction &BF, - bool ComputeEdges = false) = 0; - - /// Compute for each cluster its averagae execution frequency, that is - /// the sum of average frequencies of its blocks (execution count / # instrs). - /// The average frequencies are stored in the AvgFreq vector, index by the - /// cluster indices in the Clusters vector. - void computeClusterAverageFrequency(); - - /// Clear clusters and related info. - virtual void reset(); - - void printClusters() const; - - virtual ~ClusterAlgorithm() {} -}; - -/// Base class for a greedy clustering algorithm that selects edges in order -/// based on some heuristic and uses them to join basic blocks into clusters. -class GreedyClusterAlgorithm : public ClusterAlgorithm { -protected: - // Represents an edge between two basic blocks, with source, destination, and - // profile count. 
- struct EdgeTy { - const BinaryBasicBlock *Src; - const BinaryBasicBlock *Dst; - uint64_t Count; - - EdgeTy(const BinaryBasicBlock *Src, const BinaryBasicBlock *Dst, - uint64_t Count) : - Src(Src), Dst(Dst), Count(Count) {} - - void print(raw_ostream &OS) const; - }; - - struct EdgeHash { - size_t operator() (const EdgeTy &E) const; - }; - - struct EdgeEqual { - bool operator() (const EdgeTy &A, const EdgeTy &B) const; - }; - - // Virtual methods that allow custom specialization of the heuristic used by - // the algorithm to select edges. - virtual void initQueue( - std::vector &Queue, const BinaryFunction &BF) = 0; - virtual void adjustQueue( - std::vector &Queue, const BinaryFunction &BF) = 0; - virtual bool areClustersCompatible( - const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const = 0; - - // Map from basic block to owning cluster index. - using BBToClusterMapTy = std::unordered_map; - BBToClusterMapTy BBToClusterMap; - -public: - void clusterBasicBlocks(const BinaryFunction &BF, - bool ComputeEdges = false) override; - void reset() override; -}; - - -/// This clustering algorithm is based on a greedy heuristic suggested by -/// Pettis and Hansen (PLDI '90). -class PHGreedyClusterAlgorithm : public GreedyClusterAlgorithm { -protected: - void initQueue( - std::vector &Queue, const BinaryFunction &BF) override; - void adjustQueue( - std::vector &Queue, const BinaryFunction &BF) override; - bool areClustersCompatible( - const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const - override; -}; - - -/// This clustering algorithm is based on a greedy heuristic that is a -/// modification of the heuristic suggested by Pettis (PLDI '90). It is -/// geared towards minimizing branches. -class MinBranchGreedyClusterAlgorithm : public GreedyClusterAlgorithm { -private: - // Map from an edge to its weight which is used by the algorithm to sort the - // edges. 
- std::unordered_map Weight; - - // The weight of an edge is calculated as the win in branches if we choose - // to layout this edge as a fall-through. For example, consider the edges - // A -> B with execution count 500, - // A -> C with execution count 100, and - // D -> B with execution count 150 - // wher B, C are the only successors of A and A, D are thr only predessecors - // of B. Then if we choose to layout edge A -> B as a fallthrough, the win in - // branches would be 500 - 100 - 150 = 250. That is the weight of edge A->B. - int64_t calculateWeight(const EdgeTy &E, const BinaryFunction &BF) const; - -protected: - void initQueue( - std::vector &Queue, const BinaryFunction &BF) override; - void adjustQueue( - std::vector &Queue, const BinaryFunction &BF) override; - bool areClustersCompatible( - const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const - override; - -public: - void reset() override; -}; - - -/// Objects of this class implement various basic block reordering alogrithms. -/// Most of these algorithms depend on a clustering alogrithm. -/// Here we have 3 conflicting goals as to how to layout clusters. If we want -/// to minimize jump offsets, we should put clusters with heavy inter-cluster -/// dependence as close as possible. If we want to maximize the probability -/// that all inter-cluster edges are predicted as not-taken, we should enforce -/// a topological order to make targets appear after sources, creating forward -/// branches. If we want to separate hot from cold blocks to maximize the -/// probability that unfrequently executed code doesn't pollute the cache, we -/// should put clusters in descending order of hotness. 
-class ReorderAlgorithm { -protected: - std::unique_ptr CAlgo; - -public: - ReorderAlgorithm() { } - explicit ReorderAlgorithm(std::unique_ptr CAlgo) : - CAlgo(std::move(CAlgo)) { } - - using BasicBlockOrder = BinaryFunction::BasicBlockOrderType; - - /// Reorder the basic blocks of the given function and store the new order in - /// the new Clusters vector. - virtual void reorderBasicBlocks( - const BinaryFunction &BF, BasicBlockOrder &Order) const = 0; - - void setClusterAlgorithm(ClusterAlgorithm *CAlgo) { - this->CAlgo.reset(CAlgo); - } - - virtual ~ReorderAlgorithm() { } -}; - - -/// Dynamic programming implementation for the TSP, applied to BB layout. Find -/// the optimal way to maximize weight during a path traversing all BBs. In -/// this way, we will convert the hottest branches into fall-throughs. -/// -/// Uses exponential amount of memory on the number of basic blocks and should -/// only be used for small functions. -class OptimalReorderAlgorithm : public ReorderAlgorithm { -public: - void reorderBasicBlocks( - const BinaryFunction &BF, BasicBlockOrder &Order) const override; -}; - - -/// Simple algorithm that groups basic blocks into clusters and then -/// lays them out cluster after cluster. -class OptimizeReorderAlgorithm : public ReorderAlgorithm { -public: - explicit OptimizeReorderAlgorithm(std::unique_ptr CAlgo) : - ReorderAlgorithm(std::move(CAlgo)) { } - - void reorderBasicBlocks( - const BinaryFunction &BF, BasicBlockOrder &Order) const override; -}; - - -/// This reorder algorithm tries to ensure that all inter-cluster edges are -/// predicted as not-taken, by enforcing a topological order to make -/// targets appear after sources, creating forward branches. 
-class OptimizeBranchReorderAlgorithm : public ReorderAlgorithm { -public: - explicit OptimizeBranchReorderAlgorithm( - std::unique_ptr CAlgo) : - ReorderAlgorithm(std::move(CAlgo)) { } - - void reorderBasicBlocks( - const BinaryFunction &BF, BasicBlockOrder &Order) const override; -}; - - -/// This reorder tries to separate hot from cold blocks to maximize the -/// probability that unfrequently executed code doesn't pollute the cache, by -/// putting clusters in descending order of hotness. -class OptimizeCacheReorderAlgorithm : public ReorderAlgorithm { -public: - explicit OptimizeCacheReorderAlgorithm( - std::unique_ptr CAlgo) : - ReorderAlgorithm(std::move(CAlgo)) { } - - void reorderBasicBlocks( - const BinaryFunction &BF, BasicBlockOrder &Order) const override; -}; - - -/// Toy example that simply reverses the original basic block order. -class ReverseReorderAlgorithm : public ReorderAlgorithm { -public: - void reorderBasicBlocks( - const BinaryFunction &BF, BasicBlockOrder &Order) const override; -}; - -/// Create clusters as usual and place them in random order. 
-class RandomClusterReorderAlgorithm : public ReorderAlgorithm { -public: - explicit RandomClusterReorderAlgorithm( - std::unique_ptr CAlgo) : - ReorderAlgorithm(std::move(CAlgo)) { } - - void reorderBasicBlocks( - const BinaryFunction &BF, BasicBlockOrder &Order) const override; -}; - -} // namespace bolt -} // namespace llvm - -#endif - diff --git a/bolt/llvm.patch b/bolt/llvm.patch new file mode 100644 index 000000000000..53272088cb62 --- /dev/null +++ b/bolt/llvm.patch @@ -0,0 +1,2534 @@ +diff --git a/include/llvm/ADT/BitVector.h b/include/llvm/ADT/BitVector.h +index 124c2a8..03af230 100644 +--- a/include/llvm/ADT/BitVector.h ++++ b/include/llvm/ADT/BitVector.h +@@ -591,6 +591,11 @@ public: + return *this; + } + ++ friend BitVector operator|(BitVector LHS, const BitVector &RHS) { ++ LHS |= RHS; ++ return LHS; ++ } ++ + BitVector &operator^=(const BitVector &RHS) { + if (size() < RHS.size()) + resize(RHS.size()); +diff --git a/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h b/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h +index 84b2339..9ed1792 100644 +--- a/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h ++++ b/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h +@@ -28,12 +28,15 @@ class raw_ostream; + class DWARFAbbreviationDeclaration { + public: + struct AttributeSpec { +- AttributeSpec(dwarf::Attribute A, dwarf::Form F, int64_t Value) +- : Attr(A), Form(F), Value(Value) { ++ AttributeSpec(dwarf::Attribute A, dwarf::Form F, int64_t Value, ++ uint32_t AttrOffset = -1U, uint32_t FormOffset = -1U) ++ : Attr(A), Form(F), AttrOffset(AttrOffset), FormOffset(FormOffset), ++ Value(Value) { + assert(isImplicitConst()); + } +- AttributeSpec(dwarf::Attribute A, dwarf::Form F, Optional ByteSize) +- : Attr(A), Form(F) { ++ AttributeSpec(dwarf::Attribute A, dwarf::Form F, Optional ByteSize, ++ uint32_t AttrOffset = -1U, uint32_t FormOffset = -1U) ++ : Attr(A), Form(F), AttrOffset(AttrOffset), FormOffset(FormOffset) { + 
assert(!isImplicitConst()); + this->ByteSize.HasByteSize = ByteSize.hasValue(); + if (this->ByteSize.HasByteSize) +@@ -42,6 +45,8 @@ public: + + dwarf::Attribute Attr; + dwarf::Form Form; ++ uint32_t AttrOffset; ++ uint32_t FormOffset; + + private: + /// The following field is used for ByteSize for non-implicit_const +@@ -112,6 +117,8 @@ public: + return AttributeSpecs[idx].Attr; + } + ++ const AttributeSpec *findAttribute(dwarf::Attribute Attr) const; ++ + /// Get the index of the specified attribute. + /// + /// Searches the this abbreviation declaration for the index of the specified +@@ -133,7 +140,8 @@ public: + /// \returns Optional DWARF form value if the attribute was extracted. + Optional getAttributeValue(const uint32_t DIEOffset, + const dwarf::Attribute Attr, +- const DWARFUnit &U) const; ++ const DWARFUnit &U, ++ uint32_t *OffsetPtr = 0) const; + + bool extract(DataExtractor Data, uint32_t* OffsetPtr); + void dump(raw_ostream &OS) const; +diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h +index e842cf2..83b0dbe 100644 +--- a/include/llvm/DebugInfo/DWARF/DWARFContext.h ++++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h +@@ -225,6 +225,9 @@ public: + /// Get a pointer to the parsed DebugLoc object. + const DWARFDebugLoc *getDebugLoc(); + ++ /// Extract one location list corresponding in \p Offset ++ Optional getOneDebugLocList(uint32_t *Offset); ++ + /// Get a pointer to the parsed dwo abbreviations object. + const DWARFDebugAbbrev *getDebugAbbrevDWO(); + +@@ -280,6 +283,10 @@ public: + /// given address where applicable. + DIEsForAddress getDIEsForAddress(uint64_t Address); + ++ /// Get offset to an attribute value within a compile unit ++ /// or 0 if the attribute was not found. 
++ uint32_t getAttrFieldOffsetForUnit(DWARFUnit *U, dwarf::Attribute Attr) const; ++ + DILineInfo getLineInfoForAddress(uint64_t Address, + DILineInfoSpecifier Specifier = DILineInfoSpecifier()) override; + DILineInfoTable getLineInfoForAddressRange(uint64_t Address, uint64_t Size, +@@ -302,7 +309,7 @@ public: + static std::unique_ptr + create(const object::ObjectFile &Obj, const LoadedObjectInfo *L = nullptr, + function_ref HandleError = defaultErrorHandler, +- std::string DWPName = ""); ++ std::string DWPName = "", bool UsesRelocs = true); + + static std::unique_ptr + create(const StringMap> &Sections, +@@ -313,7 +320,6 @@ public: + /// have initialized the relevant target descriptions. + Error loadRegisterInfo(const object::ObjectFile &Obj); + +-private: + /// Return the compile unit that includes an offset (relative to .debug_info). + DWARFCompileUnit *getCompileUnitForOffset(uint32_t Offset); + +diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h b/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h +index ff1c7fb..2622a4e 100644 +--- a/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h ++++ b/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h +@@ -16,6 +16,7 @@ + #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" + #include "llvm/DebugInfo/DWARF/DWARFExpression.h" + #include "llvm/Support/Error.h" ++#include + #include + #include + +@@ -279,9 +280,20 @@ public: + void dump(raw_ostream &OS, const MCRegisterInfo *MRI, + Optional Offset) const; + ++ using RefHandlerType = ++ std::function; ++ + /// Parse the section from raw data. \p Data is assumed to contain the whole + /// frame section contents to be parsed. +- void parse(DWARFDataExtractor Data); ++ /// If non-null RefHandler is passed, call it for every encountered external ++ /// reference in frame data. 
The expected signature is: ++ /// ++ /// void RefHandler(uint64_t Value, uint64_t Offset, uint64_t Type); ++ /// ++ /// where Value is a value of the reference, Offset - is an offset into the ++ /// frame data at which the reference occured, and Type is a DWARF encoding ++ /// type of the reference. ++ void parse(DWARFDataExtractor Data, RefHandlerType RefHandler= nullptr); + + /// Return whether the section has any entries. + bool empty() const { return Entries.empty(); } +@@ -293,6 +305,12 @@ public: + return iterator_range(Entries.begin(), Entries.end()); + } + ++ using FDEFunction = std::function; ++ ++ /// Call function F for every FDE in the frame. ++ void for_each_FDE(FDEFunction F) const; ++ ++ + uint64_t getEHFrameAddress() const { return EHFrameAddress; } + }; + +diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h b/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h +index a6d319a..39674a9 100644 +--- a/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h ++++ b/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h +@@ -68,6 +68,9 @@ public: + /// Return the location list at the given offset or nullptr. + LocationList const *getLocationListAtOffset(uint64_t Offset) const; + ++ /// Returns the parsed location lists. ++ const LocationLists &getLocationLists() const { return Locations; } ++ + Optional parseOneLocationList(DWARFDataExtractor Data, + uint32_t *Offset); + }; +diff --git a/include/llvm/DebugInfo/DWARF/DWARFDie.h b/include/llvm/DebugInfo/DWARF/DWARFDie.h +index 39a3dd3..8427987 100644 +--- a/include/llvm/DebugInfo/DWARF/DWARFDie.h ++++ b/include/llvm/DebugInfo/DWARF/DWARFDie.h +@@ -130,7 +130,8 @@ public: + /// \param Attr the attribute to extract. + /// \returns an optional DWARFFormValue that will have the form value if the + /// attribute was successfully extracted. +- Optional find(dwarf::Attribute Attr) const; ++ Optional find(dwarf::Attribute Attr, ++ uint32_t *OffsetPtr = 0) const; + + /// Extract the first value of any attribute in Attrs from this DIE. 
+ /// +diff --git a/include/llvm/DebugInfo/DWARF/DWARFObject.h b/include/llvm/DebugInfo/DWARF/DWARFObject.h +index 795eddd..43243e7 100644 +--- a/include/llvm/DebugInfo/DWARF/DWARFObject.h ++++ b/include/llvm/DebugInfo/DWARF/DWARFObject.h +@@ -41,6 +41,7 @@ public: + virtual StringRef getARangeSection() const { return ""; } + virtual StringRef getDebugFrameSection() const { return ""; } + virtual StringRef getEHFrameSection() const { return ""; } ++ virtual uint64_t getEHFrameAddress() const { return 0; } + virtual const DWARFSection &getLineSection() const { return Dummy; } + virtual StringRef getLineStringSection() const { return ""; } + virtual StringRef getStringSection() const { return ""; } +diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h +index 7932688..51bf471 100644 +--- a/include/llvm/ExecutionEngine/ExecutionEngine.h ++++ b/include/llvm/ExecutionEngine/ExecutionEngine.h +@@ -251,6 +251,16 @@ public: + "EE!"); + } + ++ /// mapSectionAddress - map a section to its target address space value. ++ /// Map a JIT section with a given ID to the address in the target process ++ /// as the running code will see it. This is the address which will be used ++ /// for relocation resolution. ++ virtual void mapSectionAddress(unsigned SectionID, ++ uint64_t TargetAddress) { ++ llvm_unreachable("Re-mapping of section addresses not supported with this " ++ "EE!"); ++ } ++ + /// generateCodeForModule - Run code generation for the specified module and + /// load it into memory. + /// +diff --git a/include/llvm/ExecutionEngine/JITSymbol.h b/include/llvm/ExecutionEngine/JITSymbol.h +index 86ab173..257ed03 100644 +--- a/include/llvm/ExecutionEngine/JITSymbol.h ++++ b/include/llvm/ExecutionEngine/JITSymbol.h +@@ -297,7 +297,17 @@ public: + /// missing. Instead, that symbol will be left out of the result map. 
+ virtual Expected lookupFlags(const LookupSet &Symbols) = 0; + ++ /// Specify if this resolver can return valid symbols with zero value. ++ virtual void setAllowsZeroSymbols(bool Value = true) { ++ AllowsZeroSymbols = Value; ++ } ++ ++ /// Return true if the resolver can return a valid symbol with zero value. ++ virtual bool allowsZeroSymbols() { return AllowsZeroSymbols; } ++ + private: ++ bool AllowsZeroSymbols = false; ++ + virtual void anchor(); + }; + +diff --git a/include/llvm/ExecutionEngine/Orc/Core.h b/include/llvm/ExecutionEngine/Orc/Core.h +index 26fec8b..c533003 100644 +--- a/include/llvm/ExecutionEngine/Orc/Core.h ++++ b/include/llvm/ExecutionEngine/Orc/Core.h +@@ -110,7 +110,17 @@ public: + virtual SymbolNameSet lookup(std::shared_ptr Query, + SymbolNameSet Symbols) = 0; + ++ /// Specify if this resolver can return valid symbols with zero value. ++ virtual void setAllowsZeroSymbols(bool Value = true) { ++ AllowsZeroSymbols = Value; ++ } ++ ++ /// Return true if the resolver can return a valid symbol with zero value. 
++ virtual bool allowsZeroSymbols() { return AllowsZeroSymbols; } ++ + private: ++ bool AllowsZeroSymbols = false; ++ + virtual void anchor(); + }; + +diff --git a/include/llvm/ExecutionEngine/Orc/Legacy.h b/include/llvm/ExecutionEngine/Orc/Legacy.h +index b2b389a..7c108ef 100644 +--- a/include/llvm/ExecutionEngine/Orc/Legacy.h ++++ b/include/llvm/ExecutionEngine/Orc/Legacy.h +@@ -25,6 +25,10 @@ public: + JITSymbolResolverAdapter(ExecutionSession &ES, SymbolResolver &R); + Expected lookupFlags(const LookupSet &Symbols) override; + Expected lookup(const LookupSet &Symbols) override; ++ bool allowsZeroSymbols() override { return R.allowsZeroSymbols(); } ++ void setAllowsZeroSymbols(bool Value) override { ++ R.setAllowsZeroSymbols(Value); ++ } + + private: + ExecutionSession &ES; +@@ -70,11 +74,13 @@ Expected lookupFlagsWithLegacyFn(SymbolFlagsMap &SymbolFlags, + template + SymbolNameSet lookupWithLegacyFn(AsynchronousSymbolQuery &Query, + const SymbolNameSet &Symbols, +- FindSymbolFn FindSymbol) { ++ FindSymbolFn FindSymbol, ++ bool AllowZeroSymbols = false) { + SymbolNameSet SymbolsNotFound; + + for (auto &S : Symbols) { +- if (JITSymbol Sym = FindSymbol(*S)) { ++ JITSymbol Sym = FindSymbol(*S); ++ if (Sym || (AllowZeroSymbols && !Sym.getFlags().hasError())) { + if (auto Addr = Sym.getAddress()) { + Query.setDefinition(S, JITEvaluatedSymbol(*Addr, Sym.getFlags())); + Query.notifySymbolFinalized(); +@@ -116,7 +122,8 @@ public: + + SymbolNameSet lookup(std::shared_ptr Query, + SymbolNameSet Symbols) final { +- return lookupWithLegacyFn(*Query, Symbols, LegacyLookup); ++ return lookupWithLegacyFn(*Query, Symbols, LegacyLookup, ++ this->allowsZeroSymbols()); + } + + private: +diff --git a/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h b/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h +index cfc3922..c0b43ce 100644 +--- a/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h ++++ b/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h +@@ -80,6 
+80,12 @@ public: + BaseLayer.mapSectionAddress(K, LocalAddress, TargetAddr); + } + ++ /// @brief Map section addresses for the objects associated with the handle H. ++ void mapSectionAddress(VModuleKey K, unsigned SectionID, ++ JITTargetAddress TargetAddr) { ++ BaseLayer.mapSectionAddress(K, SectionID, TargetAddr); ++ } ++ + /// @brief Access the transform functor directly. + TransformFtor &getTransform() { return Transform; } + +diff --git a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h +index 8f0d9fa..ada93a2 100644 +--- a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h ++++ b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h +@@ -62,6 +62,8 @@ protected: + + virtual void mapSectionAddress(const void *LocalAddress, + JITTargetAddress TargetAddr) const = 0; ++ virtual void mapSectionAddress(unsigned SectionID, ++ JITTargetAddress TargetAddr) const = 0; + + JITSymbol getSymbol(StringRef Name, bool ExportedSymbolsOnly) { + auto SymEntry = SymbolTable.find(Name); +@@ -133,6 +135,9 @@ private: + std::unique_ptr Info = + PFC->RTDyld->loadObject(*PFC->Obj.getBinary()); + ++ if (PFC->Parent.NotifyLoaded) ++ PFC->Parent.NotifyLoaded(PFC->K, *PFC->Obj.getBinary(), *Info); ++ + // Copy the symbol table out of the RuntimeDyld instance. 
+ { + auto SymTab = PFC->RTDyld->getSymbolTable(); +@@ -140,9 +145,6 @@ private: + SymbolTable[KV.first] = KV.second; + } + +- if (PFC->Parent.NotifyLoaded) +- PFC->Parent.NotifyLoaded(PFC->K, *PFC->Obj.getBinary(), *Info); +- + PFC->RTDyld->finalizeWithMemoryManagerLocking(); + + if (PFC->RTDyld->hasError()) +@@ -175,6 +177,13 @@ private: + PFC->RTDyld->mapSectionAddress(LocalAddress, TargetAddr); + } + ++ void mapSectionAddress(unsigned SectionID, ++ JITTargetAddress TargetAddr) const override { ++ assert(PFC && "mapSectionAddress called on finalized LinkedObject"); ++ assert(PFC->RTDyld && "mapSectionAddress called on raw LinkedObject"); ++ PFC->RTDyld->mapSectionAddress(SectionID, TargetAddr); ++ } ++ + private: + void buildInitialSymbolTable(const OwnedObject &Obj) { + for (auto &Symbol : Obj.getBinary()->symbols()) { +@@ -325,6 +334,13 @@ public: + LinkedObjects[K]->mapSectionAddress(LocalAddress, TargetAddr); + } + ++ /// @brief Map section addresses for the objects associated with the handle H. ++ void mapSectionAddress(VModuleKey K, unsigned SectionID, ++ JITTargetAddress TargetAddr) { ++ assert(LinkedObjects.count(K) && "VModuleKey not associated with object"); ++ LinkedObjects[K]->mapSectionAddress(SectionID, TargetAddr); ++ } ++ + /// @brief Immediately emit and finalize the object represented by the given + /// VModuleKey. + /// @param K VModuleKey for object to emit/finalize. +diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h +index 14da5af..27b0243 100644 +--- a/include/llvm/ExecutionEngine/RuntimeDyld.h ++++ b/include/llvm/ExecutionEngine/RuntimeDyld.h +@@ -112,6 +112,14 @@ public: + StringRef SectionName, + bool IsReadOnly) = 0; + ++ /// Notify that a comment/note section exists and where it's located ++ /// in case the user needs to look up extra information about the ++ /// code, e.g. debugging information. 
++ virtual uint8_t *recordNoteSection(const uint8_t *Data, uintptr_t Size, ++ unsigned Alignment, ++ unsigned SectionID, ++ StringRef SectionName) { return nullptr;} ++ + /// Inform the memory manager about the total amount of memory required to + /// allocate all sections to be loaded: + /// \p CodeSize - the total size of all code sections +@@ -129,6 +137,11 @@ public: + /// Override to return true to enable the reserveAllocationSpace callback. + virtual bool needsToReserveAllocationSpace() { return false; } + ++ /// Override to return false to tell LLVM no stub space will be needed. ++ /// This requires some guarantees depending on architecuture, but when ++ /// you know what you are doing it saves allocated space. ++ virtual bool allowStubAllocation() const { return true; } ++ + /// Register the EH frames with the runtime so that c++ exceptions work. + /// + /// \p Addr parameter provides the local address of the EH frame section +@@ -205,6 +218,12 @@ public: + /// This is the address which will be used for relocation resolution. + void mapSectionAddress(const void *LocalAddress, uint64_t TargetAddress); + ++ /// Map a section to its target address space value. ++ /// Map a JIT section with a given ID to the address in the target process as ++ /// the running code will see it. This is the address which will be used for ++ /// relocation resolution. ++ void mapSectionAddress(unsigned SectionID, uint64_t TargetAddress); ++ + /// Register any EH frame sections that have been loaded but not previously + /// registered with the memory manager. Note, RuntimeDyld is responsible + /// for identifying the EH frame and calling the memory manager with the +diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h +index c538c46..7b16897 100644 +--- a/include/llvm/MC/MCAsmInfo.h ++++ b/include/llvm/MC/MCAsmInfo.h +@@ -225,6 +225,10 @@ protected: + /// result of a alignment directive. 
Defaults to 0 + unsigned TextAlignFillValue = 0; + ++ /// If non-zero, this is used to fill the executable space with instructions ++ /// that will trap. Defaults to 0 ++ unsigned TrapFillValue = 0; ++ + //===--- Global Variable Emission Directives --------------------------===// + + /// This is the directive used to declare a global entity. Defaults to +@@ -504,6 +508,7 @@ public: + const char *getAscizDirective() const { return AscizDirective; } + bool getAlignmentIsInBytes() const { return AlignmentIsInBytes; } + unsigned getTextAlignFillValue() const { return TextAlignFillValue; } ++ unsigned getTrapFillValue() const { return TrapFillValue; } + const char *getGlobalDirective() const { return GlobalDirective; } + + bool doesSetDirectiveSuppressReloc() const { +diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h +index c110ffd..a29f320 100644 +--- a/include/llvm/MC/MCContext.h ++++ b/include/llvm/MC/MCContext.h +@@ -506,6 +506,10 @@ namespace llvm { + return MCDwarfLineTablesCUMap; + } + ++ std::map &getMCDwarfLineTables() { ++ return MCDwarfLineTablesCUMap; ++ } ++ + MCDwarfLineTable &getMCDwarfLineTable(unsigned CUID) { + return MCDwarfLineTablesCUMap[CUID]; + } +@@ -552,13 +556,14 @@ namespace llvm { + /// instruction will be created. 
+ void setCurrentDwarfLoc(unsigned FileNum, unsigned Line, unsigned Column, + unsigned Flags, unsigned Isa, +- unsigned Discriminator) { ++ unsigned Discriminator, uint64_t Addr = -1ULL) { + CurrentDwarfLoc.setFileNum(FileNum); + CurrentDwarfLoc.setLine(Line); + CurrentDwarfLoc.setColumn(Column); + CurrentDwarfLoc.setFlags(Flags); + CurrentDwarfLoc.setIsa(Isa); + CurrentDwarfLoc.setDiscriminator(Discriminator); ++ CurrentDwarfLoc.setAbsoluteAddr(Addr); + DwarfLocSeen = true; + } + +diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h +index 5cdb176..cd46632 100644 +--- a/include/llvm/MC/MCDwarf.h ++++ b/include/llvm/MC/MCDwarf.h +@@ -73,6 +73,7 @@ class MCDwarfLoc { + uint8_t Flags; + uint8_t Isa; + uint32_t Discriminator; ++ uint64_t AbsoluteAddr; + + // Flag that indicates the initial value of the is_stmt_start flag. + #define DWARF2_LINE_DEFAULT_IS_STMT 1 +@@ -87,14 +88,17 @@ private: // MCContext manages these + friend class MCDwarfLineEntry; + + MCDwarfLoc(unsigned fileNum, unsigned line, unsigned column, unsigned flags, +- unsigned isa, unsigned discriminator) ++ unsigned isa, unsigned discriminator, uint64_t addr=-1ULL) + : FileNum(fileNum), Line(line), Column(column), Flags(flags), Isa(isa), +- Discriminator(discriminator) {} ++ Discriminator(discriminator), AbsoluteAddr(addr) {} + + // Allow the default copy constructor and assignment operator to be used + // for an MCDwarfLoc object. + + public: ++ /// \brief Get the AbsoluteAddr of this MCDwarfLoc. ++ uint64_t getAbsoluteAddr() const { return AbsoluteAddr; } ++ + /// \brief Get the FileNum of this MCDwarfLoc. + unsigned getFileNum() const { return FileNum; } + +@@ -141,6 +145,11 @@ public: + void setDiscriminator(unsigned discriminator) { + Discriminator = discriminator; + } ++ ++ /// \brief Set the AbsoluteAddr of this MCDwarfLoc. 
++ void setAbsoluteAddr(uint64_t addr) { ++ AbsoluteAddr = addr; ++ } + }; + + /// \brief Instances of this class represent the line information for +@@ -274,7 +283,7 @@ public: + + // This emits the Dwarf file and the line tables for a given Compile Unit. + void EmitCU(MCObjectStreamer *MCOS, MCDwarfLineTableParams Params, +- Optional &LineStr) const; ++ Optional &LineStr); + + Expected tryGetFile(StringRef &Directory, StringRef &FileName, + MD5::MD5Result *Checksum, +@@ -380,6 +389,57 @@ public: + SMLoc &Loc); + }; + ++/// \brief A sequence of MCDwarfOperations corresponds to a DWARF expression, ++/// used as operand in some MCCFIInstructions. ++struct MCDwarfOperation { ++ uint8_t Operation{0}; ++ uint64_t Operand0{0}; ++ uint64_t Operand1{0}; ++ ++ MCDwarfOperation(uint8_t O, uint64_t O0, uint64_t O1) ++ : Operation(O), Operand0(O0), Operand1(O1) {} ++ ++ bool operator==(const MCDwarfOperation &Other) const { ++ return (Other.Operation == Operation && Other.Operand0 == Operand0 && ++ Other.Operand1 == Operand1); ++ } ++}; ++typedef std::vector MCDwarfExpression; ++ ++/// \brief This builder should be used to create MCDwarfExpression objects ++/// before feeding them to a CFIInstruction factory method. 
++class MCDwarfExprBuilder { ++public: ++ MCDwarfExprBuilder() {} ++ ++private: ++ MCDwarfExpression Expr; ++ ++public: ++ MCDwarfExprBuilder &appendOperation(uint8_t Operation) { ++ Expr.push_back(MCDwarfOperation(Operation, 0, 0)); ++ return *this; ++ } ++ ++ MCDwarfExprBuilder &appendOperation(uint8_t Operation, uint64_t Op0) { ++ Expr.push_back(MCDwarfOperation(Operation, Op0, 0)); ++ return *this; ++ } ++ ++ MCDwarfExprBuilder &appendOperation(uint8_t Operation, uint64_t Op0, ++ uint64_t Op1) { ++ Expr.push_back(MCDwarfOperation(Operation, Op0, Op1)); ++ return *this; ++ } ++ ++ /// \brief Return the resulting expression and reset internal state ++ MCDwarfExpression take() { ++ MCDwarfExpression Res; ++ std::swap(Res, Expr); ++ return Res; ++ } ++}; ++ + class MCCFIInstruction { + public: + enum OpType { +@@ -397,6 +457,9 @@ public: + OpUndefined, + OpRegister, + OpWindowSave, ++ OpExpression, ++ OpDefCfaExpression, ++ OpValExpression, + OpGnuArgsSize + }; + +@@ -409,11 +472,13 @@ private: + unsigned Register2; + }; + std::vector Values; ++ MCDwarfExpression Expression; + + MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int O, StringRef V) + : Operation(Op), Label(L), Register(R), Offset(O), + Values(V.begin(), V.end()) { +- assert(Op != OpRegister); ++ assert(Op != OpRegister && Op != OpDefCfaExpression && ++ Op != OpValExpression && Op != OpExpression); + } + + MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R1, unsigned R2) +@@ -421,6 +486,20 @@ private: + assert(Op == OpRegister); + } + ++ MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, ++ const MCDwarfExpression &E) ++ : Operation(Op), Label(L), Register(R), Offset(0), Expression(E) { ++ assert(Op == OpDefCfaExpression || Op == OpValExpression || ++ Op == OpExpression); ++ } ++ ++ MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, MCDwarfExpression &&E) ++ : Operation(Op), Label(L), Register(R), Offset(0), ++ Expression(std::move(E)) { ++ assert(Op == OpDefCfaExpression || Op == 
OpValExpression || ++ Op == OpExpression); ++ } ++ + public: + /// \brief .cfi_def_cfa defines a rule for computing CFA as: take address from + /// Register and add Offset to it. +@@ -516,14 +595,56 @@ public: + return MCCFIInstruction(OpGnuArgsSize, L, 0, Size, ""); + } + ++ /// \brief MCCFIInstructions that refer to an expression, expression object is ++ /// created by copying ++ static MCCFIInstruction createDefCfaExpression(MCSymbol *L, ++ const MCDwarfExpression &E) { ++ return MCCFIInstruction(OpDefCfaExpression, L, 0, E); ++ } ++ ++ static MCCFIInstruction createValExpression(MCSymbol *L, unsigned R, ++ const MCDwarfExpression &E) { ++ return MCCFIInstruction(OpValExpression, L, R, E); ++ } ++ ++ static MCCFIInstruction createExpression(MCSymbol *L, unsigned R, ++ const MCDwarfExpression &E) { ++ return MCCFIInstruction(OpExpression, L, R, E); ++ } ++ ++ /// \brief MCCFIInstructions that refer to an expression, expression object is ++ /// moved from an r-value ++ static MCCFIInstruction createDefCfaExpression(MCSymbol *L, ++ MCDwarfExpression &&E) { ++ return MCCFIInstruction(OpDefCfaExpression, L, 0, E); ++ } ++ ++ static MCCFIInstruction createValExpression(MCSymbol *L, unsigned R, ++ MCDwarfExpression &&E) { ++ return MCCFIInstruction(OpValExpression, L, R, E); ++ } ++ ++ static MCCFIInstruction createExpression(MCSymbol *L, unsigned R, ++ MCDwarfExpression &&E) { ++ return MCCFIInstruction(OpExpression, L, R, E); ++ } ++ ++ bool operator==(const MCCFIInstruction &Other) const { ++ return (Other.Operation == Operation && Other.Label == Label && ++ Other.Offset == Offset && Other.Register == Register && ++ Other.Expression == Expression); ++ } ++ + OpType getOperation() const { return Operation; } + MCSymbol *getLabel() const { return Label; } ++ void setLabel(MCSymbol *L) { Label = L; } + + unsigned getRegister() const { + assert(Operation == OpDefCfa || Operation == OpOffset || + Operation == OpRestore || Operation == OpUndefined || + Operation == 
OpSameValue || Operation == OpDefCfaRegister || +- Operation == OpRelOffset || Operation == OpRegister); ++ Operation == OpRelOffset || Operation == OpRegister || ++ Operation == OpExpression || Operation == OpValExpression); + return Register; + } + +@@ -539,6 +660,33 @@ public: + return Offset; + } + ++ void setOffset(int NewOffset) { ++ assert(Operation == OpDefCfa || Operation == OpOffset || ++ Operation == OpRelOffset || Operation == OpDefCfaOffset || ++ Operation == OpAdjustCfaOffset || Operation == OpGnuArgsSize); ++ Offset = NewOffset; ++ } ++ ++ void setRegister(unsigned NewReg) { ++ assert(Operation == OpDefCfa || Operation == OpOffset || ++ Operation == OpRestore || Operation == OpUndefined || ++ Operation == OpSameValue || Operation == OpDefCfaRegister || ++ Operation == OpRelOffset || Operation == OpRegister || ++ Operation == OpExpression || Operation == OpValExpression); ++ Register = NewReg; ++ } ++ ++ void setRegister2(unsigned NewReg) { ++ assert(Operation == OpRegister); ++ Register2 = NewReg; ++ } ++ ++ const MCDwarfExpression &getExpression() const { ++ assert(Operation == OpDefCfaExpression || Operation == OpExpression || ++ Operation == OpValExpression); ++ return Expression; ++ } ++ + StringRef getValues() const { + assert(Operation == OpEscape); + return StringRef(&Values[0], Values.size()); +diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h +index fcbbe65..25847aa 100644 +--- a/include/llvm/MC/MCExpr.h ++++ b/include/llvm/MC/MCExpr.h +@@ -123,6 +123,9 @@ public: + /// expression. + MCFragment *findAssociatedFragment() const; + ++ /// Helper method that returns the Symbol of an MCSymbolRef Expression. 
++ const MCSymbol &getSymbol() const; ++ + /// @} + }; + +diff --git a/include/llvm/MC/MCFragment.h b/include/llvm/MC/MCFragment.h +index 38c3655..dec2957 100644 +--- a/include/llvm/MC/MCFragment.h ++++ b/include/llvm/MC/MCFragment.h +@@ -34,6 +34,7 @@ class MCFragment : public ilist_node_with_parent { + public: + enum FragmentType : uint8_t { + FT_Align, ++ FT_NeverAlign, + FT_Data, + FT_CompactEncodedInst, + FT_Fill, +@@ -325,6 +326,46 @@ public: + } + }; + ++class MCNeverAlignFragment : public MCFragment { ++ /// Alignment - The alignment the end of the next fragment should avoid ++ unsigned Alignment; ++ ++ /// EmitNops - Flag to indicate that (optimal) NOPs should be emitted instead ++ /// of using the provided value. The exact interpretation of this flag is ++ /// target dependent. ++ bool EmitNops : 1; ++ ++ /// Value - Value to use for filling padding bytes. ++ int64_t Value; ++ ++ /// ValueSize - The size of the integer (in bytes) of \p Value. ++ unsigned ValueSize; ++ ++ public: ++ MCNeverAlignFragment(unsigned Alignment, ++ int64_t Value, unsigned ValueSize, MCSection *Sec = nullptr) ++ : MCFragment(FT_NeverAlign, false, 0, Sec), Alignment(Alignment), ++ EmitNops(false), Value(Value), ValueSize(ValueSize) {} ++ ++ /// \name Accessors ++ /// @{ ++ ++ unsigned getAlignment() const { return Alignment; } ++ ++ int64_t getValue() const { return Value; } ++ ++ unsigned getValueSize() const { return ValueSize; } ++ ++ bool hasEmitNops() const { return EmitNops; } ++ void setEmitNops(bool Value) { EmitNops = Value; } ++ ++ /// @} ++ ++ static bool classof(const MCFragment *F) { ++ return F->getKind() == MCFragment::FT_NeverAlign; ++ } ++}; ++ + /// Fragment for adding required padding. 
+ /// This fragment is always inserted before an instruction, and holds that + /// instruction as context information (as well as a mask of kinds) for +diff --git a/include/llvm/MC/MCInst.h b/include/llvm/MC/MCInst.h +index db28fd0..e136a10 100644 +--- a/include/llvm/MC/MCInst.h ++++ b/include/llvm/MC/MCInst.h +@@ -187,7 +187,7 @@ public: + using const_iterator = SmallVectorImpl::const_iterator; + + void clear() { Operands.clear(); } +- void erase(iterator I) { Operands.erase(I); } ++ iterator erase(iterator I) { return Operands.erase(I); } + size_t size() const { return Operands.size(); } + iterator begin() { return Operands.begin(); } + const_iterator begin() const { return Operands.begin(); } +diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h +index c99f252..e6b4a88 100644 +--- a/include/llvm/MC/MCObjectFileInfo.h ++++ b/include/llvm/MC/MCObjectFileInfo.h +@@ -65,6 +65,9 @@ protected: + /// constants. + MCSection *ReadOnlySection; + ++ /// Same as the above but for infrequently used data. ++ MCSection *ReadOnlyColdSection; ++ + /// If exception handling is supported by the target, this is the section the + /// Language Specific Data Area information is emitted to. 
+ MCSection *LSDASection; +@@ -230,6 +233,7 @@ public: + MCSection *getDataSection() const { return DataSection; } + MCSection *getBSSSection() const { return BSSSection; } + MCSection *getReadOnlySection() const { return ReadOnlySection; } ++ MCSection *getReadOnlyColdSection() const { return ReadOnlyColdSection; } + MCSection *getLSDASection() const { return LSDASection; } + MCSection *getCompactUnwindSection() const { return CompactUnwindSection; } + MCSection *getDwarfAbbrevSection() const { return DwarfAbbrevSection; } +diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h +index 8e9b4ac..d2c569e 100644 +--- a/include/llvm/MC/MCObjectStreamer.h ++++ b/include/llvm/MC/MCObjectStreamer.h +@@ -121,6 +121,8 @@ public: + unsigned MaxBytesToEmit = 0) override; + void EmitCodeAlignment(unsigned ByteAlignment, + unsigned MaxBytesToEmit = 0) override; ++ void EmitNeverAlignCodeAtEnd(unsigned ByteAlignment, int64_t Value = 0, ++ unsigned ValueSize = 1) override; + void emitValueToOffset(const MCExpr *Offset, unsigned char Value, + SMLoc Loc) override; + void +diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h +index 582a836..0b15454 100644 +--- a/include/llvm/MC/MCStreamer.h ++++ b/include/llvm/MC/MCStreamer.h +@@ -199,7 +199,7 @@ class MCStreamer { + + /// \brief Tracks an index to represent the order a symbol was emitted in. + /// Zero means we did not emit that symbol. +- DenseMap SymbolOrdering; ++ unsigned SymbolOrdering = 1; + + /// \brief This is stack of current and previous section values saved by + /// PushSection. +@@ -290,8 +290,8 @@ public: + /// If the comment includes embedded \n's, they will each get the comment + /// prefix as appropriate. The added comment should not end with a \n. + /// By default, each comment is terminated with an end of line, i.e. the +- /// EOL param is set to true by default. 
If one prefers not to end the +- /// comment with a new line then the EOL param should be passed ++ /// EOL param is set to true by default. If one prefers not to end the ++ /// comment with a new line then the EOL param should be passed + /// with a false value. + virtual void AddComment(const Twine &T, bool EOL = true) {} + +@@ -338,9 +338,7 @@ public: + + /// \brief Returns an index to represent the order a symbol was emitted in. + /// (zero if we did not emit that symbol) +- unsigned GetSymbolOrder(const MCSymbol *Sym) const { +- return SymbolOrdering.lookup(Sym); +- } ++ unsigned GetSymbolOrder(const MCSymbol *Sym) const; + + /// \brief Update streamer for a new active section. + /// +@@ -608,6 +606,10 @@ public: + + virtual void EmitSLEB128Value(const MCExpr *Value); + ++ /// \brief Like EmitULEB128Value but pads the output to specific number of ++ /// bytes. ++ void EmitPaddedULEB128IntValue(uint64_t Value, unsigned PadTo); ++ + /// \brief Special case of EmitULEB128Value that avoids the client having to + /// pass in a MCExpr for constant integers. + void EmitULEB128IntValue(uint64_t Value); +@@ -726,6 +728,12 @@ public: + virtual void EmitCodeAlignment(unsigned ByteAlignment, + unsigned MaxBytesToEmit = 0); + ++ /// \brief If the end of the following fragment ever gets aligned to ++ /// \p ByteAlignment, emit a single nop or \t Value to break this alignment. ++ virtual void EmitNeverAlignCodeAtEnd(unsigned ByteAlignment, ++ int64_t Value = 0, ++ unsigned ValueSize = 1); ++ + /// \brief Emit some number of copies of \p Value until the byte offset \p + /// Offset is reached. + /// +@@ -904,6 +912,8 @@ public: + + virtual void EmitSyntaxDirective(); + ++ virtual void EmitCFIInstruction(const MCCFIInstruction &Inst); ++ + /// \brief Emit a .reloc directive. + /// Returns true if the relocation could not be emitted because Name is not + /// known. 
+diff --git a/include/llvm/MC/MCSymbol.h b/include/llvm/MC/MCSymbol.h +index cc8fc02..7b7835e 100644 +--- a/include/llvm/MC/MCSymbol.h ++++ b/include/llvm/MC/MCSymbol.h +@@ -120,10 +120,15 @@ protected: + + /// The Flags field is used by object file implementations to store + /// additional per symbol information which is not easily classified. +- enum : unsigned { NumFlagsBits = 16 }; ++ enum : unsigned { NumFlagsBits = 15 }; + mutable uint32_t Flags : NumFlagsBits; + +- /// Index field, for use by the object file implementation. ++ /// Indicates if the next field is used for Index or Order. ++ mutable bool IsIndex : 1; ++ ++ /// Index field for use by the object file implementation. It is also used to ++ /// represent order of the symbol. The semantics of the current value is ++ /// indicated by IsIndex field. + mutable uint32_t Index = 0; + + union { +@@ -154,7 +159,7 @@ protected: + : IsTemporary(isTemporary), IsRedefinable(false), IsUsed(false), + IsRegistered(false), IsExternal(false), IsPrivateExtern(false), + Kind(Kind), IsUsedInReloc(false), SymbolContents(SymContentsUnset), +- CommonAlignLog2(0), Flags(0) { ++ CommonAlignLog2(0), Flags(0), IsIndex{false} { + Offset = 0; + FragmentAndHasName.setInt(!!Name); + if (Name) +@@ -308,11 +313,27 @@ public: + + /// Get the (implementation defined) index. + uint32_t getIndex() const { ++ assert(IsIndex && "Index unavailable"); + return Index; + } + + /// Set the (implementation defined) index. + void setIndex(uint32_t Value) const { ++ assert((IsIndex = true, true) && "assertion-specific code"); ++ Index = Value; ++ } ++ ++ bool hasIndex() const { return IsIndex; } ++ ++ /// Get the (implementation defined) order. ++ uint32_t getOrder() const { ++ assert(!IsIndex && "Order unavailable"); ++ return Index; ++ } ++ ++ /// Set the (implementation defined) order. 
++ void setOrder(uint32_t Value) const { ++ assert((IsIndex = false, true) && "assertion-specific code"); + Index = Value; + } + +diff --git a/include/llvm/Object/COFF.h b/include/llvm/Object/COFF.h +index 9190149..25646fe 100644 +--- a/include/llvm/Object/COFF.h ++++ b/include/llvm/Object/COFF.h +@@ -899,6 +899,7 @@ protected: + bool isSectionData(DataRefImpl Sec) const override; + bool isSectionBSS(DataRefImpl Sec) const override; + bool isSectionVirtual(DataRefImpl Sec) const override; ++ bool isSectionReadOnly(DataRefImpl Sec) const override; + relocation_iterator section_rel_begin(DataRefImpl Sec) const override; + relocation_iterator section_rel_end(DataRefImpl Sec) const override; + +diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h +index 46504e7..836fd8d 100644 +--- a/include/llvm/Object/ELF.h ++++ b/include/llvm/Object/ELF.h +@@ -127,6 +127,18 @@ public: + getHeader()->getDataEncoding() == ELF::ELFDATA2LSB; + } + ++ Expected dynamic_table_begin(const Elf_Phdr *Phdr) const; ++ Expected dynamic_table_end(const Elf_Phdr *Phdr) const; ++ Expected dynamic_table(const Elf_Phdr *Phdr) const { ++ Expected Begin = dynamic_table_begin(Phdr); ++ if (!Begin) ++ return Begin.takeError(); ++ Expected End = dynamic_table_end(Phdr); ++ if (!End) ++ return End.takeError(); ++ return makeArrayRef(Begin.get(), End.get()); ++ } ++ + Expected sections() const; + + Expected symbols(const Elf_Shdr *Sec) const { +@@ -397,6 +409,34 @@ void ELFFile::getRelocationTypeName(uint32_t Type, + } + + template ++Expected::Elf_Dyn *> ++ELFFile::dynamic_table_begin(const Elf_Phdr *Phdr) const { ++ if (!Phdr) ++ return nullptr; ++ assert(Phdr->p_type == ELF::PT_DYNAMIC && "Got the wrong program header"); ++ uintX_t Offset = Phdr->p_offset; ++ if (Offset > Buf.size()) ++ return make_error("Could not read dynamic table"); ++ return reinterpret_cast(base() + Offset); ++} ++ ++template ++Expected::Elf_Dyn *> ++ELFFile::dynamic_table_end(const Elf_Phdr *Phdr) const { ++ if 
(!Phdr) ++ return nullptr; ++ assert(Phdr->p_type == ELF::PT_DYNAMIC && "Got the wrong program header"); ++ uintX_t Size = Phdr->p_filesz; ++ if (Size % sizeof(Elf_Dyn)) ++ return make_error("Invalid dynamic table size"); ++ // FIKME: Check for overflow? ++ uintX_t End = Phdr->p_offset + Size; ++ if (End > Buf.size()) ++ return make_error("Could not read dynamic table"); ++ return reinterpret_cast(base() + End); ++} ++ ++template + Expected + ELFFile::getRelocationSymbol(const Elf_Rel *Rel, + const Elf_Shdr *SymTab) const { +diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h +index 4d00103..06a6295 100644 +--- a/include/llvm/Object/ELFObjectFile.h ++++ b/include/llvm/Object/ELFObjectFile.h +@@ -254,6 +254,7 @@ protected: + bool isSectionData(DataRefImpl Sec) const override; + bool isSectionBSS(DataRefImpl Sec) const override; + bool isSectionVirtual(DataRefImpl Sec) const override; ++ bool isSectionReadOnly(DataRefImpl Sec) const override; + relocation_iterator section_rel_begin(DataRefImpl Sec) const override; + relocation_iterator section_rel_end(DataRefImpl Sec) const override; + section_iterator getRelocatedSection(DataRefImpl Sec) const override; +@@ -717,6 +718,14 @@ bool ELFObjectFile::isSectionVirtual(DataRefImpl Sec) const { + } + + template ++bool ELFObjectFile::isSectionReadOnly(DataRefImpl Sec) const { ++ const Elf_Shdr *EShdr = getSection(Sec); ++ return EShdr->sh_flags & ELF::SHF_ALLOC && ++ !(EShdr->sh_flags & ELF::SHF_WRITE) && ++ EShdr->sh_type == ELF::SHT_PROGBITS; ++} ++ ++template + relocation_iterator + ELFObjectFile::section_rel_begin(DataRefImpl Sec) const { + DataRefImpl RelData; +@@ -751,9 +760,6 @@ ELFObjectFile::section_rel_end(DataRefImpl Sec) const { + template + section_iterator + ELFObjectFile::getRelocatedSection(DataRefImpl Sec) const { +- if (EF.getHeader()->e_type != ELF::ET_REL) +- return section_end(); +- + const Elf_Shdr *EShdr = getSection(Sec); + uintX_t Type = EShdr->sh_type; + if (Type != 
ELF::SHT_REL && Type != ELF::SHT_RELA) +@@ -792,8 +798,6 @@ ELFObjectFile::getRelocationSymbol(DataRefImpl Rel) const { + + template + uint64_t ELFObjectFile::getRelocationOffset(DataRefImpl Rel) const { +- assert(EF.getHeader()->e_type == ELF::ET_REL && +- "Only relocatable object files have relocation offsets"); + const Elf_Shdr *sec = getRelSection(Rel); + if (sec->sh_type == ELF::SHT_REL) + return getRel(Rel)->r_offset; +diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h +index bfd3462..9be0b26 100644 +--- a/include/llvm/Object/MachO.h ++++ b/include/llvm/Object/MachO.h +@@ -320,6 +320,7 @@ public: + /// from offset 0 (i.e. the Mach-O header at the beginning of the file). + bool isSectionStripped(DataRefImpl Sec) const override; + ++ bool isSectionReadOnly(DataRefImpl Sec) const override; + relocation_iterator section_rel_begin(DataRefImpl Sec) const override; + relocation_iterator section_rel_end(DataRefImpl Sec) const override; + +@@ -331,7 +332,7 @@ public: + + relocation_iterator locrel_begin() const; + relocation_iterator locrel_end() const; +- ++ + void moveRelocationNext(DataRefImpl &Rel) const override; + uint64_t getRelocationOffset(DataRefImpl Rel) const override; + symbol_iterator getRelocationSymbol(DataRefImpl Rel) const override; +diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h +index 9c4ae94..6434272 100644 +--- a/include/llvm/Object/ObjectFile.h ++++ b/include/llvm/Object/ObjectFile.h +@@ -110,6 +110,7 @@ public: + bool isVirtual() const; + bool isBitcode() const; + bool isStripped() const; ++ bool isReadOnly() const; + + bool containsSymbol(SymbolRef S) const; + +@@ -238,6 +239,7 @@ protected: + virtual bool isSectionVirtual(DataRefImpl Sec) const = 0; + virtual bool isSectionBitcode(DataRefImpl Sec) const; + virtual bool isSectionStripped(DataRefImpl Sec) const; ++ virtual bool isSectionReadOnly(DataRefImpl Sec) const = 0; + virtual relocation_iterator section_rel_begin(DataRefImpl Sec) 
const = 0; + virtual relocation_iterator section_rel_end(DataRefImpl Sec) const = 0; + virtual section_iterator getRelocatedSection(DataRefImpl Sec) const; +@@ -442,6 +444,10 @@ inline bool SectionRef::isStripped() const { + return OwningObject->isSectionStripped(SectionPimpl); + } + ++inline bool SectionRef::isReadOnly() const { ++ return OwningObject->isSectionReadOnly(SectionPimpl); ++} ++ + inline relocation_iterator SectionRef::relocation_begin() const { + return OwningObject->section_rel_begin(SectionPimpl); + } +diff --git a/include/llvm/Object/Wasm.h b/include/llvm/Object/Wasm.h +index d49acf3a..5929a22 100644 +--- a/include/llvm/Object/Wasm.h ++++ b/include/llvm/Object/Wasm.h +@@ -177,6 +177,7 @@ public: + bool isSectionText(DataRefImpl Sec) const override; + bool isSectionData(DataRefImpl Sec) const override; + bool isSectionBSS(DataRefImpl Sec) const override; ++ bool isSectionReadOnly(DataRefImpl Sec) const override; + bool isSectionVirtual(DataRefImpl Sec) const override; + bool isSectionBitcode(DataRefImpl Sec) const override; + relocation_iterator section_rel_begin(DataRefImpl Sec) const override; +diff --git a/include/llvm/Support/ToolOutputFile.h b/include/llvm/Support/ToolOutputFile.h +index 7fd5f20..2a47ef1 100644 +--- a/include/llvm/Support/ToolOutputFile.h ++++ b/include/llvm/Support/ToolOutputFile.h +@@ -46,7 +46,7 @@ public: + /// This constructor's arguments are passed to raw_fd_ostream's + /// constructor. 
+ ToolOutputFile(StringRef Filename, std::error_code &EC, +- sys::fs::OpenFlags Flags); ++ sys::fs::OpenFlags Flags, unsigned Mode = 0666); + + ToolOutputFile(StringRef Filename, int FD); + +diff --git a/include/llvm/Support/X86DisassemblerDecoderCommon.h b/include/llvm/Support/X86DisassemblerDecoderCommon.h +index eeffb9c..2ec2496 100644 +--- a/include/llvm/Support/X86DisassemblerDecoderCommon.h ++++ b/include/llvm/Support/X86DisassemblerDecoderCommon.h +@@ -62,7 +62,8 @@ namespace X86Disassembler { + ENUM_ENTRY(ATTR_EVEXL2, (0x1 << 10)) \ + ENUM_ENTRY(ATTR_EVEXK, (0x1 << 11)) \ + ENUM_ENTRY(ATTR_EVEXKZ, (0x1 << 12)) \ +- ENUM_ENTRY(ATTR_EVEXB, (0x1 << 13)) ++ ENUM_ENTRY(ATTR_EVEXB, (0x1 << 13)) \ ++ ENUM_ENTRY(ATTR_3DNOW, (0x1 << 14)) + + #define ENUM_ENTRY(n, v) n = v, + enum attributeBits { +@@ -272,7 +273,8 @@ enum attributeBits { + ENUM_ENTRY(IC_EVEX_L2_W_KZ, 3, "requires EVEX_KZ, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ, 4, "requires EVEX_KZ, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ, 4, "requires EVEX_KZ, L2, W and XD prefix") \ +- ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, W and OpSize") ++ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, W and OpSize") \ ++ ENUM_ENTRY(IC_3DNOW, 8, "requires AMD 3DNow prefix 0f0f") + + #define ENUM_ENTRY(n, r, d) n, + enum InstructionContext { +diff --git a/include/llvm/Support/raw_ostream.h b/include/llvm/Support/raw_ostream.h +index d11f5a8..0ad115c 100644 +--- a/include/llvm/Support/raw_ostream.h ++++ b/include/llvm/Support/raw_ostream.h +@@ -393,7 +393,7 @@ public: + /// STDOUT_FILENO instead of opening a file. This will not close the stdout + /// descriptor. + raw_fd_ostream(StringRef Filename, std::error_code &EC, +- sys::fs::OpenFlags Flags); ++ sys::fs::OpenFlags Flags, unsigned Mode = 0666); + + /// FD is the file descriptor that this writes to. If ShouldClose is true, + /// this closes the file when the stream is destroyed. 
If FD is for stdout or +diff --git a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp +index adada67..c9c7997 100644 +--- a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp ++++ b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp +@@ -38,7 +38,7 @@ DWARFAbbreviationDeclaration::DWARFAbbreviationDeclaration() { + } + + bool +-DWARFAbbreviationDeclaration::extract(DataExtractor Data, ++DWARFAbbreviationDeclaration::extract(DataExtractor Data, + uint32_t* OffsetPtr) { + clear(); + const uint32_t Offset = *OffsetPtr; +@@ -61,13 +61,15 @@ DWARFAbbreviationDeclaration::extract(DataExtractor Data, + + // Read all of the abbreviation attributes and forms. + while (true) { ++ uint32_t AOff = *OffsetPtr; + auto A = static_cast(Data.getULEB128(OffsetPtr)); ++ uint32_t FOff = *OffsetPtr; + auto F = static_cast
(Data.getULEB128(OffsetPtr)); + if (A && F) { + bool IsImplicitConst = (F == DW_FORM_implicit_const); + if (IsImplicitConst) { + int64_t V = Data.getSLEB128(OffsetPtr); +- AttributeSpecs.push_back(AttributeSpec(A, F, V)); ++ AttributeSpecs.push_back(AttributeSpec(A, F, V, AOff, FOff)); + continue; + } + Optional ByteSize; +@@ -109,7 +111,7 @@ DWARFAbbreviationDeclaration::extract(DataExtractor Data, + break; + } + // Record this attribute and its fixed size if it has one. +- AttributeSpecs.push_back(AttributeSpec(A, F, ByteSize)); ++ AttributeSpecs.push_back(AttributeSpec(A, F, ByteSize, AOff, FOff)); + } else if (A == 0 && F == 0) { + // We successfully reached the end of this abbreviation declaration + // since both attribute and form are zero. +@@ -139,6 +141,15 @@ void DWARFAbbreviationDeclaration::dump(raw_ostream &OS) const { + OS << '\n'; + } + ++const DWARFAbbreviationDeclaration::AttributeSpec * ++DWARFAbbreviationDeclaration::findAttribute(dwarf::Attribute Attr) const { ++ for (uint32_t i = 0, e = AttributeSpecs.size(); i != e; ++i) { ++ if (AttributeSpecs[i].Attr == Attr) ++ return &AttributeSpecs[i]; ++ } ++ return nullptr; ++} ++ + Optional + DWARFAbbreviationDeclaration::findAttributeIndex(dwarf::Attribute Attr) const { + for (uint32_t i = 0, e = AttributeSpecs.size(); i != e; ++i) { +@@ -150,7 +161,7 @@ DWARFAbbreviationDeclaration::findAttributeIndex(dwarf::Attribute Attr) const { + + Optional DWARFAbbreviationDeclaration::getAttributeValue( + const uint32_t DIEOffset, const dwarf::Attribute Attr, +- const DWARFUnit &U) const { ++ const DWARFUnit &U, uint32_t *OffsetPtr) const { + Optional MatchAttrIndex = findAttributeIndex(Attr); + if (!MatchAttrIndex) + return None; +@@ -164,6 +175,8 @@ Optional DWARFAbbreviationDeclaration::getAttributeValue( + for (const auto &Spec : AttributeSpecs) { + if (*MatchAttrIndex == AttrIndex) { + // We have arrived at the attribute to extract, extract if from Offset. 
++ if (OffsetPtr) ++ *OffsetPtr = Offset; + DWARFFormValue FormValue(Spec.Form); + if (Spec.isImplicitConst()) { + FormValue.setSValue(Spec.getImplicitConstValue()); +diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp +index 3a974dd..65bd4a6 100644 +--- a/lib/DebugInfo/DWARF/DWARFContext.cpp ++++ b/lib/DebugInfo/DWARF/DWARFContext.cpp +@@ -681,6 +681,15 @@ const DWARFDebugLoc *DWARFContext::getDebugLoc() { + return Loc.get(); + } + ++Optional ++DWARFContext::getOneDebugLocList(uint32_t *Offset) { ++ DWARFDebugLoc L; ++ DWARFDataExtractor LocData(*DObj, DObj->getLocSection(), isLittleEndian(), ++ getCompileUnitAtIndex(0)->getAddressByteSize()); ++ ++ return L.parseOneLocationList(LocData, Offset); ++} ++ + const DWARFDebugLocDWO *DWARFContext::getDebugLocDWO() { + if (LocDWO) + return LocDWO.get(); +@@ -726,7 +735,8 @@ const DWARFDebugFrame *DWARFContext::getEHFrame() { + + DWARFDataExtractor debugFrameData(DObj->getEHFrameSection(), isLittleEndian(), + DObj->getAddressSize()); +- DebugFrame.reset(new DWARFDebugFrame(true /* IsEH */)); ++ DebugFrame.reset( ++ new DWARFDebugFrame(true /* IsEH */, DObj->getEHFrameAddress())); + DebugFrame->parse(debugFrameData); + return DebugFrame.get(); + } +@@ -809,6 +819,19 @@ DWARFContext::getLineTableForUnit(DWARFUnit *U) { + return Line->getOrParseLineTable(lineData, stmtOffset, *this, U); + } + ++uint32_t DWARFContext::getAttrFieldOffsetForUnit(DWARFUnit *U, ++ dwarf::Attribute Attr) const { ++ const auto UnitDIE = U->getUnitDIE(); ++ if (!UnitDIE) ++ return 0; ++ ++ uint32_t Offset = 0; ++ if (!UnitDIE.find(Attr, &Offset)) ++ return 0; ++ ++ return Offset; ++} ++ + void DWARFContext::parseCompileUnits() { + CUs.parse(*this, DObj->getInfoSection()); + } +@@ -1244,6 +1267,9 @@ class DWARFObjInMemory final : public DWARFObject { + + SmallVector, 4> UncompressedSections; + ++ uint64_t EHFrameAddress{0}; ++ bool UsesRelocs{true}; ++ + StringRef *mapSectionToMember(StringRef Name) { + if 
(DWARFSection *Sec = mapNameToDWARFSection(Name)) + return &Sec->Data; +@@ -1300,10 +1326,11 @@ public: + } + } + DWARFObjInMemory(const object::ObjectFile &Obj, const LoadedObjectInfo *L, +- function_ref HandleError) ++ function_ref HandleError, ++ bool UsesRelocs = true) + : IsLittleEndian(Obj.isLittleEndian()), + AddressSize(Obj.getBytesInAddress()), FileName(Obj.getFileName()), +- Obj(&Obj) { ++ Obj(&Obj), UsesRelocs(UsesRelocs) { + + StringMap SectionAmountMap; + for (const SectionRef &Section : Obj.sections()) { +@@ -1350,6 +1377,8 @@ public: + if (Name == "debug_ranges") { + // FIXME: Use the other dwo range section when we emit it. + RangeDWOSection.Data = Data; ++ } else if (Name == "eh_frame") { ++ EHFrameAddress = Section.getAddress(); + } + } else if (Name == "debug_types") { + // Find debug_types data by section rather than name as there are +@@ -1402,7 +1431,7 @@ public: + continue; + } + +- if (Section.relocation_begin() == Section.relocation_end()) ++ if (Section.relocation_begin() == Section.relocation_end() || !UsesRelocs) + continue; + + // Symbol to [address, section index] cache mapping. 
+@@ -1445,6 +1474,8 @@ public: + + Optional find(const DWARFSection &S, + uint64_t Pos) const override { ++ if (!UsesRelocs) ++ return None; + auto &Sec = static_cast(S); + RelocAddrMap::const_iterator AI = Sec.Relocs.find(Pos); + if (AI == Sec.Relocs.end()) +@@ -1499,6 +1530,7 @@ public: + StringRef getARangeSection() const override { return ARangeSection; } + StringRef getDebugFrameSection() const override { return DebugFrameSection; } + StringRef getEHFrameSection() const override { return EHFrameSection; } ++ uint64_t getEHFrameAddress() const override { return EHFrameAddress; } + const DWARFSection &getLineSection() const override { return LineSection; } + StringRef getStringSection() const override { return StringSection; } + const DWARFSection &getRangeSection() const override { return RangeSection; } +@@ -1544,8 +1576,9 @@ public: + std::unique_ptr + DWARFContext::create(const object::ObjectFile &Obj, const LoadedObjectInfo *L, + function_ref HandleError, +- std::string DWPName) { +- auto DObj = llvm::make_unique(Obj, L, HandleError); ++ std::string DWPName, bool UsesRelocs) { ++ auto DObj = ++ llvm::make_unique(Obj, L, HandleError, UsesRelocs); + return llvm::make_unique(std::move(DObj), std::move(DWPName)); + } + +diff --git a/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp b/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp +index 03e3174..0436778 100644 +--- a/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp ++++ b/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp +@@ -7,6 +7,7 @@ + // + //===----------------------------------------------------------------------===// + ++#include "llvm/BinaryFormat/Dwarf.h" + #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" + #include "llvm/BinaryFormat/Dwarf.h" + #include "llvm/DebugInfo/DWARF/DWARFContext.h" +diff --git a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp +index b9dc215..40a5790 100644 +--- a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp ++++ b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp +@@ -352,7 
+352,8 @@ static void LLVM_ATTRIBUTE_NORETURN ReportError(uint32_t StartOffset, + report_fatal_error(Str); + } + +-void DWARFDebugFrame::parse(DWARFDataExtractor Data) { ++void DWARFDebugFrame::parse(DWARFDataExtractor Data, ++ RefHandlerType RefHandler) { + uint32_t Offset = 0; + DenseMap CIEs; + +@@ -369,6 +370,9 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { + // length. + IsDWARF64 = true; + Length = Data.getU64(&Offset); ++ } else if (Length == 0) { ++ // Skip empty entry. ++ continue; + } + + // At this point, Offset points to the next field after Length. +@@ -425,6 +429,8 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { + Personality = Data.getEncodedPointer( + &Offset, *PersonalityEncoding, + EHFrameAddress ? EHFrameAddress + Offset : 0); ++ if (RefHandler) ++ RefHandler(*Personality, Offset, *PersonalityEncoding); + break; + } + case 'R': +@@ -478,6 +484,8 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { + EHFrameAddress ? EHFrameAddress + Offset : 0)) { + InitialLocation = *Val; + } ++ if (RefHandler) ++ RefHandler(InitialLocation, Offset, Cie->getFDEPointerEncoding()); + if (auto Val = Data.getEncodedPointer( + &Offset, Cie->getFDEPointerEncoding(), 0)) { + AddressRange = *Val; +@@ -496,6 +504,8 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { + LSDAAddress = Data.getEncodedPointer( + &Offset, Cie->getLSDAPointerEncoding(), + EHFrameAddress ? 
Offset + EHFrameAddress : 0); ++ if (RefHandler) ++ RefHandler(*LSDAAddress, Offset, Cie->getLSDAPointerEncoding()); + } + + if (Offset != EndAugmentationOffset) +@@ -531,6 +541,13 @@ FrameEntry *DWARFDebugFrame::getEntryAtOffset(uint64_t Offset) const { + return nullptr; + } + ++void DWARFDebugFrame::for_each_FDE(FDEFunction F) const { ++ for (const auto &Entry : Entries) { ++ if (const auto *FDE = dyn_cast(Entry.get())) ++ F(FDE); ++ } ++} ++ + void DWARFDebugFrame::dump(raw_ostream &OS, const MCRegisterInfo *MRI, + Optional Offset) const { + if (Offset) { +diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp +index 7ae38e6..f1fd34a 100644 +--- a/lib/DebugInfo/DWARF/DWARFDie.cpp ++++ b/lib/DebugInfo/DWARF/DWARFDie.cpp +@@ -270,12 +270,13 @@ bool DWARFDie::isSubroutineDIE() const { + return Tag == DW_TAG_subprogram || Tag == DW_TAG_inlined_subroutine; + } + +-Optional DWARFDie::find(dwarf::Attribute Attr) const { ++Optional DWARFDie::find(dwarf::Attribute Attr, ++ uint32_t *OffsetPtr) const { + if (!isValid()) + return None; + auto AbbrevDecl = getAbbreviationDeclarationPtr(); + if (AbbrevDecl) +- return AbbrevDecl->getAttributeValue(getOffset(), Attr, *U); ++ return AbbrevDecl->getAttributeValue(getOffset(), Attr, *U, OffsetPtr); + return None; + } + +diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +index 3d274b6..cef29f4 100644 +--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp ++++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +@@ -175,6 +175,12 @@ static Error getOffset(const SymbolRef &Sym, SectionRef Sec, + return Error::success(); + } + ++void RuntimeDyldImpl::mapSectionAddress(unsigned SectionID, ++ uint64_t TargetAddress) { ++ MutexGuard locked(lock); ++ reassignSectionAddress(SectionID, TargetAddress); ++} ++ + Expected + RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) { + MutexGuard locked(lock); +@@ -585,6 +591,10 @@ unsigned 
RuntimeDyldImpl::computeGOTSize(const ObjectFile &Obj) { + // compute stub buffer size for the given section + unsigned RuntimeDyldImpl::computeSectionStubBufSize(const ObjectFile &Obj, + const SectionRef &Section) { ++ if (!MemMgr.allowStubAllocation()) { ++ return 0; ++ } ++ + unsigned StubSize = getMaxStubSize(); + if (StubSize == 0) { + return 0; +@@ -795,9 +805,12 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj, + // to handle later processing (and by 'handle' I mean don't do anything + // with these sections). + Allocate = 0; +- Addr = nullptr; ++ Addr = ++ MemMgr.recordNoteSection(reinterpret_cast(data.data()), ++ DataSize, Alignment, SectionID, Name); + DEBUG(dbgs() << "emitSection SectionID: " << SectionID << " Name: " << Name +- << " obj addr: " << format("%p", data.data()) << " new addr: 0" ++ << " obj addr: " << format("%p", data.data()) ++ << " new addr: " << format("%p", Addr) + << " DataSize: " << DataSize << " StubBufSize: " << StubBufSize + << " Allocate: " << Allocate << "\n"); + } +@@ -1064,7 +1077,7 @@ Error RuntimeDyldImpl::resolveExternalSymbols() { + } + + // FIXME: Implement error handling that doesn't kill the host program! 
+- if (!Addr) ++ if (!Addr && !Resolver.allowsZeroSymbols()) + report_fatal_error("Program used external function '" + Name + + "' which could not be resolved!"); + +@@ -1215,6 +1228,11 @@ void RuntimeDyld::mapSectionAddress(const void *LocalAddress, + Dyld->mapSectionAddress(LocalAddress, TargetAddress); + } + ++void RuntimeDyld::mapSectionAddress(unsigned SectionID, ++ uint64_t TargetAddress) { ++ Dyld->mapSectionAddress(SectionID, TargetAddress); ++} ++ + bool RuntimeDyld::hasError() { return Dyld->hasError(); } + + StringRef RuntimeDyld::getErrorString() { return Dyld->getErrorString(); } +diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +index 36b43ec9..3dc3e8f 100644 +--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp ++++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +@@ -270,6 +270,25 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section, + break; + case ELF::R_X86_64_NONE: + break; ++ case ELF::R_X86_64_8: { ++ Value += Addend; ++ assert((int64_t)Value <= INT8_MAX && (int64_t)Value >= INT8_MIN); ++ uint8_t TruncatedAddr = (Value & 0xFF); ++ *Section.getAddressWithOffset(Offset) = TruncatedAddr; ++ DEBUG(dbgs() << "Writing " << format("%p", TruncatedAddr) << " at " ++ << format("%p\n", Section.getAddressWithOffset(Offset))); ++ break; ++ } ++ case ELF::R_X86_64_16: { ++ Value += Addend; ++ assert((int64_t)Value <= INT16_MAX && (int64_t)Value >= INT16_MIN); ++ uint16_t TruncatedAddr = (Value & 0xFFFF); ++ support::ulittle16_t::ref(Section.getAddressWithOffset(Offset)) = ++ TruncatedAddr; ++ DEBUG(dbgs() << "Writing " << format("%p", TruncatedAddr) << " at " ++ << format("%p\n", Section.getAddressWithOffset(Offset))); ++ break; ++ } + case ELF::R_X86_64_64: { + support::ulittle64_t::ref(Section.getAddressWithOffset(Offset)) = + Value + Addend; +@@ -390,6 +409,26 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section, + case 
ELF::R_AARCH64_PREL64: + write(isBE, TargetPtr, Value + Addend - FinalAddress); + break; ++ case ELF::R_AARCH64_CONDBR19: { ++ uint64_t BranchImm = Value + Addend - FinalAddress; ++ ++ assert(isInt<21>(BranchImm)); ++ ++ *TargetPtr &= 0xff00001fU; ++ // Immediate:20:2 goes in bits 23:5 of Bcc, CBZ, CBNZ ++ *TargetPtr |= static_cast(BranchImm & 0x1ffffcU) << 3; ++ break; ++ } ++ case ELF::R_AARCH64_TSTBR14: { ++ uint64_t BranchImm = Value + Addend - FinalAddress; ++ ++ assert(isInt<16>(BranchImm)); ++ ++ *TargetPtr &= 0xfff8001fU; ++ // Immediate:15:2 goes in bits 18:5 of TBZ, TBNZ ++ *TargetPtr |= static_cast(BranchImm & 0xffffffcU) << 3; ++ break; ++ } + case ELF::R_AARCH64_CALL26: // fallthrough + case ELF::R_AARCH64_JUMP26: { + // Operation: S+A-P. Set Call or B immediate value to bits fff_fffc of the +@@ -462,6 +501,33 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section, + // from bits 11:4 of X + or32AArch64Imm(TargetPtr, getBits(Value + Addend, 4, 11)); + break; ++ case ELF::R_AARCH64_LD_PREL_LO19: { ++ // Operation: S + A - P ++ uint64_t Result = Value + Addend - FinalAddress; ++ ++ // "Check that -2^20 <= result < 2^20". ++ assert(isInt<21>(Result)); ++ ++ *TargetPtr &= 0xff00001fU; ++ // Immediate goes in bits 23:5 of LD imm instruction, taken ++ // from bits 20:2 of X ++ *TargetPtr |= ((Result & 0xffc) << (5 - 2)); ++ break; ++ } ++ case ELF::R_AARCH64_ADR_PREL_LO21: { ++ // Operation: S + A - P ++ uint64_t Result = Value + Addend - FinalAddress; ++ ++ // "Check that -2^20 <= result < 2^20". 
++ assert(isInt<21>(Result)); ++ ++ *TargetPtr &= 0x9f00001fU; ++ // Immediate goes in bits 23:5, 30:29 of ADR imm instruction, taken ++ // from bits 20:0 of X ++ *TargetPtr |= ((Result & 0xffc) << (5 - 2)); ++ *TargetPtr |= (Result & 0x3) << 29; ++ break; ++ } + } + } + +@@ -1173,7 +1239,9 @@ RuntimeDyldELF::processRelocationRef( + DEBUG(dbgs() << "\t\tSectionID: " << SectionID << " Offset: " << Offset + << "\n"); + if ((Arch == Triple::aarch64 || Arch == Triple::aarch64_be)) { +- if (RelType == ELF::R_AARCH64_CALL26 || RelType == ELF::R_AARCH64_JUMP26) { ++ if ((RelType == ELF::R_AARCH64_CALL26 || ++ RelType == ELF::R_AARCH64_JUMP26) && ++ MemMgr.allowStubAllocation()) { + resolveAArch64Branch(SectionID, Value, RelI, Stubs); + } else if (RelType == ELF::R_AARCH64_ADR_GOT_PAGE) { + // Craete new GOT entry or find existing one. If GOT entry is +@@ -1410,7 +1478,7 @@ RuntimeDyldELF::processRelocationRef( + } else { + processSimpleRelocation(SectionID, Offset, RelType, Value); + } +- ++ + } else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le) { + if (RelType == ELF::R_PPC64_REL24) { + // Determine ABI variant in use for this object. +@@ -1632,7 +1700,7 @@ RuntimeDyldELF::processRelocationRef( + // equivalent to the usual PLT implementation except that we use the stub + // mechanism in RuntimeDyld (which puts stubs at the end of the section) + // rather than allocating a PLT section. +- if (Value.SymbolName) { ++ if (Value.SymbolName && MemMgr.allowStubAllocation()) { + // This is a call to an external function. + // Look for an existing stub. 
+ SectionEntry &Section = Sections[SectionID]; +@@ -1674,9 +1742,8 @@ RuntimeDyldELF::processRelocationRef( + resolveRelocation(Section, Offset, StubAddress, ELF::R_X86_64_PC32, + Addend); + } else { +- RelocationEntry RE(SectionID, Offset, ELF::R_X86_64_PC32, Value.Addend, +- Value.Offset); +- addRelocationForSection(RE, Value.SectionID); ++ Value.Addend += support::ulittle32_t::ref(computePlaceholderAddress(SectionID, Offset)); ++ processSimpleRelocation(SectionID, Offset, ELF::R_X86_64_PC32, Value); + } + } else if (RelType == ELF::R_X86_64_GOTPCREL || + RelType == ELF::R_X86_64_GOTPCRELX || +diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +index 766a9b2..a36c791 100644 +--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h ++++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +@@ -540,6 +540,8 @@ public: + + void mapSectionAddress(const void *LocalAddress, uint64_t TargetAddress); + ++ void mapSectionAddress(unsigned SectionID, uint64_t TargetAddress); ++ + // Is the linker in an error state? 
+ bool hasError() { return HasError; } + +diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp +index a0f9a85..be32963 100644 +--- a/lib/MC/MCAssembler.cpp ++++ b/lib/MC/MCAssembler.cpp +@@ -318,6 +318,34 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, + return Size; + } + ++ case MCFragment::FT_NeverAlign: { ++ const MCNeverAlignFragment &NAF = cast(F); ++ uint64_t Offset = Layout.getFragmentOffset(&NAF); ++ unsigned Size = 0; ++ uint64_t OffsetToAvoid = 0; ++ // Calculate offset to avoid in order to avoid aligning the end of the ++ // next fragment ++ if (const auto *NextFrag = dyn_cast(F.getNextNode())) { ++ OffsetToAvoid = NAF.getAlignment() - ++ (NextFrag->getContents().size() % NAF.getAlignment()); ++ } else if (const auto *NextFrag = ++ dyn_cast(F.getNextNode())) { ++ OffsetToAvoid = NAF.getAlignment() - ++ (NextFrag->getContents().size() % NAF.getAlignment()); ++ } ++ // Check if the current offset matches the alignment plus offset we want to ++ // avoid ++ if (Offset % NAF.getAlignment() == OffsetToAvoid) { ++ // Avoid this alignment by introducing one extra byte ++ Size = 1; ++ if (Size > 0 && NAF.hasEmitNops()) { ++ while (Size % getBackend().getMinimumNopSize()) ++ Size += 1; ++ } ++ } ++ return Size; ++ } ++ + case MCFragment::FT_Org: { + const MCOrgFragment &OF = cast(F); + MCValue Value; +@@ -525,6 +553,35 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout, + break; + } + ++ case MCFragment::FT_NeverAlign: { ++ const MCNeverAlignFragment &NAF = cast(F); ++ assert(NAF.getValueSize() && "Invalid virtual align in concrete fragment!"); ++ ++ uint64_t Count = FragmentSize / NAF.getValueSize(); ++ if (Count == 0) ++ break; ++ assert(Count * NAF.getValueSize() == FragmentSize); ++ ++ if (NAF.hasEmitNops()) { ++ if (!Asm.getBackend().writeNopData(Count, OW)) ++ report_fatal_error("unable to write nop sequence of " + ++ Twine(Count) + " bytes"); ++ break; ++ } ++ ++ // Otherwise, write out in 
multiples of the value size. ++ for (uint64_t i = 0; i != Count; ++i) { ++ switch (NAF.getValueSize()) { ++ default: llvm_unreachable("Invalid size!"); ++ case 1: OW->write8 (uint8_t (NAF.getValue())); break; ++ case 2: OW->write16(uint16_t(NAF.getValue())); break; ++ case 4: OW->write32(uint32_t(NAF.getValue())); break; ++ case 8: OW->write64(uint64_t(NAF.getValue())); break; ++ } ++ } ++ break; ++ } ++ + case MCFragment::FT_Data: + ++stats::EmittedDataFragments; + OW->writeBytes(cast(F).getContents()); +@@ -651,6 +708,11 @@ void MCAssembler::writeSectionData(const MCSection *Sec, + cast(F).getValue() == 0) && + "Invalid align in virtual section!"); + break; ++ case MCFragment::FT_NeverAlign: ++ assert((cast(F).getValueSize() == 0 || ++ cast(F).getValue() == 0) && ++ "Invalid neveralign in virtual section!"); ++ break; + case MCFragment::FT_Fill: + assert((cast(F).getValue() == 0) && + "Invalid fill in virtual section!"); +diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp +index 0e0ea96..0044566 100644 +--- a/lib/MC/MCDwarf.cpp ++++ b/lib/MC/MCDwarf.cpp +@@ -41,6 +41,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -156,12 +157,36 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section, + unsigned Flags = DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0; + unsigned Isa = 0; + unsigned Discriminator = 0; ++ uint64_t LastAddress = -1ULL; ++ (void)LastAddress; + MCSymbol *LastLabel = nullptr; + + // Loop through each MCDwarfLineEntry and encode the dwarf line number table. 
+- for (const MCDwarfLineEntry &LineEntry : LineEntries) { ++ for (auto it = LineEntries.begin(), ++ ie = LineEntries.end(); ++ it != ie; ++it) { ++ const MCDwarfLineEntry &LineEntry = *it; + int64_t LineDelta = static_cast(LineEntry.getLine()) - LastLine; + ++ uint64_t Address = LineEntry.getAbsoluteAddr(); ++ if (Address != -1ULL) { ++ auto nit = it; ++ if (++nit == ie) { ++ // We are using a hacky way to update debug info for functions that we ++ // didn't rewrite. We don't have a code section context, and should ++ // emit end_sequence at the address indicated by the last entry. ++ MCOS->EmitIntValue(dwarf::DW_LNS_extended_op, 1); ++ MCOS->EmitIntValue(8 + 1, 1); ++ MCOS->EmitIntValue(dwarf::DW_LNE_set_address, 1); ++ MCOS->EmitIntValue(Address, 8); ++ MCDwarfLineAddr::Emit(MCOS, ++ MCOS->getAssembler().getDWARFLinetableParams(), ++ INT64_MAX, ++ 0); ++ return; ++ } ++ } ++ + if (FileNum != LineEntry.getFileNum()) { + FileNum = LineEntry.getFileNum(); + MCOS->EmitIntValue(dwarf::DW_LNS_set_file, 1); +@@ -197,18 +222,33 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section, + if (LineEntry.getFlags() & DWARF2_FLAG_EPILOGUE_BEGIN) + MCOS->EmitIntValue(dwarf::DW_LNS_set_epilogue_begin, 1); + +- MCSymbol *Label = LineEntry.getLabel(); ++ if (Address == -1ULL) { ++ assert(LastAddress == -1ULL && ++ "Absolute addresses can only be added at the end of the table."); ++ ++ MCSymbol *Label = LineEntry.getLabel(); + +- // At this point we want to emit/create the sequence to encode the delta in +- // line numbers and the increment of the address from the previous Label +- // and the current Label. +- const MCAsmInfo *asmInfo = MCOS->getContext().getAsmInfo(); +- MCOS->EmitDwarfAdvanceLineAddr(LineDelta, LastLabel, Label, +- asmInfo->getCodePointerSize()); ++ // At this point we want to emit/create the sequence to encode the delta in ++ // line numbers and the increment of the address from the previous Label ++ // and the current Label. 
++ const MCAsmInfo *asmInfo = MCOS->getContext().getAsmInfo(); ++ MCOS->EmitDwarfAdvanceLineAddr(LineDelta, LastLabel, Label, ++ asmInfo->getCodePointerSize()); ++ LastLabel = Label; ++ } else { ++ MCOS->EmitIntValue(dwarf::DW_LNS_extended_op, 1); ++ // DW_LNE_set_address length: pointer size (8) + 1 ++ MCOS->EmitIntValue(8 + 1, 1); ++ MCOS->EmitIntValue(dwarf::DW_LNE_set_address, 1); ++ MCOS->EmitIntValue(Address, 8); ++ MCOS->EmitIntValue(dwarf::DW_LNS_advance_line, 1); ++ MCOS->EmitSLEB128IntValue(LineDelta); ++ MCOS->EmitIntValue(dwarf::DW_LNS_copy, 1); ++ LastAddress = Address; ++ } + + Discriminator = 0; + LastLine = LineEntry.getLine(); +- LastLabel = Label; + } + + // Emit a DW_LNE_end_sequence for the end of the section. +@@ -250,7 +290,7 @@ void MCDwarfLineTable::Emit(MCObjectStreamer *MCOS, + MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfLineSection()); + + // Handle the rest of the Compile Units. +- for (const auto &CUIDTablePair : LineTables) ++ for (auto &CUIDTablePair : LineTables) + CUIDTablePair.second.EmitCU(MCOS, Params, LineStr); + + if (LineStr) +@@ -484,7 +524,7 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params, + + // Parameters of the state machine, are next. + MCOS->EmitIntValue(context.getAsmInfo()->getMinInstAlignment(), 1); +- // maximum_operations_per_instruction ++ // maximum_operations_per_instruction + // For non-VLIW architectures this field is always 1. + // FIXME: VLIW architectures need to update this field accordingly. 
+ if (LineTableVersion >= 4) +@@ -514,8 +554,12 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params, + + void MCDwarfLineTable::EmitCU(MCObjectStreamer *MCOS, + MCDwarfLineTableParams Params, +- Optional &LineStr) const { +- MCSymbol *LineEndSym = Header.Emit(MCOS, Params, LineStr).second; ++ Optional &LineStr) { ++ MCSymbol *LineBeginSym; ++ MCSymbol *LineEndSym; ++ ++ std::tie(LineBeginSym, LineEndSym) = Header.Emit(MCOS, Params, LineStr); ++ setLabel(LineBeginSym); + + // Put out the line tables. + for (const auto &LineSec : MCLineSections.getMCLineEntries()) +@@ -1253,12 +1297,217 @@ public: + void EmitCFIInstruction(const MCCFIInstruction &Instr); + }; + ++// A stripped-down version of MCObjectStreamer that only calculates how many ++// bytes were written to it. We use it to know in advance how many bytes ++// DWARF expressions will use. ++class SizeCalcMCStreamer { ++ uint64_t TotalSize = {0}; ++ ++public: ++ SizeCalcMCStreamer() {} ++ ++ uint64_t getSize() { return TotalSize; } ++ ++ void EmitIntValue(uint64_t Value, unsigned Size) { TotalSize += Size; } ++ ++ void EmitULEB128IntValue(uint64_t Value, unsigned Padding = 0) { ++ TotalSize += Padding + getULEB128Size(Value); ++ } ++ ++ void EmitSLEB128IntValue(int64_t Value) { ++ TotalSize += getSLEB128Size(Value); ++ } ++}; ++ + } // end anonymous namespace + + static void emitEncodingByte(MCObjectStreamer &Streamer, unsigned Encoding) { + Streamer.EmitIntValue(Encoding, 1); + } + ++template ++static void EmitDwarfExpression(T &Streamer, ++ const MCDwarfExpression &Expr) { ++ for (const auto &Elem : Expr) { ++ Streamer.EmitIntValue(Elem.Operation, 1); ++ switch (Elem.Operation) { ++ default: ++ llvm_unreachable("Unrecognized DWARF expression opcode"); ++ case dwarf::DW_OP_addr: ++ case dwarf::DW_OP_call_ref: ++ llvm_unreachable("DW_OP_addr & DW_OP_call_ref are unimplemented"); ++ break; ++ case dwarf::DW_OP_const1u: ++ case dwarf::DW_OP_const1s: ++ case dwarf::DW_OP_pick: ++ case 
dwarf::DW_OP_deref_size: ++ case dwarf::DW_OP_xderef_size: ++ Streamer.EmitIntValue(Elem.Operand0, 1); ++ break; ++ case dwarf::DW_OP_const2u: ++ case dwarf::DW_OP_const2s: ++ case dwarf::DW_OP_skip: ++ case dwarf::DW_OP_bra: ++ case dwarf::DW_OP_call2: ++ Streamer.EmitIntValue(Elem.Operand0, 2); ++ break; ++ case dwarf::DW_OP_const4u: ++ case dwarf::DW_OP_const4s: ++ case dwarf::DW_OP_call4: ++ Streamer.EmitIntValue(Elem.Operand0, 4); ++ break; ++ case dwarf::DW_OP_const8u: ++ case dwarf::DW_OP_const8s: ++ Streamer.EmitIntValue(Elem.Operand0, 8); ++ break; ++ case dwarf::DW_OP_constu: ++ case dwarf::DW_OP_plus_uconst: ++ case dwarf::DW_OP_regx: ++ case dwarf::DW_OP_piece: ++ case dwarf::DW_OP_GNU_addr_index: ++ case dwarf::DW_OP_GNU_const_index: ++ Streamer.EmitULEB128IntValue(Elem.Operand0); ++ break; ++ case dwarf::DW_OP_consts: ++ case dwarf::DW_OP_breg0: ++ case dwarf::DW_OP_breg1: ++ case dwarf::DW_OP_breg2: ++ case dwarf::DW_OP_breg3: ++ case dwarf::DW_OP_breg4: ++ case dwarf::DW_OP_breg5: ++ case dwarf::DW_OP_breg6: ++ case dwarf::DW_OP_breg7: ++ case dwarf::DW_OP_breg8: ++ case dwarf::DW_OP_breg9: ++ case dwarf::DW_OP_breg10: ++ case dwarf::DW_OP_breg11: ++ case dwarf::DW_OP_breg12: ++ case dwarf::DW_OP_breg13: ++ case dwarf::DW_OP_breg14: ++ case dwarf::DW_OP_breg15: ++ case dwarf::DW_OP_breg16: ++ case dwarf::DW_OP_breg17: ++ case dwarf::DW_OP_breg18: ++ case dwarf::DW_OP_breg19: ++ case dwarf::DW_OP_breg20: ++ case dwarf::DW_OP_breg21: ++ case dwarf::DW_OP_breg22: ++ case dwarf::DW_OP_breg23: ++ case dwarf::DW_OP_breg24: ++ case dwarf::DW_OP_breg25: ++ case dwarf::DW_OP_breg26: ++ case dwarf::DW_OP_breg27: ++ case dwarf::DW_OP_breg28: ++ case dwarf::DW_OP_breg29: ++ case dwarf::DW_OP_breg30: ++ case dwarf::DW_OP_breg31: ++ case dwarf::DW_OP_fbreg: ++ Streamer.EmitSLEB128IntValue(Elem.Operand0); ++ break; ++ case dwarf::DW_OP_deref: ++ case dwarf::DW_OP_dup: ++ case dwarf::DW_OP_drop: ++ case dwarf::DW_OP_over: ++ case dwarf::DW_OP_swap: ++ case 
dwarf::DW_OP_rot: ++ case dwarf::DW_OP_xderef: ++ case dwarf::DW_OP_abs: ++ case dwarf::DW_OP_and: ++ case dwarf::DW_OP_div: ++ case dwarf::DW_OP_minus: ++ case dwarf::DW_OP_mod: ++ case dwarf::DW_OP_mul: ++ case dwarf::DW_OP_neg: ++ case dwarf::DW_OP_not: ++ case dwarf::DW_OP_or: ++ case dwarf::DW_OP_plus: ++ case dwarf::DW_OP_shl: ++ case dwarf::DW_OP_shr: ++ case dwarf::DW_OP_shra: ++ case dwarf::DW_OP_xor: ++ case dwarf::DW_OP_eq: ++ case dwarf::DW_OP_ge: ++ case dwarf::DW_OP_gt: ++ case dwarf::DW_OP_le: ++ case dwarf::DW_OP_lt: ++ case dwarf::DW_OP_ne: ++ case dwarf::DW_OP_lit0: ++ case dwarf::DW_OP_lit1: ++ case dwarf::DW_OP_lit2: ++ case dwarf::DW_OP_lit3: ++ case dwarf::DW_OP_lit4: ++ case dwarf::DW_OP_lit5: ++ case dwarf::DW_OP_lit6: ++ case dwarf::DW_OP_lit7: ++ case dwarf::DW_OP_lit8: ++ case dwarf::DW_OP_lit9: ++ case dwarf::DW_OP_lit10: ++ case dwarf::DW_OP_lit11: ++ case dwarf::DW_OP_lit12: ++ case dwarf::DW_OP_lit13: ++ case dwarf::DW_OP_lit14: ++ case dwarf::DW_OP_lit15: ++ case dwarf::DW_OP_lit16: ++ case dwarf::DW_OP_lit17: ++ case dwarf::DW_OP_lit18: ++ case dwarf::DW_OP_lit19: ++ case dwarf::DW_OP_lit20: ++ case dwarf::DW_OP_lit21: ++ case dwarf::DW_OP_lit22: ++ case dwarf::DW_OP_lit23: ++ case dwarf::DW_OP_lit24: ++ case dwarf::DW_OP_lit25: ++ case dwarf::DW_OP_lit26: ++ case dwarf::DW_OP_lit27: ++ case dwarf::DW_OP_lit28: ++ case dwarf::DW_OP_lit29: ++ case dwarf::DW_OP_lit30: ++ case dwarf::DW_OP_lit31: ++ case dwarf::DW_OP_reg0: ++ case dwarf::DW_OP_reg1: ++ case dwarf::DW_OP_reg2: ++ case dwarf::DW_OP_reg3: ++ case dwarf::DW_OP_reg4: ++ case dwarf::DW_OP_reg5: ++ case dwarf::DW_OP_reg6: ++ case dwarf::DW_OP_reg7: ++ case dwarf::DW_OP_reg8: ++ case dwarf::DW_OP_reg9: ++ case dwarf::DW_OP_reg10: ++ case dwarf::DW_OP_reg11: ++ case dwarf::DW_OP_reg12: ++ case dwarf::DW_OP_reg13: ++ case dwarf::DW_OP_reg14: ++ case dwarf::DW_OP_reg15: ++ case dwarf::DW_OP_reg16: ++ case dwarf::DW_OP_reg17: ++ case dwarf::DW_OP_reg18: ++ case dwarf::DW_OP_reg19: 
++ case dwarf::DW_OP_reg20: ++ case dwarf::DW_OP_reg21: ++ case dwarf::DW_OP_reg22: ++ case dwarf::DW_OP_reg23: ++ case dwarf::DW_OP_reg24: ++ case dwarf::DW_OP_reg25: ++ case dwarf::DW_OP_reg26: ++ case dwarf::DW_OP_reg27: ++ case dwarf::DW_OP_reg28: ++ case dwarf::DW_OP_reg29: ++ case dwarf::DW_OP_reg30: ++ case dwarf::DW_OP_reg31: ++ case dwarf::DW_OP_nop: ++ case dwarf::DW_OP_push_object_address: ++ case dwarf::DW_OP_form_tls_address: ++ case dwarf::DW_OP_GNU_push_tls_address: ++ break; ++ case dwarf::DW_OP_bregx: ++ Streamer.EmitULEB128IntValue(Elem.Operand0); ++ Streamer.EmitSLEB128IntValue(Elem.Operand1); ++ break; ++ } ++ } ++} ++ + void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) { + int dataAlignmentFactor = getDataAlignmentFactor(Streamer); + auto *MRI = Streamer.getContext().getRegisterInfo(); +@@ -1373,7 +1622,28 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) { + Streamer.EmitIntValue(dwarf::DW_CFA_GNU_args_size, 1); + Streamer.EmitULEB128IntValue(Instr.getOffset()); + return; +- ++ case MCCFIInstruction::OpDefCfaExpression: { ++ Streamer.EmitIntValue(dwarf::DW_CFA_def_cfa_expression, 1); ++ SizeCalcMCStreamer FakeStreamer; ++ EmitDwarfExpression<>(FakeStreamer, Instr.getExpression()); ++ Streamer.EmitULEB128IntValue(FakeStreamer.getSize()); ++ EmitDwarfExpression<>(Streamer, Instr.getExpression()); ++ return; ++ } ++ case MCCFIInstruction::OpExpression: ++ case MCCFIInstruction::OpValExpression: { ++ unsigned Reg = Instr.getRegister(); ++ Streamer.EmitIntValue(Instr.getOperation() == MCCFIInstruction::OpExpression ++ ? 
dwarf::DW_CFA_expression ++ : dwarf::DW_CFA_val_expression, ++ 1); ++ Streamer.EmitULEB128IntValue(Reg); ++ SizeCalcMCStreamer FakeStreamer; ++ EmitDwarfExpression<>(FakeStreamer, Instr.getExpression()); ++ Streamer.EmitULEB128IntValue(FakeStreamer.getSize()); ++ EmitDwarfExpression<>(Streamer, Instr.getExpression()); ++ return; ++ } + case MCCFIInstruction::OpEscape: + Streamer.EmitBytes(Instr.getValues()); + return; +diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp +index 65fbe8e..4b32cd7 100644 +--- a/lib/MC/MCExpr.cpp ++++ b/lib/MC/MCExpr.cpp +@@ -834,3 +834,7 @@ MCFragment *MCExpr::findAssociatedFragment() const { + + llvm_unreachable("Invalid assembly expression kind!"); + } ++ ++const MCSymbol &MCExpr::getSymbol() const { ++ return cast(this)->getSymbol(); ++} +diff --git a/lib/MC/MCFragment.cpp b/lib/MC/MCFragment.cpp +index 1aed50a..e740a0d 100644 +--- a/lib/MC/MCFragment.cpp ++++ b/lib/MC/MCFragment.cpp +@@ -254,6 +254,9 @@ void MCFragment::destroy() { + case FT_Align: + delete cast(this); + return; ++ case FT_NeverAlign: ++ delete cast(this); ++ return; + case FT_Data: + delete cast(this); + return; +@@ -316,6 +319,7 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { + OS << "<"; + switch (getKind()) { + case MCFragment::FT_Align: OS << "MCAlignFragment"; break; ++ case MCFragment::FT_NeverAlign: OS << "MCNeverAlignFragment"; break; + case MCFragment::FT_Data: OS << "MCDataFragment"; break; + case MCFragment::FT_CompactEncodedInst: + OS << "MCCompactEncodedInstFragment"; break; +@@ -348,6 +352,15 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { + << " MaxBytesToEmit:" << AF->getMaxBytesToEmit() << ">"; + break; + } ++ case MCFragment::FT_NeverAlign: { ++ const MCNeverAlignFragment *NAF = cast(this); ++ if (NAF->hasEmitNops()) ++ OS << " (emit nops)"; ++ OS << "\n "; ++ OS << " Alignment:" << NAF->getAlignment() ++ << " Value:" << NAF->getValue() << " ValueSize:" << NAF->getValueSize(); ++ break; ++ } + case MCFragment::FT_Data: { + const 
MCDataFragment *DF = cast(this); + OS << "\n "; +@@ -480,7 +493,10 @@ LLVM_DUMP_METHOD void MCAssembler::dump() const{ + if (it != symbol_begin()) OS << ",\n "; + OS << "("; + it->dump(); +- OS << ", Index:" << it->getIndex() << ", "; ++ if (it->hasIndex()) ++ OS << ", Index:" << it->getIndex() << ", "; ++ else ++ OS << ", Order:" << it->getOrder() << ", "; + OS << ")"; + } + OS << "]>\n"; +diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp +index 83da8ac..820aa68 100644 +--- a/lib/MC/MCObjectFileInfo.cpp ++++ b/lib/MC/MCObjectFileInfo.cpp +@@ -480,6 +480,9 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) { + ReadOnlySection = + Ctx->getELFSection(".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + ++ ReadOnlyColdSection = ++ Ctx->getELFSection(".rodata.cold", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); ++ + TLSDataSection = + Ctx->getELFSection(".tdata", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_TLS | ELF::SHF_WRITE); +diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp +index 0a68458..58199c9 100644 +--- a/lib/MC/MCObjectStreamer.cpp ++++ b/lib/MC/MCObjectStreamer.cpp +@@ -494,6 +494,13 @@ void MCObjectStreamer::EmitCodeAlignment(unsigned ByteAlignment, + cast(getCurrentFragment())->setEmitNops(true); + } + ++void MCObjectStreamer::EmitNeverAlignCodeAtEnd(unsigned ByteAlignment, ++ int64_t Value, ++ unsigned ValueSize) { ++ insert(new MCNeverAlignFragment(ByteAlignment, 0, 1)); ++ cast(getCurrentFragment())->setEmitNops(true); ++} ++ + void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset, + unsigned char Value, + SMLoc Loc) { +diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp +index 7765698..0954b70 100644 +--- a/lib/MC/MCStreamer.cpp ++++ b/lib/MC/MCStreamer.cpp +@@ -85,11 +85,15 @@ void MCStreamer::reset() { + DwarfFrameInfos.clear(); + CurrentWinFrameInfo = nullptr; + WinFrameInfos.clear(); +- SymbolOrdering.clear(); ++ SymbolOrdering = 1; + SectionStack.clear(); + 
SectionStack.push_back(std::pair()); + } + ++unsigned MCStreamer::GetSymbolOrder(const MCSymbol *Symbol) const { ++ return Symbol->getOrder(); ++} ++ + raw_ostream &MCStreamer::GetCommentOS() { + // By default, discard comments. + return nulls(); +@@ -130,6 +134,13 @@ void MCStreamer::EmitULEB128IntValue(uint64_t Value) { + EmitBytes(OSE.str()); + } + ++void MCStreamer::EmitPaddedULEB128IntValue(uint64_t Value, unsigned PadTo) { ++ SmallString<128> Tmp; ++ raw_svector_ostream OSE(Tmp); ++ encodeULEB128(Value, OSE, PadTo); ++ EmitBytes(OSE.str()); ++} ++ + /// EmitSLEB128IntValue - Special case of EmitSLEB128Value that avoids the + /// client having to pass in a MCExpr for constant integers. + void MCStreamer::EmitSLEB128IntValue(int64_t Value) { +@@ -315,7 +326,7 @@ void MCStreamer::AssignFragment(MCSymbol *Symbol, MCFragment *Fragment) { + + // As we emit symbols into a section, track the order so that they can + // be sorted upon later. Zero is reserved to mean 'unemitted'. +- SymbolOrdering[Symbol] = 1 + SymbolOrdering.size(); ++ Symbol->setOrder(SymbolOrdering); + } + + void MCStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) { +@@ -513,7 +524,7 @@ void MCStreamer::EmitCFIEscape(StringRef Values) { + + void MCStreamer::EmitCFIGnuArgsSize(int64_t Size) { + MCSymbol *Label = EmitCFILabel(); +- MCCFIInstruction Instruction = ++ MCCFIInstruction Instruction = + MCCFIInstruction::createGnuArgsSize(Label, Size); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); + if (!CurFrame) +@@ -884,6 +895,14 @@ void MCStreamer::visitUsedExpr(const MCExpr &Expr) { + } + } + ++void MCStreamer::EmitCFIInstruction(const MCCFIInstruction &Inst) { ++ MCSymbol *Label = EmitCFILabel(); ++ MCCFIInstruction Instruction = Inst; ++ Instruction.setLabel(Label); ++ MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); ++ CurFrame->Instructions.push_back(Instruction); ++} ++ + void MCStreamer::EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool) { + // Scan for 
values. +@@ -961,6 +980,8 @@ void MCStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value, + unsigned MaxBytesToEmit) {} + void MCStreamer::EmitCodeAlignment(unsigned ByteAlignment, + unsigned MaxBytesToEmit) {} ++void MCStreamer::EmitNeverAlignCodeAtEnd(unsigned ByteAlignment, int64_t Value, ++ unsigned ValueSize) {} + void MCStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value, + SMLoc Loc) {} + void MCStreamer::EmitBundleAlignMode(unsigned AlignPow2) {} +diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp +index b544fa5..746c9f3 100644 +--- a/lib/Object/COFFObjectFile.cpp ++++ b/lib/Object/COFFObjectFile.cpp +@@ -339,11 +339,16 @@ unsigned COFFObjectFile::getSectionID(SectionRef Sec) const { + + bool COFFObjectFile::isSectionVirtual(DataRefImpl Ref) const { + const coff_section *Sec = toSec(Ref); +- // In COFF, a virtual section won't have any in-file ++ // In COFF, a virtual section won't have any in-file + // content, so the file pointer to the content will be zero. 
+ return Sec->PointerToRawData == 0; + } + ++bool COFFObjectFile::isSectionReadOnly(DataRefImpl Ref) const { ++ llvm_unreachable("not implemented"); ++ return false; ++} ++ + static uint32_t getNumberOfRelocations(const coff_section *Sec, + MemoryBufferRef M, const uint8_t *base) { + // The field for the number of relocations in COFF section table is only +diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp +index adc54b4..2fcc293 100644 +--- a/lib/Object/MachOObjectFile.cpp ++++ b/lib/Object/MachOObjectFile.cpp +@@ -1986,6 +1986,11 @@ bool MachOObjectFile::isSectionStripped(DataRefImpl Sec) const { + return getSection(Sec).offset == 0; + } + ++bool MachOObjectFile::isSectionReadOnly(DataRefImpl Sec) const { ++ llvm_unreachable("not implemented"); ++ return false; ++} ++ + relocation_iterator MachOObjectFile::section_rel_begin(DataRefImpl Sec) const { + DataRefImpl Ret; + Ret.d.a = Sec.d.a; +diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp +index 0c78631..c0dac22 100644 +--- a/lib/Object/WasmObjectFile.cpp ++++ b/lib/Object/WasmObjectFile.cpp +@@ -1140,6 +1140,8 @@ bool WasmObjectFile::isSectionBSS(DataRefImpl Sec) const { return false; } + + bool WasmObjectFile::isSectionVirtual(DataRefImpl Sec) const { return false; } + ++bool WasmObjectFile::isSectionReadOnly(DataRefImpl Sec) const { return false; } ++ + bool WasmObjectFile::isSectionBitcode(DataRefImpl Sec) const { return false; } + + relocation_iterator WasmObjectFile::section_rel_begin(DataRefImpl Ref) const { +diff --git a/lib/Support/ToolOutputFile.cpp b/lib/Support/ToolOutputFile.cpp +index e12d9e8..1c74d40 100644 +--- a/lib/Support/ToolOutputFile.cpp ++++ b/lib/Support/ToolOutputFile.cpp +@@ -35,8 +35,8 @@ ToolOutputFile::CleanupInstaller::~CleanupInstaller() { + } + + ToolOutputFile::ToolOutputFile(StringRef Filename, std::error_code &EC, +- sys::fs::OpenFlags Flags) +- : Installer(Filename), OS(Filename, EC, Flags) { ++ sys::fs::OpenFlags Flags, 
unsigned Mode) ++ : Installer(Filename), OS(Filename, EC, Flags, Mode) { + // If open fails, no cleanup is needed. + if (EC) + Installer.Keep = true; +diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp +index e026111..9455379 100644 +--- a/lib/Support/raw_ostream.cpp ++++ b/lib/Support/raw_ostream.cpp +@@ -490,7 +490,7 @@ void format_object_base::home() { + //===----------------------------------------------------------------------===// + + static int getFD(StringRef Filename, std::error_code &EC, +- sys::fs::OpenFlags Flags) { ++ sys::fs::OpenFlags Flags, unsigned Mode = 0666) { + // Handle "-" as stdout. Note that when we do this, we consider ourself + // the owner of stdout and may set the "binary" flag globally based on Flags. + if (Filename == "-") { +@@ -503,7 +503,7 @@ static int getFD(StringRef Filename, std::error_code &EC, + } + + int FD; +- EC = sys::fs::openFileForWrite(Filename, FD, Flags); ++ EC = sys::fs::openFileForWrite(Filename, FD, Flags, Mode); + if (EC) + return -1; + +@@ -511,8 +511,8 @@ static int getFD(StringRef Filename, std::error_code &EC, + } + + raw_fd_ostream::raw_fd_ostream(StringRef Filename, std::error_code &EC, +- sys::fs::OpenFlags Flags) +- : raw_fd_ostream(getFD(Filename, EC, Flags), true) {} ++ sys::fs::OpenFlags Flags, unsigned Mode) ++ : raw_fd_ostream(getFD(Filename, EC, Flags, Mode), true) {} + + /// FD is the file descriptor that this writes to. If ShouldClose is true, this + /// closes the file when the stream is destroyed. 
+diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt +index ed79f4f..95cb71f 100644 +--- a/lib/Target/X86/CMakeLists.txt ++++ b/lib/Target/X86/CMakeLists.txt +@@ -19,6 +19,7 @@ if (X86_GEN_FOLD_TABLES) + endif() + + add_public_tablegen_target(X86CommonTableGen) ++add_public_tablegen_target(X86GenInstrInfo) + + set(sources + X86AsmPrinter.cpp +diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +index fa7c352..35d28c1 100644 +--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp ++++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +@@ -46,6 +46,8 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { + + TextAlignFillValue = 0x90; + ++ TrapFillValue = 0xCC; ++ + if (!is64Bit) + Data64bitsDirective = nullptr; // we can't emit a 64-bit unit + +@@ -98,6 +100,8 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { + + TextAlignFillValue = 0x90; + ++ TrapFillValue = 0xCC; ++ + // Debug Information + SupportsDebugInformation = true; + +@@ -141,6 +145,8 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { + + TextAlignFillValue = 0x90; + ++ TrapFillValue = 0xCC; ++ + AllowAtInName = true; + + UseIntegratedAssembler = true; +@@ -164,5 +170,7 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { + + TextAlignFillValue = 0x90; + ++ TrapFillValue = 0xCC; ++ + UseIntegratedAssembler = true; + } +diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td +index 9fba65c..1c8eb27 100644 +--- a/lib/Target/X86/X86InstrControl.td ++++ b/lib/Target/X86/X86InstrControl.td +@@ -200,7 +200,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + } + + // Loop instructions +-let SchedRW = [WriteJump] in { ++let isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in { + def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>; + def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), 
"loope\t$dst", [], IIC_LOOPE>; + def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>; +@@ -299,12 +299,13 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + (ins i32imm_pcrel:$dst), + "jmp\t$dst", + [], IIC_JMP_REL>; +- +- def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), +- "", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead. +- let mayLoad = 1 in +- def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst), +- "jmp{l}\t{*}$dst", [], IIC_JMP_MEM>; ++ let isIndirectBranch = 1 in { ++ def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), ++ "jmp{l}\t{*}$dst", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead. ++ let mayLoad = 1 in ++ def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst), ++ "jmp{l}\t{*}$dst", [], IIC_JMP_MEM>; ++ } + } + + // Conditional tail calls are similar to the above, but they are branches +diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td +index f25f1b0..8a36933 100644 +--- a/lib/Target/X86/X86InstrSystem.td ++++ b/lib/Target/X86/X86InstrSystem.td +@@ -29,7 +29,8 @@ let mayLoad = 1, mayStore = 0, hasSideEffects = 1 in { + def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB; + } + +-def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", [], IIC_HLT>; ++let isTerminator = 1 in ++ def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", [], IIC_HLT>; + def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB; + + // Interrupt and SysCall Instructions. diff --git a/bolt/merge-fdata/LLVMBuild.txt b/bolt/merge-fdata/LLVMBuild.txt deleted file mode 100644 index 39bc693fb469..000000000000 --- a/bolt/merge-fdata/LLVMBuild.txt +++ /dev/null @@ -1,21 +0,0 @@ -;===- ./tools/llvm-bolt/merge-fdata/LLVMBuild.txt --------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. 
-; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Tool -name = merge-fdata -parent = llvm-bolt diff --git a/bolt/merge-fdata/Makefile b/bolt/merge-fdata/Makefile deleted file mode 100644 index 41687a01b91b..000000000000 --- a/bolt/merge-fdata/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -##===- tools/lli/Makefile ------------------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL := ../../.. -TOOLNAME := merge-fdata - -include $(LEVEL)/Makefile.config - -LINK_COMPONENTS := support - -SOURCES := merge-fdata.cpp ../DataReader.cpp - -include $(LLVM_SRC_ROOT)/Makefile.rules diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/src/BinaryBasicBlock.cpp similarity index 100% rename from bolt/BinaryBasicBlock.cpp rename to bolt/src/BinaryBasicBlock.cpp diff --git a/bolt/BinaryBasicBlock.h b/bolt/src/BinaryBasicBlock.h similarity index 100% rename from bolt/BinaryBasicBlock.h rename to bolt/src/BinaryBasicBlock.h diff --git a/bolt/BinaryContext.cpp b/bolt/src/BinaryContext.cpp similarity index 96% rename from bolt/BinaryContext.cpp rename to bolt/src/BinaryContext.cpp index a09b4f8aa630..9415c864a7ab 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -518,13 +518,20 @@ unsigned BinaryContext::addDebugFilenameToUnit(const uint32_t DestCUID, // means empty dir. 
assert(FileIndex > 0 && FileIndex <= FileNames.size() && "FileIndex out of range for the compilation unit."); - StringRef Dir = - FileNames[FileIndex - 1].DirIdx - ? LineTable->Prologue + StringRef Dir = ""; + if (FileNames[FileIndex - 1].DirIdx != 0) { + if (auto DirName = + LineTable->Prologue .IncludeDirectories[FileNames[FileIndex - 1].DirIdx - 1] - : ""; - return Ctx->getDwarfFile(Dir, FileNames[FileIndex - 1].Name, 0, nullptr, - DestCUID); + .getAsCString()) { + Dir = *DirName; + } + } + StringRef FileName = ""; + if (auto FName = FileNames[FileIndex - 1].Name.getAsCString()) + FileName = *FName; + assert(FileName != ""); + return cantFail(Ctx->getDwarfFile(Dir, FileName, 0, nullptr, None, DestCUID)); } std::vector BinaryContext::getSortedFunctions( @@ -557,11 +564,17 @@ void BinaryContext::preprocessDebugInfo( for (size_t I = 0, Size = FileNames.size(); I != Size; ++I) { // Dir indexes start at 1, as DWARF file numbers, and a dir index 0 // means empty dir. - StringRef Dir = - FileNames[I].DirIdx - ? 
LineTable->Prologue.IncludeDirectories[FileNames[I].DirIdx - 1] - : ""; - Ctx->getDwarfFile(Dir, FileNames[I].Name, 0, nullptr, CUID); + StringRef Dir = ""; + if (FileNames[I].DirIdx != 0) + if (auto DirName = + LineTable->Prologue.IncludeDirectories[FileNames[I].DirIdx - 1] + .getAsCString()) + Dir = *DirName; + StringRef FileName = ""; + if (auto FName = FileNames[I].Name.getAsCString()) + FileName = *FName; + assert(FileName != ""); + cantFail(Ctx->getDwarfFile(Dir, FileName, 0, nullptr, None, CUID)); } } @@ -716,9 +729,11 @@ void BinaryContext::printInstruction(raw_ostream &OS, if (RowRef != DebugLineTableRowRef::NULL_ROW) { const auto &Row = LineTable->Rows[RowRef.RowIndex - 1]; - OS << " # debug line " - << LineTable->Prologue.FileNames[Row.File - 1].Name - << ":" << Row.Line; + StringRef FileName = ""; + if (auto FName = + LineTable->Prologue.FileNames[Row.File - 1].Name.getAsCString()) + FileName = *FName; + OS << " # debug line " << FileName << ":" << Row.Line; if (Row.Column) { OS << ":" << Row.Column; diff --git a/bolt/BinaryContext.h b/bolt/src/BinaryContext.h similarity index 100% rename from bolt/BinaryContext.h rename to bolt/src/BinaryContext.h diff --git a/bolt/BinaryData.cpp b/bolt/src/BinaryData.cpp similarity index 100% rename from bolt/BinaryData.cpp rename to bolt/src/BinaryData.cpp diff --git a/bolt/BinaryData.h b/bolt/src/BinaryData.h similarity index 99% rename from bolt/BinaryData.h rename to bolt/src/BinaryData.h index 0acace0ca7ae..e63e72a59417 100644 --- a/bolt/BinaryData.h +++ b/bolt/src/BinaryData.h @@ -111,7 +111,7 @@ class BinaryData { bool isAtomic() const { return isTopLevelJumpTable() || !Parent; } - + iterator_range::const_iterator> names() const { return make_range(Names.begin(), Names.end()); } diff --git a/bolt/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp similarity index 100% rename from bolt/BinaryFunction.cpp rename to bolt/src/BinaryFunction.cpp diff --git a/bolt/BinaryFunction.h b/bolt/src/BinaryFunction.h similarity 
index 100% rename from bolt/BinaryFunction.h rename to bolt/src/BinaryFunction.h diff --git a/bolt/BinaryFunctionProfile.cpp b/bolt/src/BinaryFunctionProfile.cpp similarity index 100% rename from bolt/BinaryFunctionProfile.cpp rename to bolt/src/BinaryFunctionProfile.cpp diff --git a/bolt/BinaryLoop.h b/bolt/src/BinaryLoop.h similarity index 100% rename from bolt/BinaryLoop.h rename to bolt/src/BinaryLoop.h diff --git a/bolt/BinaryPassManager.cpp b/bolt/src/BinaryPassManager.cpp similarity index 100% rename from bolt/BinaryPassManager.cpp rename to bolt/src/BinaryPassManager.cpp diff --git a/bolt/BinaryPassManager.h b/bolt/src/BinaryPassManager.h similarity index 100% rename from bolt/BinaryPassManager.h rename to bolt/src/BinaryPassManager.h diff --git a/bolt/BinarySection.cpp b/bolt/src/BinarySection.cpp similarity index 100% rename from bolt/BinarySection.cpp rename to bolt/src/BinarySection.cpp diff --git a/bolt/BinarySection.h b/bolt/src/BinarySection.h similarity index 100% rename from bolt/BinarySection.h rename to bolt/src/BinarySection.h diff --git a/bolt/BoltDiff.cpp b/bolt/src/BoltDiff.cpp similarity index 100% rename from bolt/BoltDiff.cpp rename to bolt/src/BoltDiff.cpp diff --git a/bolt/src/CMakeLists.txt b/bolt/src/CMakeLists.txt new file mode 100644 index 000000000000..3804b16f2e8f --- /dev/null +++ b/bolt/src/CMakeLists.txt @@ -0,0 +1,92 @@ +add_subdirectory(merge-fdata) +add_subdirectory(Passes) +add_subdirectory(Target) + +# Get the current git revision for BOLT. +function(get_version ofn) + find_program(git_executable NAMES git git.exe git.cmd) + if (git_executable) + execute_process(COMMAND ${git_executable} rev-parse HEAD + WORKING_DIRECTORY ${LLVM_MAIN_SRC_DIR} + TIMEOUT 5 + RESULT_VARIABLE git_result + OUTPUT_VARIABLE git_output) + if( git_result EQUAL 0 ) + string(STRIP "${git_output}" git_ref_id) + set(BOLT_REVISION "${git_ref_id}") + endif() + endif() + + # If we can't find a revision, set it to "". 
+ if (NOT BOLT_REVISION) + set(BOLT_REVISION "") + endif() + + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn} + COMMAND echo '"${BOLT_REVISION}"' > ${CMAKE_CURRENT_BINARY_DIR}/${ofn} + COMMENT "Generating bogus ${ofn}..." + ) + + set(VERSION_OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn} PARENT_SCOPE) + + # `make clean' must remove all those generated files: + set_property(DIRECTORY APPEND + PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${ofn}) + set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/${ofn} PROPERTIES + GENERATED 1) +endfunction() + +# Creates a public target for generating the revision file. +function(add_public_gen_version_target target) + add_custom_target(${target} DEPENDS ${VERSION_OUTPUT}) + set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} ${target} PARENT_SCOPE) +endfunction() + +get_version(BoltRevision.inc) +add_public_gen_version_target(GenBoltRevision) + +set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + BOLTPasses + BOLTTargetAArch64 + BOLTTargetX86 + CodeGen + Core + DebugInfoDWARF + MC + MCDisassembler + MCParser + Object + Orcjit + Support + ) + +add_llvm_tool(llvm-bolt + llvm-bolt.cpp + BinaryBasicBlock.cpp + BinaryContext.cpp + BinaryData.cpp + BinaryFunction.cpp + BinaryFunctionProfile.cpp + BinaryPassManager.cpp + BinarySection.cpp + BoltDiff.cpp + CacheMetrics.cpp + DataAggregator.cpp + DataReader.cpp + DebugData.cpp + DWARFRewriter.cpp + Exceptions.cpp + JumpTable.cpp + MCPlusBuilder.cpp + ProfileReader.cpp + ProfileWriter.cpp + Relocation.cpp + RewriteInstance.cpp + + DEPENDS + intrinsics_gen + ) + +add_llvm_tool_symlink(perf2bolt llvm-bolt) +add_llvm_tool_symlink(llvm-boltdiff llvm-bolt) diff --git a/bolt/CacheMetrics.cpp b/bolt/src/CacheMetrics.cpp similarity index 100% rename from bolt/CacheMetrics.cpp rename to bolt/src/CacheMetrics.cpp diff --git a/bolt/CacheMetrics.h b/bolt/src/CacheMetrics.h similarity index 98% rename from bolt/CacheMetrics.h rename to bolt/src/CacheMetrics.h index b512168ebaf3..b3550b0806ca 100644 
--- a/bolt/CacheMetrics.h +++ b/bolt/src/CacheMetrics.h @@ -30,7 +30,7 @@ double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr, uint64_t Count); - + } // namespace CacheMetrics } // namespace bolt } // namespace llvm diff --git a/bolt/DWARFRewriter.cpp b/bolt/src/DWARFRewriter.cpp similarity index 100% rename from bolt/DWARFRewriter.cpp rename to bolt/src/DWARFRewriter.cpp diff --git a/bolt/DataAggregator.cpp b/bolt/src/DataAggregator.cpp similarity index 100% rename from bolt/DataAggregator.cpp rename to bolt/src/DataAggregator.cpp diff --git a/bolt/DataAggregator.h b/bolt/src/DataAggregator.h similarity index 100% rename from bolt/DataAggregator.h rename to bolt/src/DataAggregator.h diff --git a/bolt/DataReader.cpp b/bolt/src/DataReader.cpp similarity index 100% rename from bolt/DataReader.cpp rename to bolt/src/DataReader.cpp diff --git a/bolt/DataReader.h b/bolt/src/DataReader.h similarity index 100% rename from bolt/DataReader.h rename to bolt/src/DataReader.h diff --git a/bolt/DebugData.cpp b/bolt/src/DebugData.cpp similarity index 100% rename from bolt/DebugData.cpp rename to bolt/src/DebugData.cpp diff --git a/bolt/DebugData.h b/bolt/src/DebugData.h similarity index 100% rename from bolt/DebugData.h rename to bolt/src/DebugData.h diff --git a/bolt/Exceptions.cpp b/bolt/src/Exceptions.cpp similarity index 96% rename from bolt/Exceptions.cpp rename to bolt/src/Exceptions.cpp index 380e26268967..5277aea99d83 100644 --- a/bolt/Exceptions.cpp +++ b/bolt/src/Exceptions.cpp @@ -141,8 +141,10 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, assert(Data.isValidOffset(Offset) && "wrong LSDA address"); uint8_t LPStartEncoding = Data.getU8(&Offset); - uint64_t LPStart = Data.getEncodedPointer(&Offset, LPStartEncoding, - Offset + LSDASectionAddress); + uint64_t LPStart = 0; + if (auto MaybeLPStart = Data.getEncodedPointer(&Offset, LPStartEncoding, + Offset + LSDASectionAddress)) + LPStart = *MaybeLPStart; assert(LPStart == 0 && "support for 
split functions not implemented"); @@ -193,13 +195,13 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, HasEHRanges = CallSitePtr < CallSiteTableEnd; uint64_t RangeBase = getAddress(); while (CallSitePtr < CallSiteTableEnd) { - uintptr_t Start = Data.getEncodedPointer(&CallSitePtr, CallSiteEncoding, - CallSitePtr + LSDASectionAddress); - uintptr_t Length = Data.getEncodedPointer(&CallSitePtr, CallSiteEncoding, + uint64_t Start = *Data.getEncodedPointer(&CallSitePtr, CallSiteEncoding, CallSitePtr + LSDASectionAddress); - uintptr_t LandingPad = Data.getEncodedPointer( + uint64_t Length = *Data.getEncodedPointer( &CallSitePtr, CallSiteEncoding, CallSitePtr + LSDASectionAddress); - uintptr_t ActionEntry = Data.getULEB128(&CallSitePtr); + uint64_t LandingPad = *Data.getEncodedPointer( + &CallSitePtr, CallSiteEncoding, CallSitePtr + LSDASectionAddress); + uint64_t ActionEntry = Data.getULEB128(&CallSitePtr); if (opts::PrintExceptions) { outs() << "Call Site: [0x" << Twine::utohexstr(RangeBase + Start) @@ -253,8 +255,8 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, assert(Index > 0 && "only positive indices are valid"); uint32_t TTEntry = TypeTableStart - Index * TTypeEncodingSize; const auto TTEntryAddress = TTEntry + LSDASectionAddress; - auto TypeAddress = - Data.getEncodedPointer(&TTEntry, TTypeEncoding, TTEntryAddress); + uint32_t TypeAddress = + *Data.getEncodedPointer(&TTEntry, TTypeEncoding, TTEntryAddress); if ((TTypeEncoding & DW_EH_PE_pcrel) && (TypeAddress == TTEntryAddress)) { TypeAddress = 0; @@ -342,8 +344,8 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, for (unsigned Index = 1; Index <= MaxTypeIndex; ++Index) { uint32_t TTEntry = TypeTableStart - Index * TTypeEncodingSize; const auto TTEntryAddress = TTEntry + LSDASectionAddress; - auto TypeAddress = - Data.getEncodedPointer(&TTEntry, TTypeEncoding, TTEntryAddress); + uint64_t TypeAddress = + *Data.getEncodedPointer(&TTEntry, TTypeEncoding, TTEntryAddress); if 
((TTypeEncoding & DW_EH_PE_pcrel) && (TypeAddress == TTEntryAddress)) { TypeAddress = 0; } @@ -794,20 +796,17 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { case DW_CFA_def_cfa_expression: case DW_CFA_val_expression: { MCDwarfExprBuilder Builder; - for (const auto &Operation : Instr.ExprOps) { - switch (Operation.Ops.size()) { - case 0: - Builder.appendOperation(Operation.Opcode); - break; - case 1: - Builder.appendOperation(Operation.Opcode, Operation.Ops[0]); - break; - case 2: - Builder.appendOperation(Operation.Opcode, Operation.Ops[0], - Operation.Ops[1]); - break; - default: - llvm_unreachable("Unrecognized DWARF expression"); + for (auto &ExprOp : *Instr.Expression) { + const DWARFExpression::Operation::Description &Desc = + ExprOp.getDescription(); + if (Desc.Op[0] == DWARFExpression::Operation::SizeNA) { + Builder.appendOperation(ExprOp.getCode()); + } else if (Desc.Op[1] == DWARFExpression::Operation::SizeNA) { + Builder.appendOperation(ExprOp.getCode(), + ExprOp.getRawOperand(0)); + } else { + Builder.appendOperation(ExprOp.getCode(), ExprOp.getRawOperand(0), + ExprOp.getRawOperand(1)); } } if (Opcode == DW_CFA_expression) { diff --git a/bolt/Exceptions.h b/bolt/src/Exceptions.h similarity index 100% rename from bolt/Exceptions.h rename to bolt/src/Exceptions.h diff --git a/bolt/JumpTable.cpp b/bolt/src/JumpTable.cpp similarity index 100% rename from bolt/JumpTable.cpp rename to bolt/src/JumpTable.cpp diff --git a/bolt/JumpTable.h b/bolt/src/JumpTable.h similarity index 100% rename from bolt/JumpTable.h rename to bolt/src/JumpTable.h diff --git a/bolt/MCPlus.h b/bolt/src/MCPlus.h similarity index 100% rename from bolt/MCPlus.h rename to bolt/src/MCPlus.h diff --git a/bolt/MCPlusBuilder.cpp b/bolt/src/MCPlusBuilder.cpp similarity index 100% rename from bolt/MCPlusBuilder.cpp rename to bolt/src/MCPlusBuilder.cpp diff --git a/bolt/MCPlusBuilder.h b/bolt/src/MCPlusBuilder.h similarity index 100% rename from bolt/MCPlusBuilder.h 
rename to bolt/src/MCPlusBuilder.h diff --git a/bolt/Passes/Aligner.cpp b/bolt/src/Passes/Aligner.cpp similarity index 100% rename from bolt/Passes/Aligner.cpp rename to bolt/src/Passes/Aligner.cpp diff --git a/bolt/Passes/Aligner.h b/bolt/src/Passes/Aligner.h similarity index 100% rename from bolt/Passes/Aligner.h rename to bolt/src/Passes/Aligner.h diff --git a/bolt/Passes/AllocCombiner.cpp b/bolt/src/Passes/AllocCombiner.cpp similarity index 100% rename from bolt/Passes/AllocCombiner.cpp rename to bolt/src/Passes/AllocCombiner.cpp diff --git a/bolt/Passes/AllocCombiner.h b/bolt/src/Passes/AllocCombiner.h similarity index 100% rename from bolt/Passes/AllocCombiner.h rename to bolt/src/Passes/AllocCombiner.h diff --git a/bolt/Passes/BinaryFunctionCallGraph.cpp b/bolt/src/Passes/BinaryFunctionCallGraph.cpp similarity index 100% rename from bolt/Passes/BinaryFunctionCallGraph.cpp rename to bolt/src/Passes/BinaryFunctionCallGraph.cpp diff --git a/bolt/Passes/BinaryFunctionCallGraph.h b/bolt/src/Passes/BinaryFunctionCallGraph.h similarity index 100% rename from bolt/Passes/BinaryFunctionCallGraph.h rename to bolt/src/Passes/BinaryFunctionCallGraph.h diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/src/Passes/BinaryPasses.cpp similarity index 100% rename from bolt/Passes/BinaryPasses.cpp rename to bolt/src/Passes/BinaryPasses.cpp diff --git a/bolt/Passes/BinaryPasses.h b/bolt/src/Passes/BinaryPasses.h similarity index 100% rename from bolt/Passes/BinaryPasses.h rename to bolt/src/Passes/BinaryPasses.h diff --git a/bolt/Passes/CMakeLists.txt b/bolt/src/Passes/CMakeLists.txt similarity index 91% rename from bolt/Passes/CMakeLists.txt rename to bolt/src/Passes/CMakeLists.txt index 147395627aed..355623899681 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/src/Passes/CMakeLists.txt @@ -35,4 +35,4 @@ add_llvm_library(LLVMBOLTPasses intrinsics_gen ) -include_directories( ${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt ) +include_directories( ${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt/src ) 
diff --git a/bolt/Passes/CachePlusReorderAlgorithm.cpp b/bolt/src/Passes/CachePlusReorderAlgorithm.cpp similarity index 100% rename from bolt/Passes/CachePlusReorderAlgorithm.cpp rename to bolt/src/Passes/CachePlusReorderAlgorithm.cpp diff --git a/bolt/Passes/CallGraph.cpp b/bolt/src/Passes/CallGraph.cpp similarity index 100% rename from bolt/Passes/CallGraph.cpp rename to bolt/src/Passes/CallGraph.cpp diff --git a/bolt/Passes/CallGraph.h b/bolt/src/Passes/CallGraph.h similarity index 99% rename from bolt/Passes/CallGraph.h rename to bolt/src/Passes/CallGraph.h index 0eeb60c17f51..f55b4ea89cb8 100644 --- a/bolt/Passes/CallGraph.h +++ b/bolt/src/Passes/CallGraph.h @@ -161,7 +161,7 @@ class CallGraph { template void printDot(char* fileName, L getLabel) const; - + private: void setSamples(const NodeId Id, uint64_t Samples) { assert(Id < Nodes.size()); diff --git a/bolt/Passes/CallGraphWalker.cpp b/bolt/src/Passes/CallGraphWalker.cpp similarity index 100% rename from bolt/Passes/CallGraphWalker.cpp rename to bolt/src/Passes/CallGraphWalker.cpp diff --git a/bolt/Passes/CallGraphWalker.h b/bolt/src/Passes/CallGraphWalker.h similarity index 100% rename from bolt/Passes/CallGraphWalker.h rename to bolt/src/Passes/CallGraphWalker.h diff --git a/bolt/Passes/DataflowAnalysis.cpp b/bolt/src/Passes/DataflowAnalysis.cpp similarity index 100% rename from bolt/Passes/DataflowAnalysis.cpp rename to bolt/src/Passes/DataflowAnalysis.cpp diff --git a/bolt/Passes/DataflowAnalysis.h b/bolt/src/Passes/DataflowAnalysis.h similarity index 100% rename from bolt/Passes/DataflowAnalysis.h rename to bolt/src/Passes/DataflowAnalysis.h diff --git a/bolt/Passes/DataflowInfoManager.cpp b/bolt/src/Passes/DataflowInfoManager.cpp similarity index 100% rename from bolt/Passes/DataflowInfoManager.cpp rename to bolt/src/Passes/DataflowInfoManager.cpp diff --git a/bolt/Passes/DataflowInfoManager.h b/bolt/src/Passes/DataflowInfoManager.h similarity index 100% rename from bolt/Passes/DataflowInfoManager.h 
rename to bolt/src/Passes/DataflowInfoManager.h diff --git a/bolt/Passes/DominatorAnalysis.h b/bolt/src/Passes/DominatorAnalysis.h similarity index 100% rename from bolt/Passes/DominatorAnalysis.h rename to bolt/src/Passes/DominatorAnalysis.h diff --git a/bolt/Passes/FrameAnalysis.cpp b/bolt/src/Passes/FrameAnalysis.cpp similarity index 100% rename from bolt/Passes/FrameAnalysis.cpp rename to bolt/src/Passes/FrameAnalysis.cpp diff --git a/bolt/Passes/FrameAnalysis.h b/bolt/src/Passes/FrameAnalysis.h similarity index 100% rename from bolt/Passes/FrameAnalysis.h rename to bolt/src/Passes/FrameAnalysis.h diff --git a/bolt/Passes/FrameOptimizer.cpp b/bolt/src/Passes/FrameOptimizer.cpp similarity index 100% rename from bolt/Passes/FrameOptimizer.cpp rename to bolt/src/Passes/FrameOptimizer.cpp diff --git a/bolt/Passes/FrameOptimizer.h b/bolt/src/Passes/FrameOptimizer.h similarity index 100% rename from bolt/Passes/FrameOptimizer.h rename to bolt/src/Passes/FrameOptimizer.h diff --git a/bolt/Passes/HFSort.cpp b/bolt/src/Passes/HFSort.cpp similarity index 99% rename from bolt/Passes/HFSort.cpp rename to bolt/src/Passes/HFSort.cpp index 193ac30f40f4..2f03d68a33bd 100644 --- a/bolt/Passes/HFSort.cpp +++ b/bolt/src/Passes/HFSort.cpp @@ -46,7 +46,7 @@ namespace bolt { using NodeId = CallGraph::NodeId; using Arc = CallGraph::Arc; -using Node = CallGraph::Node; +using Node = CallGraph::Node; namespace { @@ -252,7 +252,7 @@ std::vector clusterize(const CallGraph &Cg) { std::vector randomClusters(const CallGraph &Cg) { std::vector FuncIds(Cg.numNodes(), 0); std::vector Clusters; - Clusters.reserve(Cg.numNodes()); + Clusters.reserve(Cg.numNodes()); for (NodeId F = 0; F < Cg.numNodes(); F++) { if (Cg.samples(F) == 0) continue; diff --git a/bolt/Passes/HFSort.h b/bolt/src/Passes/HFSort.h similarity index 100% rename from bolt/Passes/HFSort.h rename to bolt/src/Passes/HFSort.h diff --git a/bolt/Passes/HFSortPlus.cpp b/bolt/src/Passes/HFSortPlus.cpp similarity index 100% rename from 
bolt/Passes/HFSortPlus.cpp rename to bolt/src/Passes/HFSortPlus.cpp diff --git a/bolt/Passes/IndirectCallPromotion.cpp b/bolt/src/Passes/IndirectCallPromotion.cpp similarity index 100% rename from bolt/Passes/IndirectCallPromotion.cpp rename to bolt/src/Passes/IndirectCallPromotion.cpp diff --git a/bolt/Passes/IndirectCallPromotion.h b/bolt/src/Passes/IndirectCallPromotion.h similarity index 100% rename from bolt/Passes/IndirectCallPromotion.h rename to bolt/src/Passes/IndirectCallPromotion.h diff --git a/bolt/Passes/Inliner.cpp b/bolt/src/Passes/Inliner.cpp similarity index 100% rename from bolt/Passes/Inliner.cpp rename to bolt/src/Passes/Inliner.cpp diff --git a/bolt/Passes/Inliner.h b/bolt/src/Passes/Inliner.h similarity index 100% rename from bolt/Passes/Inliner.h rename to bolt/src/Passes/Inliner.h diff --git a/bolt/Passes/JTFootprintReduction.cpp b/bolt/src/Passes/JTFootprintReduction.cpp similarity index 100% rename from bolt/Passes/JTFootprintReduction.cpp rename to bolt/src/Passes/JTFootprintReduction.cpp diff --git a/bolt/Passes/JTFootprintReduction.h b/bolt/src/Passes/JTFootprintReduction.h similarity index 100% rename from bolt/Passes/JTFootprintReduction.h rename to bolt/src/Passes/JTFootprintReduction.h diff --git a/bolt/Passes/LivenessAnalysis.cpp b/bolt/src/Passes/LivenessAnalysis.cpp similarity index 100% rename from bolt/Passes/LivenessAnalysis.cpp rename to bolt/src/Passes/LivenessAnalysis.cpp diff --git a/bolt/Passes/LivenessAnalysis.h b/bolt/src/Passes/LivenessAnalysis.h similarity index 100% rename from bolt/Passes/LivenessAnalysis.h rename to bolt/src/Passes/LivenessAnalysis.h diff --git a/bolt/Passes/LongJmp.cpp b/bolt/src/Passes/LongJmp.cpp similarity index 100% rename from bolt/Passes/LongJmp.cpp rename to bolt/src/Passes/LongJmp.cpp diff --git a/bolt/Passes/LongJmp.h b/bolt/src/Passes/LongJmp.h similarity index 100% rename from bolt/Passes/LongJmp.h rename to bolt/src/Passes/LongJmp.h diff --git a/bolt/Passes/PLTCall.cpp 
b/bolt/src/Passes/PLTCall.cpp similarity index 100% rename from bolt/Passes/PLTCall.cpp rename to bolt/src/Passes/PLTCall.cpp diff --git a/bolt/Passes/PLTCall.h b/bolt/src/Passes/PLTCall.h similarity index 100% rename from bolt/Passes/PLTCall.h rename to bolt/src/Passes/PLTCall.h diff --git a/bolt/Passes/PettisAndHansen.cpp b/bolt/src/Passes/PettisAndHansen.cpp similarity index 99% rename from bolt/Passes/PettisAndHansen.cpp rename to bolt/src/Passes/PettisAndHansen.cpp index 6cd1cbd1eecd..432cb18e08af 100644 --- a/bolt/Passes/PettisAndHansen.cpp +++ b/bolt/src/Passes/PettisAndHansen.cpp @@ -13,7 +13,7 @@ namespace bolt { using NodeId = CallGraph::NodeId; using Arc = CallGraph::Arc; -using Node = CallGraph::Node; +using Node = CallGraph::Node; namespace { class ClusterArc { diff --git a/bolt/Passes/ReachingDefOrUse.h b/bolt/src/Passes/ReachingDefOrUse.h similarity index 100% rename from bolt/Passes/ReachingDefOrUse.h rename to bolt/src/Passes/ReachingDefOrUse.h diff --git a/bolt/Passes/ReachingInsns.h b/bolt/src/Passes/ReachingInsns.h similarity index 100% rename from bolt/Passes/ReachingInsns.h rename to bolt/src/Passes/ReachingInsns.h diff --git a/bolt/Passes/RegAnalysis.cpp b/bolt/src/Passes/RegAnalysis.cpp similarity index 100% rename from bolt/Passes/RegAnalysis.cpp rename to bolt/src/Passes/RegAnalysis.cpp diff --git a/bolt/Passes/RegAnalysis.h b/bolt/src/Passes/RegAnalysis.h similarity index 100% rename from bolt/Passes/RegAnalysis.h rename to bolt/src/Passes/RegAnalysis.h diff --git a/bolt/Passes/RegReAssign.cpp b/bolt/src/Passes/RegReAssign.cpp similarity index 100% rename from bolt/Passes/RegReAssign.cpp rename to bolt/src/Passes/RegReAssign.cpp diff --git a/bolt/Passes/RegReAssign.h b/bolt/src/Passes/RegReAssign.h similarity index 100% rename from bolt/Passes/RegReAssign.h rename to bolt/src/Passes/RegReAssign.h diff --git a/bolt/Passes/ReorderAlgorithm.cpp b/bolt/src/Passes/ReorderAlgorithm.cpp similarity index 100% rename from 
bolt/Passes/ReorderAlgorithm.cpp rename to bolt/src/Passes/ReorderAlgorithm.cpp diff --git a/bolt/Passes/ReorderAlgorithm.h b/bolt/src/Passes/ReorderAlgorithm.h similarity index 100% rename from bolt/Passes/ReorderAlgorithm.h rename to bolt/src/Passes/ReorderAlgorithm.h diff --git a/bolt/Passes/ReorderFunctions.cpp b/bolt/src/Passes/ReorderFunctions.cpp similarity index 100% rename from bolt/Passes/ReorderFunctions.cpp rename to bolt/src/Passes/ReorderFunctions.cpp diff --git a/bolt/Passes/ReorderFunctions.h b/bolt/src/Passes/ReorderFunctions.h similarity index 100% rename from bolt/Passes/ReorderFunctions.h rename to bolt/src/Passes/ReorderFunctions.h diff --git a/bolt/Passes/ReorderUtils.h b/bolt/src/Passes/ReorderUtils.h similarity index 100% rename from bolt/Passes/ReorderUtils.h rename to bolt/src/Passes/ReorderUtils.h diff --git a/bolt/Passes/ShrinkWrapping.cpp b/bolt/src/Passes/ShrinkWrapping.cpp similarity index 100% rename from bolt/Passes/ShrinkWrapping.cpp rename to bolt/src/Passes/ShrinkWrapping.cpp diff --git a/bolt/Passes/ShrinkWrapping.h b/bolt/src/Passes/ShrinkWrapping.h similarity index 100% rename from bolt/Passes/ShrinkWrapping.h rename to bolt/src/Passes/ShrinkWrapping.h diff --git a/bolt/Passes/StackAllocationAnalysis.cpp b/bolt/src/Passes/StackAllocationAnalysis.cpp similarity index 100% rename from bolt/Passes/StackAllocationAnalysis.cpp rename to bolt/src/Passes/StackAllocationAnalysis.cpp diff --git a/bolt/Passes/StackAllocationAnalysis.h b/bolt/src/Passes/StackAllocationAnalysis.h similarity index 100% rename from bolt/Passes/StackAllocationAnalysis.h rename to bolt/src/Passes/StackAllocationAnalysis.h diff --git a/bolt/Passes/StackAvailableExpressions.cpp b/bolt/src/Passes/StackAvailableExpressions.cpp similarity index 100% rename from bolt/Passes/StackAvailableExpressions.cpp rename to bolt/src/Passes/StackAvailableExpressions.cpp diff --git a/bolt/Passes/StackAvailableExpressions.h b/bolt/src/Passes/StackAvailableExpressions.h 
similarity index 100% rename from bolt/Passes/StackAvailableExpressions.h rename to bolt/src/Passes/StackAvailableExpressions.h diff --git a/bolt/Passes/StackPointerTracking.cpp b/bolt/src/Passes/StackPointerTracking.cpp similarity index 100% rename from bolt/Passes/StackPointerTracking.cpp rename to bolt/src/Passes/StackPointerTracking.cpp diff --git a/bolt/Passes/StackPointerTracking.h b/bolt/src/Passes/StackPointerTracking.h similarity index 100% rename from bolt/Passes/StackPointerTracking.h rename to bolt/src/Passes/StackPointerTracking.h diff --git a/bolt/Passes/StackReachingUses.cpp b/bolt/src/Passes/StackReachingUses.cpp similarity index 100% rename from bolt/Passes/StackReachingUses.cpp rename to bolt/src/Passes/StackReachingUses.cpp diff --git a/bolt/Passes/StackReachingUses.h b/bolt/src/Passes/StackReachingUses.h similarity index 100% rename from bolt/Passes/StackReachingUses.h rename to bolt/src/Passes/StackReachingUses.h diff --git a/bolt/Passes/StokeInfo.cpp b/bolt/src/Passes/StokeInfo.cpp similarity index 100% rename from bolt/Passes/StokeInfo.cpp rename to bolt/src/Passes/StokeInfo.cpp diff --git a/bolt/Passes/StokeInfo.h b/bolt/src/Passes/StokeInfo.h similarity index 100% rename from bolt/Passes/StokeInfo.h rename to bolt/src/Passes/StokeInfo.h diff --git a/bolt/ProfileReader.cpp b/bolt/src/ProfileReader.cpp similarity index 100% rename from bolt/ProfileReader.cpp rename to bolt/src/ProfileReader.cpp diff --git a/bolt/ProfileReader.h b/bolt/src/ProfileReader.h similarity index 100% rename from bolt/ProfileReader.h rename to bolt/src/ProfileReader.h diff --git a/bolt/ProfileWriter.cpp b/bolt/src/ProfileWriter.cpp similarity index 99% rename from bolt/ProfileWriter.cpp rename to bolt/src/ProfileWriter.cpp index fa0a7a4cb62c..a8930f6cd311 100644 --- a/bolt/ProfileWriter.cpp +++ b/bolt/src/ProfileWriter.cpp @@ -53,7 +53,7 @@ convert(const BinaryFunction &BF, yaml::bolt::BinaryFunctionProfile &YamlBF) { YamlBB.NumInstructions = BB->getNumNonPseudos(); 
if (!LBRProfile) { - YamlBB.EventCount = + YamlBB.EventCount = SampleDataOrErr->getSamples(BB->getInputOffset(), BB->getEndOffset()); if (YamlBB.EventCount) YamlBF.Blocks.emplace_back(YamlBB); diff --git a/bolt/ProfileWriter.h b/bolt/src/ProfileWriter.h similarity index 100% rename from bolt/ProfileWriter.h rename to bolt/src/ProfileWriter.h diff --git a/bolt/ProfileYAMLMapping.h b/bolt/src/ProfileYAMLMapping.h similarity index 100% rename from bolt/ProfileYAMLMapping.h rename to bolt/src/ProfileYAMLMapping.h diff --git a/bolt/Relocation.cpp b/bolt/src/Relocation.cpp similarity index 100% rename from bolt/Relocation.cpp rename to bolt/src/Relocation.cpp diff --git a/bolt/Relocation.h b/bolt/src/Relocation.h similarity index 100% rename from bolt/Relocation.h rename to bolt/src/Relocation.h diff --git a/bolt/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp similarity index 100% rename from bolt/RewriteInstance.cpp rename to bolt/src/RewriteInstance.cpp diff --git a/bolt/RewriteInstance.h b/bolt/src/RewriteInstance.h similarity index 100% rename from bolt/RewriteInstance.h rename to bolt/src/RewriteInstance.h diff --git a/bolt/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp similarity index 100% rename from bolt/Target/AArch64/AArch64MCPlusBuilder.cpp rename to bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp diff --git a/bolt/Target/AArch64/CMakeLists.txt b/bolt/src/Target/AArch64/CMakeLists.txt similarity index 78% rename from bolt/Target/AArch64/CMakeLists.txt rename to bolt/src/Target/AArch64/CMakeLists.txt index d3cedac53aa0..115823680cde 100644 --- a/bolt/Target/AArch64/CMakeLists.txt +++ b/bolt/src/Target/AArch64/CMakeLists.txt @@ -7,5 +7,5 @@ add_llvm_library(LLVMBOLTTargetAArch64 ) include_directories(${LLVM_MAIN_SRC_DIR}/lib/Target/AArch64 ${LLVM_BINARY_DIR}/lib/Target/AArch64) -include_directories(${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt) +include_directories(${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt/src) diff --git 
a/bolt/Target/CMakeLists.txt b/bolt/src/Target/CMakeLists.txt similarity index 100% rename from bolt/Target/CMakeLists.txt rename to bolt/src/Target/CMakeLists.txt diff --git a/bolt/Target/X86/CMakeLists.txt b/bolt/src/Target/X86/CMakeLists.txt similarity index 76% rename from bolt/Target/X86/CMakeLists.txt rename to bolt/src/Target/X86/CMakeLists.txt index 2a55ec2ca61d..65994a7f0d12 100644 --- a/bolt/Target/X86/CMakeLists.txt +++ b/bolt/src/Target/X86/CMakeLists.txt @@ -7,5 +7,5 @@ add_llvm_library(LLVMBOLTTargetX86 ) include_directories(${LLVM_MAIN_SRC_DIR}/lib/Target/X86 ${LLVM_BINARY_DIR}/lib/Target/X86) -include_directories(${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt) +include_directories(${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt/src) diff --git a/bolt/Target/X86/X86MCPlusBuilder.cpp b/bolt/src/Target/X86/X86MCPlusBuilder.cpp similarity index 99% rename from bolt/Target/X86/X86MCPlusBuilder.cpp rename to bolt/src/Target/X86/X86MCPlusBuilder.cpp index 681a13ada980..ec43191105d9 100644 --- a/bolt/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/src/Target/X86/X86MCPlusBuilder.cpp @@ -997,7 +997,7 @@ class X86MCPlusBuilder : public MCPlusBuilder { return false; } } - + bool evaluateX86MemoryOperand(const MCInst &Inst, unsigned *BaseRegNum, int64_t *ScaleImm, diff --git a/bolt/llvm-bolt.cpp b/bolt/src/llvm-bolt.cpp similarity index 100% rename from bolt/llvm-bolt.cpp rename to bolt/src/llvm-bolt.cpp diff --git a/bolt/merge-fdata/CMakeLists.txt b/bolt/src/merge-fdata/CMakeLists.txt similarity index 100% rename from bolt/merge-fdata/CMakeLists.txt rename to bolt/src/merge-fdata/CMakeLists.txt diff --git a/bolt/merge-fdata/merge-fdata.cpp b/bolt/src/merge-fdata/merge-fdata.cpp similarity index 100% rename from bolt/merge-fdata/merge-fdata.cpp rename to bolt/src/merge-fdata/merge-fdata.cpp From f0117510a84aeedf3a24480e79ecb3f8825485d7 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 7 Nov 2017 15:42:28 -0800 Subject: [PATCH 416/904] [BOLT] Align basic blocks based on 
execution count Summary: The default is not changing, i.e. we are not aligning code within a function by default. New meaning of options for aligning basic blocks: -align-blocks triggers basic block alignment based on profile -preserve-blocks-alignment tries to preserve basic block alignment seen on input Tuning options for "-align-blocks": -align-blocks-min-size= blocks smaller than the specified size wouldn't be aligned -align-blocks-threshold= align only blocks with frequency larger than containing function execution frequency specified in percent. E.g. 1000 means aligning blocks that are 10 times more frequently executed than the containing function. (cherry picked from commit 0b8bb1e3c770bb8ebec477596a49433033a701e5) --- bolt/src/BinaryBasicBlock.h | 19 +++++-- bolt/src/BinaryFunction.cpp | 24 ++++++--- bolt/src/Passes/Aligner.cpp | 101 +++++++++++++++++++++++++++++++++--- bolt/src/Passes/Aligner.h | 13 ++++- 4 files changed, 141 insertions(+), 16 deletions(-) diff --git a/bolt/src/BinaryBasicBlock.h b/bolt/src/BinaryBasicBlock.h index cf3b6dd27d63..946de333093e 100644 --- a/bolt/src/BinaryBasicBlock.h +++ b/bolt/src/BinaryBasicBlock.h @@ -80,7 +80,10 @@ class BinaryBasicBlock { std::pair InputRange{INVALID_OFFSET, INVALID_OFFSET}; /// Alignment requirements for the block. - uint64_t Alignment{1}; + uint32_t Alignment{1}; + + /// Maximum number of bytes to use for alignment of the block. + uint32_t AlignmentMaxBytes{0}; /// Number of times this basic block was executed. uint64_t ExecutionCount{COUNT_NO_PROFILE}; @@ -499,15 +502,25 @@ class BinaryBasicBlock { } /// Set minimum alignment for the basic block. - void setAlignment(uint64_t Align) { + void setAlignment(uint32_t Align) { Alignment = Align; } /// Return required alignment for the block. - uint64_t getAlignment() const { + uint32_t getAlignment() const { return Alignment; } + /// Set the maximum number of bytes to use for the block alignment. 
+ void setAlignmentMaxBytes(uint32_t Value) { + AlignmentMaxBytes = Value; + } + + /// Return the maximum number of bytes to use for the block alignment. + uint32_t getAlignmentMaxBytes() const { + return AlignmentMaxBytes; + } + /// Adds block to successor list, and also updates predecessor list for /// successor block. /// Set branch info for this path. diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index a127b8e3aea2..dd8ef1eae0c6 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -55,9 +55,10 @@ extern bool shouldProcess(const BinaryFunction &); extern cl::opt UpdateDebugSections; extern cl::opt Verbosity; -static cl::opt +cl::opt AlignBlocks("align-blocks", - cl::desc("try to align BBs inserting nops"), + cl::desc("align basic blocks"), + cl::init(false), cl::ZeroOrMore, cl::cat(BoltOptCategory)); @@ -74,6 +75,13 @@ AlignMacroOpFusion("align-macro-fusion", cl::ZeroOrMore, cl::cat(BoltRelocCategory)); +cl::opt +PreserveBlocksAlignment("preserve-blocks-alignment", + cl::desc("try to preserve basic block alignment"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + static cl::opt DotToolTipCode("dot-tooltip-code", cl::desc("add basic block instructions as tool tips on nodes"), @@ -1603,7 +1611,7 @@ bool BinaryFunction::buildCFG() { // Always create new BB at branch destination. 
PrevBB = InsertBB; InsertBB = addBasicBlock(LI->first, LI->second, - /* DeriveAlignment = */ IsLastInstrNop); + opts::PreserveBlocksAlignment && IsLastInstrNop); if (hasEntryPointAtOffset(Offset)) InsertBB->setEntryPoint(); if (PrevBB) @@ -1631,7 +1639,8 @@ bool BinaryFunction::buildCFG() { } else { InsertBB = addBasicBlock(Offset, BC.Ctx->createTempSymbol("FT", true), - /* DeriveAlignment = */ IsLastInstrNop); + opts::PreserveBlocksAlignment && + IsLastInstrNop); updateOffset(LastInstrOffset); } } @@ -2195,8 +2204,11 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { if (EmitColdPart != BB->isCold()) continue; - if (opts::AlignBlocks && BB->getAlignment() > 1) - Streamer.EmitCodeAlignment(BB->getAlignment()); + if ((opts::AlignBlocks || opts::PreserveBlocksAlignment) + && BB->getAlignment() > 1) { + Streamer.EmitCodeAlignment(BB->getAlignment(), + BB->getAlignmentMaxBytes()); + } Streamer.EmitLabel(BB->getLabel()); // Check if special alignment for macro-fusion is needed. diff --git a/bolt/src/Passes/Aligner.cpp b/bolt/src/Passes/Aligner.cpp index 0d56e9ffcfea..c4d4434f5b30 100644 --- a/bolt/src/Passes/Aligner.cpp +++ b/bolt/src/Passes/Aligner.cpp @@ -11,16 +11,34 @@ #include "Aligner.h" +#define DEBUG_TYPE "bolt-aligner" + using namespace llvm; namespace opts { + extern cl::OptionCategory BoltOptCategory; -cl::opt -UseCompactAligner("use-compact-aligner", - cl::desc("Use compact approach for aligning functions"), - cl::init(false), +extern cl::opt AlignBlocks; +extern cl::opt PreserveBlocksAlignment; + +cl::opt +AlignBlocksMinSize("align-blocks-min-size", + cl::desc("minimal size of the basic block that should be aligned"), + cl::init(0), cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +cl::opt +AlignBlocksThreshold("align-blocks-threshold", + cl::desc("align only blocks with frequency larger than containing function " + "execution frequency specified in percent. E.g. 
1000 means aligning " + "blocks that are 10 times more frequently executed than the " + "containing function."), + cl::init(800), + cl::ZeroOrMore, + cl::Hidden, cl::cat(BoltOptCategory)); cl::opt @@ -37,6 +55,20 @@ AlignFunctionsMaxBytes("align-functions-max-bytes", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +cl::opt +BlockAlignment("block-alignment", + cl::desc("boundary to use for alignment of basic blocks"), + cl::init(16), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +cl::opt +UseCompactAligner("use-compact-aligner", + cl::desc("Use compact approach for aligning functions"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + } // end namespace opts namespace llvm { @@ -56,9 +88,11 @@ void alignMaxBytes(BinaryFunction &Function) { // the fuction by not more than the minimum over // -- the size of the function // -- the specified number of bytes -void alignCompact(BinaryContext &BC, BinaryFunction &Function) { +void alignCompact(BinaryFunction &Function) { + const auto &BC = Function.getBinaryContext(); size_t HotSize = 0; size_t ColdSize = 0; + for (const auto *BB : Function.layout()) { if (BB->isCold()) ColdSize += BC.computeCodeSize(BB->begin(), BB->end()); @@ -80,19 +114,74 @@ void alignCompact(BinaryContext &BC, BinaryFunction &Function) { } // end anonymous namespace +void AlignerPass::alignBlocks(BinaryFunction &Function) { + if (!Function.hasValidProfile() || !Function.isSimple()) + return; + + const auto &BC = Function.getBinaryContext(); + + const auto FuncCount = std::max(1UL, Function.getKnownExecutionCount()); + BinaryBasicBlock *PrevBB{nullptr}; + for (auto *BB : Function.layout()) { + auto Count = BB->getKnownExecutionCount(); + + if (Count <= FuncCount * opts::AlignBlocksThreshold / 100) { + PrevBB = BB; + continue; + } + + uint64_t FTCount = 0; + if (PrevBB && PrevBB->getFallthrough() == BB) { + FTCount = PrevBB->getBranchInfo(*BB).Count; + } + PrevBB = BB; + + if (Count < FTCount * 2) + continue; + + const auto BlockSize = 
BC.computeCodeSize(BB->begin(), BB->end()); + const auto BytesToUse = std::min(opts::BlockAlignment - 1UL, BlockSize); + + if (opts::AlignBlocksMinSize && BlockSize < opts::AlignBlocksMinSize) + continue; + + BB->setAlignment(opts::BlockAlignment); + BB->setAlignmentMaxBytes(BytesToUse); + + // Update stats. + AlignHistogram[BytesToUse]++; + AlignedBlocksCount += BB->getKnownExecutionCount(); + } +} + void AlignerPass::runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) { if (!BC.HasRelocations) return; + AlignHistogram.resize(opts::BlockAlignment); + for (auto &It : BFs) { auto &Function = It.second; + if (opts::UseCompactAligner) - alignCompact(BC, Function); + alignCompact(Function); else alignMaxBytes(Function); + + if (opts::AlignBlocks && !opts::PreserveBlocksAlignment) + alignBlocks(Function); } + + DEBUG( + dbgs() << "BOLT-DEBUG: max bytes per basic block alignment distribution:\n"; + for (unsigned I = 1; I < AlignHistogram.size(); ++I) { + dbgs() << " " << I << " : " << AlignHistogram[I] << '\n'; + } + dbgs() << "BOLT-DEBUG: total execution count of aligned blocks: " + << AlignedBlocksCount << '\n'; + ); } } // end namespace bolt diff --git a/bolt/src/Passes/Aligner.h b/bolt/src/Passes/Aligner.h index 3164a47a91c8..28e6f6d693b0 100644 --- a/bolt/src/Passes/Aligner.h +++ b/bolt/src/Passes/Aligner.h @@ -18,7 +18,18 @@ namespace llvm { namespace bolt { class AlignerPass : public BinaryFunctionPass { - public: +private: + + /// Stats for usage of max bytes for basic block alignment. + std::vector AlignHistogram; + + /// Stats: execution count of blocks that were aligned. + uint64_t AlignedBlocksCount{0}; + + /// Assign alignment to basic blocks based on profile. 
+ void alignBlocks(BinaryFunction &Function); + +public: explicit AlignerPass() : BinaryFunctionPass(false) {} const char *getName() const override { From 193796eb64cb11a6fc5cefe455553e1b5d135016 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 20 Apr 2018 20:03:31 -0700 Subject: [PATCH 417/904] [BOLT] Static data reordering pass. Summary: Enable BOLT to reorder data sections in a binary based on memory profiling data. This diff adds a new pass to BOLT that can reorder data sections for better locality based on memory profiling data. For now, the algorithm to order data is primitive and just relies on the frequency of loads to order the contents of a section. We could probably do a lot better by looking at what functions use the hot data and grouping together hot data that is used by a single function (or cluster of functions). Block ordering might give some hints on how to order the data better as well. The new pass has two basic modes: inplace and split (when inplace is false). The default is split since inplace hasn't really been tested much. When splitting is on, the cold data is copied to a "cold" version of the section while the hot data is kept in the original section, e.g. for .rodata, .rodata will contain the hot data and .bolt.org.rodata will contain the cold bits. In inplace mode, the section contents are reordered inplace. In either mode, all relocations to data within that section are updated to reflect new data locations. Things to improve: - The current algorithm is really dumb and doesn't seem to lead to any wins. It certainly could use some improvement. - Private symbols can have data that leaks over to an adjacent symbol, e.g. a string that has a common suffix can start in one symbol and leak over (with the common suffix) into the next. For now, we punt on adjacent private symbols. - Handle ambiguous relocations better. 
Section relocations that point to the boundary of two symbols will prevent the adjacent symbols from being moved because we can't tell which symbol the relocation is for. - Handle jump tables. Right now jump table support must be basic if data reordering is enabled. - Being able to handle TLS. A good amount of data access in some binaries are happening in TLS. It would be worthwhile to be able to reorder any TLS sections too. - Handle sections with writeable data. This hasn't been tested so probably won't work. We could try to prevent false sharing in writeable sections as well. - A pie in the sky goal would be to use DWARF info to reorder types. (cherry picked from commit 52775d18f03e8a767c88c7a0805aba789be137d3) --- bolt/src/BinaryContext.cpp | 45 +- bolt/src/BinaryContext.h | 42 +- bolt/src/BinaryData.cpp | 30 +- bolt/src/BinaryData.h | 49 +- bolt/src/BinaryFunction.cpp | 30 +- bolt/src/BinaryFunction.h | 3 + bolt/src/BinaryPassManager.cpp | 5 + bolt/src/BinarySection.cpp | 71 ++- bolt/src/BinarySection.h | 78 +++- bolt/src/JumpTable.cpp | 2 +- bolt/src/MCPlusBuilder.h | 4 +- bolt/src/Passes/CMakeLists.txt | 1 + bolt/src/Passes/IndirectCallPromotion.cpp | 33 +- bolt/src/Passes/IndirectCallPromotion.h | 13 +- bolt/src/Passes/ReorderData.cpp | 475 +++++++++++++++++++ bolt/src/Passes/ReorderData.h | 68 +++ bolt/src/RewriteInstance.cpp | 531 ++++++++++++++++------ bolt/src/RewriteInstance.h | 23 +- bolt/src/Target/X86/X86MCPlusBuilder.cpp | 29 +- 19 files changed, 1277 insertions(+), 255 deletions(-) create mode 100644 bolt/src/Passes/ReorderData.cpp create mode 100644 bolt/src/Passes/ReorderData.h diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index 9415c864a7ab..69e665cecfd0 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -160,10 +160,22 @@ void BinaryContext::updateObjectNesting(BinaryDataMapType::iterator GAI) { } } +iterator_range +BinaryContext::getSubBinaryData(BinaryData *BD) { + auto Start = 
std::next(BinaryDataMap.find(BD->getAddress())); + auto End = Start; + while (End != BinaryDataMap.end() && + BD->isAncestorOf(End->second)) { + ++End; + } + return make_range(Start, End); +} + MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, uint64_t Size, uint16_t Alignment, - Twine Prefix) { + Twine Prefix, + unsigned Flags) { auto Itr = BinaryDataMap.find(Address); if (Itr != BinaryDataMap.end()) { assert(Itr->second->getSize() == Size || !Size); @@ -172,13 +184,14 @@ MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, std::string Name = (Prefix + "0x" + Twine::utohexstr(Address)).str(); assert(!GlobalSymbols.count(Name) && "created name is not unique"); - return registerNameAtAddress(Name, Address, Size, Alignment); + return registerNameAtAddress(Name, Address, Size, Alignment, Flags); } MCSymbol *BinaryContext::registerNameAtAddress(StringRef Name, uint64_t Address, uint64_t Size, - uint16_t Alignment) { + uint16_t Alignment, + unsigned Flags) { auto SectionOrErr = getSectionForAddress(Address); auto &Section = SectionOrErr ? SectionOrErr.get() : absoluteSection(); auto GAI = BinaryDataMap.find(Address); @@ -188,7 +201,8 @@ MCSymbol *BinaryContext::registerNameAtAddress(StringRef Name, Address, Size, Alignment ? Alignment : 1, - Section); + Section, + Flags); } else { BD = GAI->second; } @@ -237,7 +251,7 @@ BinaryContext::getBinaryDataContainingAddressImpl(uint64_t Address, bool BestFit) const { auto NI = BinaryDataMap.lower_bound(Address); auto End = BinaryDataMap.end(); - if ((NI != End && Address == NI->first) || + if ((NI != End && Address == NI->first && !IncludeEnd) || (NI-- != BinaryDataMap.begin())) { if (NI->second->containsAddress(Address) || (IncludeEnd && NI->second->getEndAddress() == Address)) { @@ -445,6 +459,7 @@ void BinaryContext::assignMemData() { // Map of sections (or heap/stack) to count/size. 
std::map Counts; + std::map JumpTableCounts; uint64_t TotalCount = 0; for (auto &Entry : DR.getAllFuncsMemData()) { @@ -454,6 +469,9 @@ void BinaryContext::assignMemData() { if (BD) { BD->getAtomicRoot()->addMemData(MI); Counts[BD->getSectionName()] += MI.Count; + if (BD->getAtomicRoot()->isJumpTable()) { + JumpTableCounts[BD->getSectionName()] += MI.Count; + } } else { Counts["Heap/stack"] += MI.Count; } @@ -468,6 +486,11 @@ void BinaryContext::assignMemData() { const auto Count = Entry.second; outs() << "BOLT-INFO: " << Section << " = " << Count << format(" (%.1f%%)\n", 100.0*Count/TotalCount); + if (JumpTableCounts.count(Section) != 0) { + const auto JTCount = JumpTableCounts[Section]; + outs() << "BOLT-INFO: jump tables = " << JTCount + << format(" (%.1f%%)\n", 100.0*JTCount/Count); + } } outs() << "BOLT-INFO: Total memory events: " << TotalCount << "\n"; } @@ -832,7 +855,15 @@ BinarySection &BinaryContext::registerSection(BinarySection *Section) { } BinarySection &BinaryContext::registerSection(SectionRef Section) { - return registerSection(new BinarySection(Section)); + return registerSection(new BinarySection(*this, Section)); +} + +BinarySection & +BinaryContext::registerSection(StringRef SectionName, + const BinarySection &OriginalSection) { + return registerSection(new BinarySection(*this, + SectionName, + OriginalSection)); } BinarySection &BinaryContext::registerOrUpdateSection(StringRef Name, @@ -857,7 +888,7 @@ BinarySection &BinaryContext::registerOrUpdateSection(StringRef Name, return *Section; } - return registerSection(new BinarySection(Name, Data, Size, Alignment, + return registerSection(new BinarySection(*this, Name, Data, Size, Alignment, ELFType, ELFFlags, IsLocal)); } diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index a1a40ff90a83..f7a41fd71e02 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -325,12 +325,15 @@ class BinaryContext { /// Iterate over all BinaryData associated with the given \p 
Section. iterator_range - getBinaryDataForSection(StringRef SectionName) const { - auto Begin = BinaryDataMap.begin(); - auto End = BinaryDataMap.end(); + getBinaryDataForSection(const BinarySection &Section) const { + auto Begin = BinaryDataMap.lower_bound(Section.getAddress()); + if (Begin != BinaryDataMap.begin()) { + --Begin; + } + auto End = BinaryDataMap.upper_bound(Section.getEndAddress()); auto pred = - [&SectionName](const binary_data_const_iterator &Itr) -> bool { - return Itr->second->getSection().getName() == SectionName; + [&Section](const binary_data_const_iterator &Itr) -> bool { + return Itr->second->getSection() == Section; }; return make_range(FilteredBinaryDataConstIterator(pred, Begin, End), FilteredBinaryDataConstIterator(pred, End, End)); @@ -338,16 +341,22 @@ class BinaryContext { /// Iterate over all BinaryData associated with the given \p Section. iterator_range - getBinaryDataForSection(StringRef SectionName) { - auto Begin = BinaryDataMap.begin(); - auto End = BinaryDataMap.end(); - auto pred = [&SectionName](const binary_data_iterator &Itr) -> bool { - return Itr->second->getSection().getName() == SectionName; + getBinaryDataForSection(BinarySection &Section) { + auto Begin = BinaryDataMap.lower_bound(Section.getAddress()); + if (Begin != BinaryDataMap.begin()) { + --Begin; + } + auto End = BinaryDataMap.upper_bound(Section.getEndAddress()); + auto pred = [&Section](const binary_data_iterator &Itr) -> bool { + return Itr->second->getSection() == Section; }; return make_range(FilteredBinaryDataIterator(pred, Begin, End), FilteredBinaryDataIterator(pred, End, End)); } + /// Iterate over all the sub-symbols of /p BD (if any). + iterator_range getSubBinaryData(BinaryData *BD); + /// Clear the global symbol address -> name(s) map. 
void clearBinaryData() { GlobalSymbols.clear(); @@ -365,18 +374,21 @@ class BinaryContext { MCSymbol *getOrCreateGlobalSymbol(uint64_t Address, uint64_t Size, uint16_t Alignment, - Twine Prefix); + Twine Prefix, + unsigned Flags = 0); /// Register a symbol with \p Name at a given \p Address and \p Size. MCSymbol *registerNameAtAddress(StringRef Name, uint64_t Address, BinaryData* BD); - /// Register a symbol with \p Name at a given \p Address and \p Size. + /// Register a symbol with \p Name at a given \p Address, \p Size and + /// /p Flags. See llvm::SymbolRef::Flags for definition of /p Flags. MCSymbol *registerNameAtAddress(StringRef Name, uint64_t Address, uint64_t Size, - uint16_t Alignment); + uint16_t Alignment, + unsigned Flags = 0); /// Return BinaryData registered at a given \p Address or nullptr if no /// global symbol was registered at the location. @@ -441,6 +453,10 @@ class BinaryContext { /// sections by address. BinarySection ®isterSection(SectionRef Section); + /// Register a copy of /p OriginalSection under a different name. + BinarySection ®isterSection(StringRef SectionName, + const BinarySection &OriginalSection); + /// Register or update the information for the section with the given /// /p Name. If the section already exists, the information in the /// section will be updated with the new data. 
diff --git a/bolt/src/BinaryData.cpp b/bolt/src/BinaryData.cpp index ea27bead6f9c..8bd72d792b5f 100644 --- a/bolt/src/BinaryData.cpp +++ b/bolt/src/BinaryData.cpp @@ -47,6 +47,7 @@ void BinaryData::merge(const BinaryData *Other) { Names.insert(Names.end(), Other->Names.begin(), Other->Names.end()); Symbols.insert(Symbols.end(), Other->Symbols.begin(), Other->Symbols.end()); MemData.insert(MemData.end(), Other->MemData.begin(), Other->MemData.end()); + Flags |= Other->Flags; if (!Size) Size = Other->Size; } @@ -63,19 +64,27 @@ StringRef BinaryData::getSectionName() const { return getSection().getName(); } -uint64_t BinaryData::computeOutputOffset() const { +StringRef BinaryData::getOutputSectionName() const { + return getOutputSection().getName(); +} + +uint64_t BinaryData::getOutputAddress() const { + assert(OutputSection->getFileAddress()); + return OutputSection->getFileAddress() + OutputOffset; +} + +uint64_t BinaryData::getOffset() const { return Address - getSection().getAddress(); } void BinaryData::setSection(BinarySection &NewSection) { + if (OutputSection == Section) + OutputSection = &NewSection; Section = &NewSection; - if (OutputSection.empty()) - OutputSection = getSection().getName(); } bool BinaryData::isMoved() const { - return (computeOutputOffset() != OutputOffset || - OutputSection != getSectionName()); + return (getOffset() != OutputOffset || OutputSection != Section); } void BinaryData::print(raw_ostream &OS) const { @@ -106,7 +115,8 @@ void BinaryData::printBrief(raw_ostream &OS) const { OS << ", 0x" << Twine::utohexstr(getAddress()) << ":0x" << Twine::utohexstr(getEndAddress()) - << "/" << getSize(); + << "/" << getSize() << "/" << getAlignment() + << "/0x" << Twine::utohexstr(Flags); if (opts::Verbosity > 1) { for (auto &MI : memData()) { @@ -121,12 +131,14 @@ BinaryData::BinaryData(StringRef Name, uint64_t Address, uint64_t Size, uint16_t Alignment, - BinarySection &Section) + BinarySection &Section, + unsigned Flags) : Names({Name}), 
Section(&Section), Address(Address), Size(Size), Alignment(Alignment), - OutputSection(Section.getName()), - OutputOffset(computeOutputOffset()) + Flags(Flags), + OutputSection(&Section), + OutputOffset(getOffset()) { } diff --git a/bolt/src/BinaryData.h b/bolt/src/BinaryData.h index e63e72a59417..6b1f4eafe798 100644 --- a/bolt/src/BinaryData.h +++ b/bolt/src/BinaryData.h @@ -56,9 +56,12 @@ class BinaryData { /// Alignment of this data. uint16_t Alignment{1}; + /// Symbol flags (same as llvm::SymbolRef::Flags) + unsigned Flags{0}; + /// Output section for this data if it has been moved from the original /// section. - std::string OutputSection; + BinarySection *OutputSection{nullptr}; /// The offset of this symbol in the output section. This is different /// from \p Address - Section.getAddress() when the data has been reordered. uint64_t OutputOffset{0}; @@ -79,22 +82,14 @@ class BinaryData { return BD; } - BinaryData *getAtomicRoot() { - auto *BD = this; - while (!BD->isAtomic() && BD->Parent) - BD = BD->Parent; - return BD; - } - - uint64_t computeOutputOffset() const; - public: BinaryData(BinaryData &&) = default; BinaryData(StringRef Name, uint64_t Address, uint64_t Size, uint16_t Alignment, - BinarySection &Section); + BinarySection &Section, + unsigned Flags = 0); virtual ~BinaryData() { } virtual bool isJumpTable() const { return false; } @@ -150,15 +145,20 @@ class BinaryData { uint64_t getAddress() const { return Address; } uint64_t getEndAddress() const { return Address + Size; } + uint64_t getOffset() const; uint64_t getSize() const { return Size; } uint16_t getAlignment() const { return Alignment; } - uint64_t getOutputOffset() const { return OutputOffset; } - uint64_t getOutputSize() const { return Size; } BinarySection &getSection() { return *Section; } const BinarySection &getSection() const { return *Section; } StringRef getSectionName() const; - StringRef getOutputSection() const { return OutputSection; } + + BinarySection &getOutputSection() { 
return *OutputSection; } + const BinarySection &getOutputSection() const { return *OutputSection; } + StringRef getOutputSectionName() const; + uint64_t getOutputAddress() const; + uint64_t getOutputOffset() const { return OutputOffset; } + uint64_t getOutputSize() const { return Size; } bool isMoved() const; bool containsAddress(uint64_t Address) const { @@ -180,6 +180,13 @@ class BinaryData { return BD; } + BinaryData *getAtomicRoot() { + auto *BD = this; + while (!BD->isAtomic() && BD->Parent) + BD = BD->Parent; + return BD; + } + const BinaryData *getAtomicRoot() const { auto *BD = this; while (!BD->isAtomic() && BD->Parent) @@ -187,10 +194,20 @@ class BinaryData { return BD; } + bool isAncestorOf(const BinaryData *BD) const { + return Parent && (Parent == BD || Parent->isAncestorOf(BD)); + } + void setIsMoveable(bool Flag) { IsMoveable = Flag; } - void setOutputOffset(uint64_t Offset) { OutputOffset = Offset; } - void setOutputSection(StringRef Name) { OutputSection = Name; } void setSection(BinarySection &NewSection); + void setOutputSection(BinarySection &NewSection) { + OutputSection = &NewSection; + } + void setOutputOffset(uint64_t Offset) { OutputOffset = Offset; } + void setOutputLocation(BinarySection &NewSection, uint64_t NewOffset) { + setOutputSection(NewSection); + setOutputOffset(NewOffset); + } virtual void printBrief(raw_ostream &OS) const; virtual void print(raw_ostream &OS) const; diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index dd8ef1eae0c6..1b86764663a7 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -3461,10 +3461,12 @@ void BinaryFunction::emitJumpTables(MCStreamer *Streamer) { } else { MCSection *HotSection, *ColdSection; if (opts::JumpTables == JTS_BASIC) { - std::string Name = JT.Labels[0]->getName().str(); + std::string Name = ".local." + JT.Labels[0]->getName().str(); std::replace(Name.begin(), Name.end(), '/', '.'); - JT.setOutputSection(".local." 
+ Name); - HotSection = BC.Ctx->getELFSection(JT.getOutputSection(), + JT.setOutputSection(BC.registerOrUpdateSection(Name, + ELF::SHT_PROGBITS, + ELF::SHF_ALLOC)); + HotSection = BC.Ctx->getELFSection(Name, ELF::SHT_PROGBITS, ELF::SHF_ALLOC); ColdSection = HotSection; @@ -3689,6 +3691,28 @@ MCInst *BinaryFunction::getInstructionAtOffset(uint64_t Offset) { } } +std::set BinaryFunction::dataUses(bool OnlyHot) const { + std::set Uses; + for (auto *BB : BasicBlocks) { + if (OnlyHot && BB->isCold()) + continue; + + for (const auto &Inst : *BB) { + if (auto Mem = + BC.MIB->tryGetAnnotationAs(Inst, "MemDataOffset")) { + for (auto &MI : getMemData()->getMemInfoRange(Mem.get())) { + if (auto *BD = MI.Addr.IsSymbol + ? BC.getBinaryDataByName(MI.Addr.Name) + : BC.getBinaryDataContainingAddress(MI.Addr.Offset)) { + Uses.insert(BD); + } + } + } + } + } + return Uses; +} + DWARFDebugLoc::LocationList BinaryFunction::translateInputToOutputLocationList( const DWARFDebugLoc::LocationList &InputLL, BaseAddress BaseAddr) const { diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index fc6532c5bb48..14e9f9d372d7 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -1231,6 +1231,9 @@ class BinaryFunction { PCRelativeRelocationOffsets.emplace(Address - getAddress()); } + /// Get data used by this function. + std::set dataUses(bool OnlyHot) const; + /// Return internal section name for this function. 
StringRef getCodeSectionName() const { return StringRef(CodeSectionName); diff --git a/bolt/src/BinaryPassManager.cpp b/bolt/src/BinaryPassManager.cpp index bcdf9df3f50a..145810ddaf28 100644 --- a/bolt/src/BinaryPassManager.cpp +++ b/bolt/src/BinaryPassManager.cpp @@ -20,6 +20,7 @@ #include "Passes/PLTCall.h" #include "Passes/RegReAssign.h" #include "Passes/ReorderFunctions.h" +#include "Passes/ReorderData.h" #include "Passes/StokeInfo.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" @@ -427,6 +428,10 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(llvm::make_unique()); + // Perform reordering on data contained in one or more sections using + // memory profiling data. + Manager.registerPass(llvm::make_unique()); + // This pass should always run last.* Manager.registerPass(llvm::make_unique(PrintFinalized)); diff --git a/bolt/src/BinarySection.cpp b/bolt/src/BinarySection.cpp index 91da8ea02aed..52d27835f412 100644 --- a/bolt/src/BinarySection.cpp +++ b/bolt/src/BinarySection.cpp @@ -10,10 +10,11 @@ //===----------------------------------------------------------------------===// #include "BinarySection.h" +#include "BinaryContext.h" #include "llvm/Support/CommandLine.h" #undef DEBUG_TYPE -#define DEBUG_TYPE "bolt" +#define DEBUG_TYPE "binary-section" using namespace llvm; using namespace bolt; @@ -23,6 +24,11 @@ extern cl::opt PrintRelocations; } BinarySection::~BinarySection() { + if (isReordered()) { + delete[] getData(); + return; + } + if (!isAllocatable() && (!hasSectionRef() || OutputContents.data() != getContents(Section).data())) { @@ -53,3 +59,66 @@ void BinarySection::print(raw_ostream &OS) const { OS << "\n " << R; } } + +std::set BinarySection::reorderRelocations(bool Inplace) const { + assert(PendingRelocations.empty() && + "reodering pending relocations not supported"); + std::set NewRelocations; + for (const auto &Rel : relocations()) { + auto RelAddr = Rel.Offset + getAddress(); + auto *BD = 
BC.getBinaryDataContainingAddress(RelAddr); + BD = BD->getAtomicRoot(); + assert(BD); + + if ((!BD->isMoved() && !Inplace) || BD->isJumpTable()) + continue; + + auto NewRel(Rel); + auto RelOffset = RelAddr - BD->getAddress(); + NewRel.Offset = BD->getOutputOffset() + RelOffset; + assert(NewRel.Offset < getSize()); + DEBUG(dbgs() << "BOLT-DEBUG: moving " << Rel << " -> " << NewRel << "\n"); + auto Res = NewRelocations.emplace(std::move(NewRel)); + (void)Res; + assert(Res.second && "Can't overwrite existing relocation"); + } + return NewRelocations; +} + +void BinarySection::reorderContents(const std::vector &Order, + bool Inplace) { + IsReordered = true; + + Relocations = reorderRelocations(Inplace); + + std::string Str; + raw_string_ostream OS(Str); + auto *Src = Contents.data(); + DEBUG(dbgs() << "BOLT-DEBUG: reorderContents for " << Name << "\n"); + for (auto *BD : Order) { + assert((BD->isMoved() || !Inplace) && !BD->isJumpTable()); + assert(BD->isAtomic() && BD->isMoveable()); + const auto SrcOffset = BD->getAddress() - getAddress(); + assert(SrcOffset < Contents.size()); + assert(SrcOffset == BD->getOffset()); + while (OS.tell() < BD->getOutputOffset()) { + OS.write((unsigned char)0); + } + DEBUG(dbgs() << "BOLT-DEBUG: " << BD->getName() + << " @ " << OS.tell() << "\n"); + OS.write(&Src[SrcOffset], BD->getOutputSize()); + } + if (Relocations.empty()) { + // If there are no existing relocations, tack a phony one at the end + // of the reordered segment to force LLVM to recognize and map this + // section. 
+ auto *ZeroSym = BC.registerNameAtAddress("Zero", 0, 0, 0); + addRelocation(OS.tell(), ZeroSym, ELF::R_X86_64_64, 0xdeadbeef); + + uint64_t Zero = 0; + OS.write(reinterpret_cast(&Zero), sizeof(Zero)); + } + auto *NewData = reinterpret_cast(copyByteArray(OS.str())); + Contents = OutputContents = StringRef(NewData, OS.str().size()); + OutputSize = Contents.size(); +} diff --git a/bolt/src/BinarySection.h b/bolt/src/BinarySection.h index ce3c0ee0c336..3ae1e5ecd886 100644 --- a/bolt/src/BinarySection.h +++ b/bolt/src/BinarySection.h @@ -29,10 +29,14 @@ using namespace object; namespace bolt { +class BinaryContext; +class BinaryData; + /// A class to manage binary sections that also manages related relocations. class BinarySection { friend class BinaryContext; + BinaryContext &BC; // Owning BinaryContext const std::string Name; // Section name const SectionRef Section; // SectionRef (may be null) StringRef Contents; // input section contents @@ -53,14 +57,17 @@ class BinarySection { RelocationSetType PendingRelocations; // Output info - bool IsFinalized{false}; // Has this section had output information - // finalized? - uint64_t FileAddress{0}; // Section address for the rewritten binary. - uint64_t OutputSize{0}; // Section size in the rewritten binary. - uint64_t FileOffset{0}; // File offset in the rewritten binary file. - StringRef OutputContents; // Rewritten section contents. - unsigned SectionID{-1u}; // Unique ID used for address mapping. - // Set by ExecutableFileMemoryManager. + bool IsFinalized{false}; // Has this section had output information + // finalized? + std::string OutputName; // Output section name (if the section has + // been renamed) + uint64_t FileAddress{0}; // Section address for the rewritten binary. + uint64_t OutputSize{0}; // Section size in the rewritten binary. + uint64_t FileOffset{0}; // File offset in the rewritten binary file. + StringRef OutputContents; // Rewritten section contents. 
+ unsigned SectionID{-1u}; // Unique ID used for address mapping. + // Set by ExecutableFileMemoryManager. + mutable bool IsReordered{false}; // Have the contents been reordered? // non-copyable BinarySection(const BinarySection &) = delete; @@ -85,7 +92,12 @@ class BinarySection { return Contents; } - // Set output info for this section. + /// Get the set of relocations refering to data in this section that + /// has been reordered. The relocation offsets will be modified to + /// reflect the new data locations. + std::set reorderRelocations(bool Inplace) const; + + /// Set output info for this section. void update(uint8_t *NewData, uint64_t NewSize, unsigned NewAlignment, @@ -93,18 +105,41 @@ class BinarySection { unsigned NewELFFlags, bool NewIsLocal) { assert(NewAlignment > 0 && "section alignment must be > 0"); - OutputSize = NewSize; Alignment = NewAlignment; ELFType = NewELFType; ELFFlags = NewELFFlags; IsLocal = NewIsLocal || StringRef(Name).startswith(".local."); + OutputSize = NewSize; OutputContents = StringRef(reinterpret_cast(NewData), NewData ? NewSize : 0); IsFinalized = true; } public: - explicit BinarySection(SectionRef Section, bool IsLocal = false) - : Name(getName(Section)), + /// Copy a section. 
+ explicit BinarySection(BinaryContext &BC, + StringRef Name, + const BinarySection &Section, + bool IsLocal = false) + : BC(BC), + Name(Name), + Section(Section.getSectionRef()), + Contents(Section.getContents()), + Address(Section.getAddress()), + Size(Section.getSize()), + Alignment(Section.getAlignment()), + ELFType(Section.getELFType()), + ELFFlags(Section.getELFFlags()), + IsLocal(IsLocal || StringRef(Name).startswith(".local.")), + Relocations(Section.Relocations), + PendingRelocations(Section.PendingRelocations), + OutputName(Name) { + } + + BinarySection(BinaryContext &BC, + SectionRef Section, + bool IsLocal = false) + : BC(BC), + Name(getName(Section)), Section(Section), Contents(getContents(Section)), Address(Section.getAddress()), @@ -113,18 +148,20 @@ class BinarySection { ELFType(ELFSectionRef(Section).getType()), ELFFlags(ELFSectionRef(Section).getFlags()), IsLocal(IsLocal || StringRef(Name).startswith(".local.")), - OutputSize(0) { + OutputName(Name) { } // TODO: pass Data as StringRef/ArrayRef? use StringRef::copy method. - BinarySection(StringRef Name, + BinarySection(BinaryContext &BC, + StringRef Name, uint8_t *Data, uint64_t Size, unsigned Alignment, unsigned ELFType, unsigned ELFFlags, bool IsLocal) - : Name(Name), + : BC(BC), + Name(Name), Contents(reinterpret_cast(Data), Data ? 
Size : 0), Address(0), Size(Size), @@ -133,6 +170,7 @@ class BinarySection { ELFFlags(ELFFlags), IsLocal(IsLocal || Name.startswith(".local.")), IsFinalized(true), + OutputName(Name), OutputSize(Size), OutputContents(Contents) { assert(Alignment > 0 && "section alignment must be > 0"); @@ -218,6 +256,7 @@ class BinarySection { return (ELFFlags & ELF::SHF_ALLOC); } bool isLocal() const { return IsLocal; } + bool isReordered() const { return IsReordered; } unsigned getELFType() const { return ELFType; } unsigned getELFFlags() const { return ELFFlags; } @@ -307,6 +346,7 @@ class BinarySection { bool isFinalized() const { return IsFinalized; } void setIsFinalized() { IsFinalized = true; } + StringRef getOutputName() const { return OutputName; } uint64_t getOutputSize() const { return OutputSize; } uint8_t *getOutputData() { return reinterpret_cast(const_cast(getOutputContents().data())); @@ -339,6 +379,14 @@ class BinarySection { assert(!hasValidSectionID() && "trying to set section id twice"); SectionID = ID; } + void setOutputName(StringRef Name) { + OutputName = Name; + } + + /// Reorder the contents of this section according to /p Order. If + /// /p Inplace is true, the entire contents of the section is reordered, + /// otherwise the new contents contain only the reordered data. 
+ void reorderContents(const std::vector &Order, bool Inplace); void print(raw_ostream &OS) const; }; diff --git a/bolt/src/JumpTable.cpp b/bolt/src/JumpTable.cpp index 410ee8c4964a..582d5cf7dee9 100644 --- a/bolt/src/JumpTable.cpp +++ b/bolt/src/JumpTable.cpp @@ -86,7 +86,7 @@ void JumpTable::updateOriginal() { << Twine::utohexstr(Offset) << " for symbol " << Entry->getName() << " with addend " << Twine::utohexstr(RelAddend) << '\n'); - getSection().addRelocation(Offset, Entry, RelType, RelAddend); + getOutputSection().addRelocation(Offset, Entry, RelType, RelAddend); Offset += EntrySize; } } diff --git a/bolt/src/MCPlusBuilder.h b/bolt/src/MCPlusBuilder.h index 296a9eb02cee..15aab393495a 100644 --- a/bolt/src/MCPlusBuilder.h +++ b/bolt/src/MCPlusBuilder.h @@ -1502,8 +1502,8 @@ class MCPlusBuilder { using ICPdata = std::vector>>; virtual ICPdata indirectCallPromotion( const MCInst &CallInst, - const std::vector>& Targets, - const std::vector &VtableAddrs, + const std::vector> &Targets, + const std::vector> &VtableSyms, const std::vector &MethodFetchInsns, const bool MinimizeCodeSize, MCContext *Ctx diff --git a/bolt/src/Passes/CMakeLists.txt b/bolt/src/Passes/CMakeLists.txt index 355623899681..46903cc5904d 100644 --- a/bolt/src/Passes/CMakeLists.txt +++ b/bolt/src/Passes/CMakeLists.txt @@ -24,6 +24,7 @@ add_llvm_library(LLVMBOLTPasses RegReAssign.cpp ReorderAlgorithm.cpp ReorderFunctions.cpp + ReorderData.cpp ShrinkWrapping.cpp StackAllocationAnalysis.cpp StackAvailableExpressions.cpp diff --git a/bolt/src/Passes/IndirectCallPromotion.cpp b/bolt/src/Passes/IndirectCallPromotion.cpp index 3237620e4ea6..6cb483a62d8f 100644 --- a/bolt/src/Passes/IndirectCallPromotion.cpp +++ b/bolt/src/Passes/IndirectCallPromotion.cpp @@ -392,7 +392,7 @@ IndirectCallPromotion::maybeGetHotJumpTableTargets( // Deal with bad/stale data if (!MI.Addr.Name.startswith("JUMP_TABLE/" + Function.getNames().front())) return JumpTableInfoType(); - Index = MI.Addr.Offset / JT->EntrySize; + Index 
= (MI.Addr.Offset - (ArrayStart - JT->getAddress())) / JT->EntrySize; } else { Index = (MI.Addr.Offset - ArrayStart) / JT->EntrySize; } @@ -539,7 +539,7 @@ IndirectCallPromotion::findCallTargetSymbols( } IndirectCallPromotion::MethodInfoType -IndirectCallPromotion::maybeGetVtableAddrs( +IndirectCallPromotion::maybeGetVtableSyms( BinaryContext &BC, BinaryFunction &Function, BinaryBasicBlock *BB, @@ -547,7 +547,7 @@ IndirectCallPromotion::maybeGetVtableAddrs( const SymTargetsType &SymTargets ) const { const auto *MemData = Function.getMemData(); - std::vector VtableAddrs; + std::vector> VtableSyms; std::vector MethodFetchInsns; unsigned VtableReg, MethodReg; uint64_t MethodOffset; @@ -637,13 +637,16 @@ IndirectCallPromotion::maybeGetVtableAddrs( for (size_t I = 0; I < SymTargets.size(); ++I) { auto Itr = MethodToVtable.find(SymTargets[I].first); if (Itr != MethodToVtable.end()) { - VtableAddrs.push_back(Itr->second); - } else { - // Give up if we can't find the vtable for a method. - DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP can't find vtable for " - << SymTargets[I].first->getName() << "\n"); - return MethodInfoType(); + if (auto *BD = BC.getBinaryDataContainingAddress(Itr->second)) { + const uint64_t Addend = Itr->second - BD->getAddress(); + VtableSyms.push_back(std::make_pair(BD->getSymbol(), Addend)); + continue; + } } + // Give up if we can't find the vtable for a method. 
+ DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP can't find vtable for " + << SymTargets[I].first->getName() << "\n"); + return MethodInfoType(); } // Make sure the vtable reg is not clobbered by the argument passing code @@ -656,7 +659,7 @@ IndirectCallPromotion::maybeGetVtableAddrs( } } - return MethodInfoType(VtableAddrs, MethodFetchInsns); + return MethodInfoType(VtableSyms, MethodFetchInsns); } std::vector> @@ -1345,11 +1348,11 @@ void IndirectCallPromotion::runOnFunctions( MethodInfoType MethodInfo; if (!IsJumpTable) { - MethodInfo = maybeGetVtableAddrs(BC, - Function, - BB, - Inst, - SymTargets); + MethodInfo = maybeGetVtableSyms(BC, + Function, + BB, + Inst, + SymTargets); TotalMethodLoadsEliminated += MethodInfo.first.empty() ? 0 : 1; DEBUG(dbgs() << "BOLT-INFO: ICP " << (!MethodInfo.first.empty() ? "found" : "did not find") diff --git a/bolt/src/Passes/IndirectCallPromotion.h b/bolt/src/Passes/IndirectCallPromotion.h index 95c671199926..b0262bdbcaf4 100644 --- a/bolt/src/Passes/IndirectCallPromotion.h +++ b/bolt/src/Passes/IndirectCallPromotion.h @@ -99,7 +99,8 @@ namespace bolt { /// class IndirectCallPromotion : public BinaryFunctionPass { using BasicBlocksVector = std::vector>; - using MethodInfoType = std::pair, std::vector>; + using MethodInfoType = std::pair>, + std::vector>; using JumpTableInfoType = std::vector>; using SymTargetsType = std::vector>; struct Location { @@ -207,11 +208,11 @@ class IndirectCallPromotion : public BinaryFunctionPass { MCInst &Inst, MCInst *&TargetFetchInst) const; - MethodInfoType maybeGetVtableAddrs(BinaryContext &BC, - BinaryFunction &Function, - BinaryBasicBlock *BB, - MCInst &Inst, - const SymTargetsType &SymTargets) const; + MethodInfoType maybeGetVtableSyms(BinaryContext &BC, + BinaryFunction &Function, + BinaryBasicBlock *BB, + MCInst &Inst, + const SymTargetsType &SymTargets) const; std::vector> rewriteCall(BinaryContext &BC, diff --git a/bolt/src/Passes/ReorderData.cpp b/bolt/src/Passes/ReorderData.cpp new file 
mode 100644 index 000000000000..48b979dd3659 --- /dev/null +++ b/bolt/src/Passes/ReorderData.cpp @@ -0,0 +1,475 @@ +//===--- ReorderSection.cpp - Profile based reordering of section data =======// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +// TODO: +// - make sure writeable data isn't put on same cache line unless temporally local +// - estimate temporal locality by looking at CFG? + +#include "ReorderData.h" +#include +#include + +#undef DEBUG_TYPE +#define DEBUG_TYPE "reorder-data" + +using namespace llvm; +using namespace bolt; + +namespace opts { +extern cl::OptionCategory BoltCategory; +extern cl::OptionCategory BoltOptCategory; +extern cl::opt JumpTables; + +static cl::opt +PrintReorderedData("print-reordered-data", + cl::desc("print section contents after reordering"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); + +cl::list +ReorderData("reorder-data", + cl::CommaSeparated, + cl::desc("list of sections to reorder"), + cl::value_desc("section1,section2,section3,..."), + cl::cat(BoltOptCategory)); + +enum ReorderAlgo : char { + REORDER_COUNT = 0, + REORDER_FUNCS = 1 +}; + +static cl::opt +ReorderAlgorithm("reorder-data-algo", + cl::desc("algorithm used to reorder data sections"), + cl::init(REORDER_COUNT), + cl::values( + clEnumValN(REORDER_COUNT, + "count", + "sort hot data by read counts"), + clEnumValN(REORDER_FUNCS, + "funcs", + "sort hot data by hot function usage and count")), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +ReorderDataMaxSymbols("reorder-data-max-symbols", + cl::desc("maximum number of symbols to reorder"), + cl::ZeroOrMore, + cl::init(std::numeric_limits::max()), + cl::cat(BoltOptCategory)); + +static cl::opt 
+ReorderDataMaxBytes("reorder-data-max-bytes", + cl::desc("maximum number of bytes to reorder"), + cl::ZeroOrMore, + cl::init(std::numeric_limits::max()), + cl::cat(BoltOptCategory)); + +static cl::list +ReorderSymbols("reorder-symbols", + cl::CommaSeparated, + cl::desc("list of symbol names that can be reordered"), + cl::value_desc("symbol1,symbol2,symbol3,..."), + cl::Hidden, + cl::cat(BoltCategory)); + +static cl::list +SkipSymbols("reorder-skip-symbols", + cl::CommaSeparated, + cl::desc("list of symbol names that cannot be reordered"), + cl::value_desc("symbol1,symbol2,symbol3,..."), + cl::Hidden, + cl::cat(BoltCategory)); + +static cl::opt +ReorderInplace("reorder-data-inplace", + cl::desc("reorder data sections in place"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +} + +namespace llvm { +namespace bolt { + +namespace { + +static constexpr uint16_t MinAlignment = 16; + +bool isSupported(const BinarySection &BS) { + return BS.isData() && !BS.isTLS(); +} + +bool filterSymbol(const BinaryData *BD) { + if (!BD->isAtomic() || BD->isJumpTable() || !BD->isMoveable()) + return false; + + bool IsValid = true; + + if (!opts::ReorderSymbols.empty()) { + IsValid = false; + for (auto &Name : opts::ReorderSymbols) { + if (BD->hasName(Name)) { + IsValid = true; + break; + } + } + } + + if (!IsValid) + return false; + + if (!opts::SkipSymbols.empty()) { + for (auto &Name : opts::SkipSymbols) { + if (BD->hasName(Name)) { + IsValid = false; + break; + } + } + } + + return IsValid; +} + +} + +using DataOrder = ReorderData::DataOrder; + +void ReorderData::printOrder(const BinarySection &Section, + DataOrder::const_iterator Begin, + DataOrder::const_iterator End) const { + uint64_t TotalSize = 0; + bool PrintHeader = false; + while (Begin != End) { + const auto *BD = Begin->first; + + if (!PrintHeader) { + outs() << "BOLT-INFO: Hot global symbols for " + << Section.getName() << ":\n"; + PrintHeader = true; + } + + outs() << "BOLT-INFO: " << *BD << ", 
moveable=" << BD->isMoveable() + << format(", weight=%.5f\n", double(Begin->second)/BD->getSize()); + + TotalSize += BD->getSize(); + ++Begin; + } + if (TotalSize) + outs() << "BOLT-INFO: Total hot symbol size = " << TotalSize << "\n"; +} + +DataOrder ReorderData::baseOrder(BinaryContext &BC, + const BinarySection &Section) const { + DataOrder Order; + for (auto &Entry : BC.getBinaryDataForSection(Section)) { + auto *BD = Entry.second; + if (!BD->isAtomic()) // skip sub-symbols + continue; + const auto Total = std::accumulate(BD->memData().begin(), + BD->memData().end(), + 0, + [](uint64_t Sum, const MemInfo& MI) { + return Sum + MI.Count; + }); + Order.push_back(std::make_pair(BD, Total)); + } + return Order; +} + +/// Only consider moving data that is used by the hottest functions with +/// valid profiles. +std::pair ReorderData::sortedByFunc( + BinaryContext &BC, + const BinarySection &Section, + std::map &BFs +) const { + std::map> BDtoFunc; + std::map BDtoFuncCount; + + for (auto &Entry : BFs) { + auto &BF = Entry.second; + if (BF.hasValidProfile()) { + for (auto *BD : BF.dataUses(true)) { + if (!BC.getFunctionForSymbol(BD->getSymbol())) { + BDtoFunc[BD->getAtomicRoot()].insert(&BF); + BDtoFuncCount[BD->getAtomicRoot()] += BF.getKnownExecutionCount(); + } + } + } + } + + DataOrder Order = baseOrder(BC, Section); + unsigned SplitPoint = Order.size(); + + std::sort(Order.begin(), Order.end(), + [&](const DataOrder::value_type &A, + const DataOrder::value_type &B) { + // Total execution counts of functions referencing BD. + const auto ACount = BDtoFuncCount[A.first]; + const auto BCount = BDtoFuncCount[B.first]; + // Weight by number of loads/data size. 
+ const auto AWeight = double(A.second) / A.first->getSize(); + const auto BWeight = double(B.second) / B.first->getSize(); + return (ACount > BCount || + (ACount == BCount && + (AWeight > BWeight || + (AWeight == BWeight && + A.first->getAddress() < B.first->getAddress())))); + }); + + for (unsigned Idx = 0; Idx < Order.size(); ++Idx) { + if (!BDtoFuncCount[Order[Idx].first]) { + SplitPoint = Idx; + break; + } + } + + return std::make_pair(Order, SplitPoint); +} + +std::pair ReorderData::sortedByCount( + BinaryContext &BC, + const BinarySection &Section +) const { + auto Order = baseOrder(BC, Section); + unsigned SplitPoint = Order.size(); + + std::sort(Order.begin(), Order.end(), + [](const DataOrder::value_type &A, + const DataOrder::value_type &B) { + // Weight by number of loads/data size. + const auto AWeight = double(A.second) / A.first->getSize(); + const auto BWeight = double(B.second) / B.first->getSize(); + return (AWeight > BWeight || + (AWeight == BWeight && + (A.first->getSize() < B.first->getSize() || + (A.first->getSize() == B.first->getSize() && + A.first->getAddress() < B.first->getAddress())))); + }); + + for (unsigned Idx = 0; Idx < Order.size(); ++Idx) { + if (!Order[Idx].second) { + SplitPoint = Idx; + break; + } + } + + return std::make_pair(Order, SplitPoint); +} + +// TODO +// add option for cache-line alignment (or just use cache-line when section +// is writeable)? 
+void ReorderData::setSectionOrder(BinaryContext &BC, + BinarySection &OutputSection, + DataOrder::iterator Begin, + DataOrder::iterator End) { + std::vector NewOrder; + unsigned NumReordered = 0; + uint64_t Offset = 0; + uint64_t Count = 0; + + // Get the total count just for stats + uint64_t TotalCount = 0; + for (auto Itr = Begin; Itr != End; ++Itr) { + TotalCount += Itr->second; + } + + DEBUG(dbgs() << "BOLT-DEBUG: setSectionOrder for " + << OutputSection.getName() << "\n"); + + for (; Begin != End; ++Begin) { + auto *BD = Begin->first; + + // we can't move certain symbols because they are screwy, see T25076484. + if (!filterSymbol(BD)) + continue; + + ++NumReordered; + if (NumReordered > opts::ReorderDataMaxSymbols) { + if (!NewOrder.empty()) { + dbgs() << "BOLT-DEBUG: processing ending on symbol " + << *NewOrder.back() << "\n"; + } + break; + } + + auto Alignment = std::max(BD->getAlignment(), MinAlignment); + Offset = alignTo(Offset, Alignment); + + if ((Offset + BD->getSize()) > opts::ReorderDataMaxBytes) { + if (!NewOrder.empty()) { + dbgs() << "BOLT-DEBUG: processing ending on symbol " + << *NewOrder.back() << "\n"; + } + break; + } + + DEBUG(dbgs() << "BOLT-DEBUG: " << BD->getName() << " @ 0x" + << Twine::utohexstr(Offset) << "\n"); + + BD->setOutputLocation(OutputSection, Offset); + + // reorder sub-symbols + for (auto &SubBD : BC.getSubBinaryData(BD)) { + if (!SubBD.second->isJumpTable()) { + auto SubOffset = Offset + SubBD.second->getAddress() - BD->getAddress(); + DEBUG(dbgs() << "BOLT-DEBUG: SubBD " << SubBD.second->getName() + << " @ " << SubOffset << "\n"); + SubBD.second->setOutputLocation(OutputSection, SubOffset); + } + } + + Offset += BD->getSize(); + Count += Begin->second; + NewOrder.push_back(BD); + } + + OutputSection.reorderContents(NewOrder, opts::ReorderInplace); + + outs() << "BOLT-INFO: reorder-data: " << Count << "/" << TotalCount + << format(" (%.1f%%)", 100.0*Count/TotalCount) << " events, " + << Offset << " hot bytes\n"; +} + 
+bool ReorderData::markUnmoveableSymbols(BinaryContext &BC, + BinarySection &Section) const { + // Private symbols currently can't be moved because data can "leak" across + // the boundary of one symbol to the next, e.g. a string that has a common + // suffix might start in one private symbol and end with the common + // suffix in another. + auto isPrivate = [&](const BinaryData *BD) { + auto Prefix = std::string("PG") + BC.AsmInfo->getPrivateGlobalPrefix(); + return BD->getName().startswith(Prefix.str()); + }; + auto Range = BC.getBinaryDataForSection(Section); + bool FoundUnmoveable = false; + for (auto Itr = Range.begin(); Itr != Range.end(); ++Itr) { + if (Itr->second->getName().startswith("PG.")) { + auto *Prev = Itr != Range.begin() ? std::prev(Itr)->second : nullptr; + auto *Next = Itr != Range.end() ? std::next(Itr)->second : nullptr; + auto PrevIsPrivate = Prev && isPrivate(Prev); + auto NextIsPrivate = Next && isPrivate(Next); + if (isPrivate(Itr->second) && (PrevIsPrivate || NextIsPrivate)) + Itr->second->setIsMoveable(false); + } else { + // check for overlapping symbols. + auto *Next = Itr != Range.end() ? 
std::next(Itr)->second : nullptr; + if (Next && + Itr->second->getEndAddress() != Next->getAddress() && + Next->containsAddress(Itr->second->getEndAddress())) { + Itr->second->setIsMoveable(false); + Next->setIsMoveable(false); + } + } + FoundUnmoveable |= !Itr->second->isMoveable(); + } + return FoundUnmoveable; +} + +void ReorderData::runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) { + static const char* DefaultSections[] = { + ".rodata", + ".data", + ".bss", + nullptr + }; + + if (!BC.HasRelocations || opts::ReorderData.empty()) + return; + + // For now + if (opts::JumpTables > JTS_BASIC) { + outs() << "BOLT-WARNING: jump table support must be basic for " + << "data reordering to work.\n"; + return; + } + + std::vector Sections; + + for (auto &SectionName : opts::ReorderData) { + if (SectionName == "default") { + for (unsigned I = 0; DefaultSections[I]; ++I) { + if (auto Section = BC.getUniqueSectionByName(DefaultSections[I])) + Sections.push_back(&*Section); + } + continue; + } + + auto Section = BC.getUniqueSectionByName(SectionName); + if (!Section) { + outs() << "BOLT-WARNING: Section " << SectionName << " not found, skipping.\n"; + continue; + } + + if (!isSupported(*Section)) { + outs() << "BOLT-ERROR: Section " << SectionName << " not supported.\n"; + exit(1); + } + + Sections.push_back(&*Section); + } + + for (auto *Section : Sections) { + const bool FoundUnmoveable = markUnmoveableSymbols(BC, *Section); + + DataOrder Order; + unsigned SplitPointIdx; + + if (opts::ReorderAlgorithm == opts::ReorderAlgo::REORDER_COUNT) { + outs() << "BOLT-INFO: reorder-sections: ordering data by count\n"; + std::tie(Order, SplitPointIdx) = sortedByCount(BC, *Section); + } else { + outs() << "BOLT-INFO: reorder-sections: ordering data by funcs\n"; + std::tie(Order, SplitPointIdx) = sortedByFunc(BC, *Section, BFs); + } + auto SplitPoint = Order.begin() + SplitPointIdx; + + if (opts::PrintReorderedData) { + printOrder(*Section, Order.begin(), 
SplitPoint); + } + + if (!opts::ReorderInplace || FoundUnmoveable) { + if (opts::ReorderInplace && FoundUnmoveable) { + outs() << "BOLT-INFO: Found unmoveable symbols in " + << Section->getName() << " falling back to splitting " + << "instead of in-place reordering.\n"; + } + + // Copy original section to
.cold. + auto &Cold = BC.registerSection(std::string(Section->getName()) + ".cold", + *Section); + + // Reorder contents of original section. + setSectionOrder(BC, *Section, Order.begin(), SplitPoint); + + // This keeps the original data from thinking it has been moved. + for (auto &Entry : BC.getBinaryDataForSection(*Section)) { + if (!Entry.second->isMoved()) { + Entry.second->setSection(Cold); + Entry.second->setOutputSection(Cold); + } + } + } else { + outs() << "BOLT-WARNING: Inplace section reordering not supported yet.\n"; + setSectionOrder(BC, *Section, Order.begin(), Order.end()); + } + } +} + +} +} diff --git a/bolt/src/Passes/ReorderData.h b/bolt/src/Passes/ReorderData.h new file mode 100644 index 000000000000..44698d65f5f0 --- /dev/null +++ b/bolt/src/Passes/ReorderData.h @@ -0,0 +1,68 @@ +//===--- ReorderSection.h - Profile based reordering of section data =========// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_DATA_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_REORDER_DATA_H + +#include "BinaryPasses.h" +#include "BinarySection.h" + +namespace llvm { +namespace bolt { + +class ReorderData : public BinaryFunctionPass { +public: + using DataOrder = std::vector>; + +private: + DataOrder baseOrder(BinaryContext &BC, + const BinarySection &Section) const; + + /// Sort symbols by memory profiling data execution count. The output + /// is a vector of [address,count] pairs. 
+ std::pair + sortedByCount(BinaryContext &BC, const BinarySection &Section) const; + + std::pair + sortedByFunc(BinaryContext &BC, + const BinarySection &Section, + std::map &BFs) const; + + void printOrder(const BinarySection &Section, + DataOrder::const_iterator Begin, + DataOrder::const_iterator End) const; + + /// Set the ordering of the section with \p SectionName. \p NewOrder is a + /// vector of [old address, size] pairs. The new symbol order is implicit + /// in the order of the vector. + void setSectionOrder(BinaryContext &BC, + BinarySection &OutputSection, + DataOrder::iterator Begin, + DataOrder::iterator End); + + bool markUnmoveableSymbols(BinaryContext &BC, + BinarySection &Section) const; +public: + explicit ReorderData() : BinaryFunctionPass(false) {} + + const char *getName() const override { + return "reorder-data"; + } + + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index e0b602ab4d28..bdf5443577f9 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -83,6 +83,7 @@ extern cl::OptionCategory AggregatorCategory; extern cl::opt AlignMacroOpFusion; extern cl::opt JumpTables; +extern cl::list ReorderData; static cl::opt ForceToDataRelocations("force-data-relocations", @@ -92,6 +93,16 @@ ForceToDataRelocations("force-data-relocations", cl::ZeroOrMore, cl::cat(BoltCategory)); +// Note: enabling this is liable to make things break. 
+static cl::opt +AllowSectionRelocations("allow-section-relocations", + cl::desc("allow reordering of data referenced by section relocations " + "(experimental)"), + cl::init(false), + cl::Hidden, + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + static cl::opt PrintCacheMetrics("print-cache-metrics", cl::desc("calculate and print various metrics for instruction cache"), @@ -175,6 +186,19 @@ HotText("hot-text", cl::ZeroOrMore, cl::cat(BoltCategory)); +static cl::opt +HotData("hot-data", + cl::desc("hot data symbols support (relocation mode)"), + cl::ZeroOrMore, + cl::cat(BoltCategory)); + +static cl::opt +UpdateEnd("update-end", + cl::desc("update the _end symbol to point to the end of all data sections"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(BoltCategory)); + static cl::opt KeepTmp("keep-tmp", cl::desc("preserve intermediate .o file"), @@ -500,12 +524,24 @@ void check_error(std::error_code EC, StringRef Message) { } namespace { + std::string uniquifyName(BinaryContext &BC, std::string NamePrefix) { unsigned LocalID = 1; while (BC.getBinaryDataByName(NamePrefix + std::to_string(LocalID))) ++LocalID; return NamePrefix + std::to_string(LocalID); } + +bool refersToReorderedSection(ErrorOr Section) { + auto Itr = std::find_if(opts::ReorderData.begin(), + opts::ReorderData.end(), + [&](const std::string &SectionName) { + return (Section && + Section->getName() == SectionName); + }); + return Itr != opts::ReorderData.end(); +} + } uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size, @@ -1210,13 +1246,15 @@ void RewriteInstance::discoverFileObjects() { uint64_t NextAddress = getNextAddress(ISym); uint64_t TentativeSize = !SymbolSize ? NextAddress - Address : SymbolSize; uint64_t SymbolAlignment = Symbol.getAlignment(); + unsigned SymbolFlags = Symbol.getFlags(); auto registerName = [&](uint64_t FinalSize) { // Register names even if it's not a function, e.g. for an entry point. 
- BC->registerNameAtAddress(UniqueName, Address, FinalSize, SymbolAlignment); + BC->registerNameAtAddress(UniqueName, Address, FinalSize, + SymbolAlignment, SymbolFlags); if (!AlternativeName.empty()) BC->registerNameAtAddress(AlternativeName, Address, FinalSize, - SymbolAlignment); + SymbolAlignment, SymbolFlags); }; section_iterator Section = @@ -1679,6 +1717,15 @@ BinaryFunction *RewriteInstance::createBinaryFunction( return BF; } +ArrayRef RewriteInstance::getLSDAData() { + return ArrayRef(LSDASection->getData(), + LSDASection->getContents().size()); +} + +uint64_t RewriteInstance::getLSDAAddress() { + return LSDASection->getAddress(); +} + void RewriteInstance::readSpecialSections() { NamedRegionTimer T("readSpecialSections", "read special sections", TimerGroupName, TimerGroupDesc, opts::TimeRewrite); @@ -1689,27 +1736,9 @@ void RewriteInstance::readSpecialSections() { for (const auto &Section : InputFile->sections()) { StringRef SectionName; check_error(Section.getName(SectionName), "cannot get section name"); - StringRef SectionContents; - ArrayRef SectionData; - if (ELFSectionRef(Section).getType() != ELF::SHT_NOBITS) { - check_error(Section.getContents(SectionContents), - "cannot get section contents"); - SectionData = ArrayRef( - reinterpret_cast(SectionContents.data()), - Section.getSize()); - } - - if (SectionName == ".gcc_except_table") { - LSDAData = SectionData; - LSDAAddress = Section.getAddress(); - } else if (SectionName == ".debug_loc") { - DebugLocSize = Section.getSize(); - } else if (SectionName == ".rela.text") { - HasTextRelocations = true; - } // Only register sections with names. 
- if (!getSectionName(Section).empty()) { + if (!SectionName.empty()) { BC->registerSection(Section); DEBUG(dbgs() << "BOLT-DEBUG: registering section " << SectionName << " @ 0x" << Twine::utohexstr(Section.getAddress()) << ":0x" @@ -1718,6 +1747,15 @@ void RewriteInstance::readSpecialSections() { } } + HasTextRelocations = (bool)BC->getUniqueSectionByName(".rela.text"); + LSDASection = BC->getUniqueSectionByName(".gcc_except_table"); + EHFrameSection = BC->getUniqueSectionByName(".eh_frame"); + GdbIndexSection = BC->getUniqueSectionByName(".gdb_index"); + PLTSection = BC->getUniqueSectionByName(".plt"); + GOTPLTSection = BC->getUniqueSectionByName(".got.plt"); + PLTGOTSection = BC->getUniqueSectionByName(".plt.got"); + RelaPLTSection = BC->getUniqueSectionByName(".rela.plt"); + if (opts::PrintSections) { outs() << "BOLT-INFO: Sections from original binary:\n"; BC->printSections(outs()); @@ -1728,13 +1766,6 @@ void RewriteInstance::readSpecialSections() { BC->printSections(outs()); } - EHFrameSection = BC->getUniqueSectionByName(".eh_frame"); - GdbIndexSection = BC->getUniqueSectionByName(".gdb_index"); - PLTSection = BC->getUniqueSectionByName(".plt"); - GOTPLTSection = BC->getUniqueSectionByName(".got.plt"); - PLTGOTSection = BC->getUniqueSectionByName(".plt.got"); - RelaPLTSection = BC->getUniqueSectionByName(".rela.plt"); - if (opts::RelocationMode == cl::BOU_TRUE && !HasTextRelocations) { errs() << "BOLT-ERROR: relocations against code are missing from the input " "file. 
Cannot proceed in relocations mode (-relocs).\n"; @@ -2055,8 +2086,11 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { const auto Address = SymbolAddress + Addend; const bool ForceRelocation = (opts::HotText && (SymbolName == "__hot_start" || - SymbolName == "__hot_end")) - || Rel.getType() == ELF::R_AARCH64_ADR_GOT_PAGE; + SymbolName == "__hot_end")) || + (opts::HotData && (SymbolName == "__hot_data_start" || + SymbolName == "__hot_data_end")) || + SymbolName == "_end" || + Rel.getType() == ELF::R_AARCH64_ADR_GOT_PAGE; DEBUG( dbgs() << "BOLT-DEBUG: "; @@ -2090,7 +2124,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { ContainingBF->addPCRelativeRelocationAddress(Rel.getOffset()); } DEBUG(dbgs() << "BOLT-DEBUG: not creating PC-relative relocation at 0x" - << Twine::utohexstr(Rel.getOffset()) + << Twine::utohexstr(Rel.getOffset()) << " for " << SymbolName << "\n"); continue; } @@ -2103,6 +2137,8 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { } const bool IsToCode = RefSection && RefSection->isText(); + const bool IsSectionRelocation = + (cantFail(Rel.getSymbol()->getType()) == SymbolRef::ST_Debug); // Occasionally we may see a reference past the last byte of the function // typically as a result of __builtin_unreachable(). Check it here. 
@@ -2121,9 +2157,9 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { } else if (ReferencedBF) { RefFunctionOffset = Address - ReferencedBF->getAddress(); DEBUG(dbgs() << " referenced function " << *ReferencedBF; - if (Address != ReferencedBF->getAddress()) - dbgs() << " at offset 0x" - << Twine::utohexstr(RefFunctionOffset); + if (Address != ReferencedBF->getAddress()) { + dbgs() << " at offset 0x" << Twine::utohexstr(RefFunctionOffset); + } dbgs() << '\n'); if (RefFunctionOffset) { ReferencedSymbol = @@ -2150,20 +2186,94 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { Addend = 0; } - if (auto *BD = BC->getBinaryDataContainingAddress(SymbolAddress)) { + // This function makes sure that symbols referenced by ambiguous + // relocations are marked as unmoveable. For now, if a section + // relocation points at the boundary between two symbols then + // those symbols are marked as unmoveable. + auto markAmbiguousRelocations = [&](BinaryData *BD) { + if (Address == BD->getAddress()) { + BD = BD->getAtomicRoot(); + DEBUG(if (BD->isMoveable()) { + dbgs() << "BOLT-DEBUG: setting " << *BD << " as unmoveable " + << "due to ambiguous relocation (0x" + << Twine::utohexstr(Address) << ") @ 0x" + << Twine::utohexstr(Rel.getOffset()) << "\n"; + }); + BD->setIsMoveable(false); + + // set previous symbol as unmoveable + auto *Prev = BC->getBinaryDataContainingAddress(Address-1); + if (Prev && Prev->getEndAddress() == BD->getAddress()) { + Prev = Prev->getAtomicRoot(); + DEBUG(if (Prev->isMoveable()) { + dbgs() << "BOLT-DEBUG: setting " << *Prev << " as unmoveable " + << "due to ambiguous relocation (0x" + << Twine::utohexstr(Address) << ") @ 0x" + << Twine::utohexstr(Rel.getOffset()) << "\n"; + }); + Prev->setIsMoveable(false); + } + } + + if (Address == BD->getEndAddress()) { + BD = BD->getAtomicRoot(); + DEBUG(if (BD->isMoveable()) { + dbgs() << "BOLT-DEBUG: setting " << *BD << " as unmoveable " + << "due to ambiguous relocation (0x" + << 
Twine::utohexstr(Address) << ") @ 0x" + << Twine::utohexstr(Rel.getOffset()) << "\n"; + }); + BD->setIsMoveable(false); + + // set next symbol as unmoveable + auto *Next = BC->getBinaryDataContainingAddress(BD->getEndAddress()); + if (Next && Next->getAddress() == BD->getEndAddress()) { + Next = Next->getAtomicRoot(); + DEBUG(if (Next->isMoveable()) { + dbgs() << "BOLT-DEBUG: setting " << *Next << " as unmoveable " + << "due to ambiguous relocation (0x" + << Twine::utohexstr(Address) << ") @ 0x" + << Twine::utohexstr(Rel.getOffset()) << "\n"; + }); + Next->setIsMoveable(false); + } + } + }; + + // If we are allowing section relocations, we assign relocations + // that are pointing to the end of a symbol to that symbol rather + // than the following symbol. + const auto IncludeEnd = + opts::AllowSectionRelocations && IsSectionRelocation; + + if (auto *BD = BC->getBinaryDataContainingAddress(SymbolAddress, + IncludeEnd)) { + assert(!IncludeEnd || + (BD == BC->getBinaryDataContainingAddress(SymbolAddress) || + !BC->getBinaryDataContainingAddress(SymbolAddress) || + (IsSectionRelocation && BD->getEndAddress() == + BC->getBinaryDataContainingAddress(SymbolAddress)->getAddress()))); + // Note: this assertion is trying to check sanity of BinaryData objects // but AArch64 has inferred and incomplete object locations coming from // GOT/TLS or any other non-trivial relocation (that requires creation // of sections and whose symbol address is not really what should be // encoded in the instruction). So we essentially disabled this check // for AArch64 and live with bogus names for objects. 
- assert(IsAArch64 || - cantFail(Rel.getSymbol()->getType()) == SymbolRef::ST_Debug || - BD->nameStartsWith(SymbolName) || - BD->nameStartsWith("PG" + SymbolName) || - ((BD->nameStartsWith("ANONYMOUS")) && - (BD->getSectionName().startswith(".plt") || - BD->getSectionName().endswith(".plt")))); + assert((IsAArch64 || + IsSectionRelocation || + BD->nameStartsWith(SymbolName) || + BD->nameStartsWith("PG" + SymbolName) || + (BD->nameStartsWith("ANONYMOUS") && + (BD->getSectionName().startswith(".plt") || + BD->getSectionName().endswith(".plt")))) && + "BOLT symbol names of all non-section relocations must match " + "up with symbol names referenced in the relocation"); + + if (!opts::AllowSectionRelocations && IsSectionRelocation) { + markAmbiguousRelocations(BD); + } + ReferencedSymbol = BD->getSymbol(); Addend += (SymbolAddress - BD->getAddress()); SymbolAddress = BD->getAddress(); @@ -2172,14 +2282,15 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { auto Symbol = *Rel.getSymbol(); // These are mostly local data symbols but undefined symbols // in relocation sections can get through here too, from .plt. - assert(IsAArch64 || - cantFail(Symbol.getType()) == SymbolRef::ST_Debug || - BC->getSectionForAddress(SymbolAddress) - ->getName() - .startswith(".plt")); + assert((IsAArch64 || + IsSectionRelocation || + BC->getSectionNameForAddress(SymbolAddress)->startswith(".plt")) + && "known symbols should not resolve to anonymous locals"); + const uint64_t SymbolSize = IsAArch64 ? 0 : ELFSymbolRef(Symbol).getSize(); const uint64_t SymbolAlignment = IsAArch64 ? 
1 : Symbol.getAlignment(); + const unsigned SymbolFlags = Symbol.getFlags(); if (cantFail(Symbol.getType()) != SymbolRef::ST_Debug) { std::string Name; @@ -2194,12 +2305,19 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { ReferencedSymbol = BC->registerNameAtAddress(Name, SymbolAddress, SymbolSize, - SymbolAlignment); + SymbolAlignment, + SymbolFlags); } else { ReferencedSymbol = BC->getOrCreateGlobalSymbol(SymbolAddress, SymbolSize, SymbolAlignment, - "SYMBOLat"); + "SYMBOLat", + SymbolFlags); + } + + if (!opts::AllowSectionRelocations && IsSectionRelocation) { + auto *BD = BC->getBinaryDataByName(ReferencedSymbol->getName()); + markAmbiguousRelocations(BD); } } } @@ -2223,6 +2341,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { if (IsFromCode) { if (ReferencedBF || ForceRelocation || IsAArch64 || + refersToReorderedSection(RefSection) || (opts::ForceToDataRelocations && checkMaxDataRelocations())) { ContainingBF->addRelocation(Rel.getOffset(), ReferencedSymbol, @@ -2235,7 +2354,8 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { } } else if (IsToCode) { BC->addRelocation(Rel.getOffset(), ReferencedSymbol, Rel.getType(), Addend); - } else if (opts::ForceToDataRelocations && checkMaxDataRelocations()) { + } else if (refersToReorderedSection(RefSection) || + (opts::ForceToDataRelocations && checkMaxDataRelocations())) { BC->addRelocation(Rel.getOffset(), ReferencedSymbol, Rel.getType(), @@ -2441,7 +2561,7 @@ void RewriteInstance::disassembleFunctions() { // Parse LSDA. if (Function.getLSDAAddress() != 0) - Function.parseLSDA(LSDAData, LSDAAddress); + Function.parseLSDA(getLSDAData(), getLSDAAddress()); if (!Function.buildCFG()) continue; @@ -2746,6 +2866,11 @@ void RewriteInstance::emitFunctions() { emitDataSection(Streamer.get(), *EHFrameSection, ".eh_frame_old"); } + // Update _end if needed. 
+ if (opts::UpdateEnd) { + Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("_end")); + } + Streamer->Finish(); ////////////////////////////////////////////////////////////////////////////// @@ -2767,8 +2892,12 @@ void RewriteInstance::emitFunctions() { auto Resolver = orc::createLegacyLookupResolver( [&](const std::string &Name) -> JITSymbol { DEBUG(dbgs() << "BOLT: looking for " << Name << "\n"); - if (auto *I = BC->getBinaryDataByName(Name)) - return JITSymbol(I->getAddress(), JITSymbolFlags()); + if (auto *I = BC->getBinaryDataByName(Name)) { + const uint64_t Address = I->isMoved() && !I->isJumpTable() + ? I->getOutputAddress() + : I->getAddress(); + return JITSymbol(Address, JITSymbolFlags()); + } return JITSymbol(nullptr); }, [](Error Err) { cantFail(std::move(Err), "lookup failed"); }); @@ -2817,6 +2946,11 @@ void RewriteInstance::emitFunctions() { } void RewriteInstance::mapFileSections(orc::VModuleKey Key) { + mapTextSections(Key); + mapDataSections(Key); +} + +void RewriteInstance::mapTextSections(orc::VModuleKey Key) { NewTextSectionStartAddress = NextAvailableAddress; if (BC->HasRelocations) { auto TextSection = BC->getUniqueSectionByName(".text"); @@ -2882,14 +3016,12 @@ void RewriteInstance::mapFileSections(orc::VModuleKey Key) { if (opts::JumpTables == JTS_BASIC) { for (auto &JTI : Function.JumpTables) { auto *JT = JTI.second; - auto Section = BC->getUniqueSectionByName(JT->getOutputSection()); - assert(Section && "cannot find section for jump table"); - JT->setSection(*Section); - Section->setFileAddress(JT->getAddress()); - DEBUG(dbgs() << "BOLT-DEBUG: mapping " << Section->getName() + auto &Section = JT->getOutputSection(); + Section.setFileAddress(JT->getAddress()); + DEBUG(dbgs() << "BOLT-DEBUG: mapping " << Section.getName() << " to 0x" << Twine::utohexstr(JT->getAddress()) << '\n'); - OLT->mapSectionAddress(Key, Section->getSectionID(), + OLT->mapSectionAddress(Key, Section.getSectionID(), JT->getAddress()); } } @@ -2948,7 +3080,9 @@ void 
RewriteInstance::mapFileSections(orc::VModuleKey Key) { getFileOffsetForAddress(NewTextSectionStartAddress)); } } +} +void RewriteInstance::mapDataSections(orc::VModuleKey Key) { // Map special sections to their addresses in the output image. // These are the sections that we generate via MCStreamer. // The order is important. @@ -2964,6 +3098,8 @@ void RewriteInstance::mapFileSections(orc::VModuleKey Key) { DEBUG(dbgs() << "BOLT: mapping section " << SectionName << " (0x" << Twine::utohexstr(Section->getAllocAddress()) << ") to 0x" << Twine::utohexstr(NextAvailableAddress) + << ":0x" << Twine::utohexstr(NextAvailableAddress + + Section->getOutputSize()) << '\n'); OLT->mapSectionAddress(Key, Section->getSectionID(), NextAvailableAddress); @@ -3096,8 +3232,8 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { void RewriteInstance::emitDataSection(MCStreamer *Streamer, const BinarySection &Section, - StringRef Name) { - StringRef SectionName = !Name.empty() ? Name : Section.getName(); + StringRef NewName) { + StringRef SectionName = !NewName.empty() ? NewName : Section.getName(); StringRef SectionContents = Section.getContents(); auto *ELFSection = BC->Ctx->getELFSection(SectionName, Section.getELFType(), @@ -3106,36 +3242,41 @@ void RewriteInstance::emitDataSection(MCStreamer *Streamer, Streamer->SwitchSection(ELFSection); Streamer->EmitValueToAlignment(Section.getAlignment()); + if (BC->HasRelocations && opts::HotData && Section.isReordered()) + Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("__hot_data_start")); + DEBUG(dbgs() << "BOLT-DEBUG: emitting " << (Section.isAllocatable() ? 
"" : "non-") << "allocatable data section " << SectionName << '\n'); if (!Section.hasRelocations()) { Streamer->EmitBytes(SectionContents); - return; - } - - uint64_t SectionOffset = 0; - for (auto &Relocation : Section.relocations()) { - assert(Relocation.Offset < Section.getSize() && "overflow detected"); - if (SectionOffset < Relocation.Offset) { - Streamer->EmitBytes( - SectionContents.substr(SectionOffset, - Relocation.Offset - SectionOffset)); - SectionOffset = Relocation.Offset; + } else { + uint64_t SectionOffset = 0; + for (auto &Relocation : Section.relocations()) { + assert(Relocation.Offset < SectionContents.size() && "overflow detected"); + if (SectionOffset < Relocation.Offset) { + Streamer->EmitBytes( + SectionContents.substr(SectionOffset, + Relocation.Offset - SectionOffset)); + SectionOffset = Relocation.Offset; + } + DEBUG(dbgs() << "BOLT-DEBUG: emitting relocation for symbol " + << Relocation.Symbol->getName() << " at offset 0x" + << Twine::utohexstr(Relocation.Offset) + << " with size " + << Relocation::getSizeForType(Relocation.Type) << '\n'); + auto RelocationSize = Relocation.emit(Streamer); + SectionOffset += RelocationSize; + } + assert(SectionOffset <= SectionContents.size() && "overflow error"); + if (SectionOffset < SectionContents.size()) { + Streamer->EmitBytes(SectionContents.substr(SectionOffset)); } - DEBUG(dbgs() << "BOLT-DEBUG: emitting relocation for symbol " - << Relocation.Symbol->getName() << " at offset 0x" - << Twine::utohexstr(Relocation.Offset) - << " with size " - << Relocation::getSizeForType(Relocation.Type) << '\n'); - auto RelocationSize = Relocation.emit(Streamer); - SectionOffset += RelocationSize; - } - assert(SectionOffset <= SectionContents.size() && "overflow error"); - if (SectionOffset < SectionContents.size()) { - Streamer->EmitBytes(SectionContents.substr(SectionOffset)); } + + if (BC->HasRelocations && opts::HotData && Section.isReordered()) + 
Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("__hot_data_end")); } void RewriteInstance::emitDataSections(MCStreamer *Streamer) { @@ -3145,7 +3286,9 @@ void RewriteInstance::emitDataSections(MCStreamer *Streamer) { StringRef SectionName = Section.getName(); assert(SectionName != ".eh_frame" && "should not emit .eh_frame as data"); - auto EmitName = OrgSecPrefix + std::string(SectionName); + std::string EmitName = Section.isReordered() + ? std::string(Section.getOutputName()) + : OrgSecPrefix + std::string(SectionName); emitDataSection(Streamer, Section, EmitName); } } @@ -3473,9 +3616,11 @@ void RewriteInstance::addBoltInfoSection() { // optional because for reference updating in the symbol table we only need the // map of input to output indices, not the real output section list. template -std::vector -RewriteInstance::getOutputSections(ELFObjectFile *File, - std::vector *OutputSections) { +std::vector RewriteInstance::getOutputSections( + ELFObjectFile *File, + std::vector *OutputSections, + std::map *SectionNameMap +) { auto *Obj = File->getELFFile(); auto Sections = cantFail(Obj->sections()); @@ -3507,16 +3652,20 @@ RewriteInstance::getOutputSections(ELFObjectFile *File, if (!(Section.sh_flags & ELF::SHF_ALLOC)) continue; - NewSectionIndex[std::distance(Sections.begin(), &Section)] = - CurIndex++; + StringRef SectionName = + cantFail(Obj->getSectionName(&Section), "cannot get section name"); + + if (SectionNameMap && !SectionNameMap->count(SectionName)) { + (*SectionNameMap)[SectionName] = CurIndex; + } + const auto OldIdx = std::distance(Sections.begin(), &Section); + assert(NewSectionIndex[OldIdx] == 0); + NewSectionIndex[OldIdx] = CurIndex++; // If only computing the map, we're done with this iteration if (!OutputSections) continue; - StringRef SectionName = - cantFail(Obj->getSectionName(&Section), "cannot get section name"); - auto NewSection = Section; if (SectionName == ".bss") { // .bss section offset matches that of the next section. 
@@ -3539,38 +3688,65 @@ RewriteInstance::getOutputSections(ELFObjectFile *File, NewTextSectionIndex = CurIndex; } - // Process entries for all new allocatable sections. + // Process entries for all new allocatable sections. Make sure + // allocatable sections follow the same order as in mapDataSections so + // that the section indices are consistent. + std::vector AllocatableSections; + std::vector SectionNames = { ".eh_frame", + ".gcc_except_table", + ".rodata", + ".rodata.cold" }; + for (const auto &SectionName : SectionNames) { + auto Section = BC->getUniqueSectionByName(SectionName); + if (Section && Section->isFinalized()) { + AllocatableSections.push_back(&*Section); + } + } for (auto &Section : BC->allocatableSections()) { if (!Section.isFinalized()) continue; + if (std::find_if(AllocatableSections.begin(), + AllocatableSections.end(), + [&Section](const BinarySection *BSec) { + return BSec == &Section; + }) == AllocatableSections.end()) { + AllocatableSections.push_back(&Section); + } + } + + for (const auto *Section : AllocatableSections) { // Ignore function sections. 
- if (Section.getFileAddress() < NewTextSegmentAddress) { + if (Section->getFileAddress() < NewTextSegmentAddress) { if (opts::Verbosity) outs() << "BOLT-INFO: not writing section header for existing section " - << Section.getName() << '\n'; + << Section->getName() << '\n'; continue; } + if (SectionNameMap) { + (*SectionNameMap)[Section->getName()] = CurIndex; + } ++CurIndex; + // If only computing the map, we're done with this iteration if (!OutputSections) continue; if (opts::Verbosity >= 1) outs() << "BOLT-INFO: writing section header for " - << Section.getName() << '\n'; + << Section->getName() << '\n'; ELFShdrTy NewSection; - NewSection.sh_name = SHStrTab.getOffset(Section.getName()); + NewSection.sh_name = SHStrTab.getOffset(Section->getName()); NewSection.sh_type = ELF::SHT_PROGBITS; - NewSection.sh_addr = Section.getFileAddress(); - NewSection.sh_offset = Section.getFileOffset(); - NewSection.sh_size = Section.getOutputSize(); + NewSection.sh_addr = Section->getFileAddress(); + NewSection.sh_offset = Section->getFileOffset(); + NewSection.sh_size = Section->getOutputSize(); NewSection.sh_entsize = 0; - NewSection.sh_flags = Section.getELFFlags(); + NewSection.sh_flags = Section->getELFFlags(); NewSection.sh_link = 0; NewSection.sh_info = 0; - NewSection.sh_addralign = Section.getAlignment(); + NewSection.sh_addralign = Section->getAlignment(); OutputSections->emplace_back(NewSection); } @@ -3587,16 +3763,20 @@ RewriteInstance::getOutputSections(ELFObjectFile *File, if (Section.sh_type == ELF::SHT_RELA) continue; - NewSectionIndex[std::distance(Sections.begin(), &Section)] = - CurIndex++; + StringRef SectionName = + cantFail(Obj->getSectionName(&Section), "cannot get section name"); + + if (SectionNameMap && !SectionNameMap->count(SectionName)) { + (*SectionNameMap)[SectionName] = CurIndex; + } + const auto OldIdx = std::distance(Sections.begin(), &Section); + assert(NewSectionIndex[OldIdx] == 0); + NewSectionIndex[OldIdx] = CurIndex++; // If only computing 
the map, we're done with this iteration if (!OutputSections) continue; - StringRef SectionName = - cantFail(Obj->getSectionName(&Section), "cannot get section name"); - auto BSec = BC->getUniqueSectionByName(SectionName); assert(BSec && "missing section info for non-allocatable section"); @@ -3724,15 +3904,25 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { // Compute a preview of how section indices will change after rewriting, so // we can properly update the symbol table. + std::map SectionNameMap; auto NewSectionIndex = - getOutputSections(File, (std::vector *)nullptr); + getOutputSections(File, (std::vector *)nullptr, &SectionNameMap); - auto updateSymbolTable = [&](bool PatchExisting, const Elf_Shdr *Section, - std::function - Write, - std::function AddToStrTab) { + DEBUG(dbgs() << "BOLT-DEBUG: SectionNameMap:\n"; + for (auto &Entry : SectionNameMap) { + dbgs() << "BOLT-DEBUG: " << Entry.first << " -> " + << Entry.second << "\n"; + }); + + auto updateSymbolTable = + [&](bool PatchExisting, + const Elf_Shdr *Section, + std::function + Write, + std::function AddToStrTab) { auto StringSection = cantFail(Obj->getStringTableForSymtab(*Section)); unsigned IsHotTextUpdated = 0; + unsigned IsHotDataUpdated = 0; std::map IslandSizes; auto getConstantIslandSize = [&IslandSizes](const BinaryFunction *BF) { @@ -3806,9 +3996,34 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { sizeof(CodeMarkSym)); } } else { - if (NewSymbol.st_shndx < ELF::SHN_LORESERVE) { - NewSymbol.st_shndx = NewSectionIndex[NewSymbol.st_shndx]; + uint32_t OldSectionIndex = NewSymbol.st_shndx; + auto *BD = !Function ? 
BC->getBinaryDataAtAddress(NewSymbol.st_value) + : nullptr; + if (BD && BD->isMoved() && !BD->isJumpTable()) { + assert((!BD->getSize() || + !NewSymbol.st_size || + NewSymbol.st_size == BD->getSize()) && + "sizes must match"); + + auto &OutputSection = BD->getOutputSection(); + + assert(SectionNameMap.count(OutputSection.getName())); + DEBUG(dbgs() << "BOLT-DEBUG: moving " << BD->getName() << " from " + << *BC->getSectionNameForAddress(NewSymbol.st_value) + << " (" << NewSymbol.st_shndx << ") to " + << OutputSection.getName() << " (" + << SectionNameMap[OutputSection.getName()] << ")\n"); + OldSectionIndex = ELF::SHN_LORESERVE; + NewSymbol.st_shndx = SectionNameMap[OutputSection.getName()]; + + // TODO: use getNewValueForSymbol()? + NewSymbol.st_value = BD->getOutputAddress(); + } + + if (OldSectionIndex < ELF::SHN_LORESERVE) { + NewSymbol.st_shndx = NewSectionIndex[OldSectionIndex]; } + // Detect local syms in the text section that we didn't update // and were preserved by the linker to support relocations against // .text (t15274167). Remove then from the symtab. 
@@ -3834,20 +4049,31 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { } } - if (opts::HotText) { - auto updateSymbolValue = [&](const StringRef Name) { - NewSymbol.st_value = getNewValueForSymbol(Name); - NewSymbol.st_shndx = ELF::SHN_ABS; - outs() << "BOLT-INFO: setting " << Name << " to 0x" - << Twine::utohexstr(NewSymbol.st_value) << '\n'; - ++IsHotTextUpdated; - return true; - }; - - auto SymbolName = - cantFail(Symbol.getName(StringSection), "cannot get symbol name"); - if (SymbolName == "__hot_start" || SymbolName == "__hot_end") - updateSymbolValue(SymbolName); + auto SymbolName = Symbol.getName(StringSection); + assert(SymbolName && "cannot get symbol name"); + + auto updateSymbolValue = [&](const StringRef Name, unsigned &IsUpdated) { + NewSymbol.st_value = getNewValueForSymbol(Name); + NewSymbol.st_shndx = ELF::SHN_ABS; + outs() << "BOLT-INFO: setting " << Name << " to 0x" + << Twine::utohexstr(NewSymbol.st_value) << '\n'; + ++IsUpdated; + return true; + }; + + if (opts::HotText && (*SymbolName == "__hot_start" || + *SymbolName == "__hot_end")) + updateSymbolValue(*SymbolName, IsHotTextUpdated); + + if (opts::HotData && (*SymbolName == "__hot_data_start" || + *SymbolName == "__hot_data_end")) + updateSymbolValue(*SymbolName, IsHotDataUpdated); + + if (opts::UpdateEnd && *SymbolName == "_end") { + NewSymbol.st_value = getNewValueForSymbol(*SymbolName); + NewSymbol.st_shndx = ELF::SHN_ABS; + outs() << "BOLT-INFO: setting " << *SymbolName << " to 0x" + << Twine::utohexstr(NewSymbol.st_value) << '\n'; } Write((&Symbol - cantFail(Obj->symbols(Section)).begin()) * @@ -3857,24 +4083,33 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { assert((!IsHotTextUpdated || IsHotTextUpdated == 2) && "either none or both __hot_start/__hot_end symbols were expected"); - if (opts::HotText && !IsHotTextUpdated && !PatchExisting) { - auto addSymbol = [&](const std::string &Name) { - Elf_Sym Symbol; - Symbol.st_value = getNewValueForSymbol(Name); - 
Symbol.st_shndx = ELF::SHN_ABS; - Symbol.st_name = AddToStrTab(Name); - Symbol.st_size = 0; - Symbol.st_other = 0; - Symbol.setBindingAndType(ELF::STB_WEAK, ELF::STT_NOTYPE); - - outs() << "BOLT-INFO: setting " << Name << " to 0x" - << Twine::utohexstr(Symbol.st_value) << '\n'; + assert((!IsHotDataUpdated || IsHotDataUpdated == 2) && + "either none or both __hot_data_start/__hot_data_end symbols were expected"); + + auto addSymbol = [&](const std::string &Name) { + Elf_Sym Symbol; + Symbol.st_value = getNewValueForSymbol(Name); + Symbol.st_shndx = ELF::SHN_ABS; + Symbol.st_name = AddToStrTab(Name); + Symbol.st_size = 0; + Symbol.st_other = 0; + Symbol.setBindingAndType(ELF::STB_WEAK, ELF::STT_NOTYPE); + + outs() << "BOLT-INFO: setting " << Name << " to 0x" + << Twine::utohexstr(Symbol.st_value) << '\n'; + + Write(0, reinterpret_cast(&Symbol), sizeof(Symbol)); + }; - Write(0, reinterpret_cast(&Symbol), sizeof(Symbol)); - }; + if (opts::HotText && !IsHotTextUpdated && !PatchExisting) { addSymbol("__hot_start"); addSymbol("__hot_end"); } + + if (opts::HotData && !IsHotDataUpdated && !PatchExisting) { + addSymbol("__hot_data_start"); + addSymbol("__hot_data_end"); + } }; // Update dynamic symbol table. 
@@ -3886,7 +4121,8 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { } } assert(DynSymSection && "no dynamic symbol table found"); - updateSymbolTable(/*patch existing table?*/ true, DynSymSection, + updateSymbolTable(/*patch existing table?*/ true, + DynSymSection, [&](size_t Offset, const char *Buf, size_t Size) { Out->os().pwrite(Buf, Size, DynSymSection->sh_offset + Offset); @@ -3914,10 +4150,12 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { auto SecName = cantFail(Obj->getSectionName(SymTabSection)); auto StrSecName = cantFail(Obj->getSectionName(StrTabSection)); - updateSymbolTable(/*patch existing table?*/false, SymTabSection, + updateSymbolTable(/*patch existing table?*/ false, + SymTabSection, [&](size_t Offset, const char *Buf, size_t Size) { NewContents.append(Buf, Size); - }, [&](StringRef Str) { + }, + [&](StringRef Str) { size_t Idx = NewStrTab.size(); NewStrTab.append(Str.data(), Str.size()); NewStrTab.append(1, '\0'); @@ -4152,7 +4390,7 @@ void RewriteInstance::rewriteFile() { if (opts::JumpTables == JTS_BASIC) { for (auto &JTI : Function.JumpTables) { auto *JT = JTI.second; - auto &Section = JT->getSection(); + auto &Section = JT->getOutputSection(); Section.setFileOffset(getFileOffsetForAddress(JT->getAddress())); assert(Section.getFileOffset() && "no matching offset in file"); OS.pwrite(reinterpret_cast(Section.getOutputData()), @@ -4254,7 +4492,6 @@ void RewriteInstance::rewriteFile() { if (BC->HasRelocations) { patchELFRelaPLT(); - patchELFGOT(); } diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index 7dd4a5301d2e..edebf647622c 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -183,6 +183,8 @@ class RewriteInstance { std::vector FunctionStack); /// Map all sections to their final addresses. 
+ void mapTextSections(orc::VModuleKey ObjectsHandle); + void mapDataSections(orc::VModuleKey ObjectsHandle); void mapFileSections(orc::VModuleKey ObjectsHandle); /// Update output object's values based on the final \p Layout. @@ -316,12 +318,14 @@ class RewriteInstance { ELF_FUNCTION(finalizeSectionStringTable); /// Get a list of all the sections to include in the output binary along - /// with a map of input to output indices. + /// with a map of input to output indices. Optionally produce a mapping + /// of section name to new section index in /p OutputSectionNameMap. template ::Elf_Shdr> - std::vector - getOutputSections(ELFObjectFile *File, - std::vector *OutputSections); + std::vector getOutputSections( + ELFObjectFile *File, + std::vector *OutputSections = nullptr, + std::map *OutputSectionNameMap = nullptr); /// Add a notes section containing the BOLT revision and command line options. void addBoltInfoSection(); @@ -389,6 +393,11 @@ class RewriteInstance { }; private: + /// Get the contents of the LSDA section for this binary. + ArrayRef getLSDAData(); + + /// Get the mapped address of the LSDA section for this binary. + uint64_t getLSDAAddress(); static const char TimerGroupName[]; @@ -471,8 +480,7 @@ class RewriteInstance { uint64_t NewTextSectionIndex{0}; /// Exception handling and stack unwinding information in this binary. - ArrayRef LSDAData; - uint64_t LSDAAddress{0}; + ErrorOr LSDASection{std::errc::bad_address}; const llvm::DWARFDebugFrame *EHFrame{nullptr}; ErrorOr EHFrameSection{std::errc::bad_address}; @@ -504,9 +512,6 @@ class RewriteInstance { /// rewriting CFI info for these functions. std::vector FailedAddresses; - /// Size of the .debug_loc section in input. - uint32_t DebugLocSize{0}; - /// Keep track of which functions didn't fit in their original space in the /// last emission, so that we may either decide to split or not optimize them. 
std::set LargeFunctions; diff --git a/bolt/src/Target/X86/X86MCPlusBuilder.cpp b/bolt/src/Target/X86/X86MCPlusBuilder.cpp index ec43191105d9..ca8eb62523c7 100644 --- a/bolt/src/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/src/Target/X86/X86MCPlusBuilder.cpp @@ -2857,8 +2857,8 @@ class X86MCPlusBuilder : public MCPlusBuilder { ICPdata indirectCallPromotion( const MCInst &CallInst, - const std::vector> &Targets, - const std::vector &VtableAddrs, + const std::vector> &Targets, + const std::vector> &VtableSyms, const std::vector &MethodFetchInsns, const bool MinimizeCodeSize, MCContext *Ctx @@ -2876,8 +2876,8 @@ class X86MCPlusBuilder : public MCPlusBuilder { unsigned FuncAddrReg = X86::R10; - const bool LoadElim = !VtableAddrs.empty(); - assert((!LoadElim || VtableAddrs.size() == Targets.size()) && + const bool LoadElim = !VtableSyms.empty(); + assert((!LoadElim || VtableSyms.size() == Targets.size()) && "There must be a vtable entry for every method " "in the targets vector."); @@ -2978,14 +2978,21 @@ class X86MCPlusBuilder : public MCPlusBuilder { } // Target address. - if (Targets[i].first && !LoadElim) { - Compare.addOperand( - MCOperand::createExpr( - MCSymbolRefExpr::create(Targets[i].first, - MCSymbolRefExpr::VK_None, - *Ctx))); + if (Targets[i].first || LoadElim) { + const auto *Sym = LoadElim ? VtableSyms[i].first : Targets[i].first; + const auto Addend = LoadElim ? VtableSyms[i].second : 0; + + const MCExpr *Expr = MCSymbolRefExpr::create(Sym, *Ctx); + + if (Addend) { + Expr = MCBinaryExpr::createAdd(Expr, + MCConstantExpr::create(Addend, *Ctx), + *Ctx); + } + + Compare.addOperand(MCOperand::createExpr(Expr)); } else { - const auto Addr = LoadElim ? VtableAddrs[i] : Targets[i].second; + const auto Addr = Targets[i].second; // Immediate address is out of sign extended 32 bit range. 
if (int64_t(Addr) != int64_t(int32_t(Addr))) { return ICPdata(); From 3f2f4d1f41a8e94d1d619f676619ea0899b59611 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Fri, 11 May 2018 12:03:19 -0700 Subject: [PATCH 418/904] adjusting cache stats for non-simple functions Summary: While working with a binary in non-relocations mode, I realized some cache metrics are not computed correctly. Hence, this fix. In addition, logging the number of functions with modified ordering of basic blocks, which is helpful for analysis. (cherry picked from commit 547c318883e326d91dcb2bc5b1470d6737e692ab) --- bolt/src/BinaryFunction.cpp | 2 +- bolt/src/BinaryFunction.h | 8 ++++++-- bolt/src/CacheMetrics.cpp | 16 +++++++++++++--- bolt/src/Passes/BinaryPasses.cpp | 6 +++++- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 1b86764663a7..cd383b9a9a0d 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -2188,7 +2188,7 @@ uint64_t BinaryFunction::getInstructionCount() const { } bool BinaryFunction::hasLayoutChanged() const { - return BasicBlocksPreviousLayout != BasicBlocksLayout; + return ModifiedLayout; } uint64_t BinaryFunction::getEditDistance() const { diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index 14e9f9d372d7..d7750f4315aa 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -653,6 +653,7 @@ class BinaryFunction { BasicBlockOrderType BasicBlocksLayout; /// Previous layout replaced by modifyLayout BasicBlockOrderType BasicBlocksPreviousLayout; + bool ModifiedLayout{false}; /// BasicBlockOffsets are used during CFG construction to map from code /// offsets to BinaryBasicBlocks. 
Any modifications made to the CFG @@ -852,8 +853,11 @@ class BinaryFunction { if (SavePrevLayout) BasicBlocksPreviousLayout = BasicBlocksLayout; - BasicBlocksLayout.clear(); - BasicBlocksLayout.swap(NewLayout); + if (NewLayout != BasicBlocksLayout) { + ModifiedLayout = true; + BasicBlocksLayout.clear(); + BasicBlocksLayout.swap(NewLayout); + } } /// Return a list of basic blocks sorted using DFS and update layout indices diff --git a/bolt/src/CacheMetrics.cpp b/bolt/src/CacheMetrics.cpp index c1fde8a28524..fd6fe8cb9587 100644 --- a/bolt/src/CacheMetrics.cpp +++ b/bolt/src/CacheMetrics.cpp @@ -76,11 +76,19 @@ void extractBasicBlockInfo( std::unordered_map &BBAddr, std::unordered_map &BBSize) { - // Use addresses/sizes as in the output binary for (auto BF : BinaryFunctions) { + const auto &BC = BF->getBinaryContext(); for (auto BB : BF->layout()) { - BBAddr[BB] = BB->getOutputAddressRange().first; - BBSize[BB] = BB->getOutputSize(); + if (BF->isSimple() || BC.HasRelocations) { + // Use addresses/sizes as in the output binary + BBAddr[BB] = BB->getOutputAddressRange().first; + BBSize[BB] = BB->getOutputSize(); + } else { + // Output ranges should match the input if the body hasn't changed + BBAddr[BB] = BB->getInputAddressRange().first + BF->getAddress(); + BBSize[BB] = BB->getOriginalSize(); + } + assert(BBAddr[BB] > 0 && "incorrect output block address"); } } } @@ -94,6 +102,8 @@ double calcTSPScore( double Score = 0; for (auto BF : BinaryFunctions) { + if (!BF->hasProfile()) + continue; for (auto SrcBB : BF->layout()) { auto BI = SrcBB->branch_info_begin(); for (auto DstBB : SrcBB->successors()) { diff --git a/bolt/src/Passes/BinaryPasses.cpp b/bolt/src/Passes/BinaryPasses.cpp index a9d48ca06f06..85453af7b574 100644 --- a/bolt/src/Passes/BinaryPasses.cpp +++ b/bolt/src/Passes/BinaryPasses.cpp @@ -395,11 +395,15 @@ void ReorderBasicBlocks::runOnFunctions( modifyFunctionLayout(Function, opts::ReorderBlocks, opts::MinBranchClusters, ShouldSplit); - if 
(opts::PrintFuncStat > 0 && Function.hasLayoutChanged()) { + if (Function.hasLayoutChanged()) { ++ModifiedFuncCount; } } + outs() << "BOLT-INFO: basic block reordering modified layout of " + << format("%zu (%.2lf%%) functions\n", + ModifiedFuncCount, 100.0 * ModifiedFuncCount / BFs.size()); + if (opts::PrintFuncStat > 0) { raw_ostream &OS = outs(); // Copy all the values into vector in order to sort them From b4b4f4784e42017812cc14922639bb060dccec80 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 11 May 2018 15:30:56 -0700 Subject: [PATCH 419/904] [BOLT] Fix dyno-stats for PLT calls Summary: To accurately account for PLT optimization, each PLT call should be counted as an extra indirect call instruction, which in turn is a load, a call, an indirect call, and instruction entry in dyno stats. (cherry picked from commit 42e8adf70d961b572a6bfe2ae189a00582b4dd60) --- bolt/src/BinaryFunction.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index cd383b9a9a0d..8a25e179a9ea 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -3896,8 +3896,10 @@ DynoStats BinaryFunction::getDynoStats() const { if (BC.MIB->isLoad(Instr)) { Stats[DynoStats::LOADS] += BBExecutionCount; } + if (!BC.MIB->isCall(Instr)) continue; + uint64_t CallFreq = BBExecutionCount; if (BC.MIB->getConditionalTailCall(Instr)) { CallFreq = @@ -3911,10 +3913,16 @@ DynoStats BinaryFunction::getDynoStats() const { if (BF && BF->isPLTFunction()) { Stats[DynoStats::PLT_CALLS] += CallFreq; - // We don't process PLT functions and hence have to adjust - // relevant dynostats here. - Stats[DynoStats::LOADS] += CallFreq; + // We don't process PLT functions and hence have to adjust relevant + // dynostats here for: + // + // jmp *GOT_ENTRY(%rip) + // + // NOTE: this is arch-specific. 
+ Stats[DynoStats::FUNCTION_CALLS] += CallFreq; Stats[DynoStats::INDIRECT_CALLS] += CallFreq; + Stats[DynoStats::LOADS] += CallFreq; + Stats[DynoStats::INSTRUCTIONS] += CallFreq; } } } From 11a9c202baffc128745e955db49e3b18fbea778a Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 11 May 2018 18:30:47 -0700 Subject: [PATCH 420/904] [BOLT] Add option to ignore function hash in profile Summary: When we make changes to MCInst opcodes (or get changes from upstream), a hash value for BinaryFunction changes. As a result, we are unable to match profile generated by a previous version of BOLT. Add option `-profile-ignore-hash` to match profile while ignoring function hash value. With this option we match functions with common names using the number of basic blocks. (cherry picked from commit 3e9a374bde6a6bdaef8b6d80f64bbd16e0489ea4) --- bolt/src/ProfileReader.cpp | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/bolt/src/ProfileReader.cpp b/bolt/src/ProfileReader.cpp index 5e2ad7986b91..9851814fcbfd 100644 --- a/bolt/src/ProfileReader.cpp +++ b/bolt/src/ProfileReader.cpp @@ -16,8 +16,21 @@ #include "ProfileYAMLMapping.h" #include "llvm/Support/CommandLine.h" +using namespace llvm; + namespace opts { -extern llvm::cl::opt Verbosity; + +extern cl::opt Verbosity; +extern cl::OptionCategory BoltOptCategory; + +static llvm::cl::opt +IgnoreHash("profile-ignore-hash", + cl::desc("ignore hash while reading function profile"), + cl::init(false), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + } namespace llvm { @@ -59,7 +72,7 @@ ProfileReader::parseFunctionProfile(BinaryFunction &BF, BF.setExecutionCount(YamlBF.ExecCount); - if (YamlBF.Hash != BF.hash(true, true)) { + if (!opts::IgnoreHash && YamlBF.Hash != BF.hash(true, true)) { if (opts::Verbosity >= 1) errs() << "BOLT-WARNING: function hash mismatch\n"; ProfileMatched = false; @@ -251,21 +264,33 @@ ProfileReader::readProfile(const std::string 
&FileName, YamlProfileToFunction.resize(YamlBP.Functions.size() + 1); + auto profileMatches = [](const yaml::bolt::BinaryFunctionProfile &Profile, + BinaryFunction &BF) { + if (opts::IgnoreHash && Profile.NumBasicBlocks == BF.size()) + return true; + if (!opts::IgnoreHash && Profile.Hash == BF.hash(/*Recompute = */false)) + return true; + return false; + }; + // We have to do 2 passes since LTO introduces an ambiguity in function // names. The first pass assigns profiles that match 100% by name and // by hash. The second pass allows name ambiguity for LTO private functions. for (auto &BFI : Functions) { auto &Function = BFI.second; - auto Hash = Function.hash(true, true); + + // Recompute hash once per function. + if (!opts::IgnoreHash) + Function.hash(/*Recompute = */true, true); + for (auto &FunctionName : Function.getNames()) { auto PI = ProfileNameToProfile.find(FunctionName); if (PI == ProfileNameToProfile.end()) { continue; } auto &YamlBF = *PI->getValue(); - if (YamlBF.Hash == Hash) { + if (profileMatches(YamlBF, Function)) matchProfileToFunction(YamlBF, Function); - } } } @@ -275,7 +300,6 @@ ProfileReader::readProfile(const std::string &FileName, if (ProfiledFunctions.count(&Function)) continue; - auto Hash = Function.hash(/*Recompute = */false); // was just recomputed for (auto &FunctionName : Function.getNames()) { const auto CommonName = getLTOCommonName(FunctionName); if (CommonName) { @@ -288,7 +312,7 @@ ProfileReader::readProfile(const std::string &FileName, for (auto *YamlBF : LTOProfiles) { if (YamlBF->Used) continue; - if (YamlBF->Hash == Hash) { + if ((ProfileMatched = profileMatches(*YamlBF, Function))) { matchProfileToFunction(*YamlBF, Function); break; } From da3326f0942c541faaaaf38511257c2c4263abe5 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 14 May 2018 11:10:26 -0700 Subject: [PATCH 421/904] [BOLT] Properly handle non-standard function refs Summary: Application code can reference functions in a non-standard way, e.g. 
using arithmetic and bitmask operations on them. One example is if a program checks if a function is below a certain address or within a certain address range to perform a low-level optimization or generate a proper code (JIT). Instead of relying on a relocation value (symbol+addend), we use only the symbol value, and then check if the value is inside the function. If it is, we treat it as a code reference against location within the function, otherwise we handle it as a non-standard function reference and issue a warning. (cherry picked from commit 17d20fba1061a37bf920a6f95b819e46f110cb5f) --- bolt/src/BinaryContext.h | 5 +++ bolt/src/BinaryFunction.cpp | 8 +++- bolt/src/RewriteInstance.cpp | 77 +++++++++++++++++++++++++----------- 3 files changed, 64 insertions(+), 26 deletions(-) diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index f7a41fd71e02..34dfea6d22bd 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -57,6 +57,11 @@ namespace bolt { class BinaryFunction; class DataReader; +/// Helper function to truncate a \p Value to given size in \p Bytes. +inline int64_t truncateToSize(int64_t Value, unsigned Bytes) { + return Value & ((uint64_t) (int64_t) -1 >> (64 - Bytes * 8)); +} + /// Filter iterator. template > diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 8a25e179a9ea..2f45be190f4b 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -1156,9 +1156,13 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { // Make sure we replaced the correct immediate (instruction // can have multiple immediate operands). 
- assert((BC.isAArch64() || - static_cast(Value) == Relocation.Value) && + if (BC.isX86()) { + assert(truncateToSize(static_cast(Value), + Relocation::getSizeForType(Relocation.Type)) == + truncateToSize(Relocation.Value, + Relocation::getSizeForType(Relocation.Type)) && "immediate value mismatch in function"); + } } // Convert instruction to a shorter version that could be relaxed if diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index bdf5443577f9..a9a466f4040c 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -482,10 +482,6 @@ MCPlusBuilder *createMCPlusBuilder(const Triple::ArchType Arch, } } -int64_t truncateToSize(int64_t Value, unsigned Bytes) { - return Value & ((uint64_t) (int64_t) -1 >> (64 - Bytes * 8)); -} - } constexpr const char *RewriteInstance::SectionsToOverwrite[]; @@ -1889,7 +1885,7 @@ bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, Rel.getOffset()); } - // Weird stuff - section symbols are marked as ST_Debug. + // Section symbols are marked as ST_Debug. const bool SymbolIsSection = (cantFail(Symbol.getType()) == SymbolRef::ST_Debug); const auto PCRelOffset = IsPCRelative && !IsAArch64 ? Rel.getOffset() : 0; @@ -2144,6 +2140,31 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { // typically as a result of __builtin_unreachable(). Check it here. auto *ReferencedBF = getBinaryFunctionContainingAddress( Address, /*CheckPastEnd*/ true, /*UseMaxSize*/ IsAArch64); + + if (!IsSectionRelocation) { + if (auto *BF = getBinaryFunctionContainingAddress(SymbolAddress)) { + if (BF != ReferencedBF) { + // It's possible we are referencing a function without referencing any + // code, e.g. when taking a bitmask action on a function address. + errs() << "BOLT-WARNING: non-standard function reference (e.g. 
" + "bitmask) detected against function " << *BF; + if (IsFromCode) { + errs() << " from function " << *ContainingBF << '\n'; + } else { + errs() << " from data section at 0x" + << Twine::utohexstr(Rel.getOffset()) << '\n'; + } + DEBUG(printRelocationInfo(Rel, + SymbolName, + SymbolAddress, + Addend, + ExtractedValue) + ); + ReferencedBF = BF; + } + } + } + uint64_t RefFunctionOffset = 0; MCSymbol *ReferencedSymbol = nullptr; if (ForceRelocation) { @@ -2155,20 +2176,25 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { " symbol " << SymbolName << " with addend " << Addend << '\n'); } else if (ReferencedBF) { - RefFunctionOffset = Address - ReferencedBF->getAddress(); - DEBUG(dbgs() << " referenced function " << *ReferencedBF; - if (Address != ReferencedBF->getAddress()) { - dbgs() << " at offset 0x" << Twine::utohexstr(RefFunctionOffset); - } - dbgs() << '\n'); - if (RefFunctionOffset) { - ReferencedSymbol = - ReferencedBF->getOrCreateLocalLabel(Address, /*CreatePastEnd*/ true); - } else { - ReferencedSymbol = ReferencedBF->getSymbol(); + ReferencedSymbol = ReferencedBF->getSymbol(); + + // Adjust the point of reference to a code location inside a function. + if (ReferencedBF->containsAddress(Address, /*UseMaxSize = */true)) { + RefFunctionOffset = Address - ReferencedBF->getAddress(); + if (RefFunctionOffset) { + ReferencedSymbol = + ReferencedBF->getOrCreateLocalLabel(Address, + /*CreatePastEnd =*/ true); + } + SymbolAddress = Address; + Addend = 0; } - SymbolAddress = Address; - Addend = 0; + DEBUG( + dbgs() << " referenced function " << *ReferencedBF; + if (Address != ReferencedBF->getAddress()) + dbgs() << " at offset 0x" << Twine::utohexstr(RefFunctionOffset); + dbgs() << '\n' + ); } else { if (RefSection && RefSection->isText() && SymbolAddress) { // This can happen e.g. with PIC-style jump tables. 
@@ -2252,7 +2278,8 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { (BD == BC->getBinaryDataContainingAddress(SymbolAddress) || !BC->getBinaryDataContainingAddress(SymbolAddress) || (IsSectionRelocation && BD->getEndAddress() == - BC->getBinaryDataContainingAddress(SymbolAddress)->getAddress()))); + BC->getBinaryDataContainingAddress(SymbolAddress)-> + getAddress()))); // Note: this assertion is trying to check sanity of BinaryData objects // but AArch64 has inferred and incomplete object locations coming from @@ -2292,7 +2319,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { const uint64_t SymbolAlignment = IsAArch64 ? 1 : Symbol.getAlignment(); const unsigned SymbolFlags = Symbol.getFlags(); - if (cantFail(Symbol.getType()) != SymbolRef::ST_Debug) { + if (!IsSectionRelocation) { std::string Name; if (Symbol.getFlags() & SymbolRef::SF_Global) { Name = SymbolName; @@ -2353,7 +2380,8 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { << ReferencedSymbol->getName() << "\n"); } } else if (IsToCode) { - BC->addRelocation(Rel.getOffset(), ReferencedSymbol, Rel.getType(), Addend); + BC->addRelocation(Rel.getOffset(), ReferencedSymbol, Rel.getType(), + Addend); } else if (refersToReorderedSection(RefSection) || (opts::ForceToDataRelocations && checkMaxDataRelocations())) { BC->addRelocation(Rel.getOffset(), @@ -2997,7 +3025,8 @@ void RewriteInstance::mapTextSections(orc::VModuleKey Key) { continue; auto TooLarge = false; - auto FuncSection = BC->getUniqueSectionByName(Function.getCodeSectionName()); + auto FuncSection = + BC->getUniqueSectionByName(Function.getCodeSectionName()); assert(FuncSection && "cannot find section for function"); DEBUG(dbgs() << "BOLT: mapping 0x" << Twine::utohexstr(FuncSection->getAllocAddress()) @@ -3257,8 +3286,8 @@ void RewriteInstance::emitDataSection(MCStreamer *Streamer, assert(Relocation.Offset < SectionContents.size() && "overflow detected"); if (SectionOffset < 
Relocation.Offset) { Streamer->EmitBytes( - SectionContents.substr(SectionOffset, - Relocation.Offset - SectionOffset)); + SectionContents.substr(SectionOffset, + Relocation.Offset - SectionOffset)); SectionOffset = Relocation.Offset; } DEBUG(dbgs() << "BOLT-DEBUG: emitting relocation for symbol " From 13fe1c2ca6220722aa98ce80f7d683d8c93b34b1 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 17 May 2018 16:58:29 -0700 Subject: [PATCH 422/904] [BOLT] Add option to print functions with bad layout Summary: Option `-report-bad-layout=N` prints top N functions with layouts that have cold blocks placed in the middle of hot blocks. The sorting is based on execution_count / number_of_basic_blocks formula. (cherry picked from commit ddcc248924511c845ede614c2d717627d1369cf5) --- bolt/src/BinaryFunction.h | 12 ++++++++ bolt/src/Passes/BinaryPasses.cpp | 48 ++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index d7750f4315aa..48766103143e 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -834,6 +834,18 @@ class BinaryFunction { BasicBlocksLayout.end()); } + inline iterator_range rlayout() { + return + iterator_range(BasicBlocksLayout.rbegin(), + BasicBlocksLayout.rend()); + } + + inline iterator_range rlayout() const { + return + iterator_range(BasicBlocksLayout.rbegin(), + BasicBlocksLayout.rend()); + } + cfi_iterator cie_begin() { return CIEFrameInstructions.begin(); } const_cfi_iterator cie_begin() const { return CIEFrameInstructions.begin(); } cfi_iterator cie_end() { return CIEFrameInstructions.end(); } diff --git a/bolt/src/Passes/BinaryPasses.cpp b/bolt/src/Passes/BinaryPasses.cpp index 85453af7b574..cdfe768dd8af 100644 --- a/bolt/src/Passes/BinaryPasses.cpp +++ b/bolt/src/Passes/BinaryPasses.cpp @@ -171,6 +171,14 @@ ReorderBlocks("reorder-blocks", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +static cl::opt +ReportBadLayout("report-bad-layout", + 
cl::desc("print top functions with suboptimal code layout on input"), + cl::init(0), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + static cl::opt ReportStaleFuncs("report-stale", cl::desc("print the list of functions with stale profile"), @@ -1599,6 +1607,46 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC, break; } } + + // Collect and print information about suboptimal code layout on input. + if (opts::ReportBadLayout) { + std::vector SuboptimalFuncs; + for (auto &BFI : BFs) { + const auto &BF = BFI.second; + if (!BF.hasValidProfile()) + continue; + + const auto HotThreshold = std::max(BF.getKnownExecutionCount(), 1UL); + bool HotSeen = false; + for (const auto *BB : BF.rlayout()) { + if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) { + HotSeen = true; + continue; + } + if (HotSeen && BB->getKnownExecutionCount() == 0) { + SuboptimalFuncs.push_back(&BF); + break; + } + } + } + + if (!SuboptimalFuncs.empty()) { + std::sort(SuboptimalFuncs.begin(), SuboptimalFuncs.end(), + [](const BinaryFunction *A, const BinaryFunction *B) { + return A->getKnownExecutionCount() / A->getSize() > + B->getKnownExecutionCount() / B->getSize(); + }); + + outs() << "BOLT-INFO: " << SuboptimalFuncs.size() << " functions have " + "cold code in the middle of hot code. Top functions are:\n"; + for (unsigned I = 0; + I < std::min(static_cast(opts::ReportBadLayout), + SuboptimalFuncs.size()); + ++I) { + SuboptimalFuncs[I]->print(outs()); + } + } + } } void InstructionLowering::runOnFunctions( From 6622380870da2be7e1dde84f9d6cfc779b708ce2 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 16 May 2018 13:31:13 -0700 Subject: [PATCH 423/904] [PERF2BOLT] Improve file matching Summary: If the input binary for perf2bolt has a build-id and perf data has recorded build-ids, then try to match them. Adjust the file name if build-ids match to cover cases where the binary was renamed after data collection. 
If there's no matching build-id report an error and exit. While scanning task events, truncate the name to 15 characters prior to matching, since that's how names are reported by perf. (cherry picked from commit c28f085033cb46b951c28e3b3896f434bc7c5e9c) --- bolt/src/DataAggregator.cpp | 84 ++++++++++++++++++++++++------------ bolt/src/DataAggregator.h | 20 +++++---- bolt/src/RewriteInstance.cpp | 45 +++++-------------- bolt/src/RewriteInstance.h | 4 -- 4 files changed, 78 insertions(+), 75 deletions(-) diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index b63a1cf08091..655cf8050bbe 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -35,16 +35,22 @@ namespace opts { extern cl::OptionCategory AggregatorCategory; -static llvm::cl::opt -TimeAggregator("time-aggr", - cl::desc("time BOLT aggregator"), +static cl::opt +BasicAggregation("nl", + cl::desc("aggregate basic samples (without LBR info)"), cl::init(false), cl::ZeroOrMore, cl::cat(AggregatorCategory)); -static llvm::cl::opt -BasicAggregation("nl", - cl::desc("aggregate basic samples (without LBR info)"), +static cl::opt +IgnoreBuildID("ignore-build-id", + cl::desc("continue even if build-ids in input binary and perf.data mismatch"), + cl::init(false), + cl::cat(AggregatorCategory)); + +static cl::opt +TimeAggregator("time-aggr", + cl::desc("time BOLT aggregator"), cl::init(false), cl::ZeroOrMore, cl::cat(AggregatorCategory)); @@ -219,7 +225,7 @@ bool DataAggregator::launchPerfTasksNoWait() { return true; } -Optional DataAggregator::getPerfBuildID() { +void DataAggregator::processFileBuildID(StringRef FileBuildID) { SmallVector Argv; SmallVector OutputPath; SmallVector ErrPath; @@ -265,7 +271,7 @@ Optional DataAggregator::getPerfBuildID() { errs() << ErrBuf; deleteTempFile(ErrPath.data()); deleteTempFile(OutputPath.data()); - return NoneType(); + return; } ErrorOr> MB = @@ -275,26 +281,44 @@ Optional DataAggregator::getPerfBuildID() { << EC.message() << "\n"; 
deleteTempFile(ErrPath.data()); deleteTempFile(OutputPath.data()); - return NoneType(); + return; } FileBuf.reset(MB->release()); ParsingBuf = FileBuf->getBuffer(); - Col = 0; - Line = 1; - auto ParseResult = parsePerfBuildID(); - if (!ParseResult) { - outs() << "PERF2BOLT: Failed to parse build-id from perf output\n"; + if (ParsingBuf.empty()) { + errs() << "PERF2BOLT-WARNING: build-id will not be checked because perf " + "data was recorded without it\n"; deleteTempFile(ErrPath.data()); deleteTempFile(OutputPath.data()); - return NoneType(); + return; } - outs() << "PERF2BOLT: Perf.data build-id is: " << *ParseResult << "\n"; + Col = 0; + Line = 1; + auto FileName = getFileNameForBuildID(FileBuildID); + if (!FileName) { + errs() << "PERF2BOLT-ERROR: failed to match build-id from perf output. " + "This indicates the input binary supplied for data aggregation " + "is not the same recorded by perf when collecting profiling " + "data. Use -ignore-build-id option to override.\n"; + if (!opts::IgnoreBuildID) { + deleteTempFile(ErrPath.data()); + deleteTempFile(OutputPath.data()); + abort(); + exit(1); + } + } else if (*FileName != BinaryName) { + errs() << "PERF2BOLT-WARNING: build-id matched a different file name. 
" + "Using \"" << *FileName << "\" for profile parsing.\n"; + BinaryName = *FileName; + } else { + outs() << "PERF2BOLT: matched build-id and file name\n"; + } deleteTempFile(ErrPath.data()); deleteTempFile(OutputPath.data()); - return std::string(ParseResult->data(), ParseResult->size()); + return; } bool DataAggregator::checkPerfDataMagic(StringRef FileName) { @@ -967,7 +991,7 @@ ErrorOr DataAggregator::parseTaskPID() { auto CommNameStr = parseString(FieldSeparator, true); if (std::error_code EC = CommNameStr.getError()) return EC; - if (CommNameStr.get() != BinaryName) { + if (CommNameStr.get() != BinaryName.substr(0, 15)) { consumeRestOfLine(); return -1; } @@ -1013,12 +1037,19 @@ std::error_code DataAggregator::parseTasks() { PIDs.insert(PID); } - if (!PIDs.empty()) + if (!PIDs.empty()) { outs() << "PERF2BOLT: Input binary is associated with " << PIDs.size() << " PID(s)\n"; - else - outs() << "PERF2BOLT: Could not bind input binary to a PID - will parse " - "all samples in perf data.\n"; + } else { + if (errs().has_colors()) + errs().changeColor(raw_ostream::YELLOW); + errs() << "PERF2BOLT-WARNING: Could not bind input binary to a PID - will " + "parse all samples in perf data. 
This could result in corrupted " + "samples for the input binary if system-wide profile collection " + "was used.\n"; + if (errs().has_colors()) + errs().resetColor(); + } return std::error_code(); } @@ -1039,16 +1070,15 @@ DataAggregator::parseNameBuildIDPair() { return std::make_pair(NameStr.get(), BuildIDStr.get()); } -Optional DataAggregator::parsePerfBuildID() { +Optional +DataAggregator::getFileNameForBuildID(StringRef FileBuildID) { while (hasData()) { auto IDPair = parseNameBuildIDPair(); if (!IDPair) return NoneType(); - if (sys::path::filename(IDPair->first) != BinaryName) - continue; - - return IDPair->second; + if (IDPair->second == FileBuildID) + return sys::path::filename(IDPair->first); } return NoneType(); } diff --git a/bolt/src/DataAggregator.h b/bolt/src/DataAggregator.h index f3ecafce622c..becce32b91e1 100644 --- a/bolt/src/DataAggregator.h +++ b/bolt/src/DataAggregator.h @@ -85,7 +85,7 @@ class DataAggregator : public DataReader { StringRef OutputFDataName; /// Our sampled binary name to look for in perf.data - StringRef BinaryName; + std::string BinaryName; DenseSet PIDs; @@ -189,9 +189,9 @@ class DataAggregator : public DataReader { /// Parse a single pair of binary full path and associated build-id Optional> parseNameBuildIDPair(); - /// Parse the output generated by perf buildid-list to extract the build-id - /// of the binary used when collecting profiling - Optional parsePerfBuildID(); + /// Parse the output generated by "perf buildid-list" to extract build-ids + /// and return a file name matching a given \p FileBuildID. + Optional getFileNameForBuildID(StringRef FileBuildID); public: DataAggregator(raw_ostream &Diag, StringRef BinaryName) @@ -221,11 +221,13 @@ class DataAggregator : public DataReader { /// Check whether \p FileName is a perf.data file static bool checkPerfDataMagic(StringRef FileName); - /// Launch a subprocess with perf buildid-list to extract the build-id of the - /// binary used when collecting profiling. 
Different than launchPerf*, this - /// one spawns the subprocess and blocks. Then it parses the result and - /// returns the build-id. - Optional getPerfBuildID(); + /// If we have a build-id available for the input file, use it to assist + /// matching profile to a binary. + /// + /// If the binary name changed after profile collection, use build-id + /// to get the proper name in perf data when build-ids are available. + /// If \p FileBuildID has no match, then issue an error and exit. + void processFileBuildID(StringRef FileBuildID); /// Debugging dump methods void dump() const; diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index a9a466f4040c..8eddbc2dff31 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -370,12 +370,6 @@ DiffOnly("diff-only", cl::Hidden, cl::cat(BoltDiffCategory)); -static cl::opt -IgnoreBuildID("ignore-build-id", - cl::desc("continue even if build-ids in input binary and perf.data mismatch"), - cl::init(false), - cl::cat(AggregatorCategory)); - static cl::opt TimeRewrite("time-rewrite", cl::desc("print time spent in rewriting passes"), @@ -950,38 +944,12 @@ RewriteInstance::getBuildID() const { OS << Twine::utohexstr(*CharIter); ++CharIter; } - outs() << "BOLT-INFO: Binary build-id is: " << OS.str() << "\n"; + outs() << "BOLT-INFO: binary build-id is: " << OS.str() << "\n"; return OS.str(); } return NoneType(); } -void RewriteInstance::checkBuildID() { - auto FileBuildID = getBuildID(); - if (!FileBuildID) { - outs() << "BOLT-WARNING: Build ID will not be checked because we could not " - "read one from input binary\n"; - return; - } - auto PerfBuildID = DA.getPerfBuildID(); - if (!PerfBuildID) { - outs() << "BOLT-WARNING: Build ID will not be checked because we could not " - "read one from perf.data\n"; - return; - } - if (*FileBuildID == *PerfBuildID) - return; - - outs() << "BOLT-ERROR: Build ID mismatch! 
This indicates the input binary " - "supplied for data aggregation is not the same recorded by perf " - "when collecting profiling data.\n"; - - if (!opts::IgnoreBuildID) { - DA.abort(); - exit(1); - } -} - void RewriteInstance::run() { if (!BC) { errs() << "BOLT-ERROR: failed to create a binary context\n"; @@ -1015,8 +983,15 @@ void RewriteInstance::run() { (llvm::Triple::ArchType)InputFile->getArch()) << "\n"; - if (DA.started()) - checkBuildID(); + if (DA.started()) { + if (auto FileBuildID = getBuildID()) { + DA.processFileBuildID(*FileBuildID); + } else { + errs() << "BOLT-WARNING: build-id will not be checked because we could " + "not read one from input binary\n"; + } + } + unsigned PassNumber = 1; executeRewritePass({}); if (opts::AggregateOnly || opts::DiffOnly) diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index edebf647622c..5b2c984d6740 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -121,10 +121,6 @@ class RewriteInstance { /// Run all the necessary steps to read, optimize and rewrite the binary. void run(); - /// Check that binary build ID matches the one used in perf.data to collect - /// profile - void checkBuildID(); - /// Diff this instance against another one. Non-const since we may run passes /// to fold identical functions. void compare(RewriteInstance &RI2); From bb79d9ad5d756e8f7ac992ed506697e44b5c01cc Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 22 May 2018 15:52:21 -0700 Subject: [PATCH 424/904] [BOLT][NFC] Move ICF pass into a separate file Summary: Consolidate code used by identical code folding under Passes/IdenticalCodeFolding.cpp. 
(cherry picked from commit f7da703cef802b6711aaa225c65da49a23d3e34e) --- bolt/src/BinaryBasicBlock.h | 4 +- bolt/src/BinaryFunction.cpp | 170 --------- bolt/src/BinaryFunction.h | 123 ++----- bolt/src/BinaryPassManager.cpp | 1 + bolt/src/BoltDiff.cpp | 2 +- bolt/src/Passes/BinaryPasses.cpp | 148 -------- bolt/src/Passes/BinaryPasses.h | 16 - bolt/src/Passes/CMakeLists.txt | 1 + bolt/src/Passes/IdenticalCodeFolding.cpp | 425 +++++++++++++++++++++++ bolt/src/Passes/IdenticalCodeFolding.h | 41 +++ 10 files changed, 495 insertions(+), 436 deletions(-) create mode 100644 bolt/src/Passes/IdenticalCodeFolding.cpp create mode 100644 bolt/src/Passes/IdenticalCodeFolding.h diff --git a/bolt/src/BinaryBasicBlock.h b/bolt/src/BinaryBasicBlock.h index 946de333093e..57f883968788 100644 --- a/bolt/src/BinaryBasicBlock.h +++ b/bolt/src/BinaryBasicBlock.h @@ -825,9 +825,9 @@ class BinaryBasicBlock { LayoutIndex = Index; } - /// FIXME + /// Needed by graph traits. BinaryFunction *getParent() const { - return nullptr; + return getFunction(); } private: diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 2f45be190f4b..7d580cee5952 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -2981,136 +2981,6 @@ BinaryFunction::BasicBlockOrderType BinaryFunction::dfs() const { return DFS; } -bool BinaryFunction::isIdenticalWith(const BinaryFunction &OtherBF, - bool IgnoreSymbols, - bool UseDFS) const { - assert(hasCFG() && OtherBF.hasCFG() && "both functions should have CFG"); - - // Compare the two functions, one basic block at a time. - // Currently we require two identical basic blocks to have identical - // instruction sequences and the same index in their corresponding - // functions. The latter is important for CFG equality. - - if (layout_size() != OtherBF.layout_size()) - return false; - - // Comparing multi-entry functions could be non-trivial. 
- if (isMultiEntry() || OtherBF.isMultiEntry()) - return false; - - // Process both functions in either DFS or existing order. - const auto &Order = UseDFS ? dfs() : BasicBlocksLayout; - const auto &OtherOrder = UseDFS ? OtherBF.dfs() : OtherBF.BasicBlocksLayout; - - auto BBI = OtherOrder.begin(); - for (const auto *BB : Order) { - const auto *OtherBB = *BBI; - - if (BB->getLayoutIndex() != OtherBB->getLayoutIndex()) - return false; - - // Compare successor basic blocks. - // NOTE: the comparison for jump tables is only partially verified here. - if (BB->succ_size() != OtherBB->succ_size()) - return false; - - auto SuccBBI = OtherBB->succ_begin(); - for (const auto *SuccBB : BB->successors()) { - const auto *SuccOtherBB = *SuccBBI; - if (SuccBB->getLayoutIndex() != SuccOtherBB->getLayoutIndex()) - return false; - ++SuccBBI; - } - - // Compare all instructions including pseudos. - auto I = BB->begin(), E = BB->end(); - auto OtherI = OtherBB->begin(), OtherE = OtherBB->end(); - while (I != E && OtherI != OtherE) { - - bool Identical; - if (IgnoreSymbols) { - Identical = - isInstrEquivalentWith(*I, *BB, *OtherI, *OtherBB, OtherBF, - [](const MCSymbol *A, const MCSymbol *B) { - return true; - }); - } else { - // Compare symbols. - auto AreSymbolsIdentical = [&] (const MCSymbol *A, const MCSymbol *B) { - if (A == B) - return true; - - // All local symbols are considered identical since they affect a - // control flow and we check the control flow separately. - // If a local symbol is escaped, then the function (potentially) has - // multiple entry points and we exclude such functions from - // comparison. - if (A->isTemporary() && B->isTemporary()) - return true; - - // Compare symbols as functions. - const auto *FunctionA = BC.getFunctionForSymbol(A); - const auto *FunctionB = BC.getFunctionForSymbol(B); - if (FunctionA && FunctionB) { - // Self-referencing functions and recursive calls. 
- if (FunctionA == this && FunctionB == &OtherBF) - return true; - return FunctionA == FunctionB; - } - - // Check if symbols are jump tables. - auto *SIA = BC.getBinaryDataByName(A->getName()); - if (!SIA) - return false; - auto *SIB = BC.getBinaryDataByName(B->getName()); - if (!SIB) - return false; - - assert((SIA->getAddress() != SIB->getAddress()) && - "different symbols should not have the same value"); - - const auto *JumpTableA = - getJumpTableContainingAddress(SIA->getAddress()); - if (!JumpTableA) - return false; - - const auto *JumpTableB = - OtherBF.getJumpTableContainingAddress(SIB->getAddress()); - if (!JumpTableB) - return false; - - if ((SIA->getAddress() - JumpTableA->getAddress()) != - (SIB->getAddress() - JumpTableB->getAddress())) - return false; - - return equalJumpTables(JumpTableA, JumpTableB, OtherBF); - }; - - Identical = - isInstrEquivalentWith(*I, *BB, *OtherI, *OtherBB, OtherBF, - AreSymbolsIdentical); - } - - if (!Identical) - return false; - - ++I; ++OtherI; - } - - // One of the identical blocks may have a trailing unconditional jump that - // is ignored for CFG purposes. - auto *TrailingInstr = (I != E ? &(*I) - : (OtherI != OtherE ? &(*OtherI) : 0)); - if (TrailingInstr && !BC.MIB->isUnconditionalBranch(*TrailingInstr)) { - return false; - } - - ++BBI; - } - - return true; -} - std::string BinaryFunction::generateJumpTableName(uint64_t Address) const { auto *JT = getJumpTableContainingAddress(Address); size_t Id; @@ -3129,46 +2999,6 @@ std::string BinaryFunction::generateJumpTableName(uint64_t Address) const { (Offset ? ("." 
+ std::to_string(Offset)) : "")); } -bool BinaryFunction::equalJumpTables(const JumpTable *JumpTableA, - const JumpTable *JumpTableB, - const BinaryFunction &BFB) const { - if (JumpTableA->EntrySize != JumpTableB->EntrySize) - return false; - - if (JumpTableA->Type != JumpTableB->Type) - return false; - - if (JumpTableA->getSize() != JumpTableB->getSize()) - return false; - - for (uint64_t Index = 0; Index < JumpTableA->Entries.size(); ++Index) { - const auto *LabelA = JumpTableA->Entries[Index]; - const auto *LabelB = JumpTableB->Entries[Index]; - - const auto *TargetA = getBasicBlockForLabel(LabelA); - const auto *TargetB = BFB.getBasicBlockForLabel(LabelB); - - if (!TargetA || !TargetB) { - assert((TargetA || LabelA == getFunctionEndLabel()) && - "no target basic block found"); - assert((TargetB || LabelB == BFB.getFunctionEndLabel()) && - "no target basic block found"); - - if (TargetA != TargetB) - return false; - - continue; - } - - assert(TargetA && TargetB && "cannot locate target block(s)"); - - if (TargetA->getLayoutIndex() != TargetB->getLayoutIndex()) - return false; - } - - return true; -} - std::size_t BinaryFunction::hash(bool Recompute, bool UseDFS) const { if (size() == 0) return 0; diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index 48766103143e..82c65d491390 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -424,60 +424,6 @@ class BinaryFunction { /// Synchronize branch instructions with CFG. void postProcessBranches(); - /// Helper function that compares an instruction of this function to the - /// given instruction of the given function. The functions should have - /// identical CFG. 
- template - bool isInstrEquivalentWith( - const MCInst &InstA, const BinaryBasicBlock &BBA, - const MCInst &InstB, const BinaryBasicBlock &BBB, - const BinaryFunction &BFB, Compare Comp) const { - if (InstA.getOpcode() != InstB.getOpcode()) { - return false; - } - - // In this function we check for special conditions: - // - // * instructions with landing pads - // - // Most of the common cases should be handled by MCPlus::equals() - // that compares regular instruction operands. - // - // NB: there's no need to compare jump table indirect jump instructions - // separately as jump tables are handled by comparing corresponding - // symbols. - const auto EHInfoA = BC.MIB->getEHInfo(InstA); - const auto EHInfoB = BC.MIB->getEHInfo(InstB); - - if (EHInfoA || EHInfoB) { - if (!EHInfoA && (EHInfoB->first || EHInfoB->second)) - return false; - - if (!EHInfoB && (EHInfoA->first || EHInfoA->second)) - return false; - - if (EHInfoA && EHInfoB) { - // Action indices should match. - if (EHInfoA->second != EHInfoB->second) - return false; - - if (!EHInfoA->first != !EHInfoB->first) - return false; - - if (EHInfoA->first && EHInfoB->first) { - const auto *LPA = BBA.getLandingPad(EHInfoA->first); - const auto *LPB = BBB.getLandingPad(EHInfoB->first); - assert(LPA && LPB && "cannot locate landing pad(s)"); - - if (LPA->getLayoutIndex() != LPB->getLayoutIndex()) - return false; - } - } - } - - return BC.MIB->equals(InstA, InstB, Comp); - } - /// Recompute landing pad information for the function and all its blocks. void recomputeLandingPads(); @@ -583,45 +529,16 @@ class BinaryFunction { /// jump table names. mutable std::map JumpTableIds; - /// Generate a unique name for this jump table at the given address that should - /// be repeatable no matter what the start address of the table is. + /// Generate a unique name for this jump table at the given address that + /// should be repeatable no matter what the start address of the table is. 
std::string generateJumpTableName(uint64_t Address) const; - /// Return jump table that covers a given \p Address in memory. - JumpTable *getJumpTableContainingAddress(uint64_t Address) { - auto JTI = JumpTables.upper_bound(Address); - if (JTI == JumpTables.begin()) - return nullptr; - --JTI; - if (JTI->first + JTI->second->getSize() > Address) { - return JTI->second; - } - return nullptr; - } - - const JumpTable *getJumpTableContainingAddress(uint64_t Address) const { - auto JTI = JumpTables.upper_bound(Address); - if (JTI == JumpTables.begin()) - return nullptr; - --JTI; - if (JTI->first + JTI->second->getSize() > Address) { - return JTI->second; - } - return nullptr; - } - /// Iterate over all jump tables associated with this function. iterator_range::const_iterator> jumpTables() const { return make_range(JumpTables.begin(), JumpTables.end()); } - /// Compare two jump tables in 2 functions. The function relies on consistent - /// ordering of basic blocks in both binary functions (e.g. DFS). - bool equalJumpTables(const JumpTable *JumpTableA, - const JumpTable *JumpTableB, - const BinaryFunction &BFB) const; - /// All jump table sites in the function. std::vector> JTSites; @@ -767,7 +684,6 @@ class BinaryFunction { } public: - BinaryFunction(BinaryFunction &&) = default; using iterator = pointee_iterator; @@ -872,6 +788,11 @@ class BinaryFunction { } } + /// Return current basic block layout. + const BasicBlockOrderType &getLayout() const { + return BasicBlocksLayout; + } + /// Return a list of basic blocks sorted using DFS and update layout indices /// using the same order. Does not modify the current layout. BasicBlockOrderType dfs() const; @@ -985,6 +906,23 @@ class BinaryFunction { /// CFG is constructed or while instruction offsets are available in CFG. MCInst *getInstructionAtOffset(uint64_t Offset); + /// Return jump table that covers a given \p Address in memory. 
+ JumpTable *getJumpTableContainingAddress(uint64_t Address) { + auto JTI = JumpTables.upper_bound(Address); + if (JTI == JumpTables.begin()) + return nullptr; + --JTI; + if (JTI->first + JTI->second->getSize() > Address) { + return JTI->second; + } + return nullptr; + } + + const JumpTable *getJumpTableContainingAddress(uint64_t Address) const { + return const_cast(this)-> + getJumpTableContainingAddress(Address); + } + /// Return the name of the function as extracted from the binary file. /// If the function has multiple names - return the last one /// followed by "(*#)". @@ -2125,19 +2063,6 @@ class BinaryFunction { /// Convert function-level branch data into instruction annotations. void convertBranchData(); - /// Returns true if this function has identical code and CFG with - /// the given function \p BF. - /// - /// If \p IgnoreSymbols is set to true, then symbolic operands are ignored - /// during comparison. - /// - /// If \p UseDFS is set to true, then compute DFS of each function and use - /// is for CFG equivalency. Potentially it will help to catch more cases, - /// but is slower. - bool isIdenticalWith(const BinaryFunction &BF, - bool IgnoreSymbols = false, - bool UseDFS = false) const; - /// Returns a hash value for the function. To be used for ICF. Two congruent /// functions (functions with different symbolic references but identical /// otherwise) are required to have identical hashes. 
diff --git a/bolt/src/BinaryPassManager.cpp b/bolt/src/BinaryPassManager.cpp index 145810ddaf28..3a6af074618a 100644 --- a/bolt/src/BinaryPassManager.cpp +++ b/bolt/src/BinaryPassManager.cpp @@ -13,6 +13,7 @@ #include "Passes/Aligner.h" #include "Passes/AllocCombiner.h" #include "Passes/FrameOptimizer.h" +#include "Passes/IdenticalCodeFolding.h" #include "Passes/IndirectCallPromotion.h" #include "Passes/Inliner.h" #include "Passes/LongJmp.h" diff --git a/bolt/src/BoltDiff.cpp b/bolt/src/BoltDiff.cpp index c15d240f4fd8..3ecea1490b47 100644 --- a/bolt/src/BoltDiff.cpp +++ b/bolt/src/BoltDiff.cpp @@ -13,7 +13,7 @@ //===----------------------------------------------------------------------===// #include "RewriteInstance.h" -#include "Passes/BinaryPasses.h" +#include "Passes/IdenticalCodeFolding.h" #include "llvm/Support/CommandLine.h" #undef DEBUG_TYPE diff --git a/bolt/src/Passes/BinaryPasses.cpp b/bolt/src/Passes/BinaryPasses.cpp index cdfe768dd8af..853b67b8f2b8 100644 --- a/bolt/src/Passes/BinaryPasses.cpp +++ b/bolt/src/Passes/BinaryPasses.cpp @@ -75,13 +75,6 @@ DynoStatsSortOrderOpt("print-sorted-by-order", cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory)); -static cl::opt -ICFUseDFS("icf-dfs", - cl::desc("use DFS ordering when using -icf option"), - cl::ReallyHidden, - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - static cl::opt MinBranchClusters("min-branch-clusters", cl::desc("use a modified clustering algorithm geared towards minimizing " @@ -1259,147 +1252,6 @@ void SimplifyRODataLoads::runOnFunctions( << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n"; } -void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &) { - const auto OriginalFunctionCount = BFs.size(); - uint64_t NumFunctionsFolded = 0; - uint64_t NumJTFunctionsFolded = 0; - uint64_t BytesSavedEstimate = 0; - uint64_t CallsSavedEstimate = 0; - static bool UseDFS = opts::ICFUseDFS; - - // This hash table is used to identify 
identical functions. It maps - // a function to a bucket of functions identical to it. - struct KeyHash { - std::size_t operator()(const BinaryFunction *F) const { - return F->hash(/*Recompute=*/false); - } - }; - struct KeyCongruent { - bool operator()(const BinaryFunction *A, const BinaryFunction *B) const { - return A->isIdenticalWith(*B, /*IgnoreSymbols=*/true, /*UseDFS=*/UseDFS); - } - }; - struct KeyEqual { - bool operator()(const BinaryFunction *A, const BinaryFunction *B) const { - return A->isIdenticalWith(*B, /*IgnoreSymbols=*/false, /*UseDFS=*/UseDFS); - } - }; - - // Create buckets with congruent functions - functions that potentially could - // be folded. - std::unordered_map, - KeyHash, KeyCongruent> CongruentBuckets; - for (auto &BFI : BFs) { - auto &BF = BFI.second; - if (!shouldOptimize(BF) || BF.isFolded()) - continue; - - // Make sure indices are in-order. - BF.updateLayoutIndices(); - - // Pre-compute hash before pushing into hashtable. - BF.hash(/*Recompute=*/true, /*UseDFS=*/UseDFS); - - CongruentBuckets[&BF].emplace(&BF); - } - - // We repeat the pass until no new modifications happen. - unsigned Iteration = 1; - uint64_t NumFoldedLastIteration; - do { - NumFoldedLastIteration = 0; - - DEBUG(dbgs() << "BOLT-DEBUG: ICF iteration " << Iteration << "...\n"); - - for (auto &CBI : CongruentBuckets) { - auto &Candidates = CBI.second; - if (Candidates.size() < 2) - continue; - - // Identical functions go into the same bucket. - std::unordered_map, - KeyHash, KeyEqual> IdenticalBuckets; - for (auto *BF : Candidates) { - IdenticalBuckets[BF].emplace_back(BF); - } - - for (auto &IBI : IdenticalBuckets) { - // Functions identified as identical. - auto &Twins = IBI.second; - if (Twins.size() < 2) - continue; - - // Fold functions. Keep the order consistent across invocations with - // different options. 
- std::stable_sort(Twins.begin(), Twins.end(), - [](const BinaryFunction *A, const BinaryFunction *B) { - return A->getFunctionNumber() < B->getFunctionNumber(); - }); - - BinaryFunction *ParentBF = Twins[0]; - for (unsigned i = 1; i < Twins.size(); ++i) { - auto *ChildBF = Twins[i]; - DEBUG(dbgs() << "BOLT-DEBUG: folding " << *ChildBF << " into " - << *ParentBF << '\n'); - - // Remove child function from the list of candidates. - auto FI = Candidates.find(ChildBF); - assert(FI != Candidates.end() && - "function expected to be in the set"); - Candidates.erase(FI); - - // Fold the function and remove from the list of processed functions. - BytesSavedEstimate += ChildBF->getSize(); - CallsSavedEstimate += std::min(ChildBF->getKnownExecutionCount(), - ParentBF->getKnownExecutionCount()); - BC.foldFunction(*ChildBF, *ParentBF, BFs); - - ++NumFoldedLastIteration; - - if (ParentBF->hasJumpTables()) - ++NumJTFunctionsFolded; - } - } - - } - NumFunctionsFolded += NumFoldedLastIteration; - ++Iteration; - - } while (NumFoldedLastIteration > 0); - - DEBUG( - // Print functions that are congruent but not identical. - for (auto &CBI : CongruentBuckets) { - auto &Candidates = CBI.second; - if (Candidates.size() < 2) - continue; - dbgs() << "BOLT-DEBUG: the following " << Candidates.size() - << " functions (each of size " << (*Candidates.begin())->getSize() - << " bytes) are congruent but not identical:\n"; - for (auto *BF : Candidates) { - dbgs() << " " << *BF; - if (BF->getKnownExecutionCount()) { - dbgs() << " (executed " << BF->getKnownExecutionCount() << " times)"; - } - dbgs() << '\n'; - } - } - ); - - if (NumFunctionsFolded) { - outs() << "BOLT-INFO: ICF folded " << NumFunctionsFolded - << " out of " << OriginalFunctionCount << " functions in " - << Iteration << " passes. 
" - << NumJTFunctionsFolded << " functions had jump tables.\n" - << "BOLT-INFO: Removing all identical functions will save " - << format("%.2lf", (double) BytesSavedEstimate / 1024) - << " KB of code space. Folded functions were called " - << CallsSavedEstimate << " times based on profile.\n"; - } -} - void PrintProgramStats::runOnFunctions(BinaryContext &BC, std::map &BFs, diff --git a/bolt/src/Passes/BinaryPasses.h b/bolt/src/Passes/BinaryPasses.h index 029a4e2cad97..de9c44f83c23 100644 --- a/bolt/src/Passes/BinaryPasses.h +++ b/bolt/src/Passes/BinaryPasses.h @@ -384,22 +384,6 @@ class SimplifyRODataLoads : public BinaryFunctionPass { std::set &LargeFunctions) override; }; -/// An optimization that replaces references to identical functions with -/// references to a single one of them. -/// -class IdenticalCodeFolding : public BinaryFunctionPass { -public: - explicit IdenticalCodeFolding(const cl::opt &PrintPass) - : BinaryFunctionPass(PrintPass) { } - - const char *getName() const override { - return "identical-code-folding"; - } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; -}; - /// Prints a list of the top 100 functions sorted by a set of /// dyno stats categories. 
class PrintProgramStats : public BinaryFunctionPass { diff --git a/bolt/src/Passes/CMakeLists.txt b/bolt/src/Passes/CMakeLists.txt index 46903cc5904d..e088752036e7 100644 --- a/bolt/src/Passes/CMakeLists.txt +++ b/bolt/src/Passes/CMakeLists.txt @@ -12,6 +12,7 @@ add_llvm_library(LLVMBOLTPasses FrameOptimizer.cpp HFSort.cpp HFSortPlus.cpp + IdenticalCodeFolding.cpp IndirectCallPromotion.cpp Inliner.cpp JTFootprintReduction.cpp diff --git a/bolt/src/Passes/IdenticalCodeFolding.cpp b/bolt/src/Passes/IdenticalCodeFolding.cpp new file mode 100644 index 000000000000..65bfbf1f31e7 --- /dev/null +++ b/bolt/src/Passes/IdenticalCodeFolding.cpp @@ -0,0 +1,425 @@ +//===--- IdenticalCodeFolding.cpp -----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + + +#include "Passes/IdenticalCodeFolding.h" +#include "llvm/Support/Options.h" +#include +#include +#include + +#define DEBUG_TYPE "bolt-icf" + +using namespace llvm; +using namespace bolt; + +namespace opts { + +extern cl::OptionCategory BoltOptCategory; + +static cl::opt +UseDFS("icf-dfs", + cl::desc("use DFS ordering when using -icf option"), + cl::ReallyHidden, + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +} // namespace opts + +namespace { + +/// Compare two jump tables in 2 functions. The function relies on consistent +/// ordering of basic blocks in both binary functions (e.g. DFS). 
+bool equalJumpTables(const JumpTable &JumpTableA, + const JumpTable &JumpTableB, + const BinaryFunction &FunctionA, + const BinaryFunction &FunctionB) { + if (JumpTableA.EntrySize != JumpTableB.EntrySize) + return false; + + if (JumpTableA.Type != JumpTableB.Type) + return false; + + if (JumpTableA.getSize() != JumpTableB.getSize()) + return false; + + for (uint64_t Index = 0; Index < JumpTableA.Entries.size(); ++Index) { + const auto *LabelA = JumpTableA.Entries[Index]; + const auto *LabelB = JumpTableB.Entries[Index]; + + const auto *TargetA = FunctionA.getBasicBlockForLabel(LabelA); + const auto *TargetB = FunctionB.getBasicBlockForLabel(LabelB); + + if (!TargetA || !TargetB) { + assert((TargetA || LabelA == FunctionA.getFunctionEndLabel()) && + "no target basic block found"); + assert((TargetB || LabelB == FunctionB.getFunctionEndLabel()) && + "no target basic block found"); + + if (TargetA != TargetB) + return false; + + continue; + } + + assert(TargetA && TargetB && "cannot locate target block(s)"); + + if (TargetA->getLayoutIndex() != TargetB->getLayoutIndex()) + return false; + } + + return true; +} + +/// Helper function that compares an instruction of this function to the +/// given instruction of the given function. The functions should have +/// identical CFG. +template +bool isInstrEquivalentWith(const MCInst &InstA, const BinaryBasicBlock &BBA, + const MCInst &InstB, const BinaryBasicBlock &BBB, + Compare Comp) { + if (InstA.getOpcode() != InstB.getOpcode()) { + return false; + } + + const auto &BC = BBA.getFunction()->getBinaryContext(); + + // In this function we check for special conditions: + // + // * instructions with landing pads + // + // Most of the common cases should be handled by MCPlus::equals() + // that compares regular instruction operands. + // + // NB: there's no need to compare jump table indirect jump instructions + // separately as jump tables are handled by comparing corresponding + // symbols. 
+ const auto EHInfoA = BC.MIB->getEHInfo(InstA); + const auto EHInfoB = BC.MIB->getEHInfo(InstB); + + if (EHInfoA || EHInfoB) { + if (!EHInfoA && (EHInfoB->first || EHInfoB->second)) + return false; + + if (!EHInfoB && (EHInfoA->first || EHInfoA->second)) + return false; + + if (EHInfoA && EHInfoB) { + // Action indices should match. + if (EHInfoA->second != EHInfoB->second) + return false; + + if (!EHInfoA->first != !EHInfoB->first) + return false; + + if (EHInfoA->first && EHInfoB->first) { + const auto *LPA = BBA.getLandingPad(EHInfoA->first); + const auto *LPB = BBB.getLandingPad(EHInfoB->first); + assert(LPA && LPB && "cannot locate landing pad(s)"); + + if (LPA->getLayoutIndex() != LPB->getLayoutIndex()) + return false; + } + } + } + + return BC.MIB->equals(InstA, InstB, Comp); +} + + +/// Returns true if this function has identical code and CFG with +/// the given function \p BF. +/// +/// If \p IgnoreSymbols is set to true, then symbolic operands are ignored +/// during comparison. +/// +/// If \p UseDFS is set to true, then compute DFS of each function and use +/// is for CFG equivalency. Potentially it will help to catch more cases, +/// but is slower. +bool isIdenticalWith(const BinaryFunction &A, const BinaryFunction &B, + bool IgnoreSymbols, bool UseDFS) { + assert(A.hasCFG() && B.hasCFG() && "both functions should have CFG"); + + // Compare the two functions, one basic block at a time. + // Currently we require two identical basic blocks to have identical + // instruction sequences and the same index in their corresponding + // functions. The latter is important for CFG equality. + + if (A.layout_size() != B.layout_size()) + return false; + + // Comparing multi-entry functions could be non-trivial. + if (A.isMultiEntry() || B.isMultiEntry()) + return false; + + // Process both functions in either DFS or existing order. + const auto &OrderA = UseDFS ? A.dfs() : A.getLayout(); + const auto &OrderB = UseDFS ? 
B.dfs() : B.getLayout(); + + const auto &BC = A.getBinaryContext(); + + auto BBI = OrderB.begin(); + for (const auto *BB : OrderA) { + const auto *OtherBB = *BBI; + + if (BB->getLayoutIndex() != OtherBB->getLayoutIndex()) + return false; + + // Compare successor basic blocks. + // NOTE: the comparison for jump tables is only partially verified here. + if (BB->succ_size() != OtherBB->succ_size()) + return false; + + auto SuccBBI = OtherBB->succ_begin(); + for (const auto *SuccBB : BB->successors()) { + const auto *SuccOtherBB = *SuccBBI; + if (SuccBB->getLayoutIndex() != SuccOtherBB->getLayoutIndex()) + return false; + ++SuccBBI; + } + + // Compare all instructions including pseudos. + auto I = BB->begin(), E = BB->end(); + auto OtherI = OtherBB->begin(), OtherE = OtherBB->end(); + while (I != E && OtherI != OtherE) { + + bool Identical; + if (IgnoreSymbols) { + Identical = + isInstrEquivalentWith(*I, *BB, *OtherI, *OtherBB, + [](const MCSymbol *A, const MCSymbol *B) { + return true; + }); + } else { + // Compare symbols. + auto AreSymbolsIdentical = [&] (const MCSymbol *SymbolA, + const MCSymbol *SymbolB) { + if (SymbolA == SymbolB) + return true; + + // All local symbols are considered identical since they affect a + // control flow and we check the control flow separately. + // If a local symbol is escaped, then the function (potentially) has + // multiple entry points and we exclude such functions from + // comparison. + if (SymbolA->isTemporary() && SymbolB->isTemporary()) + return true; + + // Compare symbols as functions. + const auto *FunctionA = BC.getFunctionForSymbol(SymbolA); + const auto *FunctionB = BC.getFunctionForSymbol(SymbolB); + if (FunctionA && FunctionB) { + // Self-referencing functions and recursive calls. + if (FunctionA == &A && FunctionB == &B) + return true; + return FunctionA == FunctionB; + } + + // Check if symbols are jump tables. 
+ auto *SIA = BC.getBinaryDataByName(SymbolA->getName()); + if (!SIA) + return false; + auto *SIB = BC.getBinaryDataByName(SymbolB->getName()); + if (!SIB) + return false; + + assert((SIA->getAddress() != SIB->getAddress()) && + "different symbols should not have the same value"); + + const auto *JumpTableA = + A.getJumpTableContainingAddress(SIA->getAddress()); + if (!JumpTableA) + return false; + + const auto *JumpTableB = + B.getJumpTableContainingAddress(SIB->getAddress()); + if (!JumpTableB) + return false; + + if ((SIA->getAddress() - JumpTableA->getAddress()) != + (SIB->getAddress() - JumpTableB->getAddress())) + return false; + + return equalJumpTables(*JumpTableA, *JumpTableB, A, B); + }; + + Identical = + isInstrEquivalentWith(*I, *BB, *OtherI, *OtherBB, + AreSymbolsIdentical); + } + + if (!Identical) { + return false; + } + + ++I; ++OtherI; + } + + // One of the identical blocks may have a trailing unconditional jump that + // is ignored for CFG purposes. + auto *TrailingInstr = (I != E ? &(*I) + : (OtherI != OtherE ? &(*OtherI) : 0)); + if (TrailingInstr && !BC.MIB->isUnconditionalBranch(*TrailingInstr)) { + return false; + } + + ++BBI; + } + + return true; +} +} + +namespace llvm { +namespace bolt { + +void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &) { + const auto OriginalFunctionCount = BFs.size(); + uint64_t NumFunctionsFolded = 0; + uint64_t NumJTFunctionsFolded = 0; + uint64_t BytesSavedEstimate = 0; + uint64_t CallsSavedEstimate = 0; + + // This hash table is used to identify identical functions. It maps + // a function to a bucket of functions identical to it. 
+ struct KeyHash { + std::size_t operator()(const BinaryFunction *F) const { + return F->hash(/*Recompute=*/false); + } + }; + struct KeyCongruent { + bool operator()(const BinaryFunction *A, const BinaryFunction *B) const { + return isIdenticalWith(*A, *B, /*IgnoreSymbols=*/true, opts::UseDFS); + } + }; + struct KeyEqual { + bool operator()(const BinaryFunction *A, const BinaryFunction *B) const { + return isIdenticalWith(*A, *B, /*IgnoreSymbols=*/false, opts::UseDFS); + } + }; + + // Create buckets with congruent functions - functions that potentially could + // be folded. + std::unordered_map, + KeyHash, KeyCongruent> CongruentBuckets; + for (auto &BFI : BFs) { + auto &BF = BFI.second; + if (!shouldOptimize(BF) || BF.isFolded()) + continue; + + // Make sure indices are in-order. + BF.updateLayoutIndices(); + + // Pre-compute hash before pushing into hashtable. + BF.hash(/*Recompute=*/true, opts::UseDFS); + + CongruentBuckets[&BF].emplace(&BF); + } + + // We repeat the pass until no new modifications happen. + unsigned Iteration = 1; + uint64_t NumFoldedLastIteration; + do { + NumFoldedLastIteration = 0; + + DEBUG(dbgs() << "BOLT-DEBUG: ICF iteration " << Iteration << "...\n"); + + for (auto &CBI : CongruentBuckets) { + auto &Candidates = CBI.second; + if (Candidates.size() < 2) + continue; + + // Identical functions go into the same bucket. + std::unordered_map, + KeyHash, KeyEqual> IdenticalBuckets; + for (auto *BF : Candidates) { + IdenticalBuckets[BF].emplace_back(BF); + } + + for (auto &IBI : IdenticalBuckets) { + // Functions identified as identical. + auto &Twins = IBI.second; + if (Twins.size() < 2) + continue; + + // Fold functions. Keep the order consistent across invocations with + // different options. 
+ std::stable_sort(Twins.begin(), Twins.end(), + [](const BinaryFunction *A, const BinaryFunction *B) { + return A->getFunctionNumber() < B->getFunctionNumber(); + }); + + BinaryFunction *ParentBF = Twins[0]; + for (unsigned i = 1; i < Twins.size(); ++i) { + auto *ChildBF = Twins[i]; + DEBUG(dbgs() << "BOLT-DEBUG: folding " << *ChildBF << " into " + << *ParentBF << '\n'); + + // Remove child function from the list of candidates. + auto FI = Candidates.find(ChildBF); + assert(FI != Candidates.end() && + "function expected to be in the set"); + Candidates.erase(FI); + + // Fold the function and remove from the list of processed functions. + BytesSavedEstimate += ChildBF->getSize(); + CallsSavedEstimate += std::min(ChildBF->getKnownExecutionCount(), + ParentBF->getKnownExecutionCount()); + BC.foldFunction(*ChildBF, *ParentBF, BFs); + + ++NumFoldedLastIteration; + + if (ParentBF->hasJumpTables()) + ++NumJTFunctionsFolded; + } + } + + } + NumFunctionsFolded += NumFoldedLastIteration; + ++Iteration; + + } while (NumFoldedLastIteration > 0); + + DEBUG( + // Print functions that are congruent but not identical. + for (auto &CBI : CongruentBuckets) { + auto &Candidates = CBI.second; + if (Candidates.size() < 2) + continue; + dbgs() << "BOLT-DEBUG: the following " << Candidates.size() + << " functions (each of size " << (*Candidates.begin())->getSize() + << " bytes) are congruent but not identical:\n"; + for (auto *BF : Candidates) { + dbgs() << " " << *BF; + if (BF->getKnownExecutionCount()) { + dbgs() << " (executed " << BF->getKnownExecutionCount() << " times)"; + } + dbgs() << '\n'; + } + } + ); + + if (NumFunctionsFolded) { + outs() << "BOLT-INFO: ICF folded " << NumFunctionsFolded + << " out of " << OriginalFunctionCount << " functions in " + << Iteration << " passes. 
" + << NumJTFunctionsFolded << " functions had jump tables.\n" + << "BOLT-INFO: Removing all identical functions will save " + << format("%.2lf", (double) BytesSavedEstimate / 1024) + << " KB of code space. Folded functions were called " + << CallsSavedEstimate << " times based on profile.\n"; + } +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/src/Passes/IdenticalCodeFolding.h b/bolt/src/Passes/IdenticalCodeFolding.h new file mode 100644 index 000000000000..708fdb9a0107 --- /dev/null +++ b/bolt/src/Passes/IdenticalCodeFolding.h @@ -0,0 +1,41 @@ +//===--- IdenticalCodeFolding.h -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_IDENTICAL_CODE_FOLDING_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_IDENTICAL_CODE_FOLDING_H + +#include "BinaryContext.h" +#include "BinaryFunction.h" +#include "Passes/BinaryPasses.h" + +namespace llvm { +namespace bolt { + +/// An optimization that replaces references to identical functions with +/// references to a single one of them. 
+/// +class IdenticalCodeFolding : public BinaryFunctionPass { +public: + explicit IdenticalCodeFolding(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { } + + const char *getName() const override { + return "identical-code-folding"; + } + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + +} // namespace bolt +} // namespace llvm + +#endif From 2ffa1549378adf07e72bdf79d017a79e489490f8 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 30 Apr 2018 14:47:32 -0700 Subject: [PATCH 425/904] [BOLT-AArch64] Detect linker stubs and address them Summary: In AArch64, when the binary gets large, the linker inserts stubs with 3 instructions: ADRP to load the PC-relative address of a page; ADD to add the offset of the page; and a branch instruction to do an indirect jump to the contents of X16 (the linker-reserved reg). The problem is that the linker does not issue a relocation for this (since this is not code coming from the assembler), so BOLT has no idea what is the real target, unless it recognizes these instructions and extracts the target by combining the operands of the instructions from the stub. This diff does exactly that.
(cherry picked from commit c24fde284e869e99b9bf594ad04143f488e35582) --- bolt/src/BinaryFunction.cpp | 114 +++++++++++------- bolt/src/BinaryFunction.h | 2 + bolt/src/MCPlusBuilder.h | 28 ++++- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 100 ++++++++++++--- 4 files changed, 179 insertions(+), 65 deletions(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 7d580cee5952..32ed7ba32e01 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -958,32 +958,14 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { Labels[0] = Ctx->createTempSymbol("BB0", false); addEntryPointAtOffset(0); - auto handlePCRelOperand = - [&](MCInst &Instruction, uint64_t Address, uint64_t Size) { - uint64_t TargetAddress{0}; - uint64_t TargetOffset{0}; - MCSymbol *TargetSymbol{nullptr}; - if (!MIB->evaluateMemOperandTarget(Instruction, TargetAddress, Address, - Size)) { - errs() << "BOLT-ERROR: PC-relative operand can't be evaluated:\n"; - BC.InstPrinter->printInst(&Instruction, errs(), "", *BC.STI); - errs() << '\n'; - Instruction.dump_pretty(errs(), BC.InstPrinter.get()); - errs() << '\n'; - return false; - } - if (TargetAddress == 0) { - if (opts::Verbosity >= 1) { - outs() << "BOLT-INFO: PC-relative operand is zero in function " - << *this << ".\n"; - } - } - + auto getOrCreateSymbolForAddress = [&](const MCInst &Instruction, + uint64_t TargetAddress, + uint64_t &SymbolAddend) { if (BC.isAArch64()) { // Check if this is an access to a constant island and create bookkeeping // to keep track of it and emit it later as part of this function if (MCSymbol *IslandSym = getOrCreateIslandAccess(TargetAddress).first) { - TargetSymbol = IslandSym; + return IslandSym; } else { // Detect custom code written in assembly that refers to arbitrary // constant islands from other functions. 
Write this reference so we @@ -997,9 +979,9 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { IslandIter->second->getOrCreateProxyIslandAccess(TargetAddress, this); if (IslandSym) { - TargetSymbol = IslandSym; addConstantIslandDependency(IslandIter->second, IslandSym, ColdIslandSym); + return IslandSym; } } } @@ -1008,11 +990,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { // Note that the address does not necessarily have to reside inside // a section, it could be an absolute address too. auto Section = BC.getSectionForAddress(TargetAddress); - // Assume AArch64's ADRP never references code - it does, but this is fixed - // after reading relocations. ADRP contents now are not really meaningful - // without its supporting relocation. - if (!TargetSymbol && Section && Section->isText() && - (!BC.isAArch64() || !BC.MIB->isADRP(Instruction))) { + if (Section && Section->isText()) { if (containsAddress(TargetAddress, /*UseMaxSize=*/ BC.isAArch64())) { if (TargetAddress != getAddress()) { @@ -1021,27 +999,53 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { DEBUG(dbgs() << "BOLT-DEBUG: potentially escaped address 0x" << Twine::utohexstr(TargetAddress) << " in function " << *this << '\n'); - TargetSymbol = getOrCreateLocalLabel(TargetAddress); - addEntryPointAtOffset(TargetAddress - getAddress()); + return addEntryPointAtOffset(TargetAddress - getAddress()); } } else { BC.InterproceduralReferences.insert(TargetAddress); } } - if (!TargetSymbol) { - auto *BD = BC.getBinaryDataContainingAddress(TargetAddress); - if (BD) { - TargetSymbol = BD->getSymbol(); - TargetOffset = TargetAddress - BD->getAddress(); - } else { - // TODO: use DWARF info to get size/alignment here? 
- TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, 0, 0, "DATAat"); - DEBUG(if (opts::Verbosity >= 2) { - dbgs() << "Created DATAat sym: " << TargetSymbol->getName() - << " in section " << BD->getSectionName() << "\n"; - }); + + auto *BD = BC.getBinaryDataContainingAddress(TargetAddress); + if (BD) { + auto *TargetSymbol = BD->getSymbol(); + SymbolAddend = TargetAddress - BD->getAddress(); + return TargetSymbol; + } + // TODO: use DWARF info to get size/alignment here? + auto *TargetSymbol = + BC.getOrCreateGlobalSymbol(TargetAddress, 0, 0, "DATAat"); + DEBUG(if (opts::Verbosity >= 2) { + dbgs() << "Created DATAat sym: " << TargetSymbol->getName() + << " in section " << BD->getSectionName() << "\n"; + }); + return TargetSymbol; + }; + + auto handlePCRelOperand = + [&](MCInst &Instruction, uint64_t Address, uint64_t Size) { + uint64_t TargetAddress{0}; + uint64_t TargetOffset{0}; + MCSymbol *TargetSymbol{nullptr}; + if (!MIB->evaluateMemOperandTarget(Instruction, TargetAddress, Address, + Size)) { + errs() << "BOLT-ERROR: PC-relative operand can't be evaluated:\n"; + BC.InstPrinter->printInst(&Instruction, errs(), "", *BC.STI); + errs() << '\n'; + Instruction.dump_pretty(errs(), BC.InstPrinter.get()); + errs() << '\n'; + return false; + } + if (TargetAddress == 0) { + if (opts::Verbosity >= 1) { + outs() << "BOLT-INFO: PC-relative operand is zero in function " + << *this << ".\n"; } } + + TargetSymbol = + getOrCreateSymbolForAddress(Instruction, TargetAddress, TargetOffset); + const MCExpr *Expr = MCSymbolRefExpr::create(TargetSymbol, MCSymbolRefExpr::VK_None, *BC.Ctx); @@ -1057,6 +1061,20 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { return true; }; + // Used to fix the target of linker-generated AArch64 stubs with no relocation + // info + auto fixStubTarget = [&](MCInst &LoadLowBits, MCInst &LoadHiBits, + uint64_t Target) { + uint64_t Addend{0}; + int64_t Val; + MCSymbol *TargetSymbol; + TargetSymbol = 
getOrCreateSymbolForAddress(LoadLowBits, Target, Addend); + MIB->replaceImmWithSymbol(LoadHiBits, TargetSymbol, Addend, Ctx.get(), + Val, ELF::R_AARCH64_ADR_PREL_PG_HI21); + MIB->replaceImmWithSymbol(LoadLowBits, TargetSymbol, Addend, Ctx.get(), Val, + ELF::R_AARCH64_ADD_ABS_LO12_NC); + }; + uint64_t Size = 0; // instruction size for (uint64_t Offset = 0; Offset < getSize(); Offset += Size) { MCInst Instruction; @@ -1328,6 +1346,16 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { IsSimple = false; } } + // AArch64 indirect call - check for linker veneers, which lack + // relocations and need manual adjustments + MCInst *TargetHiBits, *TargetLowBits; + uint64_t TargetAddress; + if (BC.isAArch64() && + MIB->matchLinkerVeneer(Instructions.begin(), Instructions.end(), + AbsoluteInstrAddr, Instruction, TargetHiBits, + TargetLowBits, TargetAddress)) { + fixStubTarget(*TargetLowBits, *TargetHiBits, TargetAddress); + } } } else { if (MIB->hasPCRelOperand(Instruction) && !UsedReloc) { @@ -3546,7 +3574,7 @@ std::set BinaryFunction::dataUses(bool OnlyHot) const { } return Uses; } - + DWARFDebugLoc::LocationList BinaryFunction::translateInputToOutputLocationList( const DWARFDebugLoc::LocationList &InputLL, BaseAddress BaseAddr) const { diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index 82c65d491390..5859e6c31399 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -1740,6 +1740,8 @@ class BinaryFunction { if (!ColdIslandSymbols.count(Symbol)) { ColdSymbol = BC.Ctx->getOrCreateSymbol(Symbol->getName() + ".cold"); ColdIslandSymbols[Symbol] = ColdSymbol; + } else { + ColdSymbol = ColdIslandSymbols[Symbol]; } return std::make_pair(Symbol, ColdSymbol); } diff --git a/bolt/src/MCPlusBuilder.h b/bolt/src/MCPlusBuilder.h index 15aab393495a..a07ec6856076 100644 --- a/bolt/src/MCPlusBuilder.h +++ b/bolt/src/MCPlusBuilder.h @@ -1145,21 +1145,41 @@ class MCPlusBuilder { virtual void createLongJmp(std::vector &Seq, const MCSymbol 
*Target, MCContext *Ctx) const { - assert(0 && "not implemented"); + llvm_unreachable("not implemented"); } virtual void createShortJmp(std::vector &Seq, const MCSymbol *Target, MCContext *Ctx) const { - assert(0 && "not implemented"); + llvm_unreachable("not implemented"); + } + + /// Return true if the instruction CurInst, in combination with the recent + /// history of disassembled instructions supplied by [Begin, End), is a linker + /// generated veneer/stub that needs patching. This happens in AArch64 when + /// the code is large and the linker needs to generate stubs, but it does + /// not put any extra relocation information that could help us to easily + /// extract the real target. This function identifies and extracts the real + /// target in Tgt. The instruction that loads the lower bits of the target + /// is put in TgtLowBits, and its pair in TgtHiBits. If the instruction in + /// TgtHiBits does not have an immediate operand, but an expression, then + /// this expression is put in TgtHiSym and Tgt only contains the lower bits. 
+ virtual bool matchLinkerVeneer(InstructionIterator Begin, + InstructionIterator End, + uint64_t Address, + const MCInst &CurInst, + MCInst *&TargetHiBits, + MCInst *&TargetLowBits, + uint64_t &Target) const { + llvm_unreachable("not implemented"); } virtual int getShortJmpEncodingSize() const { - assert(0 && "not implemented"); + llvm_unreachable("not implemented"); return 0; } virtual int getUncondBranchEncodingSize() const { - assert(0 && "not implemented"); + llvm_unreachable("not implemented"); return 0; } diff --git a/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp index f2b1eb63a4c8..6b5bc2b8f94a 100644 --- a/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -212,7 +212,13 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { } bool hasPCRelOperand(const MCInst &Inst) const override { - if (isADR(Inst) || isADRP(Inst)) + // ADRP is blacklisted and is an exception. Even though it has a + // PC-relative operand, this operand is not a complete symbol reference + // and BOLT shouldn't try to process it in isolation. + if (isADRP(Inst)) + return false; + + if (isADR(Inst)) return true; // Look for literal addressing mode (see C1-143 ARM DDI 0487B.a) @@ -492,8 +498,12 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { // Match an ADR to load base address to be used when addressing JT targets auto &UsesAdd = UDChain[DefAdd]; - assert(UsesAdd.size() > 1 && UsesAdd[1] != nullptr && - UsesAdd[2] != nullptr && "Expected definition"); + if (UsesAdd.size() <= 1 || UsesAdd[1] == nullptr || UsesAdd[2] == nullptr) { + // This happens when we don't have enough context about this jump table + // because the jumping code sequence was split in multiple basic blocks. + // This was observed in the wild in HHVM code (dispatchImpl). 
+ return false; + } auto *DefBaseAddr = UsesAdd[1]; assert(DefBaseAddr->getOpcode() == AArch64::ADR && "Failed to match indirect branch pattern! (fragment 3)"); @@ -533,9 +543,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { } assert(DefJTBasePage->getOpcode() == AArch64::ADRP && "Failed to match jump table base page pattern! (2)"); - assert(DefJTBasePage->getOperand(1).isExpr() && - "Failed to match jump table base page pattern! (3)"); - JumpTable = DefJTBasePage->getOperand(1).getExpr(); + if (DefJTBasePage->getOperand(1).isExpr()) + JumpTable = DefJTBasePage->getOperand(1).getExpr(); return true; } @@ -912,6 +921,71 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { Seq.emplace_back(Inst); } + /// Matching pattern here is + /// + /// ADRP x16, imm + /// ADD x16, x16, imm + /// BR x16 + /// + bool matchLinkerVeneer(InstructionIterator Begin, InstructionIterator End, + uint64_t Address, const MCInst &CurInst, + MCInst *&TargetHiBits, MCInst *&TargetLowBits, + uint64_t &Target) const override { + if (CurInst.getOpcode() != AArch64::BR || !CurInst.getOperand(0).isReg() || + CurInst.getOperand(0).getReg() != AArch64::X16) + return false; + + auto I = End; + if (I == Begin) + return false; + + --I; + Address -= 4; + if (I == Begin || + I->getOpcode() != AArch64::ADDXri || + MCPlus::getNumPrimeOperands(*I) < 3 || + !I->getOperand(0).isReg() || + !I->getOperand(1).isReg() || + I->getOperand(0).getReg() != AArch64::X16 || + I->getOperand(1).getReg() != AArch64::X16 || + !I->getOperand(2).isImm()) + return false; + TargetLowBits = &*I; + uint64_t Addr = I->getOperand(2).getImm() & 0xFFF; + + --I; + Address -= 4; + if (I->getOpcode() != AArch64::ADRP || + MCPlus::getNumPrimeOperands(*I) < 2 || + !I->getOperand(0).isReg() || + !I->getOperand(1).isImm() || + I->getOperand(0).getReg() != AArch64::X16) + return false; + TargetHiBits = &*I; + Addr |= (Address + ((int64_t)I->getOperand(1).getImm() << 12)) & + 0xFFFFFFFFFFFFF000ULL; + Target = Addr; + return true; + } 
+ + bool setOperandToSymbolRef(MCInst &Inst, int OpNum, MCSymbol *Symbol, + int64_t Addend, MCContext *Ctx, + uint64_t RelType) const { + MCOperand Operand; + if (!Addend) { + Operand = MCOperand::createExpr(getTargetExprFor( + Inst, MCSymbolRefExpr::create(Symbol, *Ctx), *Ctx, RelType)); + } else { + Operand = MCOperand::createExpr(getTargetExprFor( + Inst, + MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Symbol, *Ctx), + MCConstantExpr::create(Addend, *Ctx), *Ctx), + *Ctx, RelType)); + } + Inst.getOperand(OpNum) = Operand; + return true; + } + bool replaceImmWithSymbol(MCInst &Inst, MCSymbol *Symbol, int64_t Addend, MCContext *Ctx, int64_t &Value, uint64_t RelType) const override { @@ -928,18 +1002,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { Value = Inst.getOperand(ImmOpNo).getImm(); - MCOperand Operand; - if (!Addend) { - Operand = MCOperand::createExpr(getTargetExprFor( - Inst, MCSymbolRefExpr::create(Symbol, *Ctx), *Ctx, RelType)); - } else { - Operand = MCOperand::createExpr(getTargetExprFor( - Inst, - MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Symbol, *Ctx), - MCConstantExpr::create(Addend, *Ctx), *Ctx), - *Ctx, RelType)); - } - Inst.getOperand(ImmOpNo) = Operand; + setOperandToSymbolRef(Inst, ImmOpNo, Symbol, Addend, Ctx, RelType); + return true; } From 7bc059593db40fbf450094bc105f5d2b5d4f56dc Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Sat, 26 May 2018 12:40:51 -0700 Subject: [PATCH 426/904] [BOLT] Initial support for memcpy() inlining Summary: Add "-inline-memcpy" option to inline calls to memcpy() using "rep movsb" instruction. The pass is X86-specific. Calls to _memcpy8 are optimized too using a special return value (dest+size). The implementation is very primitive in that it does not track liveness of %rax after return, and no %rcx substitution. This is going to get improved if we find the optimization to be useful. 
(cherry picked from commit a7abd1b2329a6a5e054bc60f633c1c938cb6f5ee) --- bolt/src/BinaryBasicBlock.h | 16 +++++++- bolt/src/BinaryFunction.h | 1 - bolt/src/BinaryPassManager.cpp | 10 +++++ bolt/src/MCPlusBuilder.h | 7 ++++ bolt/src/Passes/BinaryPasses.cpp | 50 ++++++++++++++++++++++++ bolt/src/Passes/BinaryPasses.h | 15 +++++++ bolt/src/Target/X86/X86MCPlusBuilder.cpp | 24 ++++++++++++ 7 files changed, 121 insertions(+), 2 deletions(-) diff --git a/bolt/src/BinaryBasicBlock.h b/bolt/src/BinaryBasicBlock.h index 57f883968788..d20e8d653433 100644 --- a/bolt/src/BinaryBasicBlock.h +++ b/bolt/src/BinaryBasicBlock.h @@ -701,8 +701,22 @@ class BinaryBasicBlock { return replaceInstruction(Inst, Replacement.begin(), Replacement.end()); } + /// Return iterator pointing to the first inserted instruction. + template + iterator replaceInstruction(iterator II, Itr Begin, Itr End) { + adjustNumPseudos(*II, -1); + adjustNumPseudos(Begin, End, 1); + + return Instructions.insert(Instructions.erase(II), Begin, End); + } + + iterator replaceInstruction(iterator II, + const std::vector &Replacement) { + return replaceInstruction(II, Replacement.begin(), Replacement.end()); + } + /// Insert \p NewInst before \p At, which must be an existing instruction in - /// this BB. Return a pointer to the newly inserted instruction. + /// this BB. Return iterator pointing to the newly inserted instruction. 
iterator insertInstruction(iterator At, MCInst &&NewInst) { adjustNumPseudos(NewInst, 1); return Instructions.emplace(At, std::move(NewInst)); diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index 5859e6c31399..b4f9826fb072 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -2305,7 +2305,6 @@ template <> struct GraphTraits : using nodes_iterator = pointer_iterator; -// typedef bolt::BinaryBasicBlock * nodes_iterator; static nodes_iterator nodes_begin(bolt::BinaryFunction *F) { llvm_unreachable("Not implemented"); return nodes_iterator(F->begin()); diff --git a/bolt/src/BinaryPassManager.cpp b/bolt/src/BinaryPassManager.cpp index 3a6af074618a..fbfbe718ba51 100644 --- a/bolt/src/BinaryPassManager.cpp +++ b/bolt/src/BinaryPassManager.cpp @@ -230,6 +230,13 @@ RegReAssign("reg-reassign", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +static cl::opt +StringOps("inline-memcpy", + cl::desc("inline memcpy using 'rep movsb' instruction (X86-only)"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + static cl::opt StripRepRet("strip-rep-ret", cl::desc("strip 'repz' prefix from 'repz retq' sequence (on by default)"), @@ -358,6 +365,9 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(llvm::make_unique(PrintICF), opts::ICF); + Manager.registerPass(llvm::make_unique(NeverPrint), + opts::StringOps); + Manager.registerPass(llvm::make_unique(PrintICP)); Manager.registerPass(llvm::make_unique(PrintPeepholes)); diff --git a/bolt/src/MCPlusBuilder.h b/bolt/src/MCPlusBuilder.h index a07ec6856076..4b08ed46cf62 100644 --- a/bolt/src/MCPlusBuilder.h +++ b/bolt/src/MCPlusBuilder.h @@ -1283,6 +1283,13 @@ class MCPlusBuilder { return true; } + /// Creates inline memcpy instruction. If \p ReturnEnd is true, then return + /// (dest + n) instead of dest. 
+ virtual std::vector createInlineMemcpy(bool ReturnEnd) const { + llvm_unreachable("not implemented"); + return {}; + } + /// Returns true if instruction is a call frame pseudo instruction. virtual bool isCFI(const MCInst &Inst) const { return Inst.getOpcode() == TargetOpcode::CFI_INSTRUCTION; diff --git a/bolt/src/Passes/BinaryPasses.cpp b/bolt/src/Passes/BinaryPasses.cpp index 853b67b8f2b8..1ee58c0cbb92 100644 --- a/bolt/src/Passes/BinaryPasses.cpp +++ b/bolt/src/Passes/BinaryPasses.cpp @@ -1540,5 +1540,55 @@ void StripRepRet::runOnFunctions( } } +void InlineMemcpy::runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) { + if (!BC.isX86()) + return; + + uint64_t NumInlined = 0; + uint64_t NumInlinedDyno = 0; + for (auto &BFI : BFs) { + for (auto &BB : BFI.second) { + for(auto II = BB.begin(); II != BB.end(); ++II) { + auto &Inst = *II; + + if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || + !Inst.getOperand(0).isExpr()) + continue; + + const auto *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); + if (CalleeSymbol->getName() != "memcpy" && + CalleeSymbol->getName() != "memcpy@PLT" && + CalleeSymbol->getName() != "_memcpy8") + continue; + + const auto IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8"); + const auto IsTailCall = BC.MIB->isTailCall(Inst); + + const auto NewCode = BC.MIB->createInlineMemcpy(IsMemcpy8); + II = BB.replaceInstruction(II, NewCode); + std::advance(II, NewCode.size() - 1); + if (IsTailCall) { + MCInst Return; + BC.MIB->createReturn(Return); + II = BB.insertInstruction(std::next(II), std::move(Return)); + } + + ++NumInlined; + NumInlinedDyno += BB.getKnownExecutionCount(); + } + } + } + + if (NumInlined) { + outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls"; + if (NumInlinedDyno) + outs() << ". 
The calls were executed " << NumInlinedDyno + << " times based on profile."; + outs() << '\n'; + } +} + } // namespace bolt } // namespace llvm diff --git a/bolt/src/Passes/BinaryPasses.h b/bolt/src/Passes/BinaryPasses.h index de9c44f83c23..8caade87dd8a 100644 --- a/bolt/src/Passes/BinaryPasses.h +++ b/bolt/src/Passes/BinaryPasses.h @@ -433,6 +433,21 @@ class StripRepRet : public BinaryFunctionPass { std::set &LargeFunctions) override; }; +/// Pass for inlining calls to memcpy using 'rep movsb' on X86. +class InlineMemcpy : public BinaryFunctionPass { +public: + explicit InlineMemcpy(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) {} + + const char *getName() const override { + return "inline-memcpy"; + } + + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + enum FrameOptimizationType : char { FOP_NONE, /// Don't perform FOP. FOP_HOT, /// Perform FOP on hot functions. diff --git a/bolt/src/Target/X86/X86MCPlusBuilder.cpp b/bolt/src/Target/X86/X86MCPlusBuilder.cpp index ca8eb62523c7..847dee95e9ac 100644 --- a/bolt/src/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/src/Target/X86/X86MCPlusBuilder.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/Triple.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -2713,6 +2714,29 @@ class X86MCPlusBuilder : public MCPlusBuilder { return true; } + std::vector createInlineMemcpy(bool ReturnEnd) const override { + std::vector Code; + if (ReturnEnd) { + Code.emplace_back(MCInstBuilder(X86::LEA64r) + .addReg(X86::RAX) + .addReg(X86::RDI) + .addImm(1) + .addReg(X86::RDX) + .addImm(0) + .addReg(X86::NoRegister)); + } else { + Code.emplace_back(MCInstBuilder(X86::MOV64rr) + .addReg(X86::RAX) + .addReg(X86::RDI)); + } + Code.emplace_back(MCInstBuilder(X86::MOV32rr) + .addReg(X86::ECX) + .addReg(X86::EDX)); + 
Code.emplace_back(MCInstBuilder(X86::REP_MOVSB_64)); + + return Code; + } + bool replaceImmWithSymbol(MCInst &Inst, MCSymbol *Symbol, int64_t Addend, MCContext *Ctx, int64_t &Value, uint64_t RelType) const override { From abfc3c88c27a7fd420a2e2eb03b7e5ab48073a64 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Thu, 17 May 2018 11:14:15 -0700 Subject: [PATCH 427/904] [BOLT] merging cold basic blocks to reduce #jumps Summary: This diff introduces a modification of cache+ block ordering algorithm, which reorders and merges cold blocks in a function with the goal of reducing the number of (non-fallthrough) jumps, and thus, the code size. (cherry picked from commit 5c83b47d0371c8fe866a569ba9afc6124180e32b) --- bolt/src/Passes/BinaryPasses.cpp | 2 +- bolt/src/Passes/CachePlusReorderAlgorithm.cpp | 162 +++++++++++------- bolt/src/Passes/HFSortPlus.cpp | 35 +--- bolt/src/Passes/ReorderAlgorithm.h | 4 - 4 files changed, 111 insertions(+), 92 deletions(-) diff --git a/bolt/src/Passes/BinaryPasses.cpp b/bolt/src/Passes/BinaryPasses.cpp index 1ee58c0cbb92..569598c5ae18 100644 --- a/bolt/src/Passes/BinaryPasses.cpp +++ b/bolt/src/Passes/BinaryPasses.cpp @@ -481,7 +481,7 @@ void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF, break; case LT_OPTIMIZE_CACHE_PLUS: - Algo.reset(new CachePlusReorderAlgorithm(std::move(CAlgo))); + Algo.reset(new CachePlusReorderAlgorithm()); break; case LT_OPTIMIZE_SHUFFLE: diff --git a/bolt/src/Passes/CachePlusReorderAlgorithm.cpp b/bolt/src/Passes/CachePlusReorderAlgorithm.cpp index 26d35f00011b..1e99792bfe53 100644 --- a/bolt/src/Passes/CachePlusReorderAlgorithm.cpp +++ b/bolt/src/Passes/CachePlusReorderAlgorithm.cpp @@ -82,7 +82,7 @@ class Cluster { return Blocks; } - /// Update the list of basic blocks and meta-info + /// Update the list of basic blocks and aggregated cluster data void merge(const Cluster *Other, const std::vector &MergedBlocks, double MergedScore) { @@ -93,6 +93,10 @@ class Cluster { Score = MergedScore; } + void
clear() { + Blocks.clear(); + } + private: std::vector Blocks; size_t Id; @@ -219,65 +223,14 @@ class CachePlus { /// Run cache+ algorithm and return a basic block ordering std::vector run() { - // Merge blocks with their fallthrough successors - for (auto BB : BF.layout()) { - if (FallthroughPred[BB->getLayoutIndex()] == nullptr && - FallthroughSucc[BB->getLayoutIndex()] != nullptr) { - auto CurBB = BB; - while (FallthroughSucc[CurBB->getLayoutIndex()] != nullptr) { - const auto NextBB = FallthroughSucc[CurBB->getLayoutIndex()]; - mergeClusters(&AllClusters[BB->getLayoutIndex()], - &AllClusters[NextBB->getLayoutIndex()], - 0); - CurBB = NextBB; - } - } - } + // Pass 1: Merge blocks with their fallthrough successors + mergeFallthroughs(); - // Merge pairs of clusters while there is an improvement in ExtTSP metric - while (Clusters.size() > 1) { - Cluster *BestClusterPred = nullptr; - Cluster *BestClusterSucc = nullptr; - std::pair BestGain(-1, 0); - for (auto ClusterPred : Clusters) { - // Do not merge cold blocks - if (ClusterPred->isCold()) - continue; + // Pass 2: Merge pairs of clusters while improving the ExtTSP metric + mergeClusterPairs(); - // Get candidates for merging with the current cluster - Adjacent.forAllAdjacent( - ClusterPred, - // Find the best candidate - [&](Cluster *ClusterSucc) { - assert(ClusterPred != ClusterSucc && "loop edges are not supported"); - assert(!ClusterSucc->isCold() && "cannot merge cold clusters"); - - // Compute the gain of merging two clusters - auto Gain = mergeGain(ClusterPred, ClusterSucc); - if (Gain.first <= 0.0) - return; - - // Breaking ties by density to make the hottest clusters be merged first - if (Gain.first > BestGain.first || - (std::abs(Gain.first - BestGain.first) < 1e-8 && - compareClusterPairs(ClusterPred, - ClusterSucc, - BestClusterPred, - BestClusterSucc))) { - BestGain = Gain; - BestClusterPred = ClusterPred; - BestClusterSucc = ClusterSucc; - } - }); - } - - // Stop merging when there is no improvement 
- if (BestGain.first <= 0.0) - break; - - // Merge the best pair of clusters - mergeClusters(BestClusterPred, BestClusterSucc, BestGain.second); - } + // Pass 3: Merge cold blocks to reduce code size + mergeColdClusters(); // Sorting clusters by density std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters); @@ -339,12 +292,14 @@ class CachePlus { // Initialize clusters Clusters.reserve(BF.layout_size()); AllClusters.reserve(BF.layout_size()); + CurCluster.reserve(BF.layout_size()); Size.reserve(BF.layout_size()); for (auto BB : BF.layout()) { size_t Index = BB->getLayoutIndex(); Size.push_back(std::max(BB->estimateSize(), size_t(1))); AllClusters.emplace_back(BB, ExecutionCounts[Index], Size[Index]); Clusters.push_back(&AllClusters[Index]); + CurCluster.push_back(&AllClusters[Index]); } // Initialize adjacency matrix @@ -364,6 +319,88 @@ class CachePlus { findFallthroughBlocks(InWeight, OutWeight); } + /// Merge blocks with their fallthrough successors. + void mergeFallthroughs() { + for (auto BB : BF.layout()) { + if (FallthroughPred[BB->getLayoutIndex()] == nullptr && + FallthroughSucc[BB->getLayoutIndex()] != nullptr) { + auto CurBB = BB; + while (FallthroughSucc[CurBB->getLayoutIndex()] != nullptr) { + const auto NextBB = FallthroughSucc[CurBB->getLayoutIndex()]; + mergeClusters(&AllClusters[BB->getLayoutIndex()], + &AllClusters[NextBB->getLayoutIndex()], + 0); + CurBB = NextBB; + } + } + } + } + + /// Merge pairs of clusters while improving the ExtTSP metric + void mergeClusterPairs() { + while (Clusters.size() > 1) { + Cluster *BestClusterPred = nullptr; + Cluster *BestClusterSucc = nullptr; + std::pair BestGain(-1, 0); + for (auto ClusterPred : Clusters) { + // Do not merge cold blocks + if (ClusterPred->isCold()) + continue; + + // Get candidates for merging with the current cluster + Adjacent.forAllAdjacent( + ClusterPred, + // Find the best candidate + [&](Cluster *ClusterSucc) { + assert(ClusterPred != ClusterSucc && "loop edges are not 
supported"); + assert(!ClusterSucc->isCold() && "cannot merge cold clusters"); + + // Compute the gain of merging two clusters + auto Gain = mergeGain(ClusterPred, ClusterSucc); + if (Gain.first <= 0.0) + return; + + // Breaking ties by density to make the hottest clusters be merged first + if (Gain.first > BestGain.first || + (std::abs(Gain.first - BestGain.first) < 1e-8 && + compareClusterPairs(ClusterPred, + ClusterSucc, + BestClusterPred, + BestClusterSucc))) { + BestGain = Gain; + BestClusterPred = ClusterPred; + BestClusterSucc = ClusterSucc; + } + }); + } + + // Stop merging when there is no improvement + if (BestGain.first <= 0.0) + break; + + // Merge the best pair of clusters + mergeClusters(BestClusterPred, BestClusterSucc, BestGain.second); + } + } + + /// Merge cold blocks to reduce code size + void mergeColdClusters() { + for (auto SrcBB : BF.layout()) { + // Iterating in reverse order to make sure original fall-trough jumps are + // merged first + for (auto Itr = SrcBB->succ_rbegin(); Itr != SrcBB->succ_rend(); ++Itr) { + BinaryBasicBlock *DstBB = *Itr; + auto SrcCluster = CurCluster[SrcBB->getLayoutIndex()]; + auto DstCluster = CurCluster[DstBB->getLayoutIndex()]; + if (SrcCluster != DstCluster && !DstCluster->isEntryPoint() && + SrcCluster->blocks().back() == SrcBB && + DstCluster->blocks().front() == DstBB) { + mergeClusters(SrcCluster, DstCluster, 0); + } + } + } + } + /// For a pair of blocks, A and B, block B is the fallthrough successor of A, /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps /// to B are from A. 
Such blocks should be adjacent in an optimal ordering, @@ -558,11 +595,17 @@ class CachePlus { // Merge the blocks of clusters auto MergedBlocks = mergeBlocks(Into->blocks(), From->blocks(), MergeType); Into->merge(From, MergedBlocks.getBlocks(), score(MergedBlocks)); + From->clear(); // Remove cluster From from the list of active clusters auto Iter = std::remove(Clusters.begin(), Clusters.end(), From); Clusters.erase(Iter, Clusters.end()); + // Update block clusters + for (auto BB : Into->blocks()) { + CurCluster[BB->getLayoutIndex()] = Into; + } + // Invalidate caches Cache.invalidate(Into); @@ -582,6 +625,9 @@ class CachePlus { // Active clusters. The vector gets udpated at runtime when clusters are merged std::vector Clusters; + // Current cluster of a basic block + std::vector CurCluster; + // Size of the block std::vector Size; diff --git a/bolt/src/Passes/HFSortPlus.cpp b/bolt/src/Passes/HFSortPlus.cpp index e02c965c2b2d..6bb9dfbae9a0 100644 --- a/bolt/src/Passes/HFSortPlus.cpp +++ b/bolt/src/Passes/HFSortPlus.cpp @@ -9,24 +9,6 @@ // //===----------------------------------------------------------------------===// -// TODO: copyright/license msg. - -/* - +----------------------------------------------------------------------+ - | HipHop for PHP | - +----------------------------------------------------------------------+ - | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) | - +----------------------------------------------------------------------+ - | This source file is subject to version 3.01 of the PHP license, | - | that is bundled with this package in the file LICENSE, and is | - | available through the world-wide-web at the following url: | - | http://www.php.net/license/3_01.txt | - | If you did not receive a copy of the PHP license and are unable to | - | obtain it through the world-wide-web, please send a note to | - | license@php.net so we can mail you a copy immediately. 
| - +----------------------------------------------------------------------+ -*/ - #include "BinaryFunction.h" #include "HFSort.h" #include "ReorderUtils.h" @@ -112,14 +94,6 @@ bool compareClusterPairs(const Cluster *A1, const Cluster *B1, return A1->target(0) < A2->target(0); } -/// Sorting clusters by their density in decreasing order. -template -std::vector sortByDensity(const C &Clusters_) { - std::vector Clusters(Clusters_.begin(), Clusters_.end()); - std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters); - return Clusters; -} - /// HFSortPlus - layout of hot functions with iTLB cache optimization /// /// Given an ordering of hot functions (and hence, their assignment to the @@ -398,15 +372,17 @@ class HFSortPlus { DEBUG(dbgs() << "Completed hfsort+ with " << Clusters.size() << " clusters\n"); + // Sorting clusters by density in decreasing order + std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters); + // Return the set of clusters that are left, which are the ones that // didn't get merged (so their first func is its original func) std::vector Result; - for (auto Cluster : sortByDensity(Clusters)) { + Result.reserve(Clusters.size()); + for (auto Cluster : Clusters) { Result.emplace_back(std::move(*Cluster)); } - assert(std::is_sorted(Result.begin(), Result.end(), compareClustersDensity)); - return Result; } @@ -473,6 +449,7 @@ class HFSortPlus { Adjacent.merge(Into, From); Into->merge(*From); + From->clear(); // Update the clusters and addresses for functions merged from From. 
size_t CurAddr = 0; diff --git a/bolt/src/Passes/ReorderAlgorithm.h b/bolt/src/Passes/ReorderAlgorithm.h index 5be8a93f6f1f..29c300c02ff9 100644 --- a/bolt/src/Passes/ReorderAlgorithm.h +++ b/bolt/src/Passes/ReorderAlgorithm.h @@ -246,10 +246,6 @@ class OptimizeCacheReorderAlgorithm : public ReorderAlgorithm { /// A new reordering algorithm for basic blocks, cache+ class CachePlusReorderAlgorithm : public ReorderAlgorithm { public: - explicit CachePlusReorderAlgorithm( - std::unique_ptr CAlgo) : - ReorderAlgorithm(std::move(CAlgo)) { } - void reorderBasicBlocks( const BinaryFunction &BF, BasicBlockOrder &Order) const override; }; From de89cfd08db2d3e19469902ea2913077cc5c810e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 Jun 2018 03:17:32 -0700 Subject: [PATCH 428/904] [BOLT] Hash anonymous symbol names Summary: This diff replaces the addresses in all the {SYMBOLat,HOLEat,DATAat} symbols with hash values based on the data contained in the symbol. It should make the profiling data for anonymous symbols robust to address changes. The only small problem with this approach is that the hashed name for padding symbols of the same size collide frequently. This shouldn't be a big deal since it would be weird if those symbols were hot. On a test run with hhvm there were 26 collisions (out of ~338k symbols). Most of the collisions were from small (2,4,8 byte) objects. 
(cherry picked from commit 9293ff833f0403dacca36d5f1321ed5a8edaf52d) --- bolt/src/BinaryContext.cpp | 91 ++++++++++++++++++++++++---- bolt/src/BinaryContext.h | 3 + bolt/src/BinaryData.cpp | 4 ++ bolt/src/BinaryData.h | 6 +- bolt/src/BinarySection.cpp | 38 ++++++++++++ bolt/src/BinarySection.h | 9 +++ bolt/src/Relocation.h | 3 + bolt/src/RewriteInstance.cpp | 4 +- bolt/src/merge-fdata/merge-fdata.cpp | 3 +- 9 files changed, 143 insertions(+), 18 deletions(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index 69e665cecfd0..5e4db93f4fa6 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -123,14 +123,14 @@ void BinaryContext::updateObjectNesting(BinaryDataMapType::iterator GAI) { auto fixParents = [&](BinaryDataMapType::iterator Itr, BinaryData *NewParent) { - auto *OldParent = Itr->second->Parent; + auto *OldParent = Itr->second->Parent; + Itr->second->Parent = NewParent; + ++Itr; + while (Itr != BinaryDataMap.end() && OldParent && + Itr->second->Parent == OldParent) { Itr->second->Parent = NewParent; ++Itr; - while (Itr != BinaryDataMap.end() && OldParent && - Itr->second->Parent == OldParent) { - Itr->second->Parent = NewParent; - ++Itr; - } + } }; // Check if the previous symbol contains the newly added symbol. 
@@ -225,11 +225,13 @@ MCSymbol *BinaryContext::registerNameAtAddress(StringRef Name, GlobalSymbols[Name] = BD; } updateObjectNesting(GAI); + BD = nullptr; } else if (!GAI->second->hasName(Name)) { GAI->second->Names.push_back(Name); GlobalSymbols[Name] = GAI->second; + } else { + BD = nullptr; } - BD = nullptr; } else { GAI = BinaryDataMap.emplace(Address, BD).first; GlobalSymbols[Name] = BD; @@ -240,7 +242,8 @@ MCSymbol *BinaryContext::registerNameAtAddress(StringRef Name, auto *Symbol = Ctx->getOrCreateSymbol(Name); if (BD) { BD->Symbols.push_back(Symbol); - assert(BD->Symbols.size() == BD->Names.size()); + assert(BD->Symbols.size() == BD->Names.size() && + "there should be a 1:1 mapping between names and symbols"); } return Symbol; } @@ -298,6 +301,69 @@ bool BinaryContext::setBinaryDataSize(uint64_t Address, uint64_t Size) { return false; } +void BinaryContext::generateSymbolHashes() { + auto isNonAnonymousName = [](StringRef Name) { + return !(Name.startswith("SYMBOLat") || + Name.startswith("DATAat") || + Name.startswith("HOLEat")); + }; + + auto isPadding = [](const BinaryData &BD) { + auto Contents = BD.getSection().getContents(); + auto SymData = Contents.substr(BD.getOffset(), BD.getSize()); + return (BD.getName().startswith("HOLEat") || + SymData.find_first_not_of(0) == StringRef::npos); + }; + + for (auto &Entry : BinaryDataMap) { + auto &BD = *Entry.second; + auto Name = BD.getName(); + + if (isNonAnonymousName(Name)) + continue; + + // First check if a non-anonymous alias exists and move it to the front. 
+ if (BD.getNames().size() > 1) { + auto Itr = std::find_if(BD.Names.begin(), + BD.Names.end(), + isNonAnonymousName); + if (Itr != BD.Names.end()) { + assert(BD.Names.size() == BD.Symbols.size() && + "there should be a 1:1 mapping between names and symbols"); + auto Idx = std::distance(BD.Names.begin(), Itr); + std::swap(BD.Names[0], *Itr); + std::swap(BD.Symbols[0], BD.Symbols[Idx]); + continue; + } + } + + // We have to skip 0 size symbols since they will all collide. + if (BD.getSize() == 0) { + continue; + } + + const auto Hash = BD.getSection().hash(BD); + const auto Idx = Name.find("0x"); + std::string NewName = (Twine(Name.substr(0, Idx)) + + "_" + Twine::utohexstr(Hash)).str(); + if (getBinaryDataByName(NewName)) { + // Ignore collisions for symbols that appear to be padding + // (i.e. all zeros or a "hole") + if (!isPadding(BD)) { + outs() << "BOLT-WARNING: collision detected when hashing " << BD + << " with new name (" << NewName << "), skipping.\n"; + } + continue; + } + BD.Names.insert(BD.Names.begin(), NewName); + BD.Symbols.insert(BD.Symbols.begin(), + Ctx->getOrCreateSymbol(NewName)); + assert(BD.Names.size() == BD.Symbols.size() && + "there should be a 1:1 mapping between names and symbols"); + GlobalSymbols[NewName] = &BD; + } +} + void BinaryContext::postProcessSymbolTable() { fixBinaryDataHoles(); bool Valid = true; @@ -315,6 +381,7 @@ void BinaryContext::postProcessSymbolTable() { } assert(Valid); assignMemData(); + generateSymbolHashes(); } void BinaryContext::foldFunction(BinaryFunction &ChildBF, @@ -379,7 +446,7 @@ void BinaryContext::fixBinaryDataHoles() { while (Itr != End) { if (Itr->second->getAddress() > EndAddress) { - auto Gap = Itr->second->getAddress() - EndAddress; + auto Gap = Itr->second->getAddress() - EndAddress; Holes.push_back(std::make_pair(EndAddress, Gap)); } EndAddress = Itr->second->getEndAddress(); @@ -723,11 +790,11 @@ void BinaryContext::printInstruction(raw_ostream &OS, OS << " # TAILCALL "; if 
(MIB->isInvoke(Instruction)) { if (const auto EHInfo = MIB->getEHInfo(Instruction)) { - OS << " # handler: "; + OS << " # handler: "; if (EHInfo->first) OS << *EHInfo->first; - else - OS << '0'; + else + OS << '0'; OS << "; action: " << EHInfo->second; } auto GnuArgsSize = MIB->getGnuArgsSize(Instruction); diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index 34dfea6d22bd..7d3a5db4ce6c 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -192,6 +192,9 @@ class BinaryContext { /// symbols are padded with the space before the next BinaryData object. void fixBinaryDataHoles(); + /// Generate names based on data hashes for unknown symbols. + void generateSymbolHashes(); + /// Populate \p GlobalMemData. This should be done after all symbol discovery /// is complete, e.g. after building CFGs for all functions. void assignMemData(); diff --git a/bolt/src/BinaryData.cpp b/bolt/src/BinaryData.cpp index 8bd72d792b5f..e8d806ec9b0f 100644 --- a/bolt/src/BinaryData.cpp +++ b/bolt/src/BinaryData.cpp @@ -32,6 +32,10 @@ PrintSymbolAliases("print-aliases", cl::cat(BoltCategory)); } +bool BinaryData::isAbsolute() const { + return Flags & SymbolRef::SF_Absolute; +} + bool BinaryData::isMoveable() const { return (!isAbsolute() && (IsMoveable && diff --git a/bolt/src/BinaryData.h b/bolt/src/BinaryData.h index 6b1f4eafe798..d709ef073cfe 100644 --- a/bolt/src/BinaryData.h +++ b/bolt/src/BinaryData.h @@ -23,7 +23,7 @@ namespace llvm { namespace bolt { -struct BinarySection; +class BinarySection; /// \p BinaryData represents an indivisible part of a data section section. /// BinaryData's may contain sub-components, e.g. 
jump tables but they are @@ -106,7 +106,7 @@ class BinaryData { bool isAtomic() const { return isTopLevelJumpTable() || !Parent; } - + iterator_range::const_iterator> names() const { return make_range(Names.begin(), Names.end()); } @@ -140,7 +140,7 @@ class BinaryData { return std::find(Symbols.begin(), Symbols.end(), Symbol) != Symbols.end(); } - bool isAbsolute() const { return getSymbol()->isAbsolute(); } + bool isAbsolute() const; bool isMoveable() const; uint64_t getAddress() const { return Address; } diff --git a/bolt/src/BinarySection.cpp b/bolt/src/BinarySection.cpp index 52d27835f412..daea0c4a06fb 100644 --- a/bolt/src/BinarySection.cpp +++ b/bolt/src/BinarySection.cpp @@ -23,6 +23,44 @@ namespace opts { extern cl::opt PrintRelocations; } +uint64_t +BinarySection::hash(const BinaryData &BD, + std::map &Cache) const { + auto Itr = Cache.find(&BD); + if (Itr != Cache.end()) + return Itr->second; + + Cache[&BD] = 0; + + auto Offset = BD.getAddress() - getAddress(); + const auto EndOffset = BD.getEndAddress() - getAddress(); + auto Begin = Relocations.lower_bound(Relocation{Offset, 0, 0, 0, 0}); + auto End = Relocations.upper_bound(Relocation{EndOffset, 0, 0, 0, 0}); + const auto Contents = getContents(); + + hash_code Hash = hash_combine(hash_value(BD.getSize()), + hash_value(BD.getSectionName())); + + while (Begin != End) { + const auto &Rel = *Begin++; + Hash = hash_combine( + Hash, + hash_value(Contents.substr(Offset, Begin->Offset - Offset))); + if (auto *RelBD = BC.getBinaryDataByName(Rel.Symbol->getName())) { + Hash = hash_combine(Hash, hash(*RelBD, Cache)); + } + Offset = Rel.Offset + Rel.getSize(); + } + + Hash = hash_combine( + Hash, + hash_value(Contents.substr(Offset, EndOffset - Offset))); + + Cache[&BD] = Hash; + + return Hash; +} + BinarySection::~BinarySection() { if (isReordered()) { delete[] getData(); diff --git a/bolt/src/BinarySection.h b/bolt/src/BinarySection.h index 3ae1e5ecd886..1aafbd838a7b 100644 --- a/bolt/src/BinarySection.h +++ 
b/bolt/src/BinarySection.h @@ -22,6 +22,7 @@ #include "llvm/Support/ErrorOr.h" #include "llvm/Support/raw_ostream.h" #include +#include namespace llvm { @@ -69,6 +70,9 @@ class BinarySection { // Set by ExecutableFileMemoryManager. mutable bool IsReordered{false}; // Have the contents been reordered? + uint64_t hash(const BinaryData &BD, + std::map &Cache) const; + // non-copyable BinarySection(const BinarySection &) = delete; BinarySection(BinarySection &&) = delete; @@ -340,6 +344,11 @@ class BinarySection { return Itr != Relocations.end() ? &*Itr : nullptr; } + uint64_t hash(const BinaryData &BD) const { + std::map Cache; + return hash(BD, Cache); + } + /// /// Property accessors related to output data. /// diff --git a/bolt/src/Relocation.h b/bolt/src/Relocation.h index 5b6beec86ec8..f6cd6791c565 100644 --- a/bolt/src/Relocation.h +++ b/bolt/src/Relocation.h @@ -44,6 +44,9 @@ struct Relocation { /// Return size of the given relocation \p Type. static size_t getSizeForType(uint64_t Type); + /// Return size of this relocation. + size_t getSize() const { return getSizeForType(Type); } + /// Extract current relocated value from binary contents. This is used for /// RISC architectures where values are encoded in specific bits depending /// on the relocation value. 
diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 8eddbc2dff31..6288da355536 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -2573,6 +2573,8 @@ void RewriteInstance::disassembleFunctions() { Function.print(outs(), "while building cfg", true); } // Iterate over all functions + + BC->postProcessSymbolTable(); } void RewriteInstance::postProcessFunctions() { @@ -2601,8 +2603,6 @@ void RewriteInstance::postProcessFunctions() { BC->SumExecutionCount += Function.getKnownExecutionCount(); } - BC->postProcessSymbolTable(); - if (opts::PrintGlobals) { outs() << "BOLT-INFO: Global symbols:\n"; BC->printGlobalSymbols(outs()); diff --git a/bolt/src/merge-fdata/merge-fdata.cpp b/bolt/src/merge-fdata/merge-fdata.cpp index a9b3c0fbe401..61d9cb956cc8 100644 --- a/bolt/src/merge-fdata/merge-fdata.cpp +++ b/bolt/src/merge-fdata/merge-fdata.cpp @@ -306,7 +306,8 @@ int main(int argc, char **argv) { // For consistency, sort functions by their IDs. std::sort(MergedProfile.Functions.begin(), MergedProfile.Functions.end(), - [] (BinaryFunctionProfile &A, BinaryFunctionProfile &B) { + [] (const BinaryFunctionProfile &A, + const BinaryFunctionProfile &B) { return A.Id < B.Id; }); From e19a0e4747817266cbb880477007dca1bf7655bc Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 11 Jun 2018 17:17:25 -0700 Subject: [PATCH 429/904] [Bolt] Reduce verbosity while reporting hash collisions Summary: Don't report all data objects with hash collisions by default. Only report the summary, and use -v=1 for providing the full list. 
(cherry picked from commit cfd7beb3904913d9e86dc8fedeb53c30c90922fb) --- bolt/src/BinaryContext.cpp | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index 5e4db93f4fa6..52b521c54758 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -32,6 +32,8 @@ namespace opts { extern cl::OptionCategory BoltCategory; +extern cl::opt Verbosity; + static cl::opt PrintDebugInfo("print-debug-info", cl::desc("print debug info when printing functions"), @@ -315,6 +317,7 @@ void BinaryContext::generateSymbolHashes() { SymData.find_first_not_of(0) == StringRef::npos); }; + uint64_t NumCollisions = 0; for (auto &Entry : BinaryDataMap) { auto &BD = *Entry.second; auto Name = BD.getName(); @@ -350,8 +353,11 @@ void BinaryContext::generateSymbolHashes() { // Ignore collisions for symbols that appear to be padding // (i.e. all zeros or a "hole") if (!isPadding(BD)) { - outs() << "BOLT-WARNING: collision detected when hashing " << BD - << " with new name (" << NewName << "), skipping.\n"; + if (opts::Verbosity) { + errs() << "BOLT-WARNING: collision detected when hashing " << BD + << " with new name (" << NewName << "), skipping.\n"; + } + ++NumCollisions; } continue; } @@ -362,6 +368,13 @@ void BinaryContext::generateSymbolHashes() { "there should be a 1:1 mapping between names and symbols"); GlobalSymbols[NewName] = &BD; } + if (NumCollisions) { + errs() << "BOLT-WARNING: " << NumCollisions + << " collisions detected while hashing binary objects"; + if (!opts::Verbosity) + errs() << ". 
Use -v=1 to see the list."; + errs() << '\n'; + } } void BinaryContext::postProcessSymbolTable() { @@ -375,7 +388,7 @@ void BinaryContext::postProcessSymbolTable() { !BD->getSize() && !BD->isAbsolute() && BD->getSection()) { - outs() << "BOLT-WARNING: zero sized top level symbol: " << *BD << "\n"; + errs() << "BOLT-WARNING: zero sized top level symbol: " << *BD << "\n"; Valid = false; } } From 95a30ea78d1798f085454e9059a79ab41cf501df Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 11 Jun 2018 19:46:40 -0700 Subject: [PATCH 430/904] [Bolt][NFC] Change capitalization s/BOLT/Bolt/g (cherry picked from commit 72a66b25baa708998f2765d4dd7c38cc9d8a05dc) --- bolt/README.txt | 12 ++++---- bolt/src/BinaryContext.h | 2 +- bolt/src/BinaryFunction.cpp | 2 +- bolt/src/DataAggregator.cpp | 2 +- bolt/src/DataAggregator.h | 2 +- bolt/src/DataReader.cpp | 2 +- bolt/src/Passes/Inliner.cpp | 2 +- bolt/src/Passes/Inliner.h | 2 +- bolt/src/Passes/JTFootprintReduction.h | 2 +- bolt/src/Passes/LongJmp.h | 2 +- bolt/src/Passes/RegReAssign.cpp | 2 +- bolt/src/ProfileReader.cpp | 2 +- bolt/src/ProfileReader.h | 2 +- bolt/src/ProfileYAMLMapping.h | 2 +- bolt/src/RewriteInstance.cpp | 29 ++++++++++--------- bolt/src/RewriteInstance.h | 4 +-- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 2 +- bolt/src/llvm-bolt.cpp | 16 +++++----- 18 files changed, 45 insertions(+), 44 deletions(-) diff --git a/bolt/README.txt b/bolt/README.txt index aaf4e50f4913..a5d5259b3190 100644 --- a/bolt/README.txt +++ b/bolt/README.txt @@ -1,21 +1,21 @@ -BOLT +Bolt ==== - BOLT is a post-link optimizer developed to speed up large applications. + Bolt is a post-link optimizer developed to speed up large applications. It achieves speed-ups by optimizing application's code layout based on an execution profile gathered by sampling profilers such as Linux `perf` tool. 
- BOLT could operate on any binary with symbol table, but for maximum gains + Bolt could operate on any binary with symbol table, but for maximum gains it utilizes relocations saved by a linker (--emit-relocs). - NOTE: Currently BOLT support is limited to non-PIC/PIE binaries. + NOTE: Currently Bolt support is limited to non-PIC/non-PIE binaries. INSTALLATION ============ - BOLT heavily uses LLVM libraries and by design it is built as one of LLVM + Bolt heavily uses LLVM libraries and by design it is built as one of LLVM tools. The build process in not much different from regular LLVM. - Start with cloning LLVM and BOLT repos: + Start with cloning LLVM and Bolt repos: > git clone https://github.com/llvm-mirror/llvm llvm > cd llvm/tools diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index 7d3a5db4ce6c..719c1765d821 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -258,7 +258,7 @@ class BinaryContext { uint64_t MissedMacroFusionExecCount{0}; /// Track next available address for new allocatable sections. RewriteInstance - /// sets this prior to running BOLT passes, so layout passes are aware of the + /// sets this prior to running Bolt passes, so layout passes are aware of the /// final addresses functions will have. uint64_t LayoutStartAddress{0}; diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 32ed7ba32e01..cc47cb56fe95 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -300,7 +300,7 @@ BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) { return nullptr; /* - * This is commented out because it makes BOLT too slow. + * This is commented out because it makes Bolt too slow. 
* assert(std::is_sorted(BasicBlockOffsets.begin(), * BasicBlockOffsets.end(), * CompareBasicBlockOffsets()))); diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index 655cf8050bbe..61fa1ac2af96 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -50,7 +50,7 @@ IgnoreBuildID("ignore-build-id", static cl::opt TimeAggregator("time-aggr", - cl::desc("time BOLT aggregator"), + cl::desc("time Bolt aggregator"), cl::init(false), cl::ZeroOrMore, cl::cat(AggregatorCategory)); diff --git a/bolt/src/DataAggregator.h b/bolt/src/DataAggregator.h index becce32b91e1..2820f5836b59 100644 --- a/bolt/src/DataAggregator.h +++ b/bolt/src/DataAggregator.h @@ -89,7 +89,7 @@ class DataAggregator : public DataReader { DenseSet PIDs; - /// References to core BOLT data structures + /// References to core Bolt data structures BinaryContext *BC{nullptr}; std::map *BFs{nullptr}; diff --git a/bolt/src/DataReader.cpp b/bolt/src/DataReader.cpp index 348b83d4e334..c7945e5bcf5c 100644 --- a/bolt/src/DataReader.cpp +++ b/bolt/src/DataReader.cpp @@ -33,7 +33,7 @@ Optional getLTOCommonName(const StringRef Name) { namespace { -/// Return standard name of the function possibly renamed by BOLT. +/// Return standard name of the function possibly renamed by Bolt. StringRef normalizeName(StringRef Name) { // Strip "PG." prefix used for globalized locals. return Name.startswith("PG.") ? 
Name.substr(2) : Name; diff --git a/bolt/src/Passes/Inliner.cpp b/bolt/src/Passes/Inliner.cpp index abde11258782..21b82053c5ac 100644 --- a/bolt/src/Passes/Inliner.cpp +++ b/bolt/src/Passes/Inliner.cpp @@ -1,4 +1,4 @@ -//===--- Passes/Inliner.cpp - Inlining infra for BOLT ---------------------===// +//===--- Passes/Inliner.cpp - Inlining infra for Bolt ---------------------===// // // The LLVM Compiler Infrastructure // diff --git a/bolt/src/Passes/Inliner.h b/bolt/src/Passes/Inliner.h index 4a548a5dba84..4c36634d4cfa 100644 --- a/bolt/src/Passes/Inliner.h +++ b/bolt/src/Passes/Inliner.h @@ -1,4 +1,4 @@ -//===--- Passes/Inliner.h - Inlining infra for BOLT -----------------------===// +//===--- Passes/Inliner.h - Inlining infra for Bolt -----------------------===// // // The LLVM Compiler Infrastructure // diff --git a/bolt/src/Passes/JTFootprintReduction.h b/bolt/src/Passes/JTFootprintReduction.h index 81be253a6e3e..a1e9a4723e35 100644 --- a/bolt/src/Passes/JTFootprintReduction.h +++ b/bolt/src/Passes/JTFootprintReduction.h @@ -23,7 +23,7 @@ namespace bolt { /// This pass identify indirect jumps to jump tables and reduce their entries /// size from 8 to 4 bytes. For PIC jump tables, it will remove the PIC code -/// (since BOLT only process static code and it makes no sense to use expensive +/// (since Bolt only process static code and it makes no sense to use expensive /// PIC-style jumps in static code). class JTFootprintReduction : public BinaryFunctionPass { uint64_t TotalJTScore{0}; diff --git a/bolt/src/Passes/LongJmp.h b/bolt/src/Passes/LongJmp.h index e771b6767076..cf5c947748fe 100644 --- a/bolt/src/Passes/LongJmp.h +++ b/bolt/src/Passes/LongJmp.h @@ -18,7 +18,7 @@ namespace bolt { /// LongJmp is veneer-insertion pass originally written for AArch64 that /// compensates for its short-range branches, typically done during linking. 
We -/// pull this pass inside BOLT because here we can do a better job at stub +/// pull this pass inside Bolt because here we can do a better job at stub /// inserting by manipulating the CFG, something linkers can't do. /// /// LongJmp is a two-step process. In the first step, when function sizes are diff --git a/bolt/src/Passes/RegReAssign.cpp b/bolt/src/Passes/RegReAssign.cpp index 1c0f9e5ac636..b6ae08650af2 100644 --- a/bolt/src/Passes/RegReAssign.cpp +++ b/bolt/src/Passes/RegReAssign.cpp @@ -419,7 +419,7 @@ void RegReAssign::runOnFunctions(BinaryContext &BC, outs() << "BOLT-WARNING: You used -reg-reassign and -update-debug-sections." << " Some registers were changed but associated AT_LOCATION for " << "impacted variables were NOT updated! This operation is " - << "currently unsupported by BOLT.\n"; + << "currently unsupported by Bolt.\n"; } outs() << "BOLT-INFO: Reg Reassignment Pass Stats:\n"; outs() << "\t " << FuncsChanged.size() << " functions affected.\n"; diff --git a/bolt/src/ProfileReader.cpp b/bolt/src/ProfileReader.cpp index 9851814fcbfd..38fb80f6f703 100644 --- a/bolt/src/ProfileReader.cpp +++ b/bolt/src/ProfileReader.cpp @@ -1,4 +1,4 @@ -//===-- ProfileReader.cpp - BOLT profile de-serializer ----------*- C++ -*-===// +//===-- ProfileReader.cpp - Bolt profile de-serializer ----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // diff --git a/bolt/src/ProfileReader.h b/bolt/src/ProfileReader.h index 7bd2ff85dbe2..b8619d6e3172 100644 --- a/bolt/src/ProfileReader.h +++ b/bolt/src/ProfileReader.h @@ -1,4 +1,4 @@ -//===-- ProfileReader.h - BOLT profile deserializer -------------*- C++ -*-===// +//===-- ProfileReader.h - Bolt profile deserializer -------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // diff --git a/bolt/src/ProfileYAMLMapping.h b/bolt/src/ProfileYAMLMapping.h index 46503b73180f..289f86612bc8 100644 --- a/bolt/src/ProfileYAMLMapping.h +++ b/bolt/src/ProfileYAMLMapping.h @@ -1,4 +1,4 @@ -//===-- 
ProfileYAMLMapping.h - mappings for BOLT profile --------*- C++ -*-===// +//===-- ProfileYAMLMapping.h - mappings for Bolt profile --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 6288da355536..c576418e5c01 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -353,7 +353,7 @@ Verbosity("v", static cl::opt AddBoltInfo("add-bolt-info", - cl::desc("add BOLT version and command line argument information to " + cl::desc("add Bolt version and command line argument information to " "processed binaries"), cl::init(true), cl::cat(BoltCategory)); @@ -482,7 +482,7 @@ constexpr const char *RewriteInstance::SectionsToOverwrite[]; const std::string RewriteInstance::OrgSecPrefix = ".bolt.org"; -const std::string RewriteInstance::BOLTSecPrefix = ".bolt"; +const std::string RewriteInstance::BoltSecPrefix = ".bolt"; const char RewriteInstance::TimerGroupName[] = "rewrite"; const char RewriteInstance::TimerGroupDesc[] = "Rewrite passes"; @@ -835,7 +835,7 @@ void RewriteInstance::discoverStorage() { } if (SectionName.startswith(OrgSecPrefix) || - SectionName.startswith(BOLTSecPrefix)) { + SectionName.startswith(BoltSecPrefix)) { errs() << "BOLT-ERROR: input file was processed by BOLT. 
" "Cannot re-optimize.\n"; exit(1); @@ -1001,7 +1001,8 @@ void RewriteInstance::run() { checkLargeFunctions()) { ++PassNumber; // Emit again because now some functions have been split - outs() << "BOLT: split-functions: starting pass " << PassNumber << "...\n"; + outs() << "BOLT-INFO: split-functions: starting pass " << PassNumber + << "...\n"; reset(); executeRewritePass({}); } @@ -1012,7 +1013,7 @@ void RewriteInstance::run() { if (opts::UpdateDebugSections && opts::FixDebugInfoLargeFunctions && checkLargeFunctions()) { ++PassNumber; - outs() << "BOLT: starting pass (ignoring large functions) " + outs() << "BOLT-INFO: starting pass (ignoring large functions) " << PassNumber << "...\n"; reset(); executeRewritePass(LargeFunctions); @@ -1182,7 +1183,7 @@ void RewriteInstance::discoverFileObjects() { std::string AlternativeName; if (Name.empty()) { if (PLTSection && PLTSection->getAddress() == Address) { - // Don't register BOLT_PLT_PSEUDO twice. + // Don't register __BOLT_PLT_PSEUDO twice. continue; } UniqueName = "ANONYMOUS." 
+ std::to_string(AnonymousId++); @@ -2269,7 +2270,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { (BD->nameStartsWith("ANONYMOUS") && (BD->getSectionName().startswith(".plt") || BD->getSectionName().endswith(".plt")))) && - "BOLT symbol names of all non-section relocations must match " + "Bolt symbol names of all non-section relocations must match " "up with symbol names referenced in the relocation"); if (!opts::AllowSectionRelocations && IsSectionRelocation) { @@ -3072,7 +3073,7 @@ void RewriteInstance::mapTextSections(orc::VModuleKey Key) { const auto Flags = BinarySection::getFlags(/*IsReadOnly=*/true, /*IsText=*/true, /*IsAllocatable=*/true); - auto &Section = BC->registerOrUpdateSection(BOLTSecPrefix + ".text", + auto &Section = BC->registerOrUpdateSection(BoltSecPrefix + ".text", ELF::SHT_PROGBITS, Flags, nullptr, @@ -3577,7 +3578,7 @@ void RewriteInstance::addBoltInfoSection() { std::string DescStr; raw_string_ostream DescOS(DescStr); - DescOS << "BOLT revision: " << BoltRevision << ", " << "command line:"; + DescOS << "Bolt revision: " << BoltRevision << ", " << "command line:"; for (auto I = 0; I < Argc; ++I) { DescOS << " " << Argv[I]; } @@ -4377,7 +4378,7 @@ void RewriteInstance::rewriteFile() { OverwrittenScore += Function.getFunctionScore(); // Overwrite function in the output file. 
if (opts::Verbosity >= 2) { - outs() << "BOLT: rewriting function \"" << Function << "\"\n"; + outs() << "BOLT-INFO: rewriting function \"" << Function << "\"\n"; } OS.pwrite(reinterpret_cast(Function.getImageAddress()), Function.getImageSize(), @@ -4407,7 +4408,7 @@ void RewriteInstance::rewriteFile() { ++CountOverwrittenFunctions; if (opts::MaxFunctions && CountOverwrittenFunctions == opts::MaxFunctions) { - outs() << "BOLT: maximum number of functions reached\n"; + outs() << "BOLT-INFO: maximum number of functions reached\n"; break; } continue; @@ -4415,7 +4416,7 @@ void RewriteInstance::rewriteFile() { // Write cold part if (opts::Verbosity >= 2) { - outs() << "BOLT: rewriting function \"" << Function + outs() << "BOLT-INFO: rewriting function \"" << Function << "\" (cold part)\n"; } OS.pwrite(reinterpret_cast(Function.cold().getImageAddress()), @@ -4427,7 +4428,7 @@ void RewriteInstance::rewriteFile() { ++CountOverwrittenFunctions; if (opts::MaxFunctions && CountOverwrittenFunctions == opts::MaxFunctions) { - outs() << "BOLT: maximum number of functions reached\n"; + outs() << "BOLT-INFO: maximum number of functions reached\n"; break; } } @@ -4464,7 +4465,7 @@ void RewriteInstance::rewriteFile() { if (!Section.isFinalized() || Section.isLocal()) continue; if (opts::Verbosity >= 1) { - outs() << "BOLT: writing new section " << Section.getName() + outs() << "BOLT-INFO: writing new section " << Section.getName() << "\n data at 0x" << Twine::utohexstr(Section.getAllocAddress()) << "\n of size " << Section.getOutputSize() << "\n at offset " << Section.getFileOffset() << '\n'; diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index 5b2c984d6740..d208ccbe65fc 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -323,7 +323,7 @@ class RewriteInstance { std::vector *OutputSections = nullptr, std::map *OutputSectionNameMap = nullptr); - /// Add a notes section containing the BOLT revision and command line options. 
+ /// Add a notes section containing the Bolt revision and command line options. void addBoltInfoSection(); /// Computes output .debug_line line table offsets for each compile unit, @@ -522,7 +522,7 @@ class RewriteInstance { static const std::string OrgSecPrefix; - static const std::string BOLTSecPrefix; + static const std::string BoltSecPrefix; /// Number of processed to data relocations. Used to implement the /// -max-relocations debugging option. diff --git a/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp index 6b5bc2b8f94a..394fb03d814f 100644 --- a/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -214,7 +214,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { bool hasPCRelOperand(const MCInst &Inst) const override { // ADRP is blacklisted and is an exception. Even though it has a // PC-relative operand, this operand is not a complete symbol reference - // and BOLT shouldn't try to process it in isolation. + // and Bolt shouldn't try to process it in isolation. 
if (isADRP(Inst)) return false; diff --git a/bolt/src/llvm-bolt.cpp b/bolt/src/llvm-bolt.cpp index a07e31328d39..a29697635ad4 100644 --- a/bolt/src/llvm-bolt.cpp +++ b/bolt/src/llvm-bolt.cpp @@ -33,10 +33,10 @@ using namespace bolt; namespace opts { -cl::OptionCategory BoltCategory("BOLT generic options"); -cl::OptionCategory BoltDiffCategory("BOLTDIFF generic options"); -cl::OptionCategory BoltOptCategory("BOLT optimization options"); -cl::OptionCategory BoltRelocCategory("BOLT options in relocation mode"); +cl::OptionCategory BoltCategory("Bolt generic options"); +cl::OptionCategory BoltDiffCategory("BoltDiff generic options"); +cl::OptionCategory BoltOptCategory("Bolt optimization options"); +cl::OptionCategory BoltRelocCategory("Bolt options in relocation mode"); cl::OptionCategory BoltOutputCategory("Output options"); cl::OptionCategory AggregatorCategory("Data aggregation options"); @@ -124,14 +124,14 @@ const char *BoltRevision = } static void printBoltRevision(llvm::raw_ostream &OS) { - OS << "BOLT revision " << BoltRevision << "\n"; + OS << "Bolt revision " << BoltRevision << "\n"; } void perf2boltMode(int argc, char **argv) { cl::HideUnrelatedOptions(makeArrayRef(opts::Perf2BoltCategories)); cl::ParseCommandLineOptions( argc, argv, - "perf2bolt - BOLT data aggregator\n" + "perf2bolt - Bolt data aggregator\n" "\nEXAMPLE: perf2bolt -p=perf.data executable -o data.fdata\n"); if (opts::PerfData.empty()) { errs() << ToolName << ": expected -perfdata= option.\n"; @@ -159,7 +159,7 @@ void boltDiffMode(int argc, char **argv) { cl::HideUnrelatedOptions(makeArrayRef(opts::BoltDiffCategories)); cl::ParseCommandLineOptions( argc, argv, - "llvm-boltdiff - BOLT binary diff tool\n" + "llvm-boltdiff - Bolt binary diff tool\n" "\nEXAMPLE: llvm-boltdiff -data=a.fdata -data2=b.fdata exec1 exec2\n"); if (opts::InputDataFilename2.empty()) { errs() << ToolName << ": expected -data2= option.\n"; @@ -187,7 +187,7 @@ void boltMode(int argc, char **argv) { 
cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion); cl::ParseCommandLineOptions(argc, argv, - "BOLT - Binary Optimization and Layout Tool\n"); + "Bolt - Binary Optimization and Layout Tool\n"); if (opts::OutputFilename.empty()) { errs() << ToolName << ": expected -o= option.\n"; From 24b7d4339e95b7c6df55af4aec2feaabd1a8278b Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 14 Jun 2018 14:27:20 -0700 Subject: [PATCH 431/904] Revert "[Bolt][NFC] Change capitalization s/BOLT/Bolt/g" Summary: (cherry picked from commit 82d76b1a952964d0511d0e542f004c0d7fa4c0e3) --- bolt/README.txt | 12 ++++---- bolt/src/BinaryContext.h | 2 +- bolt/src/BinaryFunction.cpp | 2 +- bolt/src/DataAggregator.cpp | 2 +- bolt/src/DataAggregator.h | 2 +- bolt/src/DataReader.cpp | 2 +- bolt/src/Passes/Inliner.cpp | 2 +- bolt/src/Passes/Inliner.h | 2 +- bolt/src/Passes/JTFootprintReduction.h | 2 +- bolt/src/Passes/LongJmp.h | 2 +- bolt/src/Passes/RegReAssign.cpp | 2 +- bolt/src/ProfileReader.cpp | 2 +- bolt/src/ProfileReader.h | 2 +- bolt/src/ProfileYAMLMapping.h | 2 +- bolt/src/RewriteInstance.cpp | 29 +++++++++---------- bolt/src/RewriteInstance.h | 4 +-- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 2 +- bolt/src/llvm-bolt.cpp | 16 +++++----- 18 files changed, 44 insertions(+), 45 deletions(-) diff --git a/bolt/README.txt b/bolt/README.txt index a5d5259b3190..aaf4e50f4913 100644 --- a/bolt/README.txt +++ b/bolt/README.txt @@ -1,21 +1,21 @@ -Bolt +BOLT ==== - Bolt is a post-link optimizer developed to speed up large applications. + BOLT is a post-link optimizer developed to speed up large applications. It achieves speed-ups by optimizing application's code layout based on an execution profile gathered by sampling profilers such as Linux `perf` tool. - Bolt could operate on any binary with symbol table, but for maximum gains + BOLT could operate on any binary with symbol table, but for maximum gains it utilizes relocations saved by a linker (--emit-relocs). 
- NOTE: Currently Bolt support is limited to non-PIC/non-PIE binaries. + NOTE: Currently BOLT support is limited to non-PIC/PIE binaries. INSTALLATION ============ - Bolt heavily uses LLVM libraries and by design it is built as one of LLVM + BOLT heavily uses LLVM libraries and by design it is built as one of LLVM tools. The build process in not much different from regular LLVM. - Start with cloning LLVM and Bolt repos: + Start with cloning LLVM and BOLT repos: > git clone https://github.com/llvm-mirror/llvm llvm > cd llvm/tools diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index 719c1765d821..7d3a5db4ce6c 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -258,7 +258,7 @@ class BinaryContext { uint64_t MissedMacroFusionExecCount{0}; /// Track next available address for new allocatable sections. RewriteInstance - /// sets this prior to running Bolt passes, so layout passes are aware of the + /// sets this prior to running BOLT passes, so layout passes are aware of the /// final addresses functions will have. uint64_t LayoutStartAddress{0}; diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index cc47cb56fe95..32ed7ba32e01 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -300,7 +300,7 @@ BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) { return nullptr; /* - * This is commented out because it makes Bolt too slow. + * This is commented out because it makes BOLT too slow. 
* assert(std::is_sorted(BasicBlockOffsets.begin(), * BasicBlockOffsets.end(), * CompareBasicBlockOffsets()))); diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index 61fa1ac2af96..655cf8050bbe 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -50,7 +50,7 @@ IgnoreBuildID("ignore-build-id", static cl::opt TimeAggregator("time-aggr", - cl::desc("time Bolt aggregator"), + cl::desc("time BOLT aggregator"), cl::init(false), cl::ZeroOrMore, cl::cat(AggregatorCategory)); diff --git a/bolt/src/DataAggregator.h b/bolt/src/DataAggregator.h index 2820f5836b59..becce32b91e1 100644 --- a/bolt/src/DataAggregator.h +++ b/bolt/src/DataAggregator.h @@ -89,7 +89,7 @@ class DataAggregator : public DataReader { DenseSet PIDs; - /// References to core Bolt data structures + /// References to core BOLT data structures BinaryContext *BC{nullptr}; std::map *BFs{nullptr}; diff --git a/bolt/src/DataReader.cpp b/bolt/src/DataReader.cpp index c7945e5bcf5c..348b83d4e334 100644 --- a/bolt/src/DataReader.cpp +++ b/bolt/src/DataReader.cpp @@ -33,7 +33,7 @@ Optional getLTOCommonName(const StringRef Name) { namespace { -/// Return standard name of the function possibly renamed by Bolt. +/// Return standard name of the function possibly renamed by BOLT. StringRef normalizeName(StringRef Name) { // Strip "PG." prefix used for globalized locals. return Name.startswith("PG.") ? 
Name.substr(2) : Name; diff --git a/bolt/src/Passes/Inliner.cpp b/bolt/src/Passes/Inliner.cpp index 21b82053c5ac..abde11258782 100644 --- a/bolt/src/Passes/Inliner.cpp +++ b/bolt/src/Passes/Inliner.cpp @@ -1,4 +1,4 @@ -//===--- Passes/Inliner.cpp - Inlining infra for Bolt ---------------------===// +//===--- Passes/Inliner.cpp - Inlining infra for BOLT ---------------------===// // // The LLVM Compiler Infrastructure // diff --git a/bolt/src/Passes/Inliner.h b/bolt/src/Passes/Inliner.h index 4c36634d4cfa..4a548a5dba84 100644 --- a/bolt/src/Passes/Inliner.h +++ b/bolt/src/Passes/Inliner.h @@ -1,4 +1,4 @@ -//===--- Passes/Inliner.h - Inlining infra for Bolt -----------------------===// +//===--- Passes/Inliner.h - Inlining infra for BOLT -----------------------===// // // The LLVM Compiler Infrastructure // diff --git a/bolt/src/Passes/JTFootprintReduction.h b/bolt/src/Passes/JTFootprintReduction.h index a1e9a4723e35..81be253a6e3e 100644 --- a/bolt/src/Passes/JTFootprintReduction.h +++ b/bolt/src/Passes/JTFootprintReduction.h @@ -23,7 +23,7 @@ namespace bolt { /// This pass identify indirect jumps to jump tables and reduce their entries /// size from 8 to 4 bytes. For PIC jump tables, it will remove the PIC code -/// (since Bolt only process static code and it makes no sense to use expensive +/// (since BOLT only process static code and it makes no sense to use expensive /// PIC-style jumps in static code). class JTFootprintReduction : public BinaryFunctionPass { uint64_t TotalJTScore{0}; diff --git a/bolt/src/Passes/LongJmp.h b/bolt/src/Passes/LongJmp.h index cf5c947748fe..e771b6767076 100644 --- a/bolt/src/Passes/LongJmp.h +++ b/bolt/src/Passes/LongJmp.h @@ -18,7 +18,7 @@ namespace bolt { /// LongJmp is veneer-insertion pass originally written for AArch64 that /// compensates for its short-range branches, typically done during linking. 
We -/// pull this pass inside Bolt because here we can do a better job at stub +/// pull this pass inside BOLT because here we can do a better job at stub /// inserting by manipulating the CFG, something linkers can't do. /// /// LongJmp is a two-step process. In the first step, when function sizes are diff --git a/bolt/src/Passes/RegReAssign.cpp b/bolt/src/Passes/RegReAssign.cpp index b6ae08650af2..1c0f9e5ac636 100644 --- a/bolt/src/Passes/RegReAssign.cpp +++ b/bolt/src/Passes/RegReAssign.cpp @@ -419,7 +419,7 @@ void RegReAssign::runOnFunctions(BinaryContext &BC, outs() << "BOLT-WARNING: You used -reg-reassign and -update-debug-sections." << " Some registers were changed but associated AT_LOCATION for " << "impacted variables were NOT updated! This operation is " - << "currently unsupported by Bolt.\n"; + << "currently unsupported by BOLT.\n"; } outs() << "BOLT-INFO: Reg Reassignment Pass Stats:\n"; outs() << "\t " << FuncsChanged.size() << " functions affected.\n"; diff --git a/bolt/src/ProfileReader.cpp b/bolt/src/ProfileReader.cpp index 38fb80f6f703..9851814fcbfd 100644 --- a/bolt/src/ProfileReader.cpp +++ b/bolt/src/ProfileReader.cpp @@ -1,4 +1,4 @@ -//===-- ProfileReader.cpp - Bolt profile de-serializer ----------*- C++ -*-===// +//===-- ProfileReader.cpp - BOLT profile de-serializer ----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // diff --git a/bolt/src/ProfileReader.h b/bolt/src/ProfileReader.h index b8619d6e3172..7bd2ff85dbe2 100644 --- a/bolt/src/ProfileReader.h +++ b/bolt/src/ProfileReader.h @@ -1,4 +1,4 @@ -//===-- ProfileReader.h - Bolt profile deserializer -------------*- C++ -*-===// +//===-- ProfileReader.h - BOLT profile deserializer -------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // diff --git a/bolt/src/ProfileYAMLMapping.h b/bolt/src/ProfileYAMLMapping.h index 289f86612bc8..46503b73180f 100644 --- a/bolt/src/ProfileYAMLMapping.h +++ b/bolt/src/ProfileYAMLMapping.h @@ -1,4 +1,4 @@ -//===-- 
ProfileYAMLMapping.h - mappings for Bolt profile --------*- C++ -*-===// +//===-- ProfileYAMLMapping.h - mappings for BOLT profile --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index c576418e5c01..6288da355536 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -353,7 +353,7 @@ Verbosity("v", static cl::opt AddBoltInfo("add-bolt-info", - cl::desc("add Bolt version and command line argument information to " + cl::desc("add BOLT version and command line argument information to " "processed binaries"), cl::init(true), cl::cat(BoltCategory)); @@ -482,7 +482,7 @@ constexpr const char *RewriteInstance::SectionsToOverwrite[]; const std::string RewriteInstance::OrgSecPrefix = ".bolt.org"; -const std::string RewriteInstance::BoltSecPrefix = ".bolt"; +const std::string RewriteInstance::BOLTSecPrefix = ".bolt"; const char RewriteInstance::TimerGroupName[] = "rewrite"; const char RewriteInstance::TimerGroupDesc[] = "Rewrite passes"; @@ -835,7 +835,7 @@ void RewriteInstance::discoverStorage() { } if (SectionName.startswith(OrgSecPrefix) || - SectionName.startswith(BoltSecPrefix)) { + SectionName.startswith(BOLTSecPrefix)) { errs() << "BOLT-ERROR: input file was processed by BOLT. 
" "Cannot re-optimize.\n"; exit(1); @@ -1001,8 +1001,7 @@ void RewriteInstance::run() { checkLargeFunctions()) { ++PassNumber; // Emit again because now some functions have been split - outs() << "BOLT-INFO: split-functions: starting pass " << PassNumber - << "...\n"; + outs() << "BOLT: split-functions: starting pass " << PassNumber << "...\n"; reset(); executeRewritePass({}); } @@ -1013,7 +1012,7 @@ void RewriteInstance::run() { if (opts::UpdateDebugSections && opts::FixDebugInfoLargeFunctions && checkLargeFunctions()) { ++PassNumber; - outs() << "BOLT-INFO: starting pass (ignoring large functions) " + outs() << "BOLT: starting pass (ignoring large functions) " << PassNumber << "...\n"; reset(); executeRewritePass(LargeFunctions); @@ -1183,7 +1182,7 @@ void RewriteInstance::discoverFileObjects() { std::string AlternativeName; if (Name.empty()) { if (PLTSection && PLTSection->getAddress() == Address) { - // Don't register __BOLT_PLT_PSEUDO twice. + // Don't register BOLT_PLT_PSEUDO twice. continue; } UniqueName = "ANONYMOUS." 
+ std::to_string(AnonymousId++); @@ -2270,7 +2269,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { (BD->nameStartsWith("ANONYMOUS") && (BD->getSectionName().startswith(".plt") || BD->getSectionName().endswith(".plt")))) && - "Bolt symbol names of all non-section relocations must match " + "BOLT symbol names of all non-section relocations must match " "up with symbol names referenced in the relocation"); if (!opts::AllowSectionRelocations && IsSectionRelocation) { @@ -3073,7 +3072,7 @@ void RewriteInstance::mapTextSections(orc::VModuleKey Key) { const auto Flags = BinarySection::getFlags(/*IsReadOnly=*/true, /*IsText=*/true, /*IsAllocatable=*/true); - auto &Section = BC->registerOrUpdateSection(BoltSecPrefix + ".text", + auto &Section = BC->registerOrUpdateSection(BOLTSecPrefix + ".text", ELF::SHT_PROGBITS, Flags, nullptr, @@ -3578,7 +3577,7 @@ void RewriteInstance::addBoltInfoSection() { std::string DescStr; raw_string_ostream DescOS(DescStr); - DescOS << "Bolt revision: " << BoltRevision << ", " << "command line:"; + DescOS << "BOLT revision: " << BoltRevision << ", " << "command line:"; for (auto I = 0; I < Argc; ++I) { DescOS << " " << Argv[I]; } @@ -4378,7 +4377,7 @@ void RewriteInstance::rewriteFile() { OverwrittenScore += Function.getFunctionScore(); // Overwrite function in the output file. 
if (opts::Verbosity >= 2) { - outs() << "BOLT-INFO: rewriting function \"" << Function << "\"\n"; + outs() << "BOLT: rewriting function \"" << Function << "\"\n"; } OS.pwrite(reinterpret_cast(Function.getImageAddress()), Function.getImageSize(), @@ -4408,7 +4407,7 @@ void RewriteInstance::rewriteFile() { ++CountOverwrittenFunctions; if (opts::MaxFunctions && CountOverwrittenFunctions == opts::MaxFunctions) { - outs() << "BOLT-INFO: maximum number of functions reached\n"; + outs() << "BOLT: maximum number of functions reached\n"; break; } continue; @@ -4416,7 +4415,7 @@ void RewriteInstance::rewriteFile() { // Write cold part if (opts::Verbosity >= 2) { - outs() << "BOLT-INFO: rewriting function \"" << Function + outs() << "BOLT: rewriting function \"" << Function << "\" (cold part)\n"; } OS.pwrite(reinterpret_cast(Function.cold().getImageAddress()), @@ -4428,7 +4427,7 @@ void RewriteInstance::rewriteFile() { ++CountOverwrittenFunctions; if (opts::MaxFunctions && CountOverwrittenFunctions == opts::MaxFunctions) { - outs() << "BOLT-INFO: maximum number of functions reached\n"; + outs() << "BOLT: maximum number of functions reached\n"; break; } } @@ -4465,7 +4464,7 @@ void RewriteInstance::rewriteFile() { if (!Section.isFinalized() || Section.isLocal()) continue; if (opts::Verbosity >= 1) { - outs() << "BOLT-INFO: writing new section " << Section.getName() + outs() << "BOLT: writing new section " << Section.getName() << "\n data at 0x" << Twine::utohexstr(Section.getAllocAddress()) << "\n of size " << Section.getOutputSize() << "\n at offset " << Section.getFileOffset() << '\n'; diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index d208ccbe65fc..5b2c984d6740 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -323,7 +323,7 @@ class RewriteInstance { std::vector *OutputSections = nullptr, std::map *OutputSectionNameMap = nullptr); - /// Add a notes section containing the Bolt revision and command line options. 
+ /// Add a notes section containing the BOLT revision and command line options. void addBoltInfoSection(); /// Computes output .debug_line line table offsets for each compile unit, @@ -522,7 +522,7 @@ class RewriteInstance { static const std::string OrgSecPrefix; - static const std::string BoltSecPrefix; + static const std::string BOLTSecPrefix; /// Number of processed to data relocations. Used to implement the /// -max-relocations debugging option. diff --git a/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp index 394fb03d814f..6b5bc2b8f94a 100644 --- a/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -214,7 +214,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { bool hasPCRelOperand(const MCInst &Inst) const override { // ADRP is blacklisted and is an exception. Even though it has a // PC-relative operand, this operand is not a complete symbol reference - // and Bolt shouldn't try to process it in isolation. + // and BOLT shouldn't try to process it in isolation. 
if (isADRP(Inst)) return false; diff --git a/bolt/src/llvm-bolt.cpp b/bolt/src/llvm-bolt.cpp index a29697635ad4..a07e31328d39 100644 --- a/bolt/src/llvm-bolt.cpp +++ b/bolt/src/llvm-bolt.cpp @@ -33,10 +33,10 @@ using namespace bolt; namespace opts { -cl::OptionCategory BoltCategory("Bolt generic options"); -cl::OptionCategory BoltDiffCategory("BoltDiff generic options"); -cl::OptionCategory BoltOptCategory("Bolt optimization options"); -cl::OptionCategory BoltRelocCategory("Bolt options in relocation mode"); +cl::OptionCategory BoltCategory("BOLT generic options"); +cl::OptionCategory BoltDiffCategory("BOLTDIFF generic options"); +cl::OptionCategory BoltOptCategory("BOLT optimization options"); +cl::OptionCategory BoltRelocCategory("BOLT options in relocation mode"); cl::OptionCategory BoltOutputCategory("Output options"); cl::OptionCategory AggregatorCategory("Data aggregation options"); @@ -124,14 +124,14 @@ const char *BoltRevision = } static void printBoltRevision(llvm::raw_ostream &OS) { - OS << "Bolt revision " << BoltRevision << "\n"; + OS << "BOLT revision " << BoltRevision << "\n"; } void perf2boltMode(int argc, char **argv) { cl::HideUnrelatedOptions(makeArrayRef(opts::Perf2BoltCategories)); cl::ParseCommandLineOptions( argc, argv, - "perf2bolt - Bolt data aggregator\n" + "perf2bolt - BOLT data aggregator\n" "\nEXAMPLE: perf2bolt -p=perf.data executable -o data.fdata\n"); if (opts::PerfData.empty()) { errs() << ToolName << ": expected -perfdata= option.\n"; @@ -159,7 +159,7 @@ void boltDiffMode(int argc, char **argv) { cl::HideUnrelatedOptions(makeArrayRef(opts::BoltDiffCategories)); cl::ParseCommandLineOptions( argc, argv, - "llvm-boltdiff - Bolt binary diff tool\n" + "llvm-boltdiff - BOLT binary diff tool\n" "\nEXAMPLE: llvm-boltdiff -data=a.fdata -data2=b.fdata exec1 exec2\n"); if (opts::InputDataFilename2.empty()) { errs() << ToolName << ": expected -data2= option.\n"; @@ -187,7 +187,7 @@ void boltMode(int argc, char **argv) { 
cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion); cl::ParseCommandLineOptions(argc, argv, - "Bolt - Binary Optimization and Layout Tool\n"); + "BOLT - Binary Optimization and Layout Tool\n"); if (opts::OutputFilename.empty()) { errs() << ToolName << ": expected -o= option.\n"; From 6df505d111513e2739d2fc0d1f9bafbc46056a8f Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Sun, 17 Jun 2018 22:29:27 -0700 Subject: [PATCH 432/904] [BOLT] Update llvm.patch Summary: (cherry picked from commit c8b37c380cac09d9178b92edb9a99cbf861a8334) --- bolt/llvm.patch | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/bolt/llvm.patch b/bolt/llvm.patch index 53272088cb62..94821280453d 100644 --- a/bolt/llvm.patch +++ b/bolt/llvm.patch @@ -2446,6 +2446,55 @@ index ed79f4f..95cb71f 100644 set(sources X86AsmPrinter.cpp +diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp +index c58254a..ab9241e 100644 +--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp ++++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp +@@ -247,6 +247,8 @@ MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction( + // It should not be 'pause' f3 90 + InternalInstr.opcode != 0x90) + Flags |= X86::IP_HAS_REPEAT; ++ if (InternalInstr.hasLockPrefix) ++ Flags |= X86::IP_HAS_LOCK; + } + Instr.setFlags(Flags); + } +diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +index 6a10278..626b143 100644 +--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp ++++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +@@ -298,6 +298,9 @@ static bool isREX(struct InternalInstruction *insn, uint8_t prefix) { + static void setPrefixPresent(struct InternalInstruction *insn, uint8_t prefix) { + uint8_t nextByte; + switch (prefix) { ++ case 0xf0: ++ insn->hasLockPrefix = true; ++ break; + case 0xf2: + case 
0xf3: + if (lookAtByte(insn, &nextByte)) +@@ -1748,7 +1751,7 @@ static int readOperands(struct InternalInstruction* insn) { + + // If sibIndex was set to SIB_INDEX_NONE, index offset is 4. + if (insn->sibIndex == SIB_INDEX_NONE) +- insn->sibIndex = (SIBIndex)4; ++ insn->sibIndex = (SIBIndex)(insn->sibIndexBase + 4); + + // If EVEX.v2 is set this is one of the 16-31 registers. + if (insn->vectorExtensionType == TYPE_EVEX && +diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +index 44422a9..d60aa3f 100644 +--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h ++++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +@@ -563,6 +563,8 @@ struct InternalInstruction { + bool hasAdSize; + // Operand-size override + bool hasOpSize; ++ // Lock prefix ++ bool hasLockPrefix; + // The repeat prefix if any + uint8_t repeatPrefix; + diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index fa7c352..35d28c1 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp From 36cd423db89cdb100f01316ae05203b37824c25e Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 20 Jun 2018 12:03:24 -0700 Subject: [PATCH 433/904] [BOLT] Add a user friendly error reporting message Summary: In case we fail to disassemble or to build the CFG for a function, print instructions on bug reporting. 
(cherry picked from commit 6688c9ba2e407a51f8264cc202f6056958dd8adc) --- bolt/src/BinaryContext.cpp | 19 +++++++++++++++++++ bolt/src/BinaryContext.h | 3 +++ bolt/src/BinaryFunction.cpp | 10 ++++++++-- bolt/src/RewriteInstance.cpp | 16 +++++++--------- 4 files changed, 37 insertions(+), 11 deletions(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index 52b521c54758..833924d4f667 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -1052,3 +1052,22 @@ const Relocation *BinaryContext::getRelocationAt(uint64_t Address) { assert(Section && "cannot find section for address"); return Section->getRelocationAt(Address - Section->getAddress()); } + +void BinaryContext::exitWithBugReport(StringRef Message, + const BinaryFunction &Function) const { + errs() << "=======================================\n"; + errs() << "BOLT is unable to proceed because it couldn't properly understand " + "this function.\n"; + errs() << "If you are running the most recent version of BOLT, you may " + "want to " + "report this and paste this dump.\nPlease check that there is no " + "sensitive contents being shared in this dump.\n"; + errs() << "\nOffending function: " << Function.getPrintName() << "\n\n"; + ScopedPrinter SP(errs()); + SP.printBinaryBlock("Function contents", *getFunctionData(Function)); + errs() << "\n"; + Function.dump(); + errs() << "ERROR: " << Message; + errs() << "\n=======================================\n"; + exit(1); +} diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index 7d3a5db4ce6c..b320fbbf62cf 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -720,6 +720,9 @@ class BinaryContext { } return Offset; } + + void exitWithBugReport(StringRef Message, + const BinaryFunction &Function) const; }; } // namespace bolt diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 32ed7ba32e01..db441a2301e8 100644 --- a/bolt/src/BinaryFunction.cpp +++ 
b/bolt/src/BinaryFunction.cpp @@ -1721,9 +1721,15 @@ bool BinaryFunction::buildCFG() { DEBUG(dbgs() << "registering branch [0x" << Twine::utohexstr(Branch.first) << "] -> [0x" << Twine::utohexstr(Branch.second) << "]\n"); auto *FromBB = getBasicBlockContainingOffset(Branch.first); - assert(FromBB && "cannot find BB containing FROM branch"); auto *ToBB = getBasicBlockAtOffset(Branch.second); - assert(ToBB && "cannot find BB containing TO branch"); + if (!FromBB || !ToBB) { + if (!FromBB) + errs() << "BOLT-ERROR: cannot find BB containing the branch.\n"; + if (!ToBB) + errs() << "BOLT-ERROR: cannot find BB containing branch destination.\n"; + BC.exitWithBugReport("disassembly failed - inconsistent branch found.", + *this); + } FromBB->addSuccessor(ToBB); } diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 6288da355536..47c7ff848ab6 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -2469,9 +2469,9 @@ void RewriteInstance::disassembleFunctions() { Function.disassemble(*FunctionData); if (!Function.isSimple() && BC->HasRelocations) { - errs() << "BOLT-ERROR: function " << Function << " cannot be properly " - << "disassembled. Unable to continue in relocation mode.\n"; - exit(1); + BC->exitWithBugReport("function cannot be properly disassembled. " + "Unable to continue in relocation mode.", + Function); } if (opts::PrintAll || opts::PrintDisasm) @@ -2550,9 +2550,7 @@ void RewriteInstance::disassembleFunctions() { if (!Function.trapsOnEntry()) { if (!CFIRdWrt->fillCFIInfoFor(Function)) { if (BC->HasRelocations) { - errs() << "BOLT-ERROR: unable to fill CFI for function " - << Function << ". Aborting.\n"; - exit(1); + BC->exitWithBugReport("unable to fill CFI.", Function); } else { errs() << "BOLT-WARNING: unable to fill CFI for function " << Function << ". 
Skipping.\n"; @@ -2952,7 +2950,7 @@ void RewriteInstance::mapFileSections(orc::VModuleKey Key) { mapTextSections(Key); mapDataSections(Key); } - + void RewriteInstance::mapTextSections(orc::VModuleKey Key) { NewTextSectionStartAddress = NextAvailableAddress; if (BC->HasRelocations) { @@ -3248,7 +3246,7 @@ void RewriteInstance::emitDataSection(MCStreamer *Streamer, if (BC->HasRelocations && opts::HotData && Section.isReordered()) Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("__hot_data_start")); - + DEBUG(dbgs() << "BOLT-DEBUG: emitting " << (Section.isAllocatable() ? "" : "non-") << "allocatable data section " << SectionName << '\n'); @@ -3718,7 +3716,7 @@ std::vector RewriteInstance::getOutputSections( AllocatableSections.push_back(&Section); } } - + for (const auto *Section : AllocatableSections) { // Ignore function sections. if (Section->getFileAddress() < NewTextSegmentAddress) { From 2e470bd8c0c1af54c1235b19729ec8037e70cebf Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 20 Jun 2018 21:43:22 -0700 Subject: [PATCH 434/904] [BOLT] Fix support for PIC jump tables Summary: BOLT heuristics failed to work if false PIC jump table entries were accepted when they were pointing inside a function, but not at an instruction boundary. This fix checks if the destination falls at instruction boundary, and if it does not, it truncates the jump table. This, of course, still does not guarantee that the entry corresponds to a real destination, and we can have "false positive" entry(ies). However, it shouldn't affect correctness of the function, but the CFG may have edges that are never taken. We may update an incorrect jump table entry, corresponding to an unrelated data, and for that reason we force moving of jump tables if a PIC jump table was detected. 
(cherry picked from commit 548d923bcb4905551b6e9619f66c6ebcf6207dc4) --- bolt/src/BinaryFunction.cpp | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index db441a2301e8..4d949ff092ae 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -854,12 +854,13 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, Type = IndirectBranchType::POSSIBLE_JUMP_TABLE; continue; } - // Potentially a switch table can contain __builtin_unreachable() entry + // Potentially a switch table can contain __builtin_unreachable() entry // pointing just right after the function. In this case we have to check // another entry. Otherwise the entry is outside of this function scope // and it's not a switch table. if (Value == getAddress() + getSize()) { - JTOffsetCandidates.push_back(Value - getAddress()); + JTOffsetCandidates.push_back(getSize()); + IgnoredBranches.emplace_back(Offset, getSize()); } else { break; } @@ -1388,16 +1389,32 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { addInstruction(Offset, std::move(Instruction)); } - postProcessJumpTables(); - updateState(State::Disassembled); + + postProcessJumpTables(); } void BinaryFunction::postProcessJumpTables() { // Create labels for all entries. 
for (auto &JTI : JumpTables) { auto &JT = *JTI.second; - for (auto Offset : JT.OffsetEntries) { + if (JT.Type == JumpTable::JTT_PIC && opts::JumpTables == JTS_BASIC) { + opts::JumpTables = JTS_MOVE; + outs() << "BOLT-INFO: forcing -jump-tables=move as PIC jump table was " + "detected\n"; + } + for (unsigned I = 0; I < JT.OffsetEntries.size(); ++I) { + auto Offset = JT.OffsetEntries[I]; + if (Offset != getSize() && !getInstructionAtOffset(Offset)) { + DEBUG(dbgs() << "BOLT-DEBUG: truncating jump table " << JT.getName() + << " at index " << I << " containing offset 0x" + << Twine::utohexstr(Offset) << '\n'); + assert(I > 1 && "jump table with a size smaller than 1 detected"); + assert(JT.Type == JumpTable::JTT_PIC && + "unexpected truncation of non-PIC jump table"); + JT.OffsetEntries.resize(I); + break; + } auto *Label = getOrCreateLocalLabel(getAddress() + Offset, /*CreatePastEnd*/ true); JT.Entries.push_back(Label); From ea3ce67b8fb1d249c4bc48a0f40c959bc6292139 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 21 Jun 2018 14:45:38 -0700 Subject: [PATCH 435/904] [merge-fdata] Support legacy/non-YAML profile format Summary: Concatenate profile contents if they are not in YAML format. 
(cherry picked from commit 51493efe737f11bfb36f1e8177a9b80bd20bb827) --- bolt/src/merge-fdata/merge-fdata.cpp | 29 ++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/bolt/src/merge-fdata/merge-fdata.cpp b/bolt/src/merge-fdata/merge-fdata.cpp index 61d9cb956cc8..58ee4c817bd3 100644 --- a/bolt/src/merge-fdata/merge-fdata.cpp +++ b/bolt/src/merge-fdata/merge-fdata.cpp @@ -233,6 +233,30 @@ void mergeFunctionProfile(BinaryFunctionProfile &MergedBF, } } +bool isYAML(const StringRef Filename) { + auto MB = MemoryBuffer::getFileOrSTDIN(Filename); + if (std::error_code EC = MB.getError()) + report_error(Filename, EC); + auto Buffer = MB.get()->getBuffer(); + if (Buffer.startswith("---\n")) + return true; + return false; +} + +void mergeLegacyProfiles(const cl::list &Filenames) { + errs() << "Using legacy profile format.\n"; + for (auto &Filename : Filenames) { + if (isYAML(Filename)) + report_error(Filename, "cannot mix YAML and legacy formats"); + auto MB = MemoryBuffer::getFileOrSTDIN(Filename); + if (std::error_code EC = MB.getError()) + report_error(Filename, EC); + errs() << "Merging data from " << Filename << "...\n"; + outs() << MB.get()->getBuffer(); + } + errs() << "Profile from " << Filenames.size() << " files merged.\n"; +} + } // anonymous namespace int main(int argc, char **argv) { @@ -249,6 +273,11 @@ int main(int argc, char **argv) { ToolName = argv[0]; + if (!isYAML(opts::InputDataFilenames.front())) { + mergeLegacyProfiles(opts::InputDataFilenames); + return 0; + } + // Merged header. BinaryProfileHeader MergedHeader; MergedHeader.Version = 1; From f008574bf49b922c644fc5a3d9a9fb1e7c02e1f0 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 22 Jun 2018 13:50:07 -0700 Subject: [PATCH 436/904] [BOLT] Add initial bolt-only test infra Summary: Create folders and setup to make LIT run BOLT-only tests. Add a test example. This will add a new make/ninja rule "check-bolt" that the user can invoke to run LIT on this folder. 
(cherry picked from commit b13d2e5861d7125ac23ff9ef1f537a23d561e4a6) --- bolt/CMakeLists.txt | 4 ++ bolt/test/CMakeLists.txt | 39 +++++++++++ bolt/test/X86/Inputs/srol-bug-input.yaml | 61 +++++++++++++++++ bolt/test/X86/srol-bug.test | 43 ++++++++++++ bolt/test/lit.cfg.py | 86 ++++++++++++++++++++++++ bolt/test/lit.site.cfg.py.in | 37 ++++++++++ 6 files changed, 270 insertions(+) create mode 100644 bolt/test/CMakeLists.txt create mode 100644 bolt/test/X86/Inputs/srol-bug-input.yaml create mode 100644 bolt/test/X86/srol-bug.test create mode 100644 bolt/test/lit.cfg.py create mode 100644 bolt/test/lit.site.cfg.py.in diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index febd4f0ab6f8..7860179253b3 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -1 +1,5 @@ +set(BOLT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(BOLT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) + add_subdirectory(src) +add_subdirectory(test) diff --git a/bolt/test/CMakeLists.txt b/bolt/test/CMakeLists.txt new file mode 100644 index 000000000000..ca2756d16f20 --- /dev/null +++ b/bolt/test/CMakeLists.txt @@ -0,0 +1,39 @@ +llvm_canonicalize_cmake_booleans( + ENABLE_BACKTRACES) + +configure_lit_site_cfg( + ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in + ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py + MAIN_CONFIG + ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py + ) + +set(BOLT_TEST_PARAMS + bolt_site_config=${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg + ) + +list(APPEND BOLT_TEST_DEPS + llvm-config + FileCheck count not + llvm-nm + llvm-objdump + llvm-readobj + yaml2obj + ) + +add_custom_target(bolt-test-depends DEPENDS ${BOLT_TEST_DEPS}) +set_target_properties(bolt-test-depends PROPERTIES FOLDER "BOLT tests") + +add_lit_testsuite(check-bolt "Running the BOLT regression tests" + ${CMAKE_CURRENT_BINARY_DIR} + #LIT ${LLVM_LIT} + PARAMS ${BOLT_TEST_PARAMS} + DEPENDS ${BOLT_TEST_DEPS} + ARGS ${BOLT_TEST_EXTRA_ARGS} + ) +set_target_properties(check-bolt PROPERTIES FOLDER "BOLT tests") + +add_lit_testsuites(BOLT 
${CMAKE_CURRENT_SOURCE_DIR} + PARAMS ${BOLT_TEST_PARAMS} + DEPENDS ${BOLT_TEST_DEPS} +) diff --git a/bolt/test/X86/Inputs/srol-bug-input.yaml b/bolt/test/X86/Inputs/srol-bug-input.yaml new file mode 100644 index 000000000000..4393f845ca3d --- /dev/null +++ b/bolt/test/X86/Inputs/srol-bug-input.yaml @@ -0,0 +1,61 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 + Entry: 0x0000000000400000 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000400000 + AddressAlign: 0x0000000000000010 + Content: 0315fa0000002B15f40000002315ee0000000B15e80000003315e2000000500fB715da0000008A15d4000000668B15cd0000008B15c7000000488B15c00000003A15ba000000663B15b30000003B15ad000000483B15a60000008415a0000000668515990000008515930000004885158c000000C3 + - Name: .rodata + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x0000000000400100 + AddressAlign: 0x0000000000000100 + Content: 010002000000000000000000 + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x000000000061ADA8 + Link: .dynstr + AddressAlign: 0x0000000000000008 + Content: 
01000000000000000100000000000000010000000000000096000000000000000100000000000000C400000000000000010000000000000001010000000000000C0000000000000028224000000000000D000000000000005C29410000000000190000000000000028A36100000000001B0000000000000008000000000000001A0000000000000030A36100000000001C000000000000000800000000000000F5FEFF6F0000000098024000000000000500000000000000300F4000000000000600000000000000D0024000000000000A00000000000000BC050000000000000B00000000000000180000000000000015000000000000000000000000000000030000000000000000B06100000000000200000000000000C80A0000000000001400000000000000070000000000000017000000000000006017400000000000070000000000000088164000000000000800000000000000D80000000000000009000000000000001800000000000000FEFFFF6F00000000F815400000000000FFFFFF6F000000000200000000000000F0FFFF6F00000000EC14400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +Symbols: + Global: + - Name: mydata + Section: .rodata + Value: 0x0000000000400100 + - Name: myfunc + Type: STT_FUNC + Section: .text + Value: 0x0000000000400000 +DynamicSymbols: + Global: + - Name: mydata + Section: .rodata + Value: 0x0000000000400100 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_X, PF_R ] + VAddr: 0x00400000 + PAddr: 0x00400000 + Sections: + - Section: .text + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + VAddr: 0x00400000 + PAddr: 0x00400000 + Sections: + - Section: .text + - Type: PT_DYNAMIC + Flags: [ PF_X, PF_R ] + VAddr: 0x0061ADA8 + PAddr: 0x0064ADA8 + Sections: + - Section: .dynamic +... 
diff --git a/bolt/test/X86/srol-bug.test b/bolt/test/X86/srol-bug.test new file mode 100644 index 000000000000..a6abbea4a2f2 --- /dev/null +++ b/bolt/test/X86/srol-bug.test @@ -0,0 +1,43 @@ +# RUN: yaml2obj %p/Inputs/srol-bug-input.yaml &> %t.exe +# RUN: llvm-bolt %t.exe -simplify-rodata-loads -print-finalized -relocs=0 \ +# RUN: -print-disasm -o %t.out | FileCheck %s + +CHECK: Binary Function "myfunc" after disassembly { +CHECK: 00000000: addl mydata(%rip), %edx +CHECK: 00000006: subl mydata(%rip), %edx +CHECK: 0000000c: andl mydata(%rip), %edx +CHECK: 00000012: orl mydata(%rip), %edx +CHECK: 00000018: xorl mydata(%rip), %edx +CHECK: 0000001f: movzwl mydata(%rip), %edx +CHECK: 00000026: movb mydata(%rip), %dl +CHECK: 0000002c: movw mydata(%rip), %dx +CHECK: 00000033: movl mydata(%rip), %edx +CHECK: 00000039: movq mydata(%rip), %rdx +CHECK: 00000040: cmpb mydata(%rip), %dl +CHECK: 00000046: cmpw mydata(%rip), %dx +CHECK: 0000004d: cmpl mydata(%rip), %edx +CHECK: 00000053: cmpq mydata(%rip), %rdx +CHECK: 0000005a: testb %dl, mydata(%rip) +CHECK: 00000060: testw %dx, mydata(%rip) +CHECK: 00000067: testl %edx, mydata(%rip) +CHECK: 0000006d: testq %rdx, mydata(%rip) + +CHECK: Binary Function "myfunc" after finalize-functions { +CHECK: 00000000: addl $0x20001, %edx +CHECK: 00000006: subl $0x20001, %edx +CHECK: 0000000c: andl $0x20001, %edx +CHECK: 00000012: orl $0x20001, %edx +CHECK: 00000018: xorl $0x20001, %edx +CHECK: 0000001f: movl $0x1, %edx +CHECK: 00000024: movb $0x1, %dl +CHECK: 00000026: movw $0x1, %dx +CHECK: 0000002a: movl $0x20001, %edx +CHECK: 0000002f: movq $0x20001, %rdx +CHECK: 00000036: cmpb $0x1, %dl +CHECK: 00000039: cmpw $0x1, %dx +CHECK: 0000003d: cmpl $0x20001, %edx +CHECK: 00000043: cmpq $0x20001, %rdx +CHECK: 0000004a: testb $0x1, %dl +CHECK: 0000004d: testw $0x1, %dx +CHECK: 00000052: testl $0x20001, %edx +CHECK: 00000058: testq $0x20001, %rdx diff --git a/bolt/test/lit.cfg.py b/bolt/test/lit.cfg.py new file mode 100644 index 
000000000000..9c678ca1a1bb --- /dev/null +++ b/bolt/test/lit.cfg.py @@ -0,0 +1,86 @@ +# -*- Python -*- + +import os +import platform +import re +import subprocess +import tempfile + +import lit.formats +import lit.util + +from lit.llvm import llvm_config +from lit.llvm.subst import ToolSubst +from lit.llvm.subst import FindTool + +# Configuration file for the 'lit' test runner. + +# name: The name of this test suite. +config.name = 'BOLT' + +# testFormat: The test format to use to interpret tests. +# +# For now we require '&&' between commands, until they get globally killed and +# the test runner updated. +config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell) + +# suffixes: A list of file extensions to treat as test files. +config.suffixes = ['.c', '.cpp', '.cppm', '.m', '.mm', '.cu', + '.ll', '.cl', '.s', '.S', '.modulemap', '.test', '.rs'] + +# excludes: A list of directories to exclude from the testsuite. The 'Inputs' +# subdirectories contain auxiliary inputs for various tests in their parent +# directories. +config.excludes = ['Inputs', 'CMakeLists.txt', 'README.txt', 'LICENSE.txt'] + +# test_source_root: The root path where tests are located. +config.test_source_root = os.path.dirname(__file__) + +# test_exec_root: The root path where tests should be run. +config.test_exec_root = os.path.join(config.bolt_obj_root, 'test') + +llvm_config.use_default_substitutions() + +tool_dirs = [config.llvm_tools_dir] +tools = [ + ToolSubst('llvm-bolt', unresolved='fatal'), +] +llvm_config.add_tool_substitutions([], tool_dirs) +llvm_config.with_environment('PATH', tool_dirs, append_path=True) + +# Propagate path to symbolizer for ASan/MSan. +llvm_config.with_system_environment( + ['ASAN_SYMBOLIZER_PATH', 'MSAN_SYMBOLIZER_PATH']) + +config.substitutions.append(('%PATH%', config.environment['PATH'])) + +# Plugins (loadable modules) +# TODO: This should be supplied by Makefile or autoconf. 
+if sys.platform in ['win32', 'cygwin']: + has_plugins = config.enable_shared +else: + has_plugins = True + +if has_plugins and config.llvm_plugin_ext: + config.available_features.add('plugins') + +def calculate_arch_features(arch_string): + features = [] + for arch in arch_string.split(): + features.append(arch.lower() + '-registered-target') + return features + + +llvm_config.feature_config( + [('--assertion-mode', {'ON': 'asserts'}), + ('--cxxflags', {r'-D_GLIBCXX_DEBUG\b': 'libstdcxx-safe-mode'}), + ('--targets-built', calculate_arch_features) + ]) + +if config.enable_backtrace: + config.available_features.add('backtrace') + +# Check if we should allow outputs to console. +run_console_tests = int(lit_config.params.get('enable_console', '0')) +if run_console_tests != 0: + config.available_features.add('console') diff --git a/bolt/test/lit.site.cfg.py.in b/bolt/test/lit.site.cfg.py.in new file mode 100644 index 000000000000..3f598a06ca85 --- /dev/null +++ b/bolt/test/lit.site.cfg.py.in @@ -0,0 +1,37 @@ +@LIT_SITE_CFG_IN_HEADER@ + +import sys + +config.llvm_src_root = "@LLVM_SOURCE_DIR@" +config.llvm_obj_root = "@LLVM_BINARY_DIR@" +config.bolt_obj_root = "@BOLT_BINARY_DIR@" +config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" +config.llvm_libs_dir = "@LLVM_LIBS_DIR@" +config.llvm_shlib_dir = "@SHLIBDIR@" +config.llvm_plugin_ext = "@LLVM_PLUGIN_EXT@" +config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@" +config.host_triple = "@LLVM_HOST_TRIPLE@" +config.target_triple = "@TARGET_TRIPLE@" +config.host_cxx = "@CMAKE_CXX_COMPILER@" +config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" +config.enable_shared = @ENABLE_SHARED@ +config.enable_backtrace = @ENABLE_BACKTRACES@ +config.host_arch = "@HOST_ARCH@" +config.enable_abi_breaking_checks = "@LLVM_ENABLE_ABI_BREAKING_CHECKS@" +config.python_executable = "@PYTHON_EXECUTABLE@" + +# Support substitution of the tools and libs dirs with user parameters. This is +# used when we can't determine the tool dir at configuration time. 
+try: + config.llvm_tools_dir = config.llvm_tools_dir % lit_config.params + config.llvm_shlib_dir = config.llvm_shlib_dir % lit_config.params + config.llvm_libs_dir = config.llvm_libs_dir % lit_config.params +except KeyError: + e = sys.exc_info()[1] + key, = e.args + lit_config.fatal("unable to find %r parameter, use '--param=%s=VALUE'" % (key,key)) + +@LIT_SITE_CFG_IN_FOOTER@ + +# Let the main config do the real work. +lit_config.load_config(config, "@BOLT_SOURCE_DIR@/test/lit.cfg.py") From 355538e29bf347013819a03eb50418851790b54c Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Thu, 21 Jun 2018 11:03:57 -0700 Subject: [PATCH 437/904] [BOLT] Fix call to evaluateX86MemOperands Summary: There was a call site not providing a displacement immediate value. This assertion is firing in open source. (cherry picked from commit c3460aa7b6c1496624cad5db9e17055d008e50ff) --- bolt/src/Target/X86/X86MCPlusBuilder.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bolt/src/Target/X86/X86MCPlusBuilder.cpp b/bolt/src/Target/X86/X86MCPlusBuilder.cpp index 847dee95e9ac..414486079d01 100644 --- a/bolt/src/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/src/Target/X86/X86MCPlusBuilder.cpp @@ -2309,10 +2309,11 @@ class X86MCPlusBuilder : public MCPlusBuilder { int64_t ScaleValue; unsigned IndexRegNum; const MCExpr *DispExpr = nullptr; + int64_t DispValue; unsigned SegRegNum; if (!evaluateX86MemoryOperand(Instr, &BaseRegNum, &ScaleValue, &IndexRegNum, - nullptr, &SegRegNum, &DispExpr)) + &DispValue, &SegRegNum, &DispExpr)) break; if (BaseRegNum != RegInfo->getProgramCounter() || IndexRegNum != X86::NoRegister || From 723581dad4354e8f628c668d61f15093ca12523b Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 25 Jun 2018 14:55:48 -0700 Subject: [PATCH 438/904] Disable -split-eh in non-relocation mode Summary: This option only works in relocation mode. In non-relocation mode, it generates invalid references that cause MCStreamer to fail.
Disable this flag if the user requested and print a warning. (cherry picked from commit 01c7d286f0748006aaed532d5a7043d2a712cea2) --- bolt/src/Passes/BinaryPasses.cpp | 8 +------- bolt/src/RewriteInstance.cpp | 11 +++++++++++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/bolt/src/Passes/BinaryPasses.cpp b/bolt/src/Passes/BinaryPasses.cpp index 569598c5ae18..98370d96d95a 100644 --- a/bolt/src/Passes/BinaryPasses.cpp +++ b/bolt/src/Passes/BinaryPasses.cpp @@ -53,6 +53,7 @@ extern cl::OptionCategory BoltOptCategory; extern cl::opt AlignMacroOpFusion; extern cl::opt Verbosity; +extern cl::opt SplitEH; extern cl::opt SplitFunctions; extern bool shouldProcess(const bolt::BinaryFunction &Function); @@ -201,13 +202,6 @@ SctcMode("sctc-mode", cl::ZeroOrMore, cl::cat(BoltOptCategory)); -static cl::opt -SplitEH("split-eh", - cl::desc("split C++ exception handling code (experimental)"), - cl::ZeroOrMore, - cl::Hidden, - cl::cat(BoltOptCategory)); - static cl::opt TSPThreshold("tsp-threshold", cl::desc("maximum number of hot basic blocks in a function for which to use " diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 47c7ff848ab6..158949abf43d 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -309,6 +309,13 @@ SplitFunctions("split-functions", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +cl::opt +SplitEH("split-eh", + cl::desc("split C++ exception handling code"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + cl::opt TrapOldCode("trap-old-code", cl::desc("insert traps in old function bodies (relocation mode)"), @@ -1775,6 +1782,10 @@ void RewriteInstance::adjustCommandLineOptions() { "mode\n"; opts::AlignMacroOpFusion = MFT_NONE; } + if (opts::SplitEH && !BC->HasRelocations) { + outs() << "BOLT-WARNING: disabling -split-eh in non-relocation mode\n"; + opts::SplitEH = false; + } if (BC->isX86() && BC->HasRelocations && opts::AlignMacroOpFusion == MFT_HOT && !DA.started() && 
BC->DR.getAllFuncsData().empty() && From 39dc4b3f5823890511718ca25472ff92e464abeb Mon Sep 17 00:00:00 2001 From: Facebook Github Bot Date: Tue, 26 Jun 2018 17:02:00 -0700 Subject: [PATCH 439/904] [BOLT][PR] In some cases DB could be nullptr Summary: When processing binary with -debug mode in some cases, BD could be nullptr. It will be better to fail later on assert than here with segfault. Closes https://github.com/facebookincubator/BOLT/pull/18 GitHub Author: Alexander Gryanko (cherry picked from commit 391a830dddaad32d0f073a1be2d7db8d799d8953) --- bolt/src/BinaryFunction.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 4d949ff092ae..bd6704fcdba4 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -1017,8 +1017,9 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { auto *TargetSymbol = BC.getOrCreateGlobalSymbol(TargetAddress, 0, 0, "DATAat"); DEBUG(if (opts::Verbosity >= 2) { + auto SectionName = BD ? BD->getSectionName() : ""; dbgs() << "Created DATAat sym: " << TargetSymbol->getName() - << " in section " << BD->getSectionName() << "\n"; + << " in section " << SectionName << "\n"; }); return TargetSymbol; }; From 1d60f6cd463d5232683b903ef7e8d5dd4e75ec18 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 11 Jun 2018 13:18:44 -0700 Subject: [PATCH 440/904] [X86] Support a subset of internal calls Summary: Add support for functions with internal calls, necessary for handling Intel MKL library and some code observed in google core dumper library. This is not optimizing these functions, but only identifying them, running analyses to assure we will not break those functions if we move them, and then "freezing" these functions (marking as not simple so Bolt will not try to reorder it or touch it in any way). 
(cherry picked from commit 310eca9d3dec7638ae0074aa09391b945f115b8a) --- bolt/src/BinaryBasicBlock.h | 14 + bolt/src/BinaryFunction.cpp | 15 +- bolt/src/BinaryPassManager.cpp | 3 + bolt/src/Passes/BinaryPasses.cpp | 2 +- bolt/src/Passes/CMakeLists.txt | 1 + bolt/src/Passes/DataflowAnalysis.h | 14 +- bolt/src/Passes/FrameOptimizer.cpp | 2 +- bolt/src/Passes/IndirectCallPromotion.cpp | 18 +- bolt/src/Passes/JTFootprintReduction.cpp | 2 +- bolt/src/Passes/ReachingDefOrUse.h | 31 +- bolt/src/Passes/RegAnalysis.cpp | 42 ++- bolt/src/Passes/RegAnalysis.h | 53 ++-- bolt/src/Passes/RegReAssign.cpp | 2 +- bolt/src/Passes/StokeInfo.cpp | 2 +- bolt/src/Passes/ValidateInternalCalls.cpp | 347 ++++++++++++++++++++++ bolt/src/Passes/ValidateInternalCalls.h | 106 +++++++ bolt/src/Target/X86/X86MCPlusBuilder.cpp | 8 + 17 files changed, 598 insertions(+), 64 deletions(-) create mode 100644 bolt/src/Passes/ValidateInternalCalls.cpp create mode 100644 bolt/src/Passes/ValidateInternalCalls.h diff --git a/bolt/src/BinaryBasicBlock.h b/bolt/src/BinaryBasicBlock.h index d20e8d653433..66f0da262e81 100644 --- a/bolt/src/BinaryBasicBlock.h +++ b/bolt/src/BinaryBasicBlock.h @@ -556,6 +556,20 @@ class BinaryBasicBlock { uint64_t Count = 0, uint64_t MispredictedCount = 0); + /// Move all of this block's successors to a new block, and set the + /// execution count of this new block with our execution count. This is + /// useful when splitting a block in two. + void moveAllSuccessorsTo(BinaryBasicBlock *New) { + New->addSuccessors(successors().begin(), + successors().end(), + branch_info_begin(), + branch_info_end()); + removeAllSuccessors(); + + // Update the execution count on the new block. + New->setExecutionCount(getExecutionCount()); + } + /// Remove /p Succ basic block from the list of successors. Update the /// list of predecessors of /p Succ and update branch info. 
void removeSuccessor(BinaryBasicBlock *Succ); diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index bd6704fcdba4..50064ca00756 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -1207,11 +1207,16 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { // Recursive call. TargetSymbol = getSymbol(); } else { - // Possibly an old-style PIC code - errs() << "BOLT-WARNING: internal call detected at 0x" - << Twine::utohexstr(AbsoluteInstrAddr) << " in function " - << *this << ". Skipping.\n"; - IsSimple = false; + if (BC.isX86()) { + // Dangerous old-style x86 PIC code. We may need to freeze this + // function, so preserve the function as is for now. + PreserveNops = true; + } else { + errs() << "BOLT-WARNING: internal call detected at 0x" + << Twine::utohexstr(AbsoluteInstrAddr) << " in function " + << *this << ". Skipping.\n"; + IsSimple = false; + } } } diff --git a/bolt/src/BinaryPassManager.cpp b/bolt/src/BinaryPassManager.cpp index fbfbe718ba51..8d42e5304063 100644 --- a/bolt/src/BinaryPassManager.cpp +++ b/bolt/src/BinaryPassManager.cpp @@ -23,6 +23,7 @@ #include "Passes/ReorderFunctions.h" #include "Passes/ReorderData.h" #include "Passes/StokeInfo.h" +#include "Passes/ValidateInternalCalls.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" #include @@ -359,6 +360,8 @@ void BinaryFunctionPassManager::runAllPasses( // Run this pass first to use stats for the original functions. 
Manager.registerPass(llvm::make_unique(NeverPrint)); + Manager.registerPass(llvm::make_unique(NeverPrint)); + Manager.registerPass(llvm::make_unique(NeverPrint), opts::StripRepRet); diff --git a/bolt/src/Passes/BinaryPasses.cpp b/bolt/src/Passes/BinaryPasses.cpp index 98370d96d95a..3367ca4f6d3e 100644 --- a/bolt/src/Passes/BinaryPasses.cpp +++ b/bolt/src/Passes/BinaryPasses.cpp @@ -586,7 +586,7 @@ void FixupBranches::runOnFunctions( for (auto &It : BFs) { auto &Function = It.second; if (BC.HasRelocations || shouldOptimize(Function)) { - if (BC.isAArch64() && !Function.isSimple()) + if (BC.HasRelocations && !Function.isSimple()) continue; Function.fixBranches(); } diff --git a/bolt/src/Passes/CMakeLists.txt b/bolt/src/Passes/CMakeLists.txt index e088752036e7..26842ce568fc 100644 --- a/bolt/src/Passes/CMakeLists.txt +++ b/bolt/src/Passes/CMakeLists.txt @@ -32,6 +32,7 @@ add_llvm_library(LLVMBOLTPasses StackPointerTracking.cpp StackReachingUses.cpp StokeInfo.cpp + ValidateInternalCalls.cpp DEPENDS intrinsics_gen diff --git a/bolt/src/Passes/DataflowAnalysis.h b/bolt/src/Passes/DataflowAnalysis.h index 6700245d1f2f..1c0ea3e6523b 100644 --- a/bolt/src/Passes/DataflowAnalysis.h +++ b/bolt/src/Passes/DataflowAnalysis.h @@ -458,7 +458,7 @@ class DataflowAnalysis { class ExprIterator : public std::iterator { const BitVector *BV; - const std::vector &Expressions; + const std::vector &Expressions; int Idx; public: @@ -475,15 +475,15 @@ class ExprIterator } bool operator==(const ExprIterator &Other) const { return Idx == Other.Idx; } bool operator!=(const ExprIterator &Other) const { return Idx != Other.Idx; } - const MCInst *operator*() { + MCInst *operator*() { assert(Idx != -1 && "Invalid access to end iterator"); return Expressions[Idx]; } - ExprIterator(const BitVector *BV, const std::vector &Exprs) + ExprIterator(const BitVector *BV, const std::vector &Exprs) : BV(BV), Expressions(Exprs) { Idx = BV->find_first(); } - ExprIterator(const BitVector *BV, const std::vector 
&Exprs, + ExprIterator(const BitVector *BV, const std::vector &Exprs, int Idx) : BV(BV), Expressions(Exprs), Idx(Idx) {} @@ -503,12 +503,12 @@ class InstrsDataflowAnalysis /// These iterator functions offer access to the set of pointers to /// instructions in a given program point template - ExprIterator expr_begin(T &Point) const { + ExprIterator expr_begin(const T &Point) const { if (auto State = this->getStateAt(Point)) return ExprIterator(&*State, Expressions); return expr_end(); } - ExprIterator expr_begin(BitVector &BV) const { + ExprIterator expr_begin(const BitVector &BV) const { return ExprIterator(&BV, Expressions); } ExprIterator expr_end() const { @@ -522,7 +522,7 @@ class InstrsDataflowAnalysis /// expression/def) into a vector because we need to associate them with /// small numbers. They will be tracked via BitVectors throughout the /// dataflow analysis. - std::vector Expressions; + std::vector Expressions; /// Maps expressions defs (MCInsts) to its index in the Expressions vector std::unordered_map ExprToIdx; diff --git a/bolt/src/Passes/FrameOptimizer.cpp b/bolt/src/Passes/FrameOptimizer.cpp index f014c16e6dcf..d885f831d0b7 100644 --- a/bolt/src/Passes/FrameOptimizer.cpp +++ b/bolt/src/Passes/FrameOptimizer.cpp @@ -230,7 +230,7 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC, // Run FrameAnalysis pass BinaryFunctionCallGraph CG = buildCallGraph(BC, BFs); FrameAnalysis FA(BC, BFs, CG); - RegAnalysis RA(BC, BFs, CG); + RegAnalysis RA(BC, &BFs, &CG); // Our main loop: perform caller-saved register optimizations, then // callee-saved register optimizations (shrink wrapping). 
diff --git a/bolt/src/Passes/IndirectCallPromotion.cpp b/bolt/src/Passes/IndirectCallPromotion.cpp index 6cb483a62d8f..3fd79ac1eb38 100644 --- a/bolt/src/Passes/IndirectCallPromotion.cpp +++ b/bolt/src/Passes/IndirectCallPromotion.cpp @@ -738,18 +738,6 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( using BinaryBranchInfo = BinaryBasicBlock::BinaryBranchInfo; BinaryBasicBlock *MergeBlock = nullptr; - auto moveSuccessors = [](BinaryBasicBlock *Old, BinaryBasicBlock *New) { - // Copy over successors to the new block. - New->addSuccessors(Old->successors().begin(), - Old->successors().end(), - Old->branch_info_begin(), - Old->branch_info_end()); - Old->removeAllSuccessors(); - - // Update the execution count on the new block. - New->setExecutionCount(Old->getExecutionCount()); - }; - // Scale indirect call counts to the execution count of the original // basic block containing the indirect call. uint64_t TotalIndirectBranches = 0; @@ -796,7 +784,7 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( }; if (IsJumpTable) { - moveSuccessors(IndCallBlock, NewBBs.back().get()); + IndCallBlock->moveAllSuccessorsTo(NewBBs.back().get()); std::vector SymTargets; for (size_t I = 0; I < Targets.size(); ++I) { @@ -840,7 +828,7 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( if (!IsTailCall) { MergeBlock = NewBBs.back().get(); - moveSuccessors(IndCallBlock, MergeBlock); + IndCallBlock->moveAllSuccessorsTo(MergeBlock); } // Fix up successors and execution counts. 
@@ -1108,7 +1096,7 @@ void IndirectCallPromotion::runOnFunctions( std::unique_ptr CG; if (opts::IndirectCallPromotion >= ICP_JUMP_TABLES) { CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs))); - RA.reset(new RegAnalysis(BC, BFs, *CG)); + RA.reset(new RegAnalysis(BC, &BFs, &*CG)); } DEBUG_VERBOSE(2, { diff --git a/bolt/src/Passes/JTFootprintReduction.cpp b/bolt/src/Passes/JTFootprintReduction.cpp index dd47ff315161..ac7328406159 100644 --- a/bolt/src/Passes/JTFootprintReduction.cpp +++ b/bolt/src/Passes/JTFootprintReduction.cpp @@ -250,7 +250,7 @@ void JTFootprintReduction::runOnFunctions( std::unique_ptr CG; if (!opts::JTFootprintOnlyPIC) { CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs))); - RA.reset(new RegAnalysis(BC, BFs, *CG)); + RA.reset(new RegAnalysis(BC, &BFs, &*CG)); } for (auto &BFIt : BFs) { auto &Function = BFIt.second; diff --git a/bolt/src/Passes/ReachingDefOrUse.h b/bolt/src/Passes/ReachingDefOrUse.h index 67701948de2d..429f3c8f2060 100644 --- a/bolt/src/Passes/ReachingDefOrUse.h +++ b/bolt/src/Passes/ReachingDefOrUse.h @@ -14,6 +14,7 @@ #include "DataflowAnalysis.h" #include "RegAnalysis.h" +#include "llvm/ADT/Optional.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Timer.h" @@ -35,8 +36,9 @@ class ReachingDefOrUse public: ReachingDefOrUse(const RegAnalysis &RA, const BinaryContext &BC, - BinaryFunction &BF) - : InstrsDataflowAnalysis, !Def>(BC, BF), RA(RA) {} + BinaryFunction &BF, Optional TrackingReg = None) + : InstrsDataflowAnalysis, !Def>(BC, BF), RA(RA), + TrackingReg(TrackingReg) {} virtual ~ReachingDefOrUse() {} bool isReachedBy(MCPhysReg Reg, ExprIterator Candidates) { @@ -67,6 +69,10 @@ class ReachingDefOrUse /// Reference to the result of reg analysis const RegAnalysis &RA; + /// If set, limit the dataflow to only track instructions affecting this + /// register. Otherwise the analysis can be too permissive. 
+ Optional TrackingReg; + void preflight() { // Populate our universe of tracked expressions with all instructions // except pseudos @@ -103,6 +109,11 @@ class ReachingDefOrUse RA.getInstClobberList(*Y, YClobbers); else this->BC.MIB->getTouchedRegs(*Y, YClobbers); + // Limit the analysis, if requested + if (TrackingReg) { + XClobbers &= this->BC.MIB->getAliases(*TrackingReg); + YClobbers &= this->BC.MIB->getAliases(*TrackingReg); + } // X kills Y if it clobbers Y completely -- this is a conservative approach. // In practice, we may produce use-def links that may not exist. XClobbers &= YClobbers; @@ -120,7 +131,21 @@ class ReachingDefOrUse } // Gen if (!this->BC.MIB->isCFI(Point)) { - Next.set(this->ExprToIdx[&Point]); + if (TrackingReg == None) { + // Track all instructions + Next.set(this->ExprToIdx[&Point]); + } + else { + // Track only instructions relevant to TrackingReg + auto Regs = BitVector(this->BC.MRI->getNumRegs(), false); + if (Def) + RA.getInstClobberList(Point, Regs); + else + RA.getInstUsedRegsList(Point, Regs, false); + Regs &= this->BC.MIB->getAliases(*TrackingReg); + if (Regs.any()) + Next.set(this->ExprToIdx[&Point]); + } } return Next; } diff --git a/bolt/src/Passes/RegAnalysis.cpp b/bolt/src/Passes/RegAnalysis.cpp index 994da13838fc..14ef48b57c25 100644 --- a/bolt/src/Passes/RegAnalysis.cpp +++ b/bolt/src/Passes/RegAnalysis.cpp @@ -22,10 +22,14 @@ namespace llvm { namespace bolt { RegAnalysis::RegAnalysis(BinaryContext &BC, - std::map &BFs, - BinaryFunctionCallGraph &CG) - : BC(BC) { - CallGraphWalker CGWalker(CG); + std::map *BFs, + BinaryFunctionCallGraph *CG) + : BC(BC), CS(opts::AssumeABI ? 
ConservativeStrategy::CLOBBERS_ABI + : ConservativeStrategy::CLOBBERS_ALL) { + if (!CG) + return; + + CallGraphWalker CGWalker(*CG); CGWalker.registerVisitor([&](BinaryFunction *Func) -> bool { BitVector RegsKilled = getFunctionClobberList(Func); @@ -56,8 +60,11 @@ RegAnalysis::RegAnalysis(BinaryContext &BC, #endif } + if (!BFs) + return; + // This loop is for computing statistics only - for (auto &MapEntry : BFs) { + for (auto &MapEntry : *BFs) { auto *Func = &MapEntry.second; auto Iter = RegsKilledMap.find(Func); assert(Iter != RegsKilledMap.end() && @@ -89,25 +96,36 @@ RegAnalysis::RegAnalysis(BinaryContext &BC, } void RegAnalysis::beConservative(BitVector &Result) const { - if (!opts::AssumeABI) { + switch (CS) { + case ConservativeStrategy::CLOBBERS_ALL: Result.set(); - } else { + break; + case ConservativeStrategy::CLOBBERS_ABI: { BitVector BV(BC.MRI->getNumRegs(), false); BC.MIB->getCalleeSavedRegs(BV); BV.flip(); Result |= BV; } + case ConservativeStrategy::CLOBBERS_NONE: + Result.reset(); + break; + } } bool RegAnalysis::isConservative(BitVector &Vec) const { - if (!opts::AssumeABI) { + switch (CS) { + case ConservativeStrategy::CLOBBERS_ALL: return Vec.all(); - } else { + case ConservativeStrategy::CLOBBERS_ABI: { BitVector BV(BC.MRI->getNumRegs(), false); BC.MIB->getCalleeSavedRegs(BV); BV |= Vec; return BV.all(); } + case ConservativeStrategy::CLOBBERS_NONE: + return Vec.none(); + } + return false; } void RegAnalysis::getInstUsedRegsList(const MCInst &Inst, BitVector &RegSet, @@ -120,6 +138,12 @@ void RegAnalysis::getInstUsedRegsList(const MCInst &Inst, BitVector &RegSet, return; } + // If no call graph supplied... 
+ if (RegsKilledMap.size() == 0) { + beConservative(RegSet); + return; + } + const auto *TargetSymbol = BC.MIB->getTargetSymbol(Inst); // If indirect call, we know nothing if (TargetSymbol == nullptr) { diff --git a/bolt/src/Passes/RegAnalysis.h b/bolt/src/Passes/RegAnalysis.h index 5a6b9306381c..9c01fce5b8cb 100644 --- a/bolt/src/Passes/RegAnalysis.h +++ b/bolt/src/Passes/RegAnalysis.h @@ -26,24 +26,6 @@ namespace bolt { /// a call graph traversal to accurately extract the set of registers touched /// after the call returns. class RegAnalysis { - BinaryContext &BC; - - /// Map functions to the set of registers they may overwrite starting at when - /// it is called until it returns to the caller. - std::map RegsKilledMap; - - /// Similar concept above but for registers that are read in that function. - std::map RegsGenMap; - - /// Analysis stats counters - uint64_t NumFunctionsAllClobber{0}; - uint64_t CountFunctionsAllClobber{0}; - uint64_t CountDenominator{0}; - - /// Helper function used to get the set of clobbered/used regs whenever - /// we know nothing about the function. - void beConservative(BitVector &Result) const; - public: /// Compute the set of registers \p Func may read from during its execution. BitVector getFunctionUsedRegsList(const BinaryFunction *Func); @@ -54,8 +36,8 @@ class RegAnalysis { /// set of clobbered registers. BitVector getFunctionClobberList(const BinaryFunction *Func); - RegAnalysis(BinaryContext &BC, std::map &BFs, - BinaryFunctionCallGraph &CG); + RegAnalysis(BinaryContext &BC, std::map *BFs, + BinaryFunctionCallGraph *CG); /// Compute the set of registers \p Inst may read from, marking them in /// \p RegSet. If GetClobbers is true, the set set the instr may write to. @@ -72,8 +54,39 @@ class RegAnalysis { /// expressing no specific knowledge of reg usage. 
bool isConservative(BitVector &Vec) const; + + /// Set what to do when lacking information about a call + enum class ConservativeStrategy { + CLOBBERS_ALL, + CLOBBERS_ABI, + CLOBBERS_NONE + }; + void setConservativeStrategy(ConservativeStrategy S) { CS = S; } + /// Print stats about the quality of our analysis void printStats(); + +private: + BinaryContext &BC; + + /// Map functions to the set of registers they may overwrite starting at when + /// it is called until it returns to the caller. + std::map RegsKilledMap; + + /// Similar concept above but for registers that are read in that function. + std::map RegsGenMap; + + /// Analysis stats counters + uint64_t NumFunctionsAllClobber{0}; + uint64_t CountFunctionsAllClobber{0}; + uint64_t CountDenominator{0}; + + ConservativeStrategy CS; + + /// Helper function used to get the set of clobbered/used regs whenever + /// we know nothing about the function. + void beConservative(BitVector &Result) const; + }; } diff --git a/bolt/src/Passes/RegReAssign.cpp b/bolt/src/Passes/RegReAssign.cpp index 1c0f9e5ac636..402cc796c910 100644 --- a/bolt/src/Passes/RegReAssign.cpp +++ b/bolt/src/Passes/RegReAssign.cpp @@ -340,7 +340,7 @@ void RegReAssign::setupAggressivePass(BinaryContext &BC, std::map &BFs) { setupConservativePass(BC, BFs); CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs))); - RA.reset(new RegAnalysis(BC, BFs, *CG)); + RA.reset(new RegAnalysis(BC, &BFs, &*CG)); GPRegs = BitVector(BC.MRI->getNumRegs(), false); BC.MIB->getGPRegs(GPRegs); diff --git a/bolt/src/Passes/StokeInfo.cpp b/bolt/src/Passes/StokeInfo.cpp index cfb845432171..4ea885e72761 100644 --- a/bolt/src/Passes/StokeInfo.cpp +++ b/bolt/src/Passes/StokeInfo.cpp @@ -157,7 +157,7 @@ void StokeInfo::runOnFunctions( DEBUG(dbgs() << "\tgetNumRegs " << BC.MRI->getNumRegs() << "\n"); auto CG = buildCallGraph(BC, BFs); - RegAnalysis RA(BC, BFs, CG); + RegAnalysis RA(BC, &BFs, &CG); NumRegs = BC.MRI->getNumRegs(); assert(NumRegs > 0 && "STOKE-INFO: the 
target register number is incorrect!"); diff --git a/bolt/src/Passes/ValidateInternalCalls.cpp b/bolt/src/Passes/ValidateInternalCalls.cpp new file mode 100644 index 000000000000..8f8e3c08bc9c --- /dev/null +++ b/bolt/src/Passes/ValidateInternalCalls.cpp @@ -0,0 +1,347 @@ +//===--- Passes/ValidateInternalCalls.cpp ---------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "ValidateInternalCalls.h" +#include "Passes/DataflowInfoManager.h" + +#define DEBUG_TYPE "bolt-internalcalls" + +namespace llvm { +namespace bolt { + +namespace { + +// Helper used to extract the target basic block used in an internal call. +// Return nullptr if this is not an internal call target. +BinaryBasicBlock *getInternalCallTarget(BinaryFunction &Function, + const MCInst &Inst) { + const BinaryContext &BC = Function.getBinaryContext(); + if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || + !Inst.getOperand(0).isExpr()) + return nullptr; + + return Function.getBasicBlockForLabel(BC.MIB->getTargetSymbol(Inst)); +} + + +// A special StackPointerTracking that considers internal calls +class StackPointerTrackingForInternalCalls + : public StackPointerTrackingBase { + friend class DataflowAnalysis>; + + Optional AnnotationIndex; + +protected: + // We change the starting state to only consider the first block as an + // entry point, otherwise the analysis won't converge (there will be two valid + // stack offsets, one for an external call and another for an internal call). 
+ std::pair getStartingStateAtBB(const BinaryBasicBlock &BB) { + if (&BB == &*Func.begin()) + return std::make_pair(-8, getEmpty()); + return std::make_pair(getEmpty(), getEmpty()); + } + + // Here we decrement SP for internal calls too, in addition to the regular + // StackPointerTracking processing. + std::pair computeNext(const MCInst &Point, + const std::pair &Cur) { + std::pair Res = StackPointerTrackingBase< + StackPointerTrackingForInternalCalls>::computeNext(Point, Cur); + if (Res.first == StackPointerTracking::SUPERPOSITION || + Res.first == StackPointerTracking::EMPTY) + return Res; + + if (BC.MIB->isReturn(Point)) { + Res.first += 8; + return Res; + } + + BinaryBasicBlock *Target = getInternalCallTarget(Func, Point); + if (!Target) + return Res; + + Res.first -= 8; + return Res; + } + + StringRef getAnnotationName() const { + return StringRef("StackPointerTrackingForInternalCalls"); + } + +public: + StackPointerTrackingForInternalCalls(const BinaryContext &BC, + BinaryFunction &BF) + : StackPointerTrackingBase(BC, BF) { + } + + void run() { + NamedRegionTimer T1("SPTIC", "Stack Pointer Tracking for Internal Calls", + "Dataflow", "Dataflow", opts::TimeOpts); + StackPointerTrackingBase::run(); + } +}; + +} // end anonymous namespace + +bool ValidateInternalCalls::fixCFGForPIC(BinaryFunction &Function) const { + const BinaryContext &BC = Function.getBinaryContext(); + for (auto &BB : Function) { + for(auto II = BB.begin(); II != BB.end(); ++II) { + auto &Inst = *II; + auto *Target = getInternalCallTarget(Function, Inst); + if (!Target || BC.MIB->hasAnnotation(Inst, getProcessedICTag())) + continue; + + BC.MIB->addAnnotation(Inst, getProcessedICTag(), 0U); + auto MovedInsts = BB.splitInstructions(&Inst); + if (!MovedInsts.empty()) { + // Split this block at the call instruction. Create an unreachable + // block. 
+ std::vector> NewBBs; + NewBBs.emplace_back(Function.createBasicBlock(0)); + NewBBs.back()->addInstructions(MovedInsts.begin(), MovedInsts.end()); + BB.moveAllSuccessorsTo(NewBBs.back().get()); + Function.insertBasicBlocks(&BB, std::move(NewBBs)); + } + // Update successors + BB.removeAllSuccessors(); + BB.addSuccessor(Target, BB.getExecutionCount(), 0ULL); + return true; + } + } + return false; +} + +bool ValidateInternalCalls::fixCFGForIC(BinaryFunction &Function) const { + const BinaryContext &BC = Function.getBinaryContext(); + // Track SP value + StackPointerTrackingForInternalCalls SPTIC(BC, Function); + SPTIC.run(); + + // Track instructions reaching a given point of the CFG to answer + // "There is a path from entry to point A that contains instruction B" + ReachingInsns RI(BC, Function); + RI.run(); + + // We use the InsnToBB map that DataflowInfoManager provides us + DataflowInfoManager Info(BC, Function, nullptr, nullptr); + + bool Updated{false}; + + auto processReturns = [&] (BinaryBasicBlock &BB, MCInst &Return) { + // Check all reaching internal calls + for (auto I = RI.expr_begin(Return), E = RI.expr_end(); I != E; ++I) { + MCInst &ReachingInst = **I; + if (!getInternalCallTarget(Function, ReachingInst) || + BC.MIB->hasAnnotation(ReachingInst, getProcessedICTag())) + continue; + + // Stack pointer matching + int SPAtCall = SPTIC.getStateAt(ReachingInst)->first; + int SPAtRet = SPTIC.getStateAt(Return)->first; + if (SPAtCall != StackPointerTracking::SUPERPOSITION && + SPAtRet != StackPointerTracking::SUPERPOSITION && + SPAtCall != SPAtRet - 8) + continue; + + Updated = true; + + // Mark this call as processed, so we don't try to analyze it as a + // PIC-computation internal call. 
+ BC.MIB->addAnnotation(ReachingInst, getProcessedICTag(), 0U); + + // Connect this block with the returning block of the caller + BinaryBasicBlock *CallerBlock = Info.getInsnToBBMap()[&ReachingInst]; + BinaryBasicBlock *ReturnDestBlock = + Function.getBasicBlockAfter(CallerBlock); + BB.addSuccessor(ReturnDestBlock, BB.getExecutionCount(), 0); + } + }; + + // This will connect blocks terminated with RETs to their respective + // internal caller return block. A note here: this is overly conservative + // because in nested calls, or unrelated calls, it will create edges + // connecting RETs to potentially unrelated internal calls. This is safe + // and if this causes a problem to recover the stack offsets properly, we + // will fail later. + for (auto &BB : Function) { + for (auto &Inst : BB) { + if (!BC.MIB->isReturn(Inst)) + continue; + + processReturns(BB, Inst); + } + } + return Updated; +} + +bool ValidateInternalCalls::hasTailCallsInRange(BinaryFunction &Function) const { + const BinaryContext &BC = Function.getBinaryContext(); + for (auto &BB : Function) { + for (auto &Inst : BB) { + if (BC.MIB->isTailCall(Inst)) + return true; + } + } + return false; +} + +bool ValidateInternalCalls::analyzeFunction(BinaryFunction &Function) const { + while (fixCFGForPIC(Function)) {} + clearAnnotations(Function); + while (fixCFGForIC(Function)) {} + + BinaryContext &BC = Function.getBinaryContext(); + RegAnalysis RA = RegAnalysis(BC, nullptr, nullptr); + RA.setConservativeStrategy(RegAnalysis::ConservativeStrategy::CLOBBERS_NONE); + bool HasTailCalls = hasTailCallsInRange(Function); + + for (auto &BB : Function) { + for (auto &Inst : BB) { + auto *Target = getInternalCallTarget(Function, Inst); + if (!Target || BC.MIB->hasAnnotation(Inst, getProcessedICTag())) + continue; + + if (HasTailCalls) { + DEBUG(dbgs() << Function << " has tail calls and internal calls.\n"); + return false; + } + + FrameIndexEntry FIE; + int32_t SrcImm{0}; + MCPhysReg Reg{0}; + int64_t 
StackOffset{0}; + bool IsIndexed{false}; + auto *TargetInst = ProgramPoint::getFirstPointAt(*Target).getInst(); + if (!BC.MIB->isStackAccess(*TargetInst, FIE.IsLoad, FIE.IsStore, + FIE.IsStoreFromReg, Reg, SrcImm, + FIE.StackPtrReg, StackOffset, FIE.Size, + FIE.IsSimple, IsIndexed)) { + DEBUG({ + dbgs() << "Frame analysis failed - not simple: " << Function << "\n"; + Function.dump(); + }); + return false; + } + if (!FIE.IsLoad || FIE.StackPtrReg != BC.MIB->getStackPointer() || + StackOffset != 0) { + DEBUG({ + dbgs() << "Target instruction does not fetch return address - not " + "simple: " + << Function << "\n"; + Function.dump(); + }); + return false; + } + // Now track how the return address is used by tracking uses of Reg + ReachingDefOrUse RU = + ReachingDefOrUse(RA, BC, Function, Reg); + RU.run(); + + int64_t Offset = static_cast(Target->getInputOffset()); + bool UseDetected{false}; + for (auto I = RU.expr_begin(*RU.getStateBefore(*TargetInst)), + E = RU.expr_end(); + I != E; ++I) { + MCInst &Use = **I; + auto UsedRegs = BitVector(BC.MRI->getNumRegs(), false); + BC.MIB->getTouchedRegs(Use, UsedRegs); + if (!UsedRegs[Reg]) + continue; + UseDetected = true; + int64_t Output; + std::pair Input1 = std::make_pair(Reg, 0); + std::pair Input2 = std::make_pair(0, 0); + if (!BC.MIB->evaluateSimple(Use, Output, Input1, Input2)) { + DEBUG(dbgs() << "Evaluate simple failed.\n"); + return false; + } + if (Offset + Output < 0 || + Offset + Output > static_cast(Function.getSize())) { + DEBUG({ + dbgs() << "Detected out-of-range PIC reference in " << Function + << "\nReturn address load: "; + BC.InstPrinter->printInst(TargetInst, dbgs(), "", *BC.STI); + dbgs() << "\nUse: "; + BC.InstPrinter->printInst(&Use, dbgs(), "", *BC.STI); + dbgs() << "\n"; + Function.dump(); + }); + return false; + } + DEBUG({ + dbgs() << "Validated access: "; + BC.InstPrinter->printInst(&Use, dbgs(), "", *BC.STI); + dbgs() << "\n"; + }); + } + if (!UseDetected) { + DEBUG(dbgs() << "No use 
detected.\n"); + return false; + } + } + } + return true; +} + +void ValidateInternalCalls::runOnFunctions( + BinaryContext &BC, std::map &BFs, + std::set &LargeFunctions) { + if (!BC.isX86()) + return; + + // Look for functions that need validation. This should be pretty rare. + std::set NeedsValidation; + for (auto &BFI : BFs) { + BinaryFunction &Function = BFI.second; + for (auto &BB : Function) { + for(auto &Inst : BB) { + if (getInternalCallTarget(Function, Inst)) { + NeedsValidation.insert(&Function); + Function.setSimple(false); + break; + } + } + } + } + + // Skip validation for non-relocation mode + if (!BC.HasRelocations) + return; + + // Since few functions need validation, we can work with our most expensive + // algorithms here. Fix the CFG treating internal calls as unconditional + // jumps. This optimistically assumes this call is a PIC trick to get the PC + // value, so it is not really a call, but a jump. If we find that it's not the + // case, we mark this function as non-simple and stop processing it. + std::set Invalid; + for (auto *Function : NeedsValidation) { + DEBUG(dbgs() << "Validating " << Function << "\n"); + if (!analyzeFunction(*Function)) { + Invalid.insert(Function); + } + clearAnnotations(*Function); + } + + if (!Invalid.empty()) { + errs() << "BOLT-ERROR: Unsupported internal calls detected in the " + "following functions:\n"; + for (auto *Function : Invalid) { + errs() << " " << Function << "\n"; + } + errs() << "BOLT-ERROR: Unable to proceed in relocation mode\n"; + exit(1); + } +} + +} +} diff --git a/bolt/src/Passes/ValidateInternalCalls.h b/bolt/src/Passes/ValidateInternalCalls.h new file mode 100644 index 000000000000..de5e4b6e5c4b --- /dev/null +++ b/bolt/src/Passes/ValidateInternalCalls.h @@ -0,0 +1,106 @@ +//===--- Passes/ValidateInternalCalls.h -----------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_VALIDATEINTERNALCALLS_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_VALIDATEINTERNALCALLS_H + +#include "BinaryPasses.h" + +namespace llvm { +namespace bolt { + +/// Post-processing for internal calls. What are those? They are call +/// instructions that do not transfer control to another function, but +/// rather branch to a basic block inside the caller function itself. +/// This pass checks that the internal calls observed in a function are +/// manageable. We support two types: +/// +/// 1. Position Independent Code (PIC) tricks: in this type of internal +/// call, we don't really have a call because the return address is +/// not utilized for branching to, but only as a base address to +/// reference other objects. We call it a "trick" because this is not +/// the standard way a compiler would do this and this will often come +/// from awkwardly written assembly code. +/// +/// 2. Real internal calls: in this case, a function was inlined inside +/// a caller, but the CALL instruction wasn't removed. This pair of +/// caller-callee is treated as a single function and is analyzed +/// here. +/// +/// In general, the rest of the BOLT pipeline (other optimizations, including +/// code reordering) will not support neither of these cases. In this pass, +/// we just identify them, verify they are safe (do not reference objects +/// that will be moved after reordering) and freeze these functions in the +/// way they were read. We do this by marking them as non-simple. +/// +/// Why do we freeze them? +/// +/// Type 1 is not safe to optimize because any changed offsets will break the +/// PIC references made in this code. 
Type 2 is not safe to optimize because +/// it requires BOLT to understand a new CFG format where internal calls are +/// broken into two BBs (calling block and returning block), and we currently do +/// not support this elsewhere. Only this pass is able to make sense of these +/// non-canonical CFGs (specifically, fixBranches does not support them). +/// +class ValidateInternalCalls : public BinaryFunctionPass { +public: + explicit ValidateInternalCalls(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) {} + + const char *getName() const override { + return "validate-internal-calls"; + } + + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; + +private: + /// Fix the CFG to take into consideration internal calls that do not + /// return, but are only used as a trick to perform Position Independent + /// Code (PIC) computations. This will change internal calls to be treated + /// as unconditional jumps. + bool fixCFGForPIC(BinaryFunction &Function) const; + + /// Fix the CFG to take into consideration real internal calls (whole + /// functions that got inlined inside its caller, but the CALL instruction + /// wasn't removed). + bool fixCFGForIC(BinaryFunction &Function) const; + + /// Detect tail calls in the range of the PIC access and fail to validate if + /// one is detected. Tail calls are dangerous because they may be emitted + /// with a different size in comparison with the original code. + /// FIXME: shortenInstructions and NOP sizes can impact offsets too + bool hasTailCallsInRange(BinaryFunction &Function) const; + + /// Check that the PIC computations performed by Type 1 internal calls are + /// safe + bool analyzeFunction(BinaryFunction &Function) const; + + /// The annotation tag we use to keep track of internal calls we already + /// processed. 
+ StringRef getProcessedICTag() const { + return "ProcessedInternalCall"; + } + + void clearAnnotations(BinaryFunction &Function) const { + const BinaryContext &BC = Function.getBinaryContext(); + for (auto &BB : Function) { + for (auto &Inst : BB) { + BC.MIB->removeAnnotation(Inst, getProcessedICTag()); + } + } + } +}; + +} +} + +#endif diff --git a/bolt/src/Target/X86/X86MCPlusBuilder.cpp b/bolt/src/Target/X86/X86MCPlusBuilder.cpp index 414486079d01..3dfe7f3dc81b 100644 --- a/bolt/src/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/src/Target/X86/X86MCPlusBuilder.cpp @@ -1610,6 +1610,14 @@ class X86MCPlusBuilder : public MCPlusBuilder { return false; } break; + case X86::ADD64i32: + assert(Inst.getOperand(0).isImm()); + if (auto InputVal = getOperandVal(X86::RAX)) { + Output = *InputVal + Inst.getOperand(0).getImm(); + } else { + return false; + } + break; case X86::LEA64r: { unsigned BaseRegNum; From 5fc5b8346536c3622faadc85171a2a4cf0cbf309 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Sat, 30 Jun 2018 13:30:47 -0700 Subject: [PATCH 441/904] [BOLT] Allow jump tables with 2 entries Summary: GCC 8 can generate jump tables with just 2 entries. Modify our heuristic to accept it. We still assert that there's more than one entry. 
(cherry picked from commit bcdfe3f8357aea025457d1828dc476f7b778b7a3) --- bolt/src/BinaryFunction.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 50064ca00756..4dfc930c2df0 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -867,8 +867,8 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, } if (Type == IndirectBranchType::POSSIBLE_JUMP_TABLE || Type == IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE) { - assert(JTOffsetCandidates.size() > 2 && - "expected more than 2 jump table entries"); + assert(JTOffsetCandidates.size() > 1 && + "expected more than one jump table entry"); auto JumpTableName = generateJumpTableName(ArrayStart); auto JumpTableType = From 7c71f00d451c92add76b16adb02d7de963f1f08e Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 29 Jun 2018 20:30:36 -0700 Subject: [PATCH 442/904] [LLVM] Accept `S` in augmentation strings in CIE Summary: Ignore 'S' in augmentation string on input. It just marks a signal frame. All we have to do is propagate it. Fixes #21 This was already in LLVM trunk rL331738. Update llvm.patch. 
(cherry picked from commit 384b3267a2251462192fba2d195ae41b1a7345a0) --- bolt/llvm.patch | 152 +++++++++++++++++++++++++----------------------- 1 file changed, 80 insertions(+), 72 deletions(-) diff --git a/bolt/llvm.patch b/bolt/llvm.patch index 94821280453d..5722429b2162 100644 --- a/bolt/llvm.patch +++ b/bolt/llvm.patch @@ -1,5 +1,5 @@ diff --git a/include/llvm/ADT/BitVector.h b/include/llvm/ADT/BitVector.h -index 124c2a8..03af230 100644 +index 124c2a8c86d..03af230f2e7 100644 --- a/include/llvm/ADT/BitVector.h +++ b/include/llvm/ADT/BitVector.h @@ -591,6 +591,11 @@ public: @@ -15,7 +15,7 @@ index 124c2a8..03af230 100644 if (size() < RHS.size()) resize(RHS.size()); diff --git a/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h b/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h -index 84b2339..9ed1792 100644 +index 84b23398b8c..9ed1792f0c9 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h +++ b/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h @@ -28,12 +28,15 @@ class raw_ostream; @@ -67,7 +67,7 @@ index 84b2339..9ed1792 100644 bool extract(DataExtractor Data, uint32_t* OffsetPtr); void dump(raw_ostream &OS) const; diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h -index e842cf2..83b0dbe 100644 +index e842cf231e7..83b0dbe0676 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFContext.h +++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h @@ -225,6 +225,9 @@ public: @@ -109,7 +109,7 @@ index e842cf2..83b0dbe 100644 DWARFCompileUnit *getCompileUnitForOffset(uint32_t Offset); diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h b/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h -index ff1c7fb..2622a4e 100644 +index ff1c7fb3838..2622a4e7eef 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h @@ -16,6 +16,7 @@ @@ -156,7 +156,7 @@ index ff1c7fb..2622a4e 100644 }; diff --git 
a/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h b/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h -index a6d319a..39674a9 100644 +index a6d319a9045..39674a9d499 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h @@ -68,6 +68,9 @@ public: @@ -170,7 +170,7 @@ index a6d319a..39674a9 100644 uint32_t *Offset); }; diff --git a/include/llvm/DebugInfo/DWARF/DWARFDie.h b/include/llvm/DebugInfo/DWARF/DWARFDie.h -index 39a3dd3..8427987 100644 +index 39a3dd32c0f..84279875611 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDie.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDie.h @@ -130,7 +130,8 @@ public: @@ -184,7 +184,7 @@ index 39a3dd3..8427987 100644 /// Extract the first value of any attribute in Attrs from this DIE. /// diff --git a/include/llvm/DebugInfo/DWARF/DWARFObject.h b/include/llvm/DebugInfo/DWARF/DWARFObject.h -index 795eddd..43243e7 100644 +index 795eddd1c5d..43243e70474 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFObject.h +++ b/include/llvm/DebugInfo/DWARF/DWARFObject.h @@ -41,6 +41,7 @@ public: @@ -196,7 +196,7 @@ index 795eddd..43243e7 100644 virtual StringRef getLineStringSection() const { return ""; } virtual StringRef getStringSection() const { return ""; } diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h -index 7932688..51bf471 100644 +index 7932688290e..51bf4719f6c 100644 --- a/include/llvm/ExecutionEngine/ExecutionEngine.h +++ b/include/llvm/ExecutionEngine/ExecutionEngine.h @@ -251,6 +251,16 @@ public: @@ -217,7 +217,7 @@ index 7932688..51bf471 100644 /// load it into memory. 
/// diff --git a/include/llvm/ExecutionEngine/JITSymbol.h b/include/llvm/ExecutionEngine/JITSymbol.h -index 86ab173..257ed03 100644 +index 86ab17363e1..257ed03371b 100644 --- a/include/llvm/ExecutionEngine/JITSymbol.h +++ b/include/llvm/ExecutionEngine/JITSymbol.h @@ -297,7 +297,17 @@ public: @@ -239,7 +239,7 @@ index 86ab173..257ed03 100644 }; diff --git a/include/llvm/ExecutionEngine/Orc/Core.h b/include/llvm/ExecutionEngine/Orc/Core.h -index 26fec8b..c533003 100644 +index 26fec8b359f..c5330034335 100644 --- a/include/llvm/ExecutionEngine/Orc/Core.h +++ b/include/llvm/ExecutionEngine/Orc/Core.h @@ -110,7 +110,17 @@ public: @@ -261,7 +261,7 @@ index 26fec8b..c533003 100644 }; diff --git a/include/llvm/ExecutionEngine/Orc/Legacy.h b/include/llvm/ExecutionEngine/Orc/Legacy.h -index b2b389a..7c108ef 100644 +index b2b389ad339..7c108ef848f 100644 --- a/include/llvm/ExecutionEngine/Orc/Legacy.h +++ b/include/llvm/ExecutionEngine/Orc/Legacy.h @@ -25,6 +25,10 @@ public: @@ -302,7 +302,7 @@ index b2b389a..7c108ef 100644 private: diff --git a/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h b/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h -index cfc3922..c0b43ce 100644 +index cfc3922ebb5..c0b43ce8639 100644 --- a/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h +++ b/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h @@ -80,6 +80,12 @@ public: @@ -319,7 +319,7 @@ index cfc3922..c0b43ce 100644 TransformFtor &getTransform() { return Transform; } diff --git a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h -index 8f0d9fa..ada93a2 100644 +index 8f0d9fa6eb6..ada93a275e5 100644 --- a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h +++ b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h @@ -62,6 +62,8 @@ protected: @@ -380,7 +380,7 @@ index 8f0d9fa..ada93a2 100644 /// VModuleKey. /// @param K VModuleKey for object to emit/finalize. 
diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h -index 14da5af..27b0243 100644 +index 14da5af0206..27b02437b98 100644 --- a/include/llvm/ExecutionEngine/RuntimeDyld.h +++ b/include/llvm/ExecutionEngine/RuntimeDyld.h @@ -112,6 +112,14 @@ public: @@ -424,7 +424,7 @@ index 14da5af..27b0243 100644 /// registered with the memory manager. Note, RuntimeDyld is responsible /// for identifying the EH frame and calling the memory manager with the diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h -index c538c46..7b16897 100644 +index c538c46fc07..7b168971a3d 100644 --- a/include/llvm/MC/MCAsmInfo.h +++ b/include/llvm/MC/MCAsmInfo.h @@ -225,6 +225,10 @@ protected: @@ -447,7 +447,7 @@ index c538c46..7b16897 100644 bool doesSetDirectiveSuppressReloc() const { diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h -index c110ffd..a29f320 100644 +index c110ffd3a77..a29f32091ca 100644 --- a/include/llvm/MC/MCContext.h +++ b/include/llvm/MC/MCContext.h @@ -506,6 +506,10 @@ namespace llvm { @@ -478,7 +478,7 @@ index c110ffd..a29f320 100644 } diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h -index 5cdb176..cd46632 100644 +index 5cdb176e8e2..cd466322894 100644 --- a/include/llvm/MC/MCDwarf.h +++ b/include/llvm/MC/MCDwarf.h @@ -73,6 +73,7 @@ class MCDwarfLoc { @@ -727,7 +727,7 @@ index 5cdb176..cd46632 100644 assert(Operation == OpEscape); return StringRef(&Values[0], Values.size()); diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h -index fcbbe65..25847aa 100644 +index fcbbe650d26..25847aa2946 100644 --- a/include/llvm/MC/MCExpr.h +++ b/include/llvm/MC/MCExpr.h @@ -123,6 +123,9 @@ public: @@ -741,7 +741,7 @@ index fcbbe65..25847aa 100644 }; diff --git a/include/llvm/MC/MCFragment.h b/include/llvm/MC/MCFragment.h -index 38c3655..dec2957 100644 +index 38c365538e3..dec295707e9 100644 --- a/include/llvm/MC/MCFragment.h +++ b/include/llvm/MC/MCFragment.h @@ -34,6 +34,7 
@@ class MCFragment : public ilist_node_with_parent { @@ -800,7 +800,7 @@ index 38c3655..dec2957 100644 /// This fragment is always inserted before an instruction, and holds that /// instruction as context information (as well as a mask of kinds) for diff --git a/include/llvm/MC/MCInst.h b/include/llvm/MC/MCInst.h -index db28fd0..e136a10 100644 +index db28fd0fd6d..e136a10b264 100644 --- a/include/llvm/MC/MCInst.h +++ b/include/llvm/MC/MCInst.h @@ -187,7 +187,7 @@ public: @@ -813,7 +813,7 @@ index db28fd0..e136a10 100644 iterator begin() { return Operands.begin(); } const_iterator begin() const { return Operands.begin(); } diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h -index c99f252..e6b4a88 100644 +index c99f2521f8f..e6b4a88f469 100644 --- a/include/llvm/MC/MCObjectFileInfo.h +++ b/include/llvm/MC/MCObjectFileInfo.h @@ -65,6 +65,9 @@ protected: @@ -835,7 +835,7 @@ index c99f252..e6b4a88 100644 MCSection *getCompactUnwindSection() const { return CompactUnwindSection; } MCSection *getDwarfAbbrevSection() const { return DwarfAbbrevSection; } diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h -index 8e9b4ac..d2c569e 100644 +index 8e9b4ac5632..d2c569e3399 100644 --- a/include/llvm/MC/MCObjectStreamer.h +++ b/include/llvm/MC/MCObjectStreamer.h @@ -121,6 +121,8 @@ public: @@ -848,7 +848,7 @@ index 8e9b4ac..d2c569e 100644 SMLoc Loc) override; void diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h -index 582a836..0b15454 100644 +index 582a836023b..0b15454ecd6 100644 --- a/include/llvm/MC/MCStreamer.h +++ b/include/llvm/MC/MCStreamer.h @@ -199,7 +199,7 @@ class MCStreamer { @@ -916,7 +916,7 @@ index 582a836..0b15454 100644 /// Returns true if the relocation could not be emitted because Name is not /// known. 
diff --git a/include/llvm/MC/MCSymbol.h b/include/llvm/MC/MCSymbol.h -index cc8fc02..7b7835e 100644 +index cc8fc02968a..7b7835e83d8 100644 --- a/include/llvm/MC/MCSymbol.h +++ b/include/llvm/MC/MCSymbol.h @@ -120,10 +120,15 @@ protected: @@ -975,7 +975,7 @@ index cc8fc02..7b7835e 100644 } diff --git a/include/llvm/Object/COFF.h b/include/llvm/Object/COFF.h -index 9190149..25646fe 100644 +index 9190149f382..25646fe0241 100644 --- a/include/llvm/Object/COFF.h +++ b/include/llvm/Object/COFF.h @@ -899,6 +899,7 @@ protected: @@ -987,7 +987,7 @@ index 9190149..25646fe 100644 relocation_iterator section_rel_end(DataRefImpl Sec) const override; diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h -index 46504e7..836fd8d 100644 +index 46504e74bc2..836fd8ddc45 100644 --- a/include/llvm/Object/ELF.h +++ b/include/llvm/Object/ELF.h @@ -127,6 +127,18 @@ public: @@ -1009,10 +1009,11 @@ index 46504e7..836fd8d 100644 Expected sections() const; Expected symbols(const Elf_Shdr *Sec) const { -@@ -397,6 +409,34 @@ void ELFFile::getRelocationTypeName(uint32_t Type, +@@ -396,6 +408,34 @@ void ELFFile::getRelocationTypeName(uint32_t Type, + } } - template ++template +Expected::Elf_Dyn *> +ELFFile::dynamic_table_begin(const Elf_Phdr *Phdr) const { + if (!Phdr) @@ -1040,12 +1041,11 @@ index 46504e7..836fd8d 100644 + return reinterpret_cast(base() + End); +} + -+template + template Expected ELFFile::getRelocationSymbol(const Elf_Rel *Rel, - const Elf_Shdr *SymTab) const { diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h -index 4d00103..06a6295 100644 +index 4d001039238..06a629573cc 100644 --- a/include/llvm/Object/ELFObjectFile.h +++ b/include/llvm/Object/ELFObjectFile.h @@ -254,6 +254,7 @@ protected: @@ -1056,10 +1056,11 @@ index 4d00103..06a6295 100644 relocation_iterator section_rel_begin(DataRefImpl Sec) const override; relocation_iterator section_rel_end(DataRefImpl Sec) const override; section_iterator getRelocatedSection(DataRefImpl 
Sec) const override; -@@ -717,6 +718,14 @@ bool ELFObjectFile::isSectionVirtual(DataRefImpl Sec) const { +@@ -716,6 +717,14 @@ bool ELFObjectFile::isSectionVirtual(DataRefImpl Sec) const { + return getSection(Sec)->sh_type == ELF::SHT_NOBITS; } - template ++template +bool ELFObjectFile::isSectionReadOnly(DataRefImpl Sec) const { + const Elf_Shdr *EShdr = getSection(Sec); + return EShdr->sh_flags & ELF::SHF_ALLOC && @@ -1067,10 +1068,9 @@ index 4d00103..06a6295 100644 + EShdr->sh_type == ELF::SHT_PROGBITS; +} + -+template + template relocation_iterator ELFObjectFile::section_rel_begin(DataRefImpl Sec) const { - DataRefImpl RelData; @@ -751,9 +760,6 @@ ELFObjectFile::section_rel_end(DataRefImpl Sec) const { template section_iterator @@ -1091,7 +1091,7 @@ index 4d00103..06a6295 100644 if (sec->sh_type == ELF::SHT_REL) return getRel(Rel)->r_offset; diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h -index bfd3462..9be0b26 100644 +index bfd3462bf69..9be0b260f34 100644 --- a/include/llvm/Object/MachO.h +++ b/include/llvm/Object/MachO.h @@ -320,6 +320,7 @@ public: @@ -1112,7 +1112,7 @@ index bfd3462..9be0b26 100644 uint64_t getRelocationOffset(DataRefImpl Rel) const override; symbol_iterator getRelocationSymbol(DataRefImpl Rel) const override; diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h -index 9c4ae94..6434272 100644 +index 9c4ae94d3a6..64342723371 100644 --- a/include/llvm/Object/ObjectFile.h +++ b/include/llvm/Object/ObjectFile.h @@ -110,6 +110,7 @@ public: @@ -1143,7 +1143,7 @@ index 9c4ae94..6434272 100644 return OwningObject->section_rel_begin(SectionPimpl); } diff --git a/include/llvm/Object/Wasm.h b/include/llvm/Object/Wasm.h -index d49acf3a..5929a22 100644 +index d49acf3a38a..5929a22e1f3 100644 --- a/include/llvm/Object/Wasm.h +++ b/include/llvm/Object/Wasm.h @@ -177,6 +177,7 @@ public: @@ -1155,7 +1155,7 @@ index d49acf3a..5929a22 100644 bool isSectionBitcode(DataRefImpl Sec) const override; 
relocation_iterator section_rel_begin(DataRefImpl Sec) const override; diff --git a/include/llvm/Support/ToolOutputFile.h b/include/llvm/Support/ToolOutputFile.h -index 7fd5f20..2a47ef1 100644 +index 7fd5f20ee4e..2a47ef1bfdb 100644 --- a/include/llvm/Support/ToolOutputFile.h +++ b/include/llvm/Support/ToolOutputFile.h @@ -46,7 +46,7 @@ public: @@ -1168,7 +1168,7 @@ index 7fd5f20..2a47ef1 100644 ToolOutputFile(StringRef Filename, int FD); diff --git a/include/llvm/Support/X86DisassemblerDecoderCommon.h b/include/llvm/Support/X86DisassemblerDecoderCommon.h -index eeffb9c..2ec2496 100644 +index eeffb9c0167..2ec249671eb 100644 --- a/include/llvm/Support/X86DisassemblerDecoderCommon.h +++ b/include/llvm/Support/X86DisassemblerDecoderCommon.h @@ -62,7 +62,8 @@ namespace X86Disassembler { @@ -1192,7 +1192,7 @@ index eeffb9c..2ec2496 100644 #define ENUM_ENTRY(n, r, d) n, enum InstructionContext { diff --git a/include/llvm/Support/raw_ostream.h b/include/llvm/Support/raw_ostream.h -index d11f5a8..0ad115c 100644 +index d11f5a83779..0ad115c886b 100644 --- a/include/llvm/Support/raw_ostream.h +++ b/include/llvm/Support/raw_ostream.h @@ -393,7 +393,7 @@ public: @@ -1205,7 +1205,7 @@ index d11f5a8..0ad115c 100644 /// FD is the file descriptor that this writes to. If ShouldClose is true, /// this closes the file when the stream is destroyed. 
If FD is for stdout or diff --git a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp -index adada67..c9c7997 100644 +index adada672af0..c9c79971a25 100644 --- a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp +++ b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp @@ -38,7 +38,7 @@ DWARFAbbreviationDeclaration::DWARFAbbreviationDeclaration() { @@ -1278,7 +1278,7 @@ index adada67..c9c7997 100644 if (Spec.isImplicitConst()) { FormValue.setSValue(Spec.getImplicitConstValue()); diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp -index 3a974dd..65bd4a6 100644 +index 3a974dddc4e..65bd4a69db8 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -681,6 +681,15 @@ const DWARFDebugLoc *DWARFContext::getDebugLoc() { @@ -1399,7 +1399,7 @@ index 3a974dd..65bd4a6 100644 } diff --git a/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp b/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp -index 03e3174..0436778 100644 +index 03e31746139..0436778e2e4 100644 --- a/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp +++ b/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp @@ -7,6 +7,7 @@ @@ -1411,7 +1411,7 @@ index 03e3174..0436778 100644 #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" diff --git a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp -index b9dc215..40a5790 100644 +index b9dc2151e06..9a4a6f024bb 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp @@ -352,7 +352,8 @@ static void LLVM_ATTRIBUTE_NORETURN ReportError(uint32_t StartOffset, @@ -1434,7 +1434,7 @@ index b9dc215..40a5790 100644 } // At this point, Offset points to the next field after Length. 
-@@ -425,6 +429,8 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { +@@ -425,11 +429,16 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { Personality = Data.getEncodedPointer( &Offset, *PersonalityEncoding, EHFrameAddress ? EHFrameAddress + Offset : 0); @@ -1443,7 +1443,15 @@ index b9dc215..40a5790 100644 break; } case 'R': -@@ -478,6 +484,8 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { + FDEPointerEncoding = Data.getU8(&Offset); + break; ++ case 'S': ++ // Current frame is a signal trampoline. ++ break; + case 'z': + if (i) + ReportError(StartOffset, +@@ -478,6 +487,8 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { EHFrameAddress ? EHFrameAddress + Offset : 0)) { InitialLocation = *Val; } @@ -1452,7 +1460,7 @@ index b9dc215..40a5790 100644 if (auto Val = Data.getEncodedPointer( &Offset, Cie->getFDEPointerEncoding(), 0)) { AddressRange = *Val; -@@ -496,6 +504,8 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { +@@ -496,6 +507,8 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { LSDAAddress = Data.getEncodedPointer( &Offset, Cie->getLSDAPointerEncoding(), EHFrameAddress ? 
Offset + EHFrameAddress : 0); @@ -1461,7 +1469,7 @@ index b9dc215..40a5790 100644 } if (Offset != EndAugmentationOffset) -@@ -531,6 +541,13 @@ FrameEntry *DWARFDebugFrame::getEntryAtOffset(uint64_t Offset) const { +@@ -531,6 +544,13 @@ FrameEntry *DWARFDebugFrame::getEntryAtOffset(uint64_t Offset) const { return nullptr; } @@ -1476,7 +1484,7 @@ index b9dc215..40a5790 100644 Optional Offset) const { if (Offset) { diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp -index 7ae38e6..f1fd34a 100644 +index 7ae38e6e053..f1fd34af238 100644 --- a/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -270,12 +270,13 @@ bool DWARFDie::isSubroutineDIE() const { @@ -1496,7 +1504,7 @@ index 7ae38e6..f1fd34a 100644 } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp -index 3d274b6..cef29f4 100644 +index 3d274b63a4f..cef29f4b41d 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -175,6 +175,12 @@ static Error getOffset(const SymbolRef &Sym, SectionRef Sec, @@ -1560,7 +1568,7 @@ index 3d274b6..cef29f4 100644 StringRef RuntimeDyld::getErrorString() { return Dyld->getErrorString(); } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp -index 36b43ec9..3dc3e8f 100644 +index 36b43ec9b78..3dc3e8f325c 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -270,6 +270,25 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section, @@ -1692,7 +1700,7 @@ index 36b43ec9..3dc3e8f 100644 } else if (RelType == ELF::R_X86_64_GOTPCREL || RelType == ELF::R_X86_64_GOTPCRELX || diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h -index 766a9b2..a36c791 100644 +index 766a9b21cb1..a36c791c843 100644 --- 
a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -540,6 +540,8 @@ public: @@ -1705,7 +1713,7 @@ index 766a9b2..a36c791 100644 bool hasError() { return HasError; } diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp -index a0f9a85..be32963 100644 +index a0f9a857e3c..be32963b705 100644 --- a/lib/MC/MCAssembler.cpp +++ b/lib/MC/MCAssembler.cpp @@ -318,6 +318,34 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, @@ -1792,7 +1800,7 @@ index a0f9a85..be32963 100644 assert((cast(F).getValue() == 0) && "Invalid fill in virtual section!"); diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp -index 0e0ea96..0044566 100644 +index 0e0ea965d14..0044566d9ab 100644 --- a/lib/MC/MCDwarf.cpp +++ b/lib/MC/MCDwarf.cpp @@ -41,6 +41,7 @@ @@ -2165,7 +2173,7 @@ index 0e0ea96..0044566 100644 Streamer.EmitBytes(Instr.getValues()); return; diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp -index 65fbe8e..4b32cd7 100644 +index 65fbe8e8428..4b32cd7c586 100644 --- a/lib/MC/MCExpr.cpp +++ b/lib/MC/MCExpr.cpp @@ -834,3 +834,7 @@ MCFragment *MCExpr::findAssociatedFragment() const { @@ -2177,7 +2185,7 @@ index 65fbe8e..4b32cd7 100644 + return cast(this)->getSymbol(); +} diff --git a/lib/MC/MCFragment.cpp b/lib/MC/MCFragment.cpp -index 1aed50a..e740a0d 100644 +index 1aed50aaeb7..e740a0d304a 100644 --- a/lib/MC/MCFragment.cpp +++ b/lib/MC/MCFragment.cpp @@ -254,6 +254,9 @@ void MCFragment::destroy() { @@ -2227,7 +2235,7 @@ index 1aed50a..e740a0d 100644 } OS << "]>\n"; diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp -index 83da8ac..820aa68 100644 +index 83da8ac1bae..820aa688e5f 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -480,6 +480,9 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) { @@ -2241,7 +2249,7 @@ index 83da8ac..820aa68 100644 Ctx->getELFSection(".tdata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_TLS | 
ELF::SHF_WRITE); diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp -index 0a68458..58199c9 100644 +index 0a684588110..58199c97420 100644 --- a/lib/MC/MCObjectStreamer.cpp +++ b/lib/MC/MCObjectStreamer.cpp @@ -494,6 +494,13 @@ void MCObjectStreamer::EmitCodeAlignment(unsigned ByteAlignment, @@ -2259,7 +2267,7 @@ index 0a68458..58199c9 100644 unsigned char Value, SMLoc Loc) { diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp -index 7765698..0954b70 100644 +index 776569894a5..0954b70df49 100644 --- a/lib/MC/MCStreamer.cpp +++ b/lib/MC/MCStreamer.cpp @@ -85,11 +85,15 @@ void MCStreamer::reset() { @@ -2336,7 +2344,7 @@ index 7765698..0954b70 100644 SMLoc Loc) {} void MCStreamer::EmitBundleAlignMode(unsigned AlignPow2) {} diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp -index b544fa5..746c9f3 100644 +index b544fa5c147..746c9f32865 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -339,11 +339,16 @@ unsigned COFFObjectFile::getSectionID(SectionRef Sec) const { @@ -2358,7 +2366,7 @@ index b544fa5..746c9f3 100644 MemoryBufferRef M, const uint8_t *base) { // The field for the number of relocations in COFF section table is only diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp -index adc54b4..2fcc293 100644 +index adc54b42eba..2fcc293a14b 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -1986,6 +1986,11 @@ bool MachOObjectFile::isSectionStripped(DataRefImpl Sec) const { @@ -2374,7 +2382,7 @@ index adc54b4..2fcc293 100644 DataRefImpl Ret; Ret.d.a = Sec.d.a; diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp -index 0c78631..c0dac22 100644 +index 0c78631da25..c0dac222978 100644 --- a/lib/Object/WasmObjectFile.cpp +++ b/lib/Object/WasmObjectFile.cpp @@ -1140,6 +1140,8 @@ bool WasmObjectFile::isSectionBSS(DataRefImpl Sec) const { return false; } @@ -2387,7 +2395,7 @@ index 0c78631..c0dac22 100644 
relocation_iterator WasmObjectFile::section_rel_begin(DataRefImpl Ref) const { diff --git a/lib/Support/ToolOutputFile.cpp b/lib/Support/ToolOutputFile.cpp -index e12d9e8..1c74d40 100644 +index e12d9e824f7..1c74d40f094 100644 --- a/lib/Support/ToolOutputFile.cpp +++ b/lib/Support/ToolOutputFile.cpp @@ -35,8 +35,8 @@ ToolOutputFile::CleanupInstaller::~CleanupInstaller() { @@ -2402,7 +2410,7 @@ index e12d9e8..1c74d40 100644 if (EC) Installer.Keep = true; diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp -index e026111..9455379 100644 +index e0261110308..94553799b22 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -490,7 +490,7 @@ void format_object_base::home() { @@ -2435,7 +2443,7 @@ index e026111..9455379 100644 /// FD is the file descriptor that this writes to. If ShouldClose is true, this /// closes the file when the stream is destroyed. diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt -index ed79f4f..95cb71f 100644 +index ed79f4fec4e..95cb71fb867 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -19,6 +19,7 @@ if (X86_GEN_FOLD_TABLES) @@ -2447,7 +2455,7 @@ index ed79f4f..95cb71f 100644 set(sources X86AsmPrinter.cpp diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp -index c58254a..ab9241e 100644 +index c58254ae38c..ab9241e5530 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -247,6 +247,8 @@ MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction( @@ -2460,7 +2468,7 @@ index c58254a..ab9241e 100644 Instr.setFlags(Flags); } diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp -index 6a10278..626b143 100644 +index 6a10278dc7f..626b1439871 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ 
b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -298,6 +298,9 @@ static bool isREX(struct InternalInstruction *insn, uint8_t prefix) { @@ -2483,7 +2491,7 @@ index 6a10278..626b143 100644 // If EVEX.v2 is set this is one of the 16-31 registers. if (insn->vectorExtensionType == TYPE_EVEX && diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h -index 44422a9..d60aa3f 100644 +index 44422a95f16..d60aa3fd198 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -563,6 +563,8 @@ struct InternalInstruction { @@ -2496,7 +2504,7 @@ index 44422a9..d60aa3f 100644 uint8_t repeatPrefix; diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp -index fa7c352..35d28c1 100644 +index fa7c352a1b6..35d28c19fc2 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -46,6 +46,8 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { @@ -2535,7 +2543,7 @@ index fa7c352..35d28c1 100644 UseIntegratedAssembler = true; } diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td -index 9fba65c..1c8eb27 100644 +index 9fba65c6cf6..1c8eb2708fc 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -200,7 +200,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { @@ -2568,7 +2576,7 @@ index 9fba65c..1c8eb27 100644 // Conditional tail calls are similar to the above, but they are branches diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td -index f25f1b0..8a36933 100644 +index f25f1b0e8e4..8a369331256 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -29,7 +29,8 @@ let mayLoad = 1, mayStore = 0, hasSideEffects = 1 in { From b516ebf9474fa38cd2e8393a8ff66731208fb439 Mon Sep 17 00:00:00 
2001 From: Maksim Panchenko Date: Fri, 29 Jun 2018 21:12:55 -0700 Subject: [PATCH 443/904] [BOLT] Reject processing of PIE binaries Summary: Check if the input binary ELF type. Reject any binary not of ET_EXEC type, including position-independent executables (PIEs). Also print the first function containing PIC jump table. (cherry picked from commit b998bacffdee96147c03d29d6613c0e432339ebd) --- bolt/src/BinaryFunction.cpp | 2 +- bolt/src/RewriteInstance.cpp | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 4dfc930c2df0..10e1e65a29d6 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -1407,7 +1407,7 @@ void BinaryFunction::postProcessJumpTables() { if (JT.Type == JumpTable::JTT_PIC && opts::JumpTables == JTS_BASIC) { opts::JumpTables = JTS_MOVE; outs() << "BOLT-INFO: forcing -jump-tables=move as PIC jump table was " - "detected\n"; + "detected in function " << *this << '\n'; } for (unsigned I = 0; I < JT.OffsetEntries.size(); ++I) { auto Offset = JT.OffsetEntries[I]; diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 158949abf43d..990c7c6ee1b9 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -804,6 +804,11 @@ void RewriteInstance::discoverStorage() { exit(1); } auto Obj = ELF64LEFile->getELFFile(); + if (Obj->getHeader()->e_type != ELF::ET_EXEC) { + errs() << "BOLT-ERROR: only non-PIE ELF executables are supported at the " + "moment.\n"; + exit(1); + } EntryPoint = Obj->getHeader()->e_entry; From 853ba8b2a9875db2ac35d9ffef06a61c193f4c3a Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 4 Jul 2018 10:33:26 -0700 Subject: [PATCH 444/904] [BOLT] Fix no-assertions build Summary: In release build without assertions MCInst::dump() is undefined and causes link time failure. Fixes #27. 
(cherry picked from commit 8e92e7d57dde634a976daf7b04de21840f139db8) --- bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp index 6b5bc2b8f94a..44d1edd6857b 100644 --- a/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -691,7 +691,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { Inst.getOperand(0).getImm() != AArch64CC::NV && "Can't reverse ALWAYS cond code"); } else { - Inst.dump(); + DEBUG(Inst.dump()); llvm_unreachable("Unrecognized branch instruction"); } return replaceBranchTarget(Inst, TBB, Ctx); From 3189aaca9f24ee47eafc6b470ceb8e86868063b2 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 5 Jul 2018 14:21:49 -0700 Subject: [PATCH 445/904] [DebugInfo] Change default value of FDEPointerEncoding Summary: If the encoding is not specified in CIE augmentation string, then it should be DW_EH_PE_absptr instead of DW_EH_PE_omit. (cherry picked from commit 20ffb311ecd1a5e2731af173db5df83ec3735e6b) --- bolt/llvm.patch | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/bolt/llvm.patch b/bolt/llvm.patch index 5722429b2162..1b97aee1d362 100644 --- a/bolt/llvm.patch +++ b/bolt/llvm.patch @@ -1411,7 +1411,7 @@ index 03e31746139..0436778e2e4 100644 #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" diff --git a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp -index b9dc2151e06..9a4a6f024bb 100644 +index b9dc2151e06..f3c7c81bb10 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp @@ -352,7 +352,8 @@ static void LLVM_ATTRIBUTE_NORETURN ReportError(uint32_t StartOffset, @@ -1434,6 +1434,15 @@ index b9dc2151e06..9a4a6f024bb 100644 } // At this point, Offset points to the next field after Length. 
+@@ -399,7 +403,7 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { + + // Parse the augmentation data for EH CIEs + StringRef AugmentationData(""); +- uint32_t FDEPointerEncoding = DW_EH_PE_omit; ++ uint32_t FDEPointerEncoding = DW_EH_PE_absptr; + uint32_t LSDAPointerEncoding = DW_EH_PE_omit; + Optional Personality; + Optional PersonalityEncoding; @@ -425,11 +429,16 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { Personality = Data.getEncodedPointer( &Offset, *PersonalityEncoding, From 1c4957000d49bd63d49de763c9ad625c7bfdba48 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 5 Jul 2018 20:47:38 -0700 Subject: [PATCH 446/904] [BOLT] Fix diagnostics printing in data aggregator Summary: Print correct part of the string while reporting an error. (cherry picked from commit 17091040b238fe37251c35c0ea9b01a974f8d0aa) --- bolt/src/DataAggregator.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index 655cf8050bbe..c94be6f0b59b 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -655,7 +655,7 @@ ErrorOr DataAggregator::parseLBREntry() { if (MispredStr.size() != 1 || (MispredStr[0] != 'P' && MispredStr[0] != 'M')) { reportError("expected single char for mispred bit"); - Diag << "Found: " << OffsetStr << "\n"; + Diag << "Found: " << MispredStr << "\n"; return make_error_code(llvm::errc::io_error); } Res.Mispred = MispredStr[0] == 'M'; @@ -665,7 +665,7 @@ ErrorOr DataAggregator::parseLBREntry() { return EC; if (Rest.get().size() < 5) { reportError("expected rest of LBR entry"); - Diag << "Found: " << OffsetStr << "\n"; + Diag << "Found: " << Rest.get() << "\n"; return make_error_code(llvm::errc::io_error); } return Res; From 5c45fbced4705c01c3b1cd5ced532307647a763c Mon Sep 17 00:00:00 2001 From: Puyan Lotfi Date: Fri, 6 Jul 2018 12:31:36 -0700 Subject: [PATCH 447/904] [LongJumpPass] X86 enablement. First attempt. 
(cherry picked from commit 010b0f7603fc9fa209c6dc95ce4b9c08e7b70d75) --- bolt/src/BinaryPassManager.cpp | 3 +- bolt/src/Passes/LongJmp.cpp | 2 +- bolt/src/Target/X86/X86MCPlusBuilder.cpp | 95 ++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 3 deletions(-) diff --git a/bolt/src/BinaryPassManager.cpp b/bolt/src/BinaryPassManager.cpp index 8d42e5304063..e1c0cfcea374 100644 --- a/bolt/src/BinaryPassManager.cpp +++ b/bolt/src/BinaryPassManager.cpp @@ -460,8 +460,7 @@ void BinaryFunctionPassManager::runAllPasses( // Thighten branches according to offset differences between branch and // targets. No extra instructions after this pass, otherwise we may have // relocations out of range and crash during linking. - if (BC.isAArch64()) - Manager.registerPass(llvm::make_unique(PrintLongJmp)); + Manager.registerPass(llvm::make_unique(PrintLongJmp)); // This pass turns tail calls into jumps which makes them invisible to // function reordering. It's unsafe to use any CFG or instruction analysis diff --git a/bolt/src/Passes/LongJmp.cpp b/bolt/src/Passes/LongJmp.cpp index c5ebd85792f2..e1f14a00cd6a 100644 --- a/bolt/src/Passes/LongJmp.cpp +++ b/bolt/src/Passes/LongJmp.cpp @@ -368,7 +368,7 @@ bool LongJmpPass::removeOrShrinkStubs(const BinaryContext &BC, BinaryFunction &Func) { bool Modified{false}; - assert(BC.isAArch64() && "Unsupported arch"); + assert((BC.isAArch64() || BC.isX86()) && "Unsupported arch"); constexpr auto InsnSize = 4; // AArch64 // Remove unnecessary stubs for branch targets we know we can fit in the // instruction diff --git a/bolt/src/Target/X86/X86MCPlusBuilder.cpp b/bolt/src/Target/X86/X86MCPlusBuilder.cpp index 3dfe7f3dc81b..29b7dc003c15 100644 --- a/bolt/src/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/src/Target/X86/X86MCPlusBuilder.cpp @@ -2215,6 +2215,13 @@ class X86MCPlusBuilder : public MCPlusBuilder { return true; } + void createLongJmp(std::vector &Seq, const MCSymbol *Target, + MCContext *Ctx) const override { + MCInst Inst; + 
createUncondBranch(Inst, Target, Ctx); + Seq.emplace_back(Inst); + } + template std::pair analyzePICJumpTable(Itr II, @@ -2816,6 +2823,94 @@ class X86MCPlusBuilder : public MCPlusBuilder { return true; } + int getPCRelEncodingSize(MCInst &Inst) const override { + + switch (Inst.getOpcode()) { + default: + llvm_unreachable("Failed to get pcrel encoding size"); + return 0; + + case X86::TAILJMPd: return 32; + case X86::TAILJMPm: return 32; + + case X86::CALL64pcrel32: return 64; + + case X86::JCXZ: return 8; + case X86::JECXZ: return 8; + case X86::JRCXZ: return 8; + + case X86::JMP_1: return 8; + case X86::JMP_2: return 16; + case X86::JMP_4: return 32; + + case X86::JE_1: return 8; + case X86::JE_2: return 16; + case X86::JE_4: return 32; + case X86::JNE_1: return 8; + case X86::JNE_2: return 16; + case X86::JNE_4: return 32; + + case X86::JL_1: return 8; + case X86::JL_2: return 16; + case X86::JL_4: return 32; + case X86::JGE_1: return 8; + case X86::JGE_2: return 16; + case X86::JGE_4: return 32; + + case X86::JLE_1: return 8; + case X86::JLE_2: return 16; + case X86::JLE_4: return 32; + case X86::JG_1: return 8; + case X86::JG_2: return 16; + case X86::JG_4: return 32; + + case X86::JB_1: return 8; + case X86::JB_2: return 16; + case X86::JB_4: return 32; + case X86::JAE_1: return 8; + case X86::JAE_2: return 16; + case X86::JAE_4: return 32; + + case X86::JBE_1: return 8; + case X86::JBE_2: return 16; + case X86::JBE_4: return 32; + case X86::JA_1: return 8; + case X86::JA_2: return 16; + case X86::JA_4: return 32; + + case X86::JS_1: return 8; + case X86::JS_2: return 16; + case X86::JS_4: return 32; + case X86::JNS_1: return 8; + case X86::JNS_2: return 16; + case X86::JNS_4: return 32; + + case X86::JP_1: return 8; + case X86::JP_2: return 16; + case X86::JP_4: return 32; + case X86::JNP_1: return 8; + case X86::JNP_2: return 16; + case X86::JNP_4: return 32; + + case X86::JO_1: return 8; + case X86::JO_2: return 16; + case X86::JO_4: return 32; + case 
X86::JNO_1: return 8; + case X86::JNO_2: return 16; + case X86::JNO_4: return 32; + } + } + + // TODO + int getShortJmpEncodingSize() const override { + return 16; + } + + // TODO + int getUncondBranchEncodingSize() const override { + return 28; + } + unsigned getCanonicalBranchOpcode(unsigned Opcode) const override { switch (Opcode) { default: From 6b92907056343d7ab98f855d880bcfe6051fa7b3 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 6 Jul 2018 14:54:53 -0700 Subject: [PATCH 448/904] Revert "[LongJumpPass] X86 enablement. First attempt." This reverts commit 010b0f7603fc9fa209c6dc95ce4b9c08e7b70d75. (cherry picked from commit b16c165ec7b640686285a0a4bb291282c8c3445c) --- bolt/src/BinaryPassManager.cpp | 3 +- bolt/src/Passes/LongJmp.cpp | 2 +- bolt/src/Target/X86/X86MCPlusBuilder.cpp | 95 ------------------------ 3 files changed, 3 insertions(+), 97 deletions(-) diff --git a/bolt/src/BinaryPassManager.cpp b/bolt/src/BinaryPassManager.cpp index e1c0cfcea374..8d42e5304063 100644 --- a/bolt/src/BinaryPassManager.cpp +++ b/bolt/src/BinaryPassManager.cpp @@ -460,7 +460,8 @@ void BinaryFunctionPassManager::runAllPasses( // Thighten branches according to offset differences between branch and // targets. No extra instructions after this pass, otherwise we may have // relocations out of range and crash during linking. - Manager.registerPass(llvm::make_unique(PrintLongJmp)); + if (BC.isAArch64()) + Manager.registerPass(llvm::make_unique(PrintLongJmp)); // This pass turns tail calls into jumps which makes them invisible to // function reordering. 
It's unsafe to use any CFG or instruction analysis diff --git a/bolt/src/Passes/LongJmp.cpp b/bolt/src/Passes/LongJmp.cpp index e1f14a00cd6a..c5ebd85792f2 100644 --- a/bolt/src/Passes/LongJmp.cpp +++ b/bolt/src/Passes/LongJmp.cpp @@ -368,7 +368,7 @@ bool LongJmpPass::removeOrShrinkStubs(const BinaryContext &BC, BinaryFunction &Func) { bool Modified{false}; - assert((BC.isAArch64() || BC.isX86()) && "Unsupported arch"); + assert(BC.isAArch64() && "Unsupported arch"); constexpr auto InsnSize = 4; // AArch64 // Remove unnecessary stubs for branch targets we know we can fit in the // instruction diff --git a/bolt/src/Target/X86/X86MCPlusBuilder.cpp b/bolt/src/Target/X86/X86MCPlusBuilder.cpp index 29b7dc003c15..3dfe7f3dc81b 100644 --- a/bolt/src/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/src/Target/X86/X86MCPlusBuilder.cpp @@ -2215,13 +2215,6 @@ class X86MCPlusBuilder : public MCPlusBuilder { return true; } - void createLongJmp(std::vector &Seq, const MCSymbol *Target, - MCContext *Ctx) const override { - MCInst Inst; - createUncondBranch(Inst, Target, Ctx); - Seq.emplace_back(Inst); - } - template std::pair analyzePICJumpTable(Itr II, @@ -2823,94 +2816,6 @@ class X86MCPlusBuilder : public MCPlusBuilder { return true; } - int getPCRelEncodingSize(MCInst &Inst) const override { - - switch (Inst.getOpcode()) { - default: - llvm_unreachable("Failed to get pcrel encoding size"); - return 0; - - case X86::TAILJMPd: return 32; - case X86::TAILJMPm: return 32; - - case X86::CALL64pcrel32: return 64; - - case X86::JCXZ: return 8; - case X86::JECXZ: return 8; - case X86::JRCXZ: return 8; - - case X86::JMP_1: return 8; - case X86::JMP_2: return 16; - case X86::JMP_4: return 32; - - case X86::JE_1: return 8; - case X86::JE_2: return 16; - case X86::JE_4: return 32; - case X86::JNE_1: return 8; - case X86::JNE_2: return 16; - case X86::JNE_4: return 32; - - case X86::JL_1: return 8; - case X86::JL_2: return 16; - case X86::JL_4: return 32; - case X86::JGE_1: return 8; - case 
X86::JGE_2: return 16; - case X86::JGE_4: return 32; - - case X86::JLE_1: return 8; - case X86::JLE_2: return 16; - case X86::JLE_4: return 32; - case X86::JG_1: return 8; - case X86::JG_2: return 16; - case X86::JG_4: return 32; - - case X86::JB_1: return 8; - case X86::JB_2: return 16; - case X86::JB_4: return 32; - case X86::JAE_1: return 8; - case X86::JAE_2: return 16; - case X86::JAE_4: return 32; - - case X86::JBE_1: return 8; - case X86::JBE_2: return 16; - case X86::JBE_4: return 32; - case X86::JA_1: return 8; - case X86::JA_2: return 16; - case X86::JA_4: return 32; - - case X86::JS_1: return 8; - case X86::JS_2: return 16; - case X86::JS_4: return 32; - case X86::JNS_1: return 8; - case X86::JNS_2: return 16; - case X86::JNS_4: return 32; - - case X86::JP_1: return 8; - case X86::JP_2: return 16; - case X86::JP_4: return 32; - case X86::JNP_1: return 8; - case X86::JNP_2: return 16; - case X86::JNP_4: return 32; - - case X86::JO_1: return 8; - case X86::JO_2: return 16; - case X86::JO_4: return 32; - case X86::JNO_1: return 8; - case X86::JNO_2: return 16; - case X86::JNO_4: return 32; - } - } - - // TODO - int getShortJmpEncodingSize() const override { - return 16; - } - - // TODO - int getUncondBranchEncodingSize() const override { - return 28; - } - unsigned getCanonicalBranchOpcode(unsigned Opcode) const override { switch (Opcode) { default: From 1dd02b90530a4befc13a388ac1e32fbe32450879 Mon Sep 17 00:00:00 2001 From: Laith Saed Sakka Date: Thu, 7 Jun 2018 11:10:37 -0700 Subject: [PATCH 449/904] -- Adding Veneer elimination pass and Veneer count to dyno stats. Summary: Create a pass that performs veneers elimination . 
(cherry picked from commit 9ae41452c6928a665e087c1f8696645c96dca9a8) --- bolt/src/BinaryFunction.cpp | 34 ++++++++- bolt/src/BinaryFunction.h | 14 +++- bolt/src/BinaryPassManager.cpp | 11 +++ bolt/src/Passes/CMakeLists.txt | 1 + bolt/src/Passes/VeneerElimination.cpp | 106 ++++++++++++++++++++++++++ bolt/src/Passes/VeneerElimination.h | 40 ++++++++++ 6 files changed, 202 insertions(+), 4 deletions(-) create mode 100644 bolt/src/Passes/VeneerElimination.cpp create mode 100644 bolt/src/Passes/VeneerElimination.h diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 10e1e65a29d6..ba6baedb21cb 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -1361,6 +1361,14 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { MIB->matchLinkerVeneer(Instructions.begin(), Instructions.end(), AbsoluteInstrAddr, Instruction, TargetHiBits, TargetLowBits, TargetAddress)) { + MIB->addAnnotation(Instruction, "AArch64Veneer", true); + + uint8_t Counter = 0; + for (auto It = std::prev(Instructions.end()); Counter != 2; + --It, ++Counter) { + MIB->addAnnotation(It->second, "AArch64Veneer", true); + } + fixStubTarget(*TargetLowBits, *TargetHiBits, TargetAddress); } } @@ -3753,7 +3761,7 @@ void BinaryFunction::printLoopInfo(raw_ostream &OS) const { } DynoStats BinaryFunction::getDynoStats() const { - DynoStats Stats; + DynoStats Stats(/*PrintAArch64Stats*/ BC.isAArch64()); // Return empty-stats about the function we don't completely understand. if (!isSimple() || !hasValidProfile()) @@ -3779,6 +3787,10 @@ DynoStats BinaryFunction::getDynoStats() const { if (BB->getNumNonPseudos() == 0 || BBExecutionCount == 0) continue; + // Count AArch64 linker-inserted veneers + if(isAArch64Veneer()) + Stats[DynoStats::VENEER_CALLS_AARCH64] += getKnownExecutionCount(); + // Count the number of calls by iterating through all instructions. 
for (const auto &Instr : *BB) { if (BC.MIB->isStore(Instr)) { @@ -3887,6 +3899,22 @@ DynoStats BinaryFunction::getDynoStats() const { return Stats; } +bool BinaryFunction::isAArch64Veneer() const { + if (BasicBlocks.size() != 1) + return false; + + auto &BB = **BasicBlocks.begin(); + if (BB.size() != 3) + return false; + + for (auto &Inst : BB) { + if (!BC.MIB->hasAnnotation(Inst, "AArch64Veneer")) + return false; + } + + return true; +} + void DynoStats::print(raw_ostream &OS, const DynoStats *Other) const { auto printStatWithDelta = [&](const std::string &Name, uint64_t Stat, uint64_t OtherStat) { @@ -3907,6 +3935,10 @@ void DynoStats::print(raw_ostream &OS, const DynoStats *Other) const { for (auto Stat = DynoStats::FIRST_DYNO_STAT + 1; Stat < DynoStats::LAST_DYNO_STAT; ++Stat) { + + if (!PrintAArch64Stats && Stat == DynoStats::VENEER_CALLS_AARCH64) + continue; + printStatWithDelta(Desc[Stat], Stats[Stat], Other ? (*Other)[Stat] : 0); } } diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index b4f9826fb072..ee48e40ece40 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -80,6 +80,7 @@ class DynoStats { Fadd(FORWARD_COND_BRANCHES_TAKEN, BACKWARD_COND_BRANCHES_TAKEN))\ D(ALL_CONDITIONAL, "all conditional branches",\ Fadd(FORWARD_COND_BRANCHES, BACKWARD_COND_BRANCHES))\ + D(VENEER_CALLS_AARCH64, "linker-inserted veneer calls", Fn)\ D(LAST_DYNO_STAT, "", 0) public: @@ -90,13 +91,15 @@ class DynoStats { private: uint64_t Stats[LAST_DYNO_STAT+1]; + bool PrintAArch64Stats; #define D(name, desc, ...) 
desc, static constexpr const char *Desc[] = { DYNO_STATS }; #undef D public: - DynoStats() { + DynoStats(bool PrintAArch64Stats ) { + this->PrintAArch64Stats = PrintAArch64Stats; for (auto Stat = FIRST_DYNO_STAT + 0; Stat < LAST_DYNO_STAT; ++Stat) Stats[Stat] = 0; } @@ -2198,6 +2201,9 @@ class BinaryFunction { const DWARFDebugLoc::LocationList &InputLL, BaseAddress BaseAddr) const; + /// Return true if the function is an AArch64 linker inserted veneer + bool isAArch64Veneer() const; + virtual ~BinaryFunction(); /// Info for fragmented functions. @@ -2230,7 +2236,8 @@ class BinaryFunction { /// Return program-wide dynostats. template inline DynoStats getDynoStats(const FuncsType &Funcs) { - DynoStats dynoStats; + bool IsAArch64 = Funcs.begin()->second.getBinaryContext().isAArch64(); + DynoStats dynoStats(IsAArch64); for (auto &BFI : Funcs) { auto &BF = BFI.second; if (BF.isSimple()) { @@ -2247,7 +2254,8 @@ callWithDynoStats(FnType &&Func, const FuncsType &Funcs, StringRef Phase, const bool Flag) { - DynoStats DynoStatsBefore; + bool IsAArch64 = Funcs.begin()->second.getBinaryContext().isAArch64(); + DynoStats DynoStatsBefore(IsAArch64); if (Flag) { DynoStatsBefore = getDynoStats(Funcs); } diff --git a/bolt/src/BinaryPassManager.cpp b/bolt/src/BinaryPassManager.cpp index 8d42e5304063..113c27cc252e 100644 --- a/bolt/src/BinaryPassManager.cpp +++ b/bolt/src/BinaryPassManager.cpp @@ -24,6 +24,7 @@ #include "Passes/ReorderData.h" #include "Passes/StokeInfo.h" #include "Passes/ValidateInternalCalls.h" +#include "Passes/VeneerElimination.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" #include @@ -274,6 +275,13 @@ PrintStoke("print-stoke", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +static llvm::cl::opt + PrintVeneerElimination("print-veneer-elimination", + cl::desc("print functions after veneer elimination pass"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + } // namespace opts namespace llvm { @@ -368,6 +376,9 @@ void 
BinaryFunctionPassManager::runAllPasses( Manager.registerPass(llvm::make_unique(PrintICF), opts::ICF); + if (BC.isAArch64()) + Manager.registerPass(llvm::make_unique(PrintVeneerElimination)); + Manager.registerPass(llvm::make_unique(NeverPrint), opts::StringOps); diff --git a/bolt/src/Passes/CMakeLists.txt b/bolt/src/Passes/CMakeLists.txt index 26842ce568fc..c8b09d7f7067 100644 --- a/bolt/src/Passes/CMakeLists.txt +++ b/bolt/src/Passes/CMakeLists.txt @@ -33,6 +33,7 @@ add_llvm_library(LLVMBOLTPasses StackReachingUses.cpp StokeInfo.cpp ValidateInternalCalls.cpp + VeneerElimination.cpp DEPENDS intrinsics_gen diff --git a/bolt/src/Passes/VeneerElimination.cpp b/bolt/src/Passes/VeneerElimination.cpp new file mode 100644 index 000000000000..9dd67694ad00 --- /dev/null +++ b/bolt/src/Passes/VeneerElimination.cpp @@ -0,0 +1,106 @@ +//===--- Passes/VeneerElimination.cpp--------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This class implements a pass that removes linker-inserted veneers from the +// code and redirects veneer callers to call to veneers destinations +// +//===----------------------------------------------------------------------===// +#include "VeneerElimination.h" +#define DEBUG_TYPE "veneer-elim" + +using namespace llvm; + +namespace opts { + +extern cl::OptionCategory BoltOptCategory; + +static llvm::cl::opt +EliminateVeneers("elim-link-veneers", + cl::desc("run veneer elimination pass"), + cl::init(false), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); +} // namespace opts + +namespace llvm { +namespace bolt { + +void VeneerElimination::runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) { + if (!opts::EliminateVeneers || !BC.isAArch64()) + return; + + std::unordered_map VeneerDestinations; + uint64_t VeneersCount = 0; + for (auto It = BFs.begin(); It != BFs.end();) { + auto CurrentIt = It; + ++It; + + if (CurrentIt->second.isAArch64Veneer()) { + VeneersCount++; + BinaryFunction &VeneerFunction = CurrentIt->second; + + auto &FirstInstruction = *(VeneerFunction.begin()->begin()); + const MCSymbol *VeneerTargetSymbol = + BC.MIB->getTargetSymbol(FirstInstruction, 1); + + // Functions can have multiple symbols + for (auto &Name : VeneerFunction.getNames()) { + auto *Symbol = BC.Ctx->lookupSymbol(Name); + VeneerDestinations[Symbol] = VeneerTargetSymbol; + BC.SymbolToFunctionMap.erase(Symbol); + } + + BC.BinaryDataMap.erase(VeneerFunction.getAddress()); + BFs.erase(CurrentIt); + } + } + + DEBUG(dbgs() << "BOLT-INFO: number of removed linker-inserted veneers :" << VeneersCount + << "\n"); + + // Handle veneers to veneers in case they occur + for (auto entry : VeneerDestinations) { + const MCSymbol *src = entry.first; + const MCSymbol *dest = entry.second; + while (VeneerDestinations.find(dest) != VeneerDestinations.end()) { + dest = 
VeneerDestinations[dest]; + } + VeneerDestinations[src] = dest; + } + + uint64_t VeneerCallers = 0; + for (auto &It : BFs) { + auto &Function = It.second; + for (auto &BB : Function) { + for (auto &Instr : BB) { + if (!BC.MIB->isCall(Instr) || BC.MIB->isIndirectCall(Instr)) + continue; + + auto *TargetSymbol = BC.MIB->getTargetSymbol(Instr, 0); + if (VeneerDestinations.find(TargetSymbol) == VeneerDestinations.end()) + continue; + + VeneerCallers++; + if (!BC.MIB->replaceBranchTarget( + Instr, VeneerDestinations[TargetSymbol], BC.Ctx.get())) { + assert(false && "updating veneer call destination failed"); + } + } + } + } + + DEBUG(dbgs() << "BOLT-INFO: number of linker-inserted veneers call sites :" << VeneerCallers + << "\n"); +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/src/Passes/VeneerElimination.h b/bolt/src/Passes/VeneerElimination.h new file mode 100644 index 000000000000..9948ef890e5d --- /dev/null +++ b/bolt/src/Passes/VeneerElimination.h @@ -0,0 +1,40 @@ +//===--- Passes/VeneerElimination.h ---------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_VENEER_ELIMINATION_H +#define LLVM_TOOLS_LLVM_BOLT_VENEER_ELIMINATION_H + +#include "BinaryFunctionCallGraph.h" +#include "BinaryPasses.h" +#include "MCPlus.h" +#include "MCPlusBuilder.h" + +namespace llvm { +namespace bolt { + +class VeneerElimination : public BinaryFunctionPass { +public: + /// BinaryPass public interface + explicit VeneerElimination(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) { + ; + } + + const char *getName() const override { return "veneer-elimination"; } + + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; +} // namespace bolt +} // namespace llvm + +#endif From 33ed6293f16d9db6cb4866c5f5982aa43ada4c4e Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 3 Jul 2018 17:02:33 -0700 Subject: [PATCH 450/904] Avoid removing BBs referenced by JTs Summary: While removing unreachable blocks, we may decide to remove a block that is listed as a target in a jump table entry. If we do that, this label will be then undefined and LLVM assembler will crash. Mitigate this for now by not removing such blocks, as we don't support removing unnecessary jump tables yet. 
Fixes #20 (cherry picked from commit b12d104bea21ac9b90be98599258db4aa5f6dbdd) --- bolt/src/BinaryFunction.cpp | 11 ++++++ bolt/test/X86/Inputs/issue20.yaml | 62 +++++++++++++++++++++++++++++++ bolt/test/X86/issue20.test | 15 ++++++++ 3 files changed, 88 insertions(+) create mode 100644 bolt/test/X86/Inputs/issue20.yaml create mode 100755 bolt/test/X86/issue20.test diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index ba6baedb21cb..40e51eb676bb 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -327,6 +327,17 @@ void BinaryFunction::markUnreachableBlocks() { if (BB->isEntryPoint() || BB->isLandingPad()) { Stack.push(BB); BB->markValid(true); + continue; + } + // FIXME: + // Also mark BBs with indirect jumps as reachable, since we do not + // support removing unused jump tables yet (T29418024 / GH-issue20) + for (const auto &Inst : *BB) { + if (BC.MIB->getJumpTable(Inst)) { + Stack.push(BB); + BB->markValid(true); + break; + } } } diff --git a/bolt/test/X86/Inputs/issue20.yaml b/bolt/test/X86/Inputs/issue20.yaml new file mode 100644 index 000000000000..e6226f1dd3f3 --- /dev/null +++ b/bolt/test/X86/Inputs/issue20.yaml @@ -0,0 +1,62 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 + Entry: 0x00000000004004CD +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x00000000004003E0 + AddressAlign: 0x0000000000000010 + Content: 
31ED4989D15E4889E24883E4F0505449C7C07005400048C7C10005400048C7C7CD044000E8B7FFFFFFF4660F1F440000B82F10600055482D281060004883F80E4889E577025DC3B8000000004885C074F45DBF28106000FFE00F1F8000000000B82810600055482D2810600048C1F8034889E54889C248C1EA3F4801D048D1F875025DC3BA000000004885D274F45D4889C6BF28106000FFE20F1F8000000000803D9D0B2000007511554889E5E87EFFFFFF5DC6058A0B200001F3C30F1F400048833D7809200000741EB8000000004885C0741455BF200E60004889E5FFD05DE97BFFFFFF0F1F00E973FFFFFF4831C0C34883E703FF24FD90054000B801000000EB13B802000000EB0CB803000000EB05B804000000C3660F1F84000000000041574189FF41564989F641554989D541544C8D25F808200055488D2DF8082000534C29E531DB48C1FD034883EC08E85DFEFFFF4885ED741E0F1F8400000000004C89EA4C89F64489FF41FF14DC4883C3014839EB75EA4883C4085B5D415C415D415E415FC390662E0F1F840000000000F3C3 + - Name: .rodata + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x0000000000400580 + AddressAlign: 0x0000000000000008 + Content: 01000200000000000000000000000000DC04400000000000E304400000000000EA04400000000000F104400000000000 + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x0000000000600E28 + Link: .dynstr + AddressAlign: 0x0000000000000008 + Content: 
010000000000000001000000000000000C0000000000000090034000000000000D0000000000000074054000000000001900000000000000100E6000000000001B0000000000000008000000000000001A00000000000000180E6000000000001C000000000000000800000000000000F5FEFF6F000000009802400000000000050000000000000000034000000000000600000000000000B8024000000000000A0000000000000038000000000000000B0000000000000018000000000000001500000000000000000000000000000003000000000000000010600000000000020000000000000018000000000000001400000000000000070000000000000017000000000000007803400000000000070000000000000060034000000000000800000000000000180000000000000009000000000000001800000000000000FEFFFF6F000000004003400000000000FFFFFF6F000000000100000000000000F0FFFF6F000000003803400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +Symbols: + Global: + - Name: main + Type: STT_FUNC + Section: .text + Value: 0x00000000004004CD + Size: 0x000000000000002A + - Name: jumptbl + Section: .rodata + Value: 0x0000000000400590 +DynamicSymbols: + Global: + - Name: mydata + Section: .rodata + Value: 0x0000000000400100 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_X, PF_R ] + VAddr: 0x00400000 + PAddr: 0x00400000 + Sections: + - Section: .text + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + VAddr: 0x00400000 + PAddr: 0x00400000 + Sections: + - Section: .text + - Type: PT_DYNAMIC + Flags: [ PF_X, PF_R ] + VAddr: 0x00600E28 + PAddr: 0x00600E28 + Sections: + - Section: .dynamic +... 
diff --git a/bolt/test/X86/issue20.test b/bolt/test/X86/issue20.test new file mode 100755 index 000000000000..de9c39b947da --- /dev/null +++ b/bolt/test/X86/issue20.test @@ -0,0 +1,15 @@ +# This reproduces issue 20 from our github repo +# "BOLT crashes when removing unreachable BBs that are a target +# in a JT" + +# RUN: yaml2obj %p/Inputs/issue20.yaml &> %t.exe +# RUN: llvm-bolt %t.exe -relocs=0 -jump-tables=move -print-finalized \ +# RUN: -o %t.out | FileCheck %s + +CHECK: BOLT-INFO: UCE removed 0 blocks and 0 bytes of code. +CHECK: Binary Function "main" +CHECK: .LFT0 (2 instructions, align : 1) +CHECK-NEXT: CFI State : 0 +CHECK-NEXT: 00000004: andq +CHECK-NEXT: 00000008: jmpq +CHECK-NEXT: Successors: .Ltmp1, .Ltmp2, .Ltmp3, .Ltmp4 From a28b4e07d03913d52698541573f9750fecdcab6a Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 3 Jul 2018 11:57:46 -0700 Subject: [PATCH 451/904] Fix assembly after adding entry points Summary: When a given function B, located after function A, references one of A's basic blocks, it registers a new global symbol at the reference address and update A's Labels vector via BinaryFunction::addEntryPoint(). However, we don't update A's branch targets at this point. So we end up with an inconsistent CFG, where the basic block names are global symbols, but the internal branch operands are still referencing the old local name of the corresponding blocks that got promoted to an entry point. This patch fix this by detecting this situation in addEntryPoint and iterating over all instructions, looking for references to the old symbol and replacing them to use the new global symbol (since this is now an entry point). 
Fixes #26 (cherry picked from commit da7cde1758b806cc17015866d5f526dfd42f2db1) --- bolt/src/BinaryFunction.cpp | 19 +++++ bolt/src/BinaryFunction.h | 4 + bolt/src/MCPlusBuilder.cpp | 19 +++++ bolt/src/MCPlusBuilder.h | 6 ++ .../Target/AArch64/AArch64MCPlusBuilder.cpp | 18 ----- bolt/src/Target/X86/X86MCPlusBuilder.cpp | 14 +--- bolt/test/X86/Inputs/issue26.yaml | 76 +++++++++++++++++++ bolt/test/X86/issue26.test | 9 +++ 8 files changed, 135 insertions(+), 30 deletions(-) create mode 100644 bolt/test/X86/Inputs/issue26.yaml create mode 100755 bolt/test/X86/issue26.test diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 40e51eb676bb..a591a87c98fc 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -1928,6 +1928,20 @@ void BinaryFunction::removeTagsFromProfile() { } } +void BinaryFunction::updateReferences(const MCSymbol *From, const MCSymbol *To) { + assert(CurrentState == State::Empty || CurrentState == State::Disassembled); + assert(From && To && "invalid symbols"); + + for (auto I = Instructions.begin(), E = Instructions.end(); I != E; ++I) { + auto &Inst = I->second; + for (int I = 0, E = MCPlus::getNumPrimeOperands(Inst); I != E; ++I) { + const MCSymbol *S = BC.MIB->getTargetSymbol(Inst, I); + if (S == From) + BC.MIB->setOperandToSymbolRef(Inst, I, To, 0, &*BC.Ctx, 0); + } + } +} + void BinaryFunction::addEntryPoint(uint64_t Address) { assert(containsAddress(Address) && "address does not belong to the function"); @@ -1943,6 +1957,8 @@ void BinaryFunction::addEntryPoint(uint64_t Address) { // If we haven't built CFG for the function, we can add a new entry point // even if it doesn't have an associated entry in the symbol table. if (CurrentState == State::Empty || CurrentState == State::Disassembled) { + auto Iter = Labels.find(Offset); + const MCSymbol *OldSym = Iter != Labels.end() ? 
Iter->second : nullptr; if (!EntrySymbol) { DEBUG(dbgs() << "creating local label\n"); EntrySymbol = getOrCreateLocalLabel(Address); @@ -1951,6 +1967,9 @@ void BinaryFunction::addEntryPoint(uint64_t Address) { } addEntryPointAtOffset(Address - getAddress()); Labels.emplace(Offset, EntrySymbol); + if (OldSym != nullptr && EntrySymbol != OldSym) { + updateReferences(OldSym, EntrySymbol); + } return; } diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index ee48e40ece40..bf5d8933400b 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -639,6 +639,10 @@ class BinaryFunction { return getOrCreateLocalLabel(getAddress() + Offset); } + /// Update all \p From references in the code to refer to \p To. Used + /// in disassembled state only. + void updateReferences(const MCSymbol *From, const MCSymbol *To); + /// This is called in disassembled state. void addEntryPoint(uint64_t Address); diff --git a/bolt/src/MCPlusBuilder.cpp b/bolt/src/MCPlusBuilder.cpp index a2de5dd55c9f..0e1949299066 100644 --- a/bolt/src/MCPlusBuilder.cpp +++ b/bolt/src/MCPlusBuilder.cpp @@ -439,3 +439,22 @@ MCPlusBuilder::getRegSize(MCPhysReg Reg) const { return SizeMap[Reg]; } + +bool MCPlusBuilder::setOperandToSymbolRef(MCInst &Inst, int OpNum, + const MCSymbol *Symbol, + int64_t Addend, MCContext *Ctx, + uint64_t RelType) const { + MCOperand Operand; + if (!Addend) { + Operand = MCOperand::createExpr(getTargetExprFor( + Inst, MCSymbolRefExpr::create(Symbol, *Ctx), *Ctx, RelType)); + } else { + Operand = MCOperand::createExpr(getTargetExprFor( + Inst, + MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Symbol, *Ctx), + MCConstantExpr::create(Addend, *Ctx), *Ctx), + *Ctx, RelType)); + } + Inst.getOperand(OpNum) = Operand; + return true; +} diff --git a/bolt/src/MCPlusBuilder.h b/bolt/src/MCPlusBuilder.h index 4b08ed46cf62..43932d008dac 100644 --- a/bolt/src/MCPlusBuilder.h +++ b/bolt/src/MCPlusBuilder.h @@ -824,6 +824,12 @@ class MCPlusBuilder { return false; } + /// 
Discard operand \p OpNum replacing it by a new MCOperand that is a + /// MCExpr referencing \p Symbol + \p Addend. + virtual bool setOperandToSymbolRef(MCInst &Inst, int OpNum, + const MCSymbol *Symbol, int64_t Addend, + MCContext *Ctx, uint64_t RelType) const; + /// Replace an immediate operand in the instruction \p Inst with a reference /// of the passed \p Symbol plus \p Addend. If the instruction does not have /// an immediate operand or has more than one - then return false. Otherwise diff --git a/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp index 44d1edd6857b..02cb252e0b8d 100644 --- a/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -968,24 +968,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return true; } - bool setOperandToSymbolRef(MCInst &Inst, int OpNum, MCSymbol *Symbol, - int64_t Addend, MCContext *Ctx, - uint64_t RelType) const { - MCOperand Operand; - if (!Addend) { - Operand = MCOperand::createExpr(getTargetExprFor( - Inst, MCSymbolRefExpr::create(Symbol, *Ctx), *Ctx, RelType)); - } else { - Operand = MCOperand::createExpr(getTargetExprFor( - Inst, - MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Symbol, *Ctx), - MCConstantExpr::create(Addend, *Ctx), *Ctx), - *Ctx, RelType)); - } - Inst.getOperand(OpNum) = Operand; - return true; - } - bool replaceImmWithSymbol(MCInst &Inst, MCSymbol *Symbol, int64_t Addend, MCContext *Ctx, int64_t &Value, uint64_t RelType) const override { diff --git a/bolt/src/Target/X86/X86MCPlusBuilder.cpp b/bolt/src/Target/X86/X86MCPlusBuilder.cpp index 3dfe7f3dc81b..7f9e88e2ba82 100644 --- a/bolt/src/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/src/Target/X86/X86MCPlusBuilder.cpp @@ -2770,18 +2770,8 @@ class X86MCPlusBuilder : public MCPlusBuilder { Value = Inst.getOperand(ImmOpNo).getImm(); - MCOperand Operand; - if (!Addend) { - Operand = MCOperand::createExpr(getTargetExprFor( - Inst, 
MCSymbolRefExpr::create(Symbol, *Ctx), *Ctx, RelType)); - } else { - Operand = MCOperand::createExpr(getTargetExprFor( - Inst, - MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Symbol, *Ctx), - MCConstantExpr::create(Addend, *Ctx), *Ctx), - *Ctx, RelType)); - } - Inst.getOperand(ImmOpNo) = Operand; + setOperandToSymbolRef(Inst, ImmOpNo, Symbol, Addend, Ctx, RelType); + return true; } diff --git a/bolt/test/X86/Inputs/issue26.yaml b/bolt/test/X86/Inputs/issue26.yaml new file mode 100644 index 000000000000..6d30bb0d0b2b --- /dev/null +++ b/bolt/test/X86/Inputs/issue26.yaml @@ -0,0 +1,76 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 + Entry: 0x00000000004004FA +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x00000000004003E0 + AddressAlign: 0x0000000000000010 + Content: 31ED4989D15E4889E24883E4F0505449C7C07005400048C7C10005400048C7C7FA044000E8B7FFFFFFF4660F1F440000B82F10600055482D281060004883F80E4889E577025DC3B8000000004885C074F45DBF28106000FFE00F1F8000000000B82810600055482D2810600048C1F8034889E54889C248C1EA3F4801D048D1F875025DC3BA000000004885D274F45D4889C6BF28106000FFE20F1F8000000000803D9D0B2000007511554889E5E87EFFFFFF5DC6058A0B200001F3C30F1F400048833D7809200000741EB8000000004885C0741455BF200E60004889E5FFD05DE97BFFFFFF0F1F00E973FFFFFF648B0425B0FCFFFF39C70F850C0000004839160F850400000048890EC3B8FFFFFFFFC34839FE0F84F0FFFFFFC34831C0C3669041574189FF41564989F641554989D541544C8D25F808200055488D2DF8082000534C29E531DB48C1FD034883EC08E85DFEFFFF4885ED741E0F1F8400000000004C89EA4C89F64489FF41FF14DC4883C3014839EB75EA4883C4085B5D415C415D415E415FC390662E0F1F840000000000F3C3 + - Name: .rela.text + Type: SHT_RELA + Flags: [ SHF_INFO_LINK ] + Link: .symtab + AddressAlign: 0x0000000000000008 + Info: .text + Relocations: + - Name: .rodata + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x0000000000400580 + AddressAlign: 0x0000000000000008 + Content: 
'01000200000000000000000000000000' + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x0000000000600E28 + Link: .dynstr + AddressAlign: 0x0000000000000008 + Content: 010000000000000001000000000000000C0000000000000090034000000000000D0000000000000074054000000000001900000000000000100E6000000000001B0000000000000008000000000000001A00000000000000180E6000000000001C000000000000000800000000000000F5FEFF6F000000009802400000000000050000000000000000034000000000000600000000000000B8024000000000000A0000000000000038000000000000000B0000000000000018000000000000001500000000000000000000000000000003000000000000000010600000000000020000000000000018000000000000001400000000000000070000000000000017000000000000007803400000000000070000000000000060034000000000000800000000000000180000000000000009000000000000001800000000000000FEFFFF6F000000004003400000000000FFFFFF6F000000000100000000000000F0FFFF6F000000003803400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +Symbols: + Global: + - Name: FUNC + Type: STT_FUNC + Section: .text + Value: 0x00000000004004F0 + Size: 0x000000000000000A + - Name: main + Type: STT_FUNC + Section: .text + Value: 0x00000000004004FA + Size: 0x0000000000000004 + - Name: XYZ + Type: STT_FUNC + Section: .text + Value: 0x00000000004004CD + Size: 0x0000000000000023 +DynamicSymbols: + Global: + - Name: mydata + Section: .rodata + Value: 0x0000000000400100 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_X, PF_R ] + VAddr: 0x00400000 + PAddr: 0x00400000 + Sections: + - Section: .text + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + VAddr: 0x00400000 + PAddr: 0x00400000 + Sections: + - Section: .text + - Type: PT_DYNAMIC + Flags: [ PF_X, PF_R ] + VAddr: 0x0061ADA8 + PAddr: 0x0064ADA8 + Sections: + - Section: .dynamic +... 
diff --git a/bolt/test/X86/issue26.test b/bolt/test/X86/issue26.test new file mode 100755 index 000000000000..e3e13e4e0eea --- /dev/null +++ b/bolt/test/X86/issue26.test @@ -0,0 +1,9 @@ +# This reproduces issue 26 from our github repo + +# RUN: yaml2obj %p/Inputs/issue26.yaml &> %t.exe +# RUN: llvm-bolt %t.exe -relocs -print-cfg -o %t.out \ +# RUN: | FileCheck %s + +CHECK-NOT: BOLT-WARNING: CFG invalid in XYZ @ .LBB0 +CHECK: Binary Function "XYZ" +CHECK: 0000000a: jne FUNCat0x4004e9 From dec724b4de7861240a2c034c70b972d395812335 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 10 Jul 2018 10:25:55 -0700 Subject: [PATCH 452/904] [perf2bolt] Accept `-` as a valid misprediction symbol Summary: As reported in GH-28 `perf` can produce `-` symbol for misprediction bit if the bit is not supported by the kernel/HW. In this case we can ignore the bit. (cherry picked from commit 0a4ad831e49388ccc49ae15ecdb51c58b6eaafff) --- bolt/src/DataAggregator.cpp | 8 +++++++- bolt/src/DataReader.cpp | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index c94be6f0b59b..8b3ee0bd56af 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -653,13 +653,19 @@ ErrorOr DataAggregator::parseLBREntry() { return EC; StringRef MispredStr = MispredStrRes.get(); if (MispredStr.size() != 1 || - (MispredStr[0] != 'P' && MispredStr[0] != 'M')) { + (MispredStr[0] != 'P' && MispredStr[0] != 'M' && MispredStr[0] != '-')) { reportError("expected single char for mispred bit"); Diag << "Found: " << MispredStr << "\n"; return make_error_code(llvm::errc::io_error); } Res.Mispred = MispredStr[0] == 'M'; + static bool MispredWarning = true;; + if (MispredStr[0] == '-' && MispredWarning) { + errs() << "PERF2BOLT-WARNING: misprediction bit is missing in profile\n"; + MispredWarning = false; + } + auto Rest = parseString(FieldSeparator, true); if (std::error_code EC = Rest.getError()) return EC; diff --git 
a/bolt/src/DataReader.cpp b/bolt/src/DataReader.cpp index 348b83d4e334..62e7be78b8a9 100644 --- a/bolt/src/DataReader.cpp +++ b/bolt/src/DataReader.cpp @@ -268,7 +268,7 @@ DataReader::readPerfData(StringRef Path, raw_ostream &Diag) { } void DataReader::reportError(StringRef ErrorMsg) { - Diag << "Error reading bolt data input file: line " << Line << ", column " + Diag << "Error reading BOLT data input file: line " << Line << ", column " << Col << ": " << ErrorMsg << '\n'; } From 52dcae4c670e667a4f314de779681e1985b91417 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 6 Jul 2018 21:30:23 -0700 Subject: [PATCH 453/904] [BOLT] Fix llvm-dwarfdump issues Summary: llvm-dwarfdump is relying on getRelocatedSection() to return section_end() for ELF files of types other than relocatable objects. We've changed the function to return relocatable section for other types of ELF files. As a result, llvm-dwarfdump started re-processing relocations for sections that already had relocations applied, e.g. in executable files, and this resulted in wrong values reported. As a workaround/solution, we make this function return relocated section for executable (and any non-relocatable objects) files only if the section is allocatable. 
(cherry picked from commit db01395bd974b532031f6297090c10bba26d4bdc) --- bolt/llvm.patch | 14 ++++++++++++-- bolt/src/RewriteInstance.cpp | 3 +-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/bolt/llvm.patch b/bolt/llvm.patch index 1b97aee1d362..64904bb34d68 100644 --- a/bolt/llvm.patch +++ b/bolt/llvm.patch @@ -1045,7 +1045,7 @@ index 46504e74bc2..836fd8ddc45 100644 Expected ELFFile::getRelocationSymbol(const Elf_Rel *Rel, diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h -index 4d001039238..06a629573cc 100644 +index 4d001039238..62837bbcaa0 100644 --- a/include/llvm/Object/ELFObjectFile.h +++ b/include/llvm/Object/ELFObjectFile.h @@ -254,6 +254,7 @@ protected: @@ -1081,7 +1081,17 @@ index 4d001039238..06a629573cc 100644 const Elf_Shdr *EShdr = getSection(Sec); uintX_t Type = EShdr->sh_type; if (Type != ELF::SHT_REL && Type != ELF::SHT_RELA) -@@ -792,8 +798,6 @@ ELFObjectFile::getRelocationSymbol(DataRefImpl Rel) const { +@@ -762,6 +768,9 @@ ELFObjectFile::getRelocatedSection(DataRefImpl Sec) const { + auto R = EF.getSection(EShdr->sh_info); + if (!R) + report_fatal_error(errorToErrorCode(R.takeError()).message()); ++ if (EF.getHeader()->e_type != ELF::ET_REL && ++ !((*R)->sh_flags & ELF::SHF_ALLOC)) ++ return section_end(); + return section_iterator(SectionRef(toDRI(*R), this)); + } + +@@ -792,8 +801,6 @@ ELFObjectFile::getRelocationSymbol(DataRefImpl Rel) const { template uint64_t ELFObjectFile::getRelocationOffset(DataRefImpl Rel) const { diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 990c7c6ee1b9..0d57579d0c28 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -1472,9 +1472,8 @@ void RewriteInstance::discoverFileObjects() { // Read all relocations now that we have binary functions mapped. 
for (const auto &Section : InputFile->sections()) { - if (Section.relocation_begin() != Section.relocation_end()) { + if (Section.getRelocatedSection() != InputFile->section_end()) readRelocations(Section); - } } } From 55bb31201d84e42361153f583eb0129918a55629 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Thu, 31 May 2018 10:33:53 -0700 Subject: [PATCH 454/904] [BOLT-AArch64] Create cold symbols on demand Summary: Rework the logic we use for managing references to constant islands. Defer the creation of the cold versions to when we split the function and will need them. (cherry picked from commit 5237ed04b639fb5a02f14ada472e5aeeadc0f659) --- bolt/src/BinaryFunction.cpp | 82 ++++++++++++++++++++----------------- bolt/src/BinaryFunction.h | 64 +++++++++++------------------ 2 files changed, 69 insertions(+), 77 deletions(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index a591a87c98fc..3be3b5a6ed91 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -942,7 +942,7 @@ MCSymbol *BinaryFunction::getOrCreateLocalLabel(uint64_t Address, return LI->second; // For AArch64, check if this address is part of a constant island. 
- if (MCSymbol *IslandSym = getOrCreateIslandAccess(Address).first) { + if (MCSymbol *IslandSym = getOrCreateIslandAccess(Address)) { return IslandSym; } @@ -976,7 +976,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { if (BC.isAArch64()) { // Check if this is an access to a constant island and create bookkeeping // to keep track of it and emit it later as part of this function - if (MCSymbol *IslandSym = getOrCreateIslandAccess(TargetAddress).first) { + if (MCSymbol *IslandSym = getOrCreateIslandAccess(TargetAddress)) { return IslandSym; } else { // Detect custom code written in assembly that refers to arbitrary @@ -986,13 +986,15 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { auto IslandIter = BC.AddressToConstantIslandMap.lower_bound(TargetAddress); if (IslandIter != BC.AddressToConstantIslandMap.end()) { - MCSymbol *IslandSym, *ColdIslandSym; - std::tie(IslandSym, ColdIslandSym) = - IslandIter->second->getOrCreateProxyIslandAccess(TargetAddress, - this); - if (IslandSym) { - addConstantIslandDependency(IslandIter->second, IslandSym, - ColdIslandSym); + if (MCSymbol *IslandSym = + IslandIter->second->getOrCreateProxyIslandAccess( + TargetAddress, this)) { + /// Make this function depend on IslandIter->second because we have + /// a reference to its constant island. When emitting this function, + /// we will also emit IslandIter->second's constants. This only + /// happens in custom AArch64 assembly code. + IslandDependency.insert(IslandIter->second); + ProxyIslandSymbols[IslandSym] = IslandIter->second; return IslandSym; } } @@ -2434,18 +2436,6 @@ void BinaryFunction::setTrapOnEntry() { TrapsOnEntry = true; } -void BinaryFunction::addConstantIslandDependency(BinaryFunction *OtherBF, - MCSymbol *HotSymbol, - MCSymbol *ColdSymbol) { - IslandDependency.insert(OtherBF); - if (!ColdIslandSymbols.count(HotSymbol)) { - ColdIslandSymbols[HotSymbol] = ColdSymbol; - } - DEBUG(dbgs() << "BOLT-DEBUG: Constant island dependency added! 
" - << getPrintName() << " refers to " << OtherBF->getPrintName() - << "\n"); -} - void BinaryFunction::emitConstantIslands( MCStreamer &Streamer, bool EmitColdPart, BinaryFunction *OnBehalfOf) { @@ -2474,7 +2464,7 @@ void BinaryFunction::emitConstantIslands( << "\n"; // We split the island into smaller blocks and output labels between them. - auto IS = IslandSymbols.begin(); + auto IS = IslandOffsets.begin(); for (auto DataIter = DataOffsets.begin(); DataIter != DataOffsets.end(); ++DataIter) { uint64_t FunctionOffset = *DataIter; @@ -2498,9 +2488,9 @@ void BinaryFunction::emitConstantIslands( // Emit labels, relocs and data auto RI = MoveRelocations.lower_bound(FunctionOffset); - while ((IS != IslandSymbols.end() && IS->first < EndOffset) || + while ((IS != IslandOffsets.end() && IS->first < EndOffset) || (RI != MoveRelocations.end() && RI->first < EndOffset)) { - auto NextLabelOffset = IS == IslandSymbols.end() ? EndOffset : IS->first; + auto NextLabelOffset = IS == IslandOffsets.end() ? EndOffset : IS->first; auto NextRelOffset = RI == MoveRelocations.end() ? EndOffset : RI->first; auto NextStop = std::min(NextLabelOffset, NextRelOffset); assert(NextStop <= EndOffset && "internal overflow error"); @@ -2508,7 +2498,7 @@ void BinaryFunction::emitConstantIslands( Streamer.EmitBytes(FunctionContents.slice(FunctionOffset, NextStop)); FunctionOffset = NextStop; } - if (IS != IslandSymbols.end() && FunctionOffset == IS->first) { + if (IS != IslandOffsets.end() && FunctionOffset == IS->first) { // This is a slightly complex code to decide which label to emit. We // have 4 cases to handle: regular symbol, cold symbol, regular or cold // symbol being emitted on behalf of an external function. 
@@ -2521,7 +2511,7 @@ void BinaryFunction::emitConstantIslands( Streamer.EmitLabel(IS->second); else assert(hasName(IS->second->getName())); - } else { + } else if (ColdIslandSymbols.count(IS->second) != 0) { DEBUG(dbgs() << "BOLT-DEBUG: emitted label " << ColdIslandSymbols[IS->second]->getName() << '\n'); if (ColdIslandSymbols[IS->second]->isUndefined()) @@ -2534,13 +2524,11 @@ void BinaryFunction::emitConstantIslands( << '\n'); Streamer.EmitLabel(Sym); } - } else { - if (MCSymbol *Sym = - IslandProxies[OnBehalfOf][ColdIslandSymbols[IS->second]]) { - DEBUG(dbgs() << "BOLT-DEBUG: emitted label " << Sym->getName() - << '\n'); - Streamer.EmitLabel(Sym); - } + } else if (MCSymbol *Sym = + ColdIslandProxies[OnBehalfOf][IS->second]) { + DEBUG(dbgs() << "BOLT-DEBUG: emitted label " << Sym->getName() + << '\n'); + Streamer.EmitLabel(Sym); } } ++IS; @@ -2560,7 +2548,7 @@ void BinaryFunction::emitConstantIslands( Streamer.EmitBytes(FunctionContents.slice(FunctionOffset, EndOffset)); } } - assert(IS == IslandSymbols.end() && "some symbols were not emitted!"); + assert(IS == IslandOffsets.end() && "some symbols were not emitted!"); if (OnBehalfOf) return; @@ -2584,13 +2572,31 @@ void BinaryFunction::duplicateConstantIslands() { ++OpNum; continue; } - const auto *Symbol = BC.MIB->getTargetSymbol(Inst, OpNum); - auto ISym = ColdIslandSymbols.find(Symbol); - if (ISym == ColdIslandSymbols.end()) + auto *Symbol = BC.MIB->getTargetSymbol(Inst, OpNum); + // Check if this is an island symbol + if (!IslandSymbols.count(Symbol) && !ProxyIslandSymbols.count(Symbol)) continue; + + // Create cold symbol, if missing + auto ISym = ColdIslandSymbols.find(Symbol); + MCSymbol *ColdSymbol; + if (ISym != ColdIslandSymbols.end()) { + ColdSymbol = ISym->second; + } else { + ColdSymbol = BC.Ctx->getOrCreateSymbol(Symbol->getName() + ".cold"); + ColdIslandSymbols[Symbol] = ColdSymbol; + // Check if this is a proxy island symbol and update owner proxy map + if (ProxyIslandSymbols.count(Symbol)) { + 
BinaryFunction *Owner = ProxyIslandSymbols[Symbol]; + auto IProxiedSym = Owner->IslandProxies[this].find(Symbol); + Owner->ColdIslandProxies[this][IProxiedSym->second] = ColdSymbol; + } + } + + // Update instruction reference Operand = MCOperand::createExpr(BC.MIB->getTargetExprFor( Inst, - MCSymbolRefExpr::create(ISym->second, MCSymbolRefExpr::VK_None, + MCSymbolRefExpr::create(ColdSymbol, MCSymbolRefExpr::VK_None, *BC.Ctx), *BC.Ctx, 0)); ++OpNum; diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index bf5d8933400b..7fac9b5592e9 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -556,12 +556,14 @@ class BinaryFunction { /// Offsets in function that are data values in a constant island identified /// after disassembling - std::map IslandSymbols; + std::map IslandOffsets; + SmallPtrSet IslandSymbols; + std::map ProxyIslandSymbols; std::map ColdIslandSymbols; /// Keeps track of other functions we depend on because there is a reference /// to the constant islands in them. std::map> - IslandProxies; + IslandProxies, ColdIslandProxies; std::set IslandDependency; // The other way around // Blocks are kept sorted in the layout order. If we need to change the @@ -1730,62 +1732,46 @@ class BinaryFunction { /// hot code area while the second return value is the symbol for reference /// in the cold code area, as when the function is split the islands are /// duplicated. 
- std::pair getOrCreateIslandAccess(uint64_t Address) { - MCSymbol *Symbol, *ColdSymbol; + MCSymbol *getOrCreateIslandAccess(uint64_t Address) { + MCSymbol *Symbol; if (!isInConstantIsland(Address)) - return std::make_pair(nullptr, nullptr); + return nullptr; // Register our island at global namespace Symbol = BC.getOrCreateGlobalSymbol(Address, 0, 0, "ISLANDat"); // Internal bookkeeping const auto Offset = Address - getAddress(); - assert((!IslandSymbols.count(Offset) || IslandSymbols[Offset] == Symbol) && + assert((!IslandOffsets.count(Offset) || IslandOffsets[Offset] == Symbol) && "Inconsistent island symbol management"); - if (!IslandSymbols.count(Offset)) { - IslandSymbols[Offset] = Symbol; - } - if (!ColdIslandSymbols.count(Symbol)) { - ColdSymbol = BC.Ctx->getOrCreateSymbol(Symbol->getName() + ".cold"); - ColdIslandSymbols[Symbol] = ColdSymbol; - } else { - ColdSymbol = ColdIslandSymbols[Symbol]; + if (!IslandOffsets.count(Offset)) { + IslandOffsets[Offset] = Symbol; + IslandSymbols.insert(Symbol); } - return std::make_pair(Symbol, ColdSymbol); + return Symbol; } /// Called by an external function which wishes to emit references to constant /// island symbols of this function. We create a proxy for it, so we emit /// separate symbols when emitting our constant island on behalf of this other /// function. - std::pair + MCSymbol * getOrCreateProxyIslandAccess(uint64_t Address, BinaryFunction *Referrer) { - auto HotColdSymbols = getOrCreateIslandAccess(Address); - if (!HotColdSymbols.first) - return HotColdSymbols; - - MCSymbol *ProxyHot, *ProxyCold; - if (!IslandProxies[Referrer].count(HotColdSymbols.first)) { - ProxyHot = - BC.Ctx->getOrCreateSymbol(HotColdSymbols.first->getName() + - ".proxy.for." 
+ Referrer->getPrintName()); - ProxyCold = - BC.Ctx->getOrCreateSymbol(HotColdSymbols.second->getName() + + auto Symbol = getOrCreateIslandAccess(Address); + if (!Symbol) + return nullptr; + + MCSymbol *Proxy; + if (!IslandProxies[Referrer].count(Symbol)) { + Proxy = + BC.Ctx->getOrCreateSymbol(Symbol->getName() + ".proxy.for." + Referrer->getPrintName()); - IslandProxies[Referrer][HotColdSymbols.first] = ProxyHot; - IslandProxies[Referrer][HotColdSymbols.second] = ProxyCold; + IslandProxies[Referrer][Symbol] = Proxy; + IslandProxies[Referrer][Proxy] = Symbol; } - ProxyHot = IslandProxies[Referrer][HotColdSymbols.first]; - ProxyCold = IslandProxies[Referrer][HotColdSymbols.second]; - return std::make_pair(ProxyHot, ProxyCold); + Proxy = IslandProxies[Referrer][Symbol]; + return Proxy; } - /// Make this function depend on \p OtherBF because we have a reference to its - /// constant island. When emitting this function, we will also emit OtherBF's - /// constants. This only happens in custom AArch64 assembly code (either - /// poorly written code or over-optimized). - void addConstantIslandDependency(BinaryFunction *OtherBF, MCSymbol *HotSymbol, - MCSymbol *ColdSymbol); - /// Detects whether \p Address is inside a data region in this function /// (constant islands). bool isInConstantIsland(uint64_t Address) const { From 2d860decba5fa771701ee9f0911e633de46939ef Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 13 Jul 2018 10:49:41 -0700 Subject: [PATCH 455/904] [perf2bolt] Fix perf build-id matching Summary: Recent compiler tool chains can produce build-ids that are less than 40 characters long. Linux perf, however, always outputs 40 characters, expanding the string with 0's as needed. Fix the matching by only checking the string prefix. 
(cherry picked from commit f7f00b1653796ecc50b5f3163dc9b8c60912b6c4) --- bolt/src/DataAggregator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index 8b3ee0bd56af..74fab58155a3 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -1083,7 +1083,7 @@ DataAggregator::getFileNameForBuildID(StringRef FileBuildID) { if (!IDPair) return NoneType(); - if (IDPair->second == FileBuildID) + if (IDPair->second.startswith(FileBuildID)) return sys::path::filename(IDPair->first); } return NoneType(); From a7b14bdf54b94bfbb4cb52acaae6fcc6625b197d Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 13 Jul 2018 15:26:41 -0700 Subject: [PATCH 456/904] [perf2bolt] Enforce file matching in perf2bolt Summary: If the input binary does not have a build-id and the name does not match any file names in perf.data, then reject the binary, and issue an error message suggesting to rename it to one of the listed names from perf.data. (cherry picked from commit 97825b9e2ea30dbacad20e33607250092ceec92d) --- bolt/src/DataAggregator.cpp | 93 ++++++++++++++++++++++++------------- bolt/src/DataAggregator.h | 10 ++-- 2 files changed, 67 insertions(+), 36 deletions(-) diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index 74fab58155a3..b2d06b14d3b1 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -23,6 +23,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Regex.h" #include "llvm/Support/Timer.h" +#include #include @@ -309,9 +310,8 @@ void DataAggregator::processFileBuildID(StringRef FileBuildID) { exit(1); } } else if (*FileName != BinaryName) { - errs() << "PERF2BOLT-WARNING: build-id matched a different file name. 
" - "Using \"" << *FileName << "\" for profile parsing.\n"; - BinaryName = *FileName; + errs() << "PERF2BOLT-WARNING: build-id matched a different file name\n"; + BuildIDBinaryName = *FileName; } else { outs() << "PERF2BOLT: matched build-id and file name\n"; } @@ -991,39 +991,37 @@ std::error_code DataAggregator::parseMemEvents() { return std::error_code(); } -ErrorOr DataAggregator::parseTaskPID() { +ErrorOr> DataAggregator::parseTaskPID() { while (checkAndConsumeFS()) {} - auto CommNameStr = parseString(FieldSeparator, true); - if (std::error_code EC = CommNameStr.getError()) - return EC; - if (CommNameStr.get() != BinaryName.substr(0, 15)) { - consumeRestOfLine(); - return -1; - } - auto LineEnd = ParsingBuf.find_first_of("\n"); if (LineEnd == StringRef::npos) { reportError("expected rest of line"); Diag << "Found: " << ParsingBuf << "\n"; return make_error_code(llvm::errc::io_error); } - StringRef Line = ParsingBuf.substr(0, LineEnd); - if (Line.find("PERF_RECORD_COMM") != StringRef::npos) { - int64_t PID; - StringRef PIDStr = Line.rsplit(':').second.split('/').first; - if (PIDStr.getAsInteger(10, PID)) { - reportError("expected PID"); - Diag << "Found: " << PIDStr << "\n"; - return make_error_code(llvm::errc::io_error); - } - return PID; + if (Line.find("PERF_RECORD_COMM") == StringRef::npos) { + consumeRestOfLine(); + return std::make_pair(StringRef(), -1); + } + + auto FileName = Line.split(FieldSeparator).first; + if (FileName == "PERF_RECORD_COMM") + FileName = Line.rsplit(':').first.rsplit(FieldSeparator).second; + + int64_t PID; + StringRef PIDStr = Line.rsplit(':').second.split('/').first; + if (PIDStr.getAsInteger(10, PID)) { + reportError("expected PID"); + Diag << "Found: " << PIDStr << "\n"; + return make_error_code(llvm::errc::io_error); } consumeRestOfLine(); - return -1; + + return std::make_pair(FileName, PID); } std::error_code DataAggregator::parseTasks() { @@ -1031,30 +1029,59 @@ std::error_code DataAggregator::parseTasks() { 
NamedRegionTimer T("parseTasks", "Tasks parsing", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + std::multimap BinaryPIDs; while (hasData()) { - auto PIDRes = parseTaskPID(); - if (std::error_code EC = PIDRes.getError()) + auto NamePIDRes = parseTaskPID(); + if (std::error_code EC = NamePIDRes.getError()) return EC; - auto PID = PIDRes.get(); - if (PID == -1) { + auto NamePIDPair = NamePIDRes.get(); + if (NamePIDPair.second == -1) continue; + + BinaryPIDs.insert(NamePIDPair); + } + + DEBUG( + dbgs() << "FileName -> PID mapping:\n"; + for (const auto &Pair : BinaryPIDs) { + dbgs() << " " << Pair.first << " : " << Pair.second << '\n'; } + ); + + auto NameToUse = BinaryName.substr(0, 15); + if (BinaryPIDs.count(NameToUse) == 0 && !BuildIDBinaryName.empty()) { + errs() << "PERF2BOLT-WARNING: using \"" << BuildIDBinaryName + << "\" for profile matching\n"; + NameToUse = BuildIDBinaryName.substr(0, 15); + } - PIDs.insert(PID); + auto Range = BinaryPIDs.equal_range(NameToUse); + for (auto I = Range.first; I != Range.second; ++I) { + PIDs.insert(I->second); } + if (!PIDs.empty()) { outs() << "PERF2BOLT: Input binary is associated with " << PIDs.size() << " PID(s)\n"; } else { if (errs().has_colors()) - errs().changeColor(raw_ostream::YELLOW); - errs() << "PERF2BOLT-WARNING: Could not bind input binary to a PID - will " - "parse all samples in perf data. 
This could result in corrupted " - "samples for the input binary if system-wide profile collection " - "was used.\n"; + errs().changeColor(raw_ostream::RED); + errs() << "PERF2BOLT-ERROR: could not find a profile matching binary \"" + << BinaryName << "\"."; + if (!BinaryPIDs.empty()) { + errs() << " Profile for the following binary name(s) is available:\n"; + for (auto I = BinaryPIDs.begin(), IE = BinaryPIDs.end(); I != IE; + I = BinaryPIDs.upper_bound(I->first)) { + errs() << " " << I->first << '\n'; + } + errs() << "Please rename the input binary.\n"; + } else { + errs() << " Failed to extract any binary name from a profile.\n"; + } if (errs().has_colors()) errs().resetColor(); + exit(1); } return std::error_code(); diff --git a/bolt/src/DataAggregator.h b/bolt/src/DataAggregator.h index becce32b91e1..d56663177c5a 100644 --- a/bolt/src/DataAggregator.h +++ b/bolt/src/DataAggregator.h @@ -87,6 +87,10 @@ class DataAggregator : public DataReader { /// Our sampled binary name to look for in perf.data std::string BinaryName; + /// Name of the binary with matching build-id from perf.data if different + /// from BinaryName; + std::string BuildIDBinaryName; + DenseSet PIDs; /// References to core BOLT data structures @@ -178,9 +182,9 @@ class DataAggregator : public DataReader { std::error_code parseMemEvents(); /// Parse a single line of a PERF_RECORD_COMM event looking for an association - /// between the binary name and its PID. Return -1 if binary name is not - /// correct. - ErrorOr parseTaskPID(); + /// between the binary name and its PID. On success return a + /// pair. + ErrorOr> parseTaskPID(); /// Parse the full output generated by perf script to report PERF_RECORD_COMM /// events with the association of binary file names and their PIDs. 
From 51359beb9b91e16aaee5d5cfd12fdddba93c254f Mon Sep 17 00:00:00 2001 From: Laith Saed Sakka Date: Sun, 8 Jul 2018 12:14:08 -0700 Subject: [PATCH 457/904] Add initial function injection support Summary: This diff have the API needed to inject functions using bolt. In relocation mode injected functions are emitted between the cold and the hot functions, In non-reloc mode injected functions are emitted a next text section. (cherry picked from commit fb9ea8cf29072ab4e945b9c2ea5823f317513409) --- bolt/src/BinaryContext.cpp | 12 ++++ bolt/src/BinaryContext.h | 14 ++++- bolt/src/BinaryFunction.cpp | 36 ++++++++---- bolt/src/BinaryFunction.h | 38 ++++++++---- bolt/src/RewriteInstance.cpp | 110 +++++++++++++++++++++++++++++------ 5 files changed, 169 insertions(+), 41 deletions(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index 833924d4f667..80c0dca0a4f9 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -61,6 +61,9 @@ BinaryContext::~BinaryContext() { for (auto *Section : Sections) { delete Section; } + for (auto *InjectedFunction : InjectedBinaryFunctions) { + delete InjectedFunction; + } clearBinaryData(); } @@ -1071,3 +1074,12 @@ void BinaryContext::exitWithBugReport(StringRef Message, errs() << "\n=======================================\n"; exit(1); } + +BinaryFunction * +BinaryContext::createInjectedBinaryFunction(const std::string &Name, + bool IsSimple) { + InjectedBinaryFunctions.push_back(new BinaryFunction(Name, *this, IsSimple)); + auto *BF = InjectedBinaryFunctions.back(); + setSymbolToFunctionMap(BF->getSymbol(), BF); + return BF; +} diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index b320fbbf62cf..d26314f3c228 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -136,8 +136,11 @@ class BinaryContext { /// Low level section registration. 
BinarySection ®isterSection(BinarySection *Section); -public: + /// Functions injected by BOLT + std::vector InjectedBinaryFunctions; + +public: /// [name] -> [BinaryData*] map used for global symbol resolution. using SymbolMapType = std::map; SymbolMapType GlobalSymbols; @@ -198,6 +201,15 @@ class BinaryContext { /// Populate \p GlobalMemData. This should be done after all symbol discovery /// is complete, e.g. after building CFGs for all functions. void assignMemData(); + + /// Create BOLT-injected function + BinaryFunction *createInjectedBinaryFunction(const std::string &Name, + bool IsSimple = true); + + std::vector &getInjectedBinaryFunctions() { + return InjectedBinaryFunctions; + } + public: /// Map address to a constant island owner (constant data in code section) std::map AddressToConstantIslandMap; diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 3be3b5a6ed91..d974e7435569 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -290,7 +290,6 @@ std::string BinaryFunction::getDemangledName() const { return NameStr; } - BinaryBasicBlock * BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) { if (Offset > Size) @@ -401,6 +400,9 @@ bool BinaryFunction::isForwardCall(const MCSymbol *CalleeSymbol) const { // should have been ordered with a stable sort. const auto *CalleeBF = BC.getFunctionForSymbol(CalleeSymbol); if (CalleeBF) { + if(CalleeBF->isInjected()) + return true; + if (hasValidIndex() && CalleeBF->hasValidIndex()) { return getIndex() < CalleeBF->getIndex(); } else if (hasValidIndex() && !CalleeBF->hasValidIndex()) { @@ -428,7 +430,8 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, if (!opts::shouldProcess(*this) || !opts::shouldPrint(*this)) return; - StringRef SectionName = Section.getName(); + StringRef SectionName = + IsInjected ? 
"" : InputSection->getName(); OS << "Binary Function \"" << *this << "\" " << Annotation << " {"; if (Names.size() > 1) { OS << "\n Other names : "; @@ -2371,13 +2374,15 @@ void BinaryFunction::emitBodyRaw(MCStreamer *Streamer) { assert(false && "cannot emit raw body unless relocation accuracy is guaranteed"); + assert(!isInjected() && "cannot emit raw body of injected function"); + // Raw contents of the function. - StringRef SectionContents = Section.getContents(); + StringRef SectionContents = InputSection->getContents(); // Raw contents of the function. StringRef FunctionContents = - SectionContents.substr(getAddress() - Section.getAddress(), - getSize()); + SectionContents.substr(getAddress() - InputSection->getAddress(), + getSize()); if (opts::Verbosity) outs() << "BOLT-INFO: emitting function " << *this << " in raw (" @@ -2451,12 +2456,15 @@ void BinaryFunction::emitConstantIslands( assert((!OnBehalfOf || IslandProxies[OnBehalfOf].size() > 0) && "spurious OnBehalfOf constant island emission"); + + assert(!isInjected() && + "injected functions should not have constant islands"); // Raw contents of the function. - StringRef SectionContents = Section.getContents(); + StringRef SectionContents = InputSection->getContents(); // Raw contents of the function. StringRef FunctionContents = - SectionContents.substr(getAddress() - Section.getAddress(), + SectionContents.substr(getAddress() - InputSection->getAddress(), getMaxSize()); if (opts::Verbosity && !OnBehalfOf) @@ -3148,10 +3156,10 @@ void BinaryFunction::insertBasicBlocks( std::vector> &&NewBBs, const bool UpdateLayout, const bool UpdateCFIState) { - const auto StartIndex = getIndex(Start); + const auto StartIndex = Start ? 
getIndex(Start) : -1; const auto NumNewBlocks = NewBBs.size(); - BasicBlocks.insert(BasicBlocks.begin() + StartIndex + 1, + BasicBlocks.insert(BasicBlocks.begin() + (StartIndex + 1), NumNewBlocks, nullptr); @@ -3219,8 +3227,16 @@ void BinaryFunction::updateCFIState(BinaryBasicBlock *Start, } } -void BinaryFunction::updateLayout(BinaryBasicBlock* Start, +void BinaryFunction::updateLayout(BinaryBasicBlock *Start, const unsigned NumNewBlocks) { + // If start not provided insert new blocks at the beginning + if (!Start) { + BasicBlocksLayout.insert(layout_begin(), BasicBlocks.begin(), + BasicBlocks.begin() + NumNewBlocks); + updateLayoutIndices(); + return; + } + // Insert new blocks in the layout immediately after Start. auto Pos = std::find(layout_begin(), layout_end(), Start); assert(Pos != layout_end()); diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index 7fac9b5592e9..5c7e09b0578e 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -249,8 +249,10 @@ class BinaryFunction { using BasicBlockOrderType = std::vector; -private: + /// Mark injected functions + bool IsInjected = false; +private: /// Current state of the function. State CurrentState{State::Empty}; @@ -258,7 +260,7 @@ class BinaryFunction { std::vector Names; /// Containing section - BinarySection &Section; + BinarySection *InputSection = nullptr; /// Address of the function in memory. Also could be an offset from /// base address for position independent binaries. @@ -407,11 +409,6 @@ class BinaryFunction { return *this; } - BinaryFunction &updateState(BinaryFunction::State State) { - CurrentState = State; - return *this; - } - /// Update the indices of all the basic blocks starting at StartIndex. void updateBBIndices(const unsigned StartIndex); @@ -680,11 +677,11 @@ class BinaryFunction { friend class RewriteInstance; friend class BinaryContext; - /// Creation should be handled by RewriteInstance::createBinaryFunction(). 
+ /// Creation should be handled by RewriteInstance or BinaryContext BinaryFunction(const std::string &Name, BinarySection &Section, uint64_t Address, uint64_t Size, BinaryContext &BC, bool IsSimple) : - Names({Name}), Section(Section), Address(Address), + Names({Name}), InputSection(&Section), Address(Address), Size(Size), BC(BC), IsSimple(IsSimple), CodeSectionName(".local.text." + Name), ColdCodeSectionName(".local.cold.text." + Name), @@ -692,6 +689,16 @@ class BinaryFunction { OutputSymbol = BC.Ctx->getOrCreateSymbol(Name); } + /// This constructor is used to create an injected function + BinaryFunction(const std::string &Name, BinaryContext &BC, bool IsSimple) + : Names({Name}), Address(0), Size(0), BC(BC), IsSimple(IsSimple), + CodeSectionName(".local.text." + Name), + ColdCodeSectionName(".local.cold.text." + Name), + FunctionNumber(++Count) { + OutputSymbol = BC.Ctx->getOrCreateSymbol(Name); + IsInjected = true; + } + public: BinaryFunction(BinaryFunction &&) = default; @@ -784,6 +791,11 @@ class BinaryFunction { return iterator_range(cie_begin(), cie_end()); } + BinaryFunction &updateState(BinaryFunction::State State) { + CurrentState = State; + return *this; + } + /// Update layout of basic blocks used for output. void updateBasicBlockLayout(BasicBlockOrderType &NewLayout, bool SavePrevLayout) { @@ -985,9 +997,13 @@ class BinaryFunction { return getState() == State::Emitted; } - /// Return containing file section. BinarySection &getSection() const { - return Section; + assert(InputSection); + return *InputSection; + } + + bool isInjected() const { + return IsInjected; } /// Return original address of the function (or offset from base for PIC). 
diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 0d57579d0c28..181781d4a3da 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -2632,22 +2632,22 @@ void RewriteInstance::runOptimizationPasses() { void RewriteInstance::emitFunction(MCStreamer &Streamer, BinaryFunction &Function, bool EmitColdPart) { - if (Function.getSize() == 0) + if (Function.size() == 0) return; if (Function.getState() == BinaryFunction::State::Empty) return; MCSection *Section; - if (BC->HasRelocations) { + if (BC->HasRelocations || Function.isInjected()) { Section = BC->MOFI->getTextSection(); } else { // Each fuction is emmitted into its own section. Section = BC->Ctx->getELFSection(EmitColdPart ? Function.getColdCodeSectionName() : Function.getCodeSectionName(), - ELF::SHT_PROGBITS, - ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); + ELF::SHT_PROGBITS, + ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); } Section->setHasInstructions(true); @@ -2837,6 +2837,16 @@ void RewriteInstance::emitFunctions() { } ColdFunctionSeen = true; + + // Emit injected functions hot part + for (auto *InjectedFunction : BC->getInjectedBinaryFunctions()) + emitFunction(*Streamer, *InjectedFunction, /*EmitColdPart=*/false); + + // Emit injected functions cold part + for (auto *InjectedFunction : BC->getInjectedBinaryFunctions()) + emitFunction(*Streamer, *InjectedFunction, /*EmitColdPart=*/true); + + //TODO: this code is unreachable if all functions are hot if (opts::SplitFunctions != BinaryFunction::ST_NONE) { DEBUG(dbgs() << "BOLT-DEBUG: generating code for split functions\n"); for (auto *FPtr : SortedFunctions) { @@ -2866,6 +2876,14 @@ void RewriteInstance::emitFunctions() { ++CurrentIndex; } + // Emit injected functions in non-reloc mode + if (!BC->HasRelocations) { + for (auto *InjectedFunction : BC->getInjectedBinaryFunctions()){ + emitFunction(*Streamer, *InjectedFunction, /*EmitColdPart=*/false); + emitFunction(*Streamer, *InjectedFunction, /*EmitColdPart=*/true); + } + } 
+ if (!ColdFunctionSeen && opts::HotText) { Streamer->SwitchSection(BC->MOFI->getTextSection()); Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("__hot_end")); @@ -3007,6 +3025,28 @@ void RewriteInstance::mapTextSections(orc::VModuleKey Key) { OLT->mapSectionAddress(Key, TextSection->getSectionID(), NewTextSectionStartAddress); } else { + + // Prepare .text section for injected functions + auto TextSection = BC->getUniqueSectionByName(".text"); + assert(TextSection && ".text not found in output"); + if (TextSection->hasValidSectionID()) { + uint64_t NewTextSectionOffset = 0; + auto Padding = OffsetToAlignment(NewTextSectionStartAddress, PageAlign); + NextAvailableAddress += Padding; + NewTextSectionStartAddress = NextAvailableAddress; + NewTextSectionOffset = getFileOffsetForAddress(NextAvailableAddress); + NextAvailableAddress += Padding + TextSection->getOutputSize(); + TextSection->setFileAddress(NewTextSectionStartAddress); + TextSection->setFileOffset(NewTextSectionOffset); + + DEBUG(dbgs() << "BOLT: mapping .text 0x" + << Twine::utohexstr(TextSection->getAllocAddress()) + << " to 0x" << Twine::utohexstr(NewTextSectionStartAddress) + << '\n'); + OLT->mapSectionAddress(Key, TextSection->getSectionID(), + NewTextSectionStartAddress); + } + for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; if (!Function.isSimple() || !opts::shouldProcess(Function)) @@ -3014,7 +3054,7 @@ void RewriteInstance::mapTextSections(orc::VModuleKey Key) { auto TooLarge = false; auto FuncSection = - BC->getUniqueSectionByName(Function.getCodeSectionName()); + BC->getUniqueSectionByName(Function.getCodeSectionName()); assert(FuncSection && "cannot find section for function"); DEBUG(dbgs() << "BOLT: mapping 0x" << Twine::utohexstr(FuncSection->getAllocAddress()) @@ -3160,16 +3200,15 @@ void RewriteInstance::mapDataSections(orc::VModuleKey Key) { } void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { - for (auto &BFI : BinaryFunctions) { - auto &Function = 
BFI.second; + auto updateOutputValue = [&](BinaryFunction &Function) { if (!Function.isEmitted()) { + assert(!Function.isInjected() && "injected function should be emitted"); Function.setOutputAddress(Function.getAddress()); Function.setOutputSize(Function.getSize()); - continue; + return; } - - if (BC->HasRelocations) { + if (BC->HasRelocations || Function.isInjected()) { const auto BaseAddress = NewTextSectionStartAddress; const auto StartOffset = Layout.getSymbolOffset(*Function.getSymbol()); const auto EndOffset = @@ -3206,15 +3245,15 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { // Update basic block output ranges only for the debug info. if (!opts::UpdateDebugSections) - continue; + return; // Output ranges should match the input if the body hasn't changed. if (!Function.isSimple() && !BC->HasRelocations) - continue; + return; // AArch64 may have functions that only contains a constant island (no code) if (Function.layout_begin() == Function.layout_end()) - continue; + return; BinaryBasicBlock *PrevBB = nullptr; for (auto BBI = Function.layout_begin(), BBE = Function.layout_end(); @@ -3235,7 +3274,7 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { auto PrevBBEndAddress = Address; if (BB->isCold() != PrevBB->isCold()) { PrevBBEndAddress = - Function.getOutputAddress() + Function.getOutputSize(); + Function.getOutputAddress() + Function.getOutputSize(); } PrevBB->setOutputEndAddress(PrevBBEndAddress); } @@ -3244,6 +3283,15 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { PrevBB->setOutputEndAddress(PrevBB->isCold() ? 
Function.cold().getAddress() + Function.cold().getImageSize() : Function.getOutputAddress() + Function.getOutputSize()); + }; + + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + updateOutputValue(Function); + } + + for (auto *InjectedFunction : BC->getInjectedBinaryFunctions()) { + updateOutputValue(*InjectedFunction); } } @@ -3949,6 +3997,30 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { return IslandSizes[BF] = BF->estimateConstantIslandSize(); }; + // Add symbols of injected functions + for (BinaryFunction *Function : BC->getInjectedBinaryFunctions()) { + Elf_Sym NewSymbol; + NewSymbol.st_shndx = NewTextSectionIndex; + NewSymbol.st_value = Function->getOutputAddress(); + NewSymbol.st_name = AddToStrTab(Function->getPrintName()); + NewSymbol.st_size = Function->getOutputSize(); + NewSymbol.st_other = 0; + NewSymbol.setBindingAndType(ELF::STB_LOCAL, ELF::STT_FUNC); + Write(0, reinterpret_cast(&NewSymbol), sizeof(NewSymbol)); + + if (Function->isSplit()) { + auto NewColdSym = NewSymbol; + NewColdSym.setType(ELF::STT_NOTYPE); + SmallVector Buf; + NewColdSym.st_name = AddToStrTab( + Twine(Function->getPrintName()).concat(".cold.0").toStringRef(Buf)); + NewColdSym.st_value = Function->cold().getAddress(); + NewColdSym.st_size = Function->cold().getImageSize(); + Write(0, reinterpret_cast(&NewColdSym), + sizeof(NewColdSym)); + } + } + for (const Elf_Sym &Symbol : cantFail(Obj->symbols(Section))) { auto NewSymbol = Symbol; const auto *Function = getBinaryFunctionAtAddress(Symbol.st_value); @@ -4121,12 +4193,12 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { if (opts::HotText && !IsHotTextUpdated && !PatchExisting) { addSymbol("__hot_start"); addSymbol("__hot_end"); - } + } - if (opts::HotData && !IsHotDataUpdated && !PatchExisting) { - addSymbol("__hot_data_start"); - addSymbol("__hot_data_end"); - } + if (opts::HotData && !IsHotDataUpdated && !PatchExisting) { + addSymbol("__hot_data_start"); + 
addSymbol("__hot_data_end"); + } }; // Update dynamic symbol table. From a92a8288ad4b738145508ef867ba142ec8315b84 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 17 Jul 2018 18:31:46 -0700 Subject: [PATCH 458/904] [BOLT] Add parser for pre-aggregated perf data Summary: The regular perf2bolt aggregation job is to read perf output directly. However, if the data is coming from a database instead of perf, one could write a query to produce a pre-aggregated file. This function deals with this case. The pre-aggregated file contains aggregated LBR data, but without binary knowledge. BOLT will parse it and, using information from the disassembled binary, augment it with fall-through edge frequency information. After this step is finished, this data can be either written to disk to be consumed by BOLT later, or can be used by BOLT immediately if kept in memory. File format syntax: {B|F|f} [:] [:] [] B - indicates an aggregated branch F - an aggregated fall-through (trace) f - an aggregated fall-through with external origin - used to disambiguate between a return hitting a basic block head and a regular internal jump to the block - build id of the object containing the start address. We can skip it for the main binary and use "X" for an unknown object. This will save some space and facilitate human parsing. - hex offset from the object base load address (0 for the main executable unless it's PIE) to the start address. , - same for the end address. - total aggregated count of the branch or a fall-through. - the number of times the branch was mispredicted. Omitted for fall-throughs. 
Example F 41be50 41be50 3 F 41be90 41be90 4 f 41be90 41be90 7 B 4b1942 39b57f0 3 0 B 4b196f 4b19e0 2 0 (cherry picked from commit d8ea85dbf0af992a3bdc91735abd4723c7932564) --- bolt/src/BinaryFunction.h | 3 +- bolt/src/BinaryFunctionProfile.cpp | 5 +- bolt/src/DataAggregator.cpp | 267 +++++++++++++++++++++--- bolt/src/DataAggregator.h | 76 ++++++- bolt/src/DataReader.cpp | 32 ++- bolt/src/DataReader.h | 9 +- bolt/test/X86/Inputs/blarge.yaml | 136 ++++++++++++ bolt/test/X86/Inputs/pre-aggregated.txt | 8 + bolt/test/X86/pre-aggregated-perf.test | 42 ++++ 9 files changed, 519 insertions(+), 59 deletions(-) create mode 100644 bolt/test/X86/Inputs/blarge.yaml create mode 100644 bolt/test/X86/Inputs/pre-aggregated.txt create mode 100644 bolt/test/X86/pre-aggregated-perf.test diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index 5c7e09b0578e..bd7bb1e0cd43 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -2138,7 +2138,8 @@ class BinaryFunction { /// Return a vector of offsets corresponding to a trace in a function /// (see recordTrace() above). Optional, 16>> - getFallthroughsInTrace(const LBREntry &First, const LBREntry &Second); + getFallthroughsInTrace(const LBREntry &First, const LBREntry &Second, + uint64_t Count = 1); /// Returns an estimate of the function's hot part after splitting. 
/// This is a very rough estimate, as with C++ exceptions there are diff --git a/bolt/src/BinaryFunctionProfile.cpp b/bolt/src/BinaryFunctionProfile.cpp index ebc7e280fdd6..0aa5cef805d9 100644 --- a/bolt/src/BinaryFunctionProfile.cpp +++ b/bolt/src/BinaryFunctionProfile.cpp @@ -414,10 +414,11 @@ void BinaryFunction::postProcessProfile() { Optional, 16>> BinaryFunction::getFallthroughsInTrace(const LBREntry &FirstLBR, - const LBREntry &SecondLBR) { + const LBREntry &SecondLBR, + uint64_t Count) { SmallVector, 16> Res; - if (!recordTrace(FirstLBR, SecondLBR, 1, &Res)) + if (!recordTrace(FirstLBR, SecondLBR, Count, &Res)) return NoneType(); return Res; diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index b2d06b14d3b1..bddc4c3cdd78 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -43,6 +43,13 @@ BasicAggregation("nl", cl::ZeroOrMore, cl::cat(AggregatorCategory)); +static cl::opt +ReadPreAggregated("pa", + cl::desc("skip perf and read data from a pre-aggregated file format"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(AggregatorCategory)); + static cl::opt IgnoreBuildID("ignore-build-id", cl::desc("continue even if build-ids in input binary and perf.data mismatch"), @@ -79,6 +86,11 @@ void DataAggregator::start(StringRef PerfDataFilename) { this->PerfDataFilename = PerfDataFilename; outs() << "PERF2BOLT: Starting data aggregation job for " << PerfDataFilename << "\n"; + + // Don't launch perf for pre-aggregated files + if (opts::ReadPreAggregated) + return; + findPerfExecutable(); launchPerfBranchEventsNoWait(); launchPerfMemEventsNoWait(); @@ -86,6 +98,9 @@ void DataAggregator::start(StringRef PerfDataFilename) { } void DataAggregator::abort() { + if (opts::ReadPreAggregated) + return; + std::string Error; // Kill subprocesses in case they are not finished @@ -227,6 +242,9 @@ bool DataAggregator::launchPerfTasksNoWait() { } void DataAggregator::processFileBuildID(StringRef FileBuildID) { + if 
(opts::ReadPreAggregated) + return; + SmallVector Argv; SmallVector OutputPath; SmallVector ErrPath; @@ -322,6 +340,9 @@ void DataAggregator::processFileBuildID(StringRef FileBuildID) { } bool DataAggregator::checkPerfDataMagic(StringRef FileName) { + if (opts::ReadPreAggregated) + return true; + int FD; if (sys::fs::openFileForRead(FileName, FD)) { return false; @@ -356,6 +377,38 @@ void DataAggregator::deleteTempFiles() { deleteTempFile(PerfTasksOutputPath.data()); } +bool DataAggregator::processPreAggregated() { + std::string Error; + + auto MB = MemoryBuffer::getFileOrSTDIN(PerfDataFilename); + if (std::error_code EC = MB.getError()) { + errs() << "PERF2BOLT-ERROR: cannot open " << PerfDataFilename << ": " + << EC.message() << "\n"; + exit(1); + } + + FileBuf.reset(MB->release()); + ParsingBuf = FileBuf->getBuffer(); + Col = 0; + Line = 1; + if (parseAggregatedLBRSamples()) { + outs() << "PERF2BOLT: Failed to parse samples\n"; + exit(1); + } + + // Mark all functions with registered events as having a valid profile. + for (auto &BFI : *BFs) { + auto &BF = BFI.second; + if (BF.getBranchData()) { + const auto Flags = opts::BasicAggregation ? 
BinaryFunction::PF_SAMPLE + : BinaryFunction::PF_LBR; + BF.markProfiled(Flags); + } + } + + return true; +} + bool DataAggregator::aggregate(BinaryContext &BC, std::map &BFs) { std::string Error; @@ -363,6 +416,9 @@ bool DataAggregator::aggregate(BinaryContext &BC, this->BC = &BC; this->BFs = &BFs; + if (opts::ReadPreAggregated) + return processPreAggregated(); + outs() << "PERF2BOLT: Waiting for perf tasks collection to finish...\n"; auto PI1 = sys::Wait(TasksPI, 0, true, &Error); @@ -517,8 +573,9 @@ DataAggregator::doSample(BinaryFunction &Func, uint64_t Address) { return true; } -bool -DataAggregator::doIntraBranch(BinaryFunction &Func, const LBREntry &Branch) { +bool DataAggregator::doIntraBranch(BinaryFunction &Func, uint64_t From, + uint64_t To, uint64_t Count, + uint64_t Mispreds) { FuncBranchData *AggrData = Func.getBranchData(); if (!AggrData) { AggrData = &FuncsToBranches[Func.getNames()[0]]; @@ -526,21 +583,19 @@ DataAggregator::doIntraBranch(BinaryFunction &Func, const LBREntry &Branch) { Func.setBranchData(AggrData); } - AggrData->bumpBranchCount(Branch.From - Func.getAddress(), - Branch.To - Func.getAddress(), - Branch.Mispred); + AggrData->bumpBranchCount(From - Func.getAddress(), To - Func.getAddress(), + Count, Mispreds); return true; } bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, - BinaryFunction *ToFunc, - const LBREntry &Branch) { + BinaryFunction *ToFunc, uint64_t From, + uint64_t To, uint64_t Count, + uint64_t Mispreds) { FuncBranchData *FromAggrData{nullptr}; FuncBranchData *ToAggrData{nullptr}; StringRef SrcFunc; StringRef DstFunc; - auto From = Branch.From; - auto To = Branch.To; if (FromFunc) { SrcFunc = FromFunc->getNames()[0]; FromAggrData = FromFunc->getBranchData(); @@ -551,7 +606,7 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, } From -= FromFunc->getAddress(); - FromFunc->recordExit(From, Branch.Mispred); + FromFunc->recordExit(From, Mispreds, Count); } if (ToFunc) { DstFunc = ToFunc->getNames()[0]; 
@@ -563,44 +618,44 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, } To -= ToFunc->getAddress(); - ToFunc->recordEntry(To, Branch.Mispred); + ToFunc->recordEntry(To, Mispreds, Count); } if (FromAggrData) FromAggrData->bumpCallCount(From, Location(!DstFunc.empty(), DstFunc, To), - Branch.Mispred); + Count, Mispreds); if (ToAggrData) ToAggrData->bumpEntryCount(Location(!SrcFunc.empty(), SrcFunc, From), To, - Branch.Mispred); + Count, Mispreds); return true; } -bool DataAggregator::doBranch(const LBREntry &Branch) { - auto *FromFunc = getBinaryFunctionContainingAddress(Branch.From); - auto *ToFunc = getBinaryFunctionContainingAddress(Branch.To); +bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count, + uint64_t Mispreds) { + auto *FromFunc = getBinaryFunctionContainingAddress(From); + auto *ToFunc = getBinaryFunctionContainingAddress(To); if (!FromFunc && !ToFunc) return false; if (FromFunc == ToFunc) { - FromFunc->recordBranch(Branch.From - FromFunc->getAddress(), - Branch.To - FromFunc->getAddress(), - 1, - Branch.Mispred); - return doIntraBranch(*FromFunc, Branch); + FromFunc->recordBranch(From - FromFunc->getAddress(), + To - FromFunc->getAddress(), Count, Mispreds); + return doIntraBranch(*FromFunc, From, To, Count, Mispreds); } - return doInterBranch(FromFunc, ToFunc, Branch); + return doInterBranch(FromFunc, ToFunc, From, To, Count, Mispreds); } -bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second) { +bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second, + uint64_t Count) { auto *FromFunc = getBinaryFunctionContainingAddress(First.To); auto *ToFunc = getBinaryFunctionContainingAddress(Second.From); if (!FromFunc || !ToFunc) { - ++NumLongRangeTraces; + NumLongRangeTraces += Count; return false; } if (FromFunc != ToFunc) { - ++NumInvalidTraces; + NumInvalidTraces += Count; DEBUG(dbgs() << "Trace starting in " << FromFunc->getPrintName() << " @ " << Twine::utohexstr(First.To - 
FromFunc->getAddress()) << " and ending in " << ToFunc->getPrintName() << " @ " @@ -610,17 +665,15 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second) { return false; } - auto FTs = FromFunc->getFallthroughsInTrace(First, Second); + auto FTs = FromFunc->getFallthroughsInTrace(First, Second, Count); if (!FTs) { - ++NumInvalidTraces; + NumInvalidTraces += Count; return false; } for (const auto &Pair : *FTs) { - doIntraBranch(*FromFunc, - LBREntry{Pair.first + FromFunc->getAddress(), - Pair.second + FromFunc->getAddress(), - false}); + doIntraBranch(*FromFunc, Pair.first + FromFunc->getAddress(), + Pair.second + FromFunc->getAddress(), Count, false); } return true; @@ -802,6 +855,83 @@ ErrorOr DataAggregator::parseMemSample() { return PerfMemSample{PCRes.get(), AddrRes.get()}; } +ErrorOr DataAggregator::parseLocationOrOffset() { + auto parseOffset = [this]() -> ErrorOr { + auto Res = parseHexField(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + return Location(Res.get()); + }; + + auto Sep = ParsingBuf.find_first_of(" \n"); + if (Sep == StringRef::npos) + return parseOffset(); + auto LookAhead = ParsingBuf.substr(0, Sep); + if (LookAhead.find_first_of(":") == StringRef::npos) + return parseOffset(); + + auto BuildID = parseString(':'); + if (std::error_code EC = BuildID.getError()) + return EC; + auto Offset = parseHexField(FieldSeparator); + if (std::error_code EC = Offset.getError()) + return EC; + return Location(true, BuildID.get(), Offset.get()); +} + +ErrorOr DataAggregator::parseAggregatedLBREntry() { + while (checkAndConsumeFS()) {} + + auto TypeOrErr = parseString(FieldSeparator); + if (std::error_code EC = TypeOrErr.getError()) + return EC; + auto Type{AggregatedLBREntry::BRANCH}; + if (TypeOrErr.get() == "B") { + Type = AggregatedLBREntry::BRANCH; + } else if (TypeOrErr.get() == "F") { + Type = AggregatedLBREntry::FT; + } else if (TypeOrErr.get() == "f") { + Type = AggregatedLBREntry::FT_EXTERNAL_ORIGIN; 
+ } else { + reportError("expected B, F or f"); + return make_error_code(llvm::errc::io_error); + } + + while (checkAndConsumeFS()) {} + auto From = parseLocationOrOffset(); + if (std::error_code EC = From.getError()) + return EC; + + while (checkAndConsumeFS()) {} + auto To = parseLocationOrOffset(); + if (std::error_code EC = To.getError()) + return EC; + + while (checkAndConsumeFS()) {} + auto Frequency = parseNumberField(FieldSeparator, + Type != AggregatedLBREntry::BRANCH); + if (std::error_code EC = Frequency.getError()) + return EC; + + uint64_t Mispreds{0}; + if (Type == AggregatedLBREntry::BRANCH) { + while (checkAndConsumeFS()) {} + auto MispredsOrErr = parseNumberField(FieldSeparator, true); + if (std::error_code EC = MispredsOrErr.getError()) + return EC; + Mispreds = static_cast(MispredsOrErr.get()); + } + + if (!checkAndConsumeNewLine()) { + reportError("expected end of line"); + return make_error_code(llvm::errc::io_error); + } + + return AggregatedLBREntry{From.get(), To.get(), + static_cast(Frequency.get()), Mispreds, + Type}; +} + bool DataAggregator::hasData() { if (ParsingBuf.size() == 0) return false; @@ -836,7 +966,7 @@ std::error_code DataAggregator::parseBranchEvents() { doTrace(LBR, *NextLBR); ++NumTraces; } - doBranch(LBR); + doBranch(LBR.From, LBR.To, 1, LBR.Mispred); NextLBR = &LBR; } } @@ -991,6 +1121,79 @@ std::error_code DataAggregator::parseMemEvents() { return std::error_code(); } +std::error_code DataAggregator::parseAggregatedLBRSamples() { + outs() << "PERF2BOLT: Aggregating...\n"; + NamedRegionTimer T("parseAggregated", "Aggregated LBR parsing", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); + uint64_t NumAggrEntries{0}; + uint64_t NumTraces{0}; + while (hasData()) { + auto AggrEntryRes = parseAggregatedLBREntry(); + if (std::error_code EC = AggrEntryRes.getError()) + return EC; + + auto &AggrEntry = AggrEntryRes.get(); + + ++NumAggrEntries; + switch (AggrEntry.EntryType) { + case AggregatedLBREntry::BRANCH: + 
doBranch(AggrEntry.From.Offset, AggrEntry.To.Offset, AggrEntry.Count, + AggrEntry.Mispreds); + break; + case AggregatedLBREntry::FT: + case AggregatedLBREntry::FT_EXTERNAL_ORIGIN: { + LBREntry First{AggrEntry.EntryType == AggregatedLBREntry::FT + ? AggrEntry.From.Offset + : 0, + AggrEntry.From.Offset, false}; + LBREntry Second{AggrEntry.To.Offset, AggrEntry.To.Offset, false}; + doTrace(First, Second, AggrEntry.Count); + ++NumTraces; + break; + } + } + } + outs() << "PERF2BOLT: Read " << NumAggrEntries << " aggregated LBR entries\n"; + outs() << "PERF2BOLT: Traces mismatching disassembled function contents: " + << NumInvalidTraces; + float Perc{0.0f}; + if (NumTraces > 0) { + outs() << " ("; + Perc = NumInvalidTraces * 100.0f / NumTraces; + if (outs().has_colors()) { + if (Perc > 10.0f) { + outs().changeColor(raw_ostream::RED); + } else if (Perc > 5.0f) { + outs().changeColor(raw_ostream::YELLOW); + } else { + outs().changeColor(raw_ostream::GREEN); + } + } + outs() << format("%.1f%%", Perc); + if (outs().has_colors()) + outs().resetColor(); + outs() << ")"; + } + outs() << "\n"; + if (Perc > 10.0f) { + outs() << "\n !! WARNING !! This high mismatch ratio indicates the input " + "binary is probably not the same binary used during profiling " + "collection. The generated data may be ineffective for improving " + "performance.\n\n"; + } + + outs() << "PERF2BOLT: Out of range traces involving unknown regions: " + << NumLongRangeTraces; + if (NumTraces > 0) { + outs() << format(" (%.1f%%)", NumLongRangeTraces * 100.0f / NumTraces); + } + outs() << "\n"; + + dump(); + + return std::error_code(); +} + ErrorOr> DataAggregator::parseTaskPID() { while (checkAndConsumeFS()) {} diff --git a/bolt/src/DataAggregator.h b/bolt/src/DataAggregator.h index d56663177c5a..886117e729ce 100644 --- a/bolt/src/DataAggregator.h +++ b/bolt/src/DataAggregator.h @@ -42,6 +42,16 @@ struct PerfMemSample { uint64_t Addr; }; +/// Used for parsing specific pre-aggregated input files. 
+struct AggregatedLBREntry { + enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN }; + Location From; + Location To; + uint64_t Count; + uint64_t Mispreds; + Type EntryType; +}; + /// DataAggregator inherits all parsing logic from DataReader as well as /// its data structures used to represent aggregated profile data in memory. /// @@ -132,18 +142,21 @@ class DataAggregator : public DataReader { bool doSample(BinaryFunction &Func, const uint64_t Address); /// Register an intraprocedural branch \p Branch. - bool doIntraBranch(BinaryFunction &Func, const LBREntry &Branch); + bool doIntraBranch(BinaryFunction &Func, uint64_t From, uint64_t To, + uint64_t Count, uint64_t Mispreds); /// Register an interprocedural branch from \p FromFunc to \p ToFunc with /// offsets \p From and \p To, respectively. bool doInterBranch(BinaryFunction *FromFunc, BinaryFunction *ToFunc, - const LBREntry &Branch); + uint64_t From, uint64_t To, uint64_t Count, + uint64_t Mispreds); /// Register a \p Branch. - bool doBranch(const LBREntry &Branch); + bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds); /// Register a trace between two LBR entries supplied in execution order. - bool doTrace(const LBREntry &First, const LBREntry &Second); + bool doTrace(const LBREntry &First, const LBREntry &Second, + uint64_t Count = 1); /// Parser helpers /// Return false if we exhausted our parser buffer and finished parsing @@ -162,6 +175,13 @@ class DataAggregator : public DataReader { /// address. ErrorOr parseMemSample(); + /// Parse pre-aggregated LBR samples created by an external tool + ErrorOr parseAggregatedLBREntry(); + + /// Parse either buildid:offset or just offset, representing a location in the + /// binary. Used exclusevely for pre-aggregated LBR samples. 
+ ErrorOr parseLocationOrOffset(); + /// Check if a field separator is the next char to parse and, if yes, consume /// it and return true bool checkAndConsumeFS(); @@ -181,6 +201,10 @@ class DataAggregator : public DataReader { /// Parse the full output generated by perf script to report memory events. std::error_code parseMemEvents(); + /// Parse the full output of pre-aggregated LBR samples generated by + /// an external tool. + std::error_code parseAggregatedLBRSamples(); + /// Parse a single line of a PERF_RECORD_COMM event looking for an association /// between the binary name and its PID. On success return a /// pair. @@ -197,6 +221,50 @@ class DataAggregator : public DataReader { /// and return a file name matching a given \p FileBuildID. Optional getFileNameForBuildID(StringRef FileBuildID); + /// Coordinate reading and parsing of pre-aggregated file + /// + /// The regular perf2bolt aggregation job is to read perf output directly. + /// However, if the data is coming from a database instead of perf, one could + /// write a query to produce a pre-aggregated file. This function deals with + /// this case. + /// + /// The pre-aggregated file contains aggregated LBR data, but without binary + /// knowledge. BOLT will parse it and, using information from the disassembled + /// binary, augment it with fall-through edge frequency information. After this + /// step is finished, this data can be either written to disk to be consumed by + /// BOLT later, or can be used by BOLT immediately if kept in memory. + /// + /// File format syntax: + /// {B|F|f} [:] [:] + /// [] + /// + /// B - indicates an aggregated branch + /// F - an aggregated fall-through + /// f - an aggregated fall-through with external origin - used to disambiguate + /// between a return hitting a basic block head and a regular internal + /// jump to the block + /// + /// - build id of the object containing the start address. We can skip it + /// for the main binary and use "X" for an unknown object. 
This will save some space + /// and facilitate human parsing. + /// + /// - hex offset from the object base load address (0 for the main + /// executable unless it's PIE) to the start address. + /// + /// , - same for the end address. + /// + /// - total aggregated count of the branch or a fall-through. + /// + /// - the number of times the branch was mispredicted. Omitted for + /// fall-throughs. + /// + /// Example: + /// F 41be50 41be50 3 + /// F 41be90 41be90 4 + /// B 4b1942 39b57f0 3 0 + /// B 4b196f 4b19e0 2 0 + bool processPreAggregated(); + public: DataAggregator(raw_ostream &Diag, StringRef BinaryName) : DataReader(Diag), BinaryName(llvm::sys::path::filename(BinaryName)) {} diff --git a/bolt/src/DataReader.cpp b/bolt/src/DataReader.cpp index 62e7be78b8a9..335119e30c3e 100644 --- a/bolt/src/DataReader.cpp +++ b/bolt/src/DataReader.cpp @@ -130,46 +130,44 @@ void FuncSampleData::bumpCount(uint64_t Offset) { } void FuncBranchData::bumpBranchCount(uint64_t OffsetFrom, uint64_t OffsetTo, - bool Mispred) { + uint64_t Count, uint64_t Mispreds) { auto Iter = IntraIndex[OffsetFrom].find(OffsetTo); if (Iter == IntraIndex[OffsetFrom].end()) { Data.emplace_back(Location(true, Name, OffsetFrom), - Location(true, Name, OffsetTo), Mispred, 1); + Location(true, Name, OffsetTo), Mispreds, Count); IntraIndex[OffsetFrom][OffsetTo] = Data.size() - 1; return; } auto &BI = Data[Iter->second]; - ++BI.Branches; - if (Mispred) - ++BI.Mispreds; + BI.Branches += Count; + BI.Mispreds += Mispreds; } void FuncBranchData::bumpCallCount(uint64_t OffsetFrom, const Location &To, - bool Mispred) { + uint64_t Count, uint64_t Mispreds) { auto Iter = InterIndex[OffsetFrom].find(To); if (Iter == InterIndex[OffsetFrom].end()) { - Data.emplace_back(Location(true, Name, OffsetFrom), To, Mispred, 1); + Data.emplace_back(Location(true, Name, OffsetFrom), To, Mispreds, Count); InterIndex[OffsetFrom][To] = Data.size() - 1; return; } auto &BI = Data[Iter->second]; - ++BI.Branches; - if (Mispred) - 
++BI.Mispreds; + BI.Branches += Count; + BI.Mispreds += Mispreds; } void FuncBranchData::bumpEntryCount(const Location &From, uint64_t OffsetTo, - bool Mispred) { + uint64_t Count, uint64_t Mispreds) { auto Iter = EntryIndex[OffsetTo].find(From); if (Iter == EntryIndex[OffsetTo].end()) { - EntryData.emplace_back(From, Location(true, Name, OffsetTo), Mispred, 1); + EntryData.emplace_back(From, Location(true, Name, OffsetTo), Mispreds, + Count); EntryIndex[OffsetTo][From] = EntryData.size() - 1; return; } auto &BI = EntryData[Iter->second]; - ++BI.Branches; - if (Mispred) - ++BI.Mispreds; + BI.Branches += Count; + BI.Mispreds += Mispreds; } void BranchInfo::mergeWith(const BranchInfo &BI) { @@ -306,9 +304,9 @@ ErrorOr DataReader::parseString(char EndChar, bool EndNl) { // If EndNl was set and nl was found instead of EndChar, do not consume the // new line. - bool EndNlInstreadOfEndChar = + bool EndNlInsteadOfEndChar = ParsingBuf[StringEnd] == '\n' && EndChar != '\n'; - unsigned End = EndNlInstreadOfEndChar ? StringEnd : StringEnd + 1; + unsigned End = EndNlInsteadOfEndChar ? 
StringEnd : StringEnd + 1; ParsingBuf = ParsingBuf.drop_front(End); if (EndChar == '\n') { diff --git a/bolt/src/DataReader.h b/bolt/src/DataReader.h index 57165c951e88..fe5c6a548ffd 100644 --- a/bolt/src/DataReader.h +++ b/bolt/src/DataReader.h @@ -168,9 +168,12 @@ struct FuncBranchData { DenseMap> InterIndex; DenseMap> EntryIndex; - void bumpBranchCount(uint64_t OffsetFrom, uint64_t OffsetTo, bool Mispred); - void bumpCallCount(uint64_t OffsetFrom, const Location &To, bool Mispred); - void bumpEntryCount(const Location &From, uint64_t OffsetTo, bool Mispred); + void bumpBranchCount(uint64_t OffsetFrom, uint64_t OffsetTo, uint64_t Count, + uint64_t Mispreds); + void bumpCallCount(uint64_t OffsetFrom, const Location &To, uint64_t Count, + uint64_t Mispreds); + void bumpEntryCount(const Location &From, uint64_t OffsetTo, uint64_t Count, + uint64_t Mispreds); }; /// MemInfo represents a single memory load from an address \p Addr at an \p diff --git a/bolt/test/X86/Inputs/blarge.yaml b/bolt/test/X86/Inputs/blarge.yaml new file mode 100644 index 000000000000..3e649c344ceb --- /dev/null +++ b/bolt/test/X86/Inputs/blarge.yaml @@ -0,0 +1,136 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 + Entry: 0x0000000000400CC0 +Sections: + - Name: .noplt + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x00000000004005E0 + AddressAlign: 0x0000000000000010 + Content: FF35BA122000FF25BC1220000F1F4000FF25BA1220006800000000E9E0FFFFFFFF25B21220006801000000E9D0FFFFFFFF25AA1220006802000000E9C0FFFFFFFF25A21220006803000000E9B0FFFFFFFF259A1220006804000000E9A0FFFFFFFF25921220006805000000E990FFFFFFFF258A1220006806000000E980FFFFFFFF25821220006807000000E970FFFFFFFF257A1220006808000000E960FFFFFF + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000400680 + AddressAlign: 0x0000000000000010 + Content: 
41574156BF4812400041554154555331DB4883EC78E866FFFFFF488D742450488D7C243C488D6C2450F20F101D9F0C0000F20F10159F0C0000F20F100D9F0C0000F20F10057F0C0000E83207000031C0BF00134000E836FFFFFF448B5C243C4585DB7E21F20F104500BF0B134000B80100000083C3014883C508E811FFFFFF395C243C7FDFBF0A00000031DB488D6C2450E8DAFEFFFF488D742450488D7C243CF20F101D280C0000F20F1015380C0000F20F100D380C0000F20F1005080C0000E8BB06000031C0BF00134000E8BFFEFFFF448B54243C4585D27E21F20F104500BF0B134000B80100000083C3014883C508E89AFEFFFF395C243C7FDFBF0A00000031DB488D6C2450E863FEFFFF488D742450488D7C243CF20F101DD90B0000F20F1015D90B0000F20F100DD90B0000F20F1005910B0000E84406000031C0BF00134000E848FEFFFF448B4C243C4585C97E21F20F104500BF0B134000B80100000083C3014883C508E823FEFFFF395C243C7FDFBF0A00000031DB488D6C2450E8ECFDFFFFF20F10153C0B0000488D742450488D7C243CF20F101D720B0000F20F100D720B0000660F28C2E8D105000031C0BF00134000E8D5FDFFFF448B44243C4585C07E21F20F104500BF0B134000B80100000083C3014883C508E8B0FDFFFF395C243C7FDFBF0A00000031DB488D6C2450E879FDFFFF488D742450488D7C243CF20F101D170B0000F20F1015A70A0000F20F100D0F0B0000F20F10050F0B0000E85A050000BF0013400031C0E85EFDFFFF8B7C243C85FF7E21F20F104500BF0B134000B80100000083C3014883C508E83BFDFFFF395C243C7FDFBF0A00000031DB488D6C2450E804FDFFFF488D742450488D7C243CF20F101DBA0A0000F20F1015BA0A0000F20F100DBA0A0000F20F1005BA0A0000E8E504000031C0BF00134000E8E9FCFFFF8B74243C85F67E21F20F104500BF0B134000B80100000083C3014883C508E8C6FCFFFF395C243C7FDFBF0A00000031DB488D6C2450E88FFCFFFF488D742450488D7C243CF20F101D650A0000F20F1015650A0000F20F100D650A0000F20F1005650A0000E87004000031C0BF00134000E874FCFFFF8B4C243C85C97E21F20F104500BF0B134000B80100000083C3014883C508E851FCFFFF395C243C7FDFBF0A00000031DB488D6C2450E81AFCFFFF488D742450488D7C243CF20F101D100A0000F20F1015100A0000F20F100D100A0000F20F1005100A0000E8FB03000031C0BF00134000E8FFFBFFFF8B54243C85D27E21F20F104500BF0B134000B80100000083C3014883C508E8DCFBFFFF395C243C7FDFBF0A00000041BF09000000E8A6FBFFFFF20F1035F6080000F20F103DDE080000F20F11742420F20F117C2428F20F
103DD208000041BE28000000F20F117C2418F20F107C242841BD11000000F20F117C24100F1F4000F20F103D9808000041BC09000000F20F117C24080F1F4000488D742450488D7C243C31DBF20F105C2408488D6C2450F20F10542410F20F104C2418F20F10442420E82A03000031C0BF00134000E82EFBFFFF8B44243C85C07E21F20F104500BF0B134000B80100000083C3014883C508E80BFBFFFF395C243C7FDFBF0A000000E8DBFAFFFFF20F106424084183EC01F20F5C25F1080000F20F116424080F8575FFFFFFF20F102DE50800004183ED01F20F586C2410F20F116C24100F853FFFFFFFF20F107424184183EE01F20F5C35C5080000F20F117424180F850BFFFFFFF20F103DD10700004183EF01F20F587C2420F20F117C24200F85D9FEFFFFBF7012400031DBE867FAFFFF488D7424404889DFE8CA0500008B54244089DE31C0BF0F1340004883C302E854FAFFFF4881FBA086010075D4BF0A000000BB6901ED3FE81CFAFFFF488D7424404889DFE88F0500008B5424404889DE31C0BF201340004883C301E818FAFFFF4881FB6941ED3F75D3BF98124000E8F5F9FFFF660FEFD2F20F100D19080000660F28C2BFC0124000B802000000F20F11542408F20F59CAF20F5E0D01080000E8D4F9FFFFF20F10542408F20F1035FE070000F20F5815EE070000660F2EF273B7BF2F134000E89EF9FFFF660FEFD2F20F100DCA070000660F28C2BFE0124000B802000000F20F11542408F20F59CAF20F5E0DA2070000E87DF9FFFFF20F10542408F20F103DB7070000F20F5815A7070000660F2EFA73B74883C47831C05B5D415C415D415E415FC331ED4989D15E4889E24883E4F0505449C7C03012400048C7C1C011400048C7C780064000E867F9FFFFF4660F1F440000B80F19600055482D081960004883F80E4889E5761BB8000000004885C074115DBF08196000FFE0660F1F8400000000005DC366666666662E0F1F840000000000BE08196000554881EE0819600048C1FE034889E54889F048C1E83F4801C648D1FE7415B8000000004885C0740B5DBF08196000FFE00F1F005DC3660F1F440000803D910B2000007511554889E5E86EFFFFFF5DC6057E0B200001F3C30F1F4000BFA816600048833F007505EB930F1F00B8000000004885C074F1554889E5FFD05DE97AFFFFFF662E0F1F840000000000F20F590570060000F20F5E0560060000C36666666666662E0F1F840000000000F20F590548060000F20F5E0548060000C3662E0F1F8400000000000F1F440000F20F5EC8534889F34883EC50F20F5ED0F20F110C24DD0424F20F5ED8F20F111424DD0424D9C1D8CA660FEFC0F20F111C24D90549060000D8CADEE9D90543060000DCF9D9C3D8C4D8CCD8CCD9C9D8CCDE
CBDEE2D9052F060000DC0C24DEC2D9C9D83526060000D9C0D9C2D8CBD8CBD9CAD8C8D8E2DD5C2448F20F104C2448660F2EC10F83A8000000DDD9F20F51D1660F2ED2C707010000000F8A09020000D9C9DB7C2430D9C9F20F1005E2050000DB7C2420DD542448F20F104C2448DB7C2410660F54C1F20F100DAC050000F20F58C2E84BF7FFFFF20F110424F20F100D66040000DD0424DB6C2430D8F1DEC1DD5C2448D9EEF20F10442448DB6C2410D9C9DFE9DDD8DB6C24207708F20F100D1F040000D90569050000F20F59C8DEF9F20F110C24DC2C24DD1B4883C4505BC30F1F00D9C9DD5C2448C70703000000F20F104C2448F20F51C1660F2EC00F8AF0010000D9C9DB7C2420D9C9F20F110424DB7C2410DD0424DEF9DD5C2448F20F10442448E8CBF6FFFFDB6C2420F20F110424DD5C2448F20F10642448DB6C2410F20F51CC660F2EC9F20F11642440660F28D10F8A73010000F20F5E0504040000F20F114C2430F20F11542420DB7C2410E86FF6FFFFF20F10542420F20F104C2430D905AD040000F20F591585040000660F2EC9DB6C2410F20F59D0DEF1F20F11542410660F28D1D9C0DC6C2410DD1B0F8AEE000000F20F100424DB7C2420F20F580556040000F20F114C2430F20F11542410F20F5E058A030000E805F6FFFFDB6C2420F20F10542410F20F104C2430F20F59151D040000660F2EC9D9C0F20F59D0F20F11542410DC6C2410DD5B087A77F20F100424DB7C2410F20F580503040000F20F114C2420F20F5E0535030000E8B0F5FFFFF20F104C2420DB6C2410F20F590DCE030000F20F59C8F20F110C24DC2C24DD5B104883C4505BC3DB7C2420660F28C1DB7C2410DB3C24E8B5F5FFFF660F28D0DB6C2420DB6C2410DB2C24D9CAD9C9E9CDFDFFFFDB7C2410F20F10442440E88EF5FFFF660F28C8DB6C2410E96DFFFFFFDB7C2410F20F10442440F20F114C2420E86CF5FFFFF20F104C2420660F28D0DB6C2410E9EAFEFFFFDDD8660F28C4F20F114C2420E848F5FFFFF20F104C2420660F28D0F20F100424DB6C2410E964FEFFFFDB7C2420660F28C1DB7C2410DB3C24E81CF5FFFFDB6C2420DB6C2410DB2C24D9CAD9C9E9EAFDFFFF0F1F84000000000041B82000000031C031D2660F1F4400004889F94801C048C1E70281E1000000C048C1E91E488D1491488D4C00014839CA72074829CA4883C0014183E80175D1488906C3662E0F1F8400000000000F1F00415741564189FF415541544C8D25C604200055488D2DC6042000534989F64989D531DB4C29E54883EC0848C1FD03E8CDF3FFFF4885ED741E0F1F8400000000004C89EA4C89F64489FF41FF14DC4883C3014839EB75EA4883C4085B5D415C415D415E415FC366662E0F1F840000000000F3C3 + - Name: 
.rodata + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x0000000000401240 + AddressAlign: 0x0000000000000010 + Content: 01000200000000002A2A2A2A2A2A2A2A2A2043554249432046554E4354494F4E53202A2A2A2A2A2A2A2A2A2A2A0000002A2A2A2A2A2A2A2A2A20494E54454745522053515220524F4F5453202A2A2A2A2A2A2A2A2A2A2A002A2A2A2A2A2A2A2A2A20414E474C4520434F4E56455253494F4E202A2A2A2A2A2A2A2A2A2A2A000025332E30662064656772656573203D20252E3132662072616469616E730A0000252E3132662072616469616E73203D2025332E306620646567726565730A0000536F6C7574696F6E733A0020256600737172742825336429203D202532640A007371727428256C5829203D2025580A00000000000000F0BF00000000000014400000000000002440000000000000F03F0000000000003EC0000000000000404000000000000025C0000000000000314000000000000012C00000000000003FC000000000000036400000000000000CC000000000008041C06666666666662BC00000000000002840AE47E17A14AE284000000000000008409A999999999937C00000000000001840295C8FC2F5F850C000000000000020C000000000000041400000000000001E40D7A3703D0A572140000000000080464000000000000030403333333333331540333333333333FBBF00000000000028C077BE9F1A2FDDDC3F85EB51B81E85E33F000000000000D03F182D4454FB2109400000000000806640FCA9F1D24D62503F0000000000807640399D52A246DF413F9B0B6097FB21194000000000000000C0182D4454FB211940182D4454FB212940555555555555D53F00004040000010410000D84100005842FFFFFFFFFFFFFF7F0000000000000000 + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x00000000006016B0 + Link: .dynstr + AddressAlign: 0x0000000000000008 + Content: 
01000000000000000100000000000000010000000000000072000000000000000C00000000000000C0054000000000000D000000000000003412400000000000190000000000000098166000000000001B0000000000000008000000000000001A00000000000000A0166000000000001C000000000000000800000000000000040000000000000048024000000000000500000000000000C803400000000000060000000000000090024000000000000A00000000000000AE000000000000000B00000000000000180000000000000015000000000000000000000000000000030000000000000098186000000000000200000000000000D800000000000000140000000000000007000000000000001700000000000000E8044000000000000700000000000000D0044000000000000800000000000000180000000000000009000000000000001800000000000000FEFFFF6F000000009004400000000000FFFFFF6F000000000200000000000000F0FFFF6F000000007604400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + - Name: .data + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x00000000006018F8 + AddressAlign: 0x0000000000000008 + Content: '00000000000000000000000000000000' + - Name: .bss + Type: SHT_NOBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x0000000000601908 + AddressAlign: 0x0000000000000001 + Size: 0x0000000000000008 +Symbols: + Local: + - Name: completed.6650 + Type: STT_OBJECT + Section: .bss + Value: 0x0000000000601908 + Size: 0x0000000000000001 + - Name: frame_dummy + Type: STT_FUNC + Section: .text + Value: 0x0000000000400D90 + - Name: basicmath_large.c + Type: STT_FILE + - Name: rad2deg.c + Type: STT_FILE + - Name: cubic.c + Type: STT_FILE + - Name: isqrt.c + Type: STT_FILE + - Name: elf-init.c + Type: STT_FILE + - Name: crtstuff.c + Type: STT_FILE + - Name: _DYNAMIC + Type: STT_OBJECT + Section: .dynamic + Value: 0x00000000006016B0 + Global: + - Name: usqrt + Type: STT_FUNC + Section: .text + Value: 0x0000000000401170 + Size: 0x0000000000000043 + - Name: deg2rad + Type: STT_FUNC + Section: 
.text + Value: 0x0000000000400DE0 + Size: 0x0000000000000011 + - Name: SolveCubic + Type: STT_FUNC + Section: .text + Value: 0x0000000000400E00 + Size: 0x0000000000000368 + - Name: _start + Type: STT_FUNC + Section: .text + Value: 0x0000000000400CC0 + Size: 0x000000000000002A + - Name: rad2deg + Type: STT_FUNC + Section: .text + Value: 0x0000000000400DC0 + Size: 0x0000000000000011 + - Name: main + Type: STT_FUNC + Section: .text + Value: 0x0000000000400680 + Size: 0x0000000000000640 +DynamicSymbols: + Global: + - Name: sqrt + Type: STT_FUNC +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_X, PF_R ] + VAddr: 0x0000000000400000 + PAddr: 0x0000000000400000 + Sections: + - Section: .noplt + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + VAddr: 0x0000000000400000 + PAddr: 0x0000000000400000 + Sections: + - Section: .noplt + - Section: .text + - Section: .rodata + - Type: PT_LOAD + Flags: [ PF_R, PF_W ] + VAddr: 0x0000000000601698 + PAddr: 0x0000000000601698 + Sections: + - Section: .data + - Section: .bss + - Type: PT_DYNAMIC + Flags: [ PF_R, PF_W ] + VAddr: 0x00000000006016B0 + PAddr: 0x00000000006016B0 + Sections: + - Section: .dynamic +... diff --git a/bolt/test/X86/Inputs/pre-aggregated.txt b/bolt/test/X86/Inputs/pre-aggregated.txt new file mode 100644 index 000000000000..788ceb45607b --- /dev/null +++ b/bolt/test/X86/Inputs/pre-aggregated.txt @@ -0,0 +1,8 @@ +B X:7f36d18d60c0 400bbc 2 0 +B 400ad1 400e00 2 0 +B 400b10 4005f0 1 0 +B 400bb7 400610 1 0 +B 4005f0 X:7f36d18f2ce0 1 0 +B 4011a0 4011a9 33 4 +B 4011ad 401180 58 0 +F 401170 4011b2 22 diff --git a/bolt/test/X86/pre-aggregated-perf.test b/bolt/test/X86/pre-aggregated-perf.test new file mode 100644 index 000000000000..92fa586ceded --- /dev/null +++ b/bolt/test/X86/pre-aggregated-perf.test @@ -0,0 +1,42 @@ +# This script checks that perf2bolt is reading pre-aggregated perf information +# correctly for a simple example. 
The perf.data of this example was generated +# with the following command: +# +# $ perf record -j any,u -e branch -o perf.data -- ./blarge +# +# blarge is the binary for "basicmath large inputs" taken from Mibench. + +RUN: yaml2obj %p/Inputs/blarge.yaml &> %t.exe +RUN: perf2bolt %t.exe -o %t -pa -p %p/Inputs/pre-aggregated.txt -w %t.new +RUN: cat %t | sort | FileCheck %s -check-prefix=PERF2BOLT +RUN: cat %t.new | FileCheck %s -check-prefix=NEWFORMAT + +PERF2BOLT: 0 [unknown] 7f36d18d60c0 1 main 53c 0 2 +PERF2BOLT: 1 main 451 1 SolveCubic 0 0 2 +PERF2BOLT: 1 main 490 0 [unknown] 4005f0 0 1 +PERF2BOLT: 1 main 537 0 [unknown] 400610 0 1 +PERF2BOLT: 1 usqrt 30 1 usqrt 32 0 22 +PERF2BOLT: 1 usqrt 30 1 usqrt 39 4 33 +PERF2BOLT: 1 usqrt 35 1 usqrt 39 0 22 +PERF2BOLT: 1 usqrt 3d 1 usqrt 10 0 58 +PERF2BOLT: 1 usqrt 3d 1 usqrt 3f 0 22 +PERF2BOLT: 1 usqrt 8 1 usqrt 10 0 22 + +NEWFORMAT: - name: usqrt +NEWFORMAT: fid: 7 +NEWFORMAT: hash: 0x7EA5C50FA9564489 +NEWFORMAT: exec: 0 +NEWFORMAT: nblocks: 5 +NEWFORMAT: blocks: +NEWFORMAT: - bid: 0 +NEWFORMAT: insns: 3 +NEWFORMAT: succ: [ { bid: 1, cnt: 22 } ] +NEWFORMAT: - bid: 1 +NEWFORMAT: insns: 9 +NEWFORMAT: succ: [ { bid: 3, cnt: 33, mis: 4 }, { bid: 2, cnt: 22 } ] +NEWFORMAT: - bid: 2 +NEWFORMAT: insns: 2 +NEWFORMAT: succ: [ { bid: 3, cnt: 22 } ] +NEWFORMAT: - bid: 3 +NEWFORMAT: insns: 2 +NEWFORMAT: succ: [ { bid: 1, cnt: 58 }, { bid: 4, cnt: 22 } ] From 746372c31f009cd1a2eeb07d9b760cfd9a77ac01 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Thu, 17 May 2018 18:27:13 -0700 Subject: [PATCH 459/904] [BOLT] further speeding up cache+ Summary: For large binaries, cache+ algorithm adds a noticeable overhead in comparison with cache. This modification restricts search space of the optimization, which makes cache+ as fast as cache for all tested binaries. There is a tiny (in the order of 0.01%) regression in cache-related metrics, but this is not noticeable in practice. 
(cherry picked from commit a90e961e1a0298541daa95386f53512422c4e5ab) --- bolt/src/CacheMetrics.cpp | 1 - bolt/src/Passes/CachePlusReorderAlgorithm.cpp | 90 +++++++++++-------- bolt/src/RewriteInstance.cpp | 5 +- 3 files changed, 58 insertions(+), 38 deletions(-) diff --git a/bolt/src/CacheMetrics.cpp b/bolt/src/CacheMetrics.cpp index fd6fe8cb9587..f5a80e3abc62 100644 --- a/bolt/src/CacheMetrics.cpp +++ b/bolt/src/CacheMetrics.cpp @@ -88,7 +88,6 @@ void extractBasicBlockInfo( BBAddr[BB] = BB->getInputAddressRange().first + BF->getAddress(); BBSize[BB] = BB->getOriginalSize(); } - assert(BBAddr[BB] > 0 && "incorrect output block address"); } } } diff --git a/bolt/src/Passes/CachePlusReorderAlgorithm.cpp b/bolt/src/Passes/CachePlusReorderAlgorithm.cpp index 1e99792bfe53..52db989c9105 100644 --- a/bolt/src/Passes/CachePlusReorderAlgorithm.cpp +++ b/bolt/src/Passes/CachePlusReorderAlgorithm.cpp @@ -26,8 +26,8 @@ extern cl::OptionCategory BoltOptCategory; cl::opt ClusterSplitThreshold("cluster-split-threshold", - cl::desc("The maximum size of a function to apply splitting of clusters"), - cl::init(2048), + cl::desc("The maximum size of a cluster to apply splitting"), + cl::init(128), cl::ZeroOrMore, cl::cat(BoltOptCategory)); @@ -213,9 +213,8 @@ bool compareClusterPairs(const Cluster *A1, const Cluster *B1, /// while keeping the implementation sufficiently fast. class CachePlus { public: - CachePlus(const BinaryFunction &BF, bool UseClusterSplitting) + CachePlus(const BinaryFunction &BF) : BF(BF), - UseClusterSplitting(UseClusterSplitting), Adjacent(BF.layout_size()), Cache(BF.layout_size()) { initialize(); @@ -489,6 +488,40 @@ class CachePlus { return Score; } + /// Verify if it is valid to merge two clusters into the new one + bool isValidMerge(const Cluster *ClusterPred, + const Cluster *ClusterSucc, + size_t MergeType, + const MergedCluster& MergedBlocks) const { + // Does the new cluster preserve the original entry point? 
+ if ((ClusterPred->isEntryPoint() || ClusterSucc->isEntryPoint()) && + MergedBlocks.getFirstBlock()->getLayoutIndex() != 0) + return false; + + // This corresponds to a concatentation of clusters w/o splitting, which is + // always safe + if (MergeType == 0) + return true; + + size_t Offset = MergeType / 5; + // The basic blocks on the boundary of a split of ClusterPred + auto BB1 = ClusterPred->blocks()[Offset - 1]; + auto BB2 = ClusterPred->blocks()[Offset]; + // Does the splitting break FT successors? + if (FallthroughSucc[BB1->getLayoutIndex()] != nullptr) { + assert(FallthroughSucc[BB1->getLayoutIndex()] == BB2 && + "Fallthrough successor is not preserved"); + return false; + } + + // Do not split large clusters to reduce computation time + if (ClusterPred->blocks().size() > opts::ClusterSplitThreshold) { + return false; + } + + return true; + } + /// The gain of merging two clusters. /// /// The function considers all possible ways of merging two clusters and @@ -512,9 +545,8 @@ class CachePlus { auto MergedBlocks = mergeBlocks(ClusterPred->blocks(), ClusterSucc->blocks(), MergeType); - // Does the new cluster preserve the original entry point? 
- if ((ClusterPred->isEntryPoint() || ClusterSucc->isEntryPoint()) && - MergedBlocks.getFirstBlock()->getLayoutIndex() != 0) + + if (!isValidMerge(ClusterPred, ClusterSucc, MergeType, MergedBlocks)) return CurGain; // The score of the new cluster @@ -528,20 +560,12 @@ class CachePlus { std::pair Gain = std::make_pair(-1, 0); // Try to concatenate two clusters w/o splitting Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, 0); - if (UseClusterSplitting) { - // Try to split ClusterPred into two and merge with ClusterSucc - for (size_t Offset = 1; Offset < ClusterPred->blocks().size(); Offset++) { - // Make sure the splitting does not break FT successors - auto BB = ClusterPred->blocks()[Offset - 1]; - if (FallthroughSucc[BB->getLayoutIndex()] != nullptr) { - assert(FallthroughSucc[BB->getLayoutIndex()] == ClusterPred->blocks()[Offset]); - continue; - } - - for (size_t Type = 0; Type < 4; Type++) { - size_t MergeType = 1 + Type + Offset * 4; - Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, MergeType); - } + // Try to split ClusterPred into two sub-clusters in various ways and then + // merge it with ClusterSucc + for (size_t Offset = 1; Offset < ClusterPred->blocks().size(); Offset++) { + for (size_t Type = 1; Type <= 4; Type++) { + size_t MergeType = Type + Offset * 5; + Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, MergeType); } } @@ -549,11 +573,11 @@ class CachePlus { return Gain; } - /// Merge two clusters (orders) of blocks according to a given 'merge type'. + /// Merge two clusters of blocks respecting a given merge 'type' and 'offset'. /// /// If MergeType == 0, then the result is a concatentation of two clusters. - /// Otherwise, the first cluster is cut into two and we consider all possible - /// ways of concatenating three clusters. + /// Otherwise, the first cluster is cut into two sub-clusters at the offset, + /// and merged using all possible ways of concatenating three clusters. 
MergedCluster mergeBlocks(const std::vector &X, const std::vector &Y, size_t MergeType) const { @@ -563,9 +587,8 @@ class CachePlus { return MergedCluster(X.begin(), X.end(), Y.begin(), Y.end(), Empty, Empty); } - MergeType--; - size_t Type = MergeType % 4; - size_t Offset = MergeType / 4; + size_t Type = MergeType % 5; + size_t Offset = MergeType / 5; assert(0 < Offset && Offset < X.size() && "Invalid offset while merging clusters"); // Split the first cluster, X, into X1 and X2 @@ -578,10 +601,10 @@ class CachePlus { // Construct a new cluster from three existing ones switch(Type) { - case 0: return MergedCluster(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2); - case 1: return MergedCluster(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1); - case 2: return MergedCluster(BeginX2, EndX2, BeginY, EndY, BeginX1, EndX1); - case 3: return MergedCluster(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY); + case 1: return MergedCluster(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2); + case 2: return MergedCluster(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1); + case 3: return MergedCluster(BeginX2, EndX2, BeginY, EndY, BeginX1, EndX1); + case 4: return MergedCluster(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY); default: llvm_unreachable("unexpected merge type"); } @@ -616,9 +639,6 @@ class CachePlus { // The binary function const BinaryFunction &BF; - // Indicates whether to use cluster splitting for optimization - bool UseClusterSplitting; - // All clusters std::vector AllClusters; @@ -673,7 +693,7 @@ void CachePlusReorderAlgorithm::reorderBasicBlocks( } // Apply the algorithm - Order = CachePlus(BF, NumHotBlocks <= opts::ClusterSplitThreshold).run(); + Order = CachePlus(BF).run(); // Verify correctness assert(Order[0]->isEntryPoint() && "Original entry point is not preserved"); diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 181781d4a3da..d0d0014b91d3 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -1024,8 +1024,9 
@@ void RewriteInstance::run() { if (opts::UpdateDebugSections && opts::FixDebugInfoLargeFunctions && checkLargeFunctions()) { ++PassNumber; - outs() << "BOLT: starting pass (ignoring large functions) " - << PassNumber << "...\n"; + outs() << format("BOLT: starting pass %zu (ignoring %zu large functions) ", + PassNumber, LargeFunctions.size()) + << "...\n"; reset(); executeRewritePass(LargeFunctions); } From 905c4897a955360675e67bcc29cdb43c3e953c45 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 24 Jul 2018 14:30:16 -0700 Subject: [PATCH 460/904] [BOLT] Add R_X86_64_PC64 relocation support (cherry picked from commit da6f137ed7482f098ff79b896d2c423d9f533fdf) --- bolt/src/Relocation.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/bolt/src/Relocation.cpp b/bolt/src/Relocation.cpp index 80c8f42ce0fa..84767c9e476a 100644 --- a/bolt/src/Relocation.cpp +++ b/bolt/src/Relocation.cpp @@ -257,6 +257,7 @@ bool Relocation::isPCRelative(uint64_t Type) { case ELF::R_X86_64_PC8: case ELF::R_X86_64_PC32: + case ELF::R_X86_64_PC64: case ELF::R_X86_64_GOTPCREL: case ELF::R_X86_64_PLT32: case ELF::R_X86_64_GOTTPOFF: From 3db480cd20a0edb64c14de83bcedfd1f5d2790c1 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 12 Jul 2018 10:13:03 -0700 Subject: [PATCH 461/904] [BOLT][NFC] Minor code refactoring (cherry picked from commit 6ea90c7841d7681bbc7574041c3f115aedae0557) --- bolt/src/BinaryFunction.cpp | 2 +- bolt/src/Passes/ValidateInternalCalls.cpp | 4 +- bolt/src/RewriteInstance.cpp | 50 ++++++++++++++--------- 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index d974e7435569..76a7b5f62ee3 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -644,7 +644,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, void BinaryFunction::printRelocations(raw_ostream &OS, uint64_t Offset, uint64_t Size) const { - const char* Sep = " # Relocs: "; + const char 
*Sep = " # Relocs: "; auto RI = Relocations.lower_bound(Offset); while (RI != Relocations.end() && RI->first < Offset + Size) { diff --git a/bolt/src/Passes/ValidateInternalCalls.cpp b/bolt/src/Passes/ValidateInternalCalls.cpp index 8f8e3c08bc9c..28f9d3075a7c 100644 --- a/bolt/src/Passes/ValidateInternalCalls.cpp +++ b/bolt/src/Passes/ValidateInternalCalls.cpp @@ -325,7 +325,7 @@ void ValidateInternalCalls::runOnFunctions( // case, we mark this function as non-simple and stop processing it. std::set Invalid; for (auto *Function : NeedsValidation) { - DEBUG(dbgs() << "Validating " << Function << "\n"); + DEBUG(dbgs() << "Validating " << *Function << "\n"); if (!analyzeFunction(*Function)) { Invalid.insert(Function); } @@ -336,7 +336,7 @@ void ValidateInternalCalls::runOnFunctions( errs() << "BOLT-ERROR: Unsupported internal calls detected in the " "following functions:\n"; for (auto *Function : Invalid) { - errs() << " " << Function << "\n"; + errs() << " " << *Function << "\n"; } errs() << "BOLT-ERROR: Unable to proceed in relocation mode\n"; exit(1); diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index d0d0014b91d3..9105c83f9d0d 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -2030,8 +2030,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { auto Section = BC->getSectionForAddress(SymbolAddress); dbgs() << "Relocation: offset = 0x" << Twine::utohexstr(Rel.getOffset()) - << "; type = " << Rel.getType() - << "; type name = " << TypeName + << "; type = " << TypeName << "; value = 0x" << Twine::utohexstr(ExtractedValue) << "; symbol = " << SymbolName << " (" << (Section ? 
Section->getName() : "") << ")" @@ -2071,13 +2070,6 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { } const auto Address = SymbolAddress + Addend; - const bool ForceRelocation = - (opts::HotText && (SymbolName == "__hot_start" || - SymbolName == "__hot_end")) || - (opts::HotData && (SymbolName == "__hot_data_start" || - SymbolName == "__hot_data_end")) || - SymbolName == "_end" || - Rel.getType() == ELF::R_AARCH64_ADR_GOT_PAGE; DEBUG( dbgs() << "BOLT-DEBUG: "; @@ -2116,6 +2108,24 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { continue; } + auto ForceRelocation = [&](StringRef SymbolName) { + if (opts::HotText && (SymbolName == "__hot_start" || + SymbolName == "__hot_end")) + return true; + + if (opts::HotData && (SymbolName == "__hot_data_start" || + SymbolName == "__hot_data_end")) + return true; + + if (SymbolName == "_end") + return true; + + return false; + }(SymbolName); + + if (BC->isAArch64() && Rel.getType() == ELF::R_AARCH64_ADR_GOT_PAGE) + ForceRelocation = true; + // TODO: RefSection should be the same as **Rel.getSymbol().getSection() auto RefSection = BC->getSectionForAddress(SymbolAddress); if (!RefSection && !ForceRelocation) { @@ -2163,9 +2173,8 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { ReferencedSymbol = BC->registerNameAtAddress(Name, 0, 0, 0); SymbolAddress = 0; Addend = Address; - DEBUG(dbgs() << "BOLT-DEBUG: creating relocations for huge pages against" - " symbol " << SymbolName << " with addend " << Addend - << '\n'); + DEBUG(dbgs() << "BOLT-DEBUG: forcing relocation against symbol " + << SymbolName << " with addend " << Addend << '\n'); } else if (ReferencedBF) { ReferencedSymbol = ReferencedBF->getSymbol(); @@ -2357,10 +2366,15 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { NumDataRelocations < opts::MaxDataRelocations); }; + if (IsFromCode && IsAArch64) + ForceRelocation = true; + + if (refersToReorderedSection(RefSection) || + 
(opts::ForceToDataRelocations && checkMaxDataRelocations())) + ForceRelocation = true; + if (IsFromCode) { - if (ReferencedBF || ForceRelocation || IsAArch64 || - refersToReorderedSection(RefSection) || - (opts::ForceToDataRelocations && checkMaxDataRelocations())) { + if (ReferencedBF || ForceRelocation) { ContainingBF->addRelocation(Rel.getOffset(), ReferencedSymbol, Rel.getType(), @@ -2370,11 +2384,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { DEBUG(dbgs() << "BOLT-DEBUG: ignoring relocation from code to data " << ReferencedSymbol->getName() << "\n"); } - } else if (IsToCode) { - BC->addRelocation(Rel.getOffset(), ReferencedSymbol, Rel.getType(), - Addend); - } else if (refersToReorderedSection(RefSection) || - (opts::ForceToDataRelocations && checkMaxDataRelocations())) { + } else if (IsToCode || ForceRelocation) { BC->addRelocation(Rel.getOffset(), ReferencedSymbol, Rel.getType(), From e28430de6aba7ac64f151c8514d5724682bac102 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 30 Jul 2018 16:30:18 -0700 Subject: [PATCH 462/904] [BOLT] Fix TBSS-related issue Summary: TLS segment provide a template for initializing thread-local storage for every new thread. It consists of initialized and uninitialized parts. The uninitialized part of TLS, .tbss, is completely meaningless from a binary analysis perspective. It doesn't take any space in the file, or in memory. Note that this is different from a regular .bss section that takes space in memory. We should not place .tbss into a list of allocatable sections, otherwise it may cause conflicts with objects contained in the next section. 
(cherry picked from commit d5aa6a798fd253390f7eeb3592f6e8f819a66bc5) --- bolt/src/BinaryContext.cpp | 4 ++-- bolt/src/BinaryData.cpp | 6 ++++-- bolt/src/BinarySection.h | 5 ++++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index 80c0dca0a4f9..a56ff707427c 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -158,7 +158,7 @@ void BinaryContext::updateObjectNesting(BinaryDataMapType::iterator GAI) { auto Itr = std::next(GAI); while (Itr != BinaryDataMap.end() && BD->containsRange(Itr->second->getAddress(), - Itr->second->getSize())) { + Itr->second->getSize())) { Itr->second->Parent = BD; ++Itr; } @@ -391,7 +391,7 @@ void BinaryContext::postProcessSymbolTable() { !BD->getSize() && !BD->isAbsolute() && BD->getSection()) { - errs() << "BOLT-WARNING: zero sized top level symbol: " << *BD << "\n"; + errs() << "BOLT-WARNING: zero-sized top level symbol: " << *BD << "\n"; Valid = false; } } diff --git a/bolt/src/BinaryData.cpp b/bolt/src/BinaryData.cpp index e8d806ec9b0f..3374a8c31543 100644 --- a/bolt/src/BinaryData.cpp +++ b/bolt/src/BinaryData.cpp @@ -113,8 +113,10 @@ void BinaryData::printBrief(raw_ostream &OS) const { OS << ")"; } - if (opts::Verbosity > 1 && Parent) { - OS << " (" << Parent->getName() << "/" << Parent->getSize() << ")"; + if (Parent) { + OS << " (parent: "; + Parent->printBrief(OS); + OS << ")"; } OS << ", 0x" << Twine::utohexstr(getAddress()) diff --git a/bolt/src/BinarySection.h b/bolt/src/BinarySection.h index 1aafbd838a7b..ffd0b5dc1371 100644 --- a/bolt/src/BinarySection.h +++ b/bolt/src/BinarySection.h @@ -246,6 +246,9 @@ class BinarySection { bool isTLS() const { return (ELFFlags & ELF::SHF_TLS); } + bool isTBSS() const { + return isBSS() && isTLS(); + } bool isNote() const { return ELFType == ELF::SHT_NOTE; } bool isStrTab() const { return ELFType == ELF::SHT_STRTAB; } bool isSymTab() const { return ELFType == ELF::SHT_SYMTAB; } @@ -257,7 +260,7 
@@ class BinarySection { ELFType == ELF::SHT_PROGBITS); } bool isAllocatable() const { - return (ELFFlags & ELF::SHF_ALLOC); + return (ELFFlags & ELF::SHF_ALLOC) && !isTBSS(); } bool isLocal() const { return IsLocal; } bool isReordered() const { return IsReordered; } From 1996d1f08116d9b20e24d72b8f1a549deae1e0df Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 30 Jul 2018 16:30:18 -0700 Subject: [PATCH 463/904] [BOLT] Fix range checks Summary: containsRange() functions were incorrectly checking for an empty range at the end of containing object. I.e. [a,b) was reporting true for containing [b,b). (cherry picked from commit 43808ecb0262129ef19da9a64e6ae270bb442288) --- bolt/src/BinaryData.h | 2 +- bolt/src/BinarySection.h | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/bolt/src/BinaryData.h b/bolt/src/BinaryData.h index d709ef073cfe..baf8950b995a 100644 --- a/bolt/src/BinaryData.h +++ b/bolt/src/BinaryData.h @@ -166,7 +166,7 @@ class BinaryData { (getAddress() == Address && !getSize())); } bool containsRange(uint64_t Address, uint64_t Size) const { - return (getAddress() <= Address && Address + Size <= getEndAddress()); + return containsAddress(Address) && Address + Size <= getEndAddress(); } const BinaryData *getParent() const { diff --git a/bolt/src/BinarySection.h b/bolt/src/BinarySection.h index ffd0b5dc1371..26e071e85f4e 100644 --- a/bolt/src/BinarySection.h +++ b/bolt/src/BinarySection.h @@ -277,15 +277,16 @@ class BinarySection { bool hasSectionRef() const { return Section != SectionRef(); } SectionRef getSectionRef() const { return Section; } - /// Does this section contain the given /p Addr? + /// Does this section contain the given \p Address? /// Note: this is in terms of the original mapped binary addresses. 
- bool containsAddress(uint64_t Addr) const { - return getAddress() <= Addr && Addr < getEndAddress(); + bool containsAddress(uint64_t Address) const { + return getAddress() <= Address && Address < getEndAddress(); } - /// Does this section contain the range given by /p Addr and /p Sz? + + /// Does this section contain the range [\p Address, \p Address + \p Size)? /// Note: this is in terms of the original mapped binary addresses. - bool containsRange(uint64_t Addr, uint64_t Sz) const { - return getAddress() <= Addr && Addr + Sz <= getEndAddress(); + bool containsRange(uint64_t Address, uint64_t Size) const { + return containsAddress(Address) && Address + Size <= getEndAddress(); } /// Iterate over all non-pending relocations for this section. From ced36ad56a5942d3fa94c1ef6e5b319515e10f82 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 30 Jul 2018 10:29:47 -0700 Subject: [PATCH 464/904] [BOLT] Add support for IFUNC Summary: Relocation value verification was failing for IFUNC as the real value used for relocation wasn't the symbol value, but a corresponding PLT entry. Relax the verification and skip any symbols of ST_Other type. 
(cherry picked from commit a07ca04677ffe5db2be73c1a3345010313cfe2ab) --- bolt/src/RewriteInstance.cpp | 77 ++++++++---------------------------- 1 file changed, 17 insertions(+), 60 deletions(-) diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 9105c83f9d0d..579f98245c2a 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -1859,7 +1859,7 @@ bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, auto SymbolIter = Rel.getSymbol(); assert(SymbolIter != InputFile->symbol_end() && "relocation symbol must exist"); - auto Symbol = *SymbolIter; + const auto &Symbol = *SymbolIter; SymbolName = cantFail(Symbol.getName()); SymbolAddress = cantFail(Symbol.getAddress()); Addend = getRelocationAddend(InputFile, Rel); @@ -1920,67 +1920,24 @@ bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, } } - if (!IsPCRelative && Addend != 0 && IsFromCode && !SymbolIsSection) { - // TODO: RefSection should be the same as **(Symbol.getSection()). 
- auto RefSection = BC->getSectionForAddress(SymbolAddress); - if (RefSection && RefSection->isText()) { - if (opts::Verbosity > 1) { - SmallString<16> TypeName; - Rel.getTypeName(TypeName); - errs() << "BOLT-WARNING: detected absolute reference from code into " - << "a middle of a function:\n" - << " offset = 0x" << Twine::utohexstr(Rel.getOffset()) - << "; type = " << Rel.getType() - << "; type name = " << TypeName - << "; value = 0x" << Twine::utohexstr(ExtractedValue) - << "; symbol = " << SymbolName - << "; symbol address = 0x" << Twine::utohexstr(SymbolAddress) - << "; symbol section = " << RefSection->getName() - << "; addend = 0x" << Twine::utohexstr(Addend) - << "; address = 0x" << Twine::utohexstr(SymbolAddress + Addend) - << '\n'; - } - assert(truncateToSize(ExtractedValue, RelSize) == - truncateToSize(SymbolAddress + Addend, RelSize) && - "value mismatch"); - } - } + auto verifyExtractedValue = [&]() { + if (IsAArch64) + return true; - DEBUG( - if (!Relocation::isTLS(Rel.getType()) && - SymbolName != "__hot_start" && - SymbolName != "__hot_end" && - truncateToSize(ExtractedValue, RelSize) != - truncateToSize(SymbolAddress + Addend - PCRelOffset, RelSize)) { - auto Section = cantFail(Symbol.getSection()); - SmallString<16> TypeName; - Rel.getTypeName(TypeName); - dbgs() << "BOLT-DEBUG: Mismatch between extracted value and relocation " - << "data:\n" - << "BOLT-DEBUG: offset = 0x" - << Twine::utohexstr(Rel.getOffset()) - << "; type = " << Rel.getType() - << "; type name = " << TypeName - << "; value = 0x" << Twine::utohexstr(ExtractedValue) - << "; symbol = " << SymbolName - << "; symbol type = " << cantFail(Symbol.getType()) - << "; symbol address = 0x" << Twine::utohexstr(SymbolAddress) - << "; orig symbol address = 0x" - << Twine::utohexstr(cantFail(Symbol.getAddress())) - << "; symbol section = " << getSectionName(*Section) - << "; addend = 0x" << Twine::utohexstr(Addend) - << "; original addend = 0x" - << 
Twine::utohexstr(getRelocationAddend(InputFile, Rel)) - << '\n'; - }); + if (SymbolName == "__hot_start" || SymbolName == "__hot_end") + return true; + + if (Relocation::isTLS(Rel.getType())) + return true; + + if (cantFail(Symbol.getType()) == SymbolRef::ST_Other) + return true; + + return truncateToSize(ExtractedValue, RelSize) == + truncateToSize(SymbolAddress + Addend - PCRelOffset, RelSize); + }; - assert((IsAArch64 || - Relocation::isTLS(Rel.getType()) || - SymbolName == "__hot_start" || - SymbolName == "__hot_end" || - truncateToSize(ExtractedValue, RelSize) == - truncateToSize(SymbolAddress + Addend - PCRelOffset, RelSize)) && - "extracted relocation value should match relocation components"); + assert(verifyExtractedValue() && "mismatched extracted relocation value"); return true; } From 77e24a936022acb339cebdc0fb03415e91e2e0b7 Mon Sep 17 00:00:00 2001 From: Laith Saed Sakka Date: Wed, 25 Jul 2018 19:07:41 -0700 Subject: [PATCH 465/904] Retpoline Insertion Pass Summary: retpoline insertion implemented for reloc mode, (cherry picked from commit 4e6251f6c88e9c45e68189d80149056b46f55299) --- bolt/src/BinaryContext.h | 2 +- bolt/src/BinaryPassManager.cpp | 11 + bolt/src/MCPlusBuilder.h | 48 +++- bolt/src/Passes/CMakeLists.txt | 1 + bolt/src/Passes/RetpolineInsertion.cpp | 327 +++++++++++++++++++++++ bolt/src/Passes/RetpolineInsertion.h | 81 ++++++ bolt/src/Target/X86/X86MCPlusBuilder.cpp | 100 ++++++- 7 files changed, 562 insertions(+), 8 deletions(-) create mode 100644 bolt/src/Passes/RetpolineInsertion.cpp create mode 100644 bolt/src/Passes/RetpolineInsertion.h diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index d26314f3c228..5817e15573ea 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -253,7 +253,7 @@ class BinaryContext { DataReader &DR; - /// Indicates if relocations are availabe for usage. + /// Indicates if relocations are available for usage. 
bool HasRelocations{false}; /// Sum of execution count of all functions diff --git a/bolt/src/BinaryPassManager.cpp b/bolt/src/BinaryPassManager.cpp index 113c27cc252e..c8ce0eb2c214 100644 --- a/bolt/src/BinaryPassManager.cpp +++ b/bolt/src/BinaryPassManager.cpp @@ -23,6 +23,7 @@ #include "Passes/ReorderFunctions.h" #include "Passes/ReorderData.h" #include "Passes/StokeInfo.h" +#include "Passes/RetpolineInsertion.h" #include "Passes/ValidateInternalCalls.h" #include "Passes/VeneerElimination.h" #include "llvm/Support/Timer.h" @@ -282,6 +283,13 @@ static llvm::cl::opt cl::ZeroOrMore, cl::cat(BoltOptCategory)); +static llvm::cl::opt + PrintRetpolineInsertion("print-retpoline-insertion", + cl::desc("print functions after retpoline insertion pass"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltCategory)); + } // namespace opts namespace llvm { @@ -468,6 +476,9 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(llvm::make_unique(PrintFOP)); + Manager.registerPass( + llvm::make_unique(PrintRetpolineInsertion)); + // Thighten branches according to offset differences between branch and // targets. No extra instructions after this pass, otherwise we may have // relocations out of range and crash during linking. diff --git a/bolt/src/MCPlusBuilder.h b/bolt/src/MCPlusBuilder.h index 43932d008dac..d42cb425172a 100644 --- a/bolt/src/MCPlusBuilder.h +++ b/bolt/src/MCPlusBuilder.h @@ -321,6 +321,51 @@ class MCPlusBuilder { return Analysis->isIndirectBranch(Inst); } + /// Returns true if the instruction is memory indirect call or jump + virtual bool isBranchOnMem(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + + /// Returns true if the instruction is register indirect call or jump + virtual bool isBranchOnReg(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + + /// Creates x86 pause instruction. 
+ virtual void createPause(MCInst &Inst) const { + llvm_unreachable("not implemented"); + } + + virtual void createLfence(MCInst &Inst) const { + llvm_unreachable("not implemented"); + } + + virtual void createPushRegister(MCInst &Inst, MCPhysReg Reg, + unsigned Size) const { + llvm_unreachable("not implemented"); + } + + virtual void createPopRegister(MCInst &Inst, MCPhysReg Reg, + unsigned Size) const { + llvm_unreachable("not implemented"); + } + + virtual bool createDirectCall(MCInst &Inst, const MCSymbol *Target, + MCContext *Ctx) { + llvm_unreachable("not implemented"); + return false; + } + + virtual MCPhysReg getX86R11() const { + llvm_unreachable("not implemented"); + } + + virtual MCPhysReg getX86NoRegister() const { + llvm_unreachable("not implemented"); + } + virtual bool isIndirectCall(const MCInst &Inst) const { llvm_unreachable("not implemented"); return false; @@ -1254,7 +1299,8 @@ class MCPlusBuilder { virtual bool createLoad(MCInst &Inst, const MCPhysReg &BaseReg, int Scale, const MCPhysReg &IndexReg, int Offset, - const MCPhysReg &DstReg, int Size) const { + const MCExpr *OffsetExpr, const MCPhysReg &DstReg, + int Size) const { llvm_unreachable("not implemented"); return false; } diff --git a/bolt/src/Passes/CMakeLists.txt b/bolt/src/Passes/CMakeLists.txt index c8b09d7f7067..2f3a3ca1b33e 100644 --- a/bolt/src/Passes/CMakeLists.txt +++ b/bolt/src/Passes/CMakeLists.txt @@ -34,6 +34,7 @@ add_llvm_library(LLVMBOLTPasses StokeInfo.cpp ValidateInternalCalls.cpp VeneerElimination.cpp + RetpolineInsertion.cpp DEPENDS intrinsics_gen diff --git a/bolt/src/Passes/RetpolineInsertion.cpp b/bolt/src/Passes/RetpolineInsertion.cpp new file mode 100644 index 000000000000..eb66444f99ec --- /dev/null +++ b/bolt/src/Passes/RetpolineInsertion.cpp @@ -0,0 +1,327 @@ +//===--- Passes/RetpolineInsertion.cpp-------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// 
License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class implements a pass that replaces indirect branches (calls and +// jumps) with calls to retpolines to protect against branch target injection +// attacks. +// A unique retpoline is created for each register holding the address of the +// callee, if the callee address is in memory %r11 is used if available to +// hold the address of the callee before calling the retpoline, otherwise an +// address pattern specific retpoline is called where the callee address is +// loaded inside the retpoline. +// The user can determine when to assume %r11 available using r11-availability +// option, by default %r11 is assumed not available. +// Adding lfence instruction to the body of the speculate code is enabled by +// default and can be controlled by the user using retpoline-lfence option. +//===----------------------------------------------------------------------===// +#include "RetpolineInsertion.h" +#include "RewriteInstance.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "bolt-retpoline" + +using namespace llvm; +using namespace bolt; +namespace opts { + +extern cl::OptionCategory BoltCategory; + +llvm::cl::opt +InsertRetpolines("insert-retpolines", + cl::desc("run retpoline insertion pass"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltCategory)); + +llvm::cl::opt +RetpolineLfence("retpoline-lfence", + cl::desc("determine if lfence instruction should exist in the retpoline"), + cl::init(true), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); + +cl::opt +R11Availability("r11-availability", + cl::desc("determine the availablity of r11 before indirect branches"), + cl::init(RetpolineInsertion::AvailabilityOptions::NEVER), + cl::values( + clEnumValN(RetpolineInsertion::AvailabilityOptions::NEVER, + "never", "r11 not available"), + clEnumValN(RetpolineInsertion::AvailabilityOptions::ALWAYS, + "always", "r11 
avaialable before calls and jumps"), + clEnumValN(RetpolineInsertion::AvailabilityOptions::ABI, + "abi", "r11 avaialable before calls but not before jumps")), + cl::ZeroOrMore, + cl::cat(BoltCategory)); + +} // namespace opts + +namespace llvm { +namespace bolt { + +// Retpoline function structure: +// BB0: call BB2 +// BB1: pause +// lfence +// jmp BB1 +// BB2: mov %reg, (%rsp) +// ret +// or +// BB2: push %r11 +// mov Address, %r11 +// mov %r11, 8(%rsp) +// pop %r11 +// ret +BinaryFunction *createNewRetpoline(BinaryContext &BC, + const std::string &RetpolineTag, + const IndirectBranchInfo &BrInfo, + bool R11Available) { + auto &MIB = *BC.MIB; + auto &Ctx = *BC.Ctx.get(); + DEBUG(dbgs() << "BOLT-DEBUG: Creating a new retpoline function[" + << RetpolineTag << "]\n"); + + auto *NewRetpoline = BC.createInjectedBinaryFunction(RetpolineTag, true); + std::vector> NewBlocks(3); + for (int I = 0; I < 3; I++) { + auto Symbol = + Ctx.createTempSymbol(Twine(RetpolineTag + "_BB" + to_string(I)), true); + NewBlocks[I] = NewRetpoline->createBasicBlock( + BinaryBasicBlock::INVALID_OFFSET, Symbol); + } + + auto &BB0 = *NewBlocks[0].get(); + auto &BB1 = *NewBlocks[1].get(); + auto &BB2 = *NewBlocks[2].get(); + + BB0.addSuccessor(&BB2, 0, 0); + BB1.addSuccessor(&BB1, 0, 0); + + // Build BB0 + MCInst DirectCall; + MIB.createDirectCall(DirectCall, BB2.getLabel(), &Ctx); + BB0.addInstruction(DirectCall); + + // Build BB1 + MCInst Pause; + MIB.createPause(Pause); + BB1.addInstruction(Pause); + + if (opts::RetpolineLfence) { + MCInst Lfence; + MIB.createLfence(Lfence); + BB1.addInstruction(Lfence); + } + + std::vector Seq; + MIB.createShortJmp(Seq, BB1.getLabel(), &Ctx); + BB1.addInstructions(Seq.begin(), Seq.end()); + + // Build BB2 + if (BrInfo.isMem()) { + if (R11Available) { + MCInst StoreToStack; + MIB.createSaveToStack(StoreToStack, MIB.getStackPointer(), 0, + MIB.getX86R11(), 8); + BB2.addInstruction(StoreToStack); + } else { + MCInst PushR11; + MIB.createPushRegister(PushR11, 
MIB.getX86R11(), 8); + BB2.addInstruction(PushR11); + + MCInst LoadCalleeAddrs; + MIB.createLoad(LoadCalleeAddrs, BrInfo.BaseRegNum, BrInfo.ScaleValue, + BrInfo.IndexRegNum, BrInfo.DispValue, BrInfo.DispExpr, + MIB.getX86R11(), 8); + BB2.addInstruction(LoadCalleeAddrs); + + MCInst StoreToStack; + MIB.createSaveToStack(StoreToStack, MIB.getStackPointer(), 8, + MIB.getX86R11(), 8); + BB2.addInstruction(StoreToStack); + + MCInst PopR11; + MIB.createPopRegister(PopR11, MIB.getX86R11(), 8); + BB2.addInstruction(PopR11); + } + } else if (BrInfo.isReg()) { + MCInst StoreToStack; + MIB.createSaveToStack(StoreToStack, MIB.getStackPointer(), 0, + BrInfo.BranchReg, 8); + BB2.addInstruction(StoreToStack); + } else { + llvm_unreachable("not expected"); + } + + // return + MCInst Return; + MIB.createReturn(Return); + BB2.addInstruction(Return); + NewRetpoline->insertBasicBlocks(nullptr, move(NewBlocks), + /* UpdateLayout */ true, + /* UpdateCFIState */ false); + + NewRetpoline->updateState(BinaryFunction::State::CFG_Finalized); + return NewRetpoline; +} + +std::string createRetpolineFunctionTag(BinaryContext &BC, + const IndirectBranchInfo &BrInfo, + bool R11Available) { + if (BrInfo.isReg()) + return "__retpoline_r" + to_string(BrInfo.BranchReg) + "_"; + + // Memory Branch + if (R11Available) + return "__retpoline_r11"; + + std::string Tag = "__retpoline_mem_"; + + std::string DispExprStr; + if (BrInfo.DispExpr) { + llvm::raw_string_ostream Ostream(DispExprStr); + BrInfo.DispExpr->print(Ostream, BC.AsmInfo.get()); + Ostream.flush(); + } + + Tag += BrInfo.BaseRegNum != BC.MIB->getX86NoRegister() + ? "r" + to_string(BrInfo.BaseRegNum) + : ""; + Tag += BrInfo.DispValue ? "+" + to_string(BrInfo.DispValue) : ""; + Tag += BrInfo.DispExpr ? "+" + DispExprStr : ""; + Tag += BrInfo.IndexRegNum != BC.MIB->getX86NoRegister() + ? 
"+" + to_string(BrInfo.ScaleValue) + "*" + + to_string(BrInfo.IndexRegNum) + : ""; + + return Tag; +} + +BinaryFunction *RetpolineInsertion::getOrCreateRetpoline( + BinaryContext &BC, const IndirectBranchInfo &BrInfo, bool R11Available) { + const auto RetpolineTag = + createRetpolineFunctionTag(BC, BrInfo, R11Available); + + if (CreatedRetpolines.count(RetpolineTag)) + return CreatedRetpolines[RetpolineTag]; + + return CreatedRetpolines[RetpolineTag] = + createNewRetpoline(BC, RetpolineTag, BrInfo, R11Available); +} + +void createBranchReplacement(BinaryContext &BC, + const IndirectBranchInfo &BrInfo, + bool R11Available, + std::vector &Replacement, + const MCSymbol *RetpolineSymbol) { + auto &MIB = *BC.MIB; + // Load the branch address in r11 if available + if (BrInfo.isMem() && R11Available) { + MCInst LoadCalleeAddrs; + MIB.createLoad(LoadCalleeAddrs, BrInfo.BaseRegNum, BrInfo.ScaleValue, + BrInfo.IndexRegNum, BrInfo.DispValue, BrInfo.DispExpr, + MIB.getX86R11(), 8); + Replacement.push_back(LoadCalleeAddrs); + } + + // Call the retpoline + MCInst RetpolineCall; + if (BrInfo.isJump() || BrInfo.isTailCall()) + MIB.createTailCall(RetpolineCall, RetpolineSymbol, BC.Ctx.get()); + else + MIB.createDirectCall(RetpolineCall, RetpolineSymbol, BC.Ctx.get()); + + Replacement.push_back(RetpolineCall); +} + +IndirectBranchInfo::IndirectBranchInfo(MCInst &Inst, MCPlusBuilder &MIB) { + IsCall = MIB.isCall(Inst); + IsTailCall = MIB.isTailCall(Inst); + + if (MIB.isBranchOnMem(Inst)) { + IsMem = true; + if (!MIB.evaluateX86MemoryOperand(Inst, &BaseRegNum, &ScaleValue, + &IndexRegNum, &DispValue, &SegRegNum, + &DispExpr)) { + assert(false && "not expected"); + } + } else if (MIB.isBranchOnReg(Inst)) { + assert(MCPlus::getNumPrimeOperands(Inst) == 1 && "expect 1 operand"); + BranchReg = Inst.getOperand(0).getReg(); + } else { + llvm_unreachable("unexpected instruction"); + } +} + +void RetpolineInsertion::runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set 
&LargeFunctions) { + + if (!opts::InsertRetpolines) + return; + + assert(BC.isX86() && + "retpoline insertion not supported for target architecture"); + + assert(BC.HasRelocations && "retpoline mode not supported in non-reloc"); + + auto &MIB = *BC.MIB; + uint32_t RetpolinedBranches = 0; + for (auto &It : BFs) { + auto &Function = It.second; + for (auto &BB : Function) { + for (auto It = BB.begin(); It != BB.end(); ++It) { + auto &Inst = *It; + + if (!MIB.isIndirectCall(Inst) && !MIB.isIndirectBranch(Inst)) + continue; + + IndirectBranchInfo BrInfo(Inst, MIB); + bool R11Available = false; + BinaryFunction *TargetRetpoline; + std::vector Replacement; + + // Determine if r11 is available before this instruction + if (BrInfo.isMem()) { + if (opts::R11Availability == AvailabilityOptions::ALWAYS) + R11Available = true; + else if (opts::R11Availability == AvailabilityOptions::ABI) + R11Available = BrInfo.isCall(); + } + + // If the instruction addressing pattern uses rsp and the retpoline + // loads the callee address then displacement needs to be updated + if (BrInfo.isMem() && !R11Available) { + auto Addend = (BrInfo.isJump() || BrInfo.isTailCall()) ? 
8 : 16; + if (BrInfo.BaseRegNum == MIB.getStackPointer()) { + BrInfo.DispValue += Addend; + } + if (BrInfo.IndexRegNum == MIB.getStackPointer()) + BrInfo.DispValue += Addend * BrInfo.ScaleValue; + } + + TargetRetpoline = getOrCreateRetpoline(BC, BrInfo, R11Available); + + createBranchReplacement(BC, BrInfo, R11Available, Replacement, + TargetRetpoline->getSymbol()); + + It = BB.replaceInstruction(It, Replacement.begin(), Replacement.end()); + RetpolinedBranches++; + } + } + } + outs() << "The number of created retpoline functions is : " + << CreatedRetpolines.size() + << "\nThe number of retpolined branches is : " << RetpolinedBranches + << "\n"; +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/src/Passes/RetpolineInsertion.h b/bolt/src/Passes/RetpolineInsertion.h new file mode 100644 index 000000000000..dd29ff47066a --- /dev/null +++ b/bolt/src/Passes/RetpolineInsertion.h @@ -0,0 +1,81 @@ +//===--- Passes/RetpolineInsertion.h ---------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_RETPOLINE_INSERTION_H +#define LLVM_TOOLS_LLVM_BOLT_RETPOLINE_INSERTION_H + +#include "BinaryPasses.h" +#include "BinarySection.h" +#include +#include + +namespace llvm { +namespace bolt { + +struct IndirectBranchInfo { +private: + bool IsMem = false; + bool IsCall = false; + bool IsTailCall = false; + +public: + IndirectBranchInfo(MCInst &Inst, MCPlusBuilder &MIB); + bool isMem() const { return IsMem; } + bool isReg() const { return !IsMem; } + bool isCall() const { return IsCall; } + bool isJump() const { return !IsCall; } + bool isTailCall() const { return IsTailCall; } + + union { + // Register branch information + MCPhysReg BranchReg; + + // Memory branch information + struct { + unsigned BaseRegNum; + int64_t ScaleValue; + unsigned IndexRegNum; + int64_t DispValue; + unsigned SegRegNum; + const MCExpr *DispExpr{nullptr}; + }; + }; +}; + +class RetpolineInsertion : public BinaryFunctionPass { +private: + std::unordered_map CreatedRetpolines; + + BinaryFunction *getOrCreateRetpoline(BinaryContext &BC, + const IndirectBranchInfo &BrInfo, + bool R11Available); + +public: + /// Register r11 availability options + enum AvailabilityOptions : char { + ALWAYS = 0, /// r11 available before calls and jumps + ABI = 1, /// r11 available before calls + NEVER = 2 /// r11 not available + }; + + explicit RetpolineInsertion(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) {} + + const char *getName() const override { return "retpoline-insertion"; } + + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/src/Target/X86/X86MCPlusBuilder.cpp b/bolt/src/Target/X86/X86MCPlusBuilder.cpp index 7f9e88e2ba82..b4a2f1386fcb 100644 --- 
a/bolt/src/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/src/Target/X86/X86MCPlusBuilder.cpp @@ -2660,13 +2660,13 @@ class X86MCPlusBuilder : public MCPlusBuilder { int Offset, const MCPhysReg &DstReg, int Size) const override { return createLoad(Inst, StackReg, /*Scale=*/1, /*IndexReg=*/X86::NoRegister, - Offset, DstReg, Size); + Offset, nullptr, DstReg, Size); } - bool createLoad(MCInst &Inst, const MCPhysReg &BaseReg, - int Scale, const MCPhysReg &IndexReg, - int Offset, const MCPhysReg &DstReg, - int Size) const override { + bool createLoad(MCInst &Inst, const MCPhysReg &BaseReg, int Scale, + const MCPhysReg &IndexReg, int Offset, + const MCExpr *OffsetExpr, const MCPhysReg &DstReg, + int Size) const{ unsigned NewOpcode; switch (Size) { default: @@ -2681,7 +2681,10 @@ class X86MCPlusBuilder : public MCPlusBuilder { Inst.addOperand(MCOperand::createReg(BaseReg)); Inst.addOperand(MCOperand::createImm(Scale)); Inst.addOperand(MCOperand::createReg(IndexReg)); - Inst.addOperand(MCOperand::createImm(Offset)); // Displacement + if (OffsetExpr) + Inst.addOperand(MCOperand::createExpr(OffsetExpr)); // Displacement + else + Inst.addOperand(MCOperand::createImm(Offset)); // Displacement Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg return true; } @@ -2878,6 +2881,91 @@ class X86MCPlusBuilder : public MCPlusBuilder { return true; } + MCPhysReg getX86R11() const override { + return X86::R11; + } + + MCPhysReg getX86NoRegister() const override { + return X86::NoRegister; + } + + void createPause(MCInst &Inst) const override { + Inst.clear(); + Inst.setOpcode(X86::PAUSE); + } + + void createLfence(MCInst &Inst) const override { + Inst.clear(); + Inst.setOpcode(X86::LFENCE); + } + + bool createDirectCall(MCInst &Inst, const MCSymbol *Target, + MCContext *Ctx) override { + Inst.clear(); + Inst.setOpcode(X86::CALL64pcrel32); + Inst.addOperand(MCOperand::createExpr( + MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx))); + return true; + } + 
+ void createShortJmp(std::vector &Seq, const MCSymbol *Target, + MCContext *Ctx) const override { + Seq.clear(); + MCInst Inst; + Inst.setOpcode(X86::JMP_1); + Inst.addOperand(MCOperand::createExpr( + MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx))); + Seq.emplace_back(Inst); + } + + bool isBranchOnMem(const MCInst &Inst) const override { + auto OpCode = Inst.getOpcode(); + if (OpCode == X86::CALL64m || OpCode == X86::TAILJMPm || + OpCode == X86::JMP64m) + return true; + + return false; + } + + bool isBranchOnReg(const MCInst &Inst) const override { + auto OpCode = Inst.getOpcode(); + if (OpCode == X86::CALL64r || OpCode == X86::TAILJMPr || + OpCode == X86::JMP64r) + return true; + + return false; + } + + void createPushRegister(MCInst &Inst, MCPhysReg Reg, + unsigned Size) const override { + Inst.clear(); + unsigned NewOpcode = 0; + switch (Size) { + case 2: NewOpcode = X86::PUSH16r; break; + case 4: NewOpcode = X86::PUSH32r; break; + case 8: NewOpcode = X86::PUSH64r; break; + default: + assert(false); + } + Inst.setOpcode(NewOpcode); + Inst.addOperand(MCOperand::createReg(Reg)); + } + + void createPopRegister(MCInst &Inst, MCPhysReg Reg, + unsigned Size) const override { + Inst.clear(); + unsigned NewOpcode = 0; + switch (Size) { + case 2: NewOpcode = X86::POP16r; break; + case 4: NewOpcode = X86::POP32r; break; + case 8: NewOpcode = X86::POP64r; break; + default: + assert(false); + } + Inst.setOpcode(NewOpcode); + Inst.addOperand(MCOperand::createReg(Reg)); + } + ICPdata indirectCallPromotion( const MCInst &CallInst, const std::vector> &Targets, From 514fe1cf579b7ded1babe698bfb22656f76652a0 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 6 Aug 2018 11:22:45 -0700 Subject: [PATCH 466/904] [BOLT] Detect and handle fixed indirect branches Summary: Sometimes GCC can generate code where one of jump table entries is being used by an indirect branch with a fixed memory reference, such as "jmp *(JT+8)". 
If we don't convert such branches to direct ones and move jump tables, then the indirect branch will reference the old table value and will end up at the non-updated destination, possibly causing a runtime crash. This fix converts such indirect branches into direct ones. For now we mark functions containing indirect branches with fixed destination as non-simple to prevent unreachable code elimination problem triggered by related dead/unreachable jump table. (cherry picked from commit 858f31e80f6ef70afba3f9bdcbfec45ca7fb212d) --- bolt/src/BinaryFunction.cpp | 47 +++++++++++++++++++++--- bolt/src/BinaryFunction.h | 7 +++- bolt/src/MCPlusBuilder.h | 3 +- bolt/src/Target/X86/X86MCPlusBuilder.cpp | 15 ++++++-- 4 files changed, 60 insertions(+), 12 deletions(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 76a7b5f62ee3..68c909872f01 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -666,9 +666,11 @@ void BinaryFunction::printRelocations(raw_ostream &OS, } } -IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, - unsigned Size, - uint64_t Offset) { +IndirectBranchType +BinaryFunction::processIndirectBranch(MCInst &Instruction, + unsigned Size, + uint64_t Offset, + uint64_t &TargetAddress) { const auto PtrSize = BC.AsmInfo->getCodePointerSize(); // An instruction referencing memory used by jump instruction (directly or @@ -789,7 +791,7 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, DEBUG(dbgs() << "BOLT-DEBUG: adjusting size of jump table at 0x" << Twine::utohexstr(JT->getAddress()) << '\n'); JT->OffsetEntries.resize(JTOffset / JT->EntrySize); - } else { + } else if (Type != IndirectBranchType::POSSIBLE_FIXED_BRANCH) { // Re-use an existing jump table. Perhaps parts of it. 
if (Type != IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE) { assert(JT->Type == JumpTable::JTT_NORMAL && @@ -838,6 +840,7 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, // The contents are filled at runtime. return IndirectBranchType::POSSIBLE_TAIL_CALL; } + // Extract the value at the start of the array. StringRef SectionContents = Section->getContents(); const auto EntrySize = @@ -861,6 +864,17 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, } DEBUG(dbgs() << ", which contains value " << Twine::utohexstr(Value) << '\n'); + if (Type == IndirectBranchType::POSSIBLE_FIXED_BRANCH) { + if (Section->isReadOnly()) { + outs() << "BOLT-INFO: fixed indirect branch detected in " << *this + << " at 0x" << Twine::utohexstr(getAddress() + Offset) + << " the destination value is 0x" << Twine::utohexstr(Value) + << '\n'; + TargetAddress = Value; + return Type; + } + return IndirectBranchType::UNKNOWN; + } if (containsAddress(Value) && Value != getAddress()) { // Is it possible to have a jump table with function start as an entry? JTOffsetCandidates.push_back(Value - getAddress()); @@ -920,6 +934,7 @@ IndirectBranchType BinaryFunction::processIndirectBranch(MCInst &Instruction, } assert(!Value || BC.getSectionForAddress(Value)); BC.InterproceduralReferences.insert(Value); + return IndirectBranchType::POSSIBLE_TAIL_CALL; } @@ -1338,7 +1353,9 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { // Could not evaluate branch. Should be an indirect call or an // indirect branch. Bail out on the latter case. 
if (MIB->isIndirectBranch(Instruction)) { - auto Result = processIndirectBranch(Instruction, Size, Offset); + uint64_t IndirectTarget{0}; + auto Result = + processIndirectBranch(Instruction, Size, Offset, IndirectTarget); switch (Result) { default: llvm_unreachable("unexpected result"); @@ -1346,12 +1363,26 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { auto Result = MIB->convertJmpToTailCall(Instruction, BC.Ctx.get()); (void)Result; assert(Result); - } break; + break; + } case IndirectBranchType::POSSIBLE_JUMP_TABLE: case IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE: if (opts::JumpTables == JTS_NONE) IsSimple = false; break; + case IndirectBranchType::POSSIBLE_FIXED_BRANCH: { + if (containsAddress(IndirectTarget)) { + const auto *TargetSymbol = getOrCreateLocalLabel(IndirectTarget); + Instruction.clear(); + MIB->createUncondBranch(Instruction, TargetSymbol, BC.Ctx.get()); + TakenBranches.emplace_back(Offset, IndirectTarget - getAddress()); + HasFixedIndirectBranch = true; + } else { + MIB->convertJmpToTailCall(Instruction, BC.Ctx.get()); + BC.InterproceduralReferences.insert(IndirectTarget); + } + break; + } case IndirectBranchType::UNKNOWN: // Keep processing. We'll do more checks and fixes in // postProcessIndirectBranches(). @@ -1594,6 +1625,10 @@ bool BinaryFunction::postProcessIndirectBranches() { BC.MIB->convertJmpToTailCall(Instr, BC.Ctx.get()); } } + + if (HasFixedIndirectBranch) + return false; + return true; } diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index bd7bb1e0cd43..d753f8845074 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -322,6 +322,10 @@ class BinaryFunction { /// Execution halts whenever this function is entered. bool TrapsOnEntry{false}; + /// True if the function had an indirect branch with a fixed internal + /// destination. + bool HasFixedIndirectBranch{false}; + /// The address for the code for this function in codegen memory. 
uint64_t ImageAddress{0}; @@ -660,7 +664,8 @@ class BinaryFunction { /// added to Instructions list. IndirectBranchType processIndirectBranch(MCInst &Instruction, unsigned Size, - uint64_t Offset); + uint64_t Offset, + uint64_t &TargetAddress); DenseMap> computeLocalUDChain(const MCInst *CurInstr); diff --git a/bolt/src/MCPlusBuilder.h b/bolt/src/MCPlusBuilder.h index d42cb425172a..d0ac8b08aa21 100644 --- a/bolt/src/MCPlusBuilder.h +++ b/bolt/src/MCPlusBuilder.h @@ -48,7 +48,8 @@ enum class IndirectBranchType : char { POSSIBLE_TAIL_CALL, /// Possibly a tail call. POSSIBLE_JUMP_TABLE, /// Possibly a switch/jump table. POSSIBLE_PIC_JUMP_TABLE, /// Possibly a jump table for PIC. - POSSIBLE_GOTO /// Possibly a gcc's computed goto. + POSSIBLE_GOTO, /// Possibly a gcc's computed goto. + POSSIBLE_FIXED_BRANCH, /// Possibly an indirect branch to a fixed location. }; class MCPlusBuilder { diff --git a/bolt/src/Target/X86/X86MCPlusBuilder.cpp b/bolt/src/Target/X86/X86MCPlusBuilder.cpp index b4a2f1386fcb..f1a664275445 100644 --- a/bolt/src/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/src/Target/X86/X86MCPlusBuilder.cpp @@ -2443,10 +2443,21 @@ class X86MCPlusBuilder : public MCPlusBuilder { &DispExpr)) return IndirectBranchType::UNKNOWN; + BaseRegNumOut = BaseRegNum; + IndexRegNumOut = IndexRegNum; + DispValueOut = DispValue; + DispExprOut = DispExpr; + if ((BaseRegNum != X86::NoRegister && BaseRegNum != RIPRegister) || SegRegNum != X86::NoRegister) return IndirectBranchType::UNKNOWN; + if (MemLocInstr == &Instruction && + (!ScaleValue || IndexRegNum == X86::NoRegister)) { + MemLocInstrOut = MemLocInstr; + return IndirectBranchType::POSSIBLE_FIXED_BRANCH; + } + if (Type == IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE && (ScaleValue != 1 || BaseRegNum != RIPRegister)) return IndirectBranchType::UNKNOWN; @@ -2456,10 +2467,6 @@ class X86MCPlusBuilder : public MCPlusBuilder { return IndirectBranchType::UNKNOWN; MemLocInstrOut = MemLocInstr; - BaseRegNumOut = BaseRegNum; - 
IndexRegNumOut = IndexRegNum; - DispValueOut = DispValue; - DispExprOut = DispExpr; return Type; } From e1f7755e03367998880bbda5f877a1f40a86e196 Mon Sep 17 00:00:00 2001 From: Laith Saed Sakka Date: Fri, 3 Aug 2018 16:36:06 -0700 Subject: [PATCH 467/904] retpoline insertion : further updates. Summary: Couple of updates: 1) Handle address pattern with segment register. 2) Assume R11 available for PLT calls always. 3) Add CFI state to each BB. 4) early exit getMacroOpFusionPair if Instruction.size() <2. (cherry picked from commit 88ae1287f64f34b91a258d895e8cb0b3308c7219) --- bolt/src/BinaryBasicBlock.cpp | 2 +- bolt/src/MCPlusBuilder.h | 9 ++++---- bolt/src/Passes/PLTCall.cpp | 1 + bolt/src/Passes/RetpolineInsertion.cpp | 26 ++++++++++++++++-------- bolt/src/Target/X86/X86MCPlusBuilder.cpp | 16 +++++++++------ 5 files changed, 35 insertions(+), 19 deletions(-) diff --git a/bolt/src/BinaryBasicBlock.cpp b/bolt/src/BinaryBasicBlock.cpp index 908e9587fe03..9e6567d19191 100644 --- a/bolt/src/BinaryBasicBlock.cpp +++ b/bolt/src/BinaryBasicBlock.cpp @@ -358,7 +358,7 @@ BinaryBasicBlock::getMacroOpFusionPair() const { if (!Function->getBinaryContext().isX86()) return end(); - if (succ_size() != 2) + if (getNumNonPseudos() < 2 || succ_size() != 2) return end(); auto RI = getLastNonPseudo(); diff --git a/bolt/src/MCPlusBuilder.h b/bolt/src/MCPlusBuilder.h index d0ac8b08aa21..3438342a8aa1 100644 --- a/bolt/src/MCPlusBuilder.h +++ b/bolt/src/MCPlusBuilder.h @@ -1298,10 +1298,11 @@ class MCPlusBuilder { return false; } - virtual bool createLoad(MCInst &Inst, const MCPhysReg &BaseReg, int Scale, - const MCPhysReg &IndexReg, int Offset, - const MCExpr *OffsetExpr, const MCPhysReg &DstReg, - int Size) const { + virtual bool createLoad(MCInst &Inst, const MCPhysReg &BaseReg, int64_t Scale, + const MCPhysReg &IndexReg, int64_t Offset, + const MCExpr *OffsetExpr, + const MCPhysReg &AddrSegmentReg, + const MCPhysReg &DstReg, int Size) const { llvm_unreachable("not implemented"); return 
false; } diff --git a/bolt/src/Passes/PLTCall.cpp b/bolt/src/Passes/PLTCall.cpp index 8d8351261ca1..966ff2e09192 100644 --- a/bolt/src/Passes/PLTCall.cpp +++ b/bolt/src/Passes/PLTCall.cpp @@ -76,6 +76,7 @@ void PLTCall::runOnFunctions( BC.MIB->convertCallToIndirectCall(Instr, CalleeBF->getPLTSymbol(), BC.Ctx.get()); + BC.MIB->addAnnotation(Instr, "PLTCall", true); ++NumCallsOptimized; } } diff --git a/bolt/src/Passes/RetpolineInsertion.cpp b/bolt/src/Passes/RetpolineInsertion.cpp index eb66444f99ec..5f330e32f791 100644 --- a/bolt/src/Passes/RetpolineInsertion.cpp +++ b/bolt/src/Passes/RetpolineInsertion.cpp @@ -95,6 +95,7 @@ BinaryFunction *createNewRetpoline(BinaryContext &BC, Ctx.createTempSymbol(Twine(RetpolineTag + "_BB" + to_string(I)), true); NewBlocks[I] = NewRetpoline->createBasicBlock( BinaryBasicBlock::INVALID_OFFSET, Symbol); + NewBlocks[I].get()->setCFIState(0); } auto &BB0 = *NewBlocks[0].get(); @@ -139,7 +140,8 @@ BinaryFunction *createNewRetpoline(BinaryContext &BC, MCInst LoadCalleeAddrs; MIB.createLoad(LoadCalleeAddrs, BrInfo.BaseRegNum, BrInfo.ScaleValue, BrInfo.IndexRegNum, BrInfo.DispValue, BrInfo.DispExpr, - MIB.getX86R11(), 8); + BrInfo.SegRegNum, MIB.getX86R11(), 8); + BB2.addInstruction(LoadCalleeAddrs); MCInst StoreToStack; @@ -194,13 +196,19 @@ std::string createRetpolineFunctionTag(BinaryContext &BC, Tag += BrInfo.BaseRegNum != BC.MIB->getX86NoRegister() ? "r" + to_string(BrInfo.BaseRegNum) : ""; - Tag += BrInfo.DispValue ? "+" + to_string(BrInfo.DispValue) : ""; - Tag += BrInfo.DispExpr ? "+" + DispExprStr : ""; + + Tag += + BrInfo.DispExpr ? "+" + DispExprStr : "+" + to_string(BrInfo.DispValue); + Tag += BrInfo.IndexRegNum != BC.MIB->getX86NoRegister() ? "+" + to_string(BrInfo.ScaleValue) + "*" + to_string(BrInfo.IndexRegNum) : ""; + Tag += BrInfo.SegRegNum != BC.MIB->getX86NoRegister() + ? 
"_seg_" + to_string(BrInfo.SegRegNum) + : ""; + return Tag; } @@ -227,7 +235,7 @@ void createBranchReplacement(BinaryContext &BC, MCInst LoadCalleeAddrs; MIB.createLoad(LoadCalleeAddrs, BrInfo.BaseRegNum, BrInfo.ScaleValue, BrInfo.IndexRegNum, BrInfo.DispValue, BrInfo.DispExpr, - MIB.getX86R11(), 8); + BrInfo.SegRegNum, MIB.getX86R11(), 8); Replacement.push_back(LoadCalleeAddrs); } @@ -250,7 +258,7 @@ IndirectBranchInfo::IndirectBranchInfo(MCInst &Inst, MCPlusBuilder &MIB) { if (!MIB.evaluateX86MemoryOperand(Inst, &BaseRegNum, &ScaleValue, &IndexRegNum, &DispValue, &SegRegNum, &DispExpr)) { - assert(false && "not expected"); + llvm_unreachable("not expected"); } } else if (MIB.isBranchOnReg(Inst)) { assert(MCPlus::getNumPrimeOperands(Inst) == 1 && "expect 1 operand"); @@ -290,7 +298,9 @@ void RetpolineInsertion::runOnFunctions(BinaryContext &BC, // Determine if r11 is available before this instruction if (BrInfo.isMem()) { - if (opts::R11Availability == AvailabilityOptions::ALWAYS) + if(MIB.hasAnnotation(Inst, "PLTCall")) + R11Available= true; + else if (opts::R11Availability == AvailabilityOptions::ALWAYS) R11Available = true; else if (opts::R11Availability == AvailabilityOptions::ABI) R11Available = BrInfo.isCall(); @@ -317,9 +327,9 @@ void RetpolineInsertion::runOnFunctions(BinaryContext &BC, } } } - outs() << "The number of created retpoline functions is : " + outs() << "BOLT-INFO: The number of created retpoline functions is : " << CreatedRetpolines.size() - << "\nThe number of retpolined branches is : " << RetpolinedBranches + << "\nBOLT-INFO: The number of retpolined branches is : " << RetpolinedBranches << "\n"; } diff --git a/bolt/src/Target/X86/X86MCPlusBuilder.cpp b/bolt/src/Target/X86/X86MCPlusBuilder.cpp index f1a664275445..5497db157f8d 100644 --- a/bolt/src/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/src/Target/X86/X86MCPlusBuilder.cpp @@ -1040,6 +1040,9 @@ class X86MCPlusBuilder : public MCPlusBuilder { } else { assert(DispExpr && "DispExpr needs to 
be set"); *DispExpr = Disp.getExpr(); + if (DispImm) { + *DispImm = 0; + } } *SegmentRegNum = Segment.getReg(); return true; @@ -2667,13 +2670,14 @@ class X86MCPlusBuilder : public MCPlusBuilder { int Offset, const MCPhysReg &DstReg, int Size) const override { return createLoad(Inst, StackReg, /*Scale=*/1, /*IndexReg=*/X86::NoRegister, - Offset, nullptr, DstReg, Size); + Offset, nullptr, /*AddrSegmentReg=*/X86::NoRegister, + DstReg, Size); } - bool createLoad(MCInst &Inst, const MCPhysReg &BaseReg, int Scale, - const MCPhysReg &IndexReg, int Offset, - const MCExpr *OffsetExpr, const MCPhysReg &DstReg, - int Size) const{ + bool createLoad(MCInst &Inst, const MCPhysReg &BaseReg, int64_t Scale, + const MCPhysReg &IndexReg, int64_t Offset, + const MCExpr *OffsetExpr, const MCPhysReg &AddrSegmentReg, + const MCPhysReg &DstReg, int Size) const { unsigned NewOpcode; switch (Size) { default: @@ -2692,7 +2696,7 @@ class X86MCPlusBuilder : public MCPlusBuilder { Inst.addOperand(MCOperand::createExpr(OffsetExpr)); // Displacement else Inst.addOperand(MCOperand::createImm(Offset)); // Displacement - Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg + Inst.addOperand(MCOperand::createReg(AddrSegmentReg)); // AddrSegmentReg return true; } From 28a5e0688968570af9a47c18d36063e07c07f43f Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 13 Aug 2018 14:36:38 -0700 Subject: [PATCH 468/904] [BOLT] Fix pseudo calculation in BinaryBasicBlock Summary: A recent commit broke our tests because it was depending on getNumNonPseudos() at a very late stage of our optimization pipeline. The problem was in a instruction deletion member function in BinaryBasicBlock that was not updating the number of pseudos after deletion. Fix this. 
(cherry picked from commit 86cd4ac1a2626da9c5cf10652a3c458924006046) --- bolt/src/BinaryBasicBlock.h | 1 + 1 file changed, 1 insertion(+) diff --git a/bolt/src/BinaryBasicBlock.h b/bolt/src/BinaryBasicBlock.h index 66f0da262e81..6cbabaf29c52 100644 --- a/bolt/src/BinaryBasicBlock.h +++ b/bolt/src/BinaryBasicBlock.h @@ -669,6 +669,7 @@ class BinaryBasicBlock { /// Erase non-pseudo instruction at a given iterator \p II. iterator eraseInstruction(iterator II) { + adjustNumPseudos(*II, -1); return Instructions.erase(II); } From c233ce43986d2b829b404ce03436b0fc38241e24 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 14 Aug 2018 13:24:44 -0700 Subject: [PATCH 469/904] [perf2bolt] Use mmap events for PID collection Summary: Switch from using `perf script --show-task-events` to `perf script --show-mmap-events` for associating a binary with PIDs in perf.data. The output of the former command does not provide enough information for PIE/.so processing. (cherry picked from commit 7256d8ad6af857f0f24bb61eacb80005a1110c1d) --- bolt/src/DataAggregator.cpp | 93 ++++++++++++++++++++----------------- bolt/src/DataAggregator.h | 29 ++++++------ 2 files changed, 64 insertions(+), 58 deletions(-) diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index bddc4c3cdd78..15fce830ad4c 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -94,7 +94,7 @@ void DataAggregator::start(StringRef PerfDataFilename) { findPerfExecutable(); launchPerfBranchEventsNoWait(); launchPerfMemEventsNoWait(); - launchPerfTasksNoWait(); + launchPerfMMapEventsNoWait(); } void DataAggregator::abort() { @@ -104,7 +104,7 @@ void DataAggregator::abort() { std::string Error; // Kill subprocesses in case they are not finished - sys::Wait(TasksPI, 1, false, &Error); + sys::Wait(MMapEventsPI, 1, false, &Error); sys::Wait(BranchEventsPI, 1, false, &Error); sys::Wait(MemEventsPI, 1, false, &Error); @@ -201,42 +201,43 @@ bool DataAggregator::launchPerfMemEventsNoWait() 
{ return true; } -bool DataAggregator::launchPerfTasksNoWait() { +bool DataAggregator::launchPerfMMapEventsNoWait() { SmallVector Argv; - outs() << "PERF2BOLT: Spawning perf-script job to read tasks\n"; + outs() << "PERF2BOLT: Spawning perf-script job to read process info\n"; Argv.push_back(PerfPath.data()); Argv.push_back("script"); - Argv.push_back("--show-task-events"); + Argv.push_back("--show-mmap-events"); Argv.push_back("-i"); Argv.push_back(PerfDataFilename.data()); Argv.push_back(nullptr); if (auto Errc = sys::fs::createTemporaryFile("perf.script", "out", - PerfTasksOutputPath)) { + PerfMMapEventsOutputPath)) { outs() << "PERF2BOLT: Failed to create temporary file " - << PerfTasksOutputPath << " with error " << Errc.message() << "\n"; + << PerfMMapEventsOutputPath << " with error " << Errc.message() + << "\n"; exit(1); } if (auto Errc = sys::fs::createTemporaryFile("perf.script", "err", - PerfTasksErrPath)) { + PerfMMapEventsErrPath)) { outs() << "PERF2BOLT: Failed to create temporary file " - << PerfTasksErrPath << " with error " << Errc.message() << "\n"; + << PerfMMapEventsErrPath << " with error " << Errc.message() << "\n"; exit(1); } Optional Redirects[] = { - llvm::None, // Stdin - StringRef(PerfTasksOutputPath.data()), // Stdout - StringRef(PerfTasksErrPath.data())}; // Stderr + llvm::None, // Stdin + StringRef(PerfMMapEventsOutputPath.data()), // Stdout + StringRef(PerfMMapEventsErrPath.data())}; // Stderr DEBUG(dbgs() << "Launching perf: " << PerfPath.data() << " 1> " - << PerfTasksOutputPath.data() << " 2> " - << PerfTasksErrPath.data() << "\n"); + << PerfMMapEventsOutputPath.data() << " 2> " + << PerfMMapEventsErrPath.data() << "\n"); - TasksPI = sys::ExecuteNoWait(PerfPath.data(), Argv.data(), - /*envp*/ nullptr, Redirects); + MMapEventsPI = sys::ExecuteNoWait(PerfPath.data(), Argv.data(), + /*envp*/ nullptr, Redirects); return true; } @@ -296,7 +297,7 @@ void DataAggregator::processFileBuildID(StringRef FileBuildID) { ErrorOr> MB = 
MemoryBuffer::getFileOrSTDIN(OutputPath.data()); if (std::error_code EC = MB.getError()) { - errs() << "Cannot open " << PerfTasksOutputPath.data() << ": " + errs() << "Cannot open " << PerfMMapEventsOutputPath.data() << ": " << EC.message() << "\n"; deleteTempFile(ErrPath.data()); deleteTempFile(OutputPath.data()); @@ -373,8 +374,8 @@ void DataAggregator::deleteTempFiles() { deleteTempFile(PerfBranchEventsOutputPath.data()); deleteTempFile(PerfMemEventsErrPath.data()); deleteTempFile(PerfMemEventsOutputPath.data()); - deleteTempFile(PerfTasksErrPath.data()); - deleteTempFile(PerfTasksOutputPath.data()); + deleteTempFile(PerfMMapEventsErrPath.data()); + deleteTempFile(PerfMMapEventsOutputPath.data()); } bool DataAggregator::processPreAggregated() { @@ -419,8 +420,8 @@ bool DataAggregator::aggregate(BinaryContext &BC, if (opts::ReadPreAggregated) return processPreAggregated(); - outs() << "PERF2BOLT: Waiting for perf tasks collection to finish...\n"; - auto PI1 = sys::Wait(TasksPI, 0, true, &Error); + outs() << "PERF2BOLT: Waiting for perf mmap events collection to finish...\n"; + auto PI1 = sys::Wait(MMapEventsPI, 0, true, &Error); if (!Error.empty()) { errs() << "PERF-ERROR: " << Error << "\n"; @@ -430,7 +431,7 @@ bool DataAggregator::aggregate(BinaryContext &BC, if (PI1.ReturnCode != 0) { ErrorOr> MB = - MemoryBuffer::getFileOrSTDIN(PerfTasksErrPath.data()); + MemoryBuffer::getFileOrSTDIN(PerfMMapEventsErrPath.data()); StringRef ErrBuf = (*MB)->getBuffer(); errs() << "PERF-ERROR: Return code " << PI1.ReturnCode << "\n"; @@ -440,9 +441,9 @@ bool DataAggregator::aggregate(BinaryContext &BC, } ErrorOr> MB1 = - MemoryBuffer::getFileOrSTDIN(PerfTasksOutputPath.data()); + MemoryBuffer::getFileOrSTDIN(PerfMMapEventsOutputPath.data()); if (std::error_code EC = MB1.getError()) { - errs() << "Cannot open " << PerfTasksOutputPath.data() << ": " + errs() << "Cannot open " << PerfMMapEventsOutputPath.data() << ": " << EC.message() << "\n"; deleteTempFiles(); exit(1); @@ 
-452,8 +453,8 @@ bool DataAggregator::aggregate(BinaryContext &BC, ParsingBuf = FileBuf->getBuffer(); Col = 0; Line = 1; - if (parseTasks()) { - outs() << "PERF2BOLT: Failed to parse tasks\n"; + if (parseMMapEvents()) { + outs() << "PERF2BOLT: Failed to parse mmap events\n"; } outs() @@ -760,7 +761,7 @@ ErrorOr DataAggregator::parseBranchSample() { auto PIDRes = parseNumberField(FieldSeparator, true); if (std::error_code EC = PIDRes.getError()) return EC; - if (!PIDs.empty() && !PIDs.count(PIDRes.get())) { + if (!PIDs.count(PIDRes.get())) { consumeRestOfLine(); return Res; } @@ -783,7 +784,7 @@ ErrorOr DataAggregator::parseBasicSample() { auto PIDRes = parseNumberField(FieldSeparator, true); if (std::error_code EC = PIDRes.getError()) return EC; - if (!PIDs.empty() && !PIDs.count(PIDRes.get())) { + if (!PIDs.count(PIDRes.get())) { consumeRestOfLine(); return PerfBasicSample{StringRef(), 0}; } @@ -817,7 +818,7 @@ ErrorOr DataAggregator::parseMemSample() { auto PIDRes = parseNumberField(FieldSeparator, true); if (std::error_code EC = PIDRes.getError()) return EC; - if (!PIDs.empty() && !PIDs.count(PIDRes.get())) { + if (!PIDs.count(PIDRes.get())) { consumeRestOfLine(); return Res; } @@ -1194,7 +1195,7 @@ std::error_code DataAggregator::parseAggregatedLBRSamples() { return std::error_code(); } -ErrorOr> DataAggregator::parseTaskPID() { +ErrorOr> DataAggregator::parseMMapEvent() { while (checkAndConsumeFS()) {} auto LineEnd = ParsingBuf.find_first_of("\n"); @@ -1205,17 +1206,22 @@ ErrorOr> DataAggregator::parseTaskPID() { } StringRef Line = ParsingBuf.substr(0, LineEnd); - if (Line.find("PERF_RECORD_COMM") == StringRef::npos) { + auto Pos = Line.find("PERF_RECORD_MMAP2"); + if (Pos == StringRef::npos) { consumeRestOfLine(); return std::make_pair(StringRef(), -1); } + Line = Line.drop_front(Pos); - auto FileName = Line.split(FieldSeparator).first; - if (FileName == "PERF_RECORD_COMM") - FileName = Line.rsplit(':').first.rsplit(FieldSeparator).second; + auto FileName = 
Line.rsplit(FieldSeparator).second; + if (FileName.startswith("//") || FileName.startswith("[")) { + consumeRestOfLine(); + return std::make_pair(StringRef(), -1); + } + FileName = sys::path::filename(FileName); int64_t PID; - StringRef PIDStr = Line.rsplit(':').second.split('/').first; + StringRef PIDStr = Line.split(FieldSeparator).second.split('/').first; if (PIDStr.getAsInteger(10, PID)) { reportError("expected PID"); Diag << "Found: " << PIDStr << "\n"; @@ -1227,14 +1233,14 @@ ErrorOr> DataAggregator::parseTaskPID() { return std::make_pair(FileName, PID); } -std::error_code DataAggregator::parseTasks() { - outs() << "PERF2BOLT: Parsing perf-script tasks output\n"; - NamedRegionTimer T("parseTasks", "Tasks parsing", TimerGroupName, +std::error_code DataAggregator::parseMMapEvents() { + outs() << "PERF2BOLT: Parsing perf-script mmap events output\n"; + NamedRegionTimer T("parseMMapEvents", "Parsing mmap events", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); std::multimap BinaryPIDs; while (hasData()) { - auto NamePIDRes = parseTaskPID(); + auto NamePIDRes = parseMMapEvent(); if (std::error_code EC = NamePIDRes.getError()) return EC; @@ -1264,10 +1270,7 @@ std::error_code DataAggregator::parseTasks() { PIDs.insert(I->second); } - if (!PIDs.empty()) { - outs() << "PERF2BOLT: Input binary is associated with " << PIDs.size() - << " PID(s)\n"; - } else { + if (PIDs.empty()) { if (errs().has_colors()) errs().changeColor(raw_ostream::RED); errs() << "PERF2BOLT-ERROR: could not find a profile matching binary \"" @@ -1284,9 +1287,13 @@ std::error_code DataAggregator::parseTasks() { } if (errs().has_colors()) errs().resetColor(); + exit(1); } + outs() << "PERF2BOLT: Input binary is associated with " << PIDs.size() + << " PID(s)\n"; + return std::error_code(); } diff --git a/bolt/src/DataAggregator.h b/bolt/src/DataAggregator.h index 886117e729ce..20bdd0e4d529 100644 --- a/bolt/src/DataAggregator.h +++ b/bolt/src/DataAggregator.h @@ -77,13 +77,13 @@ class 
DataAggregator : public DataReader { std::string PerfPath; sys::ProcessInfo BranchEventsPI; sys::ProcessInfo MemEventsPI; - sys::ProcessInfo TasksPI; + sys::ProcessInfo MMapEventsPI; SmallVector PerfBranchEventsOutputPath; SmallVector PerfBranchEventsErrPath; SmallVector PerfMemEventsOutputPath; SmallVector PerfMemEventsErrPath; - SmallVector PerfTasksOutputPath; - SmallVector PerfTasksErrPath; + SmallVector PerfMMapEventsOutputPath; + SmallVector PerfMMapEventsErrPath; /// Whether aggregator was scheduled to run bool Enabled{false}; @@ -98,7 +98,7 @@ class DataAggregator : public DataReader { std::string BinaryName; /// Name of the binary with matching build-id from perf.data if different - /// from BinaryName; + /// from BinaryName. std::string BuildIDBinaryName; DenseSet PIDs; @@ -122,10 +122,9 @@ class DataAggregator : public DataReader { /// to an output file we will parse later bool launchPerfMemEventsNoWait(); - /// Launch a subprocess to read all perf task events. They contain the mapping - /// of binary file name to PIDs used during data collection time. We later use - /// the PIDs to filter samples. - bool launchPerfTasksNoWait(); + /// Launch a subprocess to read memory mapping for the binary. We later use + /// PIDs to filter samples, and memory mapping to adjust addresses. + bool launchPerfMMapEventsNoWait(); /// Delete all temporary files created to hold the output generated by spawned /// subprocesses during the aggregation job @@ -205,14 +204,14 @@ class DataAggregator : public DataReader { /// an external tool. std::error_code parseAggregatedLBRSamples(); - /// Parse a single line of a PERF_RECORD_COMM event looking for an association - /// between the binary name and its PID. On success return a - /// pair. - ErrorOr> parseTaskPID(); + /// Parse a single line of a PERF_RECORD_MMAP2 event looking for an + /// association between the binary name and its PID. + /// On success return a pair. 
+ ErrorOr> parseMMapEvent(); - /// Parse the full output generated by perf script to report PERF_RECORD_COMM - /// events with the association of binary file names and their PIDs. - std::error_code parseTasks(); + /// Parse the full output generated by perf script to report PERF_RECORD_MMAP2 + /// events with the association of binary file name and their PIDs. + std::error_code parseMMapEvents(); /// Parse a single pair of binary full path and associated build-id Optional> parseNameBuildIDPair(); From 50cf1b65e2cf4324fb6dcf0c6c6b91be5e1d6f37 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 14 Aug 2018 13:24:44 -0700 Subject: [PATCH 470/904] [perf2bolt] Support profiling of PIEs and .so's Summary: Processing profile data for binaries with flexible load address (such as position-independent executables and shared objects) requires adjusting binary addresses depending on the base load address. For every PID the mapping will be more or less unique when executing with ASLR enabled, thus we have to keep the mapping record for all PIDs associated with the binary. Then we adjust the addresses based on those mappings. (cherry picked from commit 61df17afe41b50d61b43c321bf4ce818262ea398) --- bolt/src/BinaryContext.h | 3 + bolt/src/DataAggregator.cpp | 111 +++++++++++++++++++++++++---------- bolt/src/DataAggregator.h | 66 +++++++++++++++------ bolt/src/RewriteInstance.cpp | 6 +- 4 files changed, 134 insertions(+), 52 deletions(-) diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index 5817e15573ea..6db40c3da746 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -256,6 +256,9 @@ class BinaryContext { /// Indicates if relocations are available for usage. bool HasRelocations{false}; + /// Is the binary always loaded at a fixed address. 
+ bool HasFixedLoadAddress{true}; + /// Sum of execution count of all functions uint64_t SumExecutionCount{0}; diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index 15fce830ad4c..df6aa3d02737 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -321,7 +321,8 @@ void DataAggregator::processFileBuildID(StringRef FileBuildID) { errs() << "PERF2BOLT-ERROR: failed to match build-id from perf output. " "This indicates the input binary supplied for data aggregation " "is not the same recorded by perf when collecting profiling " - "data. Use -ignore-build-id option to override.\n"; + "data, or there were no samples recorded for the binary. " + "Use -ignore-build-id option to override.\n"; if (!opts::IgnoreBuildID) { deleteTempFile(ErrPath.data()); deleteTempFile(OutputPath.data()); @@ -761,7 +762,8 @@ ErrorOr DataAggregator::parseBranchSample() { auto PIDRes = parseNumberField(FieldSeparator, true); if (std::error_code EC = PIDRes.getError()) return EC; - if (!PIDs.count(PIDRes.get())) { + auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes); + if (MMapInfoIter == BinaryMMapInfo.end()) { consumeRestOfLine(); return Res; } @@ -772,7 +774,10 @@ ErrorOr DataAggregator::parseBranchSample() { auto LBRRes = parseLBREntry(); if (std::error_code EC = LBRRes.getError()) return EC; - Res.LBR.push_back(LBRRes.get()); + auto LBR = LBRRes.get(); + if (!BC->HasFixedLoadAddress) + adjustLBR(LBR, MMapInfoIter->second); + Res.LBR.push_back(LBR); } return Res; @@ -784,7 +789,9 @@ ErrorOr DataAggregator::parseBasicSample() { auto PIDRes = parseNumberField(FieldSeparator, true); if (std::error_code EC = PIDRes.getError()) return EC; - if (!PIDs.count(PIDRes.get())) { + + auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes); + if (MMapInfoIter == BinaryMMapInfo.end()) { consumeRestOfLine(); return PerfBasicSample{StringRef(), 0}; } @@ -807,7 +814,11 @@ ErrorOr DataAggregator::parseBasicSample() { return make_error_code(llvm::errc::io_error); } - return 
PerfBasicSample{Event.get(), AddrRes.get()}; + auto Address = *AddrRes; + if (!BC->HasFixedLoadAddress) + adjustAddress(Address, MMapInfoIter->second); + + return PerfBasicSample{Event.get(), Address}; } ErrorOr DataAggregator::parseMemSample() { @@ -818,7 +829,9 @@ ErrorOr DataAggregator::parseMemSample() { auto PIDRes = parseNumberField(FieldSeparator, true); if (std::error_code EC = PIDRes.getError()) return EC; - if (!PIDs.count(PIDRes.get())) { + + auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes); + if (MMapInfoIter == BinaryMMapInfo.end()) { consumeRestOfLine(); return Res; } @@ -853,7 +866,11 @@ ErrorOr DataAggregator::parseMemSample() { return make_error_code(llvm::errc::io_error); } - return PerfMemSample{PCRes.get(), AddrRes.get()}; + auto Address = *AddrRes; + if (!BC->HasFixedLoadAddress) + adjustAddress(Address, MMapInfoIter->second); + + return PerfMemSample{PCRes.get(), Address}; } ErrorOr DataAggregator::parseLocationOrOffset() { @@ -1195,9 +1212,12 @@ std::error_code DataAggregator::parseAggregatedLBRSamples() { return std::error_code(); } -ErrorOr> DataAggregator::parseMMapEvent() { +ErrorOr> +DataAggregator::parseMMapEvent() { while (checkAndConsumeFS()) {} + MMapInfo ParsedInfo; + auto LineEnd = ParsingBuf.find_first_of("\n"); if (LineEnd == StringRef::npos) { reportError("expected rest of line"); @@ -1209,28 +1229,41 @@ ErrorOr> DataAggregator::parseMMapEvent() { auto Pos = Line.find("PERF_RECORD_MMAP2"); if (Pos == StringRef::npos) { consumeRestOfLine(); - return std::make_pair(StringRef(), -1); + return std::make_pair(StringRef(), ParsedInfo); } Line = Line.drop_front(Pos); auto FileName = Line.rsplit(FieldSeparator).second; if (FileName.startswith("//") || FileName.startswith("[")) { consumeRestOfLine(); - return std::make_pair(StringRef(), -1); + return std::make_pair(StringRef(), ParsedInfo); } FileName = sys::path::filename(FileName); - int64_t PID; StringRef PIDStr = Line.split(FieldSeparator).second.split('/').first; - if 
(PIDStr.getAsInteger(10, PID)) { + if (PIDStr.getAsInteger(10, ParsedInfo.PID)) { reportError("expected PID"); - Diag << "Found: " << PIDStr << "\n"; + Diag << "Found: " << PIDStr << "in '" << Line << "'\n"; + return make_error_code(llvm::errc::io_error); + } + + StringRef BaseAddressStr = Line.split('[').second.split('(').first; + if (BaseAddressStr.getAsInteger(0, ParsedInfo.BaseAddress)) { + reportError("expected base address"); + Diag << "Found: " << BaseAddressStr << "in '" << Line << "'\n"; + return make_error_code(llvm::errc::io_error); + } + + StringRef SizeStr = Line.split('(').second.split(')').first; + if (SizeStr.getAsInteger(0, ParsedInfo.Size)) { + reportError("expected mmaped size"); + Diag << "Found: " << SizeStr << "in '" << Line << "'\n"; return make_error_code(llvm::errc::io_error); } consumeRestOfLine(); - return std::make_pair(FileName, PID); + return std::make_pair(FileName, ParsedInfo); } std::error_code DataAggregator::parseMMapEvents() { @@ -1238,47 +1271,61 @@ std::error_code DataAggregator::parseMMapEvents() { NamedRegionTimer T("parseMMapEvents", "Parsing mmap events", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); - std::multimap BinaryPIDs; + std::multimap GlobalMMapInfo; while (hasData()) { - auto NamePIDRes = parseMMapEvent(); - if (std::error_code EC = NamePIDRes.getError()) + auto FileMMapInfoRes = parseMMapEvent(); + if (std::error_code EC = FileMMapInfoRes.getError()) return EC; - auto NamePIDPair = NamePIDRes.get(); - if (NamePIDPair.second == -1) + auto FileMMapInfo = FileMMapInfoRes.get(); + if (FileMMapInfo.second.PID == -1) + continue; + + // Consider only the first mapping of the file for any given PID + bool PIDExists = false; + auto Range = GlobalMMapInfo.equal_range(FileMMapInfo.first); + for (auto MI = Range.first; MI != Range.second; ++MI) { + if (MI->second.PID == FileMMapInfo.second.PID) { + PIDExists = true; + break; + } + } + if (PIDExists) continue; - BinaryPIDs.insert(NamePIDPair); + 
GlobalMMapInfo.insert(FileMMapInfo); } DEBUG( - dbgs() << "FileName -> PID mapping:\n"; - for (const auto &Pair : BinaryPIDs) { - dbgs() << " " << Pair.first << " : " << Pair.second << '\n'; + dbgs() << "FileName -> mmap info:\n"; + for (const auto &Pair : GlobalMMapInfo) { + dbgs() << " " << Pair.first << " : " << Pair.second.PID << " [0x" + << Twine::utohexstr(Pair.second.BaseAddress) << ", " + << Twine::utohexstr(Pair.second.Size) << "]\n"; } ); auto NameToUse = BinaryName.substr(0, 15); - if (BinaryPIDs.count(NameToUse) == 0 && !BuildIDBinaryName.empty()) { + if (GlobalMMapInfo.count(NameToUse) == 0 && !BuildIDBinaryName.empty()) { errs() << "PERF2BOLT-WARNING: using \"" << BuildIDBinaryName << "\" for profile matching\n"; NameToUse = BuildIDBinaryName.substr(0, 15); } - auto Range = BinaryPIDs.equal_range(NameToUse); + auto Range = GlobalMMapInfo.equal_range(NameToUse); for (auto I = Range.first; I != Range.second; ++I) { - PIDs.insert(I->second); + BinaryMMapInfo.insert(std::make_pair(I->second.PID, I->second)); } - if (PIDs.empty()) { + if (BinaryMMapInfo.empty()) { if (errs().has_colors()) errs().changeColor(raw_ostream::RED); errs() << "PERF2BOLT-ERROR: could not find a profile matching binary \"" << BinaryName << "\"."; - if (!BinaryPIDs.empty()) { + if (!GlobalMMapInfo.empty()) { errs() << " Profile for the following binary name(s) is available:\n"; - for (auto I = BinaryPIDs.begin(), IE = BinaryPIDs.end(); I != IE; - I = BinaryPIDs.upper_bound(I->first)) { + for (auto I = GlobalMMapInfo.begin(), IE = GlobalMMapInfo.end(); I != IE; + I = GlobalMMapInfo.upper_bound(I->first)) { errs() << " " << I->first << '\n'; } errs() << "Please rename the input binary.\n"; @@ -1291,8 +1338,8 @@ std::error_code DataAggregator::parseMMapEvents() { exit(1); } - outs() << "PERF2BOLT: Input binary is associated with " << PIDs.size() - << " PID(s)\n"; + outs() << "PERF2BOLT: Input binary is associated with " + << BinaryMMapInfo.size() << " PID(s)\n"; return 
std::error_code(); } diff --git a/bolt/src/DataAggregator.h b/bolt/src/DataAggregator.h index 20bdd0e4d529..64a2e8e2567b 100644 --- a/bolt/src/DataAggregator.h +++ b/bolt/src/DataAggregator.h @@ -21,6 +21,7 @@ #include "llvm/Support/Path.h" #include "llvm/Support/Program.h" #include +#include namespace llvm { namespace bolt { @@ -101,7 +102,15 @@ class DataAggregator : public DataReader { /// from BinaryName. std::string BuildIDBinaryName; - DenseSet PIDs; + /// Memory map info for a single file + struct MMapInfo { + int64_t PID{-1LL}; + uint64_t BaseAddress; + uint64_t Size; + }; + + /// Per-PID map info for the binary + std::unordered_map BinaryMMapInfo; /// References to core BOLT data structures BinaryContext *BC{nullptr}; @@ -204,13 +213,15 @@ class DataAggregator : public DataReader { /// an external tool. std::error_code parseAggregatedLBRSamples(); - /// Parse a single line of a PERF_RECORD_MMAP2 event looking for an - /// association between the binary name and its PID. - /// On success return a pair. - ErrorOr> parseMMapEvent(); + /// Parse a single line of a PERF_RECORD_MMAP2 event looking for a mapping + /// between the binary name and its memory layout in a process with a given + /// PID. + /// On success return a pair. + ErrorOr> parseMMapEvent(); - /// Parse the full output generated by perf script to report PERF_RECORD_MMAP2 - /// events with the association of binary file name and their PIDs. + /// Parse the full output generated by `perf script --show-mmap-events` + /// to generate mapping between binary files and their memory mappings for + /// all PIDs. std::error_code parseMMapEvents(); /// Parse a single pair of binary full path and associated build-id @@ -229,9 +240,10 @@ class DataAggregator : public DataReader { /// /// The pre-aggregated file contains aggregated LBR data, but without binary /// knowledge. BOLT will parse it and, using information from the disassembled - /// binary, augment it with fall-through edge frequency information. 
After this - /// step is finished, this data can be either written to disk to be consumed by - /// BOLT later, or can be used by BOLT immediately if kept in memory. + /// binary, augment it with fall-through edge frequency information. After + /// this step is finished, this data can be either written to disk to be + /// consumed by BOLT later, or can be used by BOLT immediately if kept in + /// memory. /// /// File format syntax: /// {B|F|f} [:] [:] @@ -243,19 +255,19 @@ class DataAggregator : public DataReader { /// between a return hitting a basic block head and a regular internal /// jump to the block /// - /// - build id of the object containing the start address. We can skip it - /// for the main binary and use "X" for an unknown object. This will save some space - /// and facilitate human parsing. + /// - build id of the object containing the start address. We can + /// skip it for the main binary and use "X" for an unknown object. This will + /// save some space and facilitate human parsing. /// - /// - hex offset from the object base load address (0 for the main - /// executable unless it's PIE) to the start address. + /// - hex offset from the object base load address (0 for the + /// main executable unless it's PIE) to the start address. /// /// , - same for the end address. /// /// - total aggregated count of the branch or a fall-through. /// - /// - the number of times the branch was mispredicted. Omitted for - /// fall-throughs. + /// - the number of times the branch was mispredicted. + /// Omitted for fall-throughs. /// /// Example: /// F 41be50 41be50 3 @@ -264,6 +276,26 @@ class DataAggregator : public DataReader { /// B 4b196f 4b19e0 2 0 bool processPreAggregated(); + /// If \p Address falls into to the binary address space based on memory + /// mapping info \p MMI, then adjust it for further processing by subtracting + /// the base load address. External addresses, i.e. 
addresses that do not + /// correspond to the binary allocated address space, are adjusted to avoid + /// conflicts. + void adjustAddress(uint64_t &Address, const MMapInfo &MMI) const { + if (Address >= MMI.BaseAddress && Address < MMI.BaseAddress + MMI.Size) { + Address -= MMI.BaseAddress; + } else if (Address < MMI.Size) { + // Make sure the address is not treated as belonging to the binary. + Address = (-1ULL); + } + } + + /// Adjust addresses in \p LBR entry. + void adjustLBR(LBREntry &LBR, const MMapInfo &MMI) const { + adjustAddress(LBR.From, MMI); + adjustAddress(LBR.To, MMI); + } + public: DataAggregator(raw_ostream &Diag, StringRef BinaryName) : DataReader(Diag), BinaryName(llvm::sys::path::filename(BinaryName)) {} diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 579f98245c2a..7e97c3021205 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -805,9 +805,9 @@ void RewriteInstance::discoverStorage() { } auto Obj = ELF64LEFile->getELFFile(); if (Obj->getHeader()->e_type != ELF::ET_EXEC) { - errs() << "BOLT-ERROR: only non-PIE ELF executables are supported at the " - "moment.\n"; - exit(1); + outs() << "BOLT-INFO: shared object or position-independent executable " + "detected\n"; + BC->HasFixedLoadAddress = false; } EntryPoint = Obj->getHeader()->e_entry; From 98f45a6e4e474c82ded26c9270ad1206dc358148 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 16 Aug 2018 16:53:14 -0700 Subject: [PATCH 471/904] [BOLT] Update allocatable relocation sections Summary: Position-independent binaries may have runtime relocations of type R_X86_64_RELATIVE that need an update if they were pointing to one of the functions that we have relocated. 
(cherry picked from commit 5334076d1e6138272574f00d599997339fde20e4) --- bolt/src/BinaryContext.h | 13 +++++++++++ bolt/src/DataAggregator.h | 2 +- bolt/src/RewriteInstance.cpp | 42 +++++++++++++++++++----------------- bolt/src/RewriteInstance.h | 4 ++-- 4 files changed, 38 insertions(+), 23 deletions(-) diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index 6db40c3da746..b5c513dd2fa5 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -564,6 +564,19 @@ class BinaryContext { return const_cast(this)->nonAllocatableSections(); } + /// Iterate over all allocatable relocation sections. + iterator_range allocatableRelaSections() { + auto isAllocatableRela = [](const SectionIterator &Itr) { + return *Itr && Itr->isAllocatable() && Itr->isRela(); + }; + return make_range(FilteredSectionIterator(isAllocatableRela, + Sections.begin(), + Sections.end()), + FilteredSectionIterator(isAllocatableRela, + Sections.end(), + Sections.end())); + } + /// Return section name containing the given \p Address. ErrorOr getSectionNameForAddress(uint64_t Address) const; diff --git a/bolt/src/DataAggregator.h b/bolt/src/DataAggregator.h index 64a2e8e2567b..703a5551d91c 100644 --- a/bolt/src/DataAggregator.h +++ b/bolt/src/DataAggregator.h @@ -276,7 +276,7 @@ class DataAggregator : public DataReader { /// B 4b196f 4b19e0 2 0 bool processPreAggregated(); - /// If \p Address falls into to the binary address space based on memory + /// If \p Address falls into the binary address space based on memory /// mapping info \p MMI, then adjust it for further processing by subtracting /// the base load address. External addresses, i.e. 
addresses that do not /// correspond to the binary allocated address space, are adjusted to avoid diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 7e97c3021205..657304c00f03 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -4235,27 +4235,29 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { } template -void RewriteInstance::patchELFRelaPLT(ELFObjectFile *File) { +void +RewriteInstance::patchELFAllocatableRelaSections(ELFObjectFile *File) { auto &OS = Out->os(); - if (!RelaPLTSection) { - errs() << "BOLT-INFO: no .rela.plt section found\n"; - return; - } - - for (const auto &Rel : RelaPLTSection->getSectionRef().relocations()) { - if (Rel.getType() == ELF::R_X86_64_IRELATIVE) { - DataRefImpl DRI = Rel.getRawDataRefImpl(); - const auto *RelA = File->getRela(DRI); - auto Address = RelA->r_addend; - auto NewAddress = getNewFunctionAddress(Address); - DEBUG(dbgs() << "BOLT-DEBUG: patching IRELATIVE .rela.plt entry 0x" - << Twine::utohexstr(Address) << " with 0x" - << Twine::utohexstr(NewAddress) << '\n'); - auto NewRelA = *RelA; - NewRelA.r_addend = NewAddress; - OS.pwrite(reinterpret_cast(&NewRelA), sizeof(NewRelA), - reinterpret_cast(RelA) - File->getData().data()); + for (auto &RelaSection : BC->allocatableRelaSections()) { + for (const auto &Rel : RelaSection.getSectionRef().relocations()) { + if (Rel.getType() == ELF::R_X86_64_IRELATIVE || + Rel.getType() == ELF::R_X86_64_RELATIVE) { + DataRefImpl DRI = Rel.getRawDataRefImpl(); + const auto *RelA = File->getRela(DRI); + auto Address = RelA->r_addend; + auto NewAddress = getNewFunctionAddress(Address); + if (!NewAddress) + continue; + DEBUG(dbgs() << "BOLT-DEBUG: patching (I)RELATIVE " + << RelaSection.getName() << " entry 0x" + << Twine::utohexstr(Address) << " with 0x" + << Twine::utohexstr(NewAddress) << '\n'); + auto NewRelA = *RelA; + NewRelA.r_addend = NewAddress; + OS.pwrite(reinterpret_cast(&NewRelA), sizeof(NewRelA), + 
reinterpret_cast(RelA) - File->getData().data()); + } } } } @@ -4548,7 +4550,7 @@ void RewriteInstance::rewriteFile() { patchELFDynamic(); if (BC->HasRelocations) { - patchELFRelaPLT(); + patchELFAllocatableRelaSections(); patchELFGOT(); } diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index 5b2c984d6740..ee9095592dd8 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -307,8 +307,8 @@ class RewriteInstance { /// Patch .got ELF_FUNCTION(patchELFGOT); - /// Patch .rela.plt section. - ELF_FUNCTION(patchELFRelaPLT); + /// Patch allocatable relocation sections. + ELF_FUNCTION(patchELFAllocatableRelaSections); /// Finalize memory image of section header string table. ELF_FUNCTION(finalizeSectionStringTable); From 96335de8e1357b5d3d22ea9fac8db4fd4b9e1771 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 14 Aug 2018 17:32:06 -0700 Subject: [PATCH 472/904] [BOLT] Fix shrink-wrapping CFI update Summary: When updating CFI for a function that was optimized by shrink-wrapping, if the function had no frame pointers, the CFI update algorithm was incorrect. (cherry picked from commit ca8f4f344671cdc5d1ba5b167ed176b8eb9efd14) --- bolt/src/Passes/ShrinkWrapping.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/bolt/src/Passes/ShrinkWrapping.cpp b/bolt/src/Passes/ShrinkWrapping.cpp index cde6d7972caa..c89ad2f46a57 100644 --- a/bolt/src/Passes/ShrinkWrapping.cpp +++ b/bolt/src/Passes/ShrinkWrapping.cpp @@ -961,6 +961,17 @@ ShrinkWrapping::doRestorePlacement(MCInst *BestPosSave, unsigned CSR, // into edges transitioning to the dominance frontier, otherwise we pull these // restores to inside the dominated area. 
Frontier = DA.getDominanceFrontierFor(*BestPosSave); + DEBUG({ + dbgs() << "Dumping dominance frontier for "; + BC.printInstruction(dbgs(), *BestPosSave); + for (auto &PP : Frontier) { + if (PP.isInst()) { + BC.printInstruction(dbgs(), *PP.getInst()); + } else { + dbgs() << PP.getBB()->getName() << "\n"; + } + } + }); for (auto &PP : Frontier) { bool HasCritEdges{false}; if (PP.isInst() && BC.MIB->isTerminator(*PP.getInst()) && @@ -1562,19 +1573,19 @@ void ShrinkWrapping::rebuildCFIForSP() { ++InsertionIter; Iter = BF.addCFIInstruction( BB, InsertionIter, - MCCFIInstruction::createDefCfaOffset(nullptr, SPVal)); + MCCFIInstruction::createDefCfaOffset(nullptr, -CurVal)); SPVal = CurVal; } } if (BF.isSplit() && PrevBB && BB->isCold() != PrevBB->isCold()) { BF.addCFIInstruction( BB, BB->begin(), - MCCFIInstruction::createDefCfaOffset(nullptr, SPValAtBegin)); + MCCFIInstruction::createDefCfaOffset(nullptr, -SPValAtBegin)); } else { if (SPValAtBegin != PrevSPVal) { BF.addCFIInstruction( PrevBB, PrevBB->end(), - MCCFIInstruction::createDefCfaOffset(nullptr, SPValAtBegin)); + MCCFIInstruction::createDefCfaOffset(nullptr, -SPValAtBegin)); } } PrevSPVal = SPValAtEnd; From b023b74f270130d5969c8a139332490372046ea4 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 8 Aug 2018 17:55:24 -0700 Subject: [PATCH 473/904] [BOLT] Add update-build-id option, on by default Summary: The build-id is used by tools to uniquely identify binaries. Update the output binary build-id with a different number to make it distinguishable from the input binary. This implementation just flips the last build-id bit. 
(cherry picked from commit 491ebb95ba1f353841e52c24937ace5177e710c1) --- bolt/src/ProfileWriter.cpp | 2 +- bolt/src/RewriteInstance.cpp | 220 +++++++++++++++++++---------------- bolt/src/RewriteInstance.h | 21 +++- 3 files changed, 138 insertions(+), 105 deletions(-) diff --git a/bolt/src/ProfileWriter.cpp b/bolt/src/ProfileWriter.cpp index a8930f6cd311..0f3138648736 100644 --- a/bolt/src/ProfileWriter.cpp +++ b/bolt/src/ProfileWriter.cpp @@ -172,7 +172,7 @@ ProfileWriter::writeProfile(const RewriteInstance &RI) { BP.Header.Version = 1; auto FileName = RI.getInputFileName(); BP.Header.FileName = FileName ? *FileName : ""; - auto BuildID = RI.getBuildID(); + auto BuildID = RI.getPrintableBuildID(); BP.Header.Id = BuildID ? *BuildID : ""; if (RI.getDataAggregator().started()) { diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 657304c00f03..37fbfd104ac3 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -358,13 +358,6 @@ Verbosity("v", cl::ZeroOrMore, cl::cat(BoltCategory)); -static cl::opt -AddBoltInfo("add-bolt-info", - cl::desc("add BOLT version and command line argument information to " - "processed binaries"), - cl::init(true), - cl::cat(BoltCategory)); - cl::opt AggregateOnly("aggregate-only", cl::desc("exit after writing aggregated data file"), @@ -908,58 +901,77 @@ void RewriteInstance::discoverStorage() { BC->LayoutStartAddress = NextAvailableAddress; } -Optional -RewriteInstance::getBuildID() const { - for (auto &Section : InputFile->sections()) { - StringRef SectionName; - Section.getName(SectionName); +void RewriteInstance::parseBuildID() { + if (!BuildIDSection) + return; - if (SectionName != ".note.gnu.build-id") - continue; + StringRef Buf = BuildIDSection->getContents(); - StringRef SectionContents; - Section.getContents(SectionContents); + // Reading notes section (see Portable Formats Specification, Version 1.1, + // pg 2-5, section "Note Section"). 
+ DataExtractor DE = DataExtractor(Buf, true, 8); + uint32_t Offset = 0; + if (!DE.isValidOffset(Offset)) + return; + uint32_t NameSz = DE.getU32(&Offset); + if (!DE.isValidOffset(Offset)) + return; + uint32_t DescSz = DE.getU32(&Offset); + if (!DE.isValidOffset(Offset)) + return; + uint32_t Type = DE.getU32(&Offset); - // Reading notes section (see Portable Formats Specification, Version 1.1, - // pg 2-5, section "Note Section"). - DataExtractor DE = DataExtractor(SectionContents, true, 8); - uint32_t Offset = 0; - if (!DE.isValidOffset(Offset)) - return NoneType(); - uint32_t NameSz = DE.getU32(&Offset); - if (!DE.isValidOffset(Offset)) - return NoneType(); - uint32_t DescSz = DE.getU32(&Offset); - if (!DE.isValidOffset(Offset)) - return NoneType(); - uint32_t Type = DE.getU32(&Offset); - - DEBUG(dbgs() << "NameSz = " << NameSz << "; DescSz = " << DescSz - << "; Type = " << Type << "\n"); - - // Type 3 is a GNU build-id note section - if (Type != 3) - return NoneType(); - - StringRef Name = SectionContents.slice(Offset, Offset + NameSz); - Offset = alignTo(Offset + NameSz, 4); - StringRef BinaryBuildID = SectionContents.slice(Offset, Offset + DescSz); - if (Name.substr(0, 3) != "GNU") - return NoneType(); - - std::string Str; - raw_string_ostream OS(Str); - auto CharIter = BinaryBuildID.bytes_begin(); - while (CharIter != BinaryBuildID.bytes_end()) { - if (*CharIter < 0x10) - OS << "0"; - OS << Twine::utohexstr(*CharIter); - ++CharIter; - } - outs() << "BOLT-INFO: binary build-id is: " << OS.str() << "\n"; - return OS.str(); + DEBUG(dbgs() << "NameSz = " << NameSz << "; DescSz = " << DescSz + << "; Type = " << Type << "\n"); + + // Type 3 is a GNU build-id note section + if (Type != 3) + return; + + StringRef Name = Buf.slice(Offset, Offset + NameSz); + Offset = alignTo(Offset + NameSz, 4); + if (Name.substr(0, 3) != "GNU") + return; + + BuildID = Buf.slice(Offset, Offset + DescSz); +} + +Optional RewriteInstance::getPrintableBuildID() const { + if 
(BuildID.empty()) + return NoneType(); + + std::string Str; + raw_string_ostream OS(Str); + auto CharIter = BuildID.bytes_begin(); + while (CharIter != BuildID.bytes_end()) { + if (*CharIter < 0x10) + OS << "0"; + OS << Twine::utohexstr(*CharIter); + ++CharIter; + } + return OS.str(); +} + +void RewriteInstance::patchBuildID() { + auto &OS = Out->os(); + + if (BuildID.empty()) + return; + + size_t IDOffset = BuildIDSection->getContents().rfind(BuildID); + assert(IDOffset != StringRef::npos && "failed to patch build-id"); + + auto FileOffset = getFileOffsetForAddress(BuildIDSection->getAddress()); + if (!FileOffset) { + errs() << "BOLT-WARNING: Non-allocatable build-id will not be updated.\n"; + return; } - return NoneType(); + + char LastIDByte = BuildID[BuildID.size() - 1]; + LastIDByte ^= 1; + OS.pwrite(&LastIDByte, 1, FileOffset + IDOffset + BuildID.size() - 1); + + outs() << "BOLT-INFO: patched build-id (flipped last bit)\n"; } void RewriteInstance::run() { @@ -995,15 +1007,6 @@ void RewriteInstance::run() { (llvm::Triple::ArchType)InputFile->getArch()) << "\n"; - if (DA.started()) { - if (auto FileBuildID = getBuildID()) { - DA.processFileBuildID(*FileBuildID); - } else { - errs() << "BOLT-WARNING: build-id will not be checked because we could " - "not read one from input binary\n"; - } - } - unsigned PassNumber = 1; executeRewritePass({}); if (opts::AggregateOnly || opts::DiffOnly) @@ -1738,6 +1741,7 @@ void RewriteInstance::readSpecialSections() { GOTPLTSection = BC->getUniqueSectionByName(".got.plt"); PLTGOTSection = BC->getUniqueSectionByName(".plt.got"); RelaPLTSection = BC->getUniqueSectionByName(".rela.plt"); + BuildIDSection = BC->getUniqueSectionByName(".note.gnu.build-id"); if (opts::PrintSections) { outs() << "BOLT-INFO: Sections from original binary:\n"; @@ -1768,6 +1772,18 @@ void RewriteInstance::readSpecialSections() { EHFrame->dump(outs(), &*BC->MRI, NoneType()); } CFIRdWrt.reset(new CFIReaderWriter(*EHFrame)); + + // Parse build-id + 
parseBuildID(); + if (DA.started()) { + if (auto FileBuildID = getPrintableBuildID()) { + outs() << "BOLT-INFO: binary build-id is: " << *FileBuildID << "\n"; + DA.processFileBuildID(*FileBuildID); + } else { + errs() << "BOLT-WARNING: build-id will not be checked because we could " + "not read one from input binary\n"; + } + } } void RewriteInstance::adjustCommandLineOptions() { @@ -1846,7 +1862,6 @@ bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, return false; const bool IsAArch64 = BC->isAArch64(); - const bool IsFromCode = RelocatedSection.isText(); // For value extraction. StringRef RelocatedSectionContents; @@ -3601,45 +3616,46 @@ void RewriteInstance::finalizeSectionStringTable(ELFObjectFile *File) { ELF::SHT_STRTAB); } -void RewriteInstance::addBoltInfoSection() { - if (opts::AddBoltInfo) { - std::string DescStr; - raw_string_ostream DescOS(DescStr); +namespace { - DescOS << "BOLT revision: " << BoltRevision << ", " << "command line:"; - for (auto I = 0; I < Argc; ++I) { - DescOS << " " << Argv[I]; - } - DescOS.flush(); - - std::string Str; - raw_string_ostream OS(Str); - std::string NameStr = "GNU"; - const uint32_t NameSz = NameStr.size() + 1; - const uint32_t DescSz = DescStr.size(); - const uint32_t Type = 4; // NT_GNU_GOLD_VERSION (gold version) - OS.write(reinterpret_cast(&(NameSz)), 4); - OS.write(reinterpret_cast(&(DescSz)), 4); - OS.write(reinterpret_cast(&(Type)), 4); - OS << NameStr; - for (uint64_t I = NameStr.size(); - I < alignTo(NameStr.size(), 4); ++I) { - OS << '\0'; - } - OS << DescStr; - for (uint64_t I = DescStr.size(); - I < alignTo(DescStr.size(), 4); ++I) { - OS << '\0'; - } +std::string encodeELFNote(StringRef NameStr, StringRef DescStr, uint32_t Type) { + std::string Str; + raw_string_ostream OS(Str); + const uint32_t NameSz = NameStr.size() + 1; + const uint32_t DescSz = DescStr.size(); + OS.write(reinterpret_cast(&(NameSz)), 4); + OS.write(reinterpret_cast(&(DescSz)), 4); + OS.write(reinterpret_cast(&(Type)), 4); 
+ OS << NameStr; + for (uint64_t I = NameStr.size(); I < alignTo(NameStr.size(), 4); ++I) { + OS << '\0'; + } + OS << DescStr; + for (uint64_t I = DescStr.size(); I < alignTo(DescStr.size(), 4); ++I) { + OS << '\0'; + } + return OS.str(); +} + +} + +void RewriteInstance::addBoltInfoSection() { + std::string DescStr; + raw_string_ostream DescOS(DescStr); - const auto BoltInfo = OS.str(); - BC->registerOrUpdateNoteSection(".note.bolt_info", - copyByteArray(BoltInfo), - BoltInfo.size(), - /*Alignment=*/1, - /*IsReadOnly=*/true, - ELF::SHT_NOTE); + DescOS << "BOLT revision: " << BoltRevision << ", " + << "command line:"; + for (auto I = 0; I < Argc; ++I) { + DescOS << " " << Argv[I]; } + DescOS.flush(); + + const auto BoltInfo = + encodeELFNote("GNU", DescStr, 4 /*NT_GNU_GOLD_VERSION*/); + BC->registerOrUpdateNoteSection(".note.bolt_info", copyByteArray(BoltInfo), + BoltInfo.size(), + /*Alignment=*/1, + /*IsReadOnly=*/true, ELF::SHT_NOTE); } // Provide a mapping of the existing input binary sections to the output binary @@ -4543,6 +4559,8 @@ void RewriteInstance::rewriteFile() { // Update symbol tables. patchELFSymTabs(); + patchBuildID(); + // Copy non-allocatable sections once allocatable part is finished. rewriteNoteSections(); diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index ee9095592dd8..be00f87aa94f 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -326,6 +326,11 @@ class RewriteInstance { /// Add a notes section containing the BOLT revision and command line options. void addBoltInfoSection(); + /// Update the ELF note section containing the binary build-id to reflect + /// a new build-id, so tools can differentiate between the old and the + /// rewritten binary. + void patchBuildID(); + /// Computes output .debug_line line table offsets for each compile unit, /// and updates stmt_list for a corresponding compile unit. 
void updateLineTableOffsets(); @@ -502,6 +507,12 @@ class RewriteInstance { /// .gdb_index section. ErrorOr GdbIndexSection{std::errc::bad_address}; + /// .note.gnu.build-id section. + ErrorOr BuildIDSection{std::errc::bad_address}; + + /// A reference to the build-id bytes in the original binary + StringRef BuildID; + uint64_t NewSymTabOffset{0}; /// Keep track of functions we fail to write in the binary. We need to avoid @@ -554,9 +565,13 @@ class RewriteInstance { return NoneType(); } - /// Read binary sections and find a gnu note section with the build-id - /// of the input file. - Optional getBuildID() const; + /// Set the build-id string if we did not fail to parse the contents of the + /// ELF note section containing build-id information. + void parseBuildID(); + + /// The build-id is typically a stream of 20 bytes. Return these bytes in + /// printable hexadecimal form if they are available, or NoneType otherwise. + Optional getPrintableBuildID() const; /// Provide an access to the profile data aggregator. const DataAggregator &getDataAggregator() const { From ca2d9d0930af591ffdd847eb0e7123242bb1963d Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 22 Aug 2018 18:47:39 -0700 Subject: [PATCH 474/904] [BOLT] Add mattr options to AArch64 target Summary: Make the AArch64 subtarget enable all features, so the disassembler won't choke on extension instructions. 
(cherry picked from commit 28736fc6d54a3672eb3259ec086837f1597409c5) --- bolt/src/RewriteInstance.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 37fbfd104ac3..06802b40f485 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -628,12 +628,16 @@ createBinaryContext(ELFObjectFileBase *File, DataReader &DR, std::string ArchName; std::string TripleName; llvm::Triple::ArchType Arch = (llvm::Triple::ArchType)File->getArch(); + std::string FeaturesStr; if (Arch == llvm::Triple::x86_64) { ArchName = "x86-64"; TripleName = "x86_64-unknown-linux"; + FeaturesStr = ""; } else if (Arch == llvm::Triple::aarch64) { ArchName = "aarch64"; TripleName = "aarch64-unknown-linux"; + FeaturesStr = "+armv8.1a,+armv8.2a,+armv8.3a,+fp-armv8,+neon,+crypto," + "+dotprod,+crc,+lse,+ras,+rdm,+fullfp16,+spe,+fuse-aes,+svr,+rcpc"; } else { errs() << "BOLT-ERROR: Unrecognized machine in ELF file.\n"; return nullptr; @@ -665,7 +669,7 @@ createBinaryContext(ELFObjectFileBase *File, DataReader &DR, } std::unique_ptr STI( - TheTarget->createMCSubtargetInfo(TripleName, "", "")); + TheTarget->createMCSubtargetInfo(TripleName, "", FeaturesStr)); if (!STI) { errs() << "BOLT-ERROR: no subtarget info for target " << TripleName << "\n"; return nullptr; From 7f49d4d2c0448f989c150b3c66e8bdc000a567f8 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 24 Aug 2018 10:42:00 -0700 Subject: [PATCH 475/904] [BOLT] Reduce AArch64 target feature flags Summary: Eliminate some flags that are not recognized and are currently printing warnings when BOLT runs on AArch64. 
(cherry picked from commit d41f5c3bd87b65dcd091d20846a3f152619af627) --- bolt/src/RewriteInstance.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 06802b40f485..02574eb594ca 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -636,8 +636,8 @@ createBinaryContext(ELFObjectFileBase *File, DataReader &DR, } else if (Arch == llvm::Triple::aarch64) { ArchName = "aarch64"; TripleName = "aarch64-unknown-linux"; - FeaturesStr = "+armv8.1a,+armv8.2a,+armv8.3a,+fp-armv8,+neon,+crypto," - "+dotprod,+crc,+lse,+ras,+rdm,+fullfp16,+spe,+fuse-aes,+svr,+rcpc"; + FeaturesStr = "+fp-armv8,+neon,+crypto,+dotprod,+crc,+lse,+ras,+rdm," + "+fullfp16,+spe,+fuse-aes,+rcpc"; } else { errs() << "BOLT-ERROR: Unrecognized machine in ELF file.\n"; return nullptr; From 924b2f58947f0cb61d3dbeb25db95137ca150b70 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 27 Aug 2018 20:12:59 -0700 Subject: [PATCH 476/904] [BOLT][DWARF] Fix line info for empty CU DIEs Summary: In some rare cases a compiler may generate DWARF that contains an empty CU DIE that references a debug line fragment. That fragment will contain no file name information, and we fail to register it. Then, as a result, DW_AT_stmt_list is not updated for the CU. This may cause some DWARF-processing tools to segfault. As a solution/workaround, we register "" file name for such debug line tables. 
(cherry picked from commit 350295ee21b73bdbdbf6d9241716d3a6a48c643d) --- bolt/src/BinaryContext.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index a56ff707427c..b6d61a89aa21 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -667,6 +667,11 @@ void BinaryContext::preprocessDebugInfo( const auto CUID = CU->getOffset(); auto *LineTable = DwCtx->getLineTableForUnit(CU.get()); const auto &FileNames = LineTable->Prologue.FileNames; + // Make sure empty debug line tables are registered too. + if (FileNames.empty()) { + cantFail(Ctx->getDwarfFile("", "", 0, nullptr, None, CUID)); + continue; + } for (size_t I = 0, Size = FileNames.size(); I != Size; ++I) { // Dir indexes start at 1, as DWARF file numbers, and a dir index 0 // means empty dir. From 4e8fc4e561b6a76919eed5c089244c0de07bc5bb Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 23 Aug 2018 22:47:46 -0700 Subject: [PATCH 477/904] [BOLT] Fix profile after ICP Summary: After optimizing a target of a jump table, ICP was not updating edge counts corresponding to that target. As a result the edge could be left hot and negatively influence the code layout. (cherry picked from commit 999165dd9780e2cc8a8ed998265da631e7a9ccb6) --- bolt/src/BinaryBasicBlock.h | 14 ++- bolt/src/BinaryFunction.cpp | 4 +- bolt/src/Passes/BinaryPasses.cpp | 7 +- bolt/src/Passes/IndirectCallPromotion.cpp | 128 ++++++++-------------- 4 files changed, 65 insertions(+), 88 deletions(-) diff --git a/bolt/src/BinaryBasicBlock.h b/bolt/src/BinaryBasicBlock.h index 6cbabaf29c52..9093ddaef864 100644 --- a/bolt/src/BinaryBasicBlock.h +++ b/bolt/src/BinaryBasicBlock.h @@ -365,14 +365,24 @@ class BinaryBasicBlock { return getSuccessor(); } - const BinaryBranchInfo &getBranchInfo(bool Condition) const { + /// Return branch info corresponding to a taken branch. 
+ const BinaryBranchInfo &getTakenBranchInfo() const { assert(BranchInfo.size() == 2 && "could only be called for blocks with 2 successors"); - return BranchInfo[Condition == true ? 0 : 1]; + return BranchInfo[0]; }; + /// Return branch info corresponding to a fall-through branch. + const BinaryBranchInfo &getFallthroughBranchInfo() const { + assert(BranchInfo.size() == 2 && + "could only be called for blocks with 2 successors"); + return BranchInfo[1]; + }; + + /// Return branch info corresponding to an edge going to \p Succ basic block. BinaryBranchInfo &getBranchInfo(const BinaryBasicBlock &Succ); + /// Set branch information for the outgoing edge to block \p Succ. void setSuccessorBranchInfo(const BinaryBasicBlock &Succ, uint64_t Count, uint64_t MispredictedCount) { diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 68c909872f01..0220a2f7219e 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -3962,11 +3962,11 @@ DynoStats BinaryFunction::getDynoStats() const { } // Conditional branch that could be followed by an unconditional branch. - uint64_t TakenCount = BB->getBranchInfo(true).Count; + auto TakenCount = BB->getTakenBranchInfo().Count; if (TakenCount == COUNT_NO_PROFILE) TakenCount = 0; - uint64_t NonTakenCount = BB->getBranchInfo(false).Count; + auto NonTakenCount = BB->getFallthroughBranchInfo().Count; if (NonTakenCount == COUNT_NO_PROFILE) NonTakenCount = 0; diff --git a/bolt/src/Passes/BinaryPasses.cpp b/bolt/src/Passes/BinaryPasses.cpp index 3367ca4f6d3e..e42a8551417d 100644 --- a/bolt/src/Passes/BinaryPasses.cpp +++ b/bolt/src/Passes/BinaryPasses.cpp @@ -904,7 +904,7 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, // Record this block so that we don't try to optimize it twice. BeenOptimized.insert(PredBB); - bool BranchForStats; + uint64_t Count = 0; if (CondSucc != BB) { // Patch the new target address into the conditional branch. 
MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, BC.Ctx.get()); @@ -913,13 +913,12 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, // branch to the old target. This has to be done manually since // fixupBranches is not called after SCTC. NeedsUncondBranch.emplace_back(std::make_pair(PredBB, CondSucc)); - BranchForStats = false; + Count = PredBB->getFallthroughBranchInfo().Count; } else { // Change destination of the conditional branch. MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, BC.Ctx.get()); - BranchForStats = true; + Count = PredBB->getTakenBranchInfo().Count; } - const auto Count = PredBB->getBranchInfo(BranchForStats).Count; const uint64_t CTCTakenFreq = Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count; diff --git a/bolt/src/Passes/IndirectCallPromotion.cpp b/bolt/src/Passes/IndirectCallPromotion.cpp index 3fd79ac1eb38..43071b2dab4c 100644 --- a/bolt/src/Passes/IndirectCallPromotion.cpp +++ b/bolt/src/Passes/IndirectCallPromotion.cpp @@ -740,92 +740,67 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( // Scale indirect call counts to the execution count of the original // basic block containing the indirect call. + auto TotalCount = IndCallBlock->getKnownExecutionCount(); uint64_t TotalIndirectBranches = 0; - uint64_t TotalIndirectMispreds = 0; - for (const auto &BI : Targets) { - TotalIndirectBranches += BI.Branches; - TotalIndirectMispreds += BI.Mispreds; + for (const auto &Target : Targets) { + TotalIndirectBranches += Target.Branches; } - - uint64_t TotalCount = 0; - uint64_t TotalMispreds = 0; - - if (Function.hasValidProfile()) { - TotalCount = IndCallBlock->getExecutionCount(); - TotalMispreds = - TotalCount * ((double)TotalIndirectMispreds / TotalIndirectBranches); - assert(TotalCount != BinaryBasicBlock::COUNT_NO_PROFILE); - } - - // New BinaryBranchInfo scaled to the execution count of the original BB. 
std::vector BBI; - for (auto Itr = Targets.begin(); Itr != Targets.end(); ++Itr) { - const auto BranchPct = (double)Itr->Branches / TotalIndirectBranches; - const auto MispredPct = - (double)Itr->Mispreds / std::max(TotalIndirectMispreds, 1ul); - if (Itr->JTIndex.empty()) { - BBI.push_back(BinaryBranchInfo{uint64_t(TotalCount * BranchPct), - uint64_t(TotalMispreds * MispredPct)}); - continue; - } - for (size_t I = 0, E = Itr->JTIndex.size(); I != E; ++I) { - BBI.push_back( - BinaryBranchInfo{uint64_t(TotalCount * (BranchPct / E)), - uint64_t(TotalMispreds * (MispredPct / E))}); + std::vector ScaledBBI; + for (const auto &Target : Targets) { + const auto NumEntries = std::max(1UL, Target.JTIndex.size()); + for (size_t I = 0; I < NumEntries; ++I) { + BBI.push_back(BinaryBranchInfo{Target.Branches / NumEntries, + Target.Mispreds / NumEntries}); + ScaledBBI.push_back(BinaryBranchInfo{ + uint64_t(TotalCount * Target.Branches / + (NumEntries * TotalIndirectBranches)), + uint64_t(TotalCount * Target.Mispreds / + (NumEntries * TotalIndirectBranches))}); } } - auto BI = BBI.begin(); - auto updateCurrentBranchInfo = [&]{ - assert(BI < BBI.end()); - TotalCount -= BI->Count; - TotalMispreds -= BI->MispredictedCount; - ++BI; - }; - if (IsJumpTable) { - IndCallBlock->moveAllSuccessorsTo(NewBBs.back().get()); + auto *NewIndCallBlock = NewBBs.back().get(); + IndCallBlock->moveAllSuccessorsTo(NewIndCallBlock); std::vector SymTargets; - for (size_t I = 0; I < Targets.size(); ++I) { - assert(Targets[I].To.Sym); - if (Targets[I].JTIndex.empty()) - SymTargets.push_back(Targets[I].To.Sym); - else { - for (size_t Idx = 0, E = Targets[I].JTIndex.size(); Idx != E; ++Idx) { - SymTargets.push_back(Targets[I].To.Sym); - } + for (const auto &Target : Targets) { + const auto NumEntries = std::max(1UL, Target.JTIndex.size()); + for (size_t I = 0; I < NumEntries; ++I) { + SymTargets.push_back(Target.To.Sym); } } assert(SymTargets.size() > NewBBs.size() - 1 && "There must be a target symbol 
associated with each new BB."); - // Fix up successors and execution counts. - updateCurrentBranchInfo(); - auto *Succ = Function.getBasicBlockForLabel(SymTargets[0]); - assert(Succ && "each jump target must be a legal BB label"); - IndCallBlock->addSuccessor(Succ, BBI[0]); // cond branch - IndCallBlock->addSuccessor(NewBBs[0].get(), TotalCount); // fallthru branch + for (uint64_t I = 0; I < NewBBs.size(); ++I) { + BinaryBasicBlock *SourceBB = I ? NewBBs[I - 1].get() : IndCallBlock; + SourceBB->setExecutionCount(TotalCount); - for (size_t I = 0; I < NewBBs.size() - 1; ++I) { - assert(TotalCount <= IndCallBlock->getExecutionCount() || - TotalCount <= uint64_t(TotalIndirectBranches)); - uint64_t ExecCount = BBI[I+1].Count; - updateCurrentBranchInfo(); - auto *Succ = Function.getBasicBlockForLabel(SymTargets[I+1]); - assert(Succ && "each jump target must be a legal BB label"); - NewBBs[I]->addSuccessor(Succ, BBI[I+1]); - NewBBs[I]->addSuccessor(NewBBs[I+1].get(), TotalCount); // fallthru - ExecCount += TotalCount; - NewBBs[I]->setCanOutline(IndCallBlock->canOutline()); - NewBBs[I]->setIsCold(IndCallBlock->isCold()); - NewBBs[I]->setExecutionCount(ExecCount); + auto *TargetBB = Function.getBasicBlockForLabel(SymTargets[I]); + SourceBB->addSuccessor(TargetBB, ScaledBBI[I]); // taken + + TotalCount -= ScaledBBI[I].Count; + SourceBB->addSuccessor(NewBBs[I].get(), TotalCount); // fall-through + + // Update branch info for the indirect jump. 
+ auto &BranchInfo = NewIndCallBlock->getBranchInfo(*TargetBB); + BranchInfo.Count -= BBI[I].Count; + BranchInfo.MispredictedCount -= BBI[I].MispredictedCount; } } else { assert(NewBBs.size() >= 2); assert(NewBBs.size() % 2 == 1 || IndCallBlock->succ_empty()); assert(NewBBs.size() % 2 == 1 || IsTailCall); + auto ScaledBI = ScaledBBI.begin(); + auto updateCurrentBranchInfo = [&]{ + assert(ScaledBI != ScaledBBI.end()); + TotalCount -= ScaledBI->Count; + ++ScaledBI; + }; + if (!IsTailCall) { MergeBlock = NewBBs.back().get(); IndCallBlock->moveAllSuccessorsTo(MergeBlock); @@ -833,25 +808,23 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( // Fix up successors and execution counts. updateCurrentBranchInfo(); - IndCallBlock->addSuccessor(NewBBs[1].get(), TotalCount); // cond branch - IndCallBlock->addSuccessor(NewBBs[0].get(), BBI[0]); // uncond branch + IndCallBlock->addSuccessor(NewBBs[1].get(), TotalCount); + IndCallBlock->addSuccessor(NewBBs[0].get(), ScaledBBI[0]); const size_t Adj = IsTailCall ? 1 : 2; for (size_t I = 0; I < NewBBs.size() - Adj; ++I) { assert(TotalCount <= IndCallBlock->getExecutionCount() || TotalCount <= uint64_t(TotalIndirectBranches)); - uint64_t ExecCount = BBI[(I+1)/2].Count; - NewBBs[I]->setCanOutline(IndCallBlock->canOutline()); - NewBBs[I]->setIsCold(IndCallBlock->isCold()); + auto ExecCount = ScaledBBI[(I+1)/2].Count; if (I % 2 == 0) { if (MergeBlock) { - NewBBs[I]->addSuccessor(MergeBlock, BBI[(I+1)/2].Count); // uncond + NewBBs[I]->addSuccessor(MergeBlock, ScaledBBI[(I+1)/2].Count); } } else { assert(I + 2 < NewBBs.size()); updateCurrentBranchInfo(); - NewBBs[I]->addSuccessor(NewBBs[I+2].get(), TotalCount); // uncond branch - NewBBs[I]->addSuccessor(NewBBs[I+1].get(), BBI[(I+1)/2]); // cond. 
branch + NewBBs[I]->addSuccessor(NewBBs[I+2].get(), TotalCount); + NewBBs[I]->addSuccessor(NewBBs[I+1].get(), ScaledBBI[(I+1)/2]); ExecCount += TotalCount; } NewBBs[I]->setExecutionCount(ExecCount); @@ -860,8 +833,6 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( if (MergeBlock) { // Arrange for the MergeBlock to be the fallthrough for the first // promoted call block. - MergeBlock->setCanOutline(IndCallBlock->canOutline()); - MergeBlock->setIsCold(IndCallBlock->isCold()); std::unique_ptr MBPtr; std::swap(MBPtr, NewBBs.back()); NewBBs.pop_back(); @@ -871,13 +842,10 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( } } - // cold call block - // TODO: should be able to outline/cold this block. + // Update the execution count. NewBBs.back()->setExecutionCount(TotalCount); - NewBBs.back()->setCanOutline(IndCallBlock->canOutline()); - NewBBs.back()->setIsCold(IndCallBlock->isCold()); - // update BB and BB layout. + // Update BB and BB layout. Function.insertBasicBlocks(IndCallBlock, std::move(NewBBs)); assert(Function.validateCFG()); From 47eba3a6a36b2459e1340d5dc3cdda59d44ed0ab Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 28 Aug 2018 18:15:13 -0700 Subject: [PATCH 478/904] [BOLT] Change ForceRelocation behavior Summary: Only record address as addend if the target of the relocation is the pseudo-symbol Zero. (cherry picked from commit 5ea872118f1a3f1b3412c1e5e6b9073cafb3fe69) --- bolt/src/RewriteInstance.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 02574eb594ca..7be8a1b49e8d 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -2148,7 +2148,8 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { auto Name = Relocation::isGOT(Rel.getType()) ? 
"Zero" : SymbolName; ReferencedSymbol = BC->registerNameAtAddress(Name, 0, 0, 0); SymbolAddress = 0; - Addend = Address; + if (Relocation::isGOT(Rel.getType())) + Addend = Address; DEBUG(dbgs() << "BOLT-DEBUG: forcing relocation against symbol " << SymbolName << " with addend " << Addend << '\n'); } else if (ReferencedBF) { From e28af65cfbd061dda8114976e4a908ca54c04612 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 30 Aug 2018 14:51:10 -0700 Subject: [PATCH 479/904] [perf2bolt] Fix processing of binaries with names over 15 chars long Summary: Do not truncate the binary name for comparison purposes as the binary name we are getting from "perf script" is no longer truncated. (cherry picked from commit 96213df6dd8ecd3bb47737ab0e23bcc8779971e9) --- bolt/src/DataAggregator.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index df6aa3d02737..d5007fdc1ff2 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -1305,11 +1305,11 @@ std::error_code DataAggregator::parseMMapEvents() { } ); - auto NameToUse = BinaryName.substr(0, 15); + auto NameToUse = BinaryName; if (GlobalMMapInfo.count(NameToUse) == 0 && !BuildIDBinaryName.empty()) { errs() << "PERF2BOLT-WARNING: using \"" << BuildIDBinaryName << "\" for profile matching\n"; - NameToUse = BuildIDBinaryName.substr(0, 15); + NameToUse = BuildIDBinaryName; } auto Range = GlobalMMapInfo.equal_range(NameToUse); From edc4914ec140aaec3b3a2e3db2fafa95aff80948 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 30 Aug 2018 13:21:29 -0700 Subject: [PATCH 480/904] [BOLT] Merge jump table profile data Summary: While running ICF pass we have skipped merging profile data for jump tables. We were only updating profile in the CFG. Fix that. 
(cherry picked from commit 5d4d5e63d9362bbb960a5fc6be05a5cb919e8e65) --- bolt/src/BinaryFunctionProfile.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/bolt/src/BinaryFunctionProfile.cpp b/bolt/src/BinaryFunctionProfile.cpp index 0aa5cef805d9..047fcd02a043 100644 --- a/bolt/src/BinaryFunctionProfile.cpp +++ b/bolt/src/BinaryFunctionProfile.cpp @@ -587,6 +587,23 @@ void BinaryFunction::mergeProfileDataInto(BinaryFunction &BF) const { ++BBMergeI; } assert(BBMergeI == BF.end()); + + // Merge jump tables profile info. + auto JTMergeI = BF.JumpTables.begin(); + for (const auto &JTEntry : JumpTables) { + if (JTMergeI->second->Counts.empty()) + JTMergeI->second->Counts.resize(JTEntry.second->Counts.size()); + auto CountMergeI = JTMergeI->second->Counts.begin(); + for (const auto &JI : JTEntry.second->Counts) { + CountMergeI->Count += JI.Count; + CountMergeI->Mispreds += JI.Mispreds; + ++CountMergeI; + } + assert(CountMergeI == JTMergeI->second->Counts.end()); + + ++JTMergeI; + } + assert(JTMergeI == BF.JumpTables.end()); } void BinaryFunction::readSampleData() { From f037c4e87082a5f670fe832a8a694dd00f013e59 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Tue, 14 Aug 2018 14:49:10 -0700 Subject: [PATCH 481/904] [BOLT] turning on the compact aligner by default Summary: Making UseCompactAligner true by default (cherry picked from commit 75e13deeefa5fc6b20f1c45cbfce351d2c907323) --- bolt/src/Passes/Aligner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/src/Passes/Aligner.cpp b/bolt/src/Passes/Aligner.cpp index c4d4434f5b30..d95634acbede 100644 --- a/bolt/src/Passes/Aligner.cpp +++ b/bolt/src/Passes/Aligner.cpp @@ -65,7 +65,7 @@ BlockAlignment("block-alignment", cl::opt UseCompactAligner("use-compact-aligner", cl::desc("Use compact approach for aligning functions"), - cl::init(false), + cl::init(true), cl::ZeroOrMore, cl::cat(BoltOptCategory)); From 5a908a1a759376ea2066181f31b6b599044f66f4 Mon Sep 17 00:00:00 2001 From: Maksim 
Panchenko Date: Thu, 30 Aug 2018 13:21:50 -0700 Subject: [PATCH 482/904] [BOLT] Fix another issue with profile after ICP Summary: For jump tables ICP was using profile from the jump table itself which doesn't work correctly if the jump table is re-used at different code locations. (cherry picked from commit 3791b31694c4210dc6c73699fd82f1ef02eb7dba) --- bolt/src/BinaryBasicBlock.cpp | 13 ++++ bolt/src/BinaryBasicBlock.h | 4 ++ bolt/src/Passes/IndirectCallPromotion.cpp | 73 ++++++++++++++++++----- bolt/src/Passes/IndirectCallPromotion.h | 2 +- 4 files changed, 76 insertions(+), 16 deletions(-) diff --git a/bolt/src/BinaryBasicBlock.cpp b/bolt/src/BinaryBasicBlock.cpp index 9e6567d19191..2fdf00878962 100644 --- a/bolt/src/BinaryBasicBlock.cpp +++ b/bolt/src/BinaryBasicBlock.cpp @@ -530,5 +530,18 @@ BinaryBasicBlock::getBranchInfo(const BinaryBasicBlock &Succ) { return *BI; } +BinaryBasicBlock::BinaryBranchInfo & +BinaryBasicBlock::getBranchInfo(const MCSymbol *Label) { + auto BI = branch_info_begin(); + for (auto BB : successors()) { + if (BB->getLabel() == Label) + return *BI; + ++BI; + } + + llvm_unreachable("Invalid successor"); + return *BI; +} + } // namespace bolt } // namespace llvm diff --git a/bolt/src/BinaryBasicBlock.h b/bolt/src/BinaryBasicBlock.h index 9093ddaef864..84010150b5ae 100644 --- a/bolt/src/BinaryBasicBlock.h +++ b/bolt/src/BinaryBasicBlock.h @@ -382,6 +382,10 @@ class BinaryBasicBlock { /// Return branch info corresponding to an edge going to \p Succ basic block. BinaryBranchInfo &getBranchInfo(const BinaryBasicBlock &Succ); + /// Return branch info corresponding to an edge going to a basic block with + /// label \p Label. + BinaryBranchInfo &getBranchInfo(const MCSymbol *Label); + /// Set branch information for the outgoing edge to block \p Succ.
void setSuccessorBranchInfo(const BinaryBasicBlock &Succ, uint64_t Count, diff --git a/bolt/src/Passes/IndirectCallPromotion.cpp b/bolt/src/Passes/IndirectCallPromotion.cpp index 43071b2dab4c..fb530ce2a177 100644 --- a/bolt/src/Passes/IndirectCallPromotion.cpp +++ b/bolt/src/Passes/IndirectCallPromotion.cpp @@ -140,6 +140,33 @@ static cl::opt ICPJumpTablesByTarget( namespace llvm { namespace bolt { +namespace { + +bool verifyProfile(std::map &BFs) { + bool IsValid = true; + for (auto &BFI : BFs) { + auto &BF = BFI.second; + if (!BF.isSimple()) continue; + for (auto BB : BF.layout()) { + auto BI = BB->branch_info_begin(); + for (auto SuccBB : BB->successors()) { + if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) { + if (BB->getKnownExecutionCount() == 0 || + SuccBB->getKnownExecutionCount() == 0) { + errs() << "BOLT-WARNING: profile verification failed after ICP for " + "function " << BF << '\n'; + IsValid = false; + } + } + ++BI; + } + } + } + return IsValid; +} + +} + IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF, const IndirectCallProfile &ICP) : From(BF.getSymbol()), @@ -158,9 +185,10 @@ IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF, // called first. std::vector IndirectCallPromotion::getCallTargets( - BinaryFunction &BF, + BinaryBasicBlock &BB, const MCInst &Inst ) const { + auto &BF = *BB.getFunction(); auto &BC = BF.getBinaryContext(); std::vector Targets; @@ -185,8 +213,9 @@ IndirectCallPromotion::getCallTargets( Entry == BF.getFunctionColdEndLabel()) continue; const Location To(Entry); + const auto &BI = BB.getBranchInfo(Entry); Targets.emplace_back( - From, To, JI->Mispreds, JI->Count, I - Range.first); + From, To, BI.MispredictedCount, BI.Count, I - Range.first); } // Sort by symbol then addr. @@ -244,10 +273,10 @@ IndirectCallPromotion::getCallTargets( } // Sort by most commonly called targets. 
- std::sort(Targets.begin(), Targets.end(), - [](const Callsite &A, const Callsite &B) { - return A.Branches > B.Branches; - }); + std::stable_sort(Targets.begin(), Targets.end(), + [](const Callsite &A, const Callsite &B) { + return A.Branches > B.Branches; + }); // Remove non-symbol targets auto Last = std::remove_if(Targets.begin(), @@ -750,8 +779,9 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( for (const auto &Target : Targets) { const auto NumEntries = std::max(1UL, Target.JTIndex.size()); for (size_t I = 0; I < NumEntries; ++I) { - BBI.push_back(BinaryBranchInfo{Target.Branches / NumEntries, - Target.Mispreds / NumEntries}); + BBI.push_back( + BinaryBranchInfo{(Target.Branches + NumEntries - 1) / NumEntries, + (Target.Mispreds + NumEntries - 1) / NumEntries}); ScaledBBI.push_back(BinaryBranchInfo{ uint64_t(TotalCount * Target.Branches / (NumEntries * TotalIndirectBranches)), @@ -786,8 +816,15 @@ BinaryBasicBlock *IndirectCallPromotion::fixCFG( // Update branch info for the indirect jump. 
auto &BranchInfo = NewIndCallBlock->getBranchInfo(*TargetBB); - BranchInfo.Count -= BBI[I].Count; - BranchInfo.MispredictedCount -= BBI[I].MispredictedCount; + if (BranchInfo.Count > BBI[I].Count) + BranchInfo.Count -= BBI[I].Count; + else + BranchInfo.Count = 0; + + if (BranchInfo.MispredictedCount > BBI[I].MispredictedCount) + BranchInfo.MispredictedCount -= BBI[I].MispredictedCount; + else + BranchInfo.MispredictedCount = 0; } } else { assert(NewBBs.size() >= 2); @@ -1062,7 +1099,7 @@ void IndirectCallPromotion::runOnFunctions( std::unique_ptr RA; std::unique_ptr CG; - if (opts::IndirectCallPromotion >= ICP_JUMP_TABLES) { + if (OptimizeJumpTables) { CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs))); RA.reset(new RegAnalysis(BC, &BFs, &*CG)); } @@ -1142,7 +1179,7 @@ void IndirectCallPromotion::runOnFunctions( ((HasIndirectCallProfile && !IsJumpTable && OptimizeCalls) || (IsJumpTable && OptimizeJumpTables))) { uint64_t NumCalls = 0; - for (const auto &BInfo : getCallTargets(Function, Inst)) { + for (const auto &BInfo : getCallTargets(BB, Inst)) { NumCalls += BInfo.Branches; } @@ -1221,8 +1258,10 @@ void IndirectCallPromotion::runOnFunctions( TotalCalls += BB->getKnownExecutionCount(); } - if (!((HasIndirectCallProfile && !IsJumpTable && OptimizeCalls) || - (IsJumpTable && OptimizeJumpTables))) + if (IsJumpTable && !OptimizeJumpTables) + continue; + + if (!IsJumpTable && (!HasIndirectCallProfile || !OptimizeCalls)) continue; // Ignore direct calls. @@ -1237,7 +1276,7 @@ void IndirectCallPromotion::runOnFunctions( else ++TotalIndirectCallsites; - auto Targets = getCallTargets(Function, Inst); + auto Targets = getCallTargets(*BB, Inst); // Compute the total number of calls from this particular callsite. 
uint64_t NumCalls = 0; @@ -1440,6 +1479,10 @@ void IndirectCallPromotion::runOnFunctions( << format("%.1f", (100.0 * TotalIndexBasedJumps) / std::max(TotalIndexBasedCandidates, 1ul)) << "%\n"; + +#ifndef NDEBUG + verifyProfile(BFs); +#endif } } // namespace bolt diff --git a/bolt/src/Passes/IndirectCallPromotion.h b/bolt/src/Passes/IndirectCallPromotion.h index b0262bdbcaf4..a391aee6e53c 100644 --- a/bolt/src/Passes/IndirectCallPromotion.h +++ b/bolt/src/Passes/IndirectCallPromotion.h @@ -178,7 +178,7 @@ class IndirectCallPromotion : public BinaryFunctionPass { // Total number of jump table sites that use hot indices. uint64_t TotalIndexBasedJumps{0}; - std::vector getCallTargets(BinaryFunction &BF, + std::vector getCallTargets(BinaryBasicBlock &BB, const MCInst &Inst) const; size_t canPromoteCallsite(const BinaryBasicBlock *BB, From 79681d436ae54817a3ed419ea2e65ee8c7198628 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 5 Sep 2018 14:36:52 -0700 Subject: [PATCH 483/904] [BOLT] Ignore symbols from non-allocatable sections Summary: While creating BinaryData objects we used to process all symbol table entries. However, some symbols could belong to non-allocatable sections, and thus we have to ignore them for the purpose of analyzing in-memory data. 
(cherry picked from commit 21441ad6a0b1f451b9c2b378c9d58efb6704c00e) --- bolt/src/BinaryContext.cpp | 11 +- bolt/src/BinaryFunction.cpp | 490 +++++++++++++++++++++++++------ bolt/src/BinaryFunction.h | 25 +- bolt/src/Passes/BinaryPasses.cpp | 2 +- bolt/src/RewriteInstance.cpp | 20 +- 5 files changed, 442 insertions(+), 106 deletions(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index b6d61a89aa21..32da5830479b 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -756,22 +756,23 @@ void BinaryContext::printCFI(raw_ostream &OS, const MCCFIInstruction &Inst) { OS << "OpDefCfa Reg" << Inst.getRegister() << " " << Inst.getOffset(); break; case MCCFIInstruction::OpRelOffset: - OS << "OpRelOffset"; + OS << "OpRelOffset Reg" << Inst.getRegister() << " " << Inst.getOffset(); break; case MCCFIInstruction::OpAdjustCfaOffset: - OS << "OfAdjustCfaOffset"; + OS << "OfAdjustCfaOffset " << Inst.getOffset(); break; case MCCFIInstruction::OpEscape: OS << "OpEscape"; break; case MCCFIInstruction::OpRestore: - OS << "OpRestore"; + OS << "OpRestore Reg" << Inst.getRegister(); break; case MCCFIInstruction::OpUndefined: - OS << "OpUndefined"; + OS << "OpUndefined Reg" << Inst.getRegister(); break; case MCCFIInstruction::OpRegister: - OS << "OpRegister"; + OS << "OpRegister Reg" << Inst.getRegister() << " Reg" + << Inst.getRegister2(); break; case MCCFIInstruction::OpWindowSave: OS << "OpWindowSave"; diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 0220a2f7219e..555dcb49151a 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -1132,7 +1132,11 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { // Functions with "soft" boundaries, e.g. coming from assembly source, // can have 0-byte padding at the end. 
bool IsZeroPadding = true; - for (auto I = Offset; I < getSize(); ++I) { + uint64_t EndOfCode = getSize(); + auto Iter = DataOffsets.upper_bound(Offset); + if (Iter != DataOffsets.end()) + EndOfCode = *Iter; + for (auto I = Offset; I < EndOfCode; ++I) { if (FunctionData[I] != 0) { IsZeroPadding = false; break; @@ -2184,56 +2188,397 @@ void BinaryFunction::annotateCFIState() { assert(StateStack.empty() && "corrupt CFI stack"); } -bool BinaryFunction::fixCFIState() { - DEBUG(dbgs() << "Trying to fix CFI states for each BB after reordering.\n"); - DEBUG(dbgs() << "This is the list of CFI states for each BB of " << *this - << ": "); +namespace { - auto replayCFIInstrs = - [this](int32_t FromState, int32_t ToState, BinaryBasicBlock *InBB, - BinaryBasicBlock::iterator InsertIt) -> bool { - if (FromState == ToState) - return true; - assert(FromState < ToState && "can only replay CFIs forward"); - - std::vector NewCFIs; - uint32_t NestedLevel = 0; - for (auto CurState = FromState; CurState < ToState; ++CurState) { - MCCFIInstruction *Instr = &FrameInstructions[CurState]; - if (Instr->getOperation() == MCCFIInstruction::OpRememberState) - ++NestedLevel; - if (!NestedLevel) - NewCFIs.push_back(CurState); - if (Instr->getOperation() == MCCFIInstruction::OpRestoreState) - --NestedLevel; - } - - // TODO: If in replaying the CFI instructions to reach this state we - // have state stack instructions, we could still work out the logic - // to extract only the necessary instructions to reach this state - // without using the state stack. Not sure if it is worth the effort - // because this happens rarely. 
- if (NestedLevel != 0) { - errs() << "BOLT-WARNING: CFI rewriter detected nested CFI state" - << " while replaying CFI instructions for BB " - << InBB->getName() << " in function " << *this << '\n'; - return false; +/// Our full interpretation of a DWARF CFI machine state at a given point +struct CFISnapshot { + /// CFA register number and offset defining the canonical frame at this + /// point, or the number of a rule (CFI state) that computes it with a + /// DWARF expression. This number will be negative if it refers to a CFI + /// located in the CIE instead of the FDE. + uint32_t CFAReg; + int32_t CFAOffset; + int32_t CFARule; + /// Mapping of rules (CFI states) that define the location of each + /// register. If absent, no rule defining the location of such register + /// was ever read. This number will be negative if it refers to a CFI + /// located in the CIE instead of the FDE. + DenseMap RegRule; + + /// References to CIE, FDE and expanded instructions after a restore state + const std::vector &CIE; + const std::vector &FDE; + const DenseMap> &FrameRestoreEquivalents; + + /// Current FDE CFI number representing the state where the snapshot is at + int32_t CurState; + + /// Used when we don't have information about which state/rule to apply + /// to recover the location of either the CFA or a specific register + constexpr static int32_t UNKNOWN = std::numeric_limits::min(); + +private: + /// Update our snapshot by executing a single CFI + void update(const MCCFIInstruction &Instr, int32_t RuleNumber) { + switch (Instr.getOperation()) { + case MCCFIInstruction::OpSameValue: + case MCCFIInstruction::OpRelOffset: + case MCCFIInstruction::OpOffset: + case MCCFIInstruction::OpRestore: + case MCCFIInstruction::OpUndefined: + case MCCFIInstruction::OpRegister: + case MCCFIInstruction::OpExpression: + case MCCFIInstruction::OpValExpression: + RegRule[Instr.getRegister()] = RuleNumber; + break; + case MCCFIInstruction::OpDefCfaRegister: + CFAReg = 
Instr.getRegister(); + CFARule = UNKNOWN; + break; + case MCCFIInstruction::OpDefCfaOffset: + CFAOffset = Instr.getOffset(); + CFARule = UNKNOWN; + break; + case MCCFIInstruction::OpDefCfa: + CFAReg = Instr.getRegister(); + CFAOffset = Instr.getOffset(); + CFARule = UNKNOWN; + break; + case MCCFIInstruction::OpDefCfaExpression: + CFARule = RuleNumber; + break; + case MCCFIInstruction::OpAdjustCfaOffset: + case MCCFIInstruction::OpWindowSave: + case MCCFIInstruction::OpEscape: + llvm_unreachable("unsupported CFI opcode"); + break; + case MCCFIInstruction::OpRememberState: + case MCCFIInstruction::OpRestoreState: + case MCCFIInstruction::OpGnuArgsSize: + // do not affect CFI state + break; } + } - for (auto CFI : NewCFIs) { - // Ignore GNU_args_size instructions. - if (FrameInstructions[CFI].getOperation() != - MCCFIInstruction::OpGnuArgsSize) { - InsertIt = addCFIPseudo(InBB, InsertIt, CFI); - ++InsertIt; +public: + /// Advance state reading FDE CFI instructions up to State number + void advanceTo(int32_t State) { + for (int32_t I = CurState, E = State; I != E; ++I) { + const auto &Instr = FDE[I]; + if (Instr.getOperation() != MCCFIInstruction::OpRestoreState) { + update(Instr, I); + continue; } + // If restore state instruction, fetch the equivalent CFIs that have + // the same effect of this restore. This is used to ensure remember- + // restore pairs are completely removed. 
+ auto Iter = FrameRestoreEquivalents.find(I); + if (Iter == FrameRestoreEquivalents.end()) + continue; + for (int32_t RuleNumber : Iter->second) { + update(FDE[RuleNumber], RuleNumber); + } + } + + assert(((CFAReg != (uint32_t)UNKNOWN && CFAOffset != UNKNOWN) || + CFARule != UNKNOWN) && + "CIE did not define default CFA?"); + + CurState = State; + } + + /// Interpret all CIE and FDE instructions up until CFI State number and + /// populate this snapshot + CFISnapshot( + const std::vector &CIE, + const std::vector &FDE, + const DenseMap> &FrameRestoreEquivalents, + int32_t State) + : CIE(CIE), FDE(FDE), FrameRestoreEquivalents(FrameRestoreEquivalents) { + CFAReg = UNKNOWN; + CFAOffset = UNKNOWN; + CFARule = UNKNOWN; + CurState = 0; + + for (int32_t I = 0, E = CIE.size(); I != E; ++I) { + const auto &Instr = CIE[I]; + update(Instr, -I); } + advanceTo(State); + } + +}; + +/// A CFI snapshot with the capability of checking if incremental additions to +/// it are redundant. This is used to ensure we do not emit two CFI instructions +/// back-to-back that are doing the same state change, or to avoid emitting a +/// CFI at all when the state at that point would not be modified after that CFI +struct CFISnapshotDiff : public CFISnapshot { + bool RestoredCFAReg{false}; + bool RestoredCFAOffset{false}; + DenseMap RestoredRegs; + + CFISnapshotDiff(const CFISnapshot &S) : CFISnapshot(S) {} + + CFISnapshotDiff( + const std::vector &CIE, + const std::vector &FDE, + const DenseMap> &FrameRestoreEquivalents, + int32_t State) + : CFISnapshot(CIE, FDE, FrameRestoreEquivalents, State) {} + + /// Return true if applying Instr to this state is redundant and can be + /// dismissed. 
+ bool isRedundant(const MCCFIInstruction &Instr) { + switch (Instr.getOperation()) { + case MCCFIInstruction::OpSameValue: + case MCCFIInstruction::OpRelOffset: + case MCCFIInstruction::OpOffset: + case MCCFIInstruction::OpRestore: + case MCCFIInstruction::OpUndefined: + case MCCFIInstruction::OpRegister: + case MCCFIInstruction::OpExpression: + case MCCFIInstruction::OpValExpression: { + if (RestoredRegs[Instr.getRegister()]) + return true; + RestoredRegs[Instr.getRegister()] = true; + const int32_t CurRegRule = + RegRule.find(Instr.getRegister()) != RegRule.end() + ? RegRule[Instr.getRegister()] + : UNKNOWN; + if (CurRegRule == UNKNOWN) { + if (Instr.getOperation() == MCCFIInstruction::OpRestore || + Instr.getOperation() == MCCFIInstruction::OpSameValue) + return true; + return false; + } + const MCCFIInstruction &LastDef = + CurRegRule < 0 ? CIE[-CurRegRule] : FDE[CurRegRule]; + return LastDef == Instr; + } + case MCCFIInstruction::OpDefCfaRegister: + if (RestoredCFAReg) + return true; + RestoredCFAReg = true; + return CFAReg == Instr.getRegister(); + case MCCFIInstruction::OpDefCfaOffset: + if (RestoredCFAOffset) + return true; + RestoredCFAOffset = true; + return CFAOffset == Instr.getOffset(); + case MCCFIInstruction::OpDefCfa: + if (RestoredCFAReg && RestoredCFAOffset) + return true; + RestoredCFAReg = true; + RestoredCFAOffset = true; + return CFAReg == Instr.getRegister() && CFAOffset == Instr.getOffset(); + case MCCFIInstruction::OpDefCfaExpression: + if (RestoredCFAReg && RestoredCFAOffset) + return true; + RestoredCFAReg = true; + RestoredCFAOffset = true; + return false; + case MCCFIInstruction::OpAdjustCfaOffset: + case MCCFIInstruction::OpWindowSave: + case MCCFIInstruction::OpEscape: + llvm_unreachable("unsupported CFI opcode"); + return false; + case MCCFIInstruction::OpRememberState: + case MCCFIInstruction::OpRestoreState: + case MCCFIInstruction::OpGnuArgsSize: + // do not affect CFI state + return true; + } + return false; + } +}; + +} // end 
anonymous namespace + +bool BinaryFunction::replayCFIInstrs(int32_t FromState, int32_t ToState, + BinaryBasicBlock *InBB, + BinaryBasicBlock::iterator InsertIt) { + if (FromState == ToState) return true; + assert(FromState < ToState && "can only replay CFIs forward"); + + CFISnapshotDiff CFIDiff(CIEFrameInstructions, FrameInstructions, + FrameRestoreEquivalents, FromState); + + std::vector NewCFIs; + for (auto CurState = FromState; CurState < ToState; ++CurState) { + MCCFIInstruction *Instr = &FrameInstructions[CurState]; + if (Instr->getOperation() == MCCFIInstruction::OpRestoreState) { + auto Iter = FrameRestoreEquivalents.find(CurState); + assert(Iter != FrameRestoreEquivalents.end()); + NewCFIs.insert(NewCFIs.end(), Iter->second.begin(), + Iter->second.end()); + // RestoreState / Remember will be filtered out later by CFISnapshotDiff, + // so we might as well fall-through here. + } + NewCFIs.push_back(CurState); + continue; + } + + // Replay instructions while avoiding duplicates + for (auto I = NewCFIs.rbegin(), E = NewCFIs.rend(); I != E; ++I) { + if (CFIDiff.isRedundant(FrameInstructions[*I])) + continue; + InsertIt = addCFIPseudo(InBB, InsertIt, *I); + } + + return true; +} + +SmallVector +BinaryFunction::unwindCFIState(int32_t FromState, int32_t ToState, + BinaryBasicBlock *InBB, + BinaryBasicBlock::iterator &InsertIt) { + SmallVector NewStates; + + CFISnapshot ToCFITable(CIEFrameInstructions, FrameInstructions, + FrameRestoreEquivalents, ToState); + CFISnapshotDiff FromCFITable(ToCFITable); + FromCFITable.advanceTo(FromState); + + auto undoState = [&](const MCCFIInstruction &Instr) { + switch (Instr.getOperation()) { + case MCCFIInstruction::OpRememberState: + case MCCFIInstruction::OpRestoreState: + break; + case MCCFIInstruction::OpSameValue: + case MCCFIInstruction::OpRelOffset: + case MCCFIInstruction::OpOffset: + case MCCFIInstruction::OpRestore: + case MCCFIInstruction::OpUndefined: + case MCCFIInstruction::OpRegister: + case 
MCCFIInstruction::OpExpression: + case MCCFIInstruction::OpValExpression: { + if (ToCFITable.RegRule.find(Instr.getRegister()) == + ToCFITable.RegRule.end()) { + FrameInstructions.emplace_back( + MCCFIInstruction::createRestore(nullptr, Instr.getRegister())); + if (FromCFITable.isRedundant(FrameInstructions.back())) { + FrameInstructions.pop_back(); + break; + } + NewStates.push_back(FrameInstructions.size() - 1); + InsertIt = addCFIPseudo(InBB, InsertIt, FrameInstructions.size() - 1); + ++InsertIt; + break; + } + const int32_t Rule = ToCFITable.RegRule[Instr.getRegister()]; + if (Rule < 0) { + if (FromCFITable.isRedundant(CIEFrameInstructions[-Rule])) + break; + NewStates.push_back(FrameInstructions.size()); + InsertIt = addCFIPseudo(InBB, InsertIt, FrameInstructions.size()); + ++InsertIt; + FrameInstructions.emplace_back(CIEFrameInstructions[-Rule]); + break; + } + if (FromCFITable.isRedundant(FrameInstructions[Rule])) + break; + NewStates.push_back(Rule); + InsertIt = addCFIPseudo(InBB, InsertIt, Rule); + ++InsertIt; + break; + } + case MCCFIInstruction::OpDefCfaRegister: + case MCCFIInstruction::OpDefCfaOffset: + case MCCFIInstruction::OpDefCfa: + case MCCFIInstruction::OpDefCfaExpression: + if (ToCFITable.CFARule == CFISnapshot::UNKNOWN) { + FrameInstructions.emplace_back(MCCFIInstruction::createDefCfa( + nullptr, ToCFITable.CFAReg, -ToCFITable.CFAOffset)); + if (FromCFITable.isRedundant(FrameInstructions.back())) { + FrameInstructions.pop_back(); + break; + } + NewStates.push_back(FrameInstructions.size() - 1); + InsertIt = addCFIPseudo(InBB, InsertIt, FrameInstructions.size() - 1); + ++InsertIt; + } else if (ToCFITable.CFARule < 0) { + if (FromCFITable.isRedundant(CIEFrameInstructions[-ToCFITable.CFARule])) + break; + NewStates.push_back(FrameInstructions.size()); + InsertIt = addCFIPseudo(InBB, InsertIt, FrameInstructions.size()); + ++InsertIt; + FrameInstructions.emplace_back( + CIEFrameInstructions[-ToCFITable.CFARule]); + } else if 
(!FromCFITable.isRedundant( + FrameInstructions[ToCFITable.CFARule])) { + NewStates.push_back(ToCFITable.CFARule); + InsertIt = addCFIPseudo(InBB, InsertIt, ToCFITable.CFARule); + ++InsertIt; + } + break; + case MCCFIInstruction::OpAdjustCfaOffset: + case MCCFIInstruction::OpWindowSave: + case MCCFIInstruction::OpEscape: + llvm_unreachable("unsupported CFI opcode"); + break; + case MCCFIInstruction::OpGnuArgsSize: + // do not affect CFI state + break; + } }; + + // Undo all modifications from ToState to FromState + for (int32_t I = ToState, E = FromState; I != E; ++I) { + const auto &Instr = FrameInstructions[I]; + if (Instr.getOperation() != MCCFIInstruction::OpRestoreState) { + undoState(Instr); + continue; + } + auto Iter = FrameRestoreEquivalents.find(I); + if (Iter == FrameRestoreEquivalents.end()) + continue; + for (int32_t State : Iter->second) + undoState(FrameInstructions[State]); + } + + return NewStates; +} + +bool BinaryFunction::fixCFIState() { + DEBUG(dbgs() << "Trying to fix CFI states for each BB after reordering.\n"); + DEBUG(dbgs() << "This is the list of CFI states for each BB of " << *this + << ": "); + + std::stack Stack; + auto &OriginalBBOrder = BasicBlocksPreviousLayout.empty() + ? BasicBlocksLayout + : BasicBlocksPreviousLayout; + + // Reordering blocks with remember-restore state instructions can be specially + // tricky. When rewriting the CFI, we omit remember-restore state instructions + // entirely. For restore state, we build a map expanding each restore to the + // equivalent unwindCFIState sequence required at that point to achieve the + // same effect of the restore. All remember state are then just ignored. 
+ for (BinaryBasicBlock *CurBB : OriginalBBOrder) { + for (auto II = CurBB->begin(); II != CurBB->end(); ++II) { + if (auto *CFI = getCFIFor(*II)) { + if (CFI->getOperation() == MCCFIInstruction::OpRememberState) { + Stack.push(II->getOperand(0).getImm()); + BC.MIB->addAnnotation(*II, "DeleteMe", 0U); + continue; + } + if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) { + const int32_t RememberState = Stack.top(); + const int32_t CurState = II->getOperand(0).getImm(); + BC.MIB->addAnnotation(*II, "DeleteMe", 0U); + FrameRestoreEquivalents[CurState] = + unwindCFIState(CurState, RememberState, CurBB, II); + Stack.pop(); + } + } + } + } + int32_t State = 0; - auto *FDEStartBB = BasicBlocksLayout[0]; bool SeenCold = false; auto Sep = ""; (void)Sep; @@ -2241,11 +2586,9 @@ bool BinaryFunction::fixCFIState() { const auto CFIStateAtExit = BB->getCFIStateAtExit(); // Hot-cold border: check if this is the first BB to be allocated in a cold - // region (with a different FDE). If yes, we need to reset the CFI state and - // the FDEStartBB that is used to insert remember_state CFIs. + // region (with a different FDE). If yes, we need to reset the CFI state. if (!SeenCold && BB->isCold()) { State = 0; - FDEStartBB = BB; SeenCold = true; } @@ -2253,55 +2596,10 @@ bool BinaryFunction::fixCFIState() { // state at BB entry point. if (BB->getCFIState() < State) { // In this case, State is currently higher than what this BB expect it - // to be. To solve this, we need to insert a CFI instruction to remember - // the old state at function entry, then another CFI instruction to - // restore it at the entry of this BB and replay CFI instructions to - // reach the desired state. - int32_t OldState = BB->getCFIState(); - // Remember state at function entry point (our reference state). 
- auto InsertIt = FDEStartBB->begin(); - while (InsertIt != FDEStartBB->end() && BC.MIB->isCFI(*InsertIt)) - ++InsertIt; - addCFIPseudo(FDEStartBB, InsertIt, FrameInstructions.size()); - FrameInstructions.emplace_back( - MCCFIInstruction::createRememberState(nullptr)); - // Restore state - InsertIt = addCFIPseudo(BB, BB->begin(), FrameInstructions.size()); - ++InsertIt; - FrameInstructions.emplace_back( - MCCFIInstruction::createRestoreState(nullptr)); - if (!replayCFIInstrs(0, OldState, BB, InsertIt)) - return false; - // Check if we messed up the stack in this process - int StackOffset = 0; - for (BinaryBasicBlock *CurBB : BasicBlocksLayout) { - if (CurBB == BB) - break; - for (auto &Instr : *CurBB) { - if (auto *CFI = getCFIFor(Instr)) { - if (CFI->getOperation() == MCCFIInstruction::OpRememberState) - ++StackOffset; - if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) - --StackOffset; - } - } - } - auto Pos = BB->begin(); - while (Pos != BB->end() && BC.MIB->isCFI(*Pos)) { - auto CFI = getCFIFor(*Pos); - if (CFI->getOperation() == MCCFIInstruction::OpRememberState) - ++StackOffset; - if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) - --StackOffset; - ++Pos; - } - - if (StackOffset != 0) { - errs() << "BOLT-WARNING: not possible to remember/recover state" - << " without corrupting CFI state stack in function " - << *this << " @ " << BB->getName() << "\n"; - return false; - } + // to be. To solve this, we need to insert CFI instructions to undo + // the effect of all CFI from BB's state to current State. + auto InsertIt = BB->begin(); + unwindCFIState(State, BB->getCFIState(), BB, InsertIt); } else if (BB->getCFIState() > State) { // If BB's CFI state is greater than State, it means we are behind in the // state. 
Just emit all instructions to reach this state at the @@ -2315,6 +2613,12 @@ bool BinaryFunction::fixCFIState() { DEBUG(dbgs() << Sep << State; Sep = ", "); } DEBUG(dbgs() << "\n"); + + for (auto BB : BasicBlocksLayout) + for (auto I = BB->rbegin(), E = BB->rend(); I != E; ++I) + if (BC.MIB->hasAnnotation(*I, "DeleteMe")) + BB->eraseInstruction(&*I); + return true; } diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index d753f8845074..b4ae1359b992 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -496,6 +496,11 @@ class BinaryFunction { /// in a different order. CFIInstrMapType FrameInstructions; + /// A map of restore state CFI instructions to their equivalent CFI + /// instructions that produce the same state, in order to eliminate + /// remember-restore CFI instructions when rewriting CFI. + DenseMap> FrameRestoreEquivalents; + /// Exception handling ranges. struct CallSite { const MCSymbol *Start; @@ -802,10 +807,8 @@ class BinaryFunction { } /// Update layout of basic blocks used for output. - void updateBasicBlockLayout(BasicBlockOrderType &NewLayout, - bool SavePrevLayout) { - if (SavePrevLayout) - BasicBlocksPreviousLayout = BasicBlocksLayout; + void updateBasicBlockLayout(BasicBlockOrderType &NewLayout) { + BasicBlocksPreviousLayout = BasicBlocksLayout; if (NewLayout != BasicBlocksLayout) { ModifiedLayout = true; @@ -2013,6 +2016,20 @@ class BinaryFunction { return FrameInstructions; } + void moveRememberRestorePair(BinaryBasicBlock *BB); + + bool replayCFIInstrs(int32_t FromState, int32_t ToState, + BinaryBasicBlock *InBB, + BinaryBasicBlock::iterator InsertIt); + + /// unwindCFIState is used to unwind from a higher to a lower state number + /// without using remember-restore instructions. We do that by keeping track + /// of what values have been changed from state A to B and emitting + /// instructions that undo this change. 
+ SmallVector unwindCFIState(int32_t FromState, int32_t ToState, + BinaryBasicBlock *InBB, + BinaryBasicBlock::iterator &InsertIt); + /// After reordering, this function checks the state of CFI and fixes it if it /// is corrupted. If it is unable to fix it, it returns false. bool fixCFIState(); diff --git a/bolt/src/Passes/BinaryPasses.cpp b/bolt/src/Passes/BinaryPasses.cpp index e42a8551417d..c340eaf105dc 100644 --- a/bolt/src/Passes/BinaryPasses.cpp +++ b/bolt/src/Passes/BinaryPasses.cpp @@ -489,7 +489,7 @@ void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF, Algo->reorderBasicBlocks(BF, NewLayout); - BF.updateBasicBlockLayout(NewLayout, /*SavePrevLayout=*/opts::PrintFuncStat); + BF.updateBasicBlockLayout(NewLayout); if (Split) splitFunction(BF); diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 7be8a1b49e8d..3f49932cd4eb 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -1105,9 +1105,23 @@ void RewriteInstance::discoverFileObjects() { } } - // Sort symbols in the file by value. - std::vector SortedFileSymbols(InputFile->symbol_begin(), - InputFile->symbol_end()); + // Sort symbols in the file by value. Ignore symbols from non-allocatable + // sections. + auto isSymbolInMemory = [this](const SymbolRef &Sym) { + if (cantFail(Sym.getType()) == SymbolRef::ST_File) + return false; + if (Sym.getFlags() & SymbolRef::SF_Absolute) + return true; + if (Sym.getFlags() & SymbolRef::SF_Undefined) + return false; + BinarySection Section(*BC, *cantFail(Sym.getSection())); + return Section.isAllocatable(); + }; + std::vector SortedFileSymbols; + std::copy_if(InputFile->symbol_begin(), InputFile->symbol_end(), + std::back_inserter(SortedFileSymbols), + isSymbolInMemory); + std::stable_sort(SortedFileSymbols.begin(), SortedFileSymbols.end(), [](const SymbolRef &A, const SymbolRef &B) { // FUNC symbols have higher precedence. 
From ec0929aded2e57b97986988c704822ac65b30537 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 24 Sep 2018 20:58:31 -0700 Subject: [PATCH 484/904] [BOLT] Keep .text section in file when using old text Summary: If we reuse text section under `-use-old-text` option, then there's no need to rename it. Tools, such as perf, seem to not like binaries without `.text`. Additionally, check if the code fits into `.text` using the page alignment, otherwise we were skipping the alignment relying on the user detecting the warning message. This could have resulted in unexpected performance drops. Also add `-no-huge-pages` option to use regular page size for code alignment purposes (i.e. 4KiB instead of 2MiB). (cherry picked from commit a6d41445c687903c576951f95323b4ee0355f0e1) --- bolt/src/BinaryContext.cpp | 43 ++++++++++++++++++++++++++++++++++++ bolt/src/BinaryContext.h | 29 +++++++++--------------- bolt/src/Passes/LongJmp.cpp | 5 ++--- bolt/src/RewriteInstance.cpp | 43 ++++++++++++++++++++---------------- bolt/src/RewriteInstance.h | 3 --- 5 files changed, 79 insertions(+), 44 deletions(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index 32da5830479b..dad0bfdf9030 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -34,6 +34,13 @@ extern cl::OptionCategory BoltCategory; extern cl::opt Verbosity; +cl::opt +NoHugePages("no-huge-pages", + cl::desc("use regular size pages for code alignment"), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltCategory)); + static cl::opt PrintDebugInfo("print-debug-info", cl::desc("print debug info when printing functions"), @@ -57,6 +64,42 @@ PrintMemData("print-mem-data", } // namespace opts +BinaryContext::BinaryContext(std::unique_ptr Ctx, + std::unique_ptr DwCtx, + std::unique_ptr TheTriple, + const Target *TheTarget, + std::string TripleName, + std::unique_ptr MCE, + std::unique_ptr MOFI, + std::unique_ptr AsmInfo, + std::unique_ptr MII, + std::unique_ptr STI, + std::unique_ptr 
InstPrinter, + std::unique_ptr MIA, + std::unique_ptr MIB, + std::unique_ptr MRI, + std::unique_ptr DisAsm, + DataReader &DR) + : Ctx(std::move(Ctx)), + DwCtx(std::move(DwCtx)), + TheTriple(std::move(TheTriple)), + TheTarget(TheTarget), + TripleName(TripleName), + MCE(std::move(MCE)), + MOFI(std::move(MOFI)), + AsmInfo(std::move(AsmInfo)), + MII(std::move(MII)), + STI(std::move(STI)), + InstPrinter(std::move(InstPrinter)), + MIA(std::move(MIA)), + MIB(std::move(MIB)), + MRI(std::move(MRI)), + DisAsm(std::move(DisAsm)), + DR(DR) { + Relocation::Arch = this->TheTriple->getArch(); + PageAlign = opts::NoHugePages ? RegularPageSize : HugePageSize; +} + BinaryContext::~BinaryContext() { for (auto *Section : Sections) { delete Section; diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index b5c513dd2fa5..be5b581aa72d 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -211,6 +211,12 @@ class BinaryContext { } public: + /// Regular page size. + static constexpr unsigned RegularPageSize = 0x1000; + + /// Huge page size to use. + static constexpr unsigned HugePageSize = 0x200000; + /// Map address to a constant island owner (constant data in code section) std::map AddressToConstantIslandMap; @@ -282,6 +288,9 @@ class BinaryContext { uint64_t OldTextSectionOffset{0}; uint64_t OldTextSectionSize{0}; + /// Page alignment used for code layout. + uint64_t PageAlign{HugePageSize}; + /// True if the binary requires immediate relocation processing. 
bool RequiresZNow{false}; @@ -303,25 +312,7 @@ class BinaryContext { std::unique_ptr MIB, std::unique_ptr MRI, std::unique_ptr DisAsm, - DataReader &DR) : - Ctx(std::move(Ctx)), - DwCtx(std::move(DwCtx)), - TheTriple(std::move(TheTriple)), - TheTarget(TheTarget), - TripleName(TripleName), - MCE(std::move(MCE)), - MOFI(std::move(MOFI)), - AsmInfo(std::move(AsmInfo)), - MII(std::move(MII)), - STI(std::move(STI)), - InstPrinter(std::move(InstPrinter)), - MIA(std::move(MIA)), - MIB(std::move(MIB)), - MRI(std::move(MRI)), - DisAsm(std::move(DisAsm)), - DR(DR) { - Relocation::Arch = this->TheTriple->getArch(); - } + DataReader &DR); ~BinaryContext(); diff --git a/bolt/src/Passes/LongJmp.cpp b/bolt/src/Passes/LongJmp.cpp index c5ebd85792f2..67d233a030dc 100644 --- a/bolt/src/Passes/LongJmp.cpp +++ b/bolt/src/Passes/LongJmp.cpp @@ -26,7 +26,6 @@ namespace bolt { namespace { constexpr unsigned ColdFragAlign = 16; -constexpr unsigned PageAlign = 0x200000; std::pair, MCSymbol *> createNewStub(const BinaryContext &BC, BinaryFunction &Func, @@ -298,12 +297,12 @@ void LongJmpPass::tentativeLayout( // Initial padding if (opts::UseOldText && EstimatedTextSize <= BC.OldTextSectionSize) { DotAddress = BC.OldTextSectionAddress; - auto Pad = OffsetToAlignment(DotAddress, PageAlign); + auto Pad = OffsetToAlignment(DotAddress, BC.PageAlign); if (Pad + EstimatedTextSize <= BC.OldTextSectionSize) { DotAddress += Pad; } } else { - DotAddress = alignTo(BC.LayoutStartAddress, PageAlign); + DotAddress = alignTo(BC.LayoutStartAddress, BC.PageAlign); } tentativeLayoutRelocMode(BC, SortedFunctions, DotAddress); diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 3f49932cd4eb..43e18366d236 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -859,8 +859,8 @@ void RewriteInstance::discoverStorage() { FirstNonAllocatableOffset = NextAvailableOffset; - NextAvailableAddress = alignTo(NextAvailableAddress, PageAlign); - NextAvailableOffset = 
alignTo(NextAvailableOffset, PageAlign); + NextAvailableAddress = alignTo(NextAvailableAddress, BC->PageAlign); + NextAvailableOffset = alignTo(NextAvailableOffset, BC->PageAlign); if (!opts::UseGnuStack) { // This is where the black magic happens. Creating PHDR table in a segment @@ -2993,25 +2993,25 @@ void RewriteInstance::mapTextSections(orc::VModuleKey Key) { assert(TextSection && ".text not found in output"); uint64_t NewTextSectionOffset = 0; + auto Padding = OffsetToAlignment(BC->OldTextSectionAddress, BC->PageAlign); if (opts::UseOldText && - TextSection->getOutputSize() <= BC->OldTextSectionSize) { - outs() << "BOLT-INFO: using original .text for new code\n"; + Padding + TextSection->getOutputSize() <= BC->OldTextSectionSize) { // Utilize the original .text for storage. - NewTextSectionStartAddress = BC->OldTextSectionAddress; - NewTextSectionOffset = BC->OldTextSectionOffset; - auto Padding = OffsetToAlignment(NewTextSectionStartAddress, PageAlign); - if (Padding + TextSection->getOutputSize() <= BC->OldTextSectionSize) { - outs() << "BOLT-INFO: using 0x200000 alignment\n"; - NewTextSectionStartAddress += Padding; - NewTextSectionOffset += Padding; - } + outs() << "BOLT-INFO: using original .text for new code with 0x" + << Twine::utohexstr(BC->PageAlign) << " alignment\n"; + NewTextSectionStartAddress = BC->OldTextSectionAddress + Padding; + NewTextSectionOffset = BC->OldTextSectionOffset + Padding; } else { if (opts::UseOldText) { - errs() << "BOLT-ERROR: original .text too small to fit the new code. " - << TextSection->getOutputSize() << " bytes needed, have " - << BC->OldTextSectionSize << " bytes available.\n"; + errs() << "BOLT-WARNING: original .text too small to fit the new code" + << " using 0x" << Twine::utohexstr(BC->PageAlign) + << " aligment. 
" << Padding + TextSection->getOutputSize() + << " bytes needed, have " << BC->OldTextSectionSize + << " bytes available.\n"; + opts::UseOldText = false; } - auto Padding = OffsetToAlignment(NewTextSectionStartAddress, PageAlign); + auto Padding = OffsetToAlignment(NewTextSectionStartAddress, + BC->PageAlign); NextAvailableAddress += Padding; NewTextSectionStartAddress = NextAvailableAddress; NewTextSectionOffset = getFileOffsetForAddress(NextAvailableAddress); @@ -3033,7 +3033,8 @@ void RewriteInstance::mapTextSections(orc::VModuleKey Key) { assert(TextSection && ".text not found in output"); if (TextSection->hasValidSectionID()) { uint64_t NewTextSectionOffset = 0; - auto Padding = OffsetToAlignment(NewTextSectionStartAddress, PageAlign); + auto Padding = OffsetToAlignment(NewTextSectionStartAddress, + BC->PageAlign); NextAvailableAddress += Padding; NewTextSectionStartAddress = NextAvailableAddress; NewTextSectionOffset = getFileOffsetForAddress(NextAvailableAddress); @@ -3437,7 +3438,7 @@ void RewriteInstance::patchELFPHDRTable() { NewPhdr.p_filesz = NewTextSegmentSize; NewPhdr.p_memsz = NewTextSegmentSize; NewPhdr.p_flags = ELF::PF_X | ELF::PF_R; - NewPhdr.p_align = PageAlign; + NewPhdr.p_align = BC->PageAlign; ModdedGnuStack = true; } else if (!opts::UseGnuStack && Phdr.p_type == ELF::PT_DYNAMIC) { // Insert new pheader @@ -3449,7 +3450,7 @@ void RewriteInstance::patchELFPHDRTable() { NewTextPhdr.p_filesz = NewTextSegmentSize; NewTextPhdr.p_memsz = NewTextSegmentSize; NewTextPhdr.p_flags = ELF::PF_X | ELF::PF_R; - NewTextPhdr.p_align = PageAlign; + NewTextPhdr.p_align = BC->PageAlign; OS.write(reinterpret_cast(&NewTextPhdr), sizeof(NewTextPhdr)); AddedSegment = true; @@ -4708,6 +4709,10 @@ bool RewriteInstance::willOverwriteSection(StringRef SectionName) { return true; } + // Special handling for .text + if (SectionName == ".text" && opts::UseOldText) + return false; + auto Section = BC->getUniqueSectionByName(SectionName); return Section && 
Section->isAllocatable() && Section->isFinalized(); } diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index be00f87aa94f..fb0f4b42f038 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -404,9 +404,6 @@ class RewriteInstance { static const char TimerGroupDesc[]; - /// Huge page size used for alignment. - static constexpr unsigned PageAlign = 0x200000; - /// Alignment value used for .eh_frame_hdr. static constexpr uint64_t EHFrameHdrAlign = 4; From b7a42ebc294787526bce3807ce946f5829d92b8b Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 17 Sep 2018 13:36:59 -0700 Subject: [PATCH 485/904] [BOLT] Change stub-insertion pass for AArch64 Summary: Previously, we were expanding eligible branches with stubs. After expansion, we were computing which stubs were unnecessary and removing them, assuming ranges were shortening as code is removed. The problem with this approach is that for branches that refer to code that is not managed by BOLT, the distance to that location can increase and we can end up with an out-of-range branch. This rewrites the pass to be simpler, only increasing size and expanding code with stubs as needed after each iteration, stopping when code stops increasing. Besides this rewrite, the stub-insertion pass now supports stubs grouping similar to what the linker does, allowing different functions to share the same veneer that jumps to a common callee. It also fixes a bug in the previous implementation that, in very large functions that use TBZ/TBNZ (+-32KB range), it would mistakenly try to reuse a local stub BB that is out of range. This includes a change to allow hot functions to be put at the end of the .text section, closer to the heap, requiring no veneers to jump to JITted code. And finally it enables eliminate veneers pass by default. 
(cherry picked from commit 813065787d6217572c67275e2c0c395e230c106c) --- bolt/src/BinaryContext.cpp | 31 +- bolt/src/BinaryFunction.cpp | 8 +- bolt/src/MCPlusBuilder.h | 2 +- bolt/src/Passes/LongJmp.cpp | 572 +++++++++++------- bolt/src/Passes/LongJmp.h | 125 ++-- bolt/src/Passes/VeneerElimination.cpp | 2 +- bolt/src/RewriteInstance.cpp | 36 +- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 2 +- 8 files changed, 483 insertions(+), 295 deletions(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index dad0bfdf9030..5a21a812b576 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -62,6 +62,14 @@ PrintMemData("print-mem-data", cl::ZeroOrMore, cl::cat(BoltCategory)); +cl::opt +HotFunctionsAtEnd( + "hot-functions-at-end", + cl::desc( + "if reorder-functions is used, order functions putting hottest last"), + cl::ZeroOrMore, + cl::cat(BoltCategory)); + } // namespace opts BinaryContext::BinaryContext(std::unique_ptr Ctx, @@ -683,6 +691,15 @@ unsigned BinaryContext::addDebugFilenameToUnit(const uint32_t DestCUID, return cantFail(Ctx->getDwarfFile(Dir, FileName, 0, nullptr, None, DestCUID)); } +template +bool Comparator(const BinaryFunction *A, const BinaryFunction *B) { + if (A->hasValidIndex() && B->hasValidIndex()) { + return A->getIndex() < B->getIndex(); + } else { + return FuncsAtEnd ? 
B->hasValidIndex() : A->hasValidIndex(); + } +} + std::vector BinaryContext::getSortedFunctions( std::map &BinaryFunctions) { std::vector SortedFunctions(BinaryFunctions.size()); @@ -692,14 +709,14 @@ std::vector BinaryContext::getSortedFunctions( return &BFI.second; }); + if (opts::HotFunctionsAtEnd) { + std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(), + Comparator); + return SortedFunctions; + } + std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(), - [](const BinaryFunction *A, const BinaryFunction *B) { - if (A->hasValidIndex() && B->hasValidIndex()) { - return A->getIndex() < B->getIndex(); - } else { - return A->hasValidIndex(); - } - }); + Comparator); return SortedFunctions; } diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 555dcb49151a..48bbc3620160 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -3241,8 +3241,7 @@ void BinaryFunction::fixBranches() { assert(CondBranch && "conditional branch expected"); const auto *TSuccessor = BB->getConditionalSuccessor(true); const auto *FSuccessor = BB->getConditionalSuccessor(false); - if (NextBB && NextBB == TSuccessor && - !BC.MIB->hasAnnotation(*CondBranch, "DoNotChangeTarget")) { + if (NextBB && NextBB == TSuccessor) { std::swap(TSuccessor, FSuccessor); MIB->reverseBranchCondition(*CondBranch, TSuccessor->getLabel(), Ctx); BB->swapConditionalSuccessors(); @@ -3252,10 +3251,7 @@ void BinaryFunction::fixBranches() { if (TSuccessor == FSuccessor) { BB->removeDuplicateConditionalSuccessor(CondBranch); } - if (!NextBB || - ((NextBB != TSuccessor || - BC.MIB->hasAnnotation(*CondBranch, "DoNotChangeTarget")) && - NextBB != FSuccessor)) { + if (!NextBB || (NextBB != TSuccessor && NextBB != FSuccessor)) { BB->addBranchInstruction(FSuccessor); } } diff --git a/bolt/src/MCPlusBuilder.h b/bolt/src/MCPlusBuilder.h index 3438342a8aa1..d4beb0862228 100644 --- a/bolt/src/MCPlusBuilder.h +++ b/bolt/src/MCPlusBuilder.h @@ -1102,7 +1102,7 @@ class 
MCPlusBuilder { /// Returns how many bits we have in this instruction to encode a PC-rel /// imm. - virtual int getPCRelEncodingSize(MCInst &Inst) const { + virtual int getPCRelEncodingSize(const MCInst &Inst) const { llvm_unreachable("not implemented"); return 0; } diff --git a/bolt/src/Passes/LongJmp.cpp b/bolt/src/Passes/LongJmp.cpp index 67d233a030dc..e8b6719d1bcc 100644 --- a/bolt/src/Passes/LongJmp.cpp +++ b/bolt/src/Passes/LongJmp.cpp @@ -16,9 +16,19 @@ using namespace llvm; namespace opts { +extern cl::OptionCategory BoltOptCategory; + extern cl::opt UseOldText; extern cl::opt AlignFunctions; extern cl::opt AlignFunctionsMaxBytes; +extern cl::opt HotFunctionsAtEnd; + +static cl::opt +GroupStubs("group-stubs", + cl::desc("share stubs across functions"), + cl::init(true), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); } namespace llvm { @@ -27,34 +37,20 @@ namespace bolt { namespace { constexpr unsigned ColdFragAlign = 16; -std::pair, MCSymbol *> -createNewStub(const BinaryContext &BC, BinaryFunction &Func, - const MCSymbol *TgtSym) { - auto *StubSym = BC.Ctx->createTempSymbol("Stub", true); - auto StubBB = Func.createBasicBlock(0, StubSym); - std::vector Seq; - BC.MIB->createLongJmp(Seq, TgtSym, BC.Ctx.get()); - StubBB->addInstructions(Seq.begin(), Seq.end()); - StubBB->setExecutionCount(0); - return std::make_pair(std::move(StubBB), StubSym); -} - -void shrinkStubToShortJmp(const BinaryContext &BC, BinaryBasicBlock &StubBB, - const MCSymbol *Tgt) { +void relaxStubToShortJmp(BinaryBasicBlock &StubBB, const MCSymbol *Tgt) { + const BinaryContext &BC = StubBB.getFunction()->getBinaryContext(); std::vector Seq; BC.MIB->createShortJmp(Seq, Tgt, BC.Ctx.get()); StubBB.clear(); StubBB.addInstructions(Seq.begin(), Seq.end()); } -void shrinkStubToSingleInst(const BinaryContext &BC, BinaryBasicBlock &StubBB, - const MCSymbol *Tgt, bool TgtIsFunc) { - MCInst Inst; - BC.MIB->createUncondBranch(Inst, Tgt, BC.Ctx.get()); - if (TgtIsFunc) - 
BC.MIB->convertJmpToTailCall(Inst, BC.Ctx.get()); +void relaxStubToLongJmp(BinaryBasicBlock &StubBB, const MCSymbol *Tgt) { + const BinaryContext &BC = StubBB.getFunction()->getBinaryContext(); + std::vector Seq; + BC.MIB->createLongJmp(Seq, Tgt, BC.Ctx.get()); StubBB.clear(); - StubBB.addInstruction(Inst); + StubBB.addInstructions(Seq.begin(), Seq.end()); } BinaryBasicBlock *getBBAtHotColdSplitPoint(BinaryFunction &Func) { @@ -69,41 +65,168 @@ BinaryBasicBlock *getBBAtHotColdSplitPoint(BinaryFunction &Func) { } llvm_unreachable("No hot-colt split point found"); } + +bool shouldInsertStub(const BinaryContext &BC, const MCInst &Inst) { + return (BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst)) && + !BC.MIB->isIndirectBranch(Inst) && !BC.MIB->isIndirectCall(Inst); +} + +} // end anonymous namespace + +std::pair, MCSymbol *> +LongJmpPass::createNewStub(BinaryBasicBlock &SourceBB, const MCSymbol *TgtSym, + bool TgtIsFunc, uint64_t AtAddress) { + BinaryFunction &Func = *SourceBB.getFunction(); + const BinaryContext &BC = Func.getBinaryContext(); + const bool IsCold = SourceBB.isCold(); + auto *StubSym = BC.Ctx->createTempSymbol("Stub", true); + auto StubBB = Func.createBasicBlock(0, StubSym); + MCInst Inst; + BC.MIB->createUncondBranch(Inst, TgtSym, BC.Ctx.get()); + if (TgtIsFunc) + BC.MIB->convertJmpToTailCall(Inst, BC.Ctx.get()); + StubBB->addInstruction(Inst); + StubBB->setExecutionCount(0); + + // Register this in stubs maps + auto registerInMap = [&](StubGroupsTy &Map) { + auto &StubGroup = Map[TgtSym]; + StubGroup.insert( + std::lower_bound( + StubGroup.begin(), StubGroup.end(), + std::make_pair(AtAddress, nullptr), + [&](const std::pair &LHS, + const std::pair &RHS) { + return LHS.first < RHS.first; + }), + std::make_pair(AtAddress, StubBB.get())); + }; + + Stubs[&Func].insert(StubBB.get()); + StubBits[StubBB.get()] = BC.MIB->getUncondBranchEncodingSize(); + if (IsCold) { + registerInMap(ColdLocalStubs[&Func]); + if (opts::GroupStubs && TgtIsFunc) + 
registerInMap(ColdStubGroups); + ++NumColdStubs; + } else { + registerInMap(HotLocalStubs[&Func]); + if (opts::GroupStubs && TgtIsFunc) + registerInMap(HotStubGroups); + ++NumHotStubs; + } + + return std::make_pair(std::move(StubBB), StubSym); +} + +BinaryBasicBlock *LongJmpPass::lookupStubFromGroup( + const StubGroupsTy &StubGroups, const BinaryFunction &Func, + const MCInst &Inst, const MCSymbol *TgtSym, uint64_t DotAddress) const { + const BinaryContext &BC = Func.getBinaryContext(); + auto CandidatesIter = StubGroups.find(TgtSym); + if (CandidatesIter == StubGroups.end()) + return nullptr; + auto &Candidates = CandidatesIter->second; + if (Candidates.empty()) + return nullptr; + auto Cand = std::lower_bound( + Candidates.begin(), Candidates.end(), std::make_pair(DotAddress, nullptr), + [&](const std::pair &LHS, + const std::pair &RHS) { + return LHS.first < RHS.first; + }); + if (Cand != Candidates.begin()) { + auto LeftCand = Cand; + --LeftCand; + if (Cand->first - DotAddress > + DotAddress - LeftCand->first) + Cand = LeftCand; + } + auto BitsAvail = BC.MIB->getPCRelEncodingSize(Inst) - 1; + uint64_t Mask = ~((1ULL << BitsAvail) - 1); + uint64_t PCRelTgtAddress = Cand->first; + PCRelTgtAddress = DotAddress > PCRelTgtAddress ? DotAddress - PCRelTgtAddress + : PCRelTgtAddress - DotAddress; + DEBUG({ + if (Candidates.size() > 1) + dbgs() << "Considering stub group with " << Candidates.size() + << " candidates. DotAddress is " << Twine::utohexstr(DotAddress) + << ", chosen candidate address is " + << Twine::utohexstr(Cand->first) << "\n"; + }); + return PCRelTgtAddress & Mask ? nullptr : Cand->second; +} + +BinaryBasicBlock * +LongJmpPass::lookupGlobalStub(const BinaryBasicBlock &SourceBB, + const MCInst &Inst, const MCSymbol *TgtSym, + uint64_t DotAddress) const { + const BinaryFunction &Func = *SourceBB.getFunction(); + const StubGroupsTy &StubGroups = + SourceBB.isCold() ? 
ColdStubGroups : HotStubGroups; + return lookupStubFromGroup(StubGroups, Func, Inst, TgtSym, + DotAddress); +} + +BinaryBasicBlock *LongJmpPass::lookupLocalStub(const BinaryBasicBlock &SourceBB, + const MCInst &Inst, + const MCSymbol *TgtSym, + uint64_t DotAddress) const { + const BinaryFunction &Func = *SourceBB.getFunction(); + const DenseMap &StubGroups = + SourceBB.isCold() ? ColdLocalStubs : HotLocalStubs; + const auto Iter = StubGroups.find(&Func); + if (Iter == StubGroups.end()) + return nullptr; + return lookupStubFromGroup(Iter->second, Func, Inst, TgtSym, DotAddress); } std::unique_ptr -LongJmpPass::replaceTargetWithStub(const BinaryContext &BC, - BinaryFunction &Func, BinaryBasicBlock &BB, - MCInst &Inst) { +LongJmpPass::replaceTargetWithStub(BinaryBasicBlock &BB, MCInst &Inst, + uint64_t DotAddress, + uint64_t StubCreationAddress) { + const BinaryFunction &Func = *BB.getFunction(); + const BinaryContext &BC = Func.getBinaryContext(); std::unique_ptr NewBB; auto TgtSym = BC.MIB->getTargetSymbol(Inst); assert (TgtSym && "getTargetSymbol failed"); BinaryBasicBlock::BinaryBranchInfo BI{0, 0}; auto *TgtBB = BB.getSuccessor(TgtSym, BI); - // Do not issue a long jmp for blocks in the same region, except if - // the region is too large to fit in this branch - if (TgtBB && TgtBB->isCold() == BB.isCold()) { - // Suppose we have half the available space to account for increase in the - // function size due to extra blocks being inserted (conservative estimate) - auto BitsAvail = BC.MIB->getPCRelEncodingSize(Inst) - 2; - uint64_t Mask = ~((1ULL << BitsAvail) - 1); - if (!(Func.getMaxSize() & Mask)) - return nullptr; - // This is a special case for fixBranches, which is usually free to swap - // targets when a block has two successors. The other successor may not - // fit in this instruction as well. 
- BC.MIB->addAnnotation(Inst, "DoNotChangeTarget", true); + auto LocalStubsIter = Stubs.find(&Func); + + // If already using stub and the stub is from another function, create a local + // stub, since the foreign stub is now out of range + if (!TgtBB) { + auto SSIter = SharedStubs.find(TgtSym); + if (SSIter != SharedStubs.end()) { + TgtSym = BC.MIB->getTargetSymbol(*SSIter->second->begin()); + --NumSharedStubs; + } + } else if (LocalStubsIter != Stubs.end() && + LocalStubsIter->second.count(TgtBB)) { + // If we are replacing a local stub (because it is now out of range), + // use its target instead of creating a stub to jump to another stub + TgtSym = BC.MIB->getTargetSymbol(*TgtBB->begin()); + TgtBB = BB.getSuccessor(TgtSym, BI); } - BinaryBasicBlock *StubBB = - BB.isCold() ? ColdStubs[&Func][TgtSym] : HotStubs[&Func][TgtSym]; + BinaryBasicBlock *StubBB = lookupLocalStub(BB, Inst, TgtSym, DotAddress); + // If not found, look it up in globally shared stub maps if it is a function + // call (TgtBB is not set) + if (!StubBB && !TgtBB) { + StubBB = lookupGlobalStub(BB, Inst, TgtSym, DotAddress); + if (StubBB) { + SharedStubs[StubBB->getLabel()] = StubBB; + ++NumSharedStubs; + } + } MCSymbol *StubSymbol = StubBB ? 
StubBB->getLabel() : nullptr; if (!StubBB) { - std::tie(NewBB, StubSymbol) = createNewStub(BC, Func, TgtSym); + std::tie(NewBB, StubSymbol) = + createNewStub(BB, TgtSym, /*is func?*/ !TgtBB, StubCreationAddress); StubBB = NewBB.get(); - Stubs[&Func].insert(StubBB); } // Local branch @@ -128,72 +251,34 @@ LongJmpPass::replaceTargetWithStub(const BinaryContext &BC, } } BC.MIB->replaceBranchTarget(Inst, StubSymbol, BC.Ctx.get()); - ++StubRefCount[StubBB]; - StubBits[StubBB] = BC.AsmInfo->getCodePointerSize() * 8; - - if (NewBB) { - if (BB.isCold()) - ColdStubs[&Func][TgtSym] = StubBB; - else - HotStubs[&Func][TgtSym] = StubBB; - } return NewBB; } -namespace { - -bool shouldInsertStub(const BinaryContext &BC, const MCInst &Inst) { - return (BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst)) && - !BC.MIB->isIndirectBranch(Inst) && !BC.MIB->isIndirectCall(Inst); -} - -} - -void LongJmpPass::insertStubs(const BinaryContext &BC, BinaryFunction &Func) { - std::vector>> - Insertions; - - BinaryBasicBlock *Frontier = getBBAtHotColdSplitPoint(Func); - - for (auto &BB : Func) { - for (auto &Inst : BB) { - // Only analyze direct branches with target distance constraints - if (!shouldInsertStub(BC, Inst)) - continue; - - // Insert stubs close to the patched BB if call, but far away from the - // hot path if a branch, since this branch target is the cold region. - BinaryBasicBlock *InsertionPoint = &BB; - if (!BC.MIB->isCall(Inst) && Frontier && !BB.isCold()) { - auto BitsAvail = BC.MIB->getPCRelEncodingSize(Inst) - 2; - uint64_t Mask = ~((1ULL << BitsAvail) - 1); - if (!(Func.getMaxSize() & Mask)) - InsertionPoint = Frontier; +void LongJmpPass::updateStubGroups() { + auto update = [&](StubGroupsTy &StubGroups) { + for (auto &KeyVal : StubGroups) { + for (auto &Elem : KeyVal.second) { + Elem.first = BBAddresses[Elem.second]; } - // Always put stubs at the end of the function if non-simple. 
We can't - // change the layout of non-simple functions because it has jump tables - // that we do not control. - if (!Func.isSimple()) - InsertionPoint = &*std::prev(Func.end()); - // Create a stub to handle a far-away target - Insertions.emplace_back(std::make_pair( - InsertionPoint, replaceTargetWithStub(BC, Func, BB, Inst))); + std::sort(KeyVal.second.begin(), KeyVal.second.end(), + [&](const std::pair &LHS, + const std::pair &RHS) { + return LHS.first < RHS.first; + }); } - } - - for (auto &Elmt : Insertions) { - if (!Elmt.second) - continue; - std::vector> NewBBs; - NewBBs.emplace_back(std::move(Elmt.second)); - Func.insertBasicBlocks(Elmt.first, std::move(NewBBs), true, true); - } - + }; + + for (auto &KeyVal : HotLocalStubs) + update(KeyVal.second); + for (auto &KeyVal : ColdLocalStubs) + update(KeyVal.second); + update(HotStubGroups); + update(ColdStubGroups); } -void LongJmpPass::tentativeBBLayout(const BinaryContext &BC, - const BinaryFunction &Func) { +void LongJmpPass::tentativeBBLayout(const BinaryFunction &Func) { + const BinaryContext &BC = Func.getBinaryContext(); uint64_t HotDot = HotAddresses[&Func]; uint64_t ColdDot = ColdAddresses[&Func]; bool Cold{false}; @@ -235,21 +320,32 @@ uint64_t LongJmpPass::tentativeLayoutRelocMode( // Compute hot cold frontier uint32_t LastHotIndex = -1u; uint32_t CurrentIndex = 0; - for (auto *BF : SortedFunctions) { - if (!BF->hasValidIndex() && LastHotIndex == -1u) { - LastHotIndex = CurrentIndex; + if (opts::HotFunctionsAtEnd) { + for (auto *BF : SortedFunctions) { + if (BF->hasValidIndex() && LastHotIndex == -1u) { + LastHotIndex = CurrentIndex; + } + ++CurrentIndex; + } + } else { + for (auto *BF : SortedFunctions) { + if (!BF->hasValidIndex() && LastHotIndex == -1u) { + LastHotIndex = CurrentIndex; + } + ++CurrentIndex; } - ++CurrentIndex; } // Hot CurrentIndex = 0; bool ColdLayoutDone = false; for (auto Func : SortedFunctions) { - if (!ColdLayoutDone && CurrentIndex >= LastHotIndex){ + if (!ColdLayoutDone && 
CurrentIndex >= LastHotIndex) { DotAddress = tentativeLayoutRelocColdPart(BC, SortedFunctions, DotAddress); ColdLayoutDone = true; + if (opts::HotFunctionsAtEnd) + DotAddress = alignTo(DotAddress, BC.PageAlign); } DotAddress = alignTo(DotAddress, BinaryFunction::MinAlign); @@ -268,7 +364,7 @@ uint64_t LongJmpPass::tentativeLayoutRelocMode( } // BBs for (auto Func : SortedFunctions) - tentativeBBLayout(BC, *Func); + tentativeBBLayout(*Func); return DotAddress; } @@ -285,7 +381,7 @@ void LongJmpPass::tentativeLayout( ColdAddresses[Func] = DotAddress; if (Func->isSplit()) DotAddress += Func->estimateColdSize(); - tentativeBBLayout(BC, *Func); + tentativeBBLayout(*Func); } return; @@ -308,34 +404,9 @@ void LongJmpPass::tentativeLayout( tentativeLayoutRelocMode(BC, SortedFunctions, DotAddress); } -void LongJmpPass::removeStubRef(const BinaryContext &BC, - BinaryBasicBlock *BB, MCInst &Inst, - BinaryBasicBlock *StubBB, - const MCSymbol *Target, - BinaryBasicBlock *TgtBB) { - BC.MIB->replaceBranchTarget(Inst, Target, BC.Ctx.get()); - - --StubRefCount[StubBB]; - assert(StubRefCount[StubBB] >= 0 && "Ref count is lost"); - - if (TgtBB && BB->isSuccessor(StubBB)) { - const auto &BI = BB->getBranchInfo(*StubBB); - uint64_t OrigCount{BI.Count}; - uint64_t OrigMispreds{BI.MispredictedCount}; - BB->replaceSuccessor(StubBB, TgtBB, OrigCount, OrigMispreds); - } - - if (StubRefCount[StubBB] == 0) { - // Remove the block from CFG - StubBB->removeAllSuccessors(); - StubBB->markValid(false); - StubBB->setEntryPoint(false); - } -} - -bool LongJmpPass::usesStub(const BinaryContext &BC, const BinaryFunction &Func, +bool LongJmpPass::usesStub(const BinaryFunction &Func, const MCInst &Inst) const { - auto TgtSym = BC.MIB->getTargetSymbol(Inst); + auto TgtSym = Func.getBinaryContext().MIB->getTargetSymbol(Inst); auto *TgtBB = Func.getBasicBlockForLabel(TgtSym); auto Iter = Stubs.find(&Func); if (Iter != Stubs.end()) @@ -348,7 +419,7 @@ uint64_t LongJmpPass::getSymbolAddress(const 
BinaryContext &BC, const BinaryBasicBlock *TgtBB) const { if (TgtBB) { auto Iter = BBAddresses.find(TgtBB); - assert (Iter != BBAddresses.end() && "Unrecognized local BB"); + assert (Iter != BBAddresses.end() && "Unrecognized BB"); return Iter->second; } auto *TargetFunc = BC.getFunctionForSymbol(Target); @@ -363,140 +434,193 @@ uint64_t LongJmpPass::getSymbolAddress(const BinaryContext &BC, return Iter->second; } -bool LongJmpPass::removeOrShrinkStubs(const BinaryContext &BC, - BinaryFunction &Func) { +bool LongJmpPass::relaxStub(BinaryBasicBlock &StubBB) { + const BinaryFunction &Func = *StubBB.getFunction(); + const BinaryContext &BC = Func.getBinaryContext(); + const auto Bits = StubBits[&StubBB]; + // Already working with the largest range? + if (Bits == static_cast(BC.AsmInfo->getCodePointerSize() * 8)) + return false; + + const static auto RangeShortJmp = BC.MIB->getShortJmpEncodingSize(); + const static auto RangeSingleInstr = BC.MIB->getUncondBranchEncodingSize(); + const static uint64_t ShortJmpMask = ~((1ULL << RangeShortJmp) - 1); + const static uint64_t SingleInstrMask = + ~((1ULL << (RangeSingleInstr - 1)) - 1); + + auto *RealTargetSym = BC.MIB->getTargetSymbol(*StubBB.begin()); + auto *TgtBB = Func.getBasicBlockForLabel(RealTargetSym); + uint64_t TgtAddress = getSymbolAddress(BC, RealTargetSym, TgtBB); + uint64_t DotAddress = BBAddresses[&StubBB]; + uint64_t PCRelTgtAddress = DotAddress > TgtAddress ? DotAddress - TgtAddress + : TgtAddress - DotAddress; + // If it fits in one instruction, do not relax + if (!(PCRelTgtAddress & SingleInstrMask)) + return false; + + // Fits short jmp + if (!(PCRelTgtAddress & ShortJmpMask)) { + if (Bits >= RangeShortJmp) + return false; + + DEBUG(dbgs() << "Relaxing stub to short jump. 
PCRelTgtAddress = " + << Twine::utohexstr(PCRelTgtAddress) + << " RealTargetSym = " << RealTargetSym->getName() << "\n"); + relaxStubToShortJmp(StubBB, RealTargetSym); + StubBits[&StubBB] = RangeShortJmp; + return true; + } + + // Needs a long jmp + if (Bits > RangeShortJmp) + return false; + + DEBUG(dbgs() << "Relaxing stub to long jump. PCRelTgtAddress = " + << Twine::utohexstr(PCRelTgtAddress) + << " RealTargetSym = " << RealTargetSym->getName() << "\n"); + relaxStubToLongJmp(StubBB, RealTargetSym); + StubBits[&StubBB] = static_cast(BC.AsmInfo->getCodePointerSize() * 8); + return true; +} + +bool LongJmpPass::needsStub(const BinaryBasicBlock &BB, const MCInst &Inst, + uint64_t DotAddress) const { + const BinaryFunction &Func = *BB.getFunction(); + const BinaryContext &BC = Func.getBinaryContext(); + auto TgtSym = BC.MIB->getTargetSymbol(Inst); + assert (TgtSym && "getTargetSymbol failed"); + + auto *TgtBB = Func.getBasicBlockForLabel(TgtSym); + // Check for shared stubs from foreign functions + if (!TgtBB) { + auto SSIter = SharedStubs.find(TgtSym); + if (SSIter != SharedStubs.end()) { + TgtBB = SSIter->second; + } + } + + auto BitsAvail = BC.MIB->getPCRelEncodingSize(Inst) - 1; + uint64_t Mask = ~((1ULL << BitsAvail) - 1); + + uint64_t PCRelTgtAddress = getSymbolAddress(BC, TgtSym, TgtBB); + PCRelTgtAddress = DotAddress > PCRelTgtAddress ? DotAddress - PCRelTgtAddress + : PCRelTgtAddress - DotAddress; + + return PCRelTgtAddress & Mask; +} + +bool LongJmpPass::relax(BinaryFunction &Func) { + const BinaryContext &BC = Func.getBinaryContext(); bool Modified{false}; assert(BC.isAArch64() && "Unsupported arch"); constexpr auto InsnSize = 4; // AArch64 - // Remove unnecessary stubs for branch targets we know we can fit in the + std::vector>> + Insertions; + + BinaryBasicBlock *Frontier = getBBAtHotColdSplitPoint(Func); + uint64_t FrontierAddress = Frontier ? 
BBAddresses[Frontier] : 0; + if (FrontierAddress) { + FrontierAddress += Frontier->getNumNonPseudos() * InsnSize; + } + // Add necessary stubs for branch targets we know we can't fit in the // instruction for (auto &BB : Func) { uint64_t DotAddress = BBAddresses[&BB]; + // Stubs themselves are relaxed on the next loop + if (Stubs[&Func].count(&BB)) + continue; + for (auto &Inst : BB) { - if (!shouldInsertStub(BC, Inst) || !usesStub(BC, Func, Inst)) { + if (BC.MII->get(Inst.getOpcode()).isPseudo()) + continue; + + if (!shouldInsertStub(BC, Inst)) { DotAddress += InsnSize; continue; } - // Compute DoNotChangeTarget annotation, when fixBranches cannot swap - // targets - if (BC.MIB->isConditionalBranch(Inst) && BB.succ_size() == 2) { - auto *SuccBB = BB.getConditionalSuccessor(false); - bool IsStub = false; - auto Iter = Stubs.find(&Func); - if (Iter != Stubs.end()) - IsStub = Iter->second.count(SuccBB); - auto *RealTargetSym = - IsStub ? BC.MIB->getTargetSymbol(*SuccBB->begin()) : nullptr; - if (IsStub) - SuccBB = Func.getBasicBlockForLabel(RealTargetSym); - uint64_t Offset = getSymbolAddress(BC, RealTargetSym, SuccBB); + // Check and relax direct branch or call + if (!needsStub(BB, Inst, DotAddress)) { + DotAddress += InsnSize; + continue; + } + Modified = true; + + // Insert stubs close to the patched BB if call, but far away from the + // hot path if a branch, since this branch target is the cold region + // (but first check that the far away stub will be in range). 
+ BinaryBasicBlock *InsertionPoint = &BB; + if (Func.isSimple() && !BC.MIB->isCall(Inst) && FrontierAddress && + !BB.isCold()) { auto BitsAvail = BC.MIB->getPCRelEncodingSize(Inst) - 1; uint64_t Mask = ~((1ULL << BitsAvail) - 1); - if ((Offset & Mask) && - !BC.MIB->hasAnnotation(Inst, "DoNotChangeTarget")) { - BC.MIB->addAnnotation(Inst, "DoNotChangeTarget", true); - } else if ((!(Offset & Mask)) && - BC.MIB->hasAnnotation(Inst, "DoNotChangeTarget")) { - BC.MIB->removeAnnotation(Inst, "DoNotChangeTarget"); - } + assert(FrontierAddress > DotAddress && + "Hot code should be before the frontier"); + uint64_t PCRelTgt = FrontierAddress - DotAddress; + if (!(PCRelTgt & Mask)) + InsertionPoint = Frontier; } + // Always put stubs at the end of the function if non-simple. We can't + // change the layout of non-simple functions because it has jump tables + // that we do not control. + if (!Func.isSimple()) + InsertionPoint = &*std::prev(Func.end()); - auto StubSym = BC.MIB->getTargetSymbol(Inst); - auto *StubBB = Func.getBasicBlockForLabel(StubSym); - auto *RealTargetSym = BC.MIB->getTargetSymbol(*StubBB->begin()); - auto *TgtBB = Func.getBasicBlockForLabel(RealTargetSym); - auto BitsAvail = BC.MIB->getPCRelEncodingSize(Inst) - 1; - uint64_t Mask = ~((1ULL << BitsAvail) - 1); - uint64_t Offset = getSymbolAddress(BC, RealTargetSym, TgtBB); - if (DotAddress > Offset) - Offset = DotAddress - Offset; - else - Offset -= DotAddress; - // If it fits in the original instr, remove the stub - if (!(Offset & Mask)) { - removeStubRef(BC, &BB, Inst, StubBB, RealTargetSym, TgtBB); - Modified = true; - } - DotAddress += InsnSize; + // Create a stub to handle a far-away target + Insertions.emplace_back(std::make_pair( + InsertionPoint, + replaceTargetWithStub(BB, Inst, DotAddress, + InsertionPoint == Frontier ? 
FrontierAddress + : DotAddress))); } } - auto RangeShortJmp = BC.MIB->getShortJmpEncodingSize(); - auto RangeSingleInstr = BC.MIB->getUncondBranchEncodingSize(); - uint64_t ShortJmpMask = ~((1ULL << RangeShortJmp) - 1); - uint64_t SingleInstrMask = ~((1ULL << (RangeSingleInstr - 1)) - 1); - // Shrink stubs from 64 to 32 or 28 bit whenever possible + // Relax stubs if necessary for (auto &BB : Func) { if (!Stubs[&Func].count(&BB) || !BB.isValid()) continue; - auto Bits = StubBits[&BB]; - // Already working with the tightest range? - if (Bits == RangeSingleInstr) - continue; - - // Attempt to tight to short jmp - auto *RealTargetSym = BC.MIB->getTargetSymbol(*BB.begin()); - auto *TgtBB = Func.getBasicBlockForLabel(RealTargetSym); - uint64_t DotAddress = BBAddresses[&BB]; - uint64_t TgtAddress = getSymbolAddress(BC, RealTargetSym, TgtBB); - if (TgtAddress & ShortJmpMask) - continue; + Modified |= relaxStub(BB); + } - // Attempt to tight to pc-relative single-instr branch - uint64_t PCRelTgtAddress = TgtAddress > DotAddress - ? 
TgtAddress - DotAddress - : DotAddress - TgtAddress; - if (PCRelTgtAddress & SingleInstrMask) { - if (Bits > RangeShortJmp) { - shrinkStubToShortJmp(BC, BB, RealTargetSym); - StubBits[&BB] = RangeShortJmp; - Modified = true; - } + for (auto &Elmt : Insertions) { + if (!Elmt.second) continue; - } - - if (Bits > RangeSingleInstr) { - shrinkStubToSingleInst(BC, BB, RealTargetSym, /*is func?*/!TgtBB); - StubBits[&BB] = RangeSingleInstr; - Modified = true; - } + std::vector> NewBBs; + NewBBs.emplace_back(std::move(Elmt.second)); + Func.insertBasicBlocks(Elmt.first, std::move(NewBBs), true, true); } + return Modified; } void LongJmpPass::runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) { + outs() << "BOLT-INFO: Starting stub-insertion pass\n"; auto Sorted = BinaryContext::getSortedFunctions(BFs); - for (auto Func : Sorted) { - // We are going to remove invalid BBs, so remove any previous marks - for (auto &BB : *Func) { - BB.markValid(true); - } - insertStubs(BC, *Func); - // Don't ruin non-simple functions, they can't afford to have the layout - // changed. - if (Func->isSimple()) - Func->fixBranches(); - } - bool Modified; + uint32_t Iterations{0}; do { + ++Iterations; Modified = false; tentativeLayout(BC, Sorted); + updateStubGroups(); for (auto Func : Sorted) { - if (removeOrShrinkStubs(BC, *Func)) { - Func->eraseInvalidBBs(); + if (relax(*Func)) { + // Don't ruin non-simple functions, they can't afford to have the layout + // changed. if (Func->isSimple()) Func->fixBranches(); Modified = true; } } } while (Modified); + outs() << "BOLT-INFO: Inserted " << NumHotStubs + << " stubs in the hot area and " << NumColdStubs + << " stubs in the cold area. 
Shared " << NumSharedStubs + << " times, iterated " << Iterations << " times.\n"; } - } } diff --git a/bolt/src/Passes/LongJmp.h b/bolt/src/Passes/LongJmp.h index e771b6767076..5bed5a30ac9c 100644 --- a/bolt/src/Passes/LongJmp.h +++ b/bolt/src/Passes/LongJmp.h @@ -21,58 +21,51 @@ namespace bolt { /// pull this pass inside BOLT because here we can do a better job at stub /// inserting by manipulating the CFG, something linkers can't do. /// -/// LongJmp is a two-step process. In the first step, when function sizes are -/// still unknown because we can insert an arbitrary amount of code to reach -/// far-away code, this pass expands all PC-relative instructions that refer to -/// a symbol at an unknown location likely to violate the branch range. +/// We iteratively repeat the following until no modification is done: we do a +/// tentative layout with the current function sizes; then we add stubs for +/// branches that we know are out of range or we expand smaller stubs (28-bit) +/// to a large one if necessary (32 or 64). +/// /// This expansion inserts the equivalent of "linker stubs", small /// blocks of code that load a 64-bit address into a pre-allocated register and // then executes an unconditional indirect branch on this register. By using a /// 64-bit range, we guarantee it can reach any code location. /// -/// In the second step, we iteratively repeat the following until no -/// modification is done: we do a tentative layout with the current function -/// sizes; then we remove stubs for branches that we know are close enough to be -/// encoded in a direct branch or a smaller stub (32-bit). -/// -/// Notice that this iteration is possible since step 2 strictly reduces sizes -/// and distances between branches and their destinations. 
-/// class LongJmpPass : public BinaryFunctionPass { - using StubMapTy = DenseMap>; + /// Used to implement stub grouping (re-using a stub from one function into + /// another) + using StubGroupsTy = + DenseMap, 4>>; + StubGroupsTy HotStubGroups; + StubGroupsTy ColdStubGroups; + DenseMap SharedStubs; + + /// Stubs that are local to a function. This will be the primary lookup + /// before resorting to stubs located in foreign functions. + using StubMapTy = DenseMap; /// Used to quickly fetch stubs based on the target they jump to - StubMapTy HotStubs; - StubMapTy ColdStubs; + StubMapTy HotLocalStubs; + StubMapTy ColdLocalStubs; /// Used to quickly identify whether a BB is a stub, sharded by function DenseMap> Stubs; using FuncAddressesMapTy = DenseMap; - /// Hold tentative addresses during step 2 + /// Hold tentative addresses FuncAddressesMapTy HotAddresses; FuncAddressesMapTy ColdAddresses; DenseMap BBAddresses; - /// Used to remove unused stubs - DenseMap StubRefCount; /// Used to identify the stub size DenseMap StubBits; - /// Replace the target of call or conditional branch in \p Inst with a - /// a stub that in turn will branch to the target (perform stub insertion). - /// If a new stub was created, return it. - std::unique_ptr - replaceTargetWithStub(const BinaryContext &BC, BinaryFunction &BF, - BinaryBasicBlock &BB, MCInst &Inst); - - /// -- Step 1 methods -- - /// Process all functions and insert maximum-size stubs so every branch in the - /// program is encodable without violating relocation ranges (relax all - /// branches). - void insertStubs(const BinaryContext &BC, BinaryFunction &BF); + /// Stats about number of stubs inserted + uint32_t NumHotStubs{0}; + uint32_t NumColdStubs{0}; + uint32_t NumSharedStubs{0}; - /// -- Step 2 methods -- + /// -- Layout estimation methods -- /// Try to do layout before running the emitter, by looking at BinaryFunctions /// and MCInsts -- this is an estimation. 
To be correct for longjmp inserter /// purposes, we need to do a size worst-case estimation. Real layout is done @@ -87,25 +80,67 @@ class LongJmpPass : public BinaryFunctionPass { tentativeLayoutRelocColdPart(const BinaryContext &BC, std::vector &SortedFunctions, uint64_t DotAddress); - void tentativeBBLayout(const BinaryContext &BC, const BinaryFunction &Func); + void tentativeBBLayout(const BinaryFunction &Func); - /// Helper to identify whether \p Inst is branching to a stub - bool usesStub(const BinaryContext &BC, const BinaryFunction &Func, - const MCInst &Inst) const; + /// Update stubs addresses with their exact address after a round of stub + /// insertion and layout estimation is done. + void updateStubGroups(); + + /// -- Relaxation/stub insertion methods -- + /// Creates a new stub jumping to \p TgtSym and updates bookkeeping about + /// this stub using \p AtAddress as its initial location. This location is + /// an approximation and will be later resolved to the exact location in + /// a next iteration, in updateStubGroups. + std::pair, MCSymbol *> + createNewStub(BinaryBasicBlock &SourceBB, const MCSymbol *TgtSym, + bool TgtIsFunc, uint64_t AtAddress); + + /// Replace the target of call or conditional branch in \p Inst with a + /// a stub that in turn will branch to the target (perform stub insertion). + /// If a new stub was created, return it. + std::unique_ptr + replaceTargetWithStub(BinaryBasicBlock &BB, MCInst &Inst, uint64_t DotAddress, + uint64_t StubCreationAddress); + + /// Helper used to fetch the closest stub to \p Inst at \p DotAddress that + /// is jumping to \p TgtSym. Returns nullptr if the closest stub is out of + /// range or if it doesn't exist. The source of truth for stubs will be the + /// map \p StubGroups, which can be either local stubs for a particular + /// function that is very large and needs to group stubs, or can be global + /// stubs if we are sharing stubs across functions. 
+ BinaryBasicBlock *lookupStubFromGroup(const StubGroupsTy &StubGroups, + const BinaryFunction &Func, + const MCInst &Inst, + const MCSymbol *TgtSym, + uint64_t DotAddress) const; + + /// Lookup closest stub from the global pool, meaning this can return a basic + /// block from another function. + BinaryBasicBlock *lookupGlobalStub(const BinaryBasicBlock &SourceBB, + const MCInst &Inst, const MCSymbol *TgtSym, + uint64_t DotAddress) const; + + /// Lookup closest stub local to \p Func. + BinaryBasicBlock *lookupLocalStub(const BinaryBasicBlock &SourceBB, + const MCInst &Inst, const MCSymbol *TgtSym, + uint64_t DotAddress) const; + + /// Helper to identify whether \p Inst is branching to a stub + bool usesStub(const BinaryFunction &Func, const MCInst &Inst) const; + + /// True if Inst is a branch that is out of range + bool needsStub(const BinaryBasicBlock &BB, const MCInst &Inst, + uint64_t DotAddress) const; + + /// Expand the range of the stub in StubBB if necessary + bool relaxStub(BinaryBasicBlock &StubBB); /// Helper to resolve a symbol address according to our tentative layout uint64_t getSymbolAddress(const BinaryContext &BC, const MCSymbol *Target, const BinaryBasicBlock *TgtBB) const; - /// Change \p Inst to do not use a stub anymore, back to its original form - void removeStubRef(const BinaryContext &BC, - BinaryBasicBlock *BB, MCInst &Inst, - BinaryBasicBlock *StubBB, - const MCSymbol *Target, BinaryBasicBlock *TgtBB); - - /// Step 2 main entry point: Iterate through functions reducing stubs size - /// or completely removing them. 
- bool removeOrShrinkStubs(const BinaryContext &BC, BinaryFunction &BF); + /// Relax function by adding necessary stubs or relaxing existing stubs + bool relax(BinaryFunction &BF); public: /// BinaryPass public interface diff --git a/bolt/src/Passes/VeneerElimination.cpp b/bolt/src/Passes/VeneerElimination.cpp index 9dd67694ad00..cf2ab8238f1f 100644 --- a/bolt/src/Passes/VeneerElimination.cpp +++ b/bolt/src/Passes/VeneerElimination.cpp @@ -23,7 +23,7 @@ extern cl::OptionCategory BoltOptCategory; static llvm::cl::opt EliminateVeneers("elim-link-veneers", cl::desc("run veneer elimination pass"), - cl::init(false), + cl::init(true), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 43e18366d236..a512dc44fec5 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -84,6 +84,7 @@ extern cl::OptionCategory AggregatorCategory; extern cl::opt AlignMacroOpFusion; extern cl::opt JumpTables; extern cl::list ReorderData; +extern cl::opt HotFunctionsAtEnd; static cl::opt ForceToDataRelocations("force-data-relocations", @@ -2793,7 +2794,7 @@ void RewriteInstance::emitFunctions() { Streamer->InitSections(false); // Mark beginning of "hot text". - if (BC->HasRelocations && opts::HotText) + if (BC->HasRelocations && opts::HotText && !opts::HotFunctionsAtEnd) Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("__hot_start")); // Sort functions for the output. 
@@ -2811,13 +2812,23 @@ void RewriteInstance::emitFunctions() { uint32_t LastHotIndex = -1u; uint32_t CurrentIndex = 0; - for (auto *BF : SortedFunctions) { - if (!BF->hasValidIndex() && LastHotIndex == -1u) { - LastHotIndex = CurrentIndex; + if (opts::HotFunctionsAtEnd) { + for (auto *BF : SortedFunctions) { + if (BF->hasValidIndex() && LastHotIndex == -1u) { + LastHotIndex = CurrentIndex; + } + assert(LastHotIndex == -1u || BF->hasValidIndex()); + ++CurrentIndex; + } + } else { + for (auto *BF : SortedFunctions) { + if (!BF->hasValidIndex() && LastHotIndex == -1u) { + LastHotIndex = CurrentIndex; + } + assert(LastHotIndex == -1u || !BF->hasValidIndex()); + assert(!BF->hasValidIndex() || CurrentIndex == BF->getIndex()); + ++CurrentIndex; } - assert(LastHotIndex == -1u || !BF->hasValidIndex()); - assert(!BF->hasValidIndex() || CurrentIndex == BF->getIndex()); - ++CurrentIndex; } CurrentIndex = 0; DEBUG(dbgs() << "BOLT-DEBUG: LastHotIndex = " << LastHotIndex << "\n"); @@ -2833,11 +2844,10 @@ void RewriteInstance::emitFunctions() { if (BC->HasRelocations && !ColdFunctionSeen && CurrentIndex >= LastHotIndex) { // Mark the end of "hot" stuff. 
- if (opts::HotText) { + if (opts::HotText && !opts::HotFunctionsAtEnd) { Streamer->SwitchSection(BC->MOFI->getTextSection()); Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("__hot_end")); } - ColdFunctionSeen = true; // Emit injected functions hot part @@ -2858,6 +2868,12 @@ void RewriteInstance::emitFunctions() { } } DEBUG(dbgs() << "BOLT-DEBUG: first cold function: " << Function << '\n'); + + if (opts::HotText && opts::HotFunctionsAtEnd) { + Streamer->SwitchSection(BC->MOFI->getTextSection()); + Streamer->EmitCodeAlignment(BC->PageAlign); + Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("__hot_start")); + } } if (!BC->HasRelocations && @@ -2886,7 +2902,7 @@ void RewriteInstance::emitFunctions() { } } - if (!ColdFunctionSeen && opts::HotText) { + if ((!ColdFunctionSeen || opts::HotFunctionsAtEnd) && opts::HotText) { Streamer->SwitchSection(BC->MOFI->getTextSection()); Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("__hot_end")); } diff --git a/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp index 02cb252e0b8d..f45556cd85f1 100644 --- a/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/src/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -697,7 +697,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return replaceBranchTarget(Inst, TBB, Ctx); } - int getPCRelEncodingSize(MCInst &Inst) const override { + int getPCRelEncodingSize(const MCInst &Inst) const override { switch (Inst.getOpcode()) { default: llvm_unreachable("Failed to get pcrel encoding size"); From 83400bf1da33928cac907e36ee7d70679e96bf21 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 21 Sep 2018 12:00:20 -0700 Subject: [PATCH 486/904] [BOLT] Support relocations without symbols Summary: lld may generate relocations without associated symbols. Instead of rejecting binaries with such relocations, we can re-create the symbol the relocation is against based on the extracted value. 
(cherry picked from commit 61d7ee7c32ca1821935b8b285f632c6ab5a8646c) --- bolt/src/BinaryContext.cpp | 4 +- bolt/src/BinaryContext.h | 4 +- bolt/src/BinaryFunction.cpp | 4 +- bolt/src/BinaryFunction.h | 4 +- bolt/src/Exceptions.cpp | 4 +- bolt/src/RewriteInstance.cpp | 133 ++++++++++++++++++----------------- bolt/src/RewriteInstance.h | 1 + 7 files changed, 78 insertions(+), 76 deletions(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index 5a21a812b576..9751a951e6df 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -228,9 +228,9 @@ BinaryContext::getSubBinaryData(BinaryData *BD) { } MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, + Twine Prefix, uint64_t Size, uint16_t Alignment, - Twine Prefix, unsigned Flags) { auto Itr = BinaryDataMap.find(Address); if (Itr != BinaryDataMap.end()) { @@ -536,7 +536,7 @@ void BinaryContext::fixBinaryDataHoles() { if (BD->getSection() == Section) setBinaryDataSize(Hole.first, Hole.second); } else { - getOrCreateGlobalSymbol(Hole.first, Hole.second, 1, "HOLEat"); + getOrCreateGlobalSymbol(Hole.first, "HOLEat", Hole.second, 1); } } } diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index be5b581aa72d..e705c8e28b29 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -386,9 +386,9 @@ class BinaryContext { /// If there are multiple symbols registered at the \p Address, then /// return the first one. MCSymbol *getOrCreateGlobalSymbol(uint64_t Address, - uint64_t Size, - uint16_t Alignment, Twine Prefix, + uint64_t Size = 0, + uint16_t Alignment = 0, unsigned Flags = 0); /// Register a symbol with \p Name at a given \p Address and \p Size. 
diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 48bbc3620160..6891d299fb04 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -1046,7 +1046,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { } // TODO: use DWARF info to get size/alignment here? auto *TargetSymbol = - BC.getOrCreateGlobalSymbol(TargetAddress, 0, 0, "DATAat"); + BC.getOrCreateGlobalSymbol(TargetAddress, "DATAat"); DEBUG(if (opts::Verbosity >= 2) { auto SectionName = BD ? BD->getSectionName() : ""; dbgs() << "Created DATAat sym: " << TargetSymbol->getName() @@ -1301,7 +1301,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { } TargetSymbol = - BC.getOrCreateGlobalSymbol(TargetAddress, 0, 0, "FUNCat"); + BC.getOrCreateGlobalSymbol(TargetAddress, "FUNCat"); if (TargetAddress == 0) { // We actually see calls to address 0 in presence of weak // symbols originating from libraries. This code is never meant diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index b4ae1359b992..f52bf00d3d29 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -1608,7 +1608,7 @@ class BinaryFunction { BinaryFunction &setPersonalityFunction(uint64_t Addr) { assert(!PersonalityFunction && "can't set personality function twice"); - PersonalityFunction = BC.getOrCreateGlobalSymbol(Addr, 0, 0, "FUNCat"); + PersonalityFunction = BC.getOrCreateGlobalSymbol(Addr, "FUNCat"); return *this; } @@ -1762,7 +1762,7 @@ class BinaryFunction { return nullptr; // Register our island at global namespace - Symbol = BC.getOrCreateGlobalSymbol(Address, 0, 0, "ISLANDat"); + Symbol = BC.getOrCreateGlobalSymbol(Address, "ISLANDat"); // Internal bookkeeping const auto Offset = Address - getAddress(); assert((!IslandOffsets.count(Offset) || IslandOffsets[Offset] == Symbol) && diff --git a/bolt/src/Exceptions.cpp b/bolt/src/Exceptions.cpp index 5277aea99d83..b2024164a38d 100644 --- a/bolt/src/Exceptions.cpp +++ b/bolt/src/Exceptions.cpp @@ 
-637,9 +637,9 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer, bool EmitColdPart) { if (TypeAddress) { const auto *TypeSymbol = BC.getOrCreateGlobalSymbol(TypeAddress, + "TI", TTypeEncodingSize, - TTypeAlignment, - "TI"); + TTypeAlignment); auto *DotSymbol = BC.Ctx->createTempSymbol(); Streamer->EmitLabel(DotSymbol); const auto *SubDotExpr = MCBinaryExpr::createSub( diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index a512dc44fec5..655aabfdf5f9 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -1695,7 +1695,7 @@ void RewriteInstance::relocateEHFrameSection() { if (!Symbol) { DEBUG(dbgs() << "BOLT-DEBUG: creating symbol for DWARF reference at 0x" << Twine::utohexstr(Value) << '\n'); - Symbol = BC->getOrCreateGlobalSymbol(Value, 0, 0, "FUNCat"); + Symbol = BC->getOrCreateGlobalSymbol(Value, "FUNCat"); } DEBUG(dbgs() << "BOLT-DEBUG: adding DWARF reference against symbol " @@ -1806,8 +1806,7 @@ void RewriteInstance::readSpecialSections() { } void RewriteInstance::adjustCommandLineOptions() { - if (BC->isAArch64() && opts::RelocationMode != cl::BOU_TRUE && - !opts::AggregateOnly) { + if (BC->isAArch64() && !BC->HasRelocations) { errs() << "BOLT-WARNING: non-relocation mode for AArch64 is not fully " "supported\n"; } @@ -1874,6 +1873,7 @@ int64_t getRelocationAddend(const ELFObjectFileBase *Obj, bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, SectionRef RelocatedSection, std::string &SymbolName, + bool &IsSectionRelocation, uint64_t &SymbolAddress, int64_t &Addend, uint64_t &ExtractedValue) const { @@ -1882,45 +1882,68 @@ bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, const bool IsAArch64 = BC->isAArch64(); - // For value extraction. + // Extract the value. 
StringRef RelocatedSectionContents; RelocatedSection.getContents(RelocatedSectionContents); DataExtractor DE(RelocatedSectionContents, BC->AsmInfo->isLittleEndian(), BC->AsmInfo->getCodePointerSize()); - - const bool IsPCRelative = Relocation::isPCRelative(Rel.getType()); - auto SymbolIter = Rel.getSymbol(); - assert(SymbolIter != InputFile->symbol_end() && - "relocation symbol must exist"); - const auto &Symbol = *SymbolIter; - SymbolName = cantFail(Symbol.getName()); - SymbolAddress = cantFail(Symbol.getAddress()); - Addend = getRelocationAddend(InputFile, Rel); - - uint32_t RelocationOffset = - Rel.getOffset() - RelocatedSection.getAddress(); + uint32_t RelocationOffset = Rel.getOffset() - RelocatedSection.getAddress(); const auto RelSize = Relocation::getSizeForType(Rel.getType()); - ExtractedValue = - static_cast(DE.getSigned(&RelocationOffset, RelSize)); - + ExtractedValue = static_cast(DE.getSigned(&RelocationOffset, + RelSize)); if (IsAArch64) { ExtractedValue = Relocation::extractValue(Rel.getType(), ExtractedValue, Rel.getOffset()); } - // Section symbols are marked as ST_Debug. - const bool SymbolIsSection = - (cantFail(Symbol.getType()) == SymbolRef::ST_Debug); + Addend = getRelocationAddend(InputFile, Rel); + + const bool IsPCRelative = Relocation::isPCRelative(Rel.getType()); const auto PCRelOffset = IsPCRelative && !IsAArch64 ? Rel.getOffset() : 0; + bool SkipVerification = false; + auto SymbolIter = Rel.getSymbol(); + if (SymbolIter == InputFile->symbol_end()) { + SymbolAddress = ExtractedValue - Addend; + if (IsPCRelative) + SymbolAddress += PCRelOffset; + auto *RelSymbol = BC->getOrCreateGlobalSymbol(SymbolAddress, "RELSYMat"); + SymbolName = RelSymbol->getName(); + IsSectionRelocation = false; + } else { + const auto &Symbol = *SymbolIter; + SymbolName = cantFail(Symbol.getName()); + SymbolAddress = cantFail(Symbol.getAddress()); + SkipVerification = (cantFail(Symbol.getType()) == SymbolRef::ST_Other); + // Section symbols are marked as ST_Debug. 
+ IsSectionRelocation = (cantFail(Symbol.getType()) == SymbolRef::ST_Debug); + if (IsSectionRelocation) { + auto Section = Symbol.getSection(); + if (Section && *Section != InputFile->section_end()) { + SymbolName = "section " + std::string(getSectionName(**Section)); + if (!IsAArch64) { + assert(SymbolAddress == (*Section)->getAddress() && + "section symbol address must be the same as section address"); + // Convert section symbol relocations to regular relocations inside + // non-section symbols. + if (IsPCRelative) { + Addend = ExtractedValue - (SymbolAddress - PCRelOffset); + } else { + SymbolAddress = ExtractedValue; + Addend = 0; + } + } + } + } + } // If no symbol has been found or if it is a relocation requiring the // creation of a GOT entry, do not link against the symbol but against // whatever address was extracted from the instruction itself. We are // not creating a GOT entry as this was already processed by the linker. if (!SymbolAddress || Relocation::isGOT(Rel.getType())) { - assert(!SymbolIsSection); + assert(!IsSectionRelocation); if (ExtractedValue) { SymbolAddress = ExtractedValue - Addend + PCRelOffset; } else { @@ -1935,26 +1958,12 @@ bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, SymbolAddress += PCRelOffset; return true; } - } else if (SymbolIsSection) { - auto Section = Symbol.getSection(); - if (Section && *Section != InputFile->section_end()) { - SymbolName = "section " + std::string(getSectionName(**Section)); - if (!IsAArch64) { - assert(SymbolAddress == (*Section)->getAddress() && - "section symbol address must be the same as section address"); - // Convert section symbol relocations to regular relocations inside - // non-section symbols. 
- if (IsPCRelative) { - Addend = ExtractedValue - (SymbolAddress - PCRelOffset); - } else { - SymbolAddress = ExtractedValue; - Addend = 0; - } - } - } } auto verifyExtractedValue = [&]() { + if (SkipVerification) + return true; + if (IsAArch64) return true; @@ -1964,9 +1973,6 @@ bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, if (Relocation::isTLS(Rel.getType())) return true; - if (cantFail(Symbol.getType()) == SymbolRef::ST_Other) - return true; - return truncateToSize(ExtractedValue, RelSize) == truncateToSize(SymbolAddress + Addend - PCRelOffset, RelSize); }; @@ -1981,7 +1987,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { Section.getName(SectionName); DEBUG(dbgs() << "BOLT-DEBUG: relocations for section " << SectionName << ":\n"); - if (ELFSectionRef(Section).getFlags() & ELF::SHF_ALLOC) { + if (BinarySection(*BC, Section).isAllocatable()) { DEBUG(dbgs() << "BOLT-DEBUG: ignoring runtime relocations\n"); return; } @@ -1994,7 +2000,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { DEBUG(dbgs() << "BOLT-DEBUG: relocated section is " << RelocatedSectionName << '\n'); - if (!(ELFSectionRef(RelocatedSection).getFlags() & ELF::SHF_ALLOC)) { + if (!BinarySection(*BC, RelocatedSection).isAllocatable()) { DEBUG(dbgs() << "BOLT-DEBUG: ignoring relocations against " << "non-allocatable section\n"); return; @@ -2046,17 +2052,18 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { uint64_t SymbolAddress; int64_t Addend; uint64_t ExtractedValue; - + bool IsSectionRelocation; if (!analyzeRelocation(Rel, RelocatedSection, SymbolName, + IsSectionRelocation, SymbolAddress, Addend, ExtractedValue)) { DEBUG(dbgs() << "BOLT-DEBUG: skipping relocation @ offset = 0x" - << Twine::utohexstr(Rel.getOffset()) - << "; type name = " << TypeName - << '\n'); + << Twine::utohexstr(Rel.getOffset()) + << "; type name = " << TypeName + << '\n'); continue; } @@ -2117,7 +2124,6 @@ void 
RewriteInstance::readRelocations(const SectionRef &Section) { if (BC->isAArch64() && Rel.getType() == ELF::R_AARCH64_ADR_GOT_PAGE) ForceRelocation = true; - // TODO: RefSection should be the same as **Rel.getSymbol().getSection() auto RefSection = BC->getSectionForAddress(SymbolAddress); if (!RefSection && !ForceRelocation) { DEBUG(dbgs() << "BOLT-DEBUG: cannot determine referenced section.\n"); @@ -2125,8 +2131,6 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { } const bool IsToCode = RefSection && RefSection->isText(); - const bool IsSectionRelocation = - (cantFail(Rel.getSymbol()->getType()) == SymbolRef::ST_Debug); // Occasionally we may see a reference past the last byte of the function // typically as a result of __builtin_unreachable(). Check it here. @@ -2298,7 +2302,6 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { SymbolAddress = BD->getAddress(); assert(Address == SymbolAddress + Addend); } else { - auto Symbol = *Rel.getSymbol(); // These are mostly local data symbols but undefined symbols // in relocation sections can get through here too, from .plt. assert((IsAArch64 || @@ -2306,14 +2309,18 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { BC->getSectionNameForAddress(SymbolAddress)->startswith(".plt")) && "known symbols should not resolve to anonymous locals"); - const uint64_t SymbolSize = - IsAArch64 ? 0 : ELFSymbolRef(Symbol).getSize(); - const uint64_t SymbolAlignment = IsAArch64 ? 1 : Symbol.getAlignment(); - const unsigned SymbolFlags = Symbol.getFlags(); - - if (!IsSectionRelocation) { + if (IsSectionRelocation) { + ReferencedSymbol = BC->getOrCreateGlobalSymbol(SymbolAddress, + "SYMBOLat"); + } else { + auto Symbol = *Rel.getSymbol(); + const uint64_t SymbolSize = + IsAArch64 ? 0 : ELFSymbolRef(Symbol).getSize(); + const uint64_t SymbolAlignment = + IsAArch64 ? 
1 : Symbol.getAlignment(); + const auto SymbolFlags = Symbol.getFlags(); std::string Name; - if (Symbol.getFlags() & SymbolRef::SF_Global) { + if (SymbolFlags & SymbolRef::SF_Global) { Name = SymbolName; } else { Name = uniquifyName(*BC, StringRef(SymbolName).startswith( @@ -2326,12 +2333,6 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { SymbolSize, SymbolAlignment, SymbolFlags); - } else { - ReferencedSymbol = BC->getOrCreateGlobalSymbol(SymbolAddress, - SymbolSize, - SymbolAlignment, - "SYMBOLat", - SymbolFlags); } if (!opts::AllowSectionRelocations && IsSectionRelocation) { diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index fb0f4b42f038..43f72044e8de 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -264,6 +264,7 @@ class RewriteInstance { bool analyzeRelocation(const RelocationRef &Rel, SectionRef RelocatedSection, std::string &SymbolName, + bool &IsSectionRelocation, uint64_t &SymbolAddress, int64_t &Addend, uint64_t &ExtractedValue) const; From de0f777d91169a6556562c12b2885891750d7979 Mon Sep 17 00:00:00 2001 From: Igor Sugak Date: Mon, 17 Sep 2018 12:17:33 -0700 Subject: [PATCH 487/904] [BOLT] fix build with gcc-4.8.5 Summary: These are two minor changes to make it copatible with gcc-4.8.5 (cherry picked from commit 42c71d056cd549e64350703b9ccbbdeb8ca66071) --- bolt/src/BinaryBasicBlock.h | 4 +++- bolt/src/DataAggregator.cpp | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/bolt/src/BinaryBasicBlock.h b/bolt/src/BinaryBasicBlock.h index 84010150b5ae..86b8dd1f0573 100644 --- a/bolt/src/BinaryBasicBlock.h +++ b/bolt/src/BinaryBasicBlock.h @@ -736,7 +736,9 @@ class BinaryBasicBlock { adjustNumPseudos(*II, -1); adjustNumPseudos(Begin, End, 1); - return Instructions.insert(Instructions.erase(II), Begin, End); + auto i = II - Instructions.begin(); + Instructions.insert(Instructions.erase(II), Begin, End); + return i + Instructions.begin(); } iterator 
replaceInstruction(iterator II, diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index d5007fdc1ff2..3dfcde6020a0 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -903,7 +903,7 @@ ErrorOr DataAggregator::parseAggregatedLBREntry() { auto TypeOrErr = parseString(FieldSeparator); if (std::error_code EC = TypeOrErr.getError()) return EC; - auto Type{AggregatedLBREntry::BRANCH}; + auto Type = AggregatedLBREntry::BRANCH; if (TypeOrErr.get() == "B") { Type = AggregatedLBREntry::BRANCH; } else if (TypeOrErr.get() == "F") { From 4a9ea3f1448c93b83b877e26e3fc634fc74d3c5f Mon Sep 17 00:00:00 2001 From: Igor Sugak Date: Mon, 1 Oct 2018 16:22:46 -0700 Subject: [PATCH 488/904] [BOLT] D9884971 follow up Summary: as titled (cherry picked from commit 735a39df4c4218c57790a9aa2b0efa211811a29e) --- bolt/src/BinaryBasicBlock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/src/BinaryBasicBlock.h b/bolt/src/BinaryBasicBlock.h index 86b8dd1f0573..7a60cc5e27e9 100644 --- a/bolt/src/BinaryBasicBlock.h +++ b/bolt/src/BinaryBasicBlock.h @@ -736,9 +736,9 @@ class BinaryBasicBlock { adjustNumPseudos(*II, -1); adjustNumPseudos(Begin, End, 1); - auto i = II - Instructions.begin(); + auto I = II - Instructions.begin(); Instructions.insert(Instructions.erase(II), Begin, End); - return i + Instructions.begin(); + return I + Instructions.begin(); } iterator replaceInstruction(iterator II, From 6c7cf30eb58415e111e888ba71c65b763497f654 Mon Sep 17 00:00:00 2001 From: Facebook Github Bot Date: Thu, 4 Oct 2018 10:46:16 -0700 Subject: [PATCH 489/904] [BOLT][PR] Fix compiler warnings in BinaryContext and RegAnalysis Summary: This pull request fixes two compiler warnings: - missing `break;` in a switch-case statement in RegAnalysis.cpp (-Wimplicit-fallthrough warning) - misleading indentation in BinaryContext.cpp (-Wmisleading-indentation warning) Pull Request resolved: https://github.com/facebookincubator/BOLT/pull/39 GitHub 
Author: Andreas Ziegler (cherry picked from commit 656b5beed0f467d9534c4d668c8062181c3fa113) --- bolt/src/BinaryContext.cpp | 6 +++--- bolt/src/Passes/RegAnalysis.cpp | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index 9751a951e6df..4359f074abde 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -872,11 +872,11 @@ void BinaryContext::printInstruction(raw_ostream &OS, OS << " # TAILCALL "; if (MIB->isInvoke(Instruction)) { if (const auto EHInfo = MIB->getEHInfo(Instruction)) { - OS << " # handler: "; + OS << " # handler: "; if (EHInfo->first) OS << *EHInfo->first; - else - OS << '0'; + else + OS << '0'; OS << "; action: " << EHInfo->second; } auto GnuArgsSize = MIB->getGnuArgsSize(Instruction); diff --git a/bolt/src/Passes/RegAnalysis.cpp b/bolt/src/Passes/RegAnalysis.cpp index 14ef48b57c25..5a52fae8a2ee 100644 --- a/bolt/src/Passes/RegAnalysis.cpp +++ b/bolt/src/Passes/RegAnalysis.cpp @@ -105,6 +105,7 @@ void RegAnalysis::beConservative(BitVector &Result) const { BC.MIB->getCalleeSavedRegs(BV); BV.flip(); Result |= BV; + break; } case ConservativeStrategy::CLOBBERS_NONE: Result.reset(); From b02af998bbdb2c802af66545e2815770d1bf4c08 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Thu, 11 Oct 2018 18:12:09 -0700 Subject: [PATCH 490/904] Fix bug in analyzeRelocation for GOT entries Summary: Special case GOT relocs to ignore addend subtracting logic in analyzeRelocation, since the addend does not refer to the target of the instruction being analyzed. Also make the code honor the comments in the special case about zeroed out ExtractValue but non-zero addend. 
Fix #40 (cherry picked from commit df4cd19692329f378dfa138c5751f885f7774d9b) --- bolt/src/RewriteInstance.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 655aabfdf5f9..ee5c81ab6a88 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -1942,9 +1942,15 @@ bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, // creation of a GOT entry, do not link against the symbol but against // whatever address was extracted from the instruction itself. We are // not creating a GOT entry as this was already processed by the linker. - if (!SymbolAddress || Relocation::isGOT(Rel.getType())) { + // For GOT relocs, do not subtract addend as the addend does not refer + // to this instruction's target, but it refers to the target in the GOT + // entry. + if (Relocation::isGOT(Rel.getType())) { + Addend = 0; + SymbolAddress = ExtractedValue + PCRelOffset; + } else if (!SymbolAddress) { assert(!IsSectionRelocation); - if (ExtractedValue) { + if (ExtractedValue || Addend == 0 || IsPCRelative) { SymbolAddress = ExtractedValue - Addend + PCRelOffset; } else { // This is weird case. The extracted value is zero but the addend is @@ -1955,7 +1961,6 @@ bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, << Twine::utohexstr(Rel.getOffset()) << " value does not match addend for " << "relocation to undefined symbol.\n"); - SymbolAddress += PCRelOffset; return true; } } From 21033d1985b4fd6c98c16263fa8d106fdd724967 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 2 Oct 2018 17:16:26 -0700 Subject: [PATCH 491/904] [perf2bolt] Pre-aggregate LBR samples Summary: Pre-aggregating LBR data cuts pef2bolt processing times in half. 
(cherry picked from commit c8a03242ad9e6c235da6ddf23a2082dfbfb8f974) --- bolt/src/BinaryContext.h | 8 ++++ bolt/src/DataAggregator.cpp | 88 ++++++++++++++++++++++++++++++++++- bolt/src/ProfileWriter.cpp | 2 + bolt/src/ProfileYAMLMapping.h | 18 +++++++ bolt/src/RewriteInstance.cpp | 19 ++++---- 5 files changed, 122 insertions(+), 13 deletions(-) diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index e705c8e28b29..7ce5a4a39858 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -278,6 +278,9 @@ class BinaryContext { uint64_t MissedMacroFusionPairs{0}; uint64_t MissedMacroFusionExecCount{0}; + // Address of the first allocated segment. + uint64_t FirstAllocAddress{std::numeric_limits::max()}; + /// Track next available address for new allocatable sections. RewriteInstance /// sets this prior to running BOLT passes, so layout passes are aware of the /// final addresses functions will have. @@ -568,6 +571,11 @@ class BinaryContext { Sections.end())); } + /// Check if the address belongs to this binary's static allocation space. + bool containsAddress(uint64_t Address) const { + return Address >= FirstAllocAddress && Address < LayoutStartAddress; + } + /// Return section name containing the given \p Address. 
ErrorOr getSectionNameForAddress(uint64_t Address) const; diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index 3dfcde6020a0..4265b45c4041 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -24,6 +24,7 @@ #include "llvm/Support/Regex.h" #include "llvm/Support/Timer.h" #include +#include #include @@ -550,6 +551,9 @@ bool DataAggregator::aggregate(BinaryContext &BC, BinaryFunction * DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) { + if (!BC->containsAddress(Address)) + return nullptr; + auto FI = BFs->upper_bound(Address); if (FI == BFs->begin()) return nullptr; @@ -964,6 +968,37 @@ std::error_code DataAggregator::parseBranchEvents() { uint64_t NumEntries{0}; uint64_t NumSamples{0}; uint64_t NumTraces{0}; + + struct Location { + uint64_t From; + uint64_t To; + Location(uint64_t From, uint64_t To) + : From(From), To(To) {} + bool operator==(const Location &Other) const { + return From == Other.From && To == Other.To; + } + }; + + struct LocationHash { + size_t operator()(const Location &L) const { + return std::hash()(L.From << 32 | L.To); + } + }; + + struct TraceInfo { + uint64_t InternCount{0}; + uint64_t ExternCount{0}; + }; + + struct BranchInfo { + uint64_t TakenCount{0}; + uint64_t MispredCount{0}; + }; + + /// Map location to counters. + std::unordered_map BranchLBRs; + std::unordered_map FallthroughLBRs; + while (hasData()) { auto SampleRes = parseBranchSample(); if (std::error_code EC = SampleRes.getError()) @@ -981,13 +1016,62 @@ std::error_code DataAggregator::parseBranchEvents() { const LBREntry *NextLBR{nullptr}; for (const auto &LBR : Sample.LBR) { if (NextLBR) { - doTrace(LBR, *NextLBR); + // Record fall-through trace. 
+ const auto TraceFrom = LBR.To; + const auto TraceTo = NextLBR->From; + const auto *TraceBF = getBinaryFunctionContainingAddress(TraceFrom); + if (TraceBF && TraceBF->containsAddress(TraceTo)) { + auto &Info = FallthroughLBRs[Location(TraceFrom, TraceTo)]; + if (TraceBF->containsAddress(LBR.From)) { + ++Info.InternCount; + } else { + ++Info.ExternCount; + } + } else { + if (TraceBF && getBinaryFunctionContainingAddress(TraceTo)) { + ++NumInvalidTraces; + } else { + ++NumLongRangeTraces; + } + } ++NumTraces; } - doBranch(LBR.From, LBR.To, 1, LBR.Mispred); NextLBR = &LBR; + + auto From = LBR.From; + if (!getBinaryFunctionContainingAddress(From)) + From = 0; + auto To = LBR.To; + if (!getBinaryFunctionContainingAddress(To)) + To = 0; + if (!From && !To) + continue; + auto &Info = BranchLBRs[Location(From, To)]; + ++Info.TakenCount; + Info.MispredCount += LBR.Mispred; } } + + for (const auto &AggrLBR : FallthroughLBRs) { + auto &Loc = AggrLBR.first; + auto &Info = AggrLBR.second; + LBREntry First{Loc.From, Loc.From, false}; + LBREntry Second{Loc.To, Loc.To, false}; + if (Info.InternCount) { + doTrace(First, Second, Info.InternCount); + } + if (Info.ExternCount) { + First.From = 0; + doTrace(First, Second, Info.ExternCount); + } + } + + for (const auto &AggrLBR : BranchLBRs) { + auto &Loc = AggrLBR.first; + auto &Info = AggrLBR.second; + doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount); + } + outs() << "PERF2BOLT: Read " << NumSamples << " samples and " << NumEntries << " LBR entries\n"; outs() << "PERF2BOLT: Traces mismatching disassembled function contents: " diff --git a/bolt/src/ProfileWriter.cpp b/bolt/src/ProfileWriter.cpp index 0f3138648736..00cc32bbfb6d 100644 --- a/bolt/src/ProfileWriter.cpp +++ b/bolt/src/ProfileWriter.cpp @@ -124,6 +124,8 @@ convert(const BinaryFunction &BF, yaml::bolt::BinaryFunctionProfile &YamlBF) { } } + std::sort(YamlBB.CallSites.begin(), YamlBB.CallSites.end()); + // Skip printing if there's no profile data for non-entry 
basic block. // Include landing pads with non-zero execution count. if (YamlBB.CallSites.empty() && diff --git a/bolt/src/ProfileYAMLMapping.h b/bolt/src/ProfileYAMLMapping.h index 46503b73180f..0fd3229b2a63 100644 --- a/bolt/src/ProfileYAMLMapping.h +++ b/bolt/src/ProfileYAMLMapping.h @@ -37,9 +37,27 @@ struct CallSiteInfo { DestId == Other.DestId && EntryDiscriminator == Other.EntryDiscriminator; } + bool operator!=(const CallSiteInfo &Other) const { return !(*this == Other); } + + bool operator<(const CallSiteInfo &Other) const { + if (Offset < Other.Offset) + return true; + if (Offset > Other.Offset) + return false; + + if (DestId < Other.DestId) + return true; + if (DestId > Other.DestId) + return false; + + if (EntryDiscriminator < Other.EntryDiscriminator) + return true; + + return false; + } }; } // end namespace bolt diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index ee5c81ab6a88..cee4f78d880c 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -810,16 +810,13 @@ void RewriteInstance::discoverStorage() { EntryPoint = Obj->getHeader()->e_entry; - // This is where the first segment and ELF header were allocated. 
- uint64_t FirstAllocAddress = std::numeric_limits::max(); - NextAvailableAddress = 0; uint64_t NextAvailableOffset = 0; auto PHs = cantFail(Obj->program_headers(), "program_headers() failed"); for (const auto &Phdr : PHs) { if (Phdr.p_type == ELF::PT_LOAD) { - FirstAllocAddress = std::min(FirstAllocAddress, - static_cast(Phdr.p_vaddr)); + BC->FirstAllocAddress = std::min(BC->FirstAllocAddress, + static_cast(Phdr.p_vaddr)); NextAvailableAddress = std::max(NextAvailableAddress, Phdr.p_vaddr + Phdr.p_memsz); NextAvailableOffset = std::max(NextAvailableOffset, @@ -856,7 +853,7 @@ void RewriteInstance::discoverStorage() { "no PT_LOAD pheader seen"); outs() << "BOLT-INFO: first alloc address is 0x" - << Twine::utohexstr(FirstAllocAddress) << '\n'; + << Twine::utohexstr(BC->FirstAllocAddress) << '\n'; FirstNonAllocatableOffset = NextAvailableOffset; @@ -874,13 +871,13 @@ void RewriteInstance::discoverStorage() { // // NB: bfd's strip command cannot do the above and will corrupt the // binary during the process of stripping non-allocatable sections. 
- if (NextAvailableOffset <= NextAvailableAddress - FirstAllocAddress) { - NextAvailableOffset = NextAvailableAddress - FirstAllocAddress; + if (NextAvailableOffset <= NextAvailableAddress - BC->FirstAllocAddress) { + NextAvailableOffset = NextAvailableAddress - BC->FirstAllocAddress; } else { - NextAvailableAddress = NextAvailableOffset + FirstAllocAddress; + NextAvailableAddress = NextAvailableOffset + BC->FirstAllocAddress; } - assert(NextAvailableOffset == NextAvailableAddress - FirstAllocAddress && - "PHDR table address calculation error"); + assert(NextAvailableOffset == NextAvailableAddress - BC->FirstAllocAddress + && "PHDR table address calculation error"); outs() << "BOLT-INFO: creating new program header table at address 0x" << Twine::utohexstr(NextAvailableAddress) << ", offset 0x" From 66b8ab99df56b469050737aff8dbd3f5d108c556 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 22 Oct 2018 18:48:12 -0700 Subject: [PATCH 492/904] [BOLT] Update local symbol count in symbol table Summary: Fix sh_info entry for symbol table section to reflect updated number of local symbols. 
(cherry picked from commit 50c59af6f19b5618714a6fb36f1b5825e3cc6674) --- bolt/src/RewriteInstance.cpp | 46 +++++++++++++++++++----------------- bolt/src/RewriteInstance.h | 3 +++ 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index cee4f78d880c..74c8d0803c94 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -3873,6 +3873,10 @@ std::vector RewriteInstance::getOutputSections( NewSection.sh_size = BSec->getOutputSize(); NewSection.sh_name = SHStrTab.getOffset(SectionName); + if (NewSection.sh_type == ELF::SHT_SYMTAB) { + NewSection.sh_info = NumLocalSymbols; + } + OutputSections->emplace_back(NewSection); LastFileOffset = BSec->getFileOffset(); @@ -4005,8 +4009,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { auto updateSymbolTable = [&](bool PatchExisting, const Elf_Shdr *Section, - std::function - Write, + std::function Write, std::function AddToStrTab) { auto StringSection = cantFail(Obj->getStringTableForSymtab(*Section)); unsigned IsHotTextUpdated = 0; @@ -4029,7 +4032,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { NewSymbol.st_size = Function->getOutputSize(); NewSymbol.st_other = 0; NewSymbol.setBindingAndType(ELF::STB_LOCAL, ELF::STT_FUNC); - Write(0, reinterpret_cast(&NewSymbol), sizeof(NewSymbol)); + Write(0, NewSymbol); if (Function->isSplit()) { auto NewColdSym = NewSymbol; @@ -4039,8 +4042,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { Twine(Function->getPrintName()).concat(".cold.0").toStringRef(Buf)); NewColdSym.st_value = Function->cold().getAddress(); NewColdSym.st_size = Function->cold().getImageSize(); - Write(0, reinterpret_cast(&NewColdSym), - sizeof(NewColdSym)); + Write(0, NewColdSym); } } @@ -4067,8 +4069,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { .toStringRef(Buf)); NewColdSym.st_value = Function->cold().getAddress(); NewColdSym.st_size = 
Function->cold().getImageSize(); - Write(0, reinterpret_cast(&NewColdSym), - sizeof(NewColdSym)); + Write(0, NewColdSym); } if (!PatchExisting && Function->hasConstantIsland()) { auto DataMark = Function->getOutputDataAddress(); @@ -4083,10 +4084,8 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { auto CodeMarkSym = DataMarkSym; CodeMarkSym.st_name = AddToStrTab("$x"); CodeMarkSym.st_value = CodeMark; - Write(0, reinterpret_cast(&DataMarkSym), - sizeof(DataMarkSym)); - Write(0, reinterpret_cast(&CodeMarkSym), - sizeof(CodeMarkSym)); + Write(0, DataMarkSym); + Write(0, CodeMarkSym); } if (!PatchExisting && Function->hasConstantIsland() && Function->isSplit()) { @@ -4102,10 +4101,8 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { auto CodeMarkSym = DataMarkSym; CodeMarkSym.st_name = AddToStrTab("$x"); CodeMarkSym.st_value = CodeMark; - Write(0, reinterpret_cast(&DataMarkSym), - sizeof(DataMarkSym)); - Write(0, reinterpret_cast(&CodeMarkSym), - sizeof(CodeMarkSym)); + Write(0, DataMarkSym); + Write(0, CodeMarkSym); } } else { uint32_t OldSectionIndex = NewSymbol.st_shndx; @@ -4190,7 +4187,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { Write((&Symbol - cantFail(Obj->symbols(Section)).begin()) * sizeof(Elf_Sym), - reinterpret_cast(&NewSymbol), sizeof(NewSymbol)); + NewSymbol); } assert((!IsHotTextUpdated || IsHotTextUpdated == 2) && @@ -4210,7 +4207,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { outs() << "BOLT-INFO: setting " << Name << " to 0x" << Twine::utohexstr(Symbol.st_value) << '\n'; - Write(0, reinterpret_cast(&Symbol), sizeof(Symbol)); + Write(0, Symbol); }; if (opts::HotText && !IsHotTextUpdated && !PatchExisting) { @@ -4226,7 +4223,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { // Update dynamic symbol table. 
const Elf_Shdr *DynSymSection = nullptr; - for (const Elf_Shdr &Section : cantFail(Obj->sections())) { + for (const auto &Section : cantFail(Obj->sections())) { if (Section.sh_type == ELF::SHT_DYNSYM) { DynSymSection = &Section; break; @@ -4235,8 +4232,9 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { assert(DynSymSection && "no dynamic symbol table found"); updateSymbolTable(/*patch existing table?*/ true, DynSymSection, - [&](size_t Offset, const char *Buf, size_t Size) { - Out->os().pwrite(Buf, Size, + [&](size_t Offset, const Elf_Sym &Sym) { + Out->os().pwrite(reinterpret_cast(&Sym), + sizeof(Elf_Sym), DynSymSection->sh_offset + Offset); }, [](StringRef) -> size_t { return 0; }); @@ -4262,10 +4260,14 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { auto SecName = cantFail(Obj->getSectionName(SymTabSection)); auto StrSecName = cantFail(Obj->getSectionName(StrTabSection)); + NumLocalSymbols = 0; updateSymbolTable(/*patch existing table?*/ false, SymTabSection, - [&](size_t Offset, const char *Buf, size_t Size) { - NewContents.append(Buf, Size); + [&](size_t Offset, const Elf_Sym &Sym) { + if (Sym.getBinding() == ELF::STB_LOCAL) + ++NumLocalSymbols; + NewContents.append(reinterpret_cast(&Sym), + sizeof(Elf_Sym)); }, [&](StringRef Str) { size_t Idx = NewStrTab.size(); diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index 43f72044e8de..c97892aa81b3 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -478,6 +478,9 @@ class RewriteInstance { uint64_t NewTextSectionIndex{0}; + /// Number of local symbols in newly written symbol table. + uint64_t NumLocalSymbols{0}; + /// Exception handling and stack unwinding information in this binary. 
ErrorOr LSDASection{std::errc::bad_address}; const llvm::DWARFDebugFrame *EHFrame{nullptr}; From 60348f6ae39eaa729f50a4ab74545c904fcf397b Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 12 Nov 2018 12:38:50 -0800 Subject: [PATCH 493/904] [BOLT] Workaround for Clang de-virtualization bug Summary: When Clang is boot-strapped with (Thin)LTO, it may produce a code fragment similar to below: .LFT663334 (6 instructions, align : 1) Predecessors: .LFT663333 00000538: movb $0x1, %al 0000053a: movl %eax, -0x2c(%rbp) 0000053d: movl $"_ZN5clang6Parser12ConsumeParenEv/1", %ecx 00000542: testb $0x1, %cl 00000545: movq -0x40(%rbp), %r14 00000549: je .Ltmp1071462 Successors: .Ltmp1071462, .LFT663335 .LFT663335 (2 instructions, align : 1) Predecessors: .LFT663334 0000054b: movq (%r12), %rax 0000054f: movq .Ltmp0(%rax), %rcx Successors: .Ltmp1071462 .Ltmp1071462 (7 instructions, align : 1) Predecessors: .LFT663334, .LFT663335 00000556: movq %r12, %rdi 00000559: callq *%rcx ....... The code above is making a call by dereferencing a pointer to a member function. A pointer to a member function could either be a regular function, or a virtual function. To differentiate between the two, AMD64 ABI (originated from Itanium ABI) uses the last bit of the pointer. The call instruction sequence varies depending if the function is virtual or not, and the pointer's last bit is checked. If it's "1" then the value of the pointer (minus 1) is used as an offset in the object vtable to get the address of the function, otherwise the pointer is used directly as a function address. In this specific case, a de-virtualization is taking place, but it's not complete. Compiler knows that the member function pointer is actually a non-virtual function _ZN5clang6Parser12ConsumeParenEv (aka "clang::Parser::ConsumeParen()"). 
However, it keeps the (dead) code that checks the last bit of _ZN5clang6Parser12ConsumeParenEv, and furthermore keeps the code (unreachable/dead) to make a virtual call while using (_ZN5clang6Parser12ConsumeParenEv - 1) as an offset into the vtable. This is obviously wrong, but since the code is unreachable, it will never affect the runtime correctness. The value "_ZN5clang6Parser12ConsumeParenEv - 1" falls into a last byte of a function preceding _ZN5clang6Parser12ConsumeParenEv, and BOLT creates a label ".Ltmp0" pointing to this last byte that is referenced in by the instruction sequence above. It just happens that the last byte is also in the middle of the last instruction, and as a result, BOLT never emits the label, hence resulting in the error message "Undefined temporary symbol". The workaround is to detect non-pc-relative relocations from code pointing to some (fptr - 1). Note that this is not completely error-prone, but non-pc-relative references from code into a middle of a function are quite rare, and chances that in a normal situation they will point to a byte preceding some function address are virtually zero. (cherry picked from commit 5273e56affa7d5f999aba6bb916f4ff8663b4344) --- bolt/src/RewriteInstance.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 74c8d0803c94..7cacbd15d784 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -2180,6 +2180,18 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { if (ReferencedBF->containsAddress(Address, /*UseMaxSize = */true)) { RefFunctionOffset = Address - ReferencedBF->getAddress(); if (RefFunctionOffset) { + // Workaround for member function pointer de-virtualization bug. + // We check if a code non-pc-relative relocation is pointing + // to a (fptr - 1). 
+ if (ContainingBF && !Relocation::isPCRelative(Rel.getType())) { + if (const auto *NextBF = getBinaryFunctionAtAddress(Address + 1)) { + errs() << "BOLT-WARNING: detected possible compiler " + "de-virtualization bug: -1 addend used with " + "non-pc-relative relocation against function " + << *NextBF << " in function " << *ContainingBF << '\n'; + continue; + } + } ReferencedSymbol = ReferencedBF->getOrCreateLocalLabel(Address, /*CreatePastEnd =*/ true); From d895632ad6026d1de60e241bea55fa2db7f29787 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 14 Nov 2018 14:43:59 -0800 Subject: [PATCH 494/904] [BOLT] Add branch priority policy for blocks with 2 successors Summary: On x86 the difference between long and short jump instructions could be either 4 or 3 bytes, depending if it's a conditional jump or not. For a basic block with 2 jump instructions, if we know that one of the successors is in a different code region, then we can make it a target of an unconditional jump instruction. This will save 1 byte in case the conditional jump happens to be a short one. (cherry picked from commit 293b02c6c632188cafb406e2a669df4393c02378) --- bolt/src/BinaryFunction.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 6891d299fb04..cb4bca682544 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -3252,6 +3252,16 @@ void BinaryFunction::fixBranches() { BB->removeDuplicateConditionalSuccessor(CondBranch); } if (!NextBB || (NextBB != TSuccessor && NextBB != FSuccessor)) { + // If one of the branches is guaranteed to be "long" while the other + // could be "short", then prioritize short for "taken". This will + // generate a sequence 1 byte shorter on x86. 
+ if (BC.isX86() && + TSuccessor->isCold() != FSuccessor->isCold() && + BB->isCold() != TSuccessor->isCold()) { + std::swap(TSuccessor, FSuccessor); + MIB->reverseBranchCondition(*CondBranch, TSuccessor->getLabel(), Ctx); + BB->swapConditionalSuccessors(); + } BB->addBranchInstruction(FSuccessor); } } From b97aa5dfad9bd934283acbd03abe936b246b4b65 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 15 Nov 2018 16:02:16 -0800 Subject: [PATCH 495/904] [BOLT] Add method for better function size estimation Summary: Add BinaryContext::calculateEmittedSize() that ephemerally emits code to allow precise estimation of the function size. Relaxation and macro-op alignment adjustments are taken into account. (cherry picked from commit 75cf77f5ec74edc3e2a3cea96daef63825e3facf) --- bolt/src/BinaryContext.cpp | 80 +++++++++++++++++++++++++++++++++++++ bolt/src/BinaryContext.h | 7 ++++ bolt/src/BinaryFunction.cpp | 13 ++++-- bolt/src/BinaryFunction.h | 3 +- 4 files changed, 98 insertions(+), 5 deletions(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index 4359f074abde..1b04d90e22b2 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -15,8 +15,13 @@ #include "llvm/ADT/Twine.h" #include "llvm/DebugInfo/DWARF/DWARFFormValue.h" #include "llvm/DebugInfo/DWARF/DWARFUnit.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" @@ -1149,3 +1154,78 @@ BinaryContext::createInjectedBinaryFunction(const std::string &Name, setSymbolToFunctionMap(BF->getSymbol(), BF); return BF; } + +std::pair +BinaryContext::calculateEmittedSize(BinaryFunction &BF) { + // Adjust branch instruction to match the current layout. 
+ BF.fixBranches(); + + // Create local MC context to isolate the effect of ephemeral code emission. + std::unique_ptr LocalMOFI = + llvm::make_unique(); + std::unique_ptr LocalCtx = + llvm::make_unique(AsmInfo.get(), MRI.get(), LocalMOFI.get()); + LocalMOFI->InitMCObjectFileInfo(*TheTriple, /*PIC=*/false, *LocalCtx); + auto *MAB = TheTarget->createMCAsmBackend(*STI, *MRI, MCTargetOptions()); + auto *MCE = TheTarget->createMCCodeEmitter(*MII, *MRI, *LocalCtx); + SmallString<256> Code; + raw_svector_ostream VecOS(Code); + + std::unique_ptr Streamer(TheTarget->createMCObjectStreamer( + *TheTriple, *LocalCtx, std::unique_ptr(MAB), VecOS, + std::unique_ptr(MCE), *STI, + /* RelaxAll */ false, + /* IncrementalLinkerCompatible */ false, + /* DWARFMustBeAtTheEnd */ false)); + + Streamer->InitSections(false); + + auto *Section = LocalMOFI->getTextSection(); + Section->setHasInstructions(true); + + auto *StartLabel = LocalCtx->getOrCreateSymbol("__hstart"); + auto *EndLabel = LocalCtx->getOrCreateSymbol("__hend"); + auto *ColdStartLabel = LocalCtx->getOrCreateSymbol("__cstart"); + auto *ColdEndLabel = LocalCtx->getOrCreateSymbol("__cend"); + + Streamer->SwitchSection(Section); + Streamer->EmitLabel(StartLabel); + BF.emitBody(*Streamer, /*EmitColdPart = */false, /*EmitCodeOnly = */true); + Streamer->EmitLabel(EndLabel); + + if (BF.isSplit()) { + auto *ColdSection = + LocalCtx->getELFSection(BF.getColdCodeSectionName(), + ELF::SHT_PROGBITS, + ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); + ColdSection->setHasInstructions(true); + + Streamer->SwitchSection(ColdSection); + Streamer->EmitLabel(ColdStartLabel); + BF.emitBody(*Streamer, /*EmitColdPart = */true, /*EmitCodeOnly = */true); + Streamer->EmitLabel(ColdEndLabel); + } + + // To avoid calling MCObjectStreamer::flushPendingLabels() which is private. 
+ Streamer->EmitBytes(StringRef("")); + + auto &Assembler = + static_cast(Streamer.get())->getAssembler(); + MCAsmLayout Layout(Assembler); + Assembler.layout(Layout); + + const auto HotSize = Layout.getSymbolOffset(*EndLabel) - + Layout.getSymbolOffset(*StartLabel); + const auto ColdSize = BF.isSplit() ? Layout.getSymbolOffset(*ColdEndLabel) - + Layout.getSymbolOffset(*ColdStartLabel) + : 0ULL; + + // Clean-up the effect of the code emission. + for (const auto &Symbol : Assembler.symbols()) { + auto *MutableSymbol = const_cast(&Symbol); + MutableSymbol->setUndefined(); + MutableSymbol->setIsRegistered(false); + } + + return std::make_pair(HotSize, ColdSize); +} diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index 7ce5a4a39858..fe7517da8bb6 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -669,6 +669,13 @@ class BinaryContext { static std::vector getSortedFunctions(std::map &BinaryFunctions); + /// Do the best effort to calculate the size of the function by emitting + /// its code, and relaxing branch instructions. + /// + /// Return the pair where the first size is for the main part, and the second + /// size is for the cold one. + std::pair calculateEmittedSize(BinaryFunction &BF); + /// Compute the native code size for a range of instructions. 
/// Note: this can be imprecise wrt the final binary since happening prior to /// relaxation, as well as wrt the original binary because of opcode diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index cb4bca682544..33c623056e7c 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -2639,8 +2639,9 @@ uint64_t BinaryFunction::getEditDistance() const { BasicBlocksLayout); } -void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { - if (EmitColdPart && hasConstantIsland()) +void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart, + bool EmitCodeOnly) { + if (!EmitCodeOnly && EmitColdPart && hasConstantIsland()) duplicateConstantIslands(); for (auto BB : layout()) { @@ -2672,6 +2673,9 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { for (auto I = BB->begin(), E = BB->end(); I != E; ++I) { auto &Instr = *I; + if (EmitCodeOnly && BC.MII->get(Instr.getOpcode()).isPseudo()) + continue; + // Handle pseudo instructions. if (BC.MIB->isEHLabel(Instr)) { const auto *Label = BC.MIB->getTargetSymbol(Instr); @@ -2695,7 +2699,7 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { Streamer.EmitNeverAlignCodeAtEnd(/*Alignment to avoid=*/64); } - if (opts::UpdateDebugSections && UnitLineTable.first) { + if (!EmitCodeOnly && opts::UpdateDebugSections && UnitLineTable.first) { LastLocSeen = emitLineInfo(Instr.getLoc(), LastLocSeen); } @@ -2704,7 +2708,8 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart) { } } - emitConstantIslands(Streamer, EmitColdPart); + if (!EmitCodeOnly) + emitConstantIslands(Streamer, EmitColdPart); } void BinaryFunction::emitBodyRaw(MCStreamer *Streamer) { diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index f52bf00d3d29..a2b23c1af7f3 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -2074,7 +2074,8 @@ class BinaryFunction { /// Emit function code. 
The caller is responsible for emitting function /// symbol(s) and setting the section to emit the code to. - void emitBody(MCStreamer &Streamer, bool EmitColdPart); + void emitBody(MCStreamer &Streamer, bool EmitColdPart, + bool EmitCodeOnly = false); /// Emit function as a blob with relocations and labels for relocations. void emitBodyRaw(MCStreamer *Streamer); From e5ce9bbda187a8791a8c5373412e1076e34b76ed Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 15 Nov 2018 16:03:34 -0800 Subject: [PATCH 496/904] [BOLT] Add thresholds for function splitting Summary: Use newly added function size estimation to measure the effectiveness and guide function splitting. Two new tuning options are added: -split-threshold= split function only if its main size is reduced by more than given amount of bytes. Default value: 0, i.e. split iff the size is reduced. Note that on some architectures the size can increase after splitting. -split-align-threshold= when deciding to split a function, apply this alignment while doing the size comparison (see -split-threshold). Default value: 2. (cherry picked from commit 74743f29792027201efaafb5c3958eb20ee2f5b2) --- bolt/src/Passes/BinaryPasses.cpp | 53 ++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/bolt/src/Passes/BinaryPasses.cpp b/bolt/src/Passes/BinaryPasses.cpp index c340eaf105dc..2e2b65db6e80 100644 --- a/bolt/src/Passes/BinaryPasses.cpp +++ b/bolt/src/Passes/BinaryPasses.cpp @@ -202,6 +202,27 @@ SctcMode("sctc-mode", cl::ZeroOrMore, cl::cat(BoltOptCategory)); +static cl::opt +SplitAlignThreshold("split-align-threshold", + cl::desc("when deciding to split a function, apply this alignment " + "while doing the size comparison (see -split-threshold). " + "Default value: 2."), + cl::init(2), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +SplitThreshold("split-threshold", + cl::desc("split function only if its main size is reduced by more than " + "given amount of bytes. 
Default value: 0, i.e. split iff the " + "size is reduced. Note that on some architectures the size can " + "increase after splitting."), + cl::init(0), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + static cl::opt TSPThreshold("tsp-threshold", cl::desc("maximum number of hot basic blocks in a function for which to use " @@ -511,6 +532,18 @@ void ReorderBasicBlocks::splitFunction(BinaryFunction &BF) const { if (AllCold) return; + auto PreSplitLayout = BF.getLayout(); + + auto &BC = BF.getBinaryContext(); + size_t OriginalHotSize; + size_t HotSize; + size_t ColdSize; + if (BC.isX86()) + std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF); + DEBUG(dbgs() << "Estimated size for function " << BF << " pre-split is <0x" + << Twine::utohexstr(OriginalHotSize) << ", 0x" + << Twine::utohexstr(ColdSize) << ">\n"); + // Never outline the first basic block. BF.layout_front()->setCanOutline(false); for (auto *BB : BF.layout()) { @@ -577,6 +610,26 @@ void ReorderBasicBlocks::splitFunction(BinaryFunction &BF) const { break; BB->setIsCold(true); } + + // Check the new size to see if it's worth splitting the function. 
+ if (BC.isX86() && BF.isSplit()) { + std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF); + DEBUG(dbgs() << "Estimated size for function " << BF << " post-split is <0x" + << Twine::utohexstr(HotSize) << ", 0x" + << Twine::utohexstr(ColdSize) << ">\n"); + if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <= + alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) { + DEBUG(dbgs() << "Reversing splitting of function " << BF << ":\n 0x" + << Twine::utohexstr(HotSize) << ", 0x" + << Twine::utohexstr(ColdSize) << " -> 0x" + << Twine::utohexstr(OriginalHotSize) << '\n'); + + BF.updateBasicBlockLayout(PreSplitLayout); + for (auto &BB : BF) { + BB.setIsCold(false); + } + } + } } void FixupBranches::runOnFunctions( From 2ece33bcd20497800c7421dec1bd559f5fab935c Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 21 Nov 2018 20:04:00 -0800 Subject: [PATCH 497/904] [perf2bolt] Better tracking of process forking Summary: Improve tracking of forked processes. If a process corresponding to the input binary has forked/started before 'perf record' was initiated, then the full name of the binary will be recorded in a corresponding MMAP2 event. We've being handling such cases well so far. However, if the process was forked after 'perf record' has started, and execve(2) wasn't called afterwards, then there will be no MMAP2 event recorded corresponding to the mapping of the main binary (unrelated MMAP2 events could still be recorded). To track such cases, we need to parse 'perf script --show-task-events' command output, and to scan for PERF_RECORD_FORK events, and then add forked process PIDs to the list associated with the input binary. If the fork event was followed by an exec event (PERF_RECORD_COMM exec) of a different binary, then the forked PID should be ignored. If the exec event was associated with our input binary, then the correct MMAP2 event was recorded and parsed. 
To track if the event occurred before or after 'perf record', we parse event's time. This helps us to differentiate some events. E.g. the exec event is only registered correctly if it happened after perf recording has started (otherwise the "exec" part is missing), and thus we only record forks with non-zero time stamps. (cherry picked from commit 785702e8a0ccdfe49f0f67c06248c8ec63e62c61) --- bolt/src/DataAggregator.cpp | 643 ++++++++++++++++++++---------------- bolt/src/DataAggregator.h | 73 ++-- 2 files changed, 406 insertions(+), 310 deletions(-) diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index 4265b45c4041..950e26944032 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -24,6 +24,7 @@ #include "llvm/Support/Regex.h" #include "llvm/Support/Timer.h" #include +#include #include #include @@ -73,6 +74,26 @@ const char TimerGroupDesc[] = "Aggregator"; } +DataAggregator::~DataAggregator() { + deleteTempFiles(); +} + +namespace { +void deleteTempFile(const std::string &FileName) { + if (auto Errc = sys::fs::remove(FileName.c_str())) { + errs() << "PERF2BOLT: failed to delete temporary file " + << FileName << " with error " << Errc.message() << "\n"; + } +} +} + +void DataAggregator::deleteTempFiles() { + for (auto &FileName : TempFiles) { + deleteTempFile(FileName); + } + TempFiles.clear(); +} + void DataAggregator::findPerfExecutable() { auto PerfExecutable = sys::Process::FindInEnvPath("PATH", "perf"); if (!PerfExecutable) { @@ -93,9 +114,35 @@ void DataAggregator::start(StringRef PerfDataFilename) { return; findPerfExecutable(); - launchPerfBranchEventsNoWait(); - launchPerfMemEventsNoWait(); - launchPerfMMapEventsNoWait(); + + if (opts::BasicAggregation) { + launchPerfProcess("events without LBR", + BranchEventsPPI, + "script -F pid,event,ip", + /*Wait = */false); + } else { + launchPerfProcess("branch events", + BranchEventsPPI, + "script -F pid,brstack", + /*Wait = */false); + } + + // Note: we launch script 
for mem events regardless of the option, as the + // command fails fairly fast if mem events were not collected. + launchPerfProcess("mem events", + MemEventsPPI, + "script -F pid,event,addr,ip", + /*Wait = */false); + + launchPerfProcess("process events", + MMapEventsPPI, + "script --show-mmap-events", + /*Wait = */false); + + launchPerfProcess("task events", + TaskEventsPPI, + "script --show-task-events", + /*Wait = */false); } void DataAggregator::abort() { @@ -105,203 +152,102 @@ void DataAggregator::abort() { std::string Error; // Kill subprocesses in case they are not finished - sys::Wait(MMapEventsPI, 1, false, &Error); - sys::Wait(BranchEventsPI, 1, false, &Error); - sys::Wait(MemEventsPI, 1, false, &Error); + sys::Wait(TaskEventsPPI.PI, 1, false, &Error); + sys::Wait(MMapEventsPPI.PI, 1, false, &Error); + sys::Wait(BranchEventsPPI.PI, 1, false, &Error); + sys::Wait(MemEventsPPI.PI, 1, false, &Error); deleteTempFiles(); -} -bool DataAggregator::launchPerfBranchEventsNoWait() { - SmallVector Argv; - - if (opts::BasicAggregation) - outs() - << "PERF2BOLT: Spawning perf-script job to read events without LBR\n"; - else - outs() << "PERF2BOLT: Spawning perf-script job to read branch events\n"; - Argv.push_back(PerfPath.data()); - Argv.push_back("script"); - Argv.push_back("-F"); - if (opts::BasicAggregation) - Argv.push_back("pid,event,ip"); - else - Argv.push_back("pid,brstack"); - Argv.push_back("-i"); - Argv.push_back(PerfDataFilename.data()); - Argv.push_back(nullptr); - - if (auto Errc = sys::fs::createTemporaryFile("perf.script", "out", - PerfBranchEventsOutputPath)) { - outs() << "PERF2BOLT: Failed to create temporary file " - << PerfBranchEventsOutputPath << " with error " << Errc.message() - << "\n"; - exit(1); - } - - if (auto Errc = sys::fs::createTemporaryFile("perf.script", "err", - PerfBranchEventsErrPath)) { - outs() << "PERF2BOLT: Failed to create temporary file " - << PerfBranchEventsErrPath << " with error " << Errc.message() - << "\n"; - 
exit(1); - } - Optional Redirects[] = { - llvm::None, // Stdin - StringRef(PerfBranchEventsOutputPath.data()), // Stdout - StringRef(PerfBranchEventsErrPath.data())}; // Stderr - - DEBUG(dbgs() << "Launching perf: " << PerfPath.data() << " 1> " - << PerfBranchEventsOutputPath.data() << " 2> " - << PerfBranchEventsErrPath.data() << "\n"); - - BranchEventsPI = sys::ExecuteNoWait(PerfPath.data(), Argv.data(), - /*envp*/ nullptr, Redirects); - - return true; + exit(1); } -bool DataAggregator::launchPerfMemEventsNoWait() { +void DataAggregator::launchPerfProcess(StringRef Name, PerfProcessInfo &PPI, + const char *ArgsString, bool Wait) { SmallVector Argv; - outs() << "PERF2BOLT: Spawning perf-script job to read mem events\n"; + outs() << "PERF2BOLT: spawning perf job to read " << Name << '\n'; Argv.push_back(PerfPath.data()); - Argv.push_back("script"); - Argv.push_back("-F"); - Argv.push_back("pid,event,addr,ip"); - Argv.push_back("-i"); - Argv.push_back(PerfDataFilename.data()); - Argv.push_back(nullptr); - - if (auto Errc = sys::fs::createTemporaryFile("perf.script", "out", - PerfMemEventsOutputPath)) { - outs() << "PERF2BOLT: Failed to create temporary file " - << PerfMemEventsOutputPath << " with error " << Errc.message() << "\n"; - exit(1); - } - if (auto Errc = sys::fs::createTemporaryFile("perf.script", "err", - PerfMemEventsErrPath)) { - outs() << "PERF2BOLT: Failed to create temporary file " - << PerfMemEventsErrPath << " with error " << Errc.message() << "\n"; - exit(1); - } - - Optional Redirects[] = { - llvm::None, // Stdin - StringRef(PerfMemEventsOutputPath.data()), // Stdout - StringRef(PerfMemEventsErrPath.data())}; // Stderr - - DEBUG(dbgs() << "Launching perf: " << PerfPath.data() << " 1> " - << PerfMemEventsOutputPath.data() << " 2> " - << PerfMemEventsErrPath.data() << "\n"); - - MemEventsPI = sys::ExecuteNoWait(PerfPath.data(), Argv.data(), - /*envp*/ nullptr, Redirects); - - return true; -} - -bool DataAggregator::launchPerfMMapEventsNoWait() { - 
SmallVector Argv; + auto *WritableArgsString = strdup(ArgsString); + auto *Str = WritableArgsString; + do { + Argv.push_back(Str); + while (*Str && *Str != ' ') + ++Str; + if (!*Str) + break; + *Str++ = 0; + } while (true); - outs() << "PERF2BOLT: Spawning perf-script job to read process info\n"; - Argv.push_back(PerfPath.data()); - Argv.push_back("script"); - Argv.push_back("--show-mmap-events"); Argv.push_back("-i"); Argv.push_back(PerfDataFilename.data()); Argv.push_back(nullptr); if (auto Errc = sys::fs::createTemporaryFile("perf.script", "out", - PerfMMapEventsOutputPath)) { - outs() << "PERF2BOLT: Failed to create temporary file " - << PerfMMapEventsOutputPath << " with error " << Errc.message() + PPI.StdoutPath)) { + errs() << "PERF2BOLT: failed to create temporary file " + << PPI.StdoutPath << " with error " << Errc.message() << "\n"; exit(1); } + TempFiles.push_back(PPI.StdoutPath.data()); if (auto Errc = sys::fs::createTemporaryFile("perf.script", "err", - PerfMMapEventsErrPath)) { - outs() << "PERF2BOLT: Failed to create temporary file " - << PerfMMapEventsErrPath << " with error " << Errc.message() << "\n"; + PPI.StderrPath)) { + errs() << "PERF2BOLT: failed to create temporary file " + << PPI.StderrPath << " with error " << Errc.message() << "\n"; exit(1); } + TempFiles.push_back(PPI.StderrPath.data()); Optional Redirects[] = { - llvm::None, // Stdin - StringRef(PerfMMapEventsOutputPath.data()), // Stdout - StringRef(PerfMMapEventsErrPath.data())}; // Stderr + llvm::None, // Stdin + StringRef(PPI.StdoutPath.data()), // Stdout + StringRef(PPI.StderrPath.data())}; // Stderr DEBUG(dbgs() << "Launching perf: " << PerfPath.data() << " 1> " - << PerfMMapEventsOutputPath.data() << " 2> " - << PerfMMapEventsErrPath.data() << "\n"); + << PPI.StdoutPath.data() << " 2> " + << PPI.StderrPath.data() << "\n"); - MMapEventsPI = sys::ExecuteNoWait(PerfPath.data(), Argv.data(), - /*envp*/ nullptr, Redirects); + if (Wait) { + PPI.PI.ReturnCode = + 
sys::ExecuteAndWait(PerfPath.data(), Argv.data(), /*envp*/ nullptr, + Redirects); + } else { + PPI.PI = sys::ExecuteNoWait(PerfPath.data(), Argv.data(), /*envp*/ nullptr, + Redirects); + } - return true; + free(WritableArgsString); } void DataAggregator::processFileBuildID(StringRef FileBuildID) { if (opts::ReadPreAggregated) return; - SmallVector Argv; - SmallVector OutputPath; - SmallVector ErrPath; + PerfProcessInfo BuildIDProcessInfo; + launchPerfProcess("buildid list", + BuildIDProcessInfo, + "buildid-list", + /*Wait = */true); - Argv.push_back(PerfPath.data()); - Argv.push_back("buildid-list"); - Argv.push_back("-i"); - Argv.push_back(PerfDataFilename.data()); - Argv.push_back(nullptr); - - if (auto Errc = sys::fs::createTemporaryFile("perf.buildid", "out", - OutputPath)) { - outs() << "PERF2BOLT: Failed to create temporary file " - << OutputPath << " with error " << Errc.message() << "\n"; - exit(1); - } - - if (auto Errc = sys::fs::createTemporaryFile("perf.script", "err", - ErrPath)) { - outs() << "PERF2BOLT: Failed to create temporary file " - << ErrPath << " with error " << Errc.message() << "\n"; - exit(1); - } - - Optional Redirects[] = { - llvm::None, // Stdin - StringRef(OutputPath.data()), // Stdout - StringRef(ErrPath.data())}; // Stderr - - DEBUG(dbgs() << "Launching perf: " << PerfPath.data() << " 1> " - << OutputPath.data() << " 2> " - << ErrPath.data() << "\n"); - - auto RetCode = sys::ExecuteAndWait(PerfPath.data(), Argv.data(), - /*envp*/ nullptr, Redirects); - - if (RetCode != 0) { + if (BuildIDProcessInfo.PI.ReturnCode != 0) { ErrorOr> MB = - MemoryBuffer::getFileOrSTDIN(ErrPath.data()); + MemoryBuffer::getFileOrSTDIN(BuildIDProcessInfo.StderrPath.data()); StringRef ErrBuf = (*MB)->getBuffer(); - errs() << "PERF-ERROR: Return code " << RetCode << "\n"; + errs() << "PERF-ERROR: return code " << BuildIDProcessInfo.PI.ReturnCode + << '\n'; errs() << ErrBuf; - deleteTempFile(ErrPath.data()); - deleteTempFile(OutputPath.data()); return; } 
ErrorOr> MB = - MemoryBuffer::getFileOrSTDIN(OutputPath.data()); + MemoryBuffer::getFileOrSTDIN(BuildIDProcessInfo.StdoutPath.data()); if (std::error_code EC = MB.getError()) { - errs() << "Cannot open " << PerfMMapEventsOutputPath.data() << ": " + errs() << "Cannot open " << BuildIDProcessInfo.StdoutPath.data() << ": " << EC.message() << "\n"; - deleteTempFile(ErrPath.data()); - deleteTempFile(OutputPath.data()); return; } @@ -310,8 +256,6 @@ void DataAggregator::processFileBuildID(StringRef FileBuildID) { if (ParsingBuf.empty()) { errs() << "PERF2BOLT-WARNING: build-id will not be checked because perf " "data was recorded without it\n"; - deleteTempFile(ErrPath.data()); - deleteTempFile(OutputPath.data()); return; } @@ -325,10 +269,7 @@ void DataAggregator::processFileBuildID(StringRef FileBuildID) { "data, or there were no samples recorded for the binary. " "Use -ignore-build-id option to override.\n"; if (!opts::IgnoreBuildID) { - deleteTempFile(ErrPath.data()); - deleteTempFile(OutputPath.data()); abort(); - exit(1); } } else if (*FileName != BinaryName) { errs() << "PERF2BOLT-WARNING: build-id matched a different file name\n"; @@ -337,8 +278,6 @@ void DataAggregator::processFileBuildID(StringRef FileBuildID) { outs() << "PERF2BOLT: matched build-id and file name\n"; } - deleteTempFile(ErrPath.data()); - deleteTempFile(OutputPath.data()); return; } @@ -364,22 +303,6 @@ bool DataAggregator::checkPerfDataMagic(StringRef FileName) { return false; } -void DataAggregator::deleteTempFile(StringRef File) { - if (auto Errc = sys::fs::remove(File.data())) { - outs() << "PERF2BOLT: Failed to delete temporary file " - << File << " with error " << Errc.message() << "\n"; - } -} - -void DataAggregator::deleteTempFiles() { - deleteTempFile(PerfBranchEventsErrPath.data()); - deleteTempFile(PerfBranchEventsOutputPath.data()); - deleteTempFile(PerfMemEventsErrPath.data()); - deleteTempFile(PerfMemEventsOutputPath.data()); - deleteTempFile(PerfMMapEventsErrPath.data()); - 
deleteTempFile(PerfMMapEventsOutputPath.data()); -} - bool DataAggregator::processPreAggregated() { std::string Error; @@ -395,7 +318,7 @@ bool DataAggregator::processPreAggregated() { Col = 0; Line = 1; if (parseAggregatedLBRSamples()) { - outs() << "PERF2BOLT: Failed to parse samples\n"; + errs() << "PERF2BOLT: failed to parse samples\n"; exit(1); } @@ -414,88 +337,64 @@ bool DataAggregator::processPreAggregated() { bool DataAggregator::aggregate(BinaryContext &BC, std::map &BFs) { - std::string Error; - this->BC = &BC; this->BFs = &BFs; if (opts::ReadPreAggregated) return processPreAggregated(); - outs() << "PERF2BOLT: Waiting for perf mmap events collection to finish...\n"; - auto PI1 = sys::Wait(MMapEventsPI, 0, true, &Error); - - if (!Error.empty()) { - errs() << "PERF-ERROR: " << Error << "\n"; - deleteTempFiles(); - exit(1); - } - - if (PI1.ReturnCode != 0) { - ErrorOr> MB = - MemoryBuffer::getFileOrSTDIN(PerfMMapEventsErrPath.data()); - StringRef ErrBuf = (*MB)->getBuffer(); - - errs() << "PERF-ERROR: Return code " << PI1.ReturnCode << "\n"; - errs() << ErrBuf; - deleteTempFiles(); - exit(1); - } - - ErrorOr> MB1 = - MemoryBuffer::getFileOrSTDIN(PerfMMapEventsOutputPath.data()); - if (std::error_code EC = MB1.getError()) { - errs() << "Cannot open " << PerfMMapEventsOutputPath.data() << ": " - << EC.message() << "\n"; - deleteTempFiles(); - exit(1); - } + auto prepareToParse = [&] (StringRef Name, PerfProcessInfo &Process) { + std::string Error; + outs() << "PERF2BOLT: waiting for perf " << Name + << " collection to finish...\n"; + auto PI = sys::Wait(Process.PI, 0, true, &Error); - FileBuf.reset(MB1->release()); - ParsingBuf = FileBuf->getBuffer(); - Col = 0; - Line = 1; - if (parseMMapEvents()) { - outs() << "PERF2BOLT: Failed to parse mmap events\n"; - } + if (!Error.empty()) { + errs() << "PERF-ERROR: " << PerfPath << ": " << Error << "\n"; + deleteTempFiles(); + exit(1); + } - outs() - << "PERF2BOLT: Waiting for perf events collection to finish...\n"; 
- auto PI2 = sys::Wait(BranchEventsPI, 0, true, &Error); + if (PI.ReturnCode != 0) { + ErrorOr> ErrorMB = + MemoryBuffer::getFileOrSTDIN(Process.StderrPath.data()); + StringRef ErrBuf = (*ErrorMB)->getBuffer(); - if (!Error.empty()) { - errs() << "PERF-ERROR: " << Error << "\n"; - deleteTempFiles(); - exit(1); - } + errs() << "PERF-ERROR: return code " << PI.ReturnCode << "\n"; + errs() << ErrBuf; + deleteTempFiles(); + exit(1); + } - if (PI2.ReturnCode != 0) { ErrorOr> MB = - MemoryBuffer::getFileOrSTDIN(PerfBranchEventsErrPath.data()); - StringRef ErrBuf = (*MB)->getBuffer(); + MemoryBuffer::getFileOrSTDIN(Process.StdoutPath.data()); + if (std::error_code EC = MB.getError()) { + errs() << "Cannot open " << Process.StdoutPath.data() << ": " + << EC.message() << "\n"; + deleteTempFiles(); + exit(1); + } - errs() << "PERF-ERROR: Return code " << PI2.ReturnCode << "\n"; - errs() << ErrBuf; - deleteTempFiles(); - exit(1); + FileBuf.reset(MB->release()); + ParsingBuf = FileBuf->getBuffer(); + Col = 0; + Line = 1; + }; + + prepareToParse("mmap events", MMapEventsPPI); + if (parseMMapEvents()) { + errs() << "PERF2BOLT: failed to parse mmap events\n"; } - ErrorOr> MB2 = - MemoryBuffer::getFileOrSTDIN(PerfBranchEventsOutputPath.data()); - if (std::error_code EC = MB2.getError()) { - errs() << "Cannot open " << PerfBranchEventsOutputPath.data() << ": " - << EC.message() << "\n"; - deleteTempFiles(); - exit(1); + prepareToParse("task events", TaskEventsPPI); + if (parseTaskEvents()) { + errs() << "PERF2BOLT: failed to parse task events\n"; } - FileBuf.reset(MB2->release()); - ParsingBuf = FileBuf->getBuffer(); - Col = 0; - Line = 1; + prepareToParse("events", BranchEventsPPI); if ((!opts::BasicAggregation && parseBranchEvents()) || (opts::BasicAggregation && parseBasicEvents())) { - outs() << "PERF2BOLT: Failed to parse samples\n"; + errs() << "PERF2BOLT: failed to parse samples\n"; } // Mark all functions with registered events as having a valid profile. 
@@ -508,10 +407,12 @@ bool DataAggregator::aggregate(BinaryContext &BC, } } - auto PI3 = sys::Wait(MemEventsPI, 0, true, &Error); - if (PI3.ReturnCode != 0) { + // Special handling for memory events + std::string Error; + auto PI = sys::Wait(MemEventsPPI.PI, 0, true, &Error); + if (PI.ReturnCode != 0) { ErrorOr> MB = - MemoryBuffer::getFileOrSTDIN(PerfMemEventsErrPath.data()); + MemoryBuffer::getFileOrSTDIN(MemEventsPPI.StderrPath.data()); StringRef ErrBuf = (*MB)->getBuffer(); deleteTempFiles(); @@ -519,28 +420,28 @@ bool DataAggregator::aggregate(BinaryContext &BC, Regex NoData("Samples for '.*' event do not have ADDR attribute set. " "Cannot print 'addr' field."); if (!NoData.match(ErrBuf)) { - errs() << "PERF-ERROR: Return code " << PI3.ReturnCode << "\n"; + errs() << "PERF-ERROR: return code " << PI.ReturnCode << "\n"; errs() << ErrBuf; exit(1); } return true; } - ErrorOr> MB3 = - MemoryBuffer::getFileOrSTDIN(PerfMemEventsOutputPath.data()); - if (std::error_code EC = MB3.getError()) { - errs() << "Cannot open " << PerfMemEventsOutputPath.data() << ": " + ErrorOr> MB = + MemoryBuffer::getFileOrSTDIN(MemEventsPPI.StdoutPath.data()); + if (std::error_code EC = MB.getError()) { + errs() << "Cannot open " << MemEventsPPI.StdoutPath.data() << ": " << EC.message() << "\n"; deleteTempFiles(); exit(1); } - FileBuf.reset(MB3->release()); + FileBuf.reset(MB->release()); ParsingBuf = FileBuf->getBuffer(); Col = 0; Line = 1; if (const auto EC = parseMemEvents()) { - errs() << "PERF2BOLT: Failed to parse memory events: " + errs() << "PERF2BOLT: failed to parse memory events: " << EC.message() << '\n'; } @@ -962,12 +863,13 @@ bool DataAggregator::hasData() { } std::error_code DataAggregator::parseBranchEvents() { - outs() << "PERF2BOLT: Aggregating branch events...\n"; + outs() << "PERF2BOLT: aggregating branch events...\n"; NamedRegionTimer T("parseBranch", "Branch samples parsing", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); uint64_t NumEntries{0}; uint64_t 
NumSamples{0}; uint64_t NumTraces{0}; + uint64_t NumTotalSamples{0}; struct Location { uint64_t From; @@ -1000,6 +902,8 @@ std::error_code DataAggregator::parseBranchEvents() { std::unordered_map FallthroughLBRs; while (hasData()) { + ++NumTotalSamples; + auto SampleRes = parseBranchSample(); if (std::error_code EC = SampleRes.getError()) return EC; @@ -1072,27 +976,42 @@ std::error_code DataAggregator::parseBranchEvents() { doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount); } - outs() << "PERF2BOLT: Read " << NumSamples << " samples and " + auto printColored = [](raw_ostream &OS, float Percent, float T1, float T2) { + OS << " ("; + if (OS.has_colors()) { + if (Percent > T2) { + OS.changeColor(raw_ostream::RED); + } else if (Percent > T1) { + OS.changeColor(raw_ostream::YELLOW); + } else { + OS.changeColor(raw_ostream::GREEN); + } + } + OS << format("%.1f%%", Percent); + if (OS.has_colors()) + OS.resetColor(); + OS << ")"; + }; + + outs() << "PERF2BOLT: read " << NumSamples << " samples and " << NumEntries << " LBR entries\n"; - outs() << "PERF2BOLT: Traces mismatching disassembled function contents: " + if (NumTotalSamples) { + const auto IgnoredSamples = NumTotalSamples - NumSamples; + const auto PercentIgnored = 100.0f * IgnoredSamples / NumTotalSamples; + outs() << "PERF2BOLT: " << IgnoredSamples << " samples"; + printColored(outs(), PercentIgnored, 20, 50); + outs() << " were ignored\n"; + if (PercentIgnored > 50.0f) { + errs() << "PERF2BOLT-WARNING: less than 50% of all recorded samples were " + "attributed to the input binary\n"; + } + } + outs() << "PERF2BOLT: traces mismatching disassembled function contents: " << NumInvalidTraces; float Perc{0.0f}; if (NumTraces > 0) { - outs() << " ("; Perc = NumInvalidTraces * 100.0f / NumTraces; - if (outs().has_colors()) { - if (Perc > 10.0f) { - outs().changeColor(raw_ostream::RED); - } else if (Perc > 5.0f) { - outs().changeColor(raw_ostream::YELLOW); - } else { - outs().changeColor(raw_ostream::GREEN); 
- } - } - outs() << format("%.1f%%", Perc); - if (outs().has_colors()) - outs().resetColor(); - outs() << ")"; + printColored(outs(), Perc, 5, 10); } outs() << "\n"; if (Perc > 10.0f) { @@ -1102,7 +1021,7 @@ std::error_code DataAggregator::parseBranchEvents() { "performance.\n\n"; } - outs() << "PERF2BOLT: Out of range traces involving unknown regions: " + outs() << "PERF2BOLT: out of range traces involving unknown regions: " << NumLongRangeTraces; if (NumTraces > 0) { outs() << format(" (%.1f%%)", NumLongRangeTraces * 100.0f / NumTraces); @@ -1113,7 +1032,7 @@ std::error_code DataAggregator::parseBranchEvents() { } std::error_code DataAggregator::parseBasicEvents() { - outs() << "PERF2BOLT: Aggregating basic events (without LBR)...\n"; + outs() << "PERF2BOLT: aggregating basic events (without LBR)...\n"; NamedRegionTimer T("parseBasic", "Perf samples parsing", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); uint64_t NumSamples{0}; @@ -1137,9 +1056,9 @@ std::error_code DataAggregator::parseBasicEvents() { doSample(*Func, Sample.PC); EventNames.insert(Sample.EventName); } - outs() << "PERF2BOLT: Read " << NumSamples << " samples\n"; + outs() << "PERF2BOLT: read " << NumSamples << " samples\n"; - outs() << "PERF2BOLT: Out of range samples recorded in unknown regions: " + outs() << "PERF2BOLT: out of range samples recorded in unknown regions: " << OutOfRangeSamples; float Perc{0.0f}; if (NumSamples > 0) { @@ -1171,7 +1090,7 @@ std::error_code DataAggregator::parseBasicEvents() { } std::error_code DataAggregator::parseMemEvents() { - outs() << "PERF2BOLT: Aggregating memory events...\n"; + outs() << "PERF2BOLT: aggregating memory events...\n"; NamedRegionTimer T("memevents", "Mem samples parsing", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); @@ -1224,7 +1143,7 @@ std::error_code DataAggregator::parseMemEvents() { } std::error_code DataAggregator::parseAggregatedLBRSamples() { - outs() << "PERF2BOLT: Aggregating...\n"; + outs() << "PERF2BOLT: 
aggregating...\n"; NamedRegionTimer T("parseAggregated", "Aggregated LBR parsing", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); uint64_t NumAggrEntries{0}; @@ -1255,8 +1174,8 @@ std::error_code DataAggregator::parseAggregatedLBRSamples() { } } } - outs() << "PERF2BOLT: Read " << NumAggrEntries << " aggregated LBR entries\n"; - outs() << "PERF2BOLT: Traces mismatching disassembled function contents: " + outs() << "PERF2BOLT: read " << NumAggrEntries << " aggregated LBR entries\n"; + outs() << "PERF2BOLT: traces mismatching disassembled function contents: " << NumInvalidTraces; float Perc{0.0f}; if (NumTraces > 0) { @@ -1296,6 +1215,98 @@ std::error_code DataAggregator::parseAggregatedLBRSamples() { return std::error_code(); } +Optional +DataAggregator::parseCommExecEvent() { + auto LineEnd = ParsingBuf.find_first_of("\n"); + if (LineEnd == StringRef::npos) { + reportError("expected rest of line"); + Diag << "Found: " << ParsingBuf << "\n"; + return NoneType(); + } + StringRef Line = ParsingBuf.substr(0, LineEnd); + + auto Pos = Line.find("PERF_RECORD_COMM exec"); + if (Pos == StringRef::npos) { + return NoneType(); + } + Line = Line.drop_front(Pos); + + // Line: + // PERF_RECORD_COMM exec: :/" + auto PIDStr = Line.rsplit(':').second.split('/').first; + pid_t PID; + if (PIDStr.getAsInteger(10, PID)) { + reportError("expected PID"); + Diag << "Found: " << PIDStr << "in '" << Line << "'\n"; + return NoneType(); + } + + return PID; +} + +namespace { +Optional parsePerfTime(const StringRef TimeStr) { + const auto SecTimeStr = TimeStr.split('.').first; + const auto USecTimeStr = TimeStr.split('.').second; + uint64_t SecTime; + uint64_t USecTime; + if (SecTimeStr.getAsInteger(10, SecTime) || + USecTimeStr.getAsInteger(10, USecTime)) { + return NoneType(); + } + return SecTime * 1000000ULL + USecTime; +} +} + +Optional +DataAggregator::parseForkEvent() { + while (checkAndConsumeFS()) {} + + auto LineEnd = ParsingBuf.find_first_of("\n"); + if (LineEnd == 
StringRef::npos) { + reportError("expected rest of line"); + Diag << "Found: " << ParsingBuf << "\n"; + return NoneType(); + } + StringRef Line = ParsingBuf.substr(0, LineEnd); + + auto Pos = Line.find("PERF_RECORD_FORK"); + if (Pos == StringRef::npos) { + consumeRestOfLine(); + return NoneType(); + } + + ForkInfo FI; + + const auto TimeStr = + Line.substr(0, Pos).rsplit(':').first.rsplit(FieldSeparator).second; + if (auto TimeRes = parsePerfTime(TimeStr)) { + FI.Time = *TimeRes; + } + + Line = Line.drop_front(Pos); + + // Line: + // PERF_RECORD_FORK(:):(:) + const auto ChildPIDStr = Line.split('(').second.split(':').first; + if (ChildPIDStr.getAsInteger(10, FI.ChildPID)) { + reportError("expected PID"); + Diag << "Found: " << ChildPIDStr << "in '" << Line << "'\n"; + return NoneType(); + } + + const auto ParentPIDStr = Line.rsplit('(').second.split(':').first; + if (ParentPIDStr.getAsInteger(10, FI.ParentPID)) { + reportError("expected PID"); + Diag << "Found: " << ParentPIDStr << "in '" << Line << "'\n"; + return NoneType(); + } + + consumeRestOfLine(); + + return FI; +} + ErrorOr> DataAggregator::parseMMapEvent() { while (checkAndConsumeFS()) {} @@ -1315,8 +1326,21 @@ DataAggregator::parseMMapEvent() { consumeRestOfLine(); return std::make_pair(StringRef(), ParsedInfo); } + + // Line: + // { .* .: }PERF_RECORD_MMAP2 /: .* + + const auto TimeStr = + Line.substr(0, Pos).rsplit(':').first.rsplit(FieldSeparator).second; + if (auto TimeRes = parsePerfTime(TimeStr)) { + ParsedInfo.Time = *TimeRes; + } + Line = Line.drop_front(Pos); + // Line: + // PERF_RECORD_MMAP2 /: [() .*]: .* + auto FileName = Line.rsplit(FieldSeparator).second; if (FileName.startswith("//") || FileName.startswith("[")) { consumeRestOfLine(); @@ -1324,21 +1348,21 @@ DataAggregator::parseMMapEvent() { } FileName = sys::path::filename(FileName); - StringRef PIDStr = Line.split(FieldSeparator).second.split('/').first; + const auto PIDStr = Line.split(FieldSeparator).second.split('/').first; if 
(PIDStr.getAsInteger(10, ParsedInfo.PID)) { reportError("expected PID"); Diag << "Found: " << PIDStr << "in '" << Line << "'\n"; return make_error_code(llvm::errc::io_error); } - StringRef BaseAddressStr = Line.split('[').second.split('(').first; + const auto BaseAddressStr = Line.split('[').second.split('(').first; if (BaseAddressStr.getAsInteger(0, ParsedInfo.BaseAddress)) { reportError("expected base address"); Diag << "Found: " << BaseAddressStr << "in '" << Line << "'\n"; return make_error_code(llvm::errc::io_error); } - StringRef SizeStr = Line.split('(').second.split(')').first; + const auto SizeStr = Line.split('(').second.split(')').first; if (SizeStr.getAsInteger(0, ParsedInfo.Size)) { reportError("expected mmaped size"); Diag << "Found: " << SizeStr << "in '" << Line << "'\n"; @@ -1351,7 +1375,7 @@ DataAggregator::parseMMapEvent() { } std::error_code DataAggregator::parseMMapEvents() { - outs() << "PERF2BOLT: Parsing perf-script mmap events output\n"; + outs() << "PERF2BOLT: parsing perf-script mmap events output\n"; NamedRegionTimer T("parseMMapEvents", "Parsing mmap events", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); @@ -1422,9 +1446,60 @@ std::error_code DataAggregator::parseMMapEvents() { exit(1); } - outs() << "PERF2BOLT: Input binary is associated with " + return std::error_code(); +} + +std::error_code DataAggregator::parseTaskEvents() { + outs() << "PERF2BOLT: parsing perf-script task events output\n"; + NamedRegionTimer T("parseTaskEvents", "Parsing task events", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); + + while (hasData()) { + if (auto CommInfo = parseCommExecEvent()) { + // Remove forked child that ran execve + auto MMapInfoIter = BinaryMMapInfo.find(*CommInfo); + if (MMapInfoIter != BinaryMMapInfo.end() && + MMapInfoIter->second.Forked) { + BinaryMMapInfo.erase(MMapInfoIter); + } + consumeRestOfLine(); + continue; + } + + auto ForkInfo = parseForkEvent(); + if (!ForkInfo) + continue; + + if (ForkInfo->ParentPID == 
ForkInfo->ChildPID) + continue; + + if (ForkInfo->Time == 0) { + // Process was forked and mmaped before perf ran. In this case the child + // should have its own mmap entry unless it was execve'd. + continue; + } + + auto MMapInfoIter = BinaryMMapInfo.find(ForkInfo->ParentPID); + if (MMapInfoIter == BinaryMMapInfo.end()) + continue; + + auto MMapInfo = MMapInfoIter->second; + MMapInfo.PID = ForkInfo->ChildPID; + MMapInfo.Forked = true; + BinaryMMapInfo.insert(std::make_pair(MMapInfo.PID, MMapInfo)); + } + + outs() << "PERF2BOLT: input binary is associated with " << BinaryMMapInfo.size() << " PID(s)\n"; + DEBUG( + for (auto &MMI : BinaryMMapInfo) { + outs() << " " << MMI.second.PID << (MMI.second.Forked ? " (forked)" : "") + << ": (0x" << Twine::utohexstr(MMI.second.BaseAddress) + << ": 0x" << Twine::utohexstr(MMI.second.Size) << ")\n"; + } + ); + return std::error_code(); } @@ -1523,7 +1598,7 @@ std::error_code DataAggregator::writeAggregatedFile() const { } } - outs() << "PERF2BOLT: Wrote " << BranchValues << " objects and " + outs() << "PERF2BOLT: wrote " << BranchValues << " objects and " << MemValues << " memory objects to " << OutputFDataName << "\n"; return std::error_code(); diff --git a/bolt/src/DataAggregator.h b/bolt/src/DataAggregator.h index 703a5551d91c..06f8fb77629f 100644 --- a/bolt/src/DataAggregator.h +++ b/bolt/src/DataAggregator.h @@ -74,24 +74,32 @@ struct AggregatedLBREntry { /// The last step is to write the aggregated data to disk in the output file /// specified by the user. 
class DataAggregator : public DataReader { - // Perf process spawning bookkeeping - std::string PerfPath; - sys::ProcessInfo BranchEventsPI; - sys::ProcessInfo MemEventsPI; - sys::ProcessInfo MMapEventsPI; - SmallVector PerfBranchEventsOutputPath; - SmallVector PerfBranchEventsErrPath; - SmallVector PerfMemEventsOutputPath; - SmallVector PerfMemEventsErrPath; - SmallVector PerfMMapEventsOutputPath; - SmallVector PerfMMapEventsErrPath; - /// Whether aggregator was scheduled to run - bool Enabled{false}; + /// Perf utility full path name + std::string PerfPath; /// Input perf.data file StringRef PerfDataFilename; + /// Perf process spawning bookkeeping + struct PerfProcessInfo { + sys::ProcessInfo PI; + SmallVector StdoutPath; + SmallVector StderrPath; + }; + + /// Process info for spawned processes + PerfProcessInfo BranchEventsPPI; + PerfProcessInfo MemEventsPPI; + PerfProcessInfo MMapEventsPPI; + PerfProcessInfo TaskEventsPPI; + + /// Current list of created temporary files + std::vector TempFiles; + + /// Whether aggregator was scheduled to run + bool Enabled{false}; + /// Output file name to write aggregated fdata to StringRef OutputFDataName; @@ -104,14 +112,23 @@ class DataAggregator : public DataReader { /// Memory map info for a single file struct MMapInfo { - int64_t PID{-1LL}; + pid_t PID{-1LL}; uint64_t BaseAddress; uint64_t Size; + bool Forked{false}; + uint64_t Time{0ULL}; // time in micro seconds }; /// Per-PID map info for the binary std::unordered_map BinaryMMapInfo; + /// Fork event info + struct ForkInfo { + pid_t ParentPID; + pid_t ChildPID; + uint64_t Time{0ULL}; + }; + /// References to core BOLT data structures BinaryContext *BC{nullptr}; std::map *BFs{nullptr}; @@ -123,22 +140,14 @@ class DataAggregator : public DataReader { /// Looks into system PATH for Linux Perf and set up the aggregator to use it void findPerfExecutable(); - /// Launch a subprocess to read all perf branch samples and write them to an - /// output file we will parse later 
- bool launchPerfBranchEventsNoWait(); - - /// Launch a subprocess to read all perf memory event samples and write them - /// to an output file we will parse later - bool launchPerfMemEventsNoWait(); - - /// Launch a subprocess to read memory mapping for the binary. We later use - /// PIDs to filter samples, and memory mapping to adjust addresses. - bool launchPerfMMapEventsNoWait(); + /// Launch a perf subprocess with given args and save output for later + /// parsing. + void launchPerfProcess(StringRef Name, PerfProcessInfo &PPI, + const char *ArgsString, bool Wait); /// Delete all temporary files created to hold the output generated by spawned /// subprocesses during the aggregation job void deleteTempFiles(); - void deleteTempFile(StringRef File); // Semantic pass helpers /// Look up which function contains an address by using out map of @@ -219,11 +228,21 @@ class DataAggregator : public DataReader { /// On success return a pair. ErrorOr> parseMMapEvent(); + /// Parse PERF_RECORD_FORK event. + Optional parseForkEvent(); + + /// Parse 'PERF_RECORD_COMM exec'. Don't consume the string. + Optional parseCommExecEvent(); + /// Parse the full output generated by `perf script --show-mmap-events` /// to generate mapping between binary files and their memory mappings for /// all PIDs. std::error_code parseMMapEvents(); + /// Parse output of `perf script --show-task-events`, and forked processes + /// to the set of tracked PIDs. 
+ std::error_code parseTaskEvents(); + /// Parse a single pair of binary full path and associated build-id Optional> parseNameBuildIDPair(); @@ -300,6 +319,8 @@ class DataAggregator : public DataReader { DataAggregator(raw_ostream &Diag, StringRef BinaryName) : DataReader(Diag), BinaryName(llvm::sys::path::filename(BinaryName)) {} + ~DataAggregator(); + /// Set the file name to save aggregate data to void setOutputFDataName(StringRef Name) { OutputFDataName = Name; } From 5ee7b525e21428c0542f3d69aa6b3d1ef15670bc Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 15 Jan 2019 23:43:40 -0800 Subject: [PATCH 498/904] [perf2bolt] Optimize memory usage in perf2bolt Summary: While converting perf profile, we only need CFG for functions that were profiled and can skip building CFG for the rest. This saves us some processing time and memory. Breakdown processing of perf.data into two steps. The first step parses the data, saves it in intermediate format, and marks functions with the profile. The second step attributes the profile to functions with CFG. When we disassemble and build CFG for functions in aggregate-only mode, we skip functions without the profile. (cherry picked from commit abc5713e75549a70609dcbe24928be732bf5eb1c) --- bolt/src/BinaryFunction.h | 16 ++ bolt/src/DataAggregator.cpp | 278 +++++++++++++----------- bolt/src/DataAggregator.h | 124 ++++++++--- bolt/src/Passes/IndirectCallPromotion.h | 1 - bolt/src/Passes/JTFootprintReduction.h | 1 - bolt/src/RewriteInstance.cpp | 52 +++-- bolt/src/RewriteInstance.h | 6 + 7 files changed, 300 insertions(+), 178 deletions(-) diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index a2b23c1af7f3..71e4a25ce5cf 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -315,6 +315,11 @@ class BinaryFunction { /// True if the function has more than one entry point. bool IsMultiEntry{false}; + /// True if the function might have a profile available externally. 
+ /// Used to check if processing of the function is required under certain + /// conditions. + bool HasProfileAvailable{false}; + /// Indicate if the function body was folded into another function. Used /// for ICF optimization without relocations. bool IsFolded{false}; @@ -1266,6 +1271,12 @@ class BinaryFunction { return IsMultiEntry; } + /// Return true if the function might have a profile available externally, + /// but not yet populated into the function. + bool hasProfileAvailable() const { + return HasProfileAvailable; + } + bool isFolded() const { return IsFolded; } @@ -1601,6 +1612,11 @@ class BinaryFunction { return *this; } + BinaryFunction &setHasProfileAvailable(bool V = true) { + HasProfileAvailable = V; + return *this; + } + BinaryFunction &setFolded(bool Folded = true) { IsFolded = Folded; return *this; diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index 950e26944032..00d5451007bd 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -117,12 +117,12 @@ void DataAggregator::start(StringRef PerfDataFilename) { if (opts::BasicAggregation) { launchPerfProcess("events without LBR", - BranchEventsPPI, + MainEventsPPI, "script -F pid,event,ip", /*Wait = */false); } else { launchPerfProcess("branch events", - BranchEventsPPI, + MainEventsPPI, "script -F pid,brstack", /*Wait = */false); } @@ -154,7 +154,7 @@ void DataAggregator::abort() { // Kill subprocesses in case they are not finished sys::Wait(TaskEventsPPI.PI, 1, false, &Error); sys::Wait(MMapEventsPPI.PI, 1, false, &Error); - sys::Wait(BranchEventsPPI.PI, 1, false, &Error); + sys::Wait(MainEventsPPI.PI, 1, false, &Error); sys::Wait(MemEventsPPI.PI, 1, false, &Error); deleteTempFiles(); @@ -303,7 +303,7 @@ bool DataAggregator::checkPerfDataMagic(StringRef FileName) { return false; } -bool DataAggregator::processPreAggregated() { +void DataAggregator::parsePreAggregated() { std::string Error; auto MB = MemoryBuffer::getFileOrSTDIN(PerfDataFilename); @@ 
-317,31 +317,22 @@ bool DataAggregator::processPreAggregated() { ParsingBuf = FileBuf->getBuffer(); Col = 0; Line = 1; - if (parseAggregatedLBRSamples()) { + if (parsePreAggregatedLBRSamples()) { errs() << "PERF2BOLT: failed to parse samples\n"; exit(1); } - - // Mark all functions with registered events as having a valid profile. - for (auto &BFI : *BFs) { - auto &BF = BFI.second; - if (BF.getBranchData()) { - const auto Flags = opts::BasicAggregation ? BinaryFunction::PF_SAMPLE - : BinaryFunction::PF_LBR; - BF.markProfiled(Flags); - } - } - - return true; } -bool DataAggregator::aggregate(BinaryContext &BC, - std::map &BFs) { +void DataAggregator::parseProfile( + BinaryContext &BC, + std::map &BFs) { this->BC = &BC; this->BFs = &BFs; - if (opts::ReadPreAggregated) - return processPreAggregated(); + if (opts::ReadPreAggregated) { + parsePreAggregated(); + return; + } auto prepareToParse = [&] (StringRef Name, PerfProcessInfo &Process) { std::string Error; @@ -391,22 +382,12 @@ bool DataAggregator::aggregate(BinaryContext &BC, errs() << "PERF2BOLT: failed to parse task events\n"; } - prepareToParse("events", BranchEventsPPI); + prepareToParse("events", MainEventsPPI); if ((!opts::BasicAggregation && parseBranchEvents()) || (opts::BasicAggregation && parseBasicEvents())) { errs() << "PERF2BOLT: failed to parse samples\n"; } - // Mark all functions with registered events as having a valid profile. - for (auto &BFI : BFs) { - auto &BF = BFI.second; - if (BF.getBranchData()) { - const auto Flags = opts::BasicAggregation ? 
BinaryFunction::PF_SAMPLE - : BinaryFunction::PF_LBR; - BF.markProfiled(Flags); - } - } - // Special handling for memory events std::string Error; auto PI = sys::Wait(MemEventsPPI.PI, 0, true, &Error); @@ -424,7 +405,7 @@ bool DataAggregator::aggregate(BinaryContext &BC, errs() << ErrBuf; exit(1); } - return true; + return; } ErrorOr> MB = @@ -446,8 +427,36 @@ bool DataAggregator::aggregate(BinaryContext &BC, } deleteTempFiles(); +} - return true; +void DataAggregator::processProfile( + BinaryContext &BC, + std::map &BFs) { + if (opts::ReadPreAggregated) + processPreAggregated(); + else if (opts::BasicAggregation) + processBasicEvents(); + else + processBranchEvents(); + + processMemEvents(); + + // Mark all functions with registered events as having a valid profile. + for (auto &BFI : BFs) { + auto &BF = BFI.second; + if (BF.getBranchData()) { + const auto Flags = opts::BasicAggregation ? BinaryFunction::PF_SAMPLE + : BinaryFunction::PF_LBR; + BF.markProfiled(Flags); + } + } + + // Release intermediate storage. 
+ clear(BranchLBRs); + clear(FallthroughLBRs); + clear(AggregatedLBRs); + clear(BasicSamples); + clear(MemSamples); } BinaryFunction * @@ -659,7 +668,7 @@ void DataAggregator::consumeRestOfLine() { Line += 1; } -ErrorOr DataAggregator::parseBranchSample() { +ErrorOr DataAggregator::parseBranchSample() { PerfBranchSample Res; while (checkAndConsumeFS()) {} @@ -688,7 +697,7 @@ ErrorOr DataAggregator::parseBranchSample() { return Res; } -ErrorOr DataAggregator::parseBasicSample() { +ErrorOr DataAggregator::parseBasicSample() { while (checkAndConsumeFS()) {} auto PIDRes = parseNumberField(FieldSeparator, true); @@ -726,7 +735,7 @@ ErrorOr DataAggregator::parseBasicSample() { return PerfBasicSample{Event.get(), Address}; } -ErrorOr DataAggregator::parseMemSample() { +ErrorOr DataAggregator::parseMemSample() { PerfMemSample Res{0,0}; while (checkAndConsumeFS()) {} @@ -802,7 +811,8 @@ ErrorOr DataAggregator::parseLocationOrOffset() { return Location(true, BuildID.get(), Offset.get()); } -ErrorOr DataAggregator::parseAggregatedLBREntry() { +ErrorOr +DataAggregator::parseAggregatedLBREntry() { while (checkAndConsumeFS()) {} auto TypeOrErr = parseString(FieldSeparator); @@ -863,43 +873,14 @@ bool DataAggregator::hasData() { } std::error_code DataAggregator::parseBranchEvents() { - outs() << "PERF2BOLT: aggregating branch events...\n"; - NamedRegionTimer T("parseBranch", "Branch samples parsing", TimerGroupName, + outs() << "PERF2BOLT: parse branch events...\n"; + NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + + uint64_t NumTotalSamples{0}; uint64_t NumEntries{0}; uint64_t NumSamples{0}; uint64_t NumTraces{0}; - uint64_t NumTotalSamples{0}; - - struct Location { - uint64_t From; - uint64_t To; - Location(uint64_t From, uint64_t To) - : From(From), To(To) {} - bool operator==(const Location &Other) const { - return From == Other.From && To == Other.To; - } - }; - - struct LocationHash { - size_t 
operator()(const Location &L) const { - return std::hash()(L.From << 32 | L.To); - } - }; - - struct TraceInfo { - uint64_t InternCount{0}; - uint64_t ExternCount{0}; - }; - - struct BranchInfo { - uint64_t TakenCount{0}; - uint64_t MispredCount{0}; - }; - - /// Map location to counters. - std::unordered_map BranchLBRs; - std::unordered_map FallthroughLBRs; while (hasData()) { ++NumTotalSamples; @@ -925,7 +906,7 @@ std::error_code DataAggregator::parseBranchEvents() { const auto TraceTo = NextLBR->From; const auto *TraceBF = getBinaryFunctionContainingAddress(TraceFrom); if (TraceBF && TraceBF->containsAddress(TraceTo)) { - auto &Info = FallthroughLBRs[Location(TraceFrom, TraceTo)]; + auto &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)]; if (TraceBF->containsAddress(LBR.From)) { ++Info.InternCount; } else { @@ -950,30 +931,18 @@ std::error_code DataAggregator::parseBranchEvents() { To = 0; if (!From && !To) continue; - auto &Info = BranchLBRs[Location(From, To)]; + auto &Info = BranchLBRs[Trace(From, To)]; ++Info.TakenCount; Info.MispredCount += LBR.Mispred; } } - for (const auto &AggrLBR : FallthroughLBRs) { - auto &Loc = AggrLBR.first; - auto &Info = AggrLBR.second; - LBREntry First{Loc.From, Loc.From, false}; - LBREntry Second{Loc.To, Loc.To, false}; - if (Info.InternCount) { - doTrace(First, Second, Info.InternCount); - } - if (Info.ExternCount) { - First.From = 0; - doTrace(First, Second, Info.ExternCount); - } - } - - for (const auto &AggrLBR : BranchLBRs) { - auto &Loc = AggrLBR.first; - auto &Info = AggrLBR.second; - doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount); + for (const auto &LBR : BranchLBRs) { + const auto &Trace = LBR.first; + if (auto *BF = getBinaryFunctionContainingAddress(Trace.From)) + BF->setHasProfileAvailable(); + if (auto *BF = getBinaryFunctionContainingAddress(Trace.To)) + BF->setHasProfileAvailable(); } auto printColored = [](raw_ostream &OS, float Percent, float T1, float T2) { @@ -1031,22 +1000,59 @@ 
std::error_code DataAggregator::parseBranchEvents() { return std::error_code(); } +void DataAggregator::processBranchEvents() { + outs() << "PERF2BOLT: processing branch events...\n"; + NamedRegionTimer T("processBranch", "Processing branch events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + + for (const auto &AggrLBR : FallthroughLBRs) { + auto &Loc = AggrLBR.first; + auto &Info = AggrLBR.second; + LBREntry First{Loc.From, Loc.From, false}; + LBREntry Second{Loc.To, Loc.To, false}; + if (Info.InternCount) { + doTrace(First, Second, Info.InternCount); + } + if (Info.ExternCount) { + First.From = 0; + doTrace(First, Second, Info.ExternCount); + } + } + + for (const auto &AggrLBR : BranchLBRs) { + auto &Loc = AggrLBR.first; + auto &Info = AggrLBR.second; + doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount); + } +} + std::error_code DataAggregator::parseBasicEvents() { - outs() << "PERF2BOLT: aggregating basic events (without LBR)...\n"; - NamedRegionTimer T("parseBasic", "Perf samples parsing", TimerGroupName, + outs() << "PERF2BOLT: parsing basic events (without LBR)...\n"; + NamedRegionTimer T("parseBasic", "Parsing basic events", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); - uint64_t NumSamples{0}; - uint64_t OutOfRangeSamples{0}; while (hasData()) { - auto SampleRes = parseBasicSample(); - if (std::error_code EC = SampleRes.getError()) + auto Sample = parseBasicSample(); + if (std::error_code EC = Sample.getError()) return EC; - auto &Sample = SampleRes.get(); - if (!Sample.PC) + if (!Sample->PC) continue; - ++NumSamples; + if (auto *BF = getBinaryFunctionContainingAddress(Sample->PC)) + BF->setHasProfileAvailable(); + + BasicSamples.emplace_back(std::move(Sample.get())); + } + + return std::error_code(); +} + +void DataAggregator::processBasicEvents() { + outs() << "PERF2BOLT: processing basic events (without LBR)...\n"; + NamedRegionTimer T("processBasic", "Processing basic events", + TimerGroupName, TimerGroupDesc, 
opts::TimeAggregator); + uint64_t OutOfRangeSamples{0}; + for (auto &Sample : BasicSamples) { auto *Func = getBinaryFunctionContainingAddress(Sample.PC); if (!Func) { ++OutOfRangeSamples; @@ -1056,6 +1062,7 @@ std::error_code DataAggregator::parseBasicEvents() { doSample(*Func, Sample.PC); EventNames.insert(Sample.EventName); } + const auto NumSamples = BasicSamples.size(); outs() << "PERF2BOLT: read " << NumSamples << " samples\n"; outs() << "PERF2BOLT: out of range samples recorded in unknown regions: " @@ -1085,22 +1092,32 @@ std::error_code DataAggregator::parseBasicEvents() { "collection. The generated data may be ineffective for improving " "performance.\n\n"; } - - return std::error_code(); } std::error_code DataAggregator::parseMemEvents() { - outs() << "PERF2BOLT: aggregating memory events...\n"; - NamedRegionTimer T("memevents", "Mem samples parsing", TimerGroupName, + outs() << "PERF2BOLT: parsing memory events...\n"; + NamedRegionTimer T("parseMemEvents", "Parsing mem events", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); - while (hasData()) { - auto SampleRes = parseMemSample(); - if (std::error_code EC = SampleRes.getError()) + auto Sample = parseMemSample(); + if (std::error_code EC = Sample.getError()) return EC; - auto PC = SampleRes.get().PC; - auto Addr = SampleRes.get().Addr; + if (auto *BF = getBinaryFunctionContainingAddress(Sample->PC)) + BF->setHasProfileAvailable(); + + MemSamples.emplace_back(std::move(Sample.get())); + } + + return std::error_code(); +} + +void DataAggregator::processMemEvents() { + NamedRegionTimer T("ProcessMemEvents", "Processing mem events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + for (const auto &Sample : MemSamples) { + auto PC = Sample.PC; + auto Addr = Sample.Addr; StringRef FuncName; StringRef MemName; @@ -1138,24 +1155,35 @@ std::error_code DataAggregator::parseMemEvents() { DEBUG(dbgs() << "Mem event: " << FuncLoc << " = " << AddrLoc << "\n"); } } - - return std::error_code(); } 
-std::error_code DataAggregator::parseAggregatedLBRSamples() { - outs() << "PERF2BOLT: aggregating...\n"; - NamedRegionTimer T("parseAggregated", "Aggregated LBR parsing", TimerGroupName, - TimerGroupDesc, opts::TimeAggregator); - uint64_t NumAggrEntries{0}; - uint64_t NumTraces{0}; +std::error_code DataAggregator::parsePreAggregatedLBRSamples() { + outs() << "PERF2BOLT: parsing pre-aggregated profile...\n"; + NamedRegionTimer T("parseAggregated", "Parsing aggregated branch events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); while (hasData()) { - auto AggrEntryRes = parseAggregatedLBREntry(); - if (std::error_code EC = AggrEntryRes.getError()) + auto AggrEntry = parseAggregatedLBREntry(); + if (std::error_code EC = AggrEntry.getError()) return EC; - auto &AggrEntry = AggrEntryRes.get(); + if (auto *BF = getBinaryFunctionContainingAddress(AggrEntry->From.Offset)) + BF->setHasProfileAvailable(); + if (auto *BF = getBinaryFunctionContainingAddress(AggrEntry->To.Offset)) + BF->setHasProfileAvailable(); + + AggregatedLBRs.emplace_back(std::move(AggrEntry.get())); + } + + return std::error_code(); +} - ++NumAggrEntries; +void DataAggregator::processPreAggregated() { + outs() << "PERF2BOLT: processing pre-aggregated profile...\n"; + NamedRegionTimer T("processAggregated", "Processing aggregated branch events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + + uint64_t NumTraces{0}; + for (const auto &AggrEntry : AggregatedLBRs) { switch (AggrEntry.EntryType) { case AggregatedLBREntry::BRANCH: doBranch(AggrEntry.From.Offset, AggrEntry.To.Offset, AggrEntry.Count, @@ -1174,7 +1202,9 @@ std::error_code DataAggregator::parseAggregatedLBRSamples() { } } } - outs() << "PERF2BOLT: read " << NumAggrEntries << " aggregated LBR entries\n"; + + outs() << "PERF2BOLT: read " << AggregatedLBRs.size() + << " aggregated LBR entries\n"; outs() << "PERF2BOLT: traces mismatching disassembled function contents: " << NumInvalidTraces; float Perc{0.0f}; @@ -1209,10 
+1239,6 @@ std::error_code DataAggregator::parseAggregatedLBRSamples() { outs() << format(" (%.1f%%)", NumLongRangeTraces * 100.0f / NumTraces); } outs() << "\n"; - - dump(); - - return std::error_code(); } Optional diff --git a/bolt/src/DataAggregator.h b/bolt/src/DataAggregator.h index 06f8fb77629f..d181e7312fd6 100644 --- a/bolt/src/DataAggregator.h +++ b/bolt/src/DataAggregator.h @@ -29,30 +29,6 @@ namespace bolt { class BinaryFunction; class BinaryContext; -struct PerfBranchSample { - SmallVector LBR; -}; - -struct PerfBasicSample { - StringRef EventName; - uint64_t PC; -}; - -struct PerfMemSample { - uint64_t PC; - uint64_t Addr; -}; - -/// Used for parsing specific pre-aggregated input files. -struct AggregatedLBREntry { - enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN }; - Location From; - Location To; - uint64_t Count; - uint64_t Mispreds; - Type EntryType; -}; - /// DataAggregator inherits all parsing logic from DataReader as well as /// its data structures used to represent aggregated profile data in memory. /// @@ -75,6 +51,69 @@ struct AggregatedLBREntry { /// specified by the user. class DataAggregator : public DataReader { + struct PerfBranchSample { + SmallVector LBR; + }; + + struct PerfBasicSample { + StringRef EventName; + uint64_t PC; + }; + + struct PerfMemSample { + uint64_t PC; + uint64_t Addr; + }; + + /// Used for parsing specific pre-aggregated input files. 
+ struct AggregatedLBREntry { + enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN }; + Location From; + Location To; + uint64_t Count; + uint64_t Mispreds; + Type EntryType; + }; + + struct Trace { + uint64_t From; + uint64_t To; + Trace(uint64_t From, uint64_t To) + : From(From), To(To) {} + bool operator==(const Trace &Other) const { + return From == Other.From && To == Other.To; + } + }; + + struct TraceHash { + size_t operator()(const Trace &L) const { + return std::hash()(L.From << 32 | L.To); + } + }; + + struct FTInfo { + uint64_t InternCount{0}; + uint64_t ExternCount{0}; + }; + + struct BranchInfo { + uint64_t TakenCount{0}; + uint64_t MispredCount{0}; + }; + + /// Intermediate storage for profile data. We save the results of parsing + /// and use them later for processing and assigning profile. + std::unordered_map BranchLBRs; + std::unordered_map FallthroughLBRs; + std::vector AggregatedLBRs; + std::vector BasicSamples; + std::vector MemSamples; + + template void clear(T& Container) { + T TempContainer; + TempContainer.swap(Container); + } + /// Perf utility full path name std::string PerfPath; @@ -83,13 +122,14 @@ class DataAggregator : public DataReader { /// Perf process spawning bookkeeping struct PerfProcessInfo { + bool IsFinished{false}; sys::ProcessInfo PI; SmallVector StdoutPath; SmallVector StderrPath; }; /// Process info for spawned processes - PerfProcessInfo BranchEventsPPI; + PerfProcessInfo MainEventsPPI; PerfProcessInfo MemEventsPPI; PerfProcessInfo MMapEventsPPI; PerfProcessInfo TaskEventsPPI; @@ -150,6 +190,7 @@ class DataAggregator : public DataReader { void deleteTempFiles(); // Semantic pass helpers + /// Look up which function contains an address by using out map of /// disassembled BinaryFunctions BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address); @@ -209,18 +250,23 @@ class DataAggregator : public DataReader { /// Parse a single LBR entry as output by perf script -Fbrstack ErrorOr parseLBREntry(); - /// 
Parse the full output generated by perf script to report LBR samples. + /// Parse and pre-aggregate branch events. std::error_code parseBranchEvents(); + /// Process all branch events. + void processBranchEvents(); + /// Parse the full output generated by perf script to report non-LBR samples. std::error_code parseBasicEvents(); + /// Process non-LBR events. + void processBasicEvents(); + /// Parse the full output generated by perf script to report memory events. std::error_code parseMemEvents(); - /// Parse the full output of pre-aggregated LBR samples generated by - /// an external tool. - std::error_code parseAggregatedLBRSamples(); + /// Process parsed memory events profile. + void processMemEvents(); /// Parse a single line of a PERF_RECORD_MMAP2 event looking for a mapping /// between the binary name and its memory layout in a process with a given @@ -293,7 +339,14 @@ class DataAggregator : public DataReader { /// F 41be90 41be90 4 /// B 4b1942 39b57f0 3 0 /// B 4b196f 4b19e0 2 0 - bool processPreAggregated(); + void parsePreAggregated(); + + /// Parse the full output of pre-aggregated LBR samples generated by + /// an external tool. + std::error_code parsePreAggregatedLBRSamples(); + + /// Process parsed pre-aggregated data. + void processPreAggregated(); /// If \p Address falls into the binary address space based on memory /// mapping info \p MMI, then adjust it for further processing by subtracting @@ -338,9 +391,14 @@ class DataAggregator : public DataReader { /// Dump data structures into a file readable by llvm-bolt std::error_code writeAggregatedFile() const; - /// Join child subprocesses and finalize aggregation populating data - /// structures - bool aggregate(BinaryContext &BC, std::map &BFs); + /// Parse profile and mark functions/objects with profile. + /// Don't assign profile to functions yet. + void parseProfile(BinaryContext &BC, + std::map &BFs); + + /// Populate functions with profile. 
+ void processProfile(BinaryContext &BC, + std::map &BFs); /// Check whether \p FileName is a perf.data file static bool checkPerfDataMagic(StringRef FileName); diff --git a/bolt/src/Passes/IndirectCallPromotion.h b/bolt/src/Passes/IndirectCallPromotion.h index a391aee6e53c..f8ea575b2fa2 100644 --- a/bolt/src/Passes/IndirectCallPromotion.h +++ b/bolt/src/Passes/IndirectCallPromotion.h @@ -15,7 +15,6 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_INDIRECT_CALL_PROMOTION_H #include "BinaryPasses.h" -#include "DataReader.h" namespace llvm { namespace bolt { diff --git a/bolt/src/Passes/JTFootprintReduction.h b/bolt/src/Passes/JTFootprintReduction.h index 81be253a6e3e..20b1da2eb4ae 100644 --- a/bolt/src/Passes/JTFootprintReduction.h +++ b/bolt/src/Passes/JTFootprintReduction.h @@ -16,7 +16,6 @@ #include "BinaryPasses.h" #include "DataflowInfoManager.h" -#include "DataReader.h" namespace llvm { namespace bolt { diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 7cacbd15d784..19ca5b10491d 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -381,11 +381,13 @@ TimeRewrite("time-rewrite", // Check against lists of functions from options if we should // optimize the function with a given name. bool shouldProcess(const BinaryFunction &Function) { - if (opts::MaxFunctions && Function.getFunctionNumber() >= opts::MaxFunctions) { - if (Function.getFunctionNumber() == opts::MaxFunctions) + if (opts::MaxFunctions && + Function.getFunctionNumber() >= opts::MaxFunctions) { + if (Function.getFunctionNumber() == opts::MaxFunctions) { dbgs() << "BOLT-INFO: processing ending on " << Function << "\n"; - else + } else { return false; + } } auto populateFunctionNames = [](cl::opt &FunctionNamesFile, @@ -786,6 +788,20 @@ void RewriteInstance::reset() { LocationListWriter.reset(); } +bool RewriteInstance::shouldDisassemble(BinaryFunction &BF) const { + // If we have to relocate the code we have to disassemble all functions. 
+ if (!BF.getBinaryContext().HasRelocations && !opts::shouldProcess(BF)) { + DEBUG(dbgs() << "BOLT: skipping processing function " << BF + << " per user request.\n"); + return false; + } + + if (opts::AggregateOnly && !BF.hasProfileAvailable()) + return false; + + return true; +} + void RewriteInstance::discoverStorage() { NamedRegionTimer T("discoverStorage", "discover storage", TimerGroupName, TimerGroupDesc, opts::TimeRewrite); @@ -987,6 +1003,7 @@ void RewriteInstance::run() { readSpecialSections(); adjustCommandLineOptions(); discoverFileObjects(); + preprocessProfileData(); readDebugInfo(); disassembleFunctions(); processProfileData(); @@ -2411,11 +2428,20 @@ void RewriteInstance::readDebugInfo() { BC->preprocessDebugInfo(BinaryFunctions); } +void RewriteInstance::preprocessProfileData() { + if (!DA.started()) + return; + + NamedRegionTimer T("preprocessprofile", "pre-process profile data", + TimerGroupName, TimerGroupDesc, opts::TimeRewrite); + DA.parseProfile(*BC.get(), BinaryFunctions); +} + void RewriteInstance::processProfileData() { + NamedRegionTimer T("processprofile", "process profile data", TimerGroupName, + TimerGroupDesc, opts::TimeRewrite); if (DA.started()) { - NamedRegionTimer T("aggregate", "aggregate data", TimerGroupName, - TimerGroupDesc, opts::TimeRewrite); - DA.aggregate(*BC.get(), BinaryFunctions); + DA.processProfile(*BC.get(), BinaryFunctions); for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; @@ -2428,9 +2454,6 @@ void RewriteInstance::processProfileData() { } } } else { - NamedRegionTimer T("readprofile", "read profile data", TimerGroupName, - TimerGroupDesc, opts::TimeRewrite); - if (!opts::BoltProfile.empty()) { ProfileReader PR; auto EC = PR.readProfile(opts::BoltProfile, BinaryFunctions); @@ -2473,10 +2496,8 @@ void RewriteInstance::disassembleFunctions() { for (auto &BFI : BinaryFunctions) { BinaryFunction &Function = BFI.second; - // If we have to relocate the code we have to disassemble all functions. 
- if (!BC->HasRelocations && !opts::shouldProcess(Function)) { - DEBUG(dbgs() << "BOLT: skipping processing function " - << Function << " per user request.\n"); + if (!shouldDisassemble(Function)) { + Function.setSimple(false); continue; } @@ -2567,11 +2588,8 @@ void RewriteInstance::disassembleFunctions() { for (auto &BFI : BinaryFunctions) { BinaryFunction &Function = BFI.second; - if (!BC->HasRelocations && !opts::shouldProcess(Function)) { - DEBUG(dbgs() << "BOLT: skipping processing function " - << Function << " per user request.\n"); + if (!shouldDisassemble(Function)) continue; - } if (!Function.isSimple()) { assert((!BC->HasRelocations || Function.getSize() == 0) && diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index c97892aa81b3..19e9f6cb221c 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -142,6 +142,9 @@ class RewriteInstance { /// Read information from debug sections. void readDebugInfo(); + /// Read profile data without having disassembled functions available. + void preprocessProfileData(); + /// Associate profile data with binary objects. void processProfileData(); @@ -381,6 +384,9 @@ class RewriteInstance { uint64_t SymbolSize = 0, uint16_t Alignment = 0); + /// Return true if the function \p BF should be disassembled. + bool shouldDisassemble(BinaryFunction &BF) const; + public: /// When updating debug info, these are the sections we overwrite. static constexpr const char *SectionsToOverwrite[] = { From 88f400dd9ac7df5ac0fcb4d72e750b3686c688ae Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 22 Jan 2019 17:21:45 -0800 Subject: [PATCH 499/904] [perf2bolt] Add support for generating autofdo input Summary: Autofdo tools support. 
(cherry picked from commit 302106953a641e6269fb6fdaa8f148073dd17402) --- bolt/src/DataAggregator.cpp | 121 +++++++++++++++++++++++++++++++++--- bolt/src/DataAggregator.h | 8 ++- bolt/src/DataReader.cpp | 6 +- bolt/src/DataReader.h | 2 +- 4 files changed, 121 insertions(+), 16 deletions(-) diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index 00d5451007bd..29b21609819a 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -45,6 +45,13 @@ BasicAggregation("nl", cl::ZeroOrMore, cl::cat(AggregatorCategory)); +static cl::opt +WriteAutoFDOData("autofdo", + cl::desc("generate autofdo textual data instead of bolt data"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(AggregatorCategory)); + static cl::opt ReadPreAggregated("pa", cl::desc("skip perf and read data from a pre-aggregated file format"), @@ -123,7 +130,7 @@ void DataAggregator::start(StringRef PerfDataFilename) { } else { launchPerfProcess("branch events", MainEventsPPI, - "script -F pid,brstack", + "script -F pid,ip,brstack", /*Wait = */false); } @@ -323,6 +330,75 @@ void DataAggregator::parsePreAggregated() { } } +std::error_code DataAggregator::writeAutoFDOData() { + outs() << "PERF2BOLT: writing data for autofdo tools...\n"; + NamedRegionTimer T("writeAutoFDO", "Processing branch events", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + + std::error_code EC; + raw_fd_ostream OutFile(OutputFDataName, EC, sys::fs::OpenFlags::F_None); + if (EC) + return EC; + + // Format: + // number of unique traces + // from_1-to_1:count_1 + // from_2-to_2:count_2 + // ...... + // from_n-to_n:count_n + // number of unique sample addresses + // addr_1:count_1 + // addr_2:count_2 + // ...... + // addr_n:count_n + // number of unique LBR entries + // src_1->dst_1:count_1 + // src_2->dst_2:count_2 + // ...... 
+ // src_n->dst_n:count_n + + const uint64_t FirstAllocAddress = this->BC->FirstAllocAddress; + + // AutoFDO addresses are relative to the first allocated loadable program + // segment + auto filterAddress = [&FirstAllocAddress](uint64_t Address) -> uint64_t { + if (Address < FirstAllocAddress) + return 0; + return Address - FirstAllocAddress; + }; + + OutFile << FallthroughLBRs.size() << "\n"; + for (const auto &AggrLBR : FallthroughLBRs) { + auto &Trace = AggrLBR.first; + auto &Info = AggrLBR.second; + OutFile << Twine::utohexstr(filterAddress(Trace.From)) << "-" + << Twine::utohexstr(filterAddress(Trace.To)) << ":" + << (Info.InternCount + Info.ExternCount) << "\n"; + } + + OutFile << BasicSamples.size() << "\n"; + for (const auto &Sample : BasicSamples) { + auto PC = Sample.first; + auto HitCount = Sample.second; + OutFile << Twine::utohexstr(filterAddress(PC)) << ":" << HitCount << "\n"; + } + + OutFile << BranchLBRs.size() << "\n"; + for (const auto &AggrLBR : BranchLBRs) { + auto &Trace = AggrLBR.first; + auto &Info = AggrLBR.second; + OutFile << Twine::utohexstr(filterAddress(Trace.From)) << "->" + << Twine::utohexstr(filterAddress(Trace.To)) << ":" + << Info.TakenCount << "\n"; + } + + outs() << "PERF2BOLT: wrote " << FallthroughLBRs.size() << " unique traces, " + << BasicSamples.size() << " sample addresses and " << BranchLBRs.size() + << " unique branches to " << OutputFDataName << "\n"; + + return std::error_code(); +} + void DataAggregator::parseProfile( BinaryContext &BC, std::map &BFs) { @@ -388,6 +464,15 @@ void DataAggregator::parseProfile( errs() << "PERF2BOLT: failed to parse samples\n"; } + // We can finish early if the goal is just to generate data for autofdo + if (opts::WriteAutoFDOData) { + if (std::error_code EC = writeAutoFDOData()) { + errs() << "Error writing autofdo data to file: " << EC.message() << "\n"; + } + deleteTempFiles(); + exit(0); + } + // Special handling for memory events std::string Error; auto PI = 
sys::Wait(MemEventsPPI.PI, 0, true, &Error); @@ -475,8 +560,8 @@ DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) { return &FI->second; } -bool -DataAggregator::doSample(BinaryFunction &Func, uint64_t Address) { +bool DataAggregator::doSample(BinaryFunction &Func, uint64_t Address, + uint64_t Count) { auto I = FuncsToSamples.find(Func.getNames()[0]); if (I == FuncsToSamples.end()) { bool Success; @@ -485,7 +570,7 @@ DataAggregator::doSample(BinaryFunction &Func, uint64_t Address) { FuncSampleData(Func.getNames()[0], FuncSampleData::ContainerTy()))); } - I->second.bumpCount(Address - Func.getAddress()); + I->second.bumpCount(Address - Func.getAddress(), Count); return true; } @@ -682,6 +767,16 @@ ErrorOr DataAggregator::parseBranchSample() { return Res; } + while (checkAndConsumeFS()) {} + + auto PCRes = parseHexField(FieldSeparator, true); + if (std::error_code EC = PCRes.getError()) + return EC; + Res.PC = PCRes.get(); + + if (checkAndConsumeNewLine()) + return Res; + while (!checkAndConsumeNewLine()) { checkAndConsumeFS(); @@ -890,6 +985,9 @@ std::error_code DataAggregator::parseBranchEvents() { return EC; auto &Sample = SampleRes.get(); + if (opts::WriteAutoFDOData) + ++BasicSamples[Sample.PC]; + if (Sample.LBR.empty()) continue; @@ -1041,7 +1139,8 @@ std::error_code DataAggregator::parseBasicEvents() { if (auto *BF = getBinaryFunctionContainingAddress(Sample->PC)) BF->setHasProfileAvailable(); - BasicSamples.emplace_back(std::move(Sample.get())); + ++BasicSamples[Sample->PC]; + EventNames.insert(Sample->EventName); } return std::error_code(); @@ -1052,17 +1151,19 @@ void DataAggregator::processBasicEvents() { NamedRegionTimer T("processBasic", "Processing basic events", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); uint64_t OutOfRangeSamples{0}; + uint64_t NumSamples{0}; for (auto &Sample : BasicSamples) { - auto *Func = getBinaryFunctionContainingAddress(Sample.PC); + const auto PC = Sample.first; + const auto HitCount = 
Sample.second; + NumSamples += HitCount; + auto *Func = getBinaryFunctionContainingAddress(PC); if (!Func) { - ++OutOfRangeSamples; + OutOfRangeSamples += HitCount; continue; } - doSample(*Func, Sample.PC); - EventNames.insert(Sample.EventName); + doSample(*Func, PC, HitCount); } - const auto NumSamples = BasicSamples.size(); outs() << "PERF2BOLT: read " << NumSamples << " samples\n"; outs() << "PERF2BOLT: out of range samples recorded in unknown regions: " diff --git a/bolt/src/DataAggregator.h b/bolt/src/DataAggregator.h index d181e7312fd6..151dc68e77ba 100644 --- a/bolt/src/DataAggregator.h +++ b/bolt/src/DataAggregator.h @@ -53,6 +53,7 @@ class DataAggregator : public DataReader { struct PerfBranchSample { SmallVector LBR; + uint64_t PC; }; struct PerfBasicSample { @@ -106,7 +107,7 @@ class DataAggregator : public DataReader { std::unordered_map BranchLBRs; std::unordered_map FallthroughLBRs; std::vector AggregatedLBRs; - std::vector BasicSamples; + std::unordered_map BasicSamples; std::vector MemSamples; template void clear(T& Container) { @@ -197,7 +198,7 @@ class DataAggregator : public DataReader { /// Semantic actions - parser hooks to interpret parsed perf samples /// Register a sample (non-LBR mode), i.e. a new hit at \p Address - bool doSample(BinaryFunction &Func, const uint64_t Address); + bool doSample(BinaryFunction &Func, const uint64_t Address, uint64_t Count); /// Register an intraprocedural branch \p Branch. bool doIntraBranch(BinaryFunction &Func, uint64_t From, uint64_t To, @@ -256,6 +257,9 @@ class DataAggregator : public DataReader { /// Process all branch events. void processBranchEvents(); + /// This member function supports generating data for AutoFDO LLVM tools. + std::error_code writeAutoFDOData(); + /// Parse the full output generated by perf script to report non-LBR samples. 
std::error_code parseBasicEvents(); diff --git a/bolt/src/DataReader.cpp b/bolt/src/DataReader.cpp index 335119e30c3e..be179e482918 100644 --- a/bolt/src/DataReader.cpp +++ b/bolt/src/DataReader.cpp @@ -118,15 +118,15 @@ FuncSampleData::getSamples(uint64_t Start, uint64_t End) const { return Result; } -void FuncSampleData::bumpCount(uint64_t Offset) { +void FuncSampleData::bumpCount(uint64_t Offset, uint64_t Count) { auto Iter = Index.find(Offset); if (Iter == Index.end()) { - Data.emplace_back(Location(true, Name, Offset), 1); + Data.emplace_back(Location(true, Name, Offset), Count); Index[Offset] = Data.size() - 1; return; } auto &SI = Data[Iter->second]; - ++SI.Hits; + SI.Hits += Count; } void FuncBranchData::bumpBranchCount(uint64_t OffsetFrom, uint64_t OffsetTo, diff --git a/bolt/src/DataReader.h b/bolt/src/DataReader.h index fe5c6a548ffd..50b901b9f5b5 100644 --- a/bolt/src/DataReader.h +++ b/bolt/src/DataReader.h @@ -285,7 +285,7 @@ struct FuncSampleData { /// Aggregation helper DenseMap Index; - void bumpCount(uint64_t Offset); + void bumpCount(uint64_t Offset, uint64_t Count); }; //===----------------------------------------------------------------------===// From 7175dc2cf04c7c13adf83c82aabd49a874c2229a Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 28 Jan 2019 13:46:18 -0800 Subject: [PATCH 500/904] [BOLT] For non-simple functions always update jump tables in-place Summary: For non-simple function we can miss a reference to a jump table or to an indirect goto table. If we move the jump table, the missed reference will not get updated, and the corresponding indirect jump will end up in the old (wrong) location. Updating the original jump table in-place should take care of the issue. 
(cherry picked from commit e20ca789b545f259432fe51a81fd34c90fdeaec4) --- bolt/src/BinaryFunction.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 33c623056e7c..648425760efc 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -3757,7 +3757,7 @@ void BinaryFunction::emitJumpTables(MCStreamer *Streamer) { auto &JT = *JTI.second; if (opts::PrintJumpTables) JT.print(outs()); - if (opts::JumpTables == JTS_BASIC && BC.HasRelocations) { + if ((opts::JumpTables == JTS_BASIC || !isSimple()) && BC.HasRelocations) { JT.updateOriginal(); } else { MCSection *HotSection, *ColdSection; From 27c3634178c628844885a665ca5aaf814be829d5 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 31 Jan 2019 11:23:02 -0800 Subject: [PATCH 501/904] [BOLT] New inliner implementation Summary: Addresses correctness issues related to inlining. Inlining heuristics are not part of this diff. (cherry picked from commit d8fcb4f968dfe63b00da0b25975f432208c121d8) --- bolt/src/BinaryBasicBlock.cpp | 39 +- bolt/src/BinaryBasicBlock.h | 52 +- bolt/src/BinaryContext.cpp | 17 +- bolt/src/BinaryContext.h | 23 +- bolt/src/BinaryFunction.cpp | 61 +- bolt/src/BinaryFunction.h | 40 +- bolt/src/BinaryPassManager.cpp | 21 +- bolt/src/MCPlusBuilder.cpp | 26 + bolt/src/MCPlusBuilder.h | 27 +- bolt/src/Passes/AllocCombiner.cpp | 2 +- bolt/src/Passes/BinaryFunctionCallGraph.cpp | 4 +- bolt/src/Passes/BinaryPasses.cpp | 94 +- bolt/src/Passes/BinaryPasses.h | 31 - bolt/src/Passes/FrameOptimizer.cpp | 4 +- bolt/src/Passes/IndirectCallPromotion.cpp | 2 +- bolt/src/Passes/Inliner.cpp | 937 ++++++++++---------- bolt/src/Passes/Inliner.h | 110 +-- bolt/src/Passes/JTFootprintReduction.cpp | 29 +- bolt/src/Passes/JTFootprintReduction.h | 6 +- bolt/src/Passes/LongJmp.cpp | 2 +- bolt/src/Passes/ShrinkWrapping.cpp | 10 +- bolt/src/Target/X86/X86MCPlusBuilder.cpp | 11 +- 22 files changed, 776 insertions(+), 772 
deletions(-) diff --git a/bolt/src/BinaryBasicBlock.cpp b/bolt/src/BinaryBasicBlock.cpp index 2fdf00878962..6c4439e77944 100644 --- a/bolt/src/BinaryBasicBlock.cpp +++ b/bolt/src/BinaryBasicBlock.cpp @@ -321,7 +321,7 @@ void BinaryBasicBlock::removeDuplicateConditionalSuccessor(MCInst *CondBranch) { const auto CondBI = BranchInfo[0]; const auto UncondBI = BranchInfo[1]; - eraseInstruction(CondBranch); + eraseInstruction(findInstruction(CondBranch)); Successors.clear(); BranchInfo.clear(); @@ -334,6 +334,23 @@ void BinaryBasicBlock::removeDuplicateConditionalSuccessor(MCInst *CondBranch) { BranchInfo.push_back({Count, 0}); } +void BinaryBasicBlock::adjustExecutionCount(double Ratio) { + auto adjustedCount = [&](uint64_t Count) -> uint64_t { + auto NewCount = Count * Ratio; + if (!NewCount && Count && (Ratio > 0.0)) + NewCount = 1; + return NewCount; + }; + + setExecutionCount(adjustedCount(getKnownExecutionCount())); + for (auto &BI : branch_info()) { + if (BI.Count != COUNT_NO_PROFILE) + BI.Count = adjustedCount(BI.Count); + if (BI.MispredictedCount != COUNT_INFERRED) + BI.MispredictedCount = adjustedCount(BI.MispredictedCount); + } +} + bool BinaryBasicBlock::analyzeBranch(const MCSymbol *&TBB, const MCSymbol *&FBB, MCInst *&CondBranch, @@ -543,5 +560,25 @@ BinaryBasicBlock::getBranchInfo(const MCSymbol *Label) { return *BI; } +BinaryBasicBlock *BinaryBasicBlock::splitAt(iterator II) { + assert(II != end() && "expected iterator pointing to instruction"); + + auto *NewBlock = getFunction()->addBasicBlock(0); + + // Adjust successors/predecessors and propagate the execution count. + moveAllSuccessorsTo(NewBlock); + addSuccessor(NewBlock, getExecutionCount(), 0); + + // Set correct CFI state for the new block. + NewBlock->setCFIState(getCFIStateAtInstr(&*II)); + + // Move instructions over. 
+ adjustNumPseudos(II, end(), -1); + NewBlock->addInstructions(II, end()); + Instructions.erase(II, end()); + + return NewBlock; +} + } // namespace bolt } // namespace llvm diff --git a/bolt/src/BinaryBasicBlock.h b/bolt/src/BinaryBasicBlock.h index 7a60cc5e27e9..2fbfb9c38c33 100644 --- a/bolt/src/BinaryBasicBlock.h +++ b/bolt/src/BinaryBasicBlock.h @@ -427,7 +427,7 @@ class BinaryBasicBlock { } /// Add instruction at the end of this basic block. - /// Returns the index of the instruction in the Instructions vector of the BB. + /// Returns iterator pointing to the inserted instruction. iterator addInstruction(MCInst &&Inst) { adjustNumPseudos(Inst, 1); Instructions.emplace_back(Inst); @@ -435,7 +435,7 @@ class BinaryBasicBlock { } /// Add instruction at the end of this basic block. - /// Returns the index of the instruction in the Instructions vector of the BB. + /// Returns iterator pointing to the inserted instruction. iterator addInstruction(const MCInst &Inst) { adjustNumPseudos(Inst, 1); Instructions.push_back(Inst); @@ -489,6 +489,10 @@ class BinaryBasicBlock { auto RII = getLastNonPseudo(); return RII == Instructions.rend() ? nullptr : &*RII; } + const MCInst *getLastNonPseudoInstr() const { + auto RII = getLastNonPseudo(); + return RII == Instructions.rend() ? nullptr : &*RII; + } /// Set CFI state at entry to this basic block. void setCFIState(int32_t NewCFIState) { @@ -633,6 +637,9 @@ class BinaryBasicBlock { ExecutionCount = Count; } + /// Apply a given \p Ratio to the profile information of this basic block. + void adjustExecutionCount(double Ratio); + bool isEntryPoint() const { return IsEntryPoint; } @@ -670,18 +677,14 @@ class BinaryBasicBlock { } /// Erase pseudo instruction at a given iterator. + /// Return iterator following the removed instruction. iterator erasePseudoInstruction(iterator II) { --NumPseudos; return Instructions.erase(II); } - /// Erase given (non-pseudo) instruction if found. 
- /// Warning: this will invalidate succeeding instruction pointers. - bool eraseInstruction(MCInst *Inst) { - return replaceInstruction(Inst, std::vector()); - } - /// Erase non-pseudo instruction at a given iterator \p II. + /// Return iterator following the removed instruction. iterator eraseInstruction(iterator II) { adjustNumPseudos(*II, -1); return Instructions.erase(II); @@ -691,11 +694,11 @@ class BinaryBasicBlock { template void eraseInstructions(ItrType Begin, ItrType End) { while (End > Begin) { - eraseInstruction(*--End); + eraseInstruction(findInstruction(*--End)); } } - /// Erase all instructions + /// Erase all instructions. void clear() { Instructions.clear(); NumPseudos = 0; @@ -711,25 +714,9 @@ class BinaryBasicBlock { : Instructions.begin() + Index; } - /// Replace an instruction with a sequence of instructions. Returns true - /// if the instruction to be replaced was found and replaced. - template - bool replaceInstruction(const MCInst *Inst, Itr Begin, Itr End) { - auto I = findInstruction(Inst); - if (I != Instructions.end()) { - adjustNumPseudos(*Inst, -1); - Instructions.insert(Instructions.erase(I), Begin, End); - adjustNumPseudos(Begin, End, 1); - return true; - } - return false; - } - - bool replaceInstruction(const MCInst *Inst, - const std::vector &Replacement) { - return replaceInstruction(Inst, Replacement.begin(), Replacement.end()); - } - + /// Replace instruction referenced by iterator \II with a sequence of + /// instructions defined by [\p Begin, \p End] range. + /// /// Return iterator pointing to the first inserted instruction. template iterator replaceInstruction(iterator II, Itr Begin, Itr End) { @@ -779,6 +766,13 @@ class BinaryBasicBlock { return SplitInst; } + /// Split basic block at the instruction pointed to by II. + /// All iterators pointing after II get invalidated. + /// + /// Return the new basic block that starts with the instruction + /// at the split point. 
+ BinaryBasicBlock *splitAt(iterator II); + /// Sets address of the basic block in the output. void setOutputStartAddress(uint64_t Address) { OutputAddressRange.first = Address; diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index 1b04d90e22b2..2e1a63dadbd2 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -876,15 +876,14 @@ void BinaryContext::printInstruction(raw_ostream &OS, if (MIB->isTailCall(Instruction)) OS << " # TAILCALL "; if (MIB->isInvoke(Instruction)) { - if (const auto EHInfo = MIB->getEHInfo(Instruction)) { - OS << " # handler: "; - if (EHInfo->first) - OS << *EHInfo->first; - else - OS << '0'; - OS << "; action: " << EHInfo->second; - } - auto GnuArgsSize = MIB->getGnuArgsSize(Instruction); + const auto EHInfo = MIB->getEHInfo(Instruction); + OS << " # handler: "; + if (EHInfo->first) + OS << *EHInfo->first; + else + OS << '0'; + OS << "; action: " << EHInfo->second; + const auto GnuArgsSize = MIB->getGnuArgsSize(Instruction); if (GnuArgsSize >= 0) OS << "; GNU_args_size = " << GnuArgsSize; } diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index fe7517da8bb6..2075821a4163 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -676,6 +676,16 @@ class BinaryContext { /// size is for the cold one. std::pair calculateEmittedSize(BinaryFunction &BF); + /// Calculate the size of the instruction \p Inst. + uint64_t computeInstructionSize(const MCInst &Inst) const { + SmallString<256> Code; + SmallVector Fixups; + raw_svector_ostream VecOS(Code); + MCE->encodeInstruction(Inst, VecOS, Fixups, *STI); + + return Code.size(); + } + /// Compute the native code size for a range of instructions. 
/// Note: this can be imprecise wrt the final binary since happening prior to /// relaxation, as well as wrt the original binary because of opcode @@ -684,16 +694,9 @@ class BinaryContext { uint64_t computeCodeSize(Itr Beg, Itr End) const { uint64_t Size = 0; while (Beg != End) { - // Calculate the size of the instruction. - SmallString<256> Code; - SmallVector Fixups; - raw_svector_ostream VecOS(Code); - if (MIB->isCFI(*Beg) || MIB->isEHLabel(*Beg)) { - ++Beg; - continue; - } - MCE->encodeInstruction(*Beg++, VecOS, Fixups, *STI); - Size += Code.size(); + if (!MII->get(Beg->getOpcode()).isPseudo()) + Size += computeInstructionSize(*Beg); + ++Beg; } return Size; } diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 648425760efc..bc3ecd4c9b48 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -1882,6 +1882,8 @@ bool BinaryFunction::buildCFG() { updateLayoutIndices(); + normalizeCFIState(); + // Clean-up memory taken by intermediate structures. // // NB: don't clear Labels list as we may need them if we mark the function @@ -2543,33 +2545,23 @@ BinaryFunction::unwindCFIState(int32_t FromState, int32_t ToState, return NewStates; } -bool BinaryFunction::fixCFIState() { - DEBUG(dbgs() << "Trying to fix CFI states for each BB after reordering.\n"); - DEBUG(dbgs() << "This is the list of CFI states for each BB of " << *this - << ": "); - - std::stack Stack; - auto &OriginalBBOrder = BasicBlocksPreviousLayout.empty() - ? BasicBlocksLayout - : BasicBlocksPreviousLayout; - +void BinaryFunction::normalizeCFIState() { // Reordering blocks with remember-restore state instructions can be specially // tricky. When rewriting the CFI, we omit remember-restore state instructions // entirely. For restore state, we build a map expanding each restore to the // equivalent unwindCFIState sequence required at that point to achieve the // same effect of the restore. All remember state are then just ignored. 
- for (BinaryBasicBlock *CurBB : OriginalBBOrder) { + std::stack Stack; + for (BinaryBasicBlock *CurBB : BasicBlocksLayout) { for (auto II = CurBB->begin(); II != CurBB->end(); ++II) { if (auto *CFI = getCFIFor(*II)) { if (CFI->getOperation() == MCCFIInstruction::OpRememberState) { Stack.push(II->getOperand(0).getImm()); - BC.MIB->addAnnotation(*II, "DeleteMe", 0U); continue; } if (CFI->getOperation() == MCCFIInstruction::OpRestoreState) { const int32_t RememberState = Stack.top(); const int32_t CurState = II->getOperand(0).getImm(); - BC.MIB->addAnnotation(*II, "DeleteMe", 0U); FrameRestoreEquivalents[CurState] = unwindCFIState(CurState, RememberState, CurBB, II); Stack.pop(); @@ -2577,6 +2569,12 @@ bool BinaryFunction::fixCFIState() { } } } +} + +bool BinaryFunction::finalizeCFIState() { + DEBUG(dbgs() << "Trying to fix CFI states for each BB after reordering.\n"); + DEBUG(dbgs() << "This is the list of CFI states for each BB of " << *this + << ": "); int32_t State = 0; bool SeenCold = false; @@ -2614,10 +2612,18 @@ bool BinaryFunction::fixCFIState() { } DEBUG(dbgs() << "\n"); - for (auto BB : BasicBlocksLayout) - for (auto I = BB->rbegin(), E = BB->rend(); I != E; ++I) - if (BC.MIB->hasAnnotation(*I, "DeleteMe")) - BB->eraseInstruction(&*I); + for (auto BB : BasicBlocksLayout) { + for (auto II = BB->begin(); II != BB->end(); ) { + auto CFI = getCFIFor(*II); + if (CFI && + (CFI->getOperation() == MCCFIInstruction::OpRememberState || + CFI->getOperation() == MCCFIInstruction::OpRestoreState)) { + II = BB->eraseInstruction(II); + } else { + ++II; + } + } + } return true; } @@ -3225,7 +3231,7 @@ void BinaryFunction::fixBranches() { // We will create unconditional branch with correct destination if needed. if (UncondBranch) - BB->eraseInstruction(UncondBranch); + BB->eraseInstruction(BB->findInstruction(UncondBranch)); // Basic block that follows the current one in the final layout. 
const BinaryBasicBlock *NextBB = nullptr; @@ -3238,7 +3244,7 @@ void BinaryFunction::fixBranches() { // one valid successor. Since behaviour is undefined - we replace // the conditional branch with an unconditional if required. if (CondBranch) - BB->eraseInstruction(CondBranch); + BB->eraseInstruction(BB->findInstruction(CondBranch)); if (BB->getSuccessor() == NextBB) continue; BB->addBranchInstruction(BB->getSuccessor()); @@ -3738,6 +3744,23 @@ SMLoc BinaryFunction::emitLineInfo(SMLoc NewLoc, SMLoc PrevLoc) const { return NewLoc; } +void BinaryFunction::adjustExecutionCount(uint64_t Count) { + if (getKnownExecutionCount() == 0 || Count == 0) + return; + + if (ExecutionCount < Count) + Count = ExecutionCount; + + double AdjustmentRatio = ((double) ExecutionCount - Count) / ExecutionCount; + if (AdjustmentRatio < 0.0) + AdjustmentRatio = 0.0; + + for (auto &BB : layout()) + BB->adjustExecutionCount(AdjustmentRatio); + + ExecutionCount -= Count; +} + BinaryFunction::~BinaryFunction() { for (auto BB : BasicBlocks) { delete BB; diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index 71e4a25ce5cf..b405ab56ae6c 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -122,6 +122,11 @@ class DynoStats { #define Radd(a, b) (a + b) #define Rsub(a, b) (a - b) DYNO_STATS +#undef Rsub +#undef Radd +#undef F +#undef Fsub +#undef Fadd #undef Fn #undef D default: @@ -433,9 +438,6 @@ class BinaryFunction { /// Synchronize branch instructions with CFG. void postProcessBranches(); - /// Recompute landing pad information for the function and all its blocks. - void recomputeLandingPads(); - /// Temporary holder of offsets that are potentially entry points. std::unordered_set EntryOffsets; @@ -670,6 +672,9 @@ class BinaryFunction { Instructions.emplace(Offset, std::forward(Instruction)); } + /// Convert CFI instructions to a standard form (remove remember/restore). 
+ void normalizeCFIState(); + /// Analyze and process indirect branch \p Instruction before it is /// added to Instructions list. IndirectBranchType processIndirectBranch(MCInst &Instruction, @@ -747,6 +752,12 @@ class BinaryFunction { BinaryBasicBlock &front() { return *BasicBlocks.front(); } const BinaryBasicBlock & back() const { return *BasicBlocks.back(); } BinaryBasicBlock & back() { return *BasicBlocks.back(); } + inline iterator_range blocks() { + return iterator_range(begin(), end()); + } + inline iterator_range blocks() const { + return iterator_range(begin(), end()); + } order_iterator layout_begin() { return BasicBlocksLayout.begin(); } const_order_iterator layout_begin() const @@ -822,6 +833,9 @@ class BinaryFunction { } } + /// Recompute landing pad information for the function and all its blocks. + void recomputeLandingPads(); + /// Return current basic block layout. const BasicBlockOrderType &getLayout() const { return BasicBlocksLayout; @@ -1360,18 +1374,25 @@ class BinaryFunction { /// on the alignment of the existing offset. /// /// Returns NULL if basic block already exists at the \p Offset. 
- BinaryBasicBlock *addBasicBlock(uint64_t Offset, MCSymbol *Label, + BinaryBasicBlock *addBasicBlock(uint64_t Offset, MCSymbol *Label = nullptr, bool DeriveAlignment = false) { assert((CurrentState == State::CFG || !getBasicBlockAtOffset(Offset)) && "basic block already exists in pre-CFG state"); + + if (!Label) + Label = BC.Ctx->createTempSymbol("BB", true); + auto BBPtr = createBasicBlock(Offset, Label, DeriveAlignment); BasicBlocks.emplace_back(BBPtr.release()); - auto BB = BasicBlocks.back(); + auto *BB = BasicBlocks.back(); BB->setIndex(BasicBlocks.size() - 1); if (CurrentState == State::Disassembled) { BasicBlockOffsets.emplace_back(std::make_pair(Offset, BB)); + } else if (CurrentState == State::CFG) { + BB->setLayoutIndex(layout_size()); + BasicBlocksLayout.emplace_back(BB); } assert(CurrentState == State::CFG || @@ -1686,6 +1707,13 @@ class BinaryFunction { return *this; } + /// Adjust execution count for the function by a given \p Count. The value + /// \p Count will be subtracted from the current function count. + /// + /// The function will proportionally adjust execution count for all + /// basic blocks and edges in the control flow graph. + void adjustExecutionCount(uint64_t Count); + /// Set LSDA address for the function. BinaryFunction &setLSDAAddress(uint64_t Address) { LSDAAddress = Address; @@ -2048,7 +2076,7 @@ class BinaryFunction { /// After reordering, this function checks the state of CFI and fixes it if it /// is corrupted. If it is unable to fix it, it returns false. - bool fixCFIState(); + bool finalizeCFIState(); /// Adjust branch instructions to match the CFG. 
/// diff --git a/bolt/src/BinaryPassManager.cpp b/bolt/src/BinaryPassManager.cpp index c8ce0eb2c214..49e7d5bcf62c 100644 --- a/bolt/src/BinaryPassManager.cpp +++ b/bolt/src/BinaryPassManager.cpp @@ -63,12 +63,6 @@ ICF("icf", cl::ZeroOrMore, cl::cat(BoltOptCategory)); -static cl::opt -InlineSmallFunctions("inline-small-functions", - cl::desc("inline functions with a single basic block"), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - static cl::opt JTFootprintReductionFlag("jt-footprint-reduction", cl::desc("make jump tables size smaller at the cost of using more " @@ -90,12 +84,6 @@ NeverPrint("never-print", cl::ReallyHidden, cl::cat(BoltOptCategory)); -static cl::opt -OptimizeBodylessFunctions("optimize-bodyless-functions", - cl::desc("optimize functions that just do a tail call"), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); - static cl::opt PrintAfterBranchFixup("print-after-branch-fixup", cl::desc("print function after fixing local branches"), @@ -398,13 +386,6 @@ void BinaryFunctionPassManager::runAllPasses( llvm::make_unique(PrintJTFootprintReduction), opts::JTFootprintReductionFlag); - Manager.registerPass(llvm::make_unique(PrintInline), - opts::InlineSmallFunctions); - - Manager.registerPass( - llvm::make_unique(PrintOptimizeBodyless), - opts::OptimizeBodylessFunctions); - Manager.registerPass( llvm::make_unique(PrintSimplifyROLoads), opts::SimplifyRODataLoads); @@ -412,6 +393,8 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass(llvm::make_unique(PrintRegReAssign), opts::RegReAssign); + Manager.registerPass(llvm::make_unique(PrintInline)); + Manager.registerPass(llvm::make_unique(PrintICF), opts::ICF); diff --git a/bolt/src/MCPlusBuilder.cpp b/bolt/src/MCPlusBuilder.cpp index 0e1949299066..2158a5b3af75 100644 --- a/bolt/src/MCPlusBuilder.cpp +++ b/bolt/src/MCPlusBuilder.cpp @@ -249,6 +249,14 @@ void MCPlusBuilder::removeAllAnnotations(MCInst &Inst) { Inst.erase(std::prev(Inst.end())); } +void MCPlusBuilder::stripAnnotations(MCInst 
&Inst) { + auto *AnnotationInst = getAnnotationInst(Inst); + if (!AnnotationInst) + return; + + Inst.erase(std::prev(Inst.end())); +} + void MCPlusBuilder::printAnnotations(const MCInst &Inst, raw_ostream &OS) const { const auto *AnnotationInst = getAnnotationInst(Inst); @@ -353,6 +361,24 @@ void MCPlusBuilder::getUsedRegs(const MCInst &Inst, BitVector &Regs) const { } } +bool MCPlusBuilder::hasDefOfPhysReg(const MCInst &MI, unsigned Reg) const { + const auto &InstInfo = Info->get(MI.getOpcode()); + return InstInfo.hasDefOfPhysReg(MI, Reg, *RegInfo); +} + +bool MCPlusBuilder::hasUseOfPhysReg(const MCInst &MI, unsigned Reg) const { + const auto &InstInfo = Info->get(MI.getOpcode()); + for (int I = InstInfo.NumDefs; I < InstInfo.NumOperands; ++I) + if (MI.getOperand(I).isReg() && + RegInfo->isSubRegisterEq(Reg, MI.getOperand(I).getReg())) + return true; + if (const uint16_t *ImpUses = InstInfo.ImplicitUses) + for (; *ImpUses; ++ImpUses) + if (*ImpUses == Reg || RegInfo->isSubRegister(Reg, *ImpUses)) + return true; + return false; +} + const BitVector & MCPlusBuilder::getAliases(MCPhysReg Reg, bool OnlySmaller) const { diff --git a/bolt/src/MCPlusBuilder.h b/bolt/src/MCPlusBuilder.h index d4beb0862228..851fec65c53d 100644 --- a/bolt/src/MCPlusBuilder.h +++ b/bolt/src/MCPlusBuilder.h @@ -334,6 +334,11 @@ class MCPlusBuilder { return false; } + /// Return true of the instruction is of pseudo kind. + bool isPseudo(const MCInst &Inst) const { + return Info->get(Inst.getOpcode()).isPseudo(); + } + /// Creates x86 pause instruction. virtual void createPause(MCInst &Inst) const { llvm_unreachable("not implemented"); @@ -1093,6 +1098,14 @@ class MCPlusBuilder { /// but only if they are strictly smaller than the actual reg virtual void getUsedRegs(const MCInst &Inst, BitVector &Regs) const; + /// Return true if this instruction defines the specified physical + /// register either explicitly or implicitly. 
+ virtual bool hasDefOfPhysReg(const MCInst &MI, unsigned Reg) const; + + /// Return true if this instruction uses the specified physical + /// register either explicitly or implicitly. + virtual bool hasUseOfPhysReg(const MCInst &MI, unsigned Reg) const; + /// Replace displacement in compound memory operand with given \p Label. bool replaceMemOperandDisp(MCInst &Inst, const MCSymbol *Label, MCContext *Ctx) const { @@ -1227,7 +1240,6 @@ class MCPlusBuilder { virtual int getShortJmpEncodingSize() const { llvm_unreachable("not implemented"); - return 0; } virtual int getUncondBranchEncodingSize() const { @@ -1257,6 +1269,16 @@ class MCPlusBuilder { return false; } + /// Creates a new call instruction in Inst and sets its operand to + /// Target. + /// + /// Returns true on success. + virtual bool createCall(MCInst &Inst, const MCSymbol *Target, + MCContext *Ctx) { + llvm_unreachable("not implemented"); + return false; + } + /// Creates a new tail call instruction in Inst and sets its operand to /// Target. /// @@ -1562,6 +1584,9 @@ class MCPlusBuilder { /// Remove all meta-data annotations from Inst. void removeAllAnnotations(MCInst &Inst); + /// Remove meta-data, but don't destroy it. + void stripAnnotations(MCInst &Inst); + /// This method takes an indirect call instruction and splits it up into an /// equivalent set of instructions that use direct calls for target /// symbols/addresses that are contained in the Targets vector. 
This is done diff --git a/bolt/src/Passes/AllocCombiner.cpp b/bolt/src/Passes/AllocCombiner.cpp index b18289566989..2513ea1fc88b 100644 --- a/bolt/src/Passes/AllocCombiner.cpp +++ b/bolt/src/Passes/AllocCombiner.cpp @@ -92,7 +92,7 @@ void AllocCombinerPass::combineAdjustments(BinaryContext &BC, Inst.dump(); }); - BB.eraseInstruction(Prev); + BB.eraseInstruction(BB.findInstruction(Prev)); ++NumCombined; FuncsChanged.insert(&BF); Prev = &Inst; diff --git a/bolt/src/Passes/BinaryFunctionCallGraph.cpp b/bolt/src/Passes/BinaryFunctionCallGraph.cpp index ccfecc3521d5..538b7378b668 100644 --- a/bolt/src/Passes/BinaryFunctionCallGraph.cpp +++ b/bolt/src/Passes/BinaryFunctionCallGraph.cpp @@ -114,9 +114,7 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, // accumulate the number of calls from the callsite into the function // samples. Results from perfomance testing seem to favor the zero // count though, so I'm leaving it this way for now. - const auto Samples = - Function->hasProfile() ? 
Function->getExecutionCount() : 0; - return Cg.addNode(Function, Size, Samples); + return Cg.addNode(Function, Size, Function->getKnownExecutionCount()); } else { return Id; } diff --git a/bolt/src/Passes/BinaryPasses.cpp b/bolt/src/Passes/BinaryPasses.cpp index 2e2b65db6e80..9eb6dfbbffef 100644 --- a/bolt/src/Passes/BinaryPasses.cpp +++ b/bolt/src/Passes/BinaryPasses.cpp @@ -14,7 +14,7 @@ #include "llvm/Support/Options.h" #include -#define DEBUG_TYPE "bolt" +#define DEBUG_TYPE "bolt-opts" using namespace llvm; @@ -257,88 +257,6 @@ bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const { return BF.isSimple() && opts::shouldProcess(BF); } -void OptimizeBodylessFunctions::analyze( - BinaryFunction &BF, - BinaryContext &BC, - std::map &BFs) { - if (BF.size() != 1 || BF.front().getNumNonPseudos() != 1) - return; - - const auto *FirstInstr = BF.front().getFirstNonPseudoInstr(); - if (!FirstInstr) - return; - if (!BC.MIB->isTailCall(*FirstInstr)) - return; - const auto *TargetSymbol = BC.MIB->getTargetSymbol(*FirstInstr); - if (!TargetSymbol) - return; - const auto *Function = BC.getFunctionForSymbol(TargetSymbol); - if (!Function) - return; - - EquivalentCallTarget[BF.getSymbol()] = Function; -} - -void OptimizeBodylessFunctions::optimizeCalls(BinaryFunction &BF, - BinaryContext &BC) { - for (auto *BB : BF.layout()) { - for (auto &Inst : *BB) { - if (!BC.MIB->isCall(Inst)) - continue; - const auto *OriginalTarget = BC.MIB->getTargetSymbol(Inst); - if (!OriginalTarget) - continue; - const auto *Target = OriginalTarget; - // Iteratively update target since we could have f1() calling f2() - // calling f3() calling f4() and we want to output f1() directly - // calling f4(). 
- unsigned CallSites = 0; - while (EquivalentCallTarget.count(Target)) { - Target = EquivalentCallTarget.find(Target)->second->getSymbol(); - ++CallSites; - } - if (Target == OriginalTarget) - continue; - DEBUG(dbgs() << "BOLT-DEBUG: Optimizing " << BB->getName() - << " (executed " << BB->getKnownExecutionCount() - << " times) in " << BF - << ": replacing call to " << OriginalTarget->getName() - << " by call to " << Target->getName() - << " while folding " << CallSites << " call sites\n"); - BC.MIB->replaceBranchTarget(Inst, Target, BC.Ctx.get()); - - NumOptimizedCallSites += CallSites; - if (BB->hasProfile()) { - NumEliminatedCalls += CallSites * BB->getExecutionCount(); - } - } - } -} - -void OptimizeBodylessFunctions::runOnFunctions( - BinaryContext &BC, - std::map &BFs, - std::set &) { - for (auto &It : BFs) { - auto &Function = It.second; - if (shouldOptimize(Function)) { - analyze(Function, BC, BFs); - } - } - for (auto &It : BFs) { - auto &Function = It.second; - if (shouldOptimize(Function)) { - optimizeCalls(Function, BC); - } - } - - if (NumEliminatedCalls || NumOptimizedCallSites) { - outs() << "BOLT-INFO: optimized " << NumOptimizedCallSites - << " redirect call sites to eliminate " << NumEliminatedCalls - << " dynamic calls.\n"; - } -} - void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) { if (Function.layout_size() > 0) { unsigned Count; @@ -660,7 +578,7 @@ void FinalizeFunctions::runOnFunctions( continue; // Fix the CFI state. - if (ShouldOptimize && !Function.fixCFIState()) { + if (ShouldOptimize && !Function.finalizeCFIState()) { if (BC.HasRelocations) { errs() << "BOLT-ERROR: unable to fix CFI state for function " << Function << ". 
Exiting.\n"; @@ -779,7 +697,7 @@ uint64_t fixDoubleJumps(BinaryContext &BC, if (Branch && BC.MIB->isUnconditionalBranch(*Branch)) { assert(BC.MIB->getTargetSymbol(*Branch) == BB.getLabel()); Pred->removeSuccessor(&BB); - Pred->eraseInstruction(Branch); + Pred->eraseInstruction(Pred->findInstruction(Branch)); Pred->addTailCallInstruction(SuccSym); } else { return false; @@ -1029,7 +947,7 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, if (UncondBranch) { if (HasFallthrough) - PredBB->eraseInstruction(UncondBranch); + PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch)); else MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), @@ -1388,7 +1306,7 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC, ); auto SFI = ProfiledFunctions.begin(); auto SFIend = ProfiledFunctions.end(); - for (auto i = 0u; i < opts::TopCalledLimit && SFI != SFIend; ++SFI, ++i) { + for (auto I = 0u; I < opts::TopCalledLimit && SFI != SFIend; ++SFI, ++I) { outs() << " " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n'; } @@ -1458,7 +1376,7 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC, outs() << " are:\n"; auto SFI = Functions.begin(); - for (unsigned i = 0; i < 100 && SFI != Functions.end(); ++SFI, ++i) { + for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) { const auto Stats = (*SFI)->getDynoStats(); outs() << " " << **SFI; if (!SortAll) { diff --git a/bolt/src/Passes/BinaryPasses.h b/bolt/src/Passes/BinaryPasses.h index 8caade87dd8a..53068af0a0f7 100644 --- a/bolt/src/Passes/BinaryPasses.h +++ b/bolt/src/Passes/BinaryPasses.h @@ -95,37 +95,6 @@ class DynoStatsPrintPass : public BinaryFunctionPass { } }; -/// Detects functions that simply do a tail call when they are called and -/// optimizes calls to these functions. 
-class OptimizeBodylessFunctions : public BinaryFunctionPass { -private: - /// EquivalentCallTarget[F] = G ==> function F is simply a tail call to G, - /// thus calls to F can be optimized to calls to G. - std::unordered_map - EquivalentCallTarget; - - void analyze(BinaryFunction &BF, - BinaryContext &BC, - std::map &BFs); - - void optimizeCalls(BinaryFunction &BF, - BinaryContext &BC); - - /// Stats for eliminated calls. - uint64_t NumEliminatedCalls{0}; - uint64_t NumOptimizedCallSites{0}; - -public: - explicit OptimizeBodylessFunctions(const cl::opt &PrintPass) - : BinaryFunctionPass(PrintPass) { } - const char *getName() const override { - return "optimize-bodyless"; - } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; -}; - /// Detect and eliminate unreachable basic blocks. We could have those /// filled with nops and they are used for alignment. class EliminateUnreachableBlocks : public BinaryFunctionPass { diff --git a/bolt/src/Passes/FrameOptimizer.cpp b/bolt/src/Passes/FrameOptimizer.cpp index d885f831d0b7..93818c8c416d 100644 --- a/bolt/src/Passes/FrameOptimizer.cpp +++ b/bolt/src/Passes/FrameOptimizer.cpp @@ -154,7 +154,7 @@ void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA, // TODO: Implement an interface of eraseInstruction that works out the // complete list of elements to remove. 
for (auto I : ToErase) { - I.first->eraseInstruction(I.second); + I.first->eraseInstruction(I.first->findInstruction(I.second)); } } @@ -214,7 +214,7 @@ void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA, } for (auto I : ToErase) { - I.first->eraseInstruction(I.second); + I.first->eraseInstruction(I.first->findInstruction(I.second)); } if (Changed) { DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n"); diff --git a/bolt/src/Passes/IndirectCallPromotion.cpp b/bolt/src/Passes/IndirectCallPromotion.cpp index fb530ce2a177..941d091f7d69 100644 --- a/bolt/src/Passes/IndirectCallPromotion.cpp +++ b/bolt/src/Passes/IndirectCallPromotion.cpp @@ -728,7 +728,7 @@ IndirectCallPromotion::rewriteCall( IndCallBlock->addInstructions(ICPcode.front().second.begin(), ICPcode.front().second.end()); } else { - IndCallBlock->replaceInstruction(&IndCallBlock->back(), + IndCallBlock->replaceInstruction(std::prev(IndCallBlock->end()), ICPcode.front().second); } IndCallBlock->addInstructions(TailInsts.begin(), TailInsts.end()); diff --git a/bolt/src/Passes/Inliner.cpp b/bolt/src/Passes/Inliner.cpp index abde11258782..4836240c662a 100644 --- a/bolt/src/Passes/Inliner.cpp +++ b/bolt/src/Passes/Inliner.cpp @@ -7,11 +7,26 @@ // //===----------------------------------------------------------------------===// // +// The current inliner has a limited callee support +// (see Inliner::getInliningInfo() for the most up-to-date details): +// +// * No exception handling +// * No jump tables +// * Single entry point +// * CFI update not supported - breaks unwinding +// * Regular Call Sites: +// - only leaf functions (or callees with only tail calls) +// * no invokes (they can't be tail calls) +// - no direct use of %rsp +// * Tail Call Sites: +// - since the stack is unmodified, the regular call limitations are lifted +// //===----------------------------------------------------------------------===// #include "Inliner.h" #include "MCPlus.h" #include 
"llvm/Support/Options.h" +#include #define DEBUG_TYPE "bolt-inliner" @@ -22,10 +37,9 @@ namespace opts { extern cl::OptionCategory BoltOptCategory; static cl::opt -AggressiveInlining("aggressive-inlining", - cl::desc("perform aggressive inlining"), +AdjustProfile("inline-ap", + cl::desc("adjust function profile after inlining"), cl::ZeroOrMore, - cl::Hidden, cl::cat(BoltOptCategory)); static cl::list @@ -36,581 +50,544 @@ ForceInlineFunctions("force-inline", cl::Hidden, cl::cat(BoltOptCategory)); +static cl::opt +InlineAll("inline-all", + cl::desc("inline all functions"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +InlineIgnoreLeafCFI("inline-ignore-leaf-cfi", + cl::desc("inline leaf functions with CFI programs (can break unwinding)"), + cl::init(true), + cl::ZeroOrMore, + cl::ReallyHidden, + cl::cat(BoltOptCategory)); + +static cl::opt +InlineIgnoreCFI("inline-ignore-cfi", + cl::desc("inline functions with CFI programs (can break exception handling)"), + cl::init(false), + cl::ZeroOrMore, + cl::ReallyHidden, + cl::cat(BoltOptCategory)); + +static cl::opt +InlineLimit("inline-limit", + cl::desc("maximum number of call sites to inline"), + cl::init(0), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +InlineMaxIters("inline-max-iters", + cl::desc("maximum number of inline iterations"), + cl::init(3), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +InlineSmallFunctions("inline-small-functions", + cl::desc("inline functions if increase in size is less than defined by " + "-inline-small-functions-bytes"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +static cl::opt +InlineSmallFunctionsBytes("inline-small-functions-bytes", + cl::desc("max number of bytes for the function to be considered small for " + "inlining purposes"), + cl::init(4), + cl::ZeroOrMore, + cl::Hidden, + cl::cat(BoltOptCategory)); + +static cl::opt +NoInline("no-inline", + 
cl::desc("disable all inlining (overrides other inlining options)"), + cl::init(false), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +/// This function returns true if any of inlining options are specified and the +/// inlining pass should be executed. Whenever a new inlining option is added, +/// this function should reflect the change. +bool inliningEnabled() { + return !NoInline && + (InlineAll || + InlineSmallFunctions || + !ForceInlineFunctions.empty()); +} + +bool mustConsider(const llvm::bolt::BinaryFunction &Function) { + for (auto &Name : opts::ForceInlineFunctions) { + if (Function.hasName(Name)) + return true; + } + return false; +} + +void syncOptions() { + if (opts::InlineIgnoreCFI) + opts::InlineIgnoreLeafCFI = true; + + if (opts::InlineAll) + opts::InlineSmallFunctions = true; } +} // namespace opts + namespace llvm { namespace bolt { -void InlineSmallFunctions::findInliningCandidates( - BinaryContext &BC, - const std::map &BFs) { - for (const auto &BFIt : BFs) { - const auto &Function = BFIt.second; - if (!shouldOptimize(Function) || Function.size() != 1) - continue; - auto &BB = *Function.begin(); - const auto &LastInstruction = *BB.rbegin(); - // Check if the function is small enough, doesn't do a tail call - // and doesn't throw exceptions. 
- if (BB.size() > 0 && - BB.getNumNonPseudos() <= kMaxInstructions && - BB.lp_empty() && - BC.MIB->isReturn(LastInstruction) && - !BC.MIB->isTailCall(LastInstruction)) { - InliningCandidates.insert(&Function); - } - } +uint64_t Inliner::SizeOfCallInst; +uint64_t Inliner::SizeOfTailCallInst; + +uint64_t Inliner::getSizeOfCallInst(const BinaryContext &BC) { + if (SizeOfCallInst) + return SizeOfCallInst; + + MCInst Inst; + BC.MIB->createCall(Inst, BC.Ctx->createTempSymbol(), BC.Ctx.get()); + SizeOfCallInst = BC.computeInstructionSize(Inst); - DEBUG(dbgs() << "BOLT-DEBUG: " << InliningCandidates.size() - << " inlineable functions.\n"); + return SizeOfCallInst; } -void InlineSmallFunctions::findInliningCandidatesAggressive( - BinaryContext &BC, - const std::map &BFs) { - std::set OverwrittenFunctions = { - "_ZN4HPHP13hash_string_iEPKcj", - "_ZN4HPHP21hash_string_cs_unsafeEPKcj", - "_ZN4HPHP14hash_string_csEPKcj", - "_ZN4HPHP20hash_string_i_unsafeEPKcj", - "_ZNK4HPHP10StringData10hashHelperEv" - }; - for (const auto &BFIt : BFs) { - const auto &Function = BFIt.second; - if (!shouldOptimize(Function) || - OverwrittenFunctions.count(Function.getSymbol()->getName()) || - Function.hasEHRanges()) - continue; - uint64_t FunctionSize = 0; - for (const auto *BB : Function.layout()) { - FunctionSize += BC.computeCodeSize(BB->begin(), BB->end()); - } - assert(FunctionSize > 0 && "found empty function"); - if (FunctionSize > kMaxSize) - continue; - bool FoundCFI = false; - for (const auto BB : Function.layout()) { - for (const auto &Inst : *BB) { - if (BC.MIB->isEHLabel(Inst) || BC.MIB->isCFI(Inst)) { - FoundCFI = true; +uint64_t Inliner::getSizeOfTailCallInst(const BinaryContext &BC) { + if (SizeOfTailCallInst) + return SizeOfTailCallInst; + + MCInst Inst; + BC.MIB->createTailCall(Inst, BC.Ctx->createTempSymbol(), BC.Ctx.get()); + SizeOfTailCallInst = BC.computeInstructionSize(Inst); + + return SizeOfTailCallInst; +} + +Inliner::InliningInfo Inliner::getInliningInfo(const 
BinaryFunction &BF) const { + if (!shouldOptimize(BF)) + return INL_NONE; + + auto &BC = BF.getBinaryContext(); + bool DirectSP = false; + bool HasCFI = false; + bool IsLeaf = true; + + // Perform necessary checks unless the option overrides it. + if (!opts::mustConsider(BF)) { + if (BF.hasEHRanges()) + return INL_NONE; + + if (BF.isMultiEntry()) + return INL_NONE; + + if (BF.hasJumpTables()) + return INL_NONE; + + const auto SPReg = BC.MIB->getStackPointer(); + for (const auto *BB : BF.layout()) { + for (auto &Inst : *BB) { + // Tail calls are marked as implicitly using the stack pointer and they + // could be inlined. + if (BC.MIB->isTailCall(Inst)) break; + + if (BC.MIB->isCFI(Inst)) { + HasCFI = true; + continue; } + + if (BC.MIB->isCall(Inst)) + IsLeaf = false; + + // Push/pop instructions are straightforward to handle. + if (BC.MIB->isPush(Inst) || BC.MIB->isPop(Inst)) + continue; + + DirectSP |= BC.MIB->hasDefOfPhysReg(Inst, SPReg) || + BC.MIB->hasUseOfPhysReg(Inst, SPReg); } } - if (!FoundCFI) - InliningCandidates.insert(&Function); } - DEBUG(dbgs() << "BOLT-DEBUG: " << InliningCandidates.size() - << " inlineable functions.\n"); -} + if (HasCFI) { + if (!opts::InlineIgnoreLeafCFI) + return INL_NONE; -namespace { + if (!IsLeaf && !opts::InlineIgnoreCFI) + return INL_NONE; + } -/// Returns whether a function creates a stack frame for itself or not. -/// If so, we need to manipulate the stack pointer when calling this function. -/// Since we're only inlining very small functions, we return false for now, but -/// we could for instance check if the function starts with 'push ebp'. -/// TODO generalize this. -bool createsStackFrame(const BinaryBasicBlock &) { - return false; -} + InliningInfo Info(DirectSP ? 
INL_TAILCALL : INL_ANY); -} // namespace + auto Size = BF.estimateSize(); -void InlineSmallFunctions::inlineCall( - BinaryContext &BC, - BinaryBasicBlock &BB, - MCInst *CallInst, - const BinaryBasicBlock &InlinedFunctionBB) { - assert(BC.MIB->isCall(*CallInst) && "Can only inline a call."); - assert(BC.MIB->isReturn(*InlinedFunctionBB.rbegin()) && - "Inlined function should end with a return."); + Info.SizeAfterInlining = Size; + Info.SizeAfterTailCallInlining = Size; - std::vector InlinedInstance; + // Handle special case of the known size reduction. + if (BF.size() == 1) { + // For a regular call the last return instruction could be removed + // (or converted to a branch). + const auto *LastInst = BF.back().getLastNonPseudoInstr(); + if (LastInst && + BC.MIB->isReturn(*LastInst) && + !BC.MIB->isTailCall(*LastInst)) { + const auto RetInstSize = BC.computeInstructionSize(*LastInst); + assert(Size >= RetInstSize); + Info.SizeAfterInlining -= RetInstSize; + } + } - bool ShouldAdjustStack = createsStackFrame(InlinedFunctionBB); + return Info; +} - // Move stack like 'call' would if needed. 
- if (ShouldAdjustStack) { - MCInst StackInc; - BC.MIB->createStackPointerIncrement(StackInc); - InlinedInstance.push_back(StackInc); +void +Inliner::findInliningCandidates(BinaryContext &BC, + const std::map &BFs) { + for (const auto &BFI : BFs) { + const auto &Function = BFI.second; + const auto InlInfo = getInliningInfo(Function); + if (InlInfo.Type != INL_NONE) + InliningCandidates[&Function] = InlInfo; } +} - for (auto Instruction : InlinedFunctionBB) { - if (BC.MIB->isReturn(Instruction)) { - break; - } - if (!BC.MIB->isEHLabel(Instruction) && - !BC.MIB->isCFI(Instruction)) { - InlinedInstance.push_back(Instruction); +std::pair +Inliner::inlineCall(BinaryBasicBlock &CallerBB, + BinaryBasicBlock::iterator CallInst, + const BinaryFunction &Callee) { + auto &CallerFunction = *CallerBB.getFunction(); + auto &BC = CallerFunction.getBinaryContext(); + auto &MIB = *BC.MIB; + + assert(MIB.isCall(*CallInst) && "can only inline a call or a tail call"); + assert(!Callee.isMultiEntry() && + "cannot inline function with multiple entries"); + assert(!Callee.hasJumpTables() && + "cannot inline function with jump table(s)"); + + // Get information about the call site. + const auto CSIsInvoke = BC.MIB->isInvoke(*CallInst); + const auto CSIsTailCall = BC.MIB->isTailCall(*CallInst); + const auto CSGNUArgsSize = BC.MIB->getGnuArgsSize(*CallInst); + const auto CSEHInfo = BC.MIB->getEHInfo(*CallInst); + + // Split basic block at the call site if there will be more incoming edges + // coming from the callee. + BinaryBasicBlock *FirstInlinedBB = &CallerBB; + if (Callee.front().pred_size() && CallInst != CallerBB.begin()) { + FirstInlinedBB = CallerBB.splitAt(CallInst); + CallInst = FirstInlinedBB->begin(); + } + + // Split basic block after the call instruction unless the callee is trivial + // (i.e. consists of a single basic block). If necessary, obtain a basic block + // for return instructions in the callee to redirect to. 
+ BinaryBasicBlock *NextBB = nullptr; + if (Callee.size() > 1) { + if (std::next(CallInst) != FirstInlinedBB->end()) { + NextBB = FirstInlinedBB->splitAt(std::next(CallInst)); + } else { + NextBB = FirstInlinedBB->getSuccessor(); } } + if (NextBB) + FirstInlinedBB->removeSuccessor(NextBB); - // Move stack pointer like 'ret' would. - if (ShouldAdjustStack) { - MCInst StackDec; - BC.MIB->createStackPointerDecrement(StackDec); - InlinedInstance.push_back(StackDec); + // Remove the call instruction. + auto InsertII = FirstInlinedBB->eraseInstruction(CallInst); + + double ProfileRatio = 0; + if (auto CalleeExecCount = Callee.getKnownExecutionCount()) { + ProfileRatio = + (double) FirstInlinedBB->getKnownExecutionCount() / CalleeExecCount; } - BB.replaceInstruction(CallInst, InlinedInstance); -} + // Save execution count of the first block as we don't want it to change + // later due to profile adjustment rounding errors. + const auto FirstInlinedBBCount = FirstInlinedBB->getKnownExecutionCount(); -std::pair -InlineSmallFunctions::inlineCall( - BinaryContext &BC, - BinaryFunction &CallerFunction, - BinaryBasicBlock *CallerBB, - const unsigned CallInstIndex, - const BinaryFunction &InlinedFunction) { - // Get the instruction to be replaced with inlined code. - MCInst &CallInst = CallerBB->getInstructionAtIndex(CallInstIndex); - assert(BC.MIB->isCall(CallInst) && "Can only inline a call."); - - // Point in the function after the inlined code. - BinaryBasicBlock *AfterInlinedBB = nullptr; - unsigned AfterInlinedIstrIndex = 0; - - // In case of a tail call we should not remove any ret instructions from the - // inlined instance. - bool IsTailCall = BC.MIB->isTailCall(CallInst); - - // The first block of the function to be inlined can be merged with the caller - // basic block. This cannot happen if there are jumps to the first block. 
- bool CanMergeFirstInlinedBlock = (*InlinedFunction.begin()).pred_size() == 0; - - // If the call to be inlined is not at the end of its basic block and we have - // to inline more than one basic blocks (or even just one basic block that - // cannot be merged into the caller block), then the caller's basic block - // should be split. - bool ShouldSplitCallerBB = - CallInstIndex < CallerBB->size() - 1 && - (InlinedFunction.size() > 1 || !CanMergeFirstInlinedBlock); - - // Copy inlined function's basic blocks into a vector of basic blocks that - // will be inserted in the caller function (the inlined instance). Also, we - // keep a mapping from basic block index to the corresponding block in the - // inlined instance. - std::vector> InlinedInstance; + // Copy basic blocks and maintain a map from their origin. std::unordered_map InlinedBBMap; - - for (const auto InlinedFunctionBB : InlinedFunction.layout()) { - InlinedInstance.emplace_back(CallerFunction.createBasicBlock(0)); - InlinedBBMap[InlinedFunctionBB] = InlinedInstance.back().get(); - if (InlinedFunction.hasValidProfile()) { - const auto Count = InlinedFunctionBB->getExecutionCount(); - InlinedInstance.back()->setExecutionCount(Count); - } - } - if (ShouldSplitCallerBB) { - // Add one extra block at the inlined instance for the removed part of the - // caller block. 
- InlinedInstance.emplace_back(CallerFunction.createBasicBlock(0)); - if (CallerFunction.hasValidProfile()) { - const auto Count = CallerBB->getExecutionCount(); - InlinedInstance.back()->setExecutionCount(Count); + InlinedBBMap[&Callee.front()] = FirstInlinedBB; + for (auto BBI = std::next(Callee.begin()); BBI != Callee.end(); ++BBI) { + auto *InlinedBB = CallerFunction.addBasicBlock(0); + InlinedBBMap[&*BBI] = InlinedBB; + InlinedBB->setCFIState(FirstInlinedBB->getCFIState()); + if (Callee.hasValidProfile()) { + InlinedBB->setExecutionCount(BBI->getKnownExecutionCount()); + } else { + InlinedBB->setExecutionCount(FirstInlinedBBCount); } } - // Copy instructions to the basic blocks of the inlined instance. - bool First = true; - for (const auto InlinedFunctionBB : InlinedFunction.layout()) { - // Get the corresponding block of the inlined instance. - auto *InlinedInstanceBB = InlinedBBMap.at(InlinedFunctionBB); - bool IsExitingBlock = false; - - // Copy instructions into the inlined instance. - for (auto Instruction : *InlinedFunctionBB) { - if (!IsTailCall && - BC.MIB->isReturn(Instruction) && - !BC.MIB->isTailCall(Instruction)) { - // Skip returns when the caller does a normal call as opposed to a tail - // call. - IsExitingBlock = true; + // Copy over instructions and edges. + for (const auto &BB : Callee) { + auto *InlinedBB = InlinedBBMap[&BB]; + + if (InlinedBB != FirstInlinedBB) + InsertII = InlinedBB->begin(); + + // Copy over instructions making any necessary mods. + for (auto Inst : BB) { + if (MIB.isPseudo(Inst)) continue; + + MIB.stripAnnotations(Inst); + + // Fix branch target. Strictly speaking, we don't have to do this as + // targets of direct branches will be fixed later and don't matter + // in the CFG state. However, disassembly may look misleading, and + // hence we do the fixing. 
+ if (MIB.isBranch(Inst)) { + assert(!MIB.isIndirectBranch(Inst) && + "unexpected indirect branch in callee"); + const auto *TargetBB = + Callee.getBasicBlockForLabel(MIB.getTargetSymbol(Inst)); + assert(TargetBB && "cannot find target block in callee"); + MIB.replaceBranchTarget(Inst, InlinedBBMap[TargetBB]->getLabel(), + BC.Ctx.get()); } - if (!IsTailCall && - BC.MIB->isTailCall(Instruction)) { - // Convert tail calls to normal calls when the caller does a normal - // call. - if (!BC.MIB->convertTailCallToCall(Instruction)) - assert(false && "unexpected tail call opcode found"); - IsExitingBlock = true; + + if (CSIsTailCall || (!MIB.isCall(Inst) && !MIB.isReturn(Inst))) { + InsertII = std::next(InlinedBB->insertInstruction(InsertII, + std::move(Inst))); + continue; } - if (BC.MIB->isBranch(Instruction) && - !BC.MIB->isIndirectBranch(Instruction)) { - // Convert the branch targets in the branch instructions that will be - // added to the inlined instance. - const MCSymbol *OldTargetLabel = nullptr; - const MCSymbol *OldFTLabel = nullptr; - MCInst *CondBranch = nullptr; - MCInst *UncondBranch = nullptr; - const bool Result = BC.MIB->analyzeBranch(&Instruction, - &Instruction + 1, - OldTargetLabel, - OldFTLabel, CondBranch, - UncondBranch); - (void)Result; - assert(Result && - "analyzeBranch failed on instruction guaranteed to be a branch"); - assert(OldTargetLabel); - const MCSymbol *NewTargetLabel = nullptr; - for (const auto SuccBB : InlinedFunctionBB->successors()) { - if (SuccBB->getLabel() == OldTargetLabel) { - NewTargetLabel = InlinedBBMap.at(SuccBB)->getLabel(); - break; - } - } - assert(NewTargetLabel); - BC.MIB->replaceBranchTarget(Instruction, NewTargetLabel, BC.Ctx.get()); + + // Handle special instructions for a non-tail call site. + if (!MIB.isCall(Inst)) { + // Returns are removed. + break; } - // TODO; Currently we simply ignore CFI instructions but we need to - // address them for correctness. 
- if (!BC.MIB->isEHLabel(Instruction) && - !BC.MIB->isCFI(Instruction)) { - InlinedInstanceBB->addInstruction(std::move(Instruction)); + + MIB.convertTailCallToCall(Inst); + + // Propagate EH-related info to call instructions. + if (CSIsInvoke) { + MIB.addEHInfo(Inst, *CSEHInfo); + if (CSGNUArgsSize >= 0) + MIB.addGnuArgsSize(Inst, CSGNUArgsSize); } + + InsertII = std::next(InlinedBB->insertInstruction(InsertII, + std::move(Inst))); } // Add CFG edges to the basic blocks of the inlined instance. - std::vector - Successors(InlinedFunctionBB->succ_size(), nullptr); - + std::vector Successors(BB.succ_size()); std::transform( - InlinedFunctionBB->succ_begin(), - InlinedFunctionBB->succ_end(), + BB.succ_begin(), + BB.succ_end(), Successors.begin(), [&InlinedBBMap](const BinaryBasicBlock *BB) { return InlinedBBMap.at(BB); }); - if (InlinedFunction.hasValidProfile()) { - InlinedInstanceBB->addSuccessors( + if (CallerFunction.hasValidProfile() && Callee.hasValidProfile()) { + InlinedBB->addSuccessors( Successors.begin(), Successors.end(), - InlinedFunctionBB->branch_info_begin(), - InlinedFunctionBB->branch_info_end()); + BB.branch_info_begin(), + BB.branch_info_end()); } else { - InlinedInstanceBB->addSuccessors( + InlinedBB->addSuccessors( Successors.begin(), Successors.end()); } - if (IsExitingBlock) { - assert(Successors.size() == 0); - if (ShouldSplitCallerBB) { - if (InlinedFunction.hasValidProfile()) { - InlinedInstanceBB->addSuccessor( - InlinedInstance.back().get(), - InlinedInstanceBB->getExecutionCount()); - } else { - InlinedInstanceBB->addSuccessor(InlinedInstance.back().get()); - } - InlinedInstanceBB->addBranchInstruction(InlinedInstance.back().get()); - } else if (!First || !CanMergeFirstInlinedBlock) { - assert(CallInstIndex == CallerBB->size() - 1); - assert(CallerBB->succ_size() <= 1); - if (CallerBB->succ_size() == 1) { - if (InlinedFunction.hasValidProfile()) { - InlinedInstanceBB->addSuccessor( - *CallerBB->succ_begin(), - 
InlinedInstanceBB->getExecutionCount()); - } else { - InlinedInstanceBB->addSuccessor(*CallerBB->succ_begin()); - } - InlinedInstanceBB->addBranchInstruction(*CallerBB->succ_begin()); - } - } - } - - First = false; - } - - if (ShouldSplitCallerBB) { - // Split the basic block that contains the call and add the removed - // instructions in the last block of the inlined instance. - // (Is it OK to have a basic block with just CFI instructions?) - std::vector TrailInstructions = - CallerBB->splitInstructions(&CallInst); - assert(TrailInstructions.size() > 0); - InlinedInstance.back()->addInstructions( - TrailInstructions.begin(), - TrailInstructions.end()); - // Add CFG edges for the block with the removed instructions. - if (CallerFunction.hasValidProfile()) { - InlinedInstance.back()->addSuccessors( - CallerBB->succ_begin(), - CallerBB->succ_end(), - CallerBB->branch_info_begin(), - CallerBB->branch_info_end()); - } else { - InlinedInstance.back()->addSuccessors( - CallerBB->succ_begin(), - CallerBB->succ_end()); + if (!CSIsTailCall && BB.succ_size() == 0 && NextBB) { + // Either it's a return block or the last instruction never returns. + InlinedBB->addSuccessor(NextBB, InlinedBB->getExecutionCount()); } - // Update the after-inlined point. - AfterInlinedBB = InlinedInstance.back().get(); - AfterInlinedIstrIndex = 0; - } - assert(InlinedInstance.size() > 0 && "found function with no basic blocks"); - assert(InlinedInstance.front()->size() > 0 && - "found function with empty basic block"); - - // If the inlining cannot happen as a simple instruction insertion into - // CallerBB, we remove the outgoing CFG edges of the caller block. - if (InlinedInstance.size() > 1 || !CanMergeFirstInlinedBlock) { - CallerBB->removeAllSuccessors(); - if (!ShouldSplitCallerBB) { - // Update the after-inlined point. 
- AfterInlinedBB = CallerFunction.getBasicBlockAfter(CallerBB); - AfterInlinedIstrIndex = 0; - } - } else { - assert(!ShouldSplitCallerBB); - // Update the after-inlined point. - if (CallInstIndex < CallerBB->size() - 1) { - AfterInlinedBB = CallerBB; - AfterInlinedIstrIndex = - CallInstIndex + InlinedInstance.front()->size(); - } else { - AfterInlinedBB = CallerFunction.getBasicBlockAfter(CallerBB); - AfterInlinedIstrIndex = 0; - } - } - - // Do the inlining by merging the first block of the inlined instance into - // the caller basic block if possible and adding the rest of the inlined - // instance basic blocks in the caller function. - if (CanMergeFirstInlinedBlock) { - CallerBB->replaceInstruction( - &CallInst, - InlinedInstance.front()->begin(), - InlinedInstance.front()->end()); - if (InlinedInstance.size() > 1) { - auto FirstBB = InlinedInstance.begin()->get(); - if (InlinedFunction.hasValidProfile()) { - CallerBB->addSuccessors( - FirstBB->succ_begin(), - FirstBB->succ_end(), - FirstBB->branch_info_begin(), - FirstBB->branch_info_end()); + // Scale profiling info for blocks and edges after inlining. 
+ if (CallerFunction.hasValidProfile() && Callee.size() > 1) { + if (opts::AdjustProfile) { + InlinedBB->adjustExecutionCount(ProfileRatio); } else { - CallerBB->addSuccessors( - FirstBB->succ_begin(), - FirstBB->succ_end()); + InlinedBB->setExecutionCount( + InlinedBB->getKnownExecutionCount() * ProfileRatio); } - FirstBB->removeAllSuccessors(); - } - InlinedInstance.erase(InlinedInstance.begin()); - } else { - CallerBB->eraseInstruction(&CallInst); - if (CallerFunction.hasValidProfile()) { - CallerBB->addSuccessor(InlinedInstance.front().get(), - CallerBB->getExecutionCount()); - } else { - CallerBB->addSuccessor(InlinedInstance.front().get(), - CallerBB->getExecutionCount()); } } - CallerFunction.insertBasicBlocks(CallerBB, std::move(InlinedInstance)); - return std::make_pair(AfterInlinedBB, AfterInlinedIstrIndex); + // Restore the original execution count of the first inlined basic block. + FirstInlinedBB->setExecutionCount(FirstInlinedBBCount); + + CallerFunction.recomputeLandingPads(); + + if (NextBB) + return std::make_pair(NextBB, NextBB->begin()); + + if (Callee.size() == 1) + return std::make_pair(FirstInlinedBB, InsertII); + + return std::make_pair(FirstInlinedBB, FirstInlinedBB->end()); } -bool InlineSmallFunctions::inlineCallsInFunction( - BinaryContext &BC, - BinaryFunction &Function) { +bool Inliner::inlineCallsInFunction(BinaryFunction &Function) { + auto &BC = Function.getBinaryContext(); std::vector Blocks(Function.layout().begin(), Function.layout().end()); std::sort(Blocks.begin(), Blocks.end(), [](const BinaryBasicBlock *BB1, const BinaryBasicBlock *BB2) { - return BB1->getExecutionCount() > BB2->getExecutionCount(); + return BB1->getKnownExecutionCount() > BB2->getKnownExecutionCount(); }); - uint32_t ExtraSize = 0; - for (auto BB : Blocks) { - for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ++InstIt) { + bool DidInlining = false; + for (auto *BB : Blocks) { + for (auto InstIt = BB->begin(); InstIt != BB->end(); ) { auto &Inst 
= *InstIt; - if (BC.MIB->isCall(Inst)) { - TotalDynamicCalls += BB->getExecutionCount(); + if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || + !Inst.getOperand(0).isExpr()) { + ++InstIt; + continue; } - } - } - bool DidInlining = false; - - for (auto BB : Blocks) { - if (BB->isCold()) - continue; + const auto *TargetSymbol = BC.MIB->getTargetSymbol(Inst); + assert(TargetSymbol && "target symbol expected for direct call"); + auto *TargetFunction = BC.getFunctionForSymbol(TargetSymbol); + if (!TargetFunction) { + ++InstIt; + continue; + } - for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ) { - auto &Inst = *InstIt; - if (BC.MIB->isCall(Inst) && - !BC.MIB->isTailCall(Inst) && - MCPlus::getNumPrimeOperands(Inst) == 1 && - Inst.getOperand(0).isExpr()) { - const auto *TargetSymbol = BC.MIB->getTargetSymbol(Inst); - assert(TargetSymbol && "target symbol expected for direct call"); - const auto *TargetFunction = BC.getFunctionForSymbol(TargetSymbol); - if (TargetFunction) { - bool CallToInlineableFunction = - InliningCandidates.count(TargetFunction); - - TotalInlineableCalls += - CallToInlineableFunction * BB->getExecutionCount(); - - if (CallToInlineableFunction && - TargetFunction->getSize() + ExtraSize - + Function.estimateHotSize() < Function.getMaxSize()) { - auto NextInstIt = std::next(InstIt); - inlineCall(BC, *BB, &Inst, *TargetFunction->begin()); - DidInlining = true; - DEBUG(dbgs() << "BOLT-DEBUG: Inlining call to " - << *TargetFunction << " in " - << Function << "\n"); - InstIt = NextInstIt; - ExtraSize += TargetFunction->getSize(); - InlinedDynamicCalls += BB->getExecutionCount(); - continue; - } - } + // Don't do recursive inlining. 
+ if (TargetFunction == &Function) { + ++InstIt; + continue; } - ++InstIt; - } - } + auto IInfo = InliningCandidates.find(TargetFunction); + if (IInfo == InliningCandidates.end()) { + ++InstIt; + continue; + } - return DidInlining; -} + const auto IsTailCall = BC.MIB->isTailCall(Inst); + if (!IsTailCall && IInfo->second.Type == INL_TAILCALL) { + ++InstIt; + continue; + } -bool InlineSmallFunctions::inlineCallsInFunctionAggressive( - BinaryContext &BC, - BinaryFunction &Function) { - std::vector Blocks(Function.layout().begin(), - Function.layout().end()); - std::sort(Blocks.begin(), Blocks.end(), - [](const BinaryBasicBlock *BB1, const BinaryBasicBlock *BB2) { - return BB1->getExecutionCount() > BB2->getExecutionCount(); - }); - uint32_t ExtraSize = 0; + int64_t SizeAfterInlining; + if (IsTailCall) { + SizeAfterInlining = IInfo->second.SizeAfterTailCallInlining - + getSizeOfTailCallInst(BC); + } else { + SizeAfterInlining = IInfo->second.SizeAfterInlining - + getSizeOfCallInst(BC); + } - for (auto BB : Blocks) { - for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ++InstIt) { - auto &Inst = *InstIt; - if (BC.MIB->isCall(Inst)) { - TotalDynamicCalls += BB->getExecutionCount(); + if (!opts::InlineAll && !opts::mustConsider(*TargetFunction)) { + if (!opts::InlineSmallFunctions || + SizeAfterInlining > opts::InlineSmallFunctionsBytes) { + ++InstIt; + continue; + } } - } - } - bool DidInlining = false; + DEBUG(dbgs() << "BOLT-DEBUG: inlining call to " << *TargetFunction + << " in " << Function << " : " << BB->getName() + << ". Count: " << BB->getKnownExecutionCount() + << ". 
Size change: " << SizeAfterInlining << " bytes.\n"); - for (auto BB : Blocks) { - if (BB->isCold()) - continue; + std::tie(BB, InstIt) = inlineCall(*BB, InstIt, *TargetFunction); - unsigned InstIndex = 0; - for (auto InstIt = BB->begin(); InstIt != BB->end(); ) { - auto &Inst = *InstIt; - if (BC.MIB->isCall(Inst) && - MCPlus::getNumPrimeOperands(Inst) == 1 && - Inst.getOperand(0).isExpr()) { - assert(!BC.MIB->isInvoke(Inst)); - const auto *TargetSymbol = BC.MIB->getTargetSymbol(Inst); - assert(TargetSymbol && "target symbol expected for direct call"); - const auto *TargetFunction = BC.getFunctionForSymbol(TargetSymbol); - if (TargetFunction) { - bool CallToInlineableFunction = - InliningCandidates.count(TargetFunction); - - TotalInlineableCalls += - CallToInlineableFunction * BB->getExecutionCount(); - - if (CallToInlineableFunction && - TargetFunction->getSize() + ExtraSize - + Function.estimateHotSize() < Function.getMaxSize()) { - unsigned NextInstIndex = 0; - BinaryBasicBlock *NextBB = nullptr; - std::tie(NextBB, NextInstIndex) = - inlineCall(BC, Function, BB, InstIndex, *TargetFunction); - DidInlining = true; - DEBUG(dbgs() << "BOLT-DEBUG: Inlining call to " - << *TargetFunction << " in " - << Function << "\n"); - InstIndex = NextBB == BB ? NextInstIndex : BB->size(); - InstIt = NextBB == BB ? BB->begin() + NextInstIndex : BB->end(); - ExtraSize += TargetFunction->getSize(); - InlinedDynamicCalls += BB->getExecutionCount(); - continue; - } + DidInlining = true; + TotalInlinedBytes += SizeAfterInlining; + + ++NumInlinedCallSites; + NumInlinedDynamicCalls += BB->getExecutionCount(); + + // Subtract basic block execution count from the callee execution count. + if (opts::AdjustProfile) { + TargetFunction->adjustExecutionCount(BB->getKnownExecutionCount()); + } + + // Check if the caller inlining status has to be adjusted. 
+ if (IInfo->second.Type == INL_TAILCALL) { + auto CallerIInfo = InliningCandidates.find(&Function); + if (CallerIInfo != InliningCandidates.end() && + CallerIInfo->second.Type == INL_ANY) { + DEBUG(dbgs() << "adjusting inlining status for function " << Function + << '\n'); + CallerIInfo->second.Type = INL_TAILCALL; } } - ++InstIndex; - ++InstIt; + if (NumInlinedCallSites == opts::InlineLimit) { + return true; + } } } return DidInlining; } -bool InlineSmallFunctions::mustConsider(const BinaryFunction &BF) { - for (auto &Name : opts::ForceInlineFunctions) { - if (BF.hasName(Name)) - return true; - } - return false; -} +void Inliner::runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &) { + opts::syncOptions(); + + if (!opts::inliningEnabled()) + return; + + uint64_t TotalSize = 0; + for (auto &BFI : BFs) + TotalSize += BFI.second.getSize(); + + bool InlinedOnce; + unsigned NumIters = 0; + do { + if (opts::InlineLimit && NumInlinedCallSites >= opts::InlineLimit) + break; -void InlineSmallFunctions::runOnFunctions( - BinaryContext &BC, - std::map &BFs, - std::set &) { + InlinedOnce = false; - if (opts::AggressiveInlining) - findInliningCandidatesAggressive(BC, BFs); - else + InliningCandidates.clear(); findInliningCandidates(BC, BFs); - std::vector ConsideredFunctions; - for (auto &It : BFs) { - auto &Function = It.second; - if (!shouldOptimize(Function) || - (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE && - !mustConsider(Function))) - continue; - ConsideredFunctions.push_back(&Function); - } - std::sort(ConsideredFunctions.begin(), ConsideredFunctions.end(), - [](BinaryFunction *A, BinaryFunction *B) { - return B->getExecutionCount() < A->getExecutionCount(); - }); - unsigned ModifiedFunctions = 0; - for (unsigned i = 0; i < ConsideredFunctions.size() && - ModifiedFunctions <= kMaxFunctions; ++i) { - auto &Function = *ConsideredFunctions[i]; - - const bool DidInline = opts::AggressiveInlining - ? 
inlineCallsInFunctionAggressive(BC, Function) - : inlineCallsInFunction(BC, Function); - - if (DidInline) { - Modified.insert(&Function); - ++ModifiedFunctions; + std::vector ConsideredFunctions; + for (auto &BFI : BFs) { + auto &Function = BFI.second; + if (!shouldOptimize(Function)) + continue; + ConsideredFunctions.push_back(&Function); } - } + std::sort(ConsideredFunctions.begin(), ConsideredFunctions.end(), + [](const BinaryFunction *A, const BinaryFunction *B) { + return B->getKnownExecutionCount() < A->getKnownExecutionCount(); + }); + for (auto *Function : ConsideredFunctions) { + if (opts::InlineLimit && NumInlinedCallSites >= opts::InlineLimit) + break; - DEBUG(dbgs() << "BOLT-INFO: Inlined " << InlinedDynamicCalls << " of " - << TotalDynamicCalls << " function calls in the profile.\n" - << "BOLT-INFO: Inlined calls represent " - << format("%.1f", - 100.0 * InlinedDynamicCalls / TotalInlineableCalls) - << "% of all inlineable calls in the profile.\n"); -} + const auto DidInline = inlineCallsInFunction(*Function); + + if (DidInline) + Modified.insert(Function); + InlinedOnce |= DidInline; + } + + ++NumIters; + } while (InlinedOnce && NumIters < opts::InlineMaxIters); + + if (NumInlinedCallSites) { + outs() << "BOLT-INFO: inlined " << NumInlinedDynamicCalls << " calls at " + << NumInlinedCallSites << " call sites in " << NumIters + << " iteration(s). Change in binary size: " << TotalInlinedBytes + << " bytes.\n"; + } +} } // namespace bolt } // namespace llvm diff --git a/bolt/src/Passes/Inliner.h b/bolt/src/Passes/Inliner.h index 4a548a5dba84..a890a9e27e26 100644 --- a/bolt/src/Passes/Inliner.h +++ b/bolt/src/Passes/Inliner.h @@ -21,76 +21,82 @@ namespace llvm { namespace bolt { -/// Inlining of single basic block functions. -/// The pass currently does not handle CFI instructions. This is needed for -/// correctness and we may break exception handling because of this. 
-class InlineSmallFunctions : public BinaryFunctionPass { +class Inliner : public BinaryFunctionPass { private: - std::set InliningCandidates; - - /// Maximum number of instructions in an inlined function. - static const unsigned kMaxInstructions = 8; - /// Maximum code size (in bytes) of inlined function (used by aggressive - /// inlining). - static const uint64_t kMaxSize = 60; - /// Maximum number of functions that will be considered for inlining (in - /// descending hottness order). - static const unsigned kMaxFunctions = 30000; - - /// Statistics collected for debugging. - uint64_t TotalDynamicCalls = 0; - uint64_t InlinedDynamicCalls = 0; - uint64_t TotalInlineableCalls = 0; + + enum InliningType : char { + INL_NONE = 0, /// Cannot inline + INL_TAILCALL, /// Can inline at tail call site + INL_ANY /// Can inline at any call site + }; + + struct InliningInfo { + InliningType Type{INL_NONE}; + uint64_t SizeAfterInlining{0}; + uint64_t SizeAfterTailCallInlining{0}; + + InliningInfo(InliningType Type = INL_NONE) + : Type(Type) + {} + }; + + std::unordered_map InliningCandidates; + + /// Count total amount of bytes inlined for all instances of Inliner. + /// Note that this number could be negative indicating that the inliner + /// reduced the size. + int64_t TotalInlinedBytes{0}; + + /// Dynamic count of calls eliminated. + uint64_t NumInlinedDynamicCalls{0}; + + /// Number of call sites that were inlined. + uint64_t NumInlinedCallSites{0}; + + /// Size in bytes of a regular call instruction. + static uint64_t SizeOfCallInst; + + /// Size in bytes of a tail call instruction. + static uint64_t SizeOfTailCallInst; + + /// Set of functions modified by inlining (used for printing). std::unordered_set Modified; - static bool mustConsider(const BinaryFunction &BF); + /// Return the size in bytes of a regular call instruction. + uint64_t getSizeOfCallInst(const BinaryContext &BC); + + /// Return the size in bytes of a tail call instruction. 
+ uint64_t getSizeOfTailCallInst(const BinaryContext &BC); void findInliningCandidates(BinaryContext &BC, const std::map &BFs); - /// Inline the call in CallInst to InlinedFunctionBB (the only BB of the - /// called function). - void inlineCall(BinaryContext &BC, - BinaryBasicBlock &BB, - MCInst *CallInst, - const BinaryBasicBlock &InlinedFunctionBB); - - bool inlineCallsInFunction(BinaryContext &BC, - BinaryFunction &Function); - - /// The following methods do a more aggressive inlining pass, where we - /// inline calls as well as tail calls and we are not limited to inlining - /// functions with only one basic block. - /// FIXME: Currently these are broken since they do not work with the split - /// function option. - void findInliningCandidatesAggressive( - BinaryContext &BC, const std::map &BFs); - - bool inlineCallsInFunctionAggressive( - BinaryContext &BC, BinaryFunction &Function); - - /// Inline the call in CallInst to InlinedFunction. Inlined function should not - /// contain any landing pad or thrower edges but can have more than one blocks. + bool inlineCallsInFunction(BinaryFunction &Function); + + /// Inline a function call \p CallInst to function \p Callee. /// - /// Return the location (basic block and instruction index) where the code of - /// the caller function continues after the the inlined code. - std::pair - inlineCall(BinaryContext &BC, - BinaryFunction &CallerFunction, - BinaryBasicBlock *CallerBB, - const unsigned CallInstIdex, - const BinaryFunction &InlinedFunction); + /// Return the location (basic block and instruction iterator) where the code + /// of the caller function continues after the inlined code. + std::pair + inlineCall(BinaryBasicBlock &CallerBB, + BinaryBasicBlock::iterator CallInst, + const BinaryFunction &Callee); + + /// Check if the inliner can handle inlining of \p BF. 
+ InliningInfo getInliningInfo(const BinaryFunction &BF) const; public: - explicit InlineSmallFunctions(const cl::opt &PrintPass) + explicit Inliner(const cl::opt &PrintPass) : BinaryFunctionPass(PrintPass) { } const char *getName() const override { return "inlining"; } + bool shouldPrint(const BinaryFunction &BF) const override { return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0; } + void runOnFunctions(BinaryContext &BC, std::map &BFs, std::set &LargeFunctions) override; diff --git a/bolt/src/Passes/JTFootprintReduction.cpp b/bolt/src/Passes/JTFootprintReduction.cpp index ac7328406159..52d32f55daea 100644 --- a/bolt/src/Passes/JTFootprintReduction.cpp +++ b/bolt/src/Passes/JTFootprintReduction.cpp @@ -124,7 +124,8 @@ void JTFootprintReduction::checkOpportunities(BinaryContext &BC, } bool JTFootprintReduction::tryOptimizeNonPIC( - BinaryContext &BC, BinaryBasicBlock &BB, MCInst &Inst, uint64_t JTAddr, + BinaryContext &BC, BinaryBasicBlock &BB, + BinaryBasicBlock::iterator Inst, uint64_t JTAddr, JumpTable *JumpTable, DataflowInfoManager &Info) { if (opts::JTFootprintOnlyPIC) return false; @@ -137,7 +138,7 @@ bool JTFootprintReduction::tryOptimizeNonPIC( BC.MIB->matchAnyOperand(Base), BC.MIB->matchImm(Scale), BC.MIB->matchReg(Index), BC.MIB->matchAnyOperand(Offset)); if (!IndJmpMatcher->match(*BC.MRI, *BC.MIB, - MutableArrayRef(&*BB.begin(), &Inst + 1), + MutableArrayRef(&*BB.begin(), &*Inst + 1), -1)) { return false; } @@ -148,7 +149,7 @@ bool JTFootprintReduction::tryOptimizeNonPIC( IndJmpMatcher->annotate(*BC.MIB, "DeleteMe"); auto &LA = Info.getLivenessAnalysis(); - MCPhysReg Reg = LA.scavengeRegAfter(&Inst); + MCPhysReg Reg = LA.scavengeRegAfter(&*Inst); assert(Reg != 0 && "Register scavenger failed!"); auto RegOp = MCOperand::createReg(Reg); SmallVector NewFrag; @@ -159,12 +160,13 @@ bool JTFootprintReduction::tryOptimizeNonPIC( JumpTable->OutputEntrySize = 4; - BB.replaceInstruction(&Inst, NewFrag.begin(), NewFrag.end()); + 
BB.replaceInstruction(Inst, NewFrag.begin(), NewFrag.end()); return true; } bool JTFootprintReduction::tryOptimizePIC( - BinaryContext &BC, BinaryBasicBlock &BB, MCInst &Inst, uint64_t JTAddr, + BinaryContext &BC, BinaryBasicBlock &BB, + BinaryBasicBlock::iterator Inst, uint64_t JTAddr, JumpTable *JumpTable, DataflowInfoManager &Info) { MCPhysReg BaseReg; uint64_t Scale; @@ -176,7 +178,7 @@ bool JTFootprintReduction::tryOptimizePIC( BC.MIB->matchLoad(BC.MIB->matchReg(BaseReg), BC.MIB->matchImm(Scale), BC.MIB->matchReg(Index), BC.MIB->matchAnyOperand()))); if (!PICIndJmpMatcher->match(*BC.MRI, *BC.MIB, - MutableArrayRef(&*BB.begin(), &Inst + 1), + MutableArrayRef(&*BB.begin(), &*Inst + 1), -1)) { return false; } @@ -197,7 +199,7 @@ bool JTFootprintReduction::tryOptimizePIC( // DePICify JumpTable->Type = JumpTable::JTT_NORMAL; - BB.replaceInstruction(&Inst, NewFrag.begin(), NewFrag.end()); + BB.replaceInstruction(Inst, NewFrag.begin(), NewFrag.end()); return true; } @@ -208,13 +210,14 @@ void JTFootprintReduction::optimizeFunction(BinaryContext &BC, if (!BB.getNumNonPseudos()) continue; - MCInst &IndJmp = *BB.getLastNonPseudo(); - uint64_t JTAddr = BC.MIB->getJumpTable(IndJmp); + auto IndJmpRI = BB.getLastNonPseudo(); + auto IndJmp = std::prev(IndJmpRI.base()); + const auto JTAddr = BC.MIB->getJumpTable(*IndJmp); if (!JTAddr) continue; - auto *JumpTable = Function.getJumpTable(IndJmp); + auto *JumpTable = Function.getJumpTable(*IndJmp); if (BlacklistedJTs.count(JumpTable)) continue; @@ -231,9 +234,11 @@ void JTFootprintReduction::optimizeFunction(BinaryContext &BC, return; for (auto &BB : Function) { - for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { + for (auto I = BB.begin(); I != BB.end(); ) { if (BC.MIB->hasAnnotation(*I, "DeleteMe")) - BB.eraseInstruction(&*I); + I = BB.eraseInstruction(I); + else + ++I; } } } diff --git a/bolt/src/Passes/JTFootprintReduction.h b/bolt/src/Passes/JTFootprintReduction.h index 20b1da2eb4ae..61726619bf23 100644 --- 
a/bolt/src/Passes/JTFootprintReduction.h +++ b/bolt/src/Passes/JTFootprintReduction.h @@ -48,13 +48,15 @@ class JTFootprintReduction : public BinaryFunctionPass { /// sequence from a single jmp * instruction to a pair of load32zext-jmp /// instructions that depend on the availability of an extra register. /// This saves dcache/dTLB at the expense of icache. - bool tryOptimizeNonPIC(BinaryContext &BC, BinaryBasicBlock &BB, MCInst &Inst, + bool tryOptimizeNonPIC(BinaryContext &BC, BinaryBasicBlock &BB, + BinaryBasicBlock::iterator Inst, uint64_t JTAddr, JumpTable *JumpTable, DataflowInfoManager &Info); /// The PIC jump table optimization consists of "de-pic-ifying" it, since the /// PIC jump sequence is larger than its non-PIC counterpart, saving icache. - bool tryOptimizePIC(BinaryContext &BC, BinaryBasicBlock &BB, MCInst &Inst, + bool tryOptimizePIC(BinaryContext &BC, BinaryBasicBlock &BB, + BinaryBasicBlock::iterator Inst, uint64_t JTAddr, JumpTable *JumpTable, DataflowInfoManager &Info); diff --git a/bolt/src/Passes/LongJmp.cpp b/bolt/src/Passes/LongJmp.cpp index e8b6719d1bcc..f4810be5fb90 100644 --- a/bolt/src/Passes/LongJmp.cpp +++ b/bolt/src/Passes/LongJmp.cpp @@ -589,7 +589,7 @@ bool LongJmpPass::relax(BinaryFunction &Func) { continue; std::vector> NewBBs; NewBBs.emplace_back(std::move(Elmt.second)); - Func.insertBasicBlocks(Elmt.first, std::move(NewBBs), true, true); + Func.insertBasicBlocks(Elmt.first, std::move(NewBBs), true); } return Modified; diff --git a/bolt/src/Passes/ShrinkWrapping.cpp b/bolt/src/Passes/ShrinkWrapping.cpp index c89ad2f46a57..4de7d9fec124 100644 --- a/bolt/src/Passes/ShrinkWrapping.cpp +++ b/bolt/src/Passes/ShrinkWrapping.cpp @@ -1592,10 +1592,14 @@ void ShrinkWrapping::rebuildCFIForSP() { PrevBB = BB; } - for (auto &BB : BF) - for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) + for (auto &BB : BF) { + for (auto I = BB.begin(); I != BB.end(); ) { if (BC.MIB->hasAnnotation(*I, "DeleteMe")) - BB.eraseInstruction(&*I); + I = 
BB.eraseInstruction(I); + else + ++I; + } + } } MCInst ShrinkWrapping::createStackAccess(int SPVal, int FPVal, diff --git a/bolt/src/Target/X86/X86MCPlusBuilder.cpp b/bolt/src/Target/X86/X86MCPlusBuilder.cpp index 5497db157f8d..b758146999a1 100644 --- a/bolt/src/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/src/Target/X86/X86MCPlusBuilder.cpp @@ -34,7 +34,6 @@ #include "MCTargetDesc/X86MCTargetDesc.h" #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86MCAsmInfo.h" -#include #define DEBUG_TYPE "bolt-x86" @@ -2677,7 +2676,7 @@ class X86MCPlusBuilder : public MCPlusBuilder { bool createLoad(MCInst &Inst, const MCPhysReg &BaseReg, int64_t Scale, const MCPhysReg &IndexReg, int64_t Offset, const MCExpr *OffsetExpr, const MCPhysReg &AddrSegmentReg, - const MCPhysReg &DstReg, int Size) const { + const MCPhysReg &DstReg, int Size) const override { unsigned NewOpcode; switch (Size) { default: @@ -2797,6 +2796,14 @@ class X86MCPlusBuilder : public MCPlusBuilder { return true; } + bool createCall(MCInst &Inst, const MCSymbol *Target, + MCContext *Ctx) override { + Inst.setOpcode(X86::CALL64pcrel32); + Inst.addOperand(MCOperand::createExpr( + MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx))); + return true; + } + bool createTailCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx) override { Inst.setOpcode(X86::TAILJMPd); From e8eeec9dab0166cea25fb95aeb15e109c174eb0e Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 5 Feb 2019 15:28:19 -0800 Subject: [PATCH 502/904] [BOLT-HEATMAP] Initial heat map implementation Summary: Add heatmap subcommand to produce heatmaps based on perf.data with LBR. The output is produced in colored ASCII format. 
llvm-bolt heatmap -p perf.data -block-size= - size of a heat map block in bytes (default 64) -line-size= - number of entries per line (default 256) -max-address= - maximum address considered valid for heatmap (default 4GB) -o= - heatmap output file (default stdout) (cherry picked from commit 7f2e8b6fa53f3b582d74443fadbb34ec1132e9bc) --- bolt/src/CMakeLists.txt | 2 + bolt/src/DataAggregator.cpp | 110 +++++++++++++++- bolt/src/DataAggregator.h | 3 + bolt/src/Exceptions.cpp | 28 ++++ bolt/src/Exceptions.h | 30 +---- bolt/src/Heatmap.cpp | 241 +++++++++++++++++++++++++++++++++++ bolt/src/Heatmap.h | 71 +++++++++++ bolt/src/RewriteInstance.cpp | 15 +-- bolt/src/llvm-bolt.cpp | 47 ++++++- 9 files changed, 501 insertions(+), 46 deletions(-) create mode 100644 bolt/src/Heatmap.cpp create mode 100644 bolt/src/Heatmap.h diff --git a/bolt/src/CMakeLists.txt b/bolt/src/CMakeLists.txt index 3804b16f2e8f..e10db2e99519 100644 --- a/bolt/src/CMakeLists.txt +++ b/bolt/src/CMakeLists.txt @@ -77,6 +77,7 @@ add_llvm_tool(llvm-bolt DebugData.cpp DWARFRewriter.cpp Exceptions.cpp + Heatmap.cpp JumpTable.cpp MCPlusBuilder.cpp ProfileReader.cpp @@ -90,3 +91,4 @@ add_llvm_tool(llvm-bolt add_llvm_tool_symlink(perf2bolt llvm-bolt) add_llvm_tool_symlink(llvm-boltdiff llvm-bolt) +add_llvm_tool_symlink(llvm-bolt-heatmap llvm-bolt) diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index 29b21609819a..70091d2fba70 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -15,6 +15,7 @@ #include "BinaryContext.h" #include "BinaryFunction.h" #include "DataAggregator.h" +#include "Heatmap.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Options.h" @@ -37,6 +38,8 @@ using namespace bolt; namespace opts { extern cl::OptionCategory AggregatorCategory; +extern bool HeatmapMode; +extern cl::SubCommand HeatmapCommand; static cl::opt BasicAggregation("nl", @@ -46,12 +49,31 @@ BasicAggregation("nl", 
cl::cat(AggregatorCategory)); static cl::opt -WriteAutoFDOData("autofdo", - cl::desc("generate autofdo textual data instead of bolt data"), +IgnoreBuildID("ignore-build-id", + cl::desc("continue even if build-ids in input binary and perf.data mismatch"), cl::init(false), - cl::ZeroOrMore, cl::cat(AggregatorCategory)); +static cl::opt +HeatmapBlock("block-size", + cl::desc("size of a heat map block in bytes (default 64)"), + cl::init(64), + cl::sub(HeatmapCommand)); + +static cl::opt +HeatmapFile("o", + cl::init("-"), + cl::desc("heatmap output file (default stdout)"), + cl::Optional, + cl::sub(HeatmapCommand)); + +static cl::opt +HeatmapMaxAddress("max-address", + cl::init(0xffffffff), + cl::desc("maximum address considered valid for heatmap (default 4GB)"), + cl::Optional, + cl::sub(HeatmapCommand)); + static cl::opt ReadPreAggregated("pa", cl::desc("skip perf and read data from a pre-aggregated file format"), @@ -60,14 +82,15 @@ ReadPreAggregated("pa", cl::cat(AggregatorCategory)); static cl::opt -IgnoreBuildID("ignore-build-id", - cl::desc("continue even if build-ids in input binary and perf.data mismatch"), +TimeAggregator("time-aggr", + cl::desc("time BOLT aggregator"), cl::init(false), + cl::ZeroOrMore, cl::cat(AggregatorCategory)); static cl::opt -TimeAggregator("time-aggr", - cl::desc("time BOLT aggregator"), +WriteAutoFDOData("autofdo", + cl::desc("generate autofdo textual data instead of bolt data"), cl::init(false), cl::ZeroOrMore, cl::cat(AggregatorCategory)); @@ -459,6 +482,15 @@ void DataAggregator::parseProfile( } prepareToParse("events", MainEventsPPI); + + if (opts::HeatmapMode) { + if (auto EC = printLBRHeatMap()) { + errs() << "ERROR: failed to print heat map: " << EC.message() << '\n'; + exit(1); + } + exit(0); + } + if ((!opts::BasicAggregation && parseBranchEvents()) || (opts::BasicAggregation && parseBasicEvents())) { errs() << "PERF2BOLT: failed to parse samples\n"; @@ -967,6 +999,70 @@ bool DataAggregator::hasData() { return true; } 
+std::error_code DataAggregator::printLBRHeatMap() { + outs() << "PERF2BOLT: parse branch events...\n"; + NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName, + TimerGroupDesc, opts::TimeAggregator); + + Heatmap HM(opts::HeatmapBlock, opts::HeatmapMaxAddress); + uint64_t NumTotalSamples{0}; + + while (hasData()) { + auto SampleRes = parseBranchSample(); + if (std::error_code EC = SampleRes.getError()) + return EC; + + auto &Sample = SampleRes.get(); + + // LBRs are stored in reverse execution order. NextLBR refers to the next + // executed branch record. + const LBREntry *NextLBR{nullptr}; + for (const auto &LBR : Sample.LBR) { + if (NextLBR) { + // Record fall-through trace. + const auto TraceFrom = LBR.To; + const auto TraceTo = NextLBR->From; + ++FallthroughLBRs[Trace(TraceFrom, TraceTo)].InternCount; + } + NextLBR = &LBR; + } + if (!Sample.LBR.empty()) { + HM.registerAddress(Sample.LBR.front().To); + HM.registerAddress(Sample.LBR.back().From); + } + NumTotalSamples += Sample.LBR.size(); + } + + if (!NumTotalSamples) { + errs() << "HEATMAP-ERROR: no LBR traces detected in profile. 
" + "Cannot build heatmap.\n"; + exit(1); + } + + outs() << "HEATMAP: read " << NumTotalSamples << " LBR samples\n"; + outs() << "HEATMAP: " << FallthroughLBRs.size() << " unique traces\n"; + + outs() << "HEATMAP: building heat map...\n"; + + for (const auto &LBR : FallthroughLBRs) { + const auto &Trace = LBR.first; + const auto &Info = LBR.second; + HM.registerAddressRange(Trace.From, Trace.To, Info.InternCount); + } + + if (HM.getNumInvalidRanges()) + outs() << "HEATMAP: invalid traces: " << HM.getNumInvalidRanges() << '\n'; + + if (!HM.size()) { + errs() << "HEATMAP-ERROR: no valid traces registered\n"; + exit(1); + } + + HM.print(opts::HeatmapFile); + + return std::error_code(); +} + std::error_code DataAggregator::parseBranchEvents() { outs() << "PERF2BOLT: parse branch events...\n"; NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName, diff --git a/bolt/src/DataAggregator.h b/bolt/src/DataAggregator.h index 151dc68e77ba..26863ff14667 100644 --- a/bolt/src/DataAggregator.h +++ b/bolt/src/DataAggregator.h @@ -222,6 +222,9 @@ class DataAggregator : public DataReader { /// everything bool hasData(); + /// Print heat map based on LBR samples. + std::error_code printLBRHeatMap(); + /// Parse a single perf sample containing a PID associated with a sequence of /// LBR entries ErrorOr parseBranchSample(); diff --git a/bolt/src/Exceptions.cpp b/bolt/src/Exceptions.cpp index b2024164a38d..cfd039ed1537 100644 --- a/bolt/src/Exceptions.cpp +++ b/bolt/src/Exceptions.cpp @@ -661,6 +661,34 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer, bool EmitColdPart) { const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0; +CFIReaderWriter::CFIReaderWriter(const DWARFDebugFrame &EHFrame) { + // Prepare FDEs for fast lookup + for (const auto &Entry : EHFrame.entries()) { + const auto *CurFDE = dyn_cast(&Entry); + // Skip CIEs. 
+ if (!CurFDE) + continue; + // There could me multiple FDEs with the same initial address, and perhaps + // different sizes (address ranges). Use the first entry with non-zero size. + auto FDEI = FDEs.lower_bound(CurFDE->getInitialLocation()); + if (FDEI != FDEs.end() && FDEI->first == CurFDE->getInitialLocation()) { + if (CurFDE->getAddressRange()) { + if (FDEI->second->getAddressRange() == 0) { + FDEI->second = CurFDE; + } else if (opts::Verbosity > 0) { + errs() << "BOLT-WARNING: different FDEs for function at 0x" + << Twine::utohexstr(FDEI->first) + << " detected; sizes: " + << FDEI->second->getAddressRange() << " and " + << CurFDE->getAddressRange() << '\n'; + } + } + } else { + FDEs.emplace_hint(FDEI, CurFDE->getInitialLocation(), CurFDE); + } + } +} + bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { uint64_t Address = Function.getAddress(); auto I = FDEs.find(Address); diff --git a/bolt/src/Exceptions.h b/bolt/src/Exceptions.h index fe0862f8fe84..202c9611c3c8 100644 --- a/bolt/src/Exceptions.h +++ b/bolt/src/Exceptions.h @@ -28,35 +28,7 @@ class RewriteInstance; /// BinaryFunction, as well as rewriting CFI sections. class CFIReaderWriter { public: - explicit CFIReaderWriter(const DWARFDebugFrame &EHFrame) { - // Prepare FDEs for fast lookup - for (const auto &Entry : EHFrame.entries()) { - const auto *CurFDE = dyn_cast(&Entry); - // Skip CIEs. - if (!CurFDE) - continue; - // There could me multiple FDEs with the same initial address, but - // different size (address range). Make sure the sizes match if they - // are non-zero. Ignore zero-sized ones. 
- auto FDEI = FDEs.lower_bound(CurFDE->getInitialLocation()); - if (FDEI != FDEs.end() && - FDEI->first == CurFDE->getInitialLocation()) { - if (FDEI->second->getAddressRange() != 0 && - CurFDE->getAddressRange() != 0 && - CurFDE->getAddressRange() != FDEI->second->getAddressRange()) { - errs() << "BOLT-ERROR: input FDEs for function at 0x" - << Twine::utohexstr(FDEI->first) - << " have conflicting sizes: " - << FDEI->second->getAddressRange() << " and " - << CurFDE->getAddressRange() << '\n'; - } else if (FDEI->second->getAddressRange() == 0) { - FDEI->second = CurFDE; - } - continue; - } - FDEs.emplace_hint(FDEI, CurFDE->getInitialLocation(), CurFDE); - } - } + explicit CFIReaderWriter(const DWARFDebugFrame &EHFrame); bool fillCFIInfoFor(BinaryFunction &Function) const; diff --git a/bolt/src/Heatmap.cpp b/bolt/src/Heatmap.cpp new file mode 100644 index 000000000000..5761c90db2c6 --- /dev/null +++ b/bolt/src/Heatmap.cpp @@ -0,0 +1,241 @@ +//===-- Heatmap.cpp ---------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "Heatmap.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include + +#define DEBUG_TYPE "bolt-heatmap" + +using namespace llvm; + +namespace opts { + +extern cl::SubCommand HeatmapCommand; + +static cl::opt +BucketsPerLine("line-size", + cl::desc("number of entries per line (default 256)"), + cl::init(256), + cl::Optional, + cl::sub(HeatmapCommand)); + +} + +namespace llvm { +namespace bolt { + +void Heatmap::registerAddressRange(uint64_t StartAddress, uint64_t EndAddress, + uint64_t Count) { + if (ignoreAddress(StartAddress)) { + ++NumSkippedRanges; + return; + } + + if (StartAddress > EndAddress || + EndAddress - StartAddress > 64 * 1024) { + DEBUG(dbgs() << "invalid range : 0x" << Twine::utohexstr(StartAddress) + << " -> 0x" << Twine::utohexstr(EndAddress) << '\n'); + ++NumSkippedRanges; + return; + } + + for (uint64_t Bucket = StartAddress / BucketSize; + Bucket <= EndAddress / BucketSize; ++Bucket) { + Map[Bucket] += Count; + } +} + +void Heatmap::print(StringRef FileName) const { + std::error_code EC; + raw_fd_ostream OS(FileName, EC, sys::fs::OpenFlags::F_None); + if (EC) { + errs() << "error opening output file: " << EC.message() << '\n'; + exit(1); + } + print(OS); +} + +void Heatmap::print(raw_ostream &OS) const { + const char FillChar = '.'; + + const auto DefaultColor = raw_ostream::WHITE; + auto changeColor = [&](raw_ostream::Colors Color) -> void { + static auto CurrentColor = raw_ostream::BLACK; + if (CurrentColor == Color) + return; + OS.changeColor(Color); + CurrentColor = Color; + }; + + const uint64_t BytesPerLine = opts::BucketsPerLine * BucketSize; + + // 
Calculate the max value for scaling. + uint64_t MaxValue = 0; + for (auto &Entry : Map) { + MaxValue = std::max(MaxValue, Entry.second); + } + + // Print start of the line and fill it with an empty space right before + // the Address. + auto startLine = [&](uint64_t Address, bool Empty = false) { + changeColor(DefaultColor); + const auto LineAddress = Address / BytesPerLine * BytesPerLine; + + if (MaxAddress > 0xffffffff) + OS << format("0x%016" PRIx64 ": ", LineAddress); + else + OS << format("0x%08" PRIx64 ": ", LineAddress); + + if (Empty) + Address = LineAddress + BytesPerLine; + for (auto Fill = LineAddress; Fill < Address; Fill += BucketSize) { + OS << FillChar; + } + }; + + // Finish line after \p Address was printed. + auto finishLine = [&](uint64_t Address) { + const auto End = alignTo(Address + 1, BytesPerLine); + for (auto Fill = Address + BucketSize; Fill < End; Fill += BucketSize) + OS << FillChar; + OS << '\n'; + }; + + // Fill empty space in (Start, End) range. + auto fillRange = [&](uint64_t Start, uint64_t End) { + if ((Start / BytesPerLine) == (End / BytesPerLine)) { + for (auto Fill = Start + BucketSize; Fill < End; Fill += BucketSize) { + changeColor(DefaultColor); + OS << FillChar; + } + return; + } + + changeColor(DefaultColor); + finishLine(Start); + Start = alignTo(Start, BytesPerLine); + + uint64_t NumEmptyLines = (End - Start) / BytesPerLine; + + if (NumEmptyLines > 32) { + OS << '\n'; + } else { + while (NumEmptyLines--) { + startLine(Start, /*Empty=*/true); + OS << '\n'; + Start += BytesPerLine; + } + } + + startLine(End); + }; + + static raw_ostream::Colors Colors[] = { + raw_ostream::WHITE, + raw_ostream::WHITE, + raw_ostream::CYAN, + raw_ostream::GREEN, + raw_ostream::YELLOW, + raw_ostream::RED + }; + constexpr size_t NumRanges = sizeof(Colors) / sizeof(Colors[0]); + + uint64_t Range[NumRanges]; + for (uint64_t I = 0; I < NumRanges; ++I) + Range[I] = std::max(I + 1, + (uint64_t) std::pow((double) MaxValue, + (double) (I + 1) / 
NumRanges)); + Range[NumRanges - 1] = std::max((uint64_t) NumRanges, MaxValue); + + // Print scaled value + auto printValue = [&](uint64_t Value, bool ResetColor = false) { + assert(Value && "should only print positive values"); + for (unsigned I = 0; I < sizeof(Range) / sizeof(Range[0]); ++I) { + if (Value <= Range[I]) { + changeColor(Colors[I]); + break; + } + } + if (Value <= Range[0]) { + OS << 'o'; + } else { + OS << 'O'; + } + if (ResetColor) + changeColor(DefaultColor); + }; + + // Print against black background + OS.changeColor(raw_ostream::BLACK, /*Bold=*/false, /*Background=*/true); + changeColor(DefaultColor); + + // Print map legend + OS << "Legend:\n"; + uint64_t PrevValue = 0; + for (unsigned I = 0; I < sizeof(Range) / sizeof(Range[0]); ++I) { + const auto Value = Range[I]; + OS << " "; + printValue(Value, true); + OS << " : (" << PrevValue << ", " << Value << "]\n"; + PrevValue = Value; + } + + // Pos - character position from right in hex form. + auto printHeader = [&](unsigned Pos) { + OS << " "; + if (MaxAddress > 0xffffffff) + OS << " "; + unsigned PrevValue = unsigned(-1); + for (unsigned I = 0; I < BytesPerLine; I += BucketSize) { + const auto Value = (I & ((1 << Pos * 4) - 1)) >> (Pos - 1) * 4; + if (Value != PrevValue) { + OS << Twine::utohexstr(Value); + PrevValue = Value; + } else { + OS << ' '; + } + } + OS << '\n'; + }; + for (unsigned I = 5; I > 0; --I) + printHeader(I); + + uint64_t PrevAddress = 0; + for (auto MI = Map.begin(), ME = Map.end(); MI != ME; ++MI) { + auto &Entry = *MI; + uint64_t Address = Entry.first * BucketSize; + + if (PrevAddress) { + fillRange(PrevAddress, Address); + } else { + startLine(Address); + } + + printValue(Entry.second); + + PrevAddress = Address; + } + + if (PrevAddress) { + changeColor(DefaultColor); + finishLine(PrevAddress); + } +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/src/Heatmap.h b/bolt/src/Heatmap.h new file mode 100644 index 000000000000..a66e9e1d4879 --- /dev/null +++ 
b/bolt/src/Heatmap.h @@ -0,0 +1,71 @@ +//===-- Heatmap.cpp ---------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_HEATMAP_H +#define LLVM_TOOLS_LLVM_BOLT_HEATMAP_H + +#include "llvm/Support/raw_ostream.h" +#include + +namespace llvm { +namespace bolt { + +class Heatmap { + /// Number of bytes per entry in the heat map. + size_t BucketSize; + + /// Maximum address that is considered to be valid. + uint64_t MaxAddress; + + /// Count invalid ranges. + uint64_t NumSkippedRanges{0}; + + /// Map buckets to the number of samples. + std::map Map; + +public: + explicit Heatmap(uint64_t BucketSize = 4096, + uint64_t MaxAddress = std::numeric_limits::max()) + : BucketSize(BucketSize), MaxAddress(MaxAddress) + {}; + + inline bool ignoreAddress(uint64_t Address) const { + return Address > MaxAddress; + } + + /// Register a single sample at \p Address. + void registerAddress(uint64_t Address) { + if (!ignoreAddress(Address)) + ++Map[Address / BucketSize]; + } + + /// Register \p Count samples at [\p StartAddress, \p EndAddress ]. + void registerAddressRange(uint64_t StartAddress, uint64_t EndAddress, + uint64_t Count); + + /// Return the number of ranges that failed to register. 
+ uint64_t getNumInvalidRanges() const { + return NumSkippedRanges; + } + + void print(StringRef FileName) const; + + void print(raw_ostream &OS) const; + + size_t size() const { + return Map.size(); + } +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 19ca5b10491d..ae6e5d5280f4 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -75,6 +75,8 @@ using namespace bolt; namespace opts { +extern bool HeatmapMode; + extern cl::OptionCategory BoltCategory; extern cl::OptionCategory BoltDiffCategory; extern cl::OptionCategory BoltOptCategory; @@ -357,7 +359,8 @@ Verbosity("v", cl::desc("set verbosity level for diagnostic output"), cl::init(0), cl::ZeroOrMore, - cl::cat(BoltCategory)); + cl::cat(BoltCategory), + cl::sub(*cl::AllSubCommands)); cl::opt AggregateOnly("aggregate-only", @@ -857,8 +860,9 @@ void RewriteInstance::discoverStorage() { SectionContents.data() - InputFile->getData().data(); } - if (SectionName.startswith(OrgSecPrefix) || - SectionName.startswith(BOLTSecPrefix)) { + if (!opts::HeatmapMode && + (SectionName.startswith(OrgSecPrefix) || + SectionName.startswith(BOLTSecPrefix))) { errs() << "BOLT-ERROR: input file was processed by BOLT. " "Cannot re-optimize.\n"; exit(1); @@ -1781,11 +1785,6 @@ void RewriteInstance::readSpecialSections() { BC->printSections(outs()); } - if (opts::PrintSections) { - outs() << "BOLT-INFO: Sections:\n"; - BC->printSections(outs()); - } - if (opts::RelocationMode == cl::BOU_TRUE && !HasTextRelocations) { errs() << "BOLT-ERROR: relocations against code are missing from the input " "file. 
Cannot proceed in relocations mode (-relocs).\n"; diff --git a/bolt/src/llvm-bolt.cpp b/bolt/src/llvm-bolt.cpp index a07e31328d39..bbe1b0f3a9b4 100644 --- a/bolt/src/llvm-bolt.cpp +++ b/bolt/src/llvm-bolt.cpp @@ -33,6 +33,8 @@ using namespace bolt; namespace opts { +bool HeatmapMode = false; + cl::OptionCategory BoltCategory("BOLT generic options"); cl::OptionCategory BoltDiffCategory("BOLTDIFF generic options"); cl::OptionCategory BoltOptCategory("BOLT optimization options"); @@ -50,6 +52,8 @@ static cl::OptionCategory *BoltDiffCategories[] = {&BoltDiffCategory}; static cl::OptionCategory *Perf2BoltCategories[] = {&AggregatorCategory, &BoltOutputCategory}; +cl::SubCommand HeatmapCommand("heatmap", "generate heatmap"); + extern cl::opt OutputFilename; extern cl::opt AggregateOnly; extern cl::opt DiffOnly; @@ -77,7 +81,8 @@ InputFilename( cl::Positional, cl::desc(""), cl::Required, - cl::cat(BoltDiffCategory)); + cl::cat(BoltCategory), + cl::sub(*cl::AllSubCommands)); static cl::opt InputFilename2( @@ -90,7 +95,8 @@ static cl::opt PerfData("perfdata", cl::desc(""), cl::Optional, - cl::cat(AggregatorCategory)); + cl::cat(AggregatorCategory), + cl::sub(*cl::AllSubCommands)); static cl::alias PerfDataA("p", @@ -155,6 +161,34 @@ void perf2boltMode(int argc, char **argv) { opts::AggregateOnly = true; } +void heatmapMode(int argc, char **argv) { + // Insert a fake subcommand if invoked via a command alias. 
+ std::unique_ptr FakeArgv; + if (argc == 1 || strcmp(argv[1], "heatmap")) { + ++argc; + FakeArgv.reset(new char *[argc+1]); + FakeArgv[0] = argv[0]; + FakeArgv[1] = const_cast("heatmap"); + for (int I = 2; I < argc; ++I) + FakeArgv[I] = argv[I - 1]; + FakeArgv[argc] = nullptr; + argv = FakeArgv.get(); + } + + cl::ParseCommandLineOptions(argc, argv, ""); + + if (!sys::fs::exists(opts::InputFilename)) + report_error(opts::InputFilename, errc::no_such_file_or_directory); + + if (opts::PerfData.empty()) { + errs() << ToolName << ": expected -perfdata= option.\n"; + exit(1); + } + + opts::HeatmapMode = true; + opts::AggregateOnly = true; +} + void boltDiffMode(int argc, char **argv) { cl::HideUnrelatedOptions(makeArrayRef(opts::BoltDiffCategories)); cl::ParseCommandLineOptions( @@ -213,10 +247,19 @@ int main(int argc, char **argv) { ToolName = argv[0]; + // Pre-process subcommands. + if (argc > 1 && *argv[1] != '-') { + if (!strcmp(argv[1], "heatmap")) + opts::HeatmapMode = true; + } + if (llvm::sys::path::filename(ToolName) == "perf2bolt") perf2boltMode(argc, argv); else if (llvm::sys::path::filename(ToolName) == "llvm-boltdiff") boltDiffMode(argc, argv); + else if (llvm::sys::path::filename(ToolName) == "llvm-bolt-heatmap" || + opts::HeatmapMode) + heatmapMode(argc, argv); else boltMode(argc, argv); From e5d4a5381b53b50416c16b0c278695d053f4998a Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 12 Mar 2019 16:36:35 -0700 Subject: [PATCH 503/904] Do not assert on addresses read from processIndirectBranch Summary: As part of our heuristics to decode an indirect branch, if we suspect the branch is an indirect tail call, we add its probable target to the BC::InterproceduralReferences vector to detect functions with more than one entry point. However, if this probable target is not in an allocatable section, we were asserting. Remove this assertion and change the code to conditionally store to InterproceduralReferences instead. 
The probable target could be garbage at this point because of analyzeIndirectBranch failing to identify the load instruction that has the memory address of the target, so we should tolerate this. (cherry picked from commit d0df41280d3d0bb605b3dd6d0193b39ac748d859) --- bolt/src/BinaryFunction.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index bc3ecd4c9b48..979e69907684 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -932,8 +932,16 @@ BinaryFunction::processIndirectBranch(MCInst &Instruction, return Type; } - assert(!Value || BC.getSectionForAddress(Value)); - BC.InterproceduralReferences.insert(Value); + + // We have a possible tail call, so let's add the value read from the possible + // memory location as a reference. Only do that if the address we read is sane + // enough (is inside an allocatable section). It is possible that we read + // garbage if the load instruction we analyzed is in a basic block different + // than the one where the indirect jump is. However, later, + // postProcessIndirectBranches() is going to mark the function as non-simple + // in this case. 
+ if (Value && BC.getSectionForAddress(Value)) + BC.InterproceduralReferences.insert(Value); return IndirectBranchType::POSSIBLE_TAIL_CALL; } From 2c5653511cc14b2c829442368929773feb78767e Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 14 Mar 2019 18:49:40 -0700 Subject: [PATCH 504/904] [NFC][BOLT] Move ExecutableFileMemoryManager into its own file (cherry picked from commit c810177b6c5cb883a748eda389b29d317b056661) --- bolt/src/CMakeLists.txt | 1 + bolt/src/ExecutableFileMemoryManager.cpp | 106 +++++++++++++++++++++++ bolt/src/ExecutableFileMemoryManager.h | 92 ++++++++++++++++++++ bolt/src/RewriteInstance.cpp | 83 +----------------- bolt/src/RewriteInstance.h | 66 +------------- 5 files changed, 201 insertions(+), 147 deletions(-) create mode 100644 bolt/src/ExecutableFileMemoryManager.cpp create mode 100644 bolt/src/ExecutableFileMemoryManager.h diff --git a/bolt/src/CMakeLists.txt b/bolt/src/CMakeLists.txt index e10db2e99519..28b2392d2338 100644 --- a/bolt/src/CMakeLists.txt +++ b/bolt/src/CMakeLists.txt @@ -77,6 +77,7 @@ add_llvm_tool(llvm-bolt DebugData.cpp DWARFRewriter.cpp Exceptions.cpp + ExecutableFileMemoryManager.cpp Heatmap.cpp JumpTable.cpp MCPlusBuilder.cpp diff --git a/bolt/src/ExecutableFileMemoryManager.cpp b/bolt/src/ExecutableFileMemoryManager.cpp new file mode 100644 index 000000000000..e0aea5d8b96e --- /dev/null +++ b/bolt/src/ExecutableFileMemoryManager.cpp @@ -0,0 +1,106 @@ +//===--- ExecutableFileMemoryManager.cpp ----------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "ExecutableFileMemoryManager.h" +#include "RewriteInstance.h" + +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt" + +using namespace llvm; +using namespace object; +using namespace bolt; + +namespace llvm { + +namespace bolt { + +uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size, + unsigned Alignment, + unsigned SectionID, + StringRef SectionName, + bool IsCode, + bool IsReadOnly) { + // Register as note section (non-allocatable) if we recognize it as so + for (auto &OverwriteName : RewriteInstance::SectionsToOverwrite) { + if (SectionName == OverwriteName) { + uint8_t *DataCopy = new uint8_t[Size]; + auto &Section = BC.registerOrUpdateNoteSection(SectionName, + DataCopy, + Size, + Alignment); + Section.setSectionID(SectionID); + assert(!Section.isAllocatable() && "note sections cannot be allocatable"); + return DataCopy; + } + } + + uint8_t *Ret; + if (IsCode) { + Ret = SectionMemoryManager::allocateCodeSection(Size, Alignment, + SectionID, SectionName); + } else { + Ret = SectionMemoryManager::allocateDataSection(Size, Alignment, + SectionID, SectionName, + IsReadOnly); + } + + const auto Flags = BinarySection::getFlags(IsReadOnly, IsCode, true); + auto &Section = BC.registerOrUpdateSection(SectionName, + ELF::SHT_PROGBITS, + Flags, + Ret, + Size, + Alignment); + Section.setSectionID(SectionID); + assert(Section.isAllocatable() && + "verify that allocatable is marked as allocatable"); + + DEBUG(dbgs() << "BOLT: allocating " << (Section.isLocal() ? "local " : "") + << (IsCode ? "code" : (IsReadOnly ? "read-only data" : "data")) + << " section : " << SectionName + << " with size " << Size << ", alignment " << Alignment + << " at 0x" << Ret << ", ID = " << SectionID << "\n"); + + return Ret; +} + +/// Notifier for non-allocatable (note) section. 
+uint8_t *ExecutableFileMemoryManager::recordNoteSection( + const uint8_t *Data, + uintptr_t Size, + unsigned Alignment, + unsigned SectionID, + StringRef SectionName) { + DEBUG(dbgs() << "BOLT: note section " + << SectionName + << " with size " << Size << ", alignment " << Alignment + << " at 0x" + << Twine::utohexstr(reinterpret_cast(Data)) << '\n'); + auto &Section = BC.registerOrUpdateNoteSection(SectionName, + copyByteArray(Data, Size), + Size, + Alignment); + Section.setSectionID(SectionID); + assert(!Section.isAllocatable() && "note sections cannot be allocatable"); + return Section.getOutputData(); +} + +bool ExecutableFileMemoryManager::finalizeMemory(std::string *ErrMsg) { + DEBUG(dbgs() << "BOLT: finalizeMemory()\n"); + return SectionMemoryManager::finalizeMemory(ErrMsg); +} + +ExecutableFileMemoryManager::~ExecutableFileMemoryManager() { } + +} + +} diff --git a/bolt/src/ExecutableFileMemoryManager.h b/bolt/src/ExecutableFileMemoryManager.h new file mode 100644 index 000000000000..ca820df14101 --- /dev/null +++ b/bolt/src/ExecutableFileMemoryManager.h @@ -0,0 +1,92 @@ +//===--- ExecutableFileMemoryManager.h ------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_EXECUTABLE_FILE_MEMORY_MANAGER_H +#define LLVM_TOOLS_LLVM_BOLT_EXECUTABLE_FILE_MEMORY_MANAGER_H + +#include "BinaryContext.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ExecutionEngine/SectionMemoryManager.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +namespace bolt { + +struct SegmentInfo { + uint64_t Address; /// Address of the segment in memory. 
+ uint64_t Size; /// Size of the segment in memory. + uint64_t FileOffset; /// Offset in the file. + uint64_t FileSize; /// Size in file. + + void print(raw_ostream &OS) const { + OS << "SegmentInfo { Address: 0x" + << Twine::utohexstr(Address) << ", Size: 0x" + << Twine::utohexstr(Size) << ", FileOffset: 0x" + << Twine::utohexstr(FileOffset) << ", FileSize: 0x" + << Twine::utohexstr(FileSize) << "}"; + }; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const SegmentInfo &SegInfo) { + SegInfo.print(OS); + return OS; +} + +/// Class responsible for allocating and managing code and data sections. +class ExecutableFileMemoryManager : public SectionMemoryManager { +private: + uint8_t *allocateSection(intptr_t Size, + unsigned Alignment, + unsigned SectionID, + StringRef SectionName, + bool IsCode, + bool IsReadOnly); + BinaryContext &BC; + bool AllowStubs; + +public: + /// [start memory address] -> [segment info] mapping. + std::map SegmentMapInfo; + + ExecutableFileMemoryManager(BinaryContext &BC, bool AllowStubs) + : BC(BC), AllowStubs(AllowStubs) {} + + ~ExecutableFileMemoryManager(); + + uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, + unsigned SectionID, + StringRef SectionName) override { + return allocateSection(Size, Alignment, SectionID, SectionName, + /*IsCode=*/true, true); + } + + uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, + unsigned SectionID, StringRef SectionName, + bool IsReadOnly) override { + return allocateSection(Size, Alignment, SectionID, SectionName, + /*IsCode=*/false, IsReadOnly); + } + + uint8_t *recordNoteSection(const uint8_t *Data, uintptr_t Size, + unsigned Alignment, unsigned SectionID, + StringRef SectionName) override; + + bool allowStubAllocation() const override { return AllowStubs; } + + bool finalizeMemory(std::string *ErrMsg = nullptr) override; +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 
ae6e5d5280f4..7c2a518871fa 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -18,6 +18,7 @@ #include "DataAggregator.h" #include "DataReader.h" #include "Exceptions.h" +#include "ExecutableFileMemoryManager.h" #include "MCPlusBuilder.h" #include "ProfileReader.h" #include "ProfileWriter.h" @@ -538,88 +539,6 @@ bool refersToReorderedSection(ErrorOr Section) { return Itr != opts::ReorderData.end(); } -} - -uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size, - unsigned Alignment, - unsigned SectionID, - StringRef SectionName, - bool IsCode, - bool IsReadOnly) { - // Register as note section (non-allocatable) if we recognize it as so - for (auto &OverwriteName : RewriteInstance::SectionsToOverwrite) { - if (SectionName == OverwriteName) { - uint8_t *DataCopy = new uint8_t[Size]; - auto &Section = BC.registerOrUpdateNoteSection(SectionName, - DataCopy, - Size, - Alignment); - Section.setSectionID(SectionID); - assert(!Section.isAllocatable() && "note sections cannot be allocatable"); - return DataCopy; - } - } - - uint8_t *Ret; - if (IsCode) { - Ret = SectionMemoryManager::allocateCodeSection(Size, Alignment, - SectionID, SectionName); - } else { - Ret = SectionMemoryManager::allocateDataSection(Size, Alignment, - SectionID, SectionName, - IsReadOnly); - } - - const auto Flags = BinarySection::getFlags(IsReadOnly, IsCode, true); - auto &Section = BC.registerOrUpdateSection(SectionName, - ELF::SHT_PROGBITS, - Flags, - Ret, - Size, - Alignment); - Section.setSectionID(SectionID); - assert(Section.isAllocatable() && - "verify that allocatable is marked as allocatable"); - - DEBUG(dbgs() << "BOLT: allocating " << (Section.isLocal() ? "local " : "") - << (IsCode ? "code" : (IsReadOnly ? 
"read-only data" : "data")) - << " section : " << SectionName - << " with size " << Size << ", alignment " << Alignment - << " at 0x" << Ret << ", ID = " << SectionID << "\n"); - - return Ret; -} - -/// Notifier for non-allocatable (note) section. -uint8_t *ExecutableFileMemoryManager::recordNoteSection( - const uint8_t *Data, - uintptr_t Size, - unsigned Alignment, - unsigned SectionID, - StringRef SectionName) { - DEBUG(dbgs() << "BOLT: note section " - << SectionName - << " with size " << Size << ", alignment " << Alignment - << " at 0x" - << Twine::utohexstr(reinterpret_cast(Data)) << '\n'); - auto &Section = BC.registerOrUpdateNoteSection(SectionName, - copyByteArray(Data, Size), - Size, - Alignment); - Section.setSectionID(SectionID); - assert(!Section.isAllocatable() && "note sections cannot be allocatable"); - return Section.getOutputData(); -} - -bool ExecutableFileMemoryManager::finalizeMemory(std::string *ErrMsg) { - DEBUG(dbgs() << "BOLT: finalizeMemory()\n"); - return SectionMemoryManager::finalizeMemory(ErrMsg); -} - -ExecutableFileMemoryManager::~ExecutableFileMemoryManager() { } - -namespace { - StringRef getSectionName(SectionRef Section) { StringRef SectionName; Section.getName(SectionName); diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index 19e9f6cb221c..4ed7ad148e4f 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -16,9 +16,9 @@ #include "BinaryFunction.h" #include "DebugData.h" +#include "ExecutableFileMemoryManager.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" -#include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/MC/StringTableBuilder.h" #include "llvm/Object/ELFObjectFile.h" #include "llvm/Object/ObjectFile.h" @@ -40,70 +40,6 @@ class DataAggregator; class DataReader; class RewriteInstanceDiff; -struct SegmentInfo { - uint64_t Address; /// Address of the segment in memory. - uint64_t Size; /// Size of the segment in memory. 
- uint64_t FileOffset; /// Offset in the file. - uint64_t FileSize; /// Size in file. - - void print(raw_ostream &OS) const { - OS << "SegmentInfo { Address: 0x" - << Twine::utohexstr(Address) << ", Size: 0x" - << Twine::utohexstr(Size) << ", FileOffset: 0x" - << Twine::utohexstr(FileOffset) << ", FileSize: 0x" - << Twine::utohexstr(FileSize) << "}"; - }; -}; - -inline raw_ostream &operator<<(raw_ostream &OS, const SegmentInfo &SegInfo) { - SegInfo.print(OS); - return OS; -} - -/// Class responsible for allocating and managing code and data sections. -class ExecutableFileMemoryManager : public SectionMemoryManager { -private: - uint8_t *allocateSection(intptr_t Size, - unsigned Alignment, - unsigned SectionID, - StringRef SectionName, - bool IsCode, - bool IsReadOnly); - BinaryContext &BC; - bool AllowStubs; - -public: - /// [start memory address] -> [segment info] mapping. - std::map SegmentMapInfo; - - ExecutableFileMemoryManager(BinaryContext &BC, bool AllowStubs) - : BC(BC), AllowStubs(AllowStubs) {} - - ~ExecutableFileMemoryManager(); - - uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, - unsigned SectionID, - StringRef SectionName) override { - return allocateSection(Size, Alignment, SectionID, SectionName, - /*IsCode=*/true, true); - } - - uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, - unsigned SectionID, StringRef SectionName, - bool IsReadOnly) override { - return allocateSection(Size, Alignment, SectionID, SectionName, - /*IsCode=*/false, IsReadOnly); - } - - uint8_t *recordNoteSection(const uint8_t *Data, uintptr_t Size, - unsigned Alignment, unsigned SectionID, - StringRef SectionName) override; - - bool allowStubAllocation() const override { return AllowStubs; } - - bool finalizeMemory(std::string *ErrMsg = nullptr) override; -}; - /// This class encapsulates all data necessary to carry on binary reading, /// disassembly, CFG building, BB reordering (among other binary-level /// optimizations) and rewriting. 
It also has the logic to coordinate such From fec2b64d22e3cdbedb984c0a5b8e316529e7b346 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 14 Mar 2019 18:51:05 -0700 Subject: [PATCH 505/904] [BOLT] Refactor allocatable sections rewrite part Summary: This refactoring makes it easier to create new code sections and control code placement. As an example, cold code is being placed into ".text.cold" which is emitted independently from ".text", and the final address assignment becomes more flexible. Previously, in non-relocation mode we used to emit temporary section name into .shstrtab. This resulted in unnecessary bloat of this section. There was unnecessary padding emitted at the end of text section. After fixing this, the output binary becomes smaller. I had to change the way exception handling tables are re-written as the current infra does not support cross-section label difference. This means we have to emit absolute landing pad addresses, which might not work for PIE binaries. I'm going to address this once I investigate the current exception handling issues in PIEs. This diff temporarily disables "-hot-functions-at-end" option. 
(cherry picked from commit 900d9a38616ee50ab0deac7bbb3d9eb71c17b21f) --- bolt/src/BinaryContext.cpp | 30 +- bolt/src/BinaryContext.h | 33 +- bolt/src/BinaryData.cpp | 4 +- bolt/src/BinaryFunction.h | 16 + bolt/src/BinarySection.cpp | 2 +- bolt/src/BinarySection.h | 15 +- bolt/src/Exceptions.cpp | 47 +-- bolt/src/RewriteInstance.cpp | 729 +++++++++++++++++------------------ bolt/src/RewriteInstance.h | 31 +- 9 files changed, 440 insertions(+), 467 deletions(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index 2e1a63dadbd2..d95023b5a3e3 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -67,14 +67,6 @@ PrintMemData("print-mem-data", cl::ZeroOrMore, cl::cat(BoltCategory)); -cl::opt -HotFunctionsAtEnd( - "hot-functions-at-end", - cl::desc( - "if reorder-functions is used, order functions putting hottest last"), - cl::ZeroOrMore, - cl::cat(BoltCategory)); - } // namespace opts BinaryContext::BinaryContext(std::unique_ptr Ctx, @@ -696,15 +688,6 @@ unsigned BinaryContext::addDebugFilenameToUnit(const uint32_t DestCUID, return cantFail(Ctx->getDwarfFile(Dir, FileName, 0, nullptr, None, DestCUID)); } -template -bool Comparator(const BinaryFunction *A, const BinaryFunction *B) { - if (A->hasValidIndex() && B->hasValidIndex()) { - return A->getIndex() < B->getIndex(); - } else { - return FuncsAtEnd ? 
B->hasValidIndex() : A->hasValidIndex(); - } -} - std::vector BinaryContext::getSortedFunctions( std::map &BinaryFunctions) { std::vector SortedFunctions(BinaryFunctions.size()); @@ -714,14 +697,13 @@ std::vector BinaryContext::getSortedFunctions( return &BFI.second; }); - if (opts::HotFunctionsAtEnd) { - std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(), - Comparator); - return SortedFunctions; - } - std::stable_sort(SortedFunctions.begin(), SortedFunctions.end(), - Comparator); + [] (const BinaryFunction *A, const BinaryFunction *B) { + if (A->hasValidIndex() && B->hasValidIndex()) { + return A->getIndex() < B->getIndex(); + } + return A->hasValidIndex(); + }); return SortedFunctions; } diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index 2075821a4163..ba3abdf03eb1 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -448,6 +448,14 @@ class BinaryContext { return Itr != GlobalSymbols.end() ? Itr->second : nullptr; } + MCSymbol *getHotTextStartSymbol() const { + return Ctx->getOrCreateSymbol("__hot_start"); + } + + MCSymbol *getHotTextEndSymbol() const { + return Ctx->getOrCreateSymbol("__hot_end"); + } + /// Perform any necessary post processing on the symbol table after /// function disassembly is complete. This processing fixes top /// level data holes and makes sure the symbol table is valid. @@ -535,6 +543,19 @@ class BinaryContext { Sections.end())); } + /// Iterate over all registered code sections. + iterator_range textSections() { + auto isText = [](const SectionIterator &Itr) { + return *Itr && Itr->isAllocatable() && Itr->isText(); + }; + return make_range(FilteredSectionIterator(isText, + Sections.begin(), + Sections.end()), + FilteredSectionIterator(isText, + Sections.end(), + Sections.end())); + } + /// Iterate over all registered allocatable sections. 
iterator_range allocatableSections() const { return const_cast(this)->allocatableSections(); @@ -598,18 +619,10 @@ class BinaryContext { return make_range(NameToSection.equal_range(Name)); } - /// Return the unique (allocatable) section associated with given \p Name. + /// Return the unique section associated with given \p Name. /// If there is more than one section with the same name, return an error /// object. - ErrorOr getUniqueSectionByName(StringRef SectionName) { - auto Sections = getSectionByName(SectionName); - if (Sections.begin() != Sections.end() && - std::next(Sections.begin()) == Sections.end()) - return *Sections.begin()->second; - return std::make_error_code(std::errc::bad_address); - } - ErrorOr - getUniqueSectionByName(StringRef SectionName) const { + ErrorOr getUniqueSectionByName(StringRef SectionName) const { auto Sections = getSectionByName(SectionName); if (Sections.begin() != Sections.end() && std::next(Sections.begin()) == Sections.end()) diff --git a/bolt/src/BinaryData.cpp b/bolt/src/BinaryData.cpp index 3374a8c31543..925720cce60d 100644 --- a/bolt/src/BinaryData.cpp +++ b/bolt/src/BinaryData.cpp @@ -73,8 +73,8 @@ StringRef BinaryData::getOutputSectionName() const { } uint64_t BinaryData::getOutputAddress() const { - assert(OutputSection->getFileAddress()); - return OutputSection->getFileAddress() + OutputOffset; + assert(OutputSection->getOutputAddress()); + return OutputSection->getOutputAddress() + OutputOffset; } uint64_t BinaryData::getOffset() const { diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index b405ab56ae6c..fb2afa13754d 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -18,6 +18,7 @@ #include "BinaryBasicBlock.h" #include "BinaryContext.h" #include "BinaryLoop.h" +#include "BinarySection.h" #include "DataReader.h" #include "DebugData.h" #include "JumpTable.h" @@ -1240,16 +1241,31 @@ class BinaryFunction { /// Get data used by this function. 
std::set dataUses(bool OnlyHot) const; + /// Return the name of the section this function originated from. + StringRef getOriginSectionName() const { + return getSection().getName(); + } + /// Return internal section name for this function. StringRef getCodeSectionName() const { return StringRef(CodeSectionName); } + /// Get output code section. + ErrorOr getCodeSection() const { + return BC.getUniqueSectionByName(getCodeSectionName()); + } + /// Return cold code section name for the function. StringRef getColdCodeSectionName() const { return StringRef(ColdCodeSectionName); } + /// Get output code section for cold code of this function. + ErrorOr getColdCodeSection() const { + return BC.getUniqueSectionByName(getColdCodeSectionName()); + } + /// Return true iff the function will halt execution on entry. bool trapsOnEntry() const { return TrapsOnEntry; diff --git a/bolt/src/BinarySection.cpp b/bolt/src/BinarySection.cpp index daea0c4a06fb..3e163868e54d 100644 --- a/bolt/src/BinarySection.cpp +++ b/bolt/src/BinarySection.cpp @@ -78,7 +78,7 @@ void BinarySection::print(raw_ostream &OS) const { OS << getName() << ", " << "0x" << Twine::utohexstr(getAddress()) << ", " << getSize() - << " (0x" << Twine::utohexstr(getFileAddress()) << ", " + << " (0x" << Twine::utohexstr(getOutputAddress()) << ", " << getOutputSize() << ")" << ", data = " << getData() << ", output data = " << getOutputData(); diff --git a/bolt/src/BinarySection.h b/bolt/src/BinarySection.h index 26e071e85f4e..a8309b99a9b3 100644 --- a/bolt/src/BinarySection.h +++ b/bolt/src/BinarySection.h @@ -62,12 +62,13 @@ class BinarySection { // finalized? std::string OutputName; // Output section name (if the section has // been renamed) - uint64_t FileAddress{0}; // Section address for the rewritten binary. + uint64_t OutputAddress{0}; // Section address for the rewritten binary. uint64_t OutputSize{0}; // Section size in the rewritten binary. uint64_t FileOffset{0}; // File offset in the rewritten binary file. 
StringRef OutputContents; // Rewritten section contents. unsigned SectionID{-1u}; // Unique ID used for address mapping. // Set by ExecutableFileMemoryManager. + uint32_t Index{0}; // Section index in the output file. mutable bool IsReordered{false}; // Have the contents been reordered? uint64_t hash(const BinaryData &BD, @@ -371,7 +372,7 @@ class BinarySection { uint64_t getAllocAddress() const { return reinterpret_cast(getOutputData()); } - uint64_t getFileAddress() const { return FileAddress; } + uint64_t getOutputAddress() const { return OutputAddress; } uint64_t getFileOffset() const { return FileOffset; } unsigned getSectionID() const { assert(hasValidSectionID() && "trying to use uninitialized section id"); @@ -380,10 +381,13 @@ class BinarySection { bool hasValidSectionID() const { return SectionID != -1u; } + uint32_t getIndex() const { + return Index; + } // mutation - void setFileAddress(uint64_t Address) { - FileAddress = Address; + void setOutputAddress(uint64_t Address) { + OutputAddress = Address; } void setFileOffset(uint64_t Offset) { FileOffset = Offset; @@ -392,6 +396,9 @@ class BinarySection { assert(!hasValidSectionID() && "trying to set section id twice"); SectionID = ID; } + void setIndex(uint32_t I) { + Index = I; + } void setOutputName(StringRef Name) { OutputName = Name; } diff --git a/bolt/src/Exceptions.cpp b/bolt/src/Exceptions.cpp index cfd039ed1537..9b9258884bde 100644 --- a/bolt/src/Exceptions.cpp +++ b/bolt/src/Exceptions.cpp @@ -526,42 +526,19 @@ void BinaryFunction::emitLSDA(MCStreamer *Streamer, bool EmitColdPart) { // a landing pad, this means that the first landing pad offset will be 0. // As a result, an exception handling runtime will ignore this landing pad, // because zero offset denotes the absence of a landing pad. + // For this reason, we emit LPStart value of 0 and output an absolute value + // of the landing pad in the table. 
// - // To workaround this issue, we issue a special LPStart for cold fragments - // that is equal to FDE start minus 1 byte. - // - // Note that main function fragment cannot start with a landing pad and we - // omit LPStart. - const MCExpr *LPStartExpr = nullptr; - std::function emitLandingPad; - if (EmitColdPart) { - Streamer->EmitIntValue(dwarf::DW_EH_PE_udata4, 1); // LPStart format - LPStartExpr = MCBinaryExpr::createSub( - MCSymbolRefExpr::create(StartSymbol, *BC.Ctx.get()), - MCConstantExpr::create(1, *BC.Ctx.get()), - *BC.Ctx.get()); - Streamer->EmitValue(LPStartExpr, 4); - emitLandingPad = [&](const MCSymbol *LPSymbol) { - if (!LPSymbol) { - Streamer->EmitIntValue(0, 4); - return; - } - Streamer->EmitValue(MCBinaryExpr::createSub( - MCSymbolRefExpr::create(LPSymbol, *BC.Ctx.get()), - LPStartExpr, - *BC.Ctx.get()), - 4); - }; - } else { - Streamer->EmitIntValue(dwarf::DW_EH_PE_omit, 1); // LPStart format - emitLandingPad = [&](const MCSymbol *LPSymbol) { - if (!LPSymbol) { - Streamer->EmitIntValue(0, 4); - return; - } - Streamer->emitAbsoluteSymbolDiff(LPSymbol, StartSymbol, 4); - }; - } + // FIXME: this may break PIEs and DSOs where the base address is not 0. 
+ Streamer->EmitIntValue(dwarf::DW_EH_PE_udata4, 1); // LPStart format + Streamer->EmitIntValue(0, 4); + auto emitLandingPad = [&](const MCSymbol *LPSymbol) { + if (!LPSymbol) { + Streamer->EmitIntValue(0, 4); + return; + } + Streamer->EmitSymbolValue(LPSymbol, 4); + }; Streamer->EmitIntValue(TTypeEncoding, 1); // TType format diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 7c2a518871fa..ddf81ccba3f8 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -87,7 +87,6 @@ extern cl::OptionCategory AggregatorCategory; extern cl::opt AlignMacroOpFusion; extern cl::opt JumpTables; extern cl::list ReorderData; -extern cl::opt HotFunctionsAtEnd; static cl::opt ForceToDataRelocations("force-data-relocations", @@ -185,6 +184,14 @@ FunctionPadSpec("pad-funcs", cl::cat(BoltCategory)); cl::opt +HotFunctionsAtEnd( + "hot-functions-at-end", + cl::desc( + "if reorder-functions is used, order functions putting hottest last"), + cl::ZeroOrMore, + cl::cat(BoltCategory)); + +static cl::opt HotText("hot-text", cl::desc("hot text symbols support (relocation mode)"), cl::ZeroOrMore, @@ -388,7 +395,7 @@ bool shouldProcess(const BinaryFunction &Function) { if (opts::MaxFunctions && Function.getFunctionNumber() >= opts::MaxFunctions) { if (Function.getFunctionNumber() == opts::MaxFunctions) { - dbgs() << "BOLT-INFO: processing ending on " << Function << "\n"; + outs() << "BOLT-INFO: processing ending on " << Function << "\n"; } else { return false; } @@ -424,16 +431,12 @@ bool shouldProcess(const BinaryFunction &Function) { if (!IsValid) return false; - if (!SkipFunctionNames.empty()) { - for (auto &Name : SkipFunctionNames) { - if (Function.hasName(Name)) { - IsValid = false; - break; - } - } + for (auto &Name : SkipFunctionNames) { + if (Function.hasName(Name)) + return false; } - return IsValid; + return true; } size_t padFunction(const BinaryFunction &Function) { @@ -941,7 +944,7 @@ void RewriteInstance::run() { if (opts::DiffOnly) 
return; runOptimizationPasses(); - emitFunctions(); + emitSections(); }; outs() << "BOLT-INFO: Target architecture: " @@ -1765,6 +1768,11 @@ void RewriteInstance::adjustCommandLineOptions() { "was specified\n"; opts::AlignMacroOpFusion = MFT_ALL; } + + if (opts::HotText && !BC->HasRelocations) { + outs() << "BOLT-WARNING: hot text is disabled in non-relocation mode\n"; + opts::HotText = false; + } } namespace { @@ -2592,23 +2600,15 @@ void RewriteInstance::emitFunction(MCStreamer &Streamer, if (Function.getState() == BinaryFunction::State::Empty) return; - MCSection *Section; - if (BC->HasRelocations || Function.isInjected()) { - Section = BC->MOFI->getTextSection(); - } else { - // Each fuction is emmitted into its own section. - Section = - BC->Ctx->getELFSection(EmitColdPart ? Function.getColdCodeSectionName() - : Function.getCodeSectionName(), - ELF::SHT_PROGBITS, - ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); - } - + auto *Section = static_cast(Streamer.getCurrentSectionOnly()); Section->setHasInstructions(true); - BC->Ctx->addGenDwarfSection(Section); - Streamer.SwitchSection(Section); + if (EmitColdPart) { + Function.ColdCodeSectionName = Section->getSectionName(); + } else { + Function.CodeSectionName = Section->getSectionName(); + } if (BC->HasRelocations) { Streamer.EmitCodeAlignment(BinaryFunction::MinAlign); @@ -2713,8 +2713,8 @@ std::vector singletonSet(T t) { } // anonymous namespace -void RewriteInstance::emitFunctions() { - NamedRegionTimer T("emitFunctions", "emit functions", TimerGroupName, +void RewriteInstance::emitSections() { + NamedRegionTimer T("emitSections", "emit sections", TimerGroupName, TimerGroupDesc, opts::TimeRewrite); std::error_code EC; @@ -2744,119 +2744,7 @@ void RewriteInstance::emitFunctions() { Streamer->InitSections(false); - // Mark beginning of "hot text". - if (BC->HasRelocations && opts::HotText && !opts::HotFunctionsAtEnd) - Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("__hot_start")); - - // Sort functions for the output. 
- std::vector SortedFunctions = - BinaryContext::getSortedFunctions(BinaryFunctions); - - DEBUG( - if (!BC->HasRelocations) { - auto SortedIt = SortedFunctions.begin(); - for (auto &It : BinaryFunctions) { - assert(&It.second == *SortedIt); - ++SortedIt; - } - }); - - uint32_t LastHotIndex = -1u; - uint32_t CurrentIndex = 0; - if (opts::HotFunctionsAtEnd) { - for (auto *BF : SortedFunctions) { - if (BF->hasValidIndex() && LastHotIndex == -1u) { - LastHotIndex = CurrentIndex; - } - assert(LastHotIndex == -1u || BF->hasValidIndex()); - ++CurrentIndex; - } - } else { - for (auto *BF : SortedFunctions) { - if (!BF->hasValidIndex() && LastHotIndex == -1u) { - LastHotIndex = CurrentIndex; - } - assert(LastHotIndex == -1u || !BF->hasValidIndex()); - assert(!BF->hasValidIndex() || CurrentIndex == BF->getIndex()); - ++CurrentIndex; - } - } - CurrentIndex = 0; - DEBUG(dbgs() << "BOLT-DEBUG: LastHotIndex = " << LastHotIndex << "\n"); - - bool ColdFunctionSeen = false; - - // Output functions one by one. - for (auto *FunctionPtr : SortedFunctions) { - auto &Function = *FunctionPtr; - - // Emit all cold function split parts at the border of hot and - // cold functions. - if (BC->HasRelocations && !ColdFunctionSeen && - CurrentIndex >= LastHotIndex) { - // Mark the end of "hot" stuff. 
- if (opts::HotText && !opts::HotFunctionsAtEnd) { - Streamer->SwitchSection(BC->MOFI->getTextSection()); - Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("__hot_end")); - } - ColdFunctionSeen = true; - - // Emit injected functions hot part - for (auto *InjectedFunction : BC->getInjectedBinaryFunctions()) - emitFunction(*Streamer, *InjectedFunction, /*EmitColdPart=*/false); - - // Emit injected functions cold part - for (auto *InjectedFunction : BC->getInjectedBinaryFunctions()) - emitFunction(*Streamer, *InjectedFunction, /*EmitColdPart=*/true); - - //TODO: this code is unreachable if all functions are hot - if (opts::SplitFunctions != BinaryFunction::ST_NONE) { - DEBUG(dbgs() << "BOLT-DEBUG: generating code for split functions\n"); - for (auto *FPtr : SortedFunctions) { - if (!FPtr->isSplit() || !FPtr->isSimple()) - continue; - emitFunction(*Streamer, *FPtr, /*EmitColdPart=*/true); - } - } - DEBUG(dbgs() << "BOLT-DEBUG: first cold function: " << Function << '\n'); - - if (opts::HotText && opts::HotFunctionsAtEnd) { - Streamer->SwitchSection(BC->MOFI->getTextSection()); - Streamer->EmitCodeAlignment(BC->PageAlign); - Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("__hot_start")); - } - } - - if (!BC->HasRelocations && - (!Function.isSimple() || !opts::shouldProcess(Function))) { - ++CurrentIndex; - continue; - } - - DEBUG(dbgs() << "BOLT: generating code for function \"" - << Function << "\" : " - << Function.getFunctionNumber() << '\n'); - - emitFunction(*Streamer, Function, /*EmitColdPart=*/false); - - if (!BC->HasRelocations && Function.isSplit()) - emitFunction(*Streamer, Function, /*EmitColdPart=*/true); - - ++CurrentIndex; - } - - // Emit injected functions in non-reloc mode - if (!BC->HasRelocations) { - for (auto *InjectedFunction : BC->getInjectedBinaryFunctions()){ - emitFunction(*Streamer, *InjectedFunction, /*EmitColdPart=*/false); - emitFunction(*Streamer, *InjectedFunction, /*EmitColdPart=*/true); - } - } - - if ((!ColdFunctionSeen || 
opts::HotFunctionsAtEnd) && opts::HotText) { - Streamer->SwitchSection(BC->MOFI->getTextSection()); - Streamer->EmitLabel(BC->Ctx->getOrCreateSymbol("__hot_end")); - } + emitFunctions(Streamer.get()); if (!BC->HasRelocations && opts::UpdateDebugSections) updateDebugLineInfoForNonSimpleFunctions(); @@ -2939,62 +2827,203 @@ void RewriteInstance::emitFunctions() { cantFail(OLT->emitAndFinalize(K)); + // Once the code is emitted, we can rename function sections to actual + // output sections and de-register sections used for emission. + if (!BC->HasRelocations) { + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + if (auto Section = Function.getCodeSection()) + BC->deregisterSection(*Section); + Function.CodeSectionName = Function.getOriginSectionName(); + if (Function.isSplit()) { + if (auto ColdSection = Function.getColdCodeSection()) + BC->deregisterSection(*ColdSection); + Function.ColdCodeSectionName = ".bolt.text"; + } + } + } + if (opts::PrintCacheMetrics) { outs() << "BOLT-INFO: cache metrics after emitting functions:\n"; - CacheMetrics::printAll(SortedFunctions); + CacheMetrics::printAll(BC->getSortedFunctions(BinaryFunctions)); } if (opts::KeepTmp) TempOut->keep(); } +void RewriteInstance::emitFunctions(MCStreamer *Streamer) { + auto *TextSection = BC->MOFI->getTextSection(); + auto *ColdSection = + BC->Ctx->getELFSection(".text.cold", + ELF::SHT_PROGBITS, + ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); + ColdSection->setAlignment(64); + + // Sort functions for the output. + std::vector SortedFunctions = + BinaryContext::getSortedFunctions(BinaryFunctions); + + DEBUG( + if (!BC->HasRelocations) { + auto SortedIt = SortedFunctions.begin(); + for (auto &It : BinaryFunctions) { + assert(&It.second == *SortedIt); + ++SortedIt; + } + }); + + // Emit a set of functions at the boundary of hot and cold code. 
+ auto emitBoundaryFunctions = [&]() { + // Emit injected functions hot parts + for (auto *InjectedFunction : BC->getInjectedBinaryFunctions()) + emitFunction(*Streamer, *InjectedFunction, /*EmitColdPart=*/false); + + // Emit injected functions cold parts + for (auto *InjectedFunction : BC->getInjectedBinaryFunctions()) + emitFunction(*Streamer, *InjectedFunction, /*EmitColdPart=*/true); + + if (opts::SplitFunctions != BinaryFunction::ST_NONE) { + DEBUG(dbgs() << "BOLT-DEBUG: generating code for split functions\n"); + for (auto *FPtr : SortedFunctions) { + if (!FPtr->isSplit() || !FPtr->isSimple()) + continue; + emitFunction(*Streamer, *FPtr, /*EmitColdPart=*/true); + } + } + }; + + if (opts::HotText) { + Streamer->SwitchSection(TextSection); + Streamer->EmitCodeAlignment(BC->PageAlign); + Streamer->EmitLabel(BC->getHotTextStartSymbol()); + } + + if (BC->HasRelocations) { + Streamer->SwitchSection(ColdSection); + emitBoundaryFunctions(); + } + + bool UseColdSection = SortedFunctions.front()->hasValidIndex(); + + // Output functions one by one. 
+ for (auto *FunctionPtr : SortedFunctions) { + auto &Function = *FunctionPtr; + + if (!BC->HasRelocations && + (!Function.isSimple() || !opts::shouldProcess(Function))) + continue; + + MCSection *Section; + if (BC->HasRelocations) { + if (UseColdSection && !Function.hasValidIndex()) { + Section = ColdSection; + } else { + Section = TextSection; + } + } else { + Section = + BC->Ctx->getELFSection(Function.getCodeSectionName(), + ELF::SHT_PROGBITS, + ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); + } + Streamer->SwitchSection(Section); + + DEBUG(dbgs() << "BOLT: generating code for function \"" + << Function << "\" : " + << Function.getFunctionNumber() << '\n'); + + emitFunction(*Streamer, Function, /*EmitColdPart=*/false); + + if (!BC->HasRelocations && Function.isSplit()) { + Streamer->SwitchSection( + BC->Ctx->getELFSection(Function.getColdCodeSectionName(), + ELF::SHT_PROGBITS, + ELF::SHF_EXECINSTR | ELF::SHF_ALLOC)); + emitFunction(*Streamer, Function, /*EmitColdPart=*/true); + } + } + + if (opts::HotText) { + Streamer->SwitchSection(TextSection); + Streamer->EmitLabel(BC->getHotTextEndSymbol()); + } + + // Emit injected functions in non-reloc mode + if (!BC->HasRelocations) { + Streamer->SwitchSection(TextSection); + for (auto *InjectedFunction : BC->getInjectedBinaryFunctions()){ + emitFunction(*Streamer, *InjectedFunction, /*EmitColdPart=*/false); + emitFunction(*Streamer, *InjectedFunction, /*EmitColdPart=*/true); + } + } +} + void RewriteInstance::mapFileSections(orc::VModuleKey Key) { mapTextSections(Key); mapDataSections(Key); } void RewriteInstance::mapTextSections(orc::VModuleKey Key) { - NewTextSectionStartAddress = NextAvailableAddress; if (BC->HasRelocations) { + uint64_t AllocationAddress{0}; auto TextSection = BC->getUniqueSectionByName(".text"); assert(TextSection && ".text not found in output"); + auto ColdSection = BC->getUniqueSectionByName(".text.cold"); + + DEBUG( + for (auto &Section : BC->textSections()) { + dbgs() << "code section : " << 
Section.getName() + << "; valid ID : " << Section.hasValidSectionID() << '\n'; + }); - uint64_t NewTextSectionOffset = 0; + auto CodeSize = TextSection->getOutputSize(); + if (ColdSection) { + CodeSize = alignTo(CodeSize, ColdSection->getAlignment()); + CodeSize += ColdSection->getOutputSize(); + } auto Padding = OffsetToAlignment(BC->OldTextSectionAddress, BC->PageAlign); - if (opts::UseOldText && - Padding + TextSection->getOutputSize() <= BC->OldTextSectionSize) { + if (opts::UseOldText && Padding + CodeSize <= BC->OldTextSectionSize) { // Utilize the original .text for storage. outs() << "BOLT-INFO: using original .text for new code with 0x" << Twine::utohexstr(BC->PageAlign) << " alignment\n"; - NewTextSectionStartAddress = BC->OldTextSectionAddress + Padding; - NewTextSectionOffset = BC->OldTextSectionOffset + Padding; + AllocationAddress = BC->OldTextSectionAddress + Padding; } else { if (opts::UseOldText) { errs() << "BOLT-WARNING: original .text too small to fit the new code" << " using 0x" << Twine::utohexstr(BC->PageAlign) - << " aligment. " << Padding + TextSection->getOutputSize() + << " aligment. 
" << Padding + CodeSize << " bytes needed, have " << BC->OldTextSectionSize << " bytes available.\n"; opts::UseOldText = false; } - auto Padding = OffsetToAlignment(NewTextSectionStartAddress, - BC->PageAlign); - NextAvailableAddress += Padding; - NewTextSectionStartAddress = NextAvailableAddress; - NewTextSectionOffset = getFileOffsetForAddress(NextAvailableAddress); - NextAvailableAddress += Padding + TextSection->getOutputSize(); + AllocationAddress = alignTo(NextAvailableAddress, BC->PageAlign); + } + + auto mapSection = [&](BinarySection &Section) { + AllocationAddress = alignTo(AllocationAddress, Section.getAlignment()); + DEBUG(dbgs() << "BOLT: mapping " << Section.getName() << " 0x" + << Twine::utohexstr(Section.getAllocAddress()) + << " to 0x" << Twine::utohexstr(AllocationAddress) + << '\n'); + OLT->mapSectionAddress(Key, Section.getSectionID(), AllocationAddress); + Section.setOutputAddress(AllocationAddress); + Section.setFileOffset(getFileOffsetForAddress(AllocationAddress)); + AllocationAddress += Section.getOutputSize(); + }; + + mapSection(*TextSection); + if (ColdSection) + mapSection(*ColdSection); + + if (!opts::UseOldText) { + NextAvailableAddress = AllocationAddress; } - TextSection->setFileAddress(NewTextSectionStartAddress); - TextSection->setFileOffset(NewTextSectionOffset); - DEBUG(dbgs() << "BOLT: mapping .text 0x" - << Twine::utohexstr(TextSection->getAllocAddress()) - << " to 0x" << Twine::utohexstr(NewTextSectionStartAddress) - << '\n'); - OLT->mapSectionAddress(Key, TextSection->getSectionID(), - NewTextSectionStartAddress); } else { + auto NewTextSectionStartAddress = NextAvailableAddress; + // Prepare .text section for injected functions auto TextSection = BC->getUniqueSectionByName(".text"); assert(TextSection && ".text not found in output"); @@ -3006,7 +3035,7 @@ void RewriteInstance::mapTextSections(orc::VModuleKey Key) { NewTextSectionStartAddress = NextAvailableAddress; NewTextSectionOffset = 
getFileOffsetForAddress(NextAvailableAddress); NextAvailableAddress += Padding + TextSection->getOutputSize(); - TextSection->setFileAddress(NewTextSectionStartAddress); + TextSection->setOutputAddress(NewTextSectionStartAddress); TextSection->setFileOffset(NewTextSectionOffset); DEBUG(dbgs() << "BOLT: mapping .text 0x" @@ -3023,9 +3052,9 @@ void RewriteInstance::mapTextSections(orc::VModuleKey Key) { continue; auto TooLarge = false; - auto FuncSection = - BC->getUniqueSectionByName(Function.getCodeSectionName()); + auto FuncSection = Function.getCodeSection(); assert(FuncSection && "cannot find section for function"); + FuncSection->setOutputAddress(Function.getAddress()); DEBUG(dbgs() << "BOLT: mapping 0x" << Twine::utohexstr(FuncSection->getAllocAddress()) << " to 0x" << Twine::utohexstr(Function.getAddress()) @@ -3044,7 +3073,7 @@ void RewriteInstance::mapTextSections(orc::VModuleKey Key) { for (auto &JTI : Function.JumpTables) { auto *JT = JTI.second; auto &Section = JT->getOutputSection(); - Section.setFileAddress(JT->getAddress()); + Section.setOutputAddress(JT->getAddress()); DEBUG(dbgs() << "BOLT-DEBUG: mapping " << Section.getName() << " to 0x" << Twine::utohexstr(JT->getAddress()) << '\n'); @@ -3056,8 +3085,7 @@ void RewriteInstance::mapTextSections(orc::VModuleKey Key) { if (!Function.isSplit()) continue; - auto ColdSection = - BC->getUniqueSectionByName(Function.getColdCodeSectionName()); + auto ColdSection = Function.getColdCodeSection(); assert(ColdSection && "cannot find section for cold part"); // Cold fragments are aligned at 16 bytes. 
NextAvailableAddress = alignTo(NextAvailableAddress, 16); @@ -3073,6 +3101,7 @@ void RewriteInstance::mapTextSections(orc::VModuleKey Key) { ColdPart.setImageAddress(ColdSection->getAllocAddress()); ColdPart.setImageSize(ColdSection->getOutputSize()); ColdPart.setFileOffset(getFileOffsetForAddress(NextAvailableAddress)); + ColdSection->setOutputAddress(ColdPart.getAddress()); } DEBUG(dbgs() << "BOLT: mapping cold fragment 0x" @@ -3102,7 +3131,7 @@ void RewriteInstance::mapTextSections(orc::VModuleKey Key) { NewTextSectionSize, 16, true /*IsLocal*/); - Section.setFileAddress(NewTextSectionStartAddress); + Section.setOutputAddress(NewTextSectionStartAddress); Section.setFileOffset( getFileOffsetForAddress(NewTextSectionStartAddress)); } @@ -3130,7 +3159,7 @@ void RewriteInstance::mapDataSections(orc::VModuleKey Key) { << '\n'); OLT->mapSectionAddress(Key, Section->getSectionID(), NextAvailableAddress); - Section->setFileAddress(NextAvailableAddress); + Section->setOutputAddress(NextAvailableAddress); Section->setFileOffset(getFileOffsetForAddress(NextAvailableAddress)); NextAvailableAddress += Section->getOutputSize(); @@ -3149,10 +3178,10 @@ void RewriteInstance::mapDataSections(orc::VModuleKey Key) { !OrgSection->isFinalized()) continue; - if (OrgSection->getFileAddress()) { + if (OrgSection->getOutputAddress()) { DEBUG(dbgs() << "BOLT-DEBUG: section " << SectionName << " is already mapped at 0x" - << Twine::utohexstr(OrgSection->getFileAddress()) << '\n'); + << Twine::utohexstr(OrgSection->getOutputAddress()) << '\n'); continue; } DEBUG(dbgs() << "BOLT: mapping original section " << SectionName << " (0x" @@ -3163,7 +3192,7 @@ void RewriteInstance::mapDataSections(orc::VModuleKey Key) { OLT->mapSectionAddress(Key, OrgSection->getSectionID(), Section.getAddress()); - OrgSection->setFileAddress(Section.getAddress()); + OrgSection->setOutputAddress(Section.getAddress()); OrgSection->setFileOffset(Section.getContents().data() - InputFile->getData().data()); } @@ 
-3178,18 +3207,22 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { Function.setOutputSize(Function.getSize()); return; } + + const auto BaseAddress = Function.getCodeSection()->getOutputAddress(); + auto ColdSection = Function.getColdCodeSection(); + const auto ColdBaseAddress = + Function.isSplit() ? ColdSection->getOutputAddress() : 0; if (BC->HasRelocations || Function.isInjected()) { - const auto BaseAddress = NewTextSectionStartAddress; const auto StartOffset = Layout.getSymbolOffset(*Function.getSymbol()); const auto EndOffset = Layout.getSymbolOffset(*Function.getFunctionEndLabel()); + Function.setOutputAddress(BaseAddress + StartOffset); + Function.setOutputSize(EndOffset - StartOffset); if (Function.hasConstantIsland()) { const auto DataOffset = Layout.getSymbolOffset(*Function.getFunctionConstantIslandLabel()); Function.setOutputDataAddress(BaseAddress + DataOffset); } - Function.setOutputAddress(BaseAddress + StartOffset); - Function.setOutputSize(EndOffset - StartOffset); if (Function.isSplit()) { const auto *ColdStartSymbol = Function.getColdSymbol(); assert(ColdStartSymbol && ColdStartSymbol->isDefined() && @@ -3199,12 +3232,12 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { "split function should have defined cold end symbol"); const auto ColdStartOffset = Layout.getSymbolOffset(*ColdStartSymbol); const auto ColdEndOffset = Layout.getSymbolOffset(*ColdEndSymbol); - Function.cold().setAddress(BaseAddress + ColdStartOffset); + Function.cold().setAddress(ColdBaseAddress + ColdStartOffset); Function.cold().setImageSize(ColdEndOffset - ColdStartOffset); if (Function.hasConstantIsland()) { const auto DataOffset = Layout.getSymbolOffset( *Function.getFunctionColdConstantIslandLabel()); - Function.setOutputColdDataAddress(BaseAddress + DataOffset); + Function.setOutputColdDataAddress(ColdBaseAddress + DataOffset); } } } else { @@ -3230,18 +3263,19 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout 
&Layout) { BBI != BBE; ++BBI) { auto *BB = *BBI; assert(BB->getLabel()->isDefined() && "symbol should be defined"); - uint64_t BaseAddress; - if (BC->HasRelocations) { - BaseAddress = NewTextSectionStartAddress; - } else { - BaseAddress = BB->isCold() ? Function.cold().getAddress() - : Function.getOutputAddress(); + const auto BBBaseAddress = BB->isCold() ? ColdBaseAddress : BaseAddress; + if (!BC->HasRelocations) { + if (BB->isCold()) { + assert(BBBaseAddress == Function.cold().getAddress()); + } else { + assert(BBBaseAddress == Function.getOutputAddress()); + } } - uint64_t Address = BaseAddress + Layout.getSymbolOffset(*BB->getLabel()); - BB->setOutputStartAddress(Address); + auto BBAddress = BBBaseAddress + Layout.getSymbolOffset(*BB->getLabel()); + BB->setOutputStartAddress(BBAddress); if (PrevBB) { - auto PrevBBEndAddress = Address; + auto PrevBBEndAddress = BBAddress; if (BB->isCold() != PrevBB->isCold()) { PrevBBEndAddress = Function.getOutputAddress() + Function.getOutputSize(); @@ -3392,8 +3426,8 @@ void RewriteInstance::patchELFPHDRTable() { EHFrameHdrSec->isAllocatable() && EHFrameHdrSec->isFinalized()) { NewPhdr.p_offset = EHFrameHdrSec->getFileOffset(); - NewPhdr.p_vaddr = EHFrameHdrSec->getFileAddress(); - NewPhdr.p_paddr = EHFrameHdrSec->getFileAddress(); + NewPhdr.p_vaddr = EHFrameHdrSec->getOutputAddress(); + NewPhdr.p_paddr = EHFrameHdrSec->getOutputAddress(); NewPhdr.p_filesz = EHFrameHdrSec->getOutputSize(); NewPhdr.p_memsz = EHFrameHdrSec->getOutputSize(); } @@ -3544,7 +3578,7 @@ void RewriteInstance::rewriteNoteSections() { BSec ? BSec->getELFType() : ELF::SHT_PROGBITS, BSec ? 
BSec->isLocal() : false); - NewSection.setFileAddress(0); + NewSection.setOutputAddress(0); NewSection.setFileOffset(NextAvailableOffset); NextAvailableOffset += Size; @@ -3580,9 +3614,9 @@ void RewriteInstance::finalizeSectionStringTable(ELFObjectFile *File) { StringRef SectionName = cantFail(Obj->getSectionName(&Section), "cannot get section name"); SHStrTab.add(SectionName); - if (willOverwriteSection(SectionName)) { - AllSHStrTabStrings.emplace_back( - SHStrTabPool.intern(OrgSecPrefix + SectionName.str())); + auto OutputSectionName = getOutputSectionName(Obj, Section); + if (OutputSectionName != SectionName) { + AllSHStrTabStrings.emplace_back(SHStrTabPool.intern(OutputSectionName)); SHStrTab.add(*AllSHStrTabStrings.back()); } } @@ -3645,145 +3679,102 @@ void RewriteInstance::addBoltInfoSection() { /*IsReadOnly=*/true, ELF::SHT_NOTE); } -// Provide a mapping of the existing input binary sections to the output binary -// section header table. -// Return the map from the section header old index to its new index. Optionally -// return in OutputSections an ordered list of the output sections. This is -// optional because for reference updating in the symbol table we only need the -// map of input to output indices, not the real output section list. 
+template +std::string RewriteInstance::getOutputSectionName(const ELFObjType *Obj, + const ELFShdrTy &Section) { + if (Section.sh_type == ELF::SHT_NULL) + return ""; + + StringRef SectionName = + cantFail(Obj->getSectionName(&Section), "cannot get section name"); + + if ((Section.sh_flags & ELF::SHF_ALLOC) && + willOverwriteSection(SectionName)) + return OrgSecPrefix + SectionName.str(); + + return SectionName; +} + template -std::vector RewriteInstance::getOutputSections( - ELFObjectFile *File, - std::vector *OutputSections, - std::map *SectionNameMap -) { +std::vector RewriteInstance::getOutputSections( + ELFObjectFile *File, std::vector &NewSectionIndex) { auto *Obj = File->getELFFile(); auto Sections = cantFail(Obj->sections()); - std::vector NewSectionIndex( - std::distance(Sections.begin(), Sections.end()), 0); - NewTextSectionIndex = 0; - uint32_t CurIndex{0}; + // Keep track of section header entries together with their name. + std::vector> OutputSections; + auto addSection = [&](const std::string &Name, const ELFShdrTy &Section) { + auto NewSection = Section; + NewSection.sh_name = SHStrTab.getOffset(Name); + OutputSections.emplace_back(std::make_pair(Name, std::move(NewSection))); + }; - // Copy over entries for original allocatable sections with minor - // modifications (e.g. name). + // Copy over entries for original allocatable sections using modified name. for (auto &Section : Sections) { // Always ignore this section. if (Section.sh_type == ELF::SHT_NULL) { - NewSectionIndex[0] = CurIndex++; - if (OutputSections) - OutputSections->emplace_back(Section); + OutputSections.emplace_back(std::make_pair("", Section)); continue; } - // Is this our new text? 
Then update our pointer indicating the new output - // text section - if (opts::UseOldText && Section.sh_flags & ELF::SHF_ALLOC && - Section.sh_addr <= NewTextSectionStartAddress && - Section.sh_addr + Section.sh_size > NewTextSectionStartAddress) { - NewTextSectionIndex = CurIndex; - } - - // Skip non-allocatable sections. if (!(Section.sh_flags & ELF::SHF_ALLOC)) continue; - StringRef SectionName = - cantFail(Obj->getSectionName(&Section), "cannot get section name"); - - if (SectionNameMap && !SectionNameMap->count(SectionName)) { - (*SectionNameMap)[SectionName] = CurIndex; - } - const auto OldIdx = std::distance(Sections.begin(), &Section); - assert(NewSectionIndex[OldIdx] == 0); - NewSectionIndex[OldIdx] = CurIndex++; - - // If only computing the map, we're done with this iteration - if (!OutputSections) - continue; - - auto NewSection = Section; - if (SectionName == ".bss") { - // .bss section offset matches that of the next section. - NewSection.sh_offset = NewTextSegmentOffset; - } - - if (willOverwriteSection(SectionName)) { - NewSection.sh_name = SHStrTab.getOffset(OrgSecPrefix + - SectionName.str()); - } else { - NewSection.sh_name = SHStrTab.getOffset(SectionName); - } - - OutputSections->emplace_back(NewSection); - } - - // If we are creating our own .text section, it should be the first section - // we created in BinaryContext, so this is the correct index. - if (!opts::UseOldText) { - NewTextSectionIndex = CurIndex; + addSection(getOutputSectionName(Obj, Section), Section); } - // Process entries for all new allocatable sections. Make sure - // allocatable sections follow the same order as in mapDataSections so - // that the section indices are consistent. 
- std::vector AllocatableSections; - std::vector SectionNames = { ".eh_frame", - ".gcc_except_table", - ".rodata", - ".rodata.cold" }; - for (const auto &SectionName : SectionNames) { - auto Section = BC->getUniqueSectionByName(SectionName); - if (Section && Section->isFinalized()) { - AllocatableSections.push_back(&*Section); - } - } - for (auto &Section : BC->allocatableSections()) { + for (const auto &Section : BC->allocatableSections()) { if (!Section.isFinalized()) continue; - if (std::find_if(AllocatableSections.begin(), - AllocatableSections.end(), - [&Section](const BinarySection *BSec) { - return BSec == &Section; - }) == AllocatableSections.end()) { - AllocatableSections.push_back(&Section); - } - } - - for (const auto *Section : AllocatableSections) { - // Ignore function sections. - if (Section->getFileAddress() < NewTextSegmentAddress) { + if (Section.getName().startswith(OrgSecPrefix)) { if (opts::Verbosity) outs() << "BOLT-INFO: not writing section header for existing section " - << Section->getName() << '\n'; + << Section.getName() << '\n'; continue; } - if (SectionNameMap) { - (*SectionNameMap)[Section->getName()] = CurIndex; - } - ++CurIndex; - - // If only computing the map, we're done with this iteration - if (!OutputSections) - continue; - if (opts::Verbosity >= 1) outs() << "BOLT-INFO: writing section header for " - << Section->getName() << '\n'; + << Section.getName() << '\n'; ELFShdrTy NewSection; - NewSection.sh_name = SHStrTab.getOffset(Section->getName()); NewSection.sh_type = ELF::SHT_PROGBITS; - NewSection.sh_addr = Section->getFileAddress(); - NewSection.sh_offset = Section->getFileOffset(); - NewSection.sh_size = Section->getOutputSize(); + NewSection.sh_addr = Section.getOutputAddress(); + NewSection.sh_offset = Section.getFileOffset(); + NewSection.sh_size = Section.getOutputSize(); NewSection.sh_entsize = 0; - NewSection.sh_flags = Section->getELFFlags(); + NewSection.sh_flags = Section.getELFFlags(); NewSection.sh_link = 0; 
NewSection.sh_info = 0; - NewSection.sh_addralign = Section->getAlignment(); - OutputSections->emplace_back(NewSection); + NewSection.sh_addralign = Section.getAlignment(); + addSection(Section.getName(), NewSection); + } + + // Sort all allocatable sections by their offset. + std::stable_sort(OutputSections.begin(), OutputSections.end(), + [] (const std::pair &A, + const std::pair &B) { + return A.second.sh_offset < B.second.sh_offset; + }); + + // Fix section sizes to prevent overlapping. + for (uint32_t Index = 1; Index < OutputSections.size(); ++Index) { + auto &PrevSection = OutputSections[Index - 1].second; + auto &Section = OutputSections[Index].second; + + // Skip TBSS section size adjustment. + if (PrevSection.sh_type == ELF::SHT_NOBITS && + (PrevSection.sh_flags & ELF::SHF_TLS)) + continue; + + if (PrevSection.sh_addr + PrevSection.sh_size > Section.sh_addr) { + if (opts::Verbosity > 1) { + outs() << "BOLT-INFO: adjusting size for section " + << OutputSections[Index - 1].first << '\n'; + } + PrevSection.sh_size = Section.sh_addr > PrevSection.sh_addr ? 
+ Section.sh_addr - PrevSection.sh_addr : 0; + } } uint64_t LastFileOffset = 0; @@ -3802,38 +3793,22 @@ std::vector RewriteInstance::getOutputSections( StringRef SectionName = cantFail(Obj->getSectionName(&Section), "cannot get section name"); - if (SectionNameMap && !SectionNameMap->count(SectionName)) { - (*SectionNameMap)[SectionName] = CurIndex; - } - const auto OldIdx = std::distance(Sections.begin(), &Section); - assert(NewSectionIndex[OldIdx] == 0); - NewSectionIndex[OldIdx] = CurIndex++; - - // If only computing the map, we're done with this iteration - if (!OutputSections) - continue; - auto BSec = BC->getUniqueSectionByName(SectionName); assert(BSec && "missing section info for non-allocatable section"); auto NewSection = Section; NewSection.sh_offset = BSec->getFileOffset(); NewSection.sh_size = BSec->getOutputSize(); - NewSection.sh_name = SHStrTab.getOffset(SectionName); if (NewSection.sh_type == ELF::SHT_SYMTAB) { NewSection.sh_info = NumLocalSymbols; } - OutputSections->emplace_back(NewSection); + addSection(SectionName, NewSection); LastFileOffset = BSec->getFileOffset(); } - // Map input -> output is ready. Early return if that's all we need. - if (!OutputSections) - return NewSectionIndex; - // Create entries for new non-allocatable sections. for (auto &Section : BC->nonAllocatableSections()) { if (Section.getFileOffset() <= LastFileOffset) @@ -3844,7 +3819,6 @@ std::vector RewriteInstance::getOutputSections( << Section.getName() << '\n'; } ELFShdrTy NewSection; - NewSection.sh_name = SHStrTab.getOffset(Section.getName()); NewSection.sh_type = Section.getELFType(); NewSection.sh_addr = 0; NewSection.sh_offset = Section.getFileOffset(); @@ -3854,10 +3828,44 @@ std::vector RewriteInstance::getOutputSections( NewSection.sh_link = 0; NewSection.sh_info = 0; NewSection.sh_addralign = Section.getAlignment(); - OutputSections->emplace_back(NewSection); + + addSection(Section.getName(), NewSection); + } + + // Assign indices to sections. 
+ std::unordered_map NameToIndex; + for (uint32_t Index = 1; Index < OutputSections.size(); ++Index) { + const auto &SectionName = OutputSections[Index].first; + NameToIndex[SectionName] = Index; + if (auto Section = BC->getUniqueSectionByName(SectionName)) + Section->setIndex(Index); + } + + // Update section index mapping + NewSectionIndex.clear(); + NewSectionIndex.resize(Sections.size(), 0); + for (auto &Section : Sections) { + if (Section.sh_type == ELF::SHT_NULL) + continue; + + auto OrgIndex = std::distance(Sections.begin(), &Section); + auto SectionName = getOutputSectionName(Obj, Section); + + // Some sections are stripped + if (!NameToIndex.count(SectionName)) + continue; + + NewSectionIndex[OrgIndex] = NameToIndex[SectionName]; } - return NewSectionIndex; + std::vector SectionsOnly(OutputSections.size()); + std::transform(OutputSections.begin(), OutputSections.end(), + SectionsOnly.begin(), + [](std::pair &SectionInfo) { + return SectionInfo.second; + }); + + return SectionsOnly; } // Rewrite section header table inserting new entries as needed. The sections @@ -3867,31 +3875,14 @@ std::vector RewriteInstance::getOutputSections( // As we rewrite entries we need to track how many sections were inserted // as it changes the sh_link value. We map old indices to new ones for // existing sections. -// -// The following are assumptions about file modifications: -// * There are no modifications done to address and/or size of existing -// allocatable sections. -// * All new allocatable sections are written immediately after existing -// allocatable sections. -// * There could be modifications done to non-allocatable sections, e.g. -// size could be increased. -// * New non-allocatable sections are added to the end of the file. 
template void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { using Elf_Shdr = typename ELFObjectFile::Elf_Shdr; - std::vector OutputSections; auto &OS = Out->os(); auto *Obj = File->getELFFile(); - auto NewSectionIndex = getOutputSections(File, &OutputSections); - - // Sort sections by their offset prior to writing. Only newly created sections - // were unsorted, hence this wouldn't ruin indices in NewSectionIndex. - std::stable_sort(OutputSections.begin(), OutputSections.end(), - [] (Elf_Shdr A, Elf_Shdr B) { - return A.sh_offset < B.sh_offset; - }); - + std::vector NewSectionIndex; + auto OutputSections = getOutputSections(File, NewSectionIndex); DEBUG( dbgs() << "BOLT-DEBUG: old to new section index mapping:\n"; for (uint64_t I = 0; I < NewSectionIndex.size(); ++I) { @@ -3904,8 +3895,7 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { SHTOffset = appendPadding(OS, SHTOffset, sizeof(Elf_Shdr)); // Write all section header entries while patching section references. - for (uint64_t Index = 0; Index < OutputSections.size(); ++Index) { - auto &Section = OutputSections[Index]; + for (auto &Section : OutputSections) { Section.sh_link = NewSectionIndex[Section.sh_link]; if (Section.sh_type == ELF::SHT_REL || Section.sh_type == ELF::SHT_RELA) { if (Section.sh_info) @@ -3943,16 +3933,9 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { using Elf_Sym = typename ELFObjectFile::Elf_Sym; // Compute a preview of how section indices will change after rewriting, so - // we can properly update the symbol table. - std::map SectionNameMap; - auto NewSectionIndex = - getOutputSections(File, (std::vector *)nullptr, &SectionNameMap); - - DEBUG(dbgs() << "BOLT-DEBUG: SectionNameMap:\n"; - for (auto &Entry : SectionNameMap) { - dbgs() << "BOLT-DEBUG: " << Entry.first << " -> " - << Entry.second << "\n"; - }); + // we can properly update the symbol table based on new section indices. 
+ std::vector NewSectionIndex; + getOutputSections(File, NewSectionIndex); auto updateSymbolTable = [&](bool PatchExisting, @@ -3974,7 +3957,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { // Add symbols of injected functions for (BinaryFunction *Function : BC->getInjectedBinaryFunctions()) { Elf_Sym NewSymbol; - NewSymbol.st_shndx = NewTextSectionIndex; + NewSymbol.st_shndx = Function->getCodeSection()->getIndex(); NewSymbol.st_value = Function->getOutputAddress(); NewSymbol.st_name = AddToStrTab(Function->getPrintName()); NewSymbol.st_size = Function->getOutputSize(); @@ -4004,10 +3987,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { NewSymbol.getType() != ELF::STT_SECTION) { NewSymbol.st_value = Function->getOutputAddress(); NewSymbol.st_size = Function->getOutputSize(); - if (BC->HasRelocations) - NewSymbol.st_shndx = NewTextSectionIndex; - else - NewSymbol.st_shndx = NewSectionIndex[NewSymbol.st_shndx]; + NewSymbol.st_shndx = Function->getCodeSection()->getIndex(); if (!PatchExisting && Function->isSplit()) { auto NewColdSym = NewSymbol; SmallVector Buf; @@ -4015,6 +3995,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { AddToStrTab(Twine(cantFail(Symbol.getName(StringSection))) .concat(".cold.0") .toStringRef(Buf)); + NewColdSym.st_shndx = Function->getColdCodeSection()->getIndex(); NewColdSym.st_value = Function->cold().getAddress(); NewColdSym.st_size = Function->cold().getImageSize(); Write(0, NewColdSym); @@ -4064,14 +4045,14 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { auto &OutputSection = BD->getOutputSection(); - assert(SectionNameMap.count(OutputSection.getName())); + assert(OutputSection.getIndex()); DEBUG(dbgs() << "BOLT-DEBUG: moving " << BD->getName() << " from " << *BC->getSectionNameForAddress(NewSymbol.st_value) << " (" << NewSymbol.st_shndx << ") to " << OutputSection.getName() << " (" - << SectionNameMap[OutputSection.getName()] << ")\n"); + << OutputSection.getIndex() << 
")\n"); OldSectionIndex = ELF::SHN_LORESERVE; - NewSymbol.st_shndx = SectionNameMap[OutputSection.getName()]; + NewSymbol.st_shndx = OutputSection.getIndex(); // TODO: use getNewValueForSymbol()? NewSymbol.st_value = BD->getOutputAddress(); @@ -4161,12 +4142,12 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { if (opts::HotText && !IsHotTextUpdated && !PatchExisting) { addSymbol("__hot_start"); addSymbol("__hot_end"); - } + } - if (opts::HotData && !IsHotDataUpdated && !PatchExisting) { - addSymbol("__hot_data_start"); - addSymbol("__hot_data_end"); - } + if (opts::HotData && !IsHotDataUpdated && !PatchExisting) { + addSymbol("__hot_data_start"); + addSymbol("__hot_data_end"); + } }; // Update dynamic symbol table. @@ -4589,14 +4570,14 @@ void RewriteInstance::rewriteFile() { } void RewriteInstance::writeEHFrameHeader() { - DWARFDebugFrame NewEHFrame(true, EHFrameSection->getFileAddress()); + DWARFDebugFrame NewEHFrame(true, EHFrameSection->getOutputAddress()); NewEHFrame.parse(DWARFDataExtractor(EHFrameSection->getOutputContents(), BC->AsmInfo->isLittleEndian(), BC->AsmInfo->getCodePointerSize())); auto OldEHFrameSection = BC->getUniqueSectionByName(".eh_frame_old"); assert(OldEHFrameSection && "expected .eh_frame_old to be present"); - DWARFDebugFrame OldEHFrame(true, OldEHFrameSection->getFileAddress()); + DWARFDebugFrame OldEHFrame(true, OldEHFrameSection->getOutputAddress()); OldEHFrame.parse(DWARFDataExtractor(OldEHFrameSection->getOutputContents(), BC->AsmInfo->isLittleEndian(), BC->AsmInfo->getCodePointerSize())); @@ -4606,14 +4587,14 @@ void RewriteInstance::writeEHFrameHeader() { NextAvailableAddress = appendPadding(Out->os(), NextAvailableAddress, EHFrameHdrAlign); - const auto EHFrameHdrFileAddress = NextAvailableAddress; + const auto EHFrameHdrOutputAddress = NextAvailableAddress; const auto EHFrameHdrFileOffset = getFileOffsetForAddress(NextAvailableAddress); auto NewEHFrameHdr = CFIRdWrt->generateEHFrameHeader(OldEHFrame, NewEHFrame, - 
EHFrameHdrFileAddress, + EHFrameHdrOutputAddress, FailedAddresses); assert(Out->os().tell() == EHFrameHdrFileOffset && "offset mismatch"); @@ -4629,14 +4610,14 @@ void RewriteInstance::writeEHFrameHeader() { NewEHFrameHdr.size(), /*Alignment=*/1); EHFrameHdrSec.setFileOffset(EHFrameHdrFileOffset); - EHFrameHdrSec.setFileAddress(EHFrameHdrFileAddress); + EHFrameHdrSec.setOutputAddress(EHFrameHdrOutputAddress); NextAvailableAddress += EHFrameHdrSec.getOutputSize(); // Merge .eh_frame and .eh_frame_old so that gdb can locate all FDEs. - const auto EHFrameSectionSize = (OldEHFrameSection->getFileAddress() + + const auto EHFrameSectionSize = (OldEHFrameSection->getOutputAddress() + OldEHFrameSection->getOutputSize() - - EHFrameSection->getFileAddress()); + EHFrameSection->getOutputAddress()); EHFrameSection = BC->registerOrUpdateSection(".eh_frame", @@ -4678,10 +4659,6 @@ bool RewriteInstance::willOverwriteSection(StringRef SectionName) { return true; } - // Special handling for .text - if (SectionName == ".text" && opts::UseOldText) - return false; - auto Section = BC->getUniqueSectionByName(SectionName); return Section && Section->isAllocatable() && Section->isFinalized(); } diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index 4ed7ad148e4f..c3107965b8af 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -94,10 +94,13 @@ class RewriteInstance { /// Run optimizations that operate at the binary, or post-linker, level. void runOptimizationPasses(); - /// Write all functions to an intermediary object file, map virtual to real - /// addresses and link this object file, resolving all relocations and + /// Write code and data into an intermediary object file, map virtual to real + /// addresses and link the object file, resolving all relocations and /// performing final relaxation. - void emitFunctions(); + void emitSections(); + + /// Emit function code. 
+ void emitFunctions(MCStreamer *Streamer); /// Emit data \p Section, possibly with relocations. Use name \p Name if /// non-empty. @@ -253,15 +256,17 @@ class RewriteInstance { /// Finalize memory image of section header string table. ELF_FUNCTION(finalizeSectionStringTable); - /// Get a list of all the sections to include in the output binary along - /// with a map of input to output indices. Optionally produce a mapping - /// of section name to new section index in /p OutputSectionNameMap. + /// Return a name of the input file section in the output file. + template + std::string getOutputSectionName(const ELFObjType *Obj, + const ELFShdrTy &Section); + + /// Return a list of all sections to include in the output binary. + /// Populate \p NewSectionIndex with a map of input to output indices. template ::Elf_Shdr> - std::vector getOutputSections( - ELFObjectFile *File, - std::vector *OutputSections = nullptr, - std::map *OutputSectionNameMap = nullptr); + std::vector getOutputSections( + ELFObjectFile *File, std::vector &NewSectionIndex); /// Add a notes section containing the BOLT revision and command line options. void addBoltInfoSection(); @@ -290,7 +295,7 @@ class RewriteInstance { /// DW_AT_low/high_pc or DW_AT_ranges (i.e. functions, lexical blocks, etc). /// \p DebugRangesOffset is the offset in .debug_ranges of the object's /// new address ranges in the output binary. - /// \p Unit Compile uniit the object belongs to. + /// \p Unit Compile unit the object belongs to. /// \p DIE is the object's DIE in the input binary. void updateDWARFObjectAddressRanges(const DWARFDie DIE, uint64_t DebugRangesOffset); @@ -416,10 +421,6 @@ class RewriteInstance { /// Maps section name -> patcher. std::map> SectionPatchers; - uint64_t NewTextSectionStartAddress{0}; - - uint64_t NewTextSectionIndex{0}; - /// Number of local symbols in newly written symbol table. 
uint64_t NumLocalSymbols{0}; From 56105b6c7da47485593d49aafb3c9bc120753b84 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 14 Mar 2019 20:32:04 -0700 Subject: [PATCH 506/904] [BOLT] Fix -hot-functions-at-end option Summary: Make "-hot-functions-at-end" option work again. (cherry picked from commit f133874aad1256af3ef98d8baf61246f0d8de14f) --- bolt/src/RewriteInstance.cpp | 73 +++++++++++++++++++++++------------- bolt/src/RewriteInstance.h | 2 +- 2 files changed, 48 insertions(+), 27 deletions(-) diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index ddf81ccba3f8..d1a44f4daac5 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -2854,6 +2854,7 @@ void RewriteInstance::emitSections() { void RewriteInstance::emitFunctions(MCStreamer *Streamer) { auto *TextSection = BC->MOFI->getTextSection(); + TextSection->setAlignment(BC->PageAlign); auto *ColdSection = BC->Ctx->getELFSection(".text.cold", ELF::SHT_PROGBITS, @@ -2960,44 +2961,64 @@ void RewriteInstance::emitFunctions(MCStreamer *Streamer) { } void RewriteInstance::mapFileSections(orc::VModuleKey Key) { - mapTextSections(Key); + mapCodeSections(Key); mapDataSections(Key); } -void RewriteInstance::mapTextSections(orc::VModuleKey Key) { +void RewriteInstance::mapCodeSections(orc::VModuleKey Key) { if (BC->HasRelocations) { - uint64_t AllocationAddress{0}; - auto TextSection = BC->getUniqueSectionByName(".text"); - assert(TextSection && ".text not found in output"); - auto ColdSection = BC->getUniqueSectionByName(".text.cold"); + // Populate the list of sections to be allocated. + std::vector CodeSections; + for (auto &Section : BC->textSections()) { + if (Section.hasValidSectionID()) + CodeSections.emplace_back(&Section); + }; - DEBUG( - for (auto &Section : BC->textSections()) { - dbgs() << "code section : " << Section.getName() - << "; valid ID : " << Section.hasValidSectionID() << '\n'; + // Determine the order of sections. 
+ std::stable_sort(CodeSections.begin(), CodeSections.end(), + [&](const BinarySection *A, const BinarySection *B) { + if (opts::HotFunctionsAtEnd) { + return B->getName() == ".text"; + } else { + return A->getName() == ".text"; + } + }); + + DEBUG(dbgs() << "Code section in the order of output:\n"; + for (const auto *Section : CodeSections) { + dbgs() << Section->getName() << '\n'; }); - auto CodeSize = TextSection->getOutputSize(); - if (ColdSection) { - CodeSize = alignTo(CodeSize, ColdSection->getAlignment()); - CodeSize += ColdSection->getOutputSize(); + // Beginning address for placing code. + uint64_t AllocationAddress{0}; + uint64_t CodeSize{0}; + + // Check if we can fit code in the original .text + if (opts::UseOldText) { + auto Code = BC->OldTextSectionAddress; + for (const auto *CodeSection : CodeSections) { + Code = alignTo(Code, CodeSection->getAlignment()); + Code += CodeSection->getOutputSize(); + } + CodeSize = Code - BC->OldTextSectionAddress; + + if (CodeSize <= BC->OldTextSectionSize) { + outs() << "BOLT-INFO: using original .text for new code with 0x" + << Twine::utohexstr(BC->PageAlign) << " alignment\n"; + AllocationAddress = BC->OldTextSectionAddress; + } } - auto Padding = OffsetToAlignment(BC->OldTextSectionAddress, BC->PageAlign); - if (opts::UseOldText && Padding + CodeSize <= BC->OldTextSectionSize) { - // Utilize the original .text for storage. - outs() << "BOLT-INFO: using original .text for new code with 0x" - << Twine::utohexstr(BC->PageAlign) << " alignment\n"; - AllocationAddress = BC->OldTextSectionAddress + Padding; - } else { + + if (!AllocationAddress) { if (opts::UseOldText) { errs() << "BOLT-WARNING: original .text too small to fit the new code" << " using 0x" << Twine::utohexstr(BC->PageAlign) - << " aligment. " << Padding + CodeSize + << " page alignment. 
" << CodeSize << " bytes needed, have " << BC->OldTextSectionSize << " bytes available.\n"; opts::UseOldText = false; } - AllocationAddress = alignTo(NextAvailableAddress, BC->PageAlign); + AllocationAddress = NextAvailableAddress; } auto mapSection = [&](BinarySection &Section) { @@ -3012,9 +3033,9 @@ void RewriteInstance::mapTextSections(orc::VModuleKey Key) { AllocationAddress += Section.getOutputSize(); }; - mapSection(*TextSection); - if (ColdSection) - mapSection(*ColdSection); + for (auto *CodeSection : CodeSections) { + mapSection(*CodeSection); + } if (!opts::UseOldText) { NextAvailableAddress = AllocationAddress; diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index c3107965b8af..6e84fcf4f10d 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -121,7 +121,7 @@ class RewriteInstance { std::vector FunctionStack); /// Map all sections to their final addresses. - void mapTextSections(orc::VModuleKey ObjectsHandle); + void mapCodeSections(orc::VModuleKey ObjectsHandle); void mapDataSections(orc::VModuleKey ObjectsHandle); void mapFileSections(orc::VModuleKey ObjectsHandle); From e4fbc60e8baaeeb8c157ea8d02dfa5e46eff1a58 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 15 Mar 2019 15:06:41 -0700 Subject: [PATCH 507/904] [BOLT][NFC] Fix compilation warnings Summary: Get rid of warnings while building with Clang. 
(cherry picked from commit 23d06b81492ebe41d28b3897bf1cb1f8a973bd04) --- bolt/src/Passes/ReorderFunctions.cpp | 2 +- bolt/src/Passes/RetpolineInsertion.cpp | 52 ++++++++++++++------------ bolt/src/Passes/RetpolineInsertion.h | 18 +++++---- 3 files changed, 40 insertions(+), 32 deletions(-) diff --git a/bolt/src/Passes/ReorderFunctions.cpp b/bolt/src/Passes/ReorderFunctions.cpp index c2a989bb292b..b332d35101de 100644 --- a/bolt/src/Passes/ReorderFunctions.cpp +++ b/bolt/src/Passes/ReorderFunctions.cpp @@ -290,7 +290,7 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC, opts::ReorderFunctions != RT_USER) { Cg = buildCallGraph(BC, BFs, - [this](const BinaryFunction &BF) { + [](const BinaryFunction &BF) { if (!BF.hasProfile()) return true; if (BF.getState() != BinaryFunction::State::CFG) diff --git a/bolt/src/Passes/RetpolineInsertion.cpp b/bolt/src/Passes/RetpolineInsertion.cpp index 5f330e32f791..29feb3cfa428 100644 --- a/bolt/src/Passes/RetpolineInsertion.cpp +++ b/bolt/src/Passes/RetpolineInsertion.cpp @@ -138,9 +138,10 @@ BinaryFunction *createNewRetpoline(BinaryContext &BC, BB2.addInstruction(PushR11); MCInst LoadCalleeAddrs; - MIB.createLoad(LoadCalleeAddrs, BrInfo.BaseRegNum, BrInfo.ScaleValue, - BrInfo.IndexRegNum, BrInfo.DispValue, BrInfo.DispExpr, - BrInfo.SegRegNum, MIB.getX86R11(), 8); + const auto &MemRef = BrInfo.Memory; + MIB.createLoad(LoadCalleeAddrs, MemRef.BaseRegNum, MemRef.ScaleValue, + MemRef.IndexRegNum, MemRef.DispValue, MemRef.DispExpr, + MemRef.SegRegNum, MIB.getX86R11(), 8); BB2.addInstruction(LoadCalleeAddrs); @@ -186,27 +187,29 @@ std::string createRetpolineFunctionTag(BinaryContext &BC, std::string Tag = "__retpoline_mem_"; + const auto &MemRef = BrInfo.Memory; + std::string DispExprStr; - if (BrInfo.DispExpr) { + if (MemRef.DispExpr) { llvm::raw_string_ostream Ostream(DispExprStr); - BrInfo.DispExpr->print(Ostream, BC.AsmInfo.get()); + MemRef.DispExpr->print(Ostream, BC.AsmInfo.get()); Ostream.flush(); } - Tag += 
BrInfo.BaseRegNum != BC.MIB->getX86NoRegister() - ? "r" + to_string(BrInfo.BaseRegNum) + Tag += MemRef.BaseRegNum != BC.MIB->getX86NoRegister() + ? "r" + to_string(MemRef.BaseRegNum) : ""; Tag += - BrInfo.DispExpr ? "+" + DispExprStr : "+" + to_string(BrInfo.DispValue); + MemRef.DispExpr ? "+" + DispExprStr : "+" + to_string(MemRef.DispValue); - Tag += BrInfo.IndexRegNum != BC.MIB->getX86NoRegister() - ? "+" + to_string(BrInfo.ScaleValue) + "*" + - to_string(BrInfo.IndexRegNum) + Tag += MemRef.IndexRegNum != BC.MIB->getX86NoRegister() + ? "+" + to_string(MemRef.ScaleValue) + "*" + + to_string(MemRef.IndexRegNum) : ""; - Tag += BrInfo.SegRegNum != BC.MIB->getX86NoRegister() - ? "_seg_" + to_string(BrInfo.SegRegNum) + Tag += MemRef.SegRegNum != BC.MIB->getX86NoRegister() + ? "_seg_" + to_string(MemRef.SegRegNum) : ""; return Tag; @@ -232,10 +235,11 @@ void createBranchReplacement(BinaryContext &BC, auto &MIB = *BC.MIB; // Load the branch address in r11 if available if (BrInfo.isMem() && R11Available) { + const auto &MemRef = BrInfo.Memory; MCInst LoadCalleeAddrs; - MIB.createLoad(LoadCalleeAddrs, BrInfo.BaseRegNum, BrInfo.ScaleValue, - BrInfo.IndexRegNum, BrInfo.DispValue, BrInfo.DispExpr, - BrInfo.SegRegNum, MIB.getX86R11(), 8); + MIB.createLoad(LoadCalleeAddrs, MemRef.BaseRegNum, MemRef.ScaleValue, + MemRef.IndexRegNum, MemRef.DispValue, MemRef.DispExpr, + MemRef.SegRegNum, MIB.getX86R11(), 8); Replacement.push_back(LoadCalleeAddrs); } @@ -255,9 +259,10 @@ IndirectBranchInfo::IndirectBranchInfo(MCInst &Inst, MCPlusBuilder &MIB) { if (MIB.isBranchOnMem(Inst)) { IsMem = true; - if (!MIB.evaluateX86MemoryOperand(Inst, &BaseRegNum, &ScaleValue, - &IndexRegNum, &DispValue, &SegRegNum, - &DispExpr)) { + if (!MIB.evaluateX86MemoryOperand(Inst, &Memory.BaseRegNum, + &Memory.ScaleValue, + &Memory.IndexRegNum, &Memory.DispValue, + &Memory.SegRegNum, &Memory.DispExpr)) { llvm_unreachable("not expected"); } } else if (MIB.isBranchOnReg(Inst)) { @@ -309,12 +314,13 @@ void 
RetpolineInsertion::runOnFunctions(BinaryContext &BC, // If the instruction addressing pattern uses rsp and the retpoline // loads the callee address then displacement needs to be updated if (BrInfo.isMem() && !R11Available) { + auto &MemRef = BrInfo.Memory; auto Addend = (BrInfo.isJump() || BrInfo.isTailCall()) ? 8 : 16; - if (BrInfo.BaseRegNum == MIB.getStackPointer()) { - BrInfo.DispValue += Addend; + if (MemRef.BaseRegNum == MIB.getStackPointer()) { + MemRef.DispValue += Addend; } - if (BrInfo.IndexRegNum == MIB.getStackPointer()) - BrInfo.DispValue += Addend * BrInfo.ScaleValue; + if (MemRef.IndexRegNum == MIB.getStackPointer()) + MemRef.DispValue += Addend * MemRef.ScaleValue; } TargetRetpoline = getOrCreateRetpoline(BC, BrInfo, R11Available); diff --git a/bolt/src/Passes/RetpolineInsertion.h b/bolt/src/Passes/RetpolineInsertion.h index dd29ff47066a..e3cf5bb3675b 100644 --- a/bolt/src/Passes/RetpolineInsertion.h +++ b/bolt/src/Passes/RetpolineInsertion.h @@ -34,19 +34,21 @@ struct IndirectBranchInfo { bool isJump() const { return !IsCall; } bool isTailCall() const { return IsTailCall; } + struct MemOpInfo { + unsigned BaseRegNum; + int64_t ScaleValue; + unsigned IndexRegNum; + int64_t DispValue; + unsigned SegRegNum; + const MCExpr *DispExpr{nullptr}; + }; + union { // Register branch information MCPhysReg BranchReg; // Memory branch information - struct { - unsigned BaseRegNum; - int64_t ScaleValue; - unsigned IndexRegNum; - int64_t DispValue; - unsigned SegRegNum; - const MCExpr *DispExpr{nullptr}; - }; + MemOpInfo Memory; }; }; From eb8f3994a3a82bfd1a65dafd63e9d4bda5e6b468 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 18 Mar 2019 19:22:26 -0700 Subject: [PATCH 508/904] [BOLT] Fix debug line info emission Summary: GDB does not like if the first entry in the line info table after end_sequence entry is not marked with is_stmt. If this happens, it will not print the correct line number information for such address. 
Note that everything works fine starting with the first address marked with is_stmt. This could happen if the first instruction in the cold section wasn't marked with is_stmt. The fix is to always emit debug line info for the first instruction in any function fragment with is_stmt flag. (cherry picked from commit 67144d1bee370c591e2817049f6d35de1621ff9d) --- bolt/src/BinaryFunction.cpp | 22 ++++++++++++++++------ bolt/src/BinaryFunction.h | 4 +++- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 979e69907684..42e37d5ae594 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -2658,6 +2658,8 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart, if (!EmitCodeOnly && EmitColdPart && hasConstantIsland()) duplicateConstantIslands(); + // Track first emitted instruction with debug info. + bool FirstInstr = true; for (auto BB : layout()) { if (EmitColdPart != BB->isCold()) continue; @@ -2714,7 +2716,8 @@ void BinaryFunction::emitBody(MCStreamer &Streamer, bool EmitColdPart, } if (!EmitCodeOnly && opts::UpdateDebugSections && UnitLineTable.first) { - LastLocSeen = emitLineInfo(Instr.getLoc(), LastLocSeen); + LastLocSeen = emitLineInfo(Instr.getLoc(), LastLocSeen, FirstInstr); + FirstInstr = false; } Streamer.EmitInstruction(Instr, *BC.STI); @@ -3702,7 +3705,8 @@ bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol, return true; } -SMLoc BinaryFunction::emitLineInfo(SMLoc NewLoc, SMLoc PrevLoc) const { +SMLoc BinaryFunction::emitLineInfo(SMLoc NewLoc, SMLoc PrevLoc, + bool FirstInstr) const { auto *FunctionCU = UnitLineTable.first; const auto *FunctionLineTable = UnitLineTable.second; assert(FunctionCU && "cannot emit line info for function without CU"); @@ -3737,14 +3741,20 @@ SMLoc BinaryFunction::emitLineInfo(SMLoc NewLoc, SMLoc PrevLoc) const { if (!CurrentFilenum) CurrentFilenum = CurrentRow.File; + unsigned Flags = (DWARF2_FLAG_IS_STMT 
* CurrentRow.IsStmt) | + (DWARF2_FLAG_BASIC_BLOCK * CurrentRow.BasicBlock) | + (DWARF2_FLAG_PROLOGUE_END * CurrentRow.PrologueEnd) | + (DWARF2_FLAG_EPILOGUE_BEGIN * CurrentRow.EpilogueBegin); + + // Always emit is_stmt at the beginning of function fragment. + if (FirstInstr) + Flags |= DWARF2_FLAG_IS_STMT; + BC.Ctx->setCurrentDwarfLoc( CurrentFilenum, CurrentRow.Line, CurrentRow.Column, - (DWARF2_FLAG_IS_STMT * CurrentRow.IsStmt) | - (DWARF2_FLAG_BASIC_BLOCK * CurrentRow.BasicBlock) | - (DWARF2_FLAG_PROLOGUE_END * CurrentRow.PrologueEnd) | - (DWARF2_FLAG_EPILOGUE_BEGIN * CurrentRow.EpilogueBegin), + Flags, CurrentRow.Isa, CurrentRow.Discriminator); BC.Ctx->setDwarfCompileUnitID(FunctionUnitIndex); diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index fb2afa13754d..13b0b64a8771 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -688,9 +688,11 @@ class BinaryFunction { /// Emit line number information corresponding to \p NewLoc. \p PrevLoc /// provides a context for de-duplication of line number info. + /// \p FirstInstr indicates if \p NewLoc represents the first instruction + /// in a sequence, such as a function fragment. /// /// Return new current location which is either \p NewLoc or \p PrevLoc. - SMLoc emitLineInfo(SMLoc NewLoc, SMLoc PrevLoc) const; + SMLoc emitLineInfo(SMLoc NewLoc, SMLoc PrevLoc, bool FirstInstr) const; BinaryFunction& operator=(const BinaryFunction &) = delete; BinaryFunction(const BinaryFunction &) = delete; From cd51105ebb6f7b3aaa270bdb40ecfc32f07ac0b1 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 15 Mar 2019 13:43:36 -0700 Subject: [PATCH 509/904] [BOLT] Place hot text mover functions into a separate section Summary: Create a separate pass for assigning functions to sections. Detect functions originating from special sections (by default .stub and .mover) and place them into ".text.mover" if "-hot-text" options is specified. 
Cold functions are isolated from hot functions even when no function re-ordering is specified. (cherry picked from commit d1b3763ac7c3adebc85ed0aa86775c69560d2ae5) --- bolt/src/BinaryContext.h | 37 ++++++ bolt/src/BinaryFunction.h | 10 ++ bolt/src/BinaryPassManager.cpp | 5 +- bolt/src/Passes/BinaryPasses.cpp | 35 ++++++ bolt/src/Passes/BinaryPasses.h | 17 ++- bolt/src/RewriteInstance.cpp | 201 +++++++++++++------------------ bolt/src/RewriteInstance.h | 3 + 7 files changed, 191 insertions(+), 117 deletions(-) diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index ba3abdf03eb1..96762b053381 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -32,6 +32,7 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Object/ObjectFile.h" @@ -456,6 +457,42 @@ class BinaryContext { return Ctx->getOrCreateSymbol("__hot_end"); } + MCSection *getTextSection() const { + return MOFI->getTextSection(); + } + + /// Return code section with a given name. + MCSection *getCodeSection(StringRef SectionName) const { + return Ctx->getELFSection(SectionName, + ELF::SHT_PROGBITS, + ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); + } + + /// \name Pre-assigned Section Names + /// @{ + + const char *getMainCodeSectionName() const { + return ".text"; + } + + const char *getColdCodeSectionName() const { + return ".text.cold"; + } + + const char *getHotTextMoverSectionName() const { + return ".text.mover"; + } + + const char *getInjectedCodeSectionName() const { + return ".text.injected"; + } + + const char *getInjectedColdCodeSectionName() const { + return ".text.injected.cold"; + } + + /// @} + /// Perform any necessary post processing on the symbol table after /// function disassembly is complete. This processing fixes top /// level data holes and makes sure the symbol table is valid. 
diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index 13b0b64a8771..7992c4a9f469 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -1253,6 +1253,11 @@ class BinaryFunction { return StringRef(CodeSectionName); } + /// Assign a code section name to the function. + void setCodeSectionName(StringRef Name) { + CodeSectionName = Name; + } + /// Get output code section. ErrorOr getCodeSection() const { return BC.getUniqueSectionByName(getCodeSectionName()); @@ -1263,6 +1268,11 @@ class BinaryFunction { return StringRef(ColdCodeSectionName); } + /// Assign a section name for the cold part of the function. + void setColdCodeSectionName(StringRef Name) { + ColdCodeSectionName = Name; + } + /// Get output code section for cold code of this function. ErrorOr getColdCodeSection() const { return BC.getUniqueSectionByName(getColdCodeSectionName()); diff --git a/bolt/src/BinaryPassManager.cpp b/bolt/src/BinaryPassManager.cpp index 49e7d5bcf62c..28516aaea781 100644 --- a/bolt/src/BinaryPassManager.cpp +++ b/bolt/src/BinaryPassManager.cpp @@ -462,7 +462,10 @@ void BinaryFunctionPassManager::runAllPasses( Manager.registerPass( llvm::make_unique(PrintRetpolineInsertion)); - // Thighten branches according to offset differences between branch and + // Assign each function an output section. + Manager.registerPass(llvm::make_unique()); + + // Tighten branches according to offset differences between branch and // targets. No extra instructions after this pass, otherwise we may have // relocations out of range and crash during linking. 
if (BC.isAArch64()) diff --git a/bolt/src/Passes/BinaryPasses.cpp b/bolt/src/Passes/BinaryPasses.cpp index 9eb6dfbbffef..f9ee20a07934 100644 --- a/bolt/src/Passes/BinaryPasses.cpp +++ b/bolt/src/Passes/BinaryPasses.cpp @@ -56,6 +56,7 @@ extern cl::opt Verbosity; extern cl::opt SplitEH; extern cl::opt SplitFunctions; extern bool shouldProcess(const bolt::BinaryFunction &Function); +extern bool isHotTextMover(const bolt::BinaryFunction &Function); enum DynoStatsSortOrder : char { Ascending, @@ -1216,6 +1217,40 @@ void SimplifyRODataLoads::runOnFunctions( << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n"; } +void AssignSections::runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &) { + for (auto *Function : BC.getInjectedBinaryFunctions()) { + Function->setCodeSectionName(BC.getInjectedCodeSectionName()); + Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName()); + } + + // In non-relocation mode functions have pre-assigned section names. + if (!BC.HasRelocations) + return; + + const auto UseColdSection = BC.NumProfiledFuncs > 0; + for (auto &BFI : BFs) { + auto &Function = BFI.second; + if (opts::isHotTextMover(Function)) { + Function.setCodeSectionName(BC.getHotTextMoverSectionName()); + Function.setColdCodeSectionName(BC.getHotTextMoverSectionName()); + continue; + } + + if (!UseColdSection || + Function.hasValidIndex() || + Function.hasValidProfile()) { + Function.setCodeSectionName(BC.getMainCodeSectionName()); + } else { + Function.setCodeSectionName(BC.getColdCodeSectionName()); + } + + if (Function.isSplit()) + Function.setColdCodeSectionName(BC.getColdCodeSectionName()); + } +} + void PrintProgramStats::runOnFunctions(BinaryContext &BC, std::map &BFs, diff --git a/bolt/src/Passes/BinaryPasses.h b/bolt/src/Passes/BinaryPasses.h index 53068af0a0f7..4cf7fee9c92c 100644 --- a/bolt/src/Passes/BinaryPasses.h +++ b/bolt/src/Passes/BinaryPasses.h @@ -323,7 +323,7 @@ class Peepholes : public BinaryFunctionPass { /// /// mov 
0x12f(%rip), %eax /// -/// to their counterparts that use immediate opreands instead of memory loads: +/// to their counterparts that use immediate operands instead of memory loads: /// /// mov $0x4007dc, %eax /// @@ -353,6 +353,21 @@ class SimplifyRODataLoads : public BinaryFunctionPass { std::set &LargeFunctions) override; }; +/// Assign output sections to all functions. +class AssignSections : public BinaryFunctionPass { + public: + explicit AssignSections() + : BinaryFunctionPass(false) { + } + + const char *getName() const override { + return "assign-sections"; + } + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + /// Prints a list of the top 100 functions sorted by a set of /// dyno stats categories. class PrintProgramStats : public BinaryFunctionPass { diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index d1a44f4daac5..947ed1b73f5f 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -197,6 +197,16 @@ HotText("hot-text", cl::ZeroOrMore, cl::cat(BoltCategory)); +static cl::list +HotTextMoveSections("hot-text-move-sections", + cl::desc("list of sections containing functions used for hugifying hot text. " + "BOLT makes sure these functions are not placed on the same page as " + "the hot text. (default=\'.stub,.mover\')."), + cl::value_desc("sec1,sec2,sec3,..."), + cl::CommaSeparated, + cl::ZeroOrMore, + cl::cat(BoltCategory)); + static cl::opt HotData("hot-data", cl::desc("hot data symbols support (relocation mode)"), @@ -389,6 +399,15 @@ TimeRewrite("time-rewrite", cl::Hidden, cl::cat(BoltCategory)); +bool isHotTextMover(const BinaryFunction &Function) { + for (auto &SectionName : opts::HotTextMoveSections) { + if (Function.getOriginSectionName() == SectionName) + return true; + } + + return false; +} + // Check against lists of functions from options if we should // optimize the function with a given name. 
bool shouldProcess(const BinaryFunction &Function) { @@ -1750,16 +1769,19 @@ void RewriteInstance::adjustCommandLineOptions() { outs() << "BOLT-INFO: disabling -align-macro-fusion on non-x86 platform\n"; opts::AlignMacroOpFusion = MFT_NONE; } + if (opts::AlignMacroOpFusion != MFT_NONE && !BC->HasRelocations) { outs() << "BOLT-INFO: disabling -align-macro-fusion in non-relocation " "mode\n"; opts::AlignMacroOpFusion = MFT_NONE; } + if (opts::SplitEH && !BC->HasRelocations) { outs() << "BOLT-WARNING: disabling -split-eh in non-relocation mode\n"; opts::SplitEH = false; } + if (BC->isX86() && BC->HasRelocations && opts::AlignMacroOpFusion == MFT_HOT && !DA.started() && BC->DR.getAllFuncsData().empty() && @@ -1773,6 +1795,11 @@ void RewriteInstance::adjustCommandLineOptions() { outs() << "BOLT-WARNING: hot text is disabled in non-relocation mode\n"; opts::HotText = false; } + + if (opts::HotText && opts::HotTextMoveSections.getNumOccurrences() == 0) { + opts::HotTextMoveSections.addValue(".stub"); + opts::HotTextMoveSections.addValue(".mover"); + } } namespace { @@ -2600,16 +2627,13 @@ void RewriteInstance::emitFunction(MCStreamer &Streamer, if (Function.getState() == BinaryFunction::State::Empty) return; - auto *Section = static_cast(Streamer.getCurrentSectionOnly()); + auto *Section = + BC->getCodeSection(EmitColdPart ? 
Function.getColdCodeSectionName() + : Function.getCodeSectionName()); + Streamer.SwitchSection(Section); Section->setHasInstructions(true); BC->Ctx->addGenDwarfSection(Section); - if (EmitColdPart) { - Function.ColdCodeSectionName = Section->getSectionName(); - } else { - Function.CodeSectionName = Section->getSectionName(); - } - if (BC->HasRelocations) { Streamer.EmitCodeAlignment(BinaryFunction::MinAlign); auto MaxAlignBytes = EmitColdPart @@ -2744,6 +2768,8 @@ void RewriteInstance::emitSections() { Streamer->InitSections(false); + BC->getTextSection()->setAlignment(BC->PageAlign); + emitFunctions(Streamer.get()); if (!BC->HasRelocations && opts::UpdateDebugSections) @@ -2853,111 +2879,42 @@ void RewriteInstance::emitSections() { } void RewriteInstance::emitFunctions(MCStreamer *Streamer) { - auto *TextSection = BC->MOFI->getTextSection(); - TextSection->setAlignment(BC->PageAlign); - auto *ColdSection = - BC->Ctx->getELFSection(".text.cold", - ELF::SHT_PROGBITS, - ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); - ColdSection->setAlignment(64); - - // Sort functions for the output. - std::vector SortedFunctions = - BinaryContext::getSortedFunctions(BinaryFunctions); + auto emit = [&](const std::vector &Functions) { + for (auto *Function : Functions) { + if (!BC->HasRelocations && + (!Function->isSimple() || !opts::shouldProcess(*Function))) + continue; - DEBUG( - if (!BC->HasRelocations) { - auto SortedIt = SortedFunctions.begin(); - for (auto &It : BinaryFunctions) { - assert(&It.second == *SortedIt); - ++SortedIt; - } - }); - - // Emit a set of functions at the boundary of hot and cold code. 
- auto emitBoundaryFunctions = [&]() { - // Emit injected functions hot parts - for (auto *InjectedFunction : BC->getInjectedBinaryFunctions()) - emitFunction(*Streamer, *InjectedFunction, /*EmitColdPart=*/false); - - // Emit injected functions cold parts - for (auto *InjectedFunction : BC->getInjectedBinaryFunctions()) - emitFunction(*Streamer, *InjectedFunction, /*EmitColdPart=*/true); - - if (opts::SplitFunctions != BinaryFunction::ST_NONE) { - DEBUG(dbgs() << "BOLT-DEBUG: generating code for split functions\n"); - for (auto *FPtr : SortedFunctions) { - if (!FPtr->isSplit() || !FPtr->isSimple()) - continue; - emitFunction(*Streamer, *FPtr, /*EmitColdPart=*/true); - } + DEBUG(dbgs() << "BOLT: generating code for function \"" + << *Function << "\" : " + << Function->getFunctionNumber() << '\n'); + + emitFunction(*Streamer, *Function, /*EmitColdPart=*/false); + + if (Function->isSplit()) + emitFunction(*Streamer, *Function, /*EmitColdPart=*/true); } }; + // Mark the start of hot text. if (opts::HotText) { - Streamer->SwitchSection(TextSection); - Streamer->EmitCodeAlignment(BC->PageAlign); + Streamer->SwitchSection(BC->getTextSection()); Streamer->EmitLabel(BC->getHotTextStartSymbol()); } - if (BC->HasRelocations) { - Streamer->SwitchSection(ColdSection); - emitBoundaryFunctions(); - } - - bool UseColdSection = SortedFunctions.front()->hasValidIndex(); - - // Output functions one by one. 
- for (auto *FunctionPtr : SortedFunctions) { - auto &Function = *FunctionPtr; - - if (!BC->HasRelocations && - (!Function.isSimple() || !opts::shouldProcess(Function))) - continue; - - MCSection *Section; - if (BC->HasRelocations) { - if (UseColdSection && !Function.hasValidIndex()) { - Section = ColdSection; - } else { - Section = TextSection; - } - } else { - Section = - BC->Ctx->getELFSection(Function.getCodeSectionName(), - ELF::SHT_PROGBITS, - ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); - } - Streamer->SwitchSection(Section); - - DEBUG(dbgs() << "BOLT: generating code for function \"" - << Function << "\" : " - << Function.getFunctionNumber() << '\n'); - - emitFunction(*Streamer, Function, /*EmitColdPart=*/false); + // Emit functions in sorted order. + std::vector SortedFunctions = + BinaryContext::getSortedFunctions(BinaryFunctions); + emit(SortedFunctions); - if (!BC->HasRelocations && Function.isSplit()) { - Streamer->SwitchSection( - BC->Ctx->getELFSection(Function.getColdCodeSectionName(), - ELF::SHT_PROGBITS, - ELF::SHF_EXECINSTR | ELF::SHF_ALLOC)); - emitFunction(*Streamer, Function, /*EmitColdPart=*/true); - } - } + // Emit functions added by BOLT. + emit(BC->getInjectedBinaryFunctions()); + // Mark the end of hot text. 
if (opts::HotText) { - Streamer->SwitchSection(TextSection); + Streamer->SwitchSection(BC->getTextSection()); Streamer->EmitLabel(BC->getHotTextEndSymbol()); } - - // Emit injected functions in non-reloc mode - if (!BC->HasRelocations) { - Streamer->SwitchSection(TextSection); - for (auto *InjectedFunction : BC->getInjectedBinaryFunctions()){ - emitFunction(*Streamer, *InjectedFunction, /*EmitColdPart=*/false); - emitFunction(*Streamer, *InjectedFunction, /*EmitColdPart=*/true); - } - } } void RewriteInstance::mapFileSections(orc::VModuleKey Key) { @@ -2965,25 +2922,39 @@ void RewriteInstance::mapFileSections(orc::VModuleKey Key) { mapDataSections(Key); } +std::vector +RewriteInstance::getCodeSections() { + std::vector CodeSections; + for (auto &Section : BC->textSections()) { + if (Section.hasValidSectionID()) + CodeSections.emplace_back(&Section); + }; + + auto compareSections = [&](const BinarySection *A, const BinarySection *B) { + // Place movers before anything else. + if (A->getName() == BC->getHotTextMoverSectionName()) + return true; + if (B->getName() == BC->getHotTextMoverSectionName()) + return false; + + // Depending on the option, put main text at the beginning or at the end. + if (opts::HotFunctionsAtEnd) { + return B->getName() == BC->getMainCodeSectionName(); + } else { + return A->getName() == BC->getMainCodeSectionName(); + } + }; + + // Determine the order of sections. + std::stable_sort(CodeSections.begin(), CodeSections.end(), compareSections); + + return CodeSections; +} + void RewriteInstance::mapCodeSections(orc::VModuleKey Key) { if (BC->HasRelocations) { // Populate the list of sections to be allocated. - std::vector CodeSections; - for (auto &Section : BC->textSections()) { - if (Section.hasValidSectionID()) - CodeSections.emplace_back(&Section); - }; - - // Determine the order of sections. 
- std::stable_sort(CodeSections.begin(), CodeSections.end(), - [&](const BinarySection *A, const BinarySection *B) { - if (opts::HotFunctionsAtEnd) { - return B->getName() == ".text"; - } else { - return A->getName() == ".text"; - } - }); - + auto CodeSections = getCodeSections(); DEBUG(dbgs() << "Code section in the order of output:\n"; for (const auto *Section : CodeSections) { dbgs() << Section->getName() << '\n'; diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index 6e84fcf4f10d..83bd0e786dad 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -120,6 +120,9 @@ class RewriteInstance { void updateUnitDebugInfo(const DWARFDie DIE, std::vector FunctionStack); + /// Return the list of code sections in the output order. + std::vector getCodeSections(); + /// Map all sections to their final addresses. void mapCodeSections(orc::VModuleKey ObjectsHandle); void mapDataSections(orc::VModuleKey ObjectsHandle); From 892437ce26eeb70c62279e4a3277bdf86efd5448 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 19 Mar 2019 13:46:21 -0700 Subject: [PATCH 510/904] [BOLT] Use local binding for cold fragment symbols Summary: We used to use existing symbol binding while duplicating and renaming cold fragment symbols. As a result, some of those were emitted with global binding. This confuses gdb, and it starts treating those symbols as additional entry points. The fix is to always emit such symbols with a local binding. This also means that we have to sort static symbol table before emission to make sure local symbols precede all others. 
(cherry picked from commit cb1bbb385686eba34f87e0f408801fb25d867244) --- bolt/src/RewriteInstance.cpp | 52 ++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 947ed1b73f5f..cbc04523b840 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -3938,6 +3938,8 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { unsigned IsHotTextUpdated = 0; unsigned IsHotDataUpdated = 0; + std::vector Symbols; + std::map IslandSizes; auto getConstantIslandSize = [&IslandSizes](const BinaryFunction *BF) { auto Itr = IslandSizes.find(BF); @@ -3955,7 +3957,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { NewSymbol.st_size = Function->getOutputSize(); NewSymbol.st_other = 0; NewSymbol.setBindingAndType(ELF::STB_LOCAL, ELF::STT_FUNC); - Write(0, NewSymbol); + Symbols.emplace_back(NewSymbol); if (Function->isSplit()) { auto NewColdSym = NewSymbol; @@ -3965,7 +3967,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { Twine(Function->getPrintName()).concat(".cold.0").toStringRef(Buf)); NewColdSym.st_value = Function->cold().getAddress(); NewColdSym.st_size = Function->cold().getImageSize(); - Write(0, NewColdSym); + Symbols.emplace_back(NewColdSym); } } @@ -3990,7 +3992,8 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { NewColdSym.st_shndx = Function->getColdCodeSection()->getIndex(); NewColdSym.st_value = Function->cold().getAddress(); NewColdSym.st_size = Function->cold().getImageSize(); - Write(0, NewColdSym); + NewColdSym.setBindingAndType(ELF::STB_LOCAL, ELF::STT_FUNC); + Symbols.emplace_back(NewColdSym); } if (!PatchExisting && Function->hasConstantIsland()) { auto DataMark = Function->getOutputDataAddress(); @@ -4005,8 +4008,8 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { auto CodeMarkSym = DataMarkSym; CodeMarkSym.st_name = AddToStrTab("$x"); CodeMarkSym.st_value = CodeMark; 
- Write(0, DataMarkSym); - Write(0, CodeMarkSym); + Symbols.emplace_back(DataMarkSym); + Symbols.emplace_back(CodeMarkSym); } if (!PatchExisting && Function->hasConstantIsland() && Function->isSplit()) { @@ -4022,8 +4025,8 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { auto CodeMarkSym = DataMarkSym; CodeMarkSym.st_name = AddToStrTab("$x"); CodeMarkSym.st_value = CodeMark; - Write(0, DataMarkSym); - Write(0, CodeMarkSym); + Symbols.emplace_back(DataMarkSym); + Symbols.emplace_back(CodeMarkSym); } } else { uint32_t OldSectionIndex = NewSymbol.st_shndx; @@ -4106,15 +4109,23 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { << Twine::utohexstr(NewSymbol.st_value) << '\n'; } - Write((&Symbol - cantFail(Obj->symbols(Section)).begin()) * - sizeof(Elf_Sym), - NewSymbol); + if (PatchExisting) { + Write((&Symbol - cantFail(Obj->symbols(Section)).begin()) * + sizeof(Elf_Sym), + NewSymbol); + } else { + Symbols.emplace_back(NewSymbol); + } } + if (PatchExisting) + return; + assert((!IsHotTextUpdated || IsHotTextUpdated == 2) && "either none or both __hot_start/__hot_end symbols were expected"); assert((!IsHotDataUpdated || IsHotDataUpdated == 2) && - "either none or both __hot_data_start/__hot_data_end symbols were expected"); + "either none or both __hot_data_start/__hot_data_end symbols were " + "expected"); auto addSymbol = [&](const std::string &Name) { Elf_Sym Symbol; @@ -4128,18 +4139,31 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { outs() << "BOLT-INFO: setting " << Name << " to 0x" << Twine::utohexstr(Symbol.st_value) << '\n'; - Write(0, Symbol); + Symbols.emplace_back(Symbol); }; - if (opts::HotText && !IsHotTextUpdated && !PatchExisting) { + if (opts::HotText && !IsHotTextUpdated) { addSymbol("__hot_start"); addSymbol("__hot_end"); } - if (opts::HotData && !IsHotDataUpdated && !PatchExisting) { + if (opts::HotData && !IsHotDataUpdated) { addSymbol("__hot_data_start"); addSymbol("__hot_data_end"); } + + // Put local 
symbols at the beginning. + std::stable_sort(Symbols.begin(), Symbols.end(), + [](const Elf_Sym &A, const Elf_Sym &B) { + if (A.getBinding() == ELF::STB_LOCAL && + B.getBinding() != ELF::STB_LOCAL) + return true; + return false; + }); + + for (const auto &Symbol : Symbols) { + Write(0, Symbol); + } }; // Update dynamic symbol table. From ae051dc6f9bdfababd36fdf2e873f56391a4f29a Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 20 Mar 2019 16:13:09 -0700 Subject: [PATCH 511/904] [BOLT] Fix section lookup while deleting symbols Summary: While removing redundant local symbols, we used new section index to lookup the corresponding section in the old section table. As a result, we used to either not remove the correct symbols, or remove the wrong ones. (cherry picked from commit e0cd16a8e4480ca4909bb818082412444649af2b) --- bolt/src/RewriteInstance.cpp | 2 +- bolt/src/RewriteInstance.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index cbc04523b840..ef60e3eb072b 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -4063,7 +4063,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { if (NewSymbol.getType() == ELF::STT_NOTYPE && NewSymbol.getBinding() == ELF::STB_LOCAL && NewSymbol.st_size == 0) { - auto ExpectedSec = File->getELFFile()->getSection(NewSymbol.st_shndx); + auto ExpectedSec = File->getELFFile()->getSection(OldSectionIndex); if (ExpectedSec) { auto Section = *ExpectedSec; if (Section->sh_type == ELF::SHT_PROGBITS && diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index 83bd0e786dad..bc794eef2216 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -238,7 +238,6 @@ class RewriteInstance { } /// Patch ELF book-keeping info. - void patchELF(); void patchELFPHDRTable(); /// Create section header table. 
From a60c63c2019faab014bb4278d0ad96d412e99882 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 21 Mar 2019 21:13:45 -0700 Subject: [PATCH 512/904] [BOLT] Allocate enough space past __hot_end for huge pages Summary: While using "-hot-text" option, we might not get enough cold text to fill up the last huge page, and we can get data allocated on this page producing undesirable effects. To prevent this from happening, always make sure to allocate enough space past __hot_end. (cherry picked from commit ac76e77770310fcdc18b50caa981ca77ab64018a) --- bolt/src/RewriteInstance.cpp | 292 +++++++++++++++++++---------------- 1 file changed, 155 insertions(+), 137 deletions(-) diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index ef60e3eb072b..02ee04c67a52 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -2952,36 +2952,53 @@ RewriteInstance::getCodeSections() { } void RewriteInstance::mapCodeSections(orc::VModuleKey Key) { + auto TextSection = BC->getUniqueSectionByName(BC->getMainCodeSectionName()); + assert(TextSection && ".text section not found in output"); + if (BC->HasRelocations) { + assert(TextSection->hasValidSectionID() && ".text section should be valid"); + // Populate the list of sections to be allocated. auto CodeSections = getCodeSections(); - DEBUG(dbgs() << "Code section in the order of output:\n"; + DEBUG(dbgs() << "Code sections in the order of output:\n"; for (const auto *Section : CodeSections) { dbgs() << Section->getName() << '\n'; }); - // Beginning address for placing code. - uint64_t AllocationAddress{0}; - uint64_t CodeSize{0}; + uint64_t PaddingSize{0}; // size of padding required at the end + + // Allocate sections starting at a given Address. 
+ auto allocateAt = [&](uint64_t Address) { + for (auto *Section : CodeSections) { + Address = alignTo(Address, Section->getAlignment()); + Section->setOutputAddress(Address); + Address += Section->getOutputSize(); + } + + // Make sure we allocate enough space for huge pages. + if (opts::HotText) { + auto HotTextEnd = TextSection->getOutputAddress() + + TextSection->getOutputSize(); + HotTextEnd = alignTo(HotTextEnd, BC->PageAlign); + if (HotTextEnd > Address) { + PaddingSize = HotTextEnd - Address; + Address = HotTextEnd; + } + } + return Address; + }; // Check if we can fit code in the original .text + bool AllocationDone{false}; if (opts::UseOldText) { - auto Code = BC->OldTextSectionAddress; - for (const auto *CodeSection : CodeSections) { - Code = alignTo(Code, CodeSection->getAlignment()); - Code += CodeSection->getOutputSize(); - } - CodeSize = Code - BC->OldTextSectionAddress; + const auto CodeSize = allocateAt(BC->OldTextSectionAddress) - + BC->OldTextSectionAddress; if (CodeSize <= BC->OldTextSectionSize) { outs() << "BOLT-INFO: using original .text for new code with 0x" << Twine::utohexstr(BC->PageAlign) << " alignment\n"; - AllocationAddress = BC->OldTextSectionAddress; - } - } - - if (!AllocationAddress) { - if (opts::UseOldText) { + AllocationDone = true; + } else { errs() << "BOLT-WARNING: original .text too small to fit the new code" << " using 0x" << Twine::utohexstr(BC->PageAlign) << " page alignment. 
" << CodeSize @@ -2989,144 +3006,145 @@ void RewriteInstance::mapCodeSections(orc::VModuleKey Key) { << " bytes available.\n"; opts::UseOldText = false; } - AllocationAddress = NextAvailableAddress; } - auto mapSection = [&](BinarySection &Section) { - AllocationAddress = alignTo(AllocationAddress, Section.getAlignment()); - DEBUG(dbgs() << "BOLT: mapping " << Section.getName() << " 0x" - << Twine::utohexstr(Section.getAllocAddress()) - << " to 0x" << Twine::utohexstr(AllocationAddress) - << '\n'); - OLT->mapSectionAddress(Key, Section.getSectionID(), AllocationAddress); - Section.setOutputAddress(AllocationAddress); - Section.setFileOffset(getFileOffsetForAddress(AllocationAddress)); - AllocationAddress += Section.getOutputSize(); - }; - - for (auto *CodeSection : CodeSections) { - mapSection(*CodeSection); + if (!AllocationDone) { + NextAvailableAddress = allocateAt(NextAvailableAddress); } - if (!opts::UseOldText) { - NextAvailableAddress = AllocationAddress; + // Do the mapping for ORC layer based on the allocation. 
+ for (auto *Section : CodeSections) { + DEBUG(dbgs() << "BOLT: mapping " << Section->getName() + << " at 0x" << Twine::utohexstr(Section->getAllocAddress()) + << " to 0x" << Twine::utohexstr(Section->getOutputAddress()) + << '\n'); + OLT->mapSectionAddress(Key, Section->getSectionID(), + Section->getOutputAddress()); + Section->setFileOffset( + getFileOffsetForAddress(Section->getOutputAddress())); } - } else { - - auto NewTextSectionStartAddress = NextAvailableAddress; - - // Prepare .text section for injected functions - auto TextSection = BC->getUniqueSectionByName(".text"); - assert(TextSection && ".text not found in output"); - if (TextSection->hasValidSectionID()) { - uint64_t NewTextSectionOffset = 0; - auto Padding = OffsetToAlignment(NewTextSectionStartAddress, - BC->PageAlign); - NextAvailableAddress += Padding; - NewTextSectionStartAddress = NextAvailableAddress; - NewTextSectionOffset = getFileOffsetForAddress(NextAvailableAddress); - NextAvailableAddress += Padding + TextSection->getOutputSize(); - TextSection->setOutputAddress(NewTextSectionStartAddress); - TextSection->setFileOffset(NewTextSectionOffset); - - DEBUG(dbgs() << "BOLT: mapping .text 0x" - << Twine::utohexstr(TextSection->getAllocAddress()) - << " to 0x" << Twine::utohexstr(NewTextSectionStartAddress) - << '\n'); - OLT->mapSectionAddress(Key, TextSection->getSectionID(), - NewTextSectionStartAddress); + // Check if we need to insert a padding section for hot text. 
+ if (PaddingSize && !opts::UseOldText) { + outs() << "BOLT-INFO: padding code to 0x" + << Twine::utohexstr(NextAvailableAddress) + << " to accommodate hot text\n"; } - for (auto &BFI : BinaryFunctions) { - auto &Function = BFI.second; - if (!Function.isSimple() || !opts::shouldProcess(Function)) - continue; + return; + } - auto TooLarge = false; - auto FuncSection = Function.getCodeSection(); - assert(FuncSection && "cannot find section for function"); - FuncSection->setOutputAddress(Function.getAddress()); - DEBUG(dbgs() << "BOLT: mapping 0x" - << Twine::utohexstr(FuncSection->getAllocAddress()) - << " to 0x" << Twine::utohexstr(Function.getAddress()) - << '\n'); - OLT->mapSectionAddress(Key, FuncSection->getSectionID(), - Function.getAddress()); - Function.setImageAddress(FuncSection->getAllocAddress()); - Function.setImageSize(FuncSection->getOutputSize()); - if (Function.getImageSize() > Function.getMaxSize()) { - TooLarge = true; - FailedAddresses.emplace_back(Function.getAddress()); - } + // Processing in non-relocation mode. + auto NewTextSectionStartAddress = NextAvailableAddress; + + // Prepare .text section for injected functions + if (TextSection->hasValidSectionID()) { + uint64_t NewTextSectionOffset = 0; + auto Padding = OffsetToAlignment(NewTextSectionStartAddress, + BC->PageAlign); + NextAvailableAddress += Padding; + NewTextSectionStartAddress = NextAvailableAddress; + NewTextSectionOffset = getFileOffsetForAddress(NextAvailableAddress); + NextAvailableAddress += Padding + TextSection->getOutputSize(); + TextSection->setOutputAddress(NewTextSectionStartAddress); + TextSection->setFileOffset(NewTextSectionOffset); + + DEBUG(dbgs() << "BOLT: mapping .text 0x" + << Twine::utohexstr(TextSection->getAllocAddress()) + << " to 0x" << Twine::utohexstr(NewTextSectionStartAddress) + << '\n'); + OLT->mapSectionAddress(Key, TextSection->getSectionID(), + NewTextSectionStartAddress); + } - // Map jump tables if updating in-place. 
- if (opts::JumpTables == JTS_BASIC) { - for (auto &JTI : Function.JumpTables) { - auto *JT = JTI.second; - auto &Section = JT->getOutputSection(); - Section.setOutputAddress(JT->getAddress()); - DEBUG(dbgs() << "BOLT-DEBUG: mapping " << Section.getName() - << " to 0x" << Twine::utohexstr(JT->getAddress()) - << '\n'); - OLT->mapSectionAddress(Key, Section.getSectionID(), - JT->getAddress()); - } - } + for (auto &BFI : BinaryFunctions) { + auto &Function = BFI.second; + if (!Function.isSimple() || !opts::shouldProcess(Function)) + continue; - if (!Function.isSplit()) - continue; + auto TooLarge = false; + auto FuncSection = Function.getCodeSection(); + assert(FuncSection && "cannot find section for function"); + FuncSection->setOutputAddress(Function.getAddress()); + DEBUG(dbgs() << "BOLT: mapping 0x" + << Twine::utohexstr(FuncSection->getAllocAddress()) + << " to 0x" << Twine::utohexstr(Function.getAddress()) + << '\n'); + OLT->mapSectionAddress(Key, FuncSection->getSectionID(), + Function.getAddress()); + Function.setImageAddress(FuncSection->getAllocAddress()); + Function.setImageSize(FuncSection->getOutputSize()); + if (Function.getImageSize() > Function.getMaxSize()) { + TooLarge = true; + FailedAddresses.emplace_back(Function.getAddress()); + } - auto ColdSection = Function.getColdCodeSection(); - assert(ColdSection && "cannot find section for cold part"); - // Cold fragments are aligned at 16 bytes. - NextAvailableAddress = alignTo(NextAvailableAddress, 16); - auto &ColdPart = Function.cold(); - if (TooLarge) { - // The corresponding FDE will refer to address 0. 
- ColdPart.setAddress(0); - ColdPart.setImageAddress(0); - ColdPart.setImageSize(0); - ColdPart.setFileOffset(0); - } else { - ColdPart.setAddress(NextAvailableAddress); - ColdPart.setImageAddress(ColdSection->getAllocAddress()); - ColdPart.setImageSize(ColdSection->getOutputSize()); - ColdPart.setFileOffset(getFileOffsetForAddress(NextAvailableAddress)); - ColdSection->setOutputAddress(ColdPart.getAddress()); + // Map jump tables if updating in-place. + if (opts::JumpTables == JTS_BASIC) { + for (auto &JTI : Function.JumpTables) { + auto *JT = JTI.second; + auto &Section = JT->getOutputSection(); + Section.setOutputAddress(JT->getAddress()); + DEBUG(dbgs() << "BOLT-DEBUG: mapping " << Section.getName() + << " to 0x" << Twine::utohexstr(JT->getAddress()) + << '\n'); + OLT->mapSectionAddress(Key, Section.getSectionID(), + JT->getAddress()); } + } - DEBUG(dbgs() << "BOLT: mapping cold fragment 0x" - << Twine::utohexstr(ColdPart.getImageAddress()) - << " to 0x" - << Twine::utohexstr(ColdPart.getAddress()) - << " with size " - << Twine::utohexstr(ColdPart.getImageSize()) << '\n'); - OLT->mapSectionAddress(Key, ColdSection->getSectionID(), - ColdPart.getAddress()); + if (!Function.isSplit()) + continue; - NextAvailableAddress += ColdPart.getImageSize(); + auto ColdSection = Function.getColdCodeSection(); + assert(ColdSection && "cannot find section for cold part"); + // Cold fragments are aligned at 16 bytes. + NextAvailableAddress = alignTo(NextAvailableAddress, 16); + auto &ColdPart = Function.cold(); + if (TooLarge) { + // The corresponding FDE will refer to address 0. 
+ ColdPart.setAddress(0); + ColdPart.setImageAddress(0); + ColdPart.setImageSize(0); + ColdPart.setFileOffset(0); + } else { + ColdPart.setAddress(NextAvailableAddress); + ColdPart.setImageAddress(ColdSection->getAllocAddress()); + ColdPart.setImageSize(ColdSection->getOutputSize()); + ColdPart.setFileOffset(getFileOffsetForAddress(NextAvailableAddress)); + ColdSection->setOutputAddress(ColdPart.getAddress()); } - // Add the new text section aggregating all existing code sections. - // This is pseudo-section that serves a purpose of creating a corresponding - // entry in section header table. - auto NewTextSectionSize = NextAvailableAddress - NewTextSectionStartAddress; - if (NewTextSectionSize) { - const auto Flags = BinarySection::getFlags(/*IsReadOnly=*/true, - /*IsText=*/true, - /*IsAllocatable=*/true); - auto &Section = BC->registerOrUpdateSection(BOLTSecPrefix + ".text", - ELF::SHT_PROGBITS, - Flags, - nullptr, - NewTextSectionSize, - 16, - true /*IsLocal*/); - Section.setOutputAddress(NewTextSectionStartAddress); - Section.setFileOffset( - getFileOffsetForAddress(NewTextSectionStartAddress)); - } + DEBUG(dbgs() << "BOLT: mapping cold fragment 0x" + << Twine::utohexstr(ColdPart.getImageAddress()) + << " to 0x" + << Twine::utohexstr(ColdPart.getAddress()) + << " with size " + << Twine::utohexstr(ColdPart.getImageSize()) << '\n'); + OLT->mapSectionAddress(Key, ColdSection->getSectionID(), + ColdPart.getAddress()); + + NextAvailableAddress += ColdPart.getImageSize(); + } + + // Add the new text section aggregating all existing code sections. + // This is pseudo-section that serves a purpose of creating a corresponding + // entry in section header table. 
+ auto NewTextSectionSize = NextAvailableAddress - NewTextSectionStartAddress; + if (NewTextSectionSize) { + const auto Flags = BinarySection::getFlags(/*IsReadOnly=*/true, + /*IsText=*/true, + /*IsAllocatable=*/true); + auto &Section = BC->registerOrUpdateSection(BOLTSecPrefix + ".text", + ELF::SHT_PROGBITS, + Flags, + nullptr, + NewTextSectionSize, + 16, + true /*IsLocal*/); + Section.setOutputAddress(NewTextSectionStartAddress); + Section.setFileOffset( + getFileOffsetForAddress(NewTextSectionStartAddress)); } } From c049775e772b9fdbabb68b50c988aa7961888bc6 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 27 Mar 2019 13:58:31 -0700 Subject: [PATCH 513/904] [BOLT] Do not write jump table section headers Summary: In non-relocation mode we were accidentally emitting section headers for every single jump table. This happened with default `-jump-tables=basic`. (cherry picked from commit 74c59e1954256e7a0e09379922f9fef1dd76be7d) --- bolt/src/BinaryFunction.cpp | 8 +++++--- bolt/src/BinarySection.h | 6 ++++++ bolt/src/RewriteInstance.cpp | 4 ++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 42e37d5ae594..a3ba92b915a3 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -3805,9 +3805,11 @@ void BinaryFunction::emitJumpTables(MCStreamer *Streamer) { if (opts::JumpTables == JTS_BASIC) { std::string Name = ".local." 
+ JT.Labels[0]->getName().str(); std::replace(Name.begin(), Name.end(), '/', '.'); - JT.setOutputSection(BC.registerOrUpdateSection(Name, - ELF::SHT_PROGBITS, - ELF::SHF_ALLOC)); + auto &Section = BC.registerOrUpdateSection(Name, + ELF::SHT_PROGBITS, + ELF::SHF_ALLOC); + Section.setAnonymous(true); + JT.setOutputSection(Section); HotSection = BC.Ctx->getELFSection(Name, ELF::SHT_PROGBITS, ELF::SHF_ALLOC); diff --git a/bolt/src/BinarySection.h b/bolt/src/BinarySection.h index a8309b99a9b3..a0bb47e3877a 100644 --- a/bolt/src/BinarySection.h +++ b/bolt/src/BinarySection.h @@ -70,6 +70,8 @@ class BinarySection { // Set by ExecutableFileMemoryManager. uint32_t Index{0}; // Section index in the output file. mutable bool IsReordered{false}; // Have the contents been reordered? + bool IsAnonymous{false}; // True if the name should not be included + // in the output file. uint64_t hash(const BinaryData &BD, std::map &Cache) const; @@ -265,6 +267,7 @@ class BinarySection { } bool isLocal() const { return IsLocal; } bool isReordered() const { return IsReordered; } + bool isAnonymous() const { return IsAnonymous; } unsigned getELFType() const { return ELFType; } unsigned getELFFlags() const { return ELFFlags; } @@ -402,6 +405,9 @@ class BinarySection { void setOutputName(StringRef Name) { OutputName = Name; } + void setAnonymous(bool Flag) { + IsAnonymous = Flag; + } /// Reorder the contents of this section according to /p Order. 
If /// /p Inplace is true, the entire contents of the section is reordered, diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 02ee04c67a52..6c9625508e5d 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -3737,9 +3737,9 @@ std::vector RewriteInstance::getOutputSections( if (!Section.isFinalized()) continue; - if (Section.getName().startswith(OrgSecPrefix)) { + if (Section.getName().startswith(OrgSecPrefix) || Section.isAnonymous()) { if (opts::Verbosity) - outs() << "BOLT-INFO: not writing section header for existing section " + outs() << "BOLT-INFO: not writing section header for section " << Section.getName() << '\n'; continue; } From a9371d7b510325169b9b99acd05aeacc70af8565 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 29 Mar 2019 14:22:54 -0700 Subject: [PATCH 514/904] [BOLT][DWARF] Dedup .debug_abbrev section patches Summary: When we patch .debug_abbrev we issue many duplicate patches. Instead of storing these patches as a vector, use a hash map. This saves some processing time and memory. 
(cherry picked from commit afb458d7eb3cfe11959bb14bd218adb2c76846b3) --- bolt/src/BinaryFunction.cpp | 22 ++++++++++----- bolt/src/BinaryFunction.h | 4 +-- bolt/src/DebugData.cpp | 50 +++++++++++++++------------------- bolt/src/DebugData.h | 52 +++++++++++++++++++++++++++++------- bolt/src/RewriteInstance.cpp | 4 +-- bolt/src/RewriteInstance.h | 4 +-- 6 files changed, 85 insertions(+), 51 deletions(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index a3ba92b915a3..37c4536b34b7 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -3900,8 +3900,8 @@ void BinaryFunction::calculateLoopInfo() { } } -DWARFAddressRangesVector BinaryFunction::getOutputAddressRanges() const { - DWARFAddressRangesVector OutputRanges; +DebugAddressRangesVector BinaryFunction::getOutputAddressRanges() const { + DebugAddressRangesVector OutputRanges; OutputRanges.emplace_back(getOutputAddress(), getOutputAddress() + getOutputSize()); @@ -3938,16 +3938,24 @@ uint64_t BinaryFunction::translateInputToOutputAddress(uint64_t Address) const { BB->getOutputAddressRange().second); } -DWARFAddressRangesVector BinaryFunction::translateInputToOutputRanges( +DebugAddressRangesVector BinaryFunction::translateInputToOutputRanges( const DWARFAddressRangesVector &InputRanges) const { + DebugAddressRangesVector OutputRanges; + // If the function hasn't changed return the same ranges. - if (!isEmitted() && !BC.HasRelocations) - return InputRanges; + if (!isEmitted() && !BC.HasRelocations) { + OutputRanges.resize(InputRanges.size()); + std::transform(InputRanges.begin(), InputRanges.end(), + OutputRanges.begin(), + [](const DWARFAddressRange &Range) { + return DebugAddressRange(Range.LowPC, Range.HighPC); + }); + return OutputRanges; + } // Even though we will merge ranges in a post-processing pass, we attempt to // merge them in a main processing loop as it improves the processing time. 
uint64_t PrevEndAddress = 0; - DWARFAddressRangesVector OutputRanges; for (const auto &Range : InputRanges) { if (!containsAddress(Range.LowPC)) { DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected for " @@ -3998,7 +4006,7 @@ DWARFAddressRangesVector BinaryFunction::translateInputToOutputRanges( // Post-processing pass to sort and merge ranges. std::sort(OutputRanges.begin(), OutputRanges.end()); - DWARFAddressRangesVector MergedRanges; + DebugAddressRangesVector MergedRanges; PrevEndAddress = 0; for(const auto &Range : OutputRanges) { if (Range.LowPC <= PrevEndAddress) { diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index 7992c4a9f469..23c554e700d1 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -2281,7 +2281,7 @@ class BinaryFunction { } /// Return output address ranges for a function. - DWARFAddressRangesVector getOutputAddressRanges() const; + DebugAddressRangesVector getOutputAddressRanges() const; /// Given an address corresponding to an instruction in the input binary, /// return an address of this instruction in output binary. @@ -2292,7 +2292,7 @@ class BinaryFunction { /// Take address ranges corresponding to the input binary and translate /// them to address ranges in the output binary. - DWARFAddressRangesVector translateInputToOutputRanges( + DebugAddressRangesVector translateInputToOutputRanges( const DWARFAddressRangesVector &InputRanges) const; /// Similar to translateInputToOutputRanges() but operates on location lists diff --git a/bolt/src/DebugData.cpp b/bolt/src/DebugData.cpp index 3bc981d55e8f..ff122dd0575c 100644 --- a/bolt/src/DebugData.cpp +++ b/bolt/src/DebugData.cpp @@ -40,7 +40,7 @@ namespace { // Returns the number of written bytes. 
uint64_t writeAddressRanges( MCObjectWriter *Writer, - const DWARFAddressRangesVector &AddressRanges, + const DebugAddressRangesVector &AddressRanges, const bool WriteRelativeRanges = false) { for (auto &Range : AddressRanges) { Writer->writeLE64(Range.LowPC); @@ -62,12 +62,12 @@ DebugRangesSectionsWriter::DebugRangesSectionsWriter(BinaryContext *BC) { std::unique_ptr(BC->createObjectWriter(*RangesStream)); // Add an empty range as the first entry; - SectionOffset += writeAddressRanges(Writer.get(), DWARFAddressRangesVector{}); + SectionOffset += writeAddressRanges(Writer.get(), DebugAddressRangesVector{}); } uint64_t DebugRangesSectionsWriter::addCURanges( uint64_t CUOffset, - DWARFAddressRangesVector &&Ranges) { + DebugAddressRangesVector &&Ranges) { const auto RangesOffset = addRanges(Ranges); CUAddressRanges.emplace(CUOffset, std::move(Ranges)); @@ -76,7 +76,7 @@ uint64_t DebugRangesSectionsWriter::addCURanges( uint64_t DebugRangesSectionsWriter::addRanges(const BinaryFunction *Function, - DWARFAddressRangesVector &&Ranges) { + DebugAddressRangesVector &&Ranges) { if (Ranges.empty()) return getEmptyRangesOffset(); @@ -98,7 +98,7 @@ DebugRangesSectionsWriter::addRanges(const BinaryFunction *Function, } uint64_t -DebugRangesSectionsWriter::addRanges(const DWARFAddressRangesVector &Ranges) { +DebugRangesSectionsWriter::addRanges(const DebugAddressRangesVector &Ranges) { if (Ranges.empty()) return getEmptyRangesOffset(); @@ -235,36 +235,30 @@ void DebugAbbrevPatcher::addAttributePatch(const DWARFUnit *Unit, uint8_t NewAttrTag, uint8_t NewAttrForm) { assert(Unit && "No compile unit specified."); - Patches[Unit].emplace_back( - AbbrevAttrPatch{AbbrevCode, AttrTag, NewAttrTag, NewAttrForm}); + AbbrevPatches.emplace( + AbbrevAttrPatch{Unit, AbbrevCode, AttrTag, NewAttrTag, NewAttrForm}); } void DebugAbbrevPatcher::patchBinary(std::string &Contents) { SimpleBinaryPatcher Patcher; - for (const auto &UnitPatchesPair : Patches) { - const auto *Unit = UnitPatchesPair.first; 
- const auto *UnitAbbreviations = Unit->getAbbreviations(); + for (const auto &Patch : AbbrevPatches) { + const auto *UnitAbbreviations = Patch.Unit->getAbbreviations(); assert(UnitAbbreviations && "Compile unit doesn't have associated abbreviations."); - const auto &UnitPatches = UnitPatchesPair.second; - for (const auto &AttrPatch : UnitPatches) { - const auto *AbbreviationDeclaration = - UnitAbbreviations->getAbbreviationDeclaration(AttrPatch.Code); - assert(AbbreviationDeclaration && "No abbreviation with given code."); - const auto Attribute = - AbbreviationDeclaration->findAttribute(AttrPatch.Attr); - - assert(Attribute && "Specified attribute doesn't occur in abbreviation."); - // Because we're only handling standard values (i.e. no DW_FORM_GNU_* or - // DW_AT_APPLE_*), they are all small (< 128) and encoded in a single - // byte in ULEB128, otherwise it'll be more tricky as we may need to - // grow or shrink the section. - Patcher.addBytePatch(Attribute->AttrOffset, - AttrPatch.NewAttr); - Patcher.addBytePatch(Attribute->FormOffset, - AttrPatch.NewForm); - } + const auto *AbbreviationDeclaration = + UnitAbbreviations->getAbbreviationDeclaration(Patch.Code); + assert(AbbreviationDeclaration && "No abbreviation with given code."); + const auto Attribute = + AbbreviationDeclaration->findAttribute(Patch.Attr); + + assert(Attribute && "Specified attribute doesn't occur in abbreviation."); + // Because we're only handling standard values (i.e. no DW_FORM_GNU_* or + // DW_AT_APPLE_*), they are all small (< 128) and encoded in a single + // byte in ULEB128, otherwise it'll be more tricky as we may need to + // grow or shrink the section. 
+ Patcher.addBytePatch(Attribute->AttrOffset, Patch.NewAttr); + Patcher.addBytePatch(Attribute->FormOffset, Patch.NewForm); } Patcher.patchBinary(Contents); } diff --git a/bolt/src/DebugData.h b/bolt/src/DebugData.h index 9b8c18c663af..44fc1ccc4324 100644 --- a/bolt/src/DebugData.h +++ b/bolt/src/DebugData.h @@ -21,6 +21,7 @@ #include "llvm/Support/raw_ostream.h" #include #include +#include #include #include @@ -39,7 +40,26 @@ class BasicBlockTable; class BinaryBasicBlock; class BinaryFunction; -/// Eeferences a row in a DWARFDebugLine::LineTable by the DWARF +/// Address range representation. Takes less space than DWARFAddressRange. +struct DebugAddressRange { + uint64_t LowPC{0}; + uint64_t HighPC{0}; + + DebugAddressRange() = default; + + DebugAddressRange(uint64_t LowPC, uint64_t HighPC) + : LowPC(LowPC), HighPC(HighPC) {} +}; + +static inline bool operator<(const DebugAddressRange &LHS, + const DebugAddressRange &RHS) { + return std::tie(LHS.LowPC, LHS.HighPC) < std::tie(RHS.LowPC, RHS.HighPC); +} + +/// DebugAddressRangesVector - represents a set of absolute address ranges. +using DebugAddressRangesVector = SmallVector; + +/// References a row in a DWARFDebugLine::LineTable by the DWARF /// Context index of the DWARF Compile Unit that owns the Line Table and the row /// index. This is tied to our IR during disassembly so that we can later update /// .debug_line information. RowIndex has a base of 1, which means a RowIndex @@ -84,14 +104,14 @@ class DebugRangesSectionsWriter { DebugRangesSectionsWriter(BinaryContext *BC); /// Add ranges for CU matching \p CUOffset and return offset into section. - uint64_t addCURanges(uint64_t CUOffset, DWARFAddressRangesVector &&Ranges); + uint64_t addCURanges(uint64_t CUOffset, DebugAddressRangesVector &&Ranges); /// Add ranges with caching for \p Function. 
uint64_t addRanges(const BinaryFunction *Function, - DWARFAddressRangesVector &&Ranges); + DebugAddressRangesVector &&Ranges); /// Add ranges and return offset into section. - uint64_t addRanges(const DWARFAddressRangesVector &Ranges); + uint64_t addRanges(const DebugAddressRangesVector &Ranges); /// Writes .debug_aranges with the added ranges to the MCObjectWriter. void writeArangesSection(MCObjectWriter *Writer) const; @@ -106,7 +126,7 @@ class DebugRangesSectionsWriter { uint64_t getEmptyRangesOffset() const { return EmptyRangesOffset; } /// Map DWARFCompileUnit index to ranges. - using CUAddressRangesType = std::map; + using CUAddressRangesType = std::map; /// Return ranges for a given CU. const CUAddressRangesType &getCUAddressRanges() const { @@ -137,7 +157,7 @@ class DebugRangesSectionsWriter { static constexpr uint64_t EmptyRangesOffset{0}; /// Cached used for de-duplicating entries for the same function. - std::map CachedRanges; + std::map CachedRanges; }; /// Serializes the .debug_loc DWARF section with LocationLists. @@ -219,13 +239,25 @@ class DebugAbbrevPatcher : public BinaryPatcher { private: /// Patch of changing one attribute to another. struct AbbrevAttrPatch { - uint32_t Code; // Code of abbreviation to be modified. + const DWARFUnit *Unit; // Containing DWARF unit + uint32_t Code; // Code of abbreviation to be modified. dwarf::Attribute Attr; // ID of attribute to be replaced. - uint8_t NewAttr; // ID of the new attribute. - uint8_t NewForm; // Form of the new attribute. + uint8_t NewAttr; // ID of the new attribute. + uint8_t NewForm; // Form of the new attribute. 
+ + bool operator==(const AbbrevAttrPatch &RHS) const { + return Unit == RHS.Unit && Code == RHS.Code && Attr == RHS.Attr; + } + }; + + struct AbbrevHash { + std::size_t operator()(const AbbrevAttrPatch &P) const { + return std::hash()( + ((uint64_t)P.Unit->getOffset() << 32) + (P.Code << 16) + P.Attr); + } }; - std::map> Patches; + std::unordered_set AbbrevPatches; public: ~DebugAbbrevPatcher() { } diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 6c9625508e5d..57f9d4bedf17 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -4721,9 +4721,9 @@ RewriteInstance::getBinaryFunctionAtAddress(uint64_t Address) const { return nullptr; } -DWARFAddressRangesVector RewriteInstance::translateModuleAddressRanges( +DebugAddressRangesVector RewriteInstance::translateModuleAddressRanges( const DWARFAddressRangesVector &InputRanges) const { - DWARFAddressRangesVector OutputRanges; + DebugAddressRangesVector OutputRanges; for (const auto Range : InputRanges) { auto BFI = BinaryFunctions.lower_bound(Range.LowPC); diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index bc794eef2216..ffdbfb5584a7 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -183,7 +183,7 @@ class RewriteInstance { const BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address) const; /// Produce output address ranges based on input ranges for some module. - DWARFAddressRangesVector translateModuleAddressRanges( + DebugAddressRangesVector translateModuleAddressRanges( const DWARFAddressRangesVector &InputRanges) const; private: @@ -287,7 +287,7 @@ class RewriteInstance { /// Patches the binary for DWARF address ranges (e.g. in functions and lexical /// blocks) to be updated. - void updateDWARFAddressRanges(); + void updateDebugAddressRanges(); /// Rewrite .gdb_index section if present. 
void updateGdbIndexSection(); From ac006b15c2652962a7842b2a40c897fc32351a56 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 3 Apr 2019 15:52:01 -0700 Subject: [PATCH 515/904] [BOLT] Move BinaryFunctions into a BinaryContext and more Summary: A long due refactoring that makes interfaces cleaner and less awkward. Mainly makes the future work way easier. (cherry picked from commit 7b61ae863b9abd31987e5590bd1885c0ed4d6620) --- bolt/src/BinaryContext.cpp | 66 ++++- bolt/src/BinaryContext.h | 74 +++++- bolt/src/BinaryFunction.h | 13 + bolt/src/BinaryPassManager.cpp | 13 +- bolt/src/BinaryPassManager.h | 13 +- bolt/src/BoltDiff.cpp | 19 +- bolt/src/DWARFRewriter.cpp | 80 +++--- bolt/src/DWARFRewriter.h | 85 +++++++ bolt/src/DataAggregator.cpp | 22 +- bolt/src/DataAggregator.h | 7 +- bolt/src/DebugData.h | 5 - bolt/src/Passes/Aligner.cpp | 6 +- bolt/src/Passes/Aligner.h | 4 +- bolt/src/Passes/AllocCombiner.cpp | 7 +- bolt/src/Passes/AllocCombiner.h | 4 +- bolt/src/Passes/BinaryFunctionCallGraph.cpp | 3 +- bolt/src/Passes/BinaryFunctionCallGraph.h | 3 +- bolt/src/Passes/BinaryPasses.cpp | 101 +++----- bolt/src/Passes/BinaryPasses.h | 62 ++--- bolt/src/Passes/FrameAnalysis.cpp | 7 +- bolt/src/Passes/FrameAnalysis.h | 4 +- bolt/src/Passes/FrameOptimizer.cpp | 12 +- bolt/src/Passes/FrameOptimizer.h | 4 +- bolt/src/Passes/IdenticalCodeFolding.cpp | 10 +- bolt/src/Passes/IdenticalCodeFolding.h | 4 +- bolt/src/Passes/IndirectCallPromotion.cpp | 10 +- bolt/src/Passes/IndirectCallPromotion.h | 4 +- bolt/src/Passes/Inliner.cpp | 15 +- bolt/src/Passes/Inliner.h | 7 +- bolt/src/Passes/JTFootprintReduction.cpp | 12 +- bolt/src/Passes/JTFootprintReduction.h | 4 +- bolt/src/Passes/LongJmp.cpp | 6 +- bolt/src/Passes/LongJmp.h | 4 +- bolt/src/Passes/PLTCall.cpp | 7 +- bolt/src/Passes/PLTCall.h | 4 +- bolt/src/Passes/RegAnalysis.h | 3 +- bolt/src/Passes/RegReAssign.cpp | 12 +- bolt/src/Passes/RegReAssign.h | 4 +- bolt/src/Passes/ReorderData.cpp | 7 +- 
bolt/src/Passes/ReorderData.h | 4 +- bolt/src/Passes/ReorderFunctions.cpp | 6 +- bolt/src/Passes/ReorderFunctions.h | 4 +- bolt/src/Passes/RetpolineInsertion.cpp | 7 +- bolt/src/Passes/RetpolineInsertion.h | 4 +- bolt/src/Passes/StokeInfo.cpp | 11 +- bolt/src/Passes/StokeInfo.h | 4 +- bolt/src/Passes/ValidateInternalCalls.cpp | 6 +- bolt/src/Passes/ValidateInternalCalls.h | 4 +- bolt/src/Passes/VeneerElimination.cpp | 5 +- bolt/src/Passes/VeneerElimination.h | 4 +- bolt/src/ProfileWriter.cpp | 2 +- bolt/src/RewriteInstance.cpp | 258 ++++++++------------ bolt/src/RewriteInstance.h | 98 +------- 53 files changed, 525 insertions(+), 619 deletions(-) create mode 100644 bolt/src/DWARFRewriter.h diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index d95023b5a3e3..4be1af4dc79e 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -240,6 +240,19 @@ MCSymbol *BinaryContext::getOrCreateGlobalSymbol(uint64_t Address, return registerNameAtAddress(Name, Address, Size, Alignment, Flags); } +BinaryFunction *BinaryContext::createBinaryFunction( + const std::string &Name, BinarySection &Section, uint64_t Address, + uint64_t Size, bool IsSimple, uint64_t SymbolSize, uint16_t Alignment) { + auto Result = BinaryFunctions.emplace( + Address, BinaryFunction(Name, Section, Address, Size, *this, IsSimple)); + assert(Result.second == true && "unexpected duplicate function"); + auto *BF = &Result.first->second; + registerNameAtAddress(Name, Address, SymbolSize ? SymbolSize : Size, + Alignment); + setSymbolToFunctionMap(BF->getSymbol(), BF); + return BF; +} + MCSymbol *BinaryContext::registerNameAtAddress(StringRef Name, uint64_t Address, uint64_t Size, @@ -449,8 +462,7 @@ void BinaryContext::postProcessSymbolTable() { } void BinaryContext::foldFunction(BinaryFunction &ChildBF, - BinaryFunction &ParentBF, - std::map &BFs) { + BinaryFunction &ParentBF) { // Copy name list. 
ParentBF.addNewNames(ChildBF.getNames()); @@ -470,10 +482,10 @@ void BinaryContext::foldFunction(BinaryFunction &ChildBF, if (HasRelocations) { // Remove ChildBF from the global set of functions in relocs mode. - auto FI = BFs.find(ChildBF.getAddress()); - assert(FI != BFs.end() && "function not found"); + auto FI = BinaryFunctions.find(ChildBF.getAddress()); + assert(FI != BinaryFunctions.end() && "function not found"); assert(&ChildBF == &FI->second && "function mismatch"); - FI = BFs.erase(FI); + FI = BinaryFunctions.erase(FI); } else { // In non-relocation mode we keep the function, but rename it. std::string NewName = "__ICF_" + ChildBF.Names.back(); @@ -688,8 +700,7 @@ unsigned BinaryContext::addDebugFilenameToUnit(const uint32_t DestCUID, return cantFail(Ctx->getDwarfFile(Dir, FileName, 0, nullptr, None, DestCUID)); } -std::vector BinaryContext::getSortedFunctions( - std::map &BinaryFunctions) { +std::vector BinaryContext::getSortedFunctions() { std::vector SortedFunctions(BinaryFunctions.size()); std::transform(BinaryFunctions.begin(), BinaryFunctions.end(), SortedFunctions.begin(), @@ -707,8 +718,7 @@ std::vector BinaryContext::getSortedFunctions( return SortedFunctions; } -void BinaryContext::preprocessDebugInfo( - std::map &BinaryFunctions) { +void BinaryContext::preprocessDebugInfo() { // Populate MCContext with DWARF files. for (const auto &CU : DwCtx->compile_units()) { const auto CUID = CU->getOffset(); @@ -1210,3 +1220,41 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF) { return std::make_pair(HotSize, ColdSize); } + +BinaryFunction * +BinaryContext::getBinaryFunctionContainingAddress(uint64_t Address, + bool CheckPastEnd, + bool UseMaxSize) { + auto FI = BinaryFunctions.upper_bound(Address); + if (FI == BinaryFunctions.begin()) + return nullptr; + --FI; + + const auto UsedSize = UseMaxSize ? FI->second.getMaxSize() + : FI->second.getSize(); + + if (Address >= FI->first + UsedSize + (CheckPastEnd ? 
1 : 0)) + return nullptr; + return &FI->second; +} + +DebugAddressRangesVector BinaryContext::translateModuleAddressRanges( + const DWARFAddressRangesVector &InputRanges) const { + DebugAddressRangesVector OutputRanges; + + for (const auto Range : InputRanges) { + auto BFI = BinaryFunctions.lower_bound(Range.LowPC); + while (BFI != BinaryFunctions.end()) { + const auto &Function = BFI->second; + if (Function.getAddress() >= Range.HighPC) + break; + const auto FunctionRanges = Function.getOutputAddressRanges(); + std::move(std::begin(FunctionRanges), + std::end(FunctionRanges), + std::back_inserter(OutputRanges)); + std::advance(BFI, 1); + } + } + + return OutputRanges; +} diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index 96762b053381..18638bf46518 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -138,6 +138,9 @@ class BinaryContext { /// Low level section registration. BinarySection ®isterSection(BinarySection *Section); + /// Store all functions in the binary, sorted by original address. + std::map BinaryFunctions; + /// Functions injected by BOLT std::vector InjectedBinaryFunctions; @@ -161,6 +164,40 @@ class BinaryContext { FilterIterator; using FilteredBinaryDataIterator = FilterIterator; + /// Return BinaryFunction containing a given \p Address or nullptr if + /// no registered function has it. + /// + /// In a binary a function has somewhat vague boundaries. E.g. a function can + /// refer to the first byte past the end of the function, and it will still be + /// referring to this function, not the function following it in the address + /// space. Thus we have the following flags that allow to lookup for + /// a function where a caller has more context for the search. 
+ /// + /// If \p CheckPastEnd is true and the \p Address falls on a byte + /// immediately following the last byte of some function and there's no other + /// function that starts there, then return the function as the one containing + /// the \p Address. This is useful when we need to locate functions for + /// references pointing immediately past a function body. + /// + /// If \p UseMaxSize is true, then include the space between this function + /// body and the next object in address ranges that we check. + BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address, + bool CheckPastEnd = false, + bool UseMaxSize = false); + + /// Return BinaryFunction that starts at a given \p Address. + BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address) { + if (const auto *BD = getBinaryDataAtAddress(Address)) + return getFunctionForSymbol(BD->getSymbol()); + return nullptr; + } + + const BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address) const { + if (const auto *BD = getBinaryDataAtAddress(Address)) + return getFunctionForSymbol(BD->getSymbol()); + return nullptr; + } + /// [MCSymbol] -> [BinaryFunction] /// /// As we fold identical functions, multiple symbols can point @@ -188,6 +225,10 @@ class BinaryContext { /// top level BinaryData. bool validateHoles() const; + /// Produce output address ranges based on input ranges for some module. + DebugAddressRangesVector translateModuleAddressRanges( + const DWARFAddressRangesVector &InputRanges) const; + /// Get a bogus "absolute" section that will be associated with all /// absolute BinaryDatas. BinarySection &absoluteSection(); @@ -203,6 +244,25 @@ class BinaryContext { /// is complete, e.g. after building CFGs for all functions. void assignMemData(); + /// Construct BinaryFunction object and add it to internal maps. 
+ BinaryFunction *createBinaryFunction(const std::string &Name, + BinarySection &Section, + uint64_t Address, + uint64_t Size, + bool IsSimple, + uint64_t SymbolSize = 0, + uint16_t Alignment = 0); + + /// Return all functions for this rewrite instance. + std::map &getBinaryFunctions() { + return BinaryFunctions; + } + + /// Return all functions for this rewrite instance. + const std::map &getBinaryFunctions() const { + return BinaryFunctions; + } + /// Create BOLT-injected function BinaryFunction *createInjectedBinaryFunction(const std::string &Name, bool IsSimple = true); @@ -491,6 +551,10 @@ class BinaryContext { return ".text.injected.cold"; } + ErrorOr getGdbIndexSection() const { + return getUniqueSectionByName(".gdb_index"); + } + /// @} /// Perform any necessary post processing on the symbol table after @@ -675,9 +739,7 @@ class BinaryContext { /// Replaces all references to \p ChildBF with \p ParentBF. \p ChildBF is then /// removed from the list of functions \p BFs. The profile data of \p ChildBF /// is merged into that of \p ParentBF. - void foldFunction(BinaryFunction &ChildBF, - BinaryFunction &ParentBF, - std::map &BFs); + void foldFunction(BinaryFunction &ChildBF, BinaryFunction &ParentBF); /// Add a Section relocation at a given \p Address. void addRelocation(uint64_t Address, MCSymbol *Symbol, uint64_t Type, @@ -707,8 +769,7 @@ class BinaryContext { } /// Populate some internal data structures with debug info. - void preprocessDebugInfo( - std::map &BinaryFunctions); + void preprocessDebugInfo(); /// Add a filename entry from SrcCUID to DestCUID. unsigned addDebugFilenameToUnit(const uint32_t DestCUID, @@ -716,8 +777,7 @@ class BinaryContext { unsigned FileIndex); /// Return functions in output layout order - static std::vector - getSortedFunctions(std::map &BinaryFunctions); + std::vector getSortedFunctions(); /// Do the best effort to calculate the size of the function by emitting /// its code, and relaxing branch instructions. 
diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index 23c554e700d1..be33aab8c50f 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -337,6 +337,9 @@ class BinaryFunction { /// destination. bool HasFixedIndirectBranch{false}; + /// Is the function known to exceed its input size? + bool IsLarge{false}; + /// The address for the code for this function in codegen memory. uint64_t ImageAddress{0}; @@ -1292,6 +1295,11 @@ class BinaryFunction { return IsSimple; } + /// Return true if the function should be split for the output. + bool shouldSplit() const { + return IsLarge && !getBinaryContext().HasRelocations; + } + /// Return true if the function body is non-contiguous. bool isSplit() const { return layout_size() && @@ -1656,6 +1664,11 @@ class BinaryFunction { return *this; } + BinaryFunction &setLarge(bool Large) { + IsLarge = Large; + return *this; + } + BinaryFunction &setUsesGnuArgsSize(bool Uses = true) { UsesGnuArgsSize = Uses; return *this; diff --git a/bolt/src/BinaryPassManager.cpp b/bolt/src/BinaryPassManager.cpp index 28516aaea781..53c675fd9543 100644 --- a/bolt/src/BinaryPassManager.cpp +++ b/bolt/src/BinaryPassManager.cpp @@ -291,6 +291,7 @@ const char BinaryFunctionPassManager::TimerGroupDesc[] = "Binary Function Pass Manager"; void BinaryFunctionPassManager::runPasses() { + auto &BFs = BC.getBinaryFunctions(); for (const auto &OptPassPair : Passes) { if (!OptPassPair.first) continue; @@ -306,7 +307,7 @@ void BinaryFunctionPassManager::runPasses() { callWithDynoStats( [this,&Pass] { - Pass->runOnFunctions(BC, BFs, LargeFunctions); + Pass->runOnFunctions(BC); }, BFs, Pass->getName(), @@ -349,14 +350,10 @@ void BinaryFunctionPassManager::runPasses() { } } -void BinaryFunctionPassManager::runAllPasses( - BinaryContext &BC, - std::map &Functions, - std::set &LargeFunctions -) { - BinaryFunctionPassManager Manager(BC, Functions, LargeFunctions); +void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) { + 
BinaryFunctionPassManager Manager(BC); - const auto InitialDynoStats = getDynoStats(Functions); + const auto InitialDynoStats = getDynoStats(BC.getBinaryFunctions()); // Here we manage dependencies/order manually, since passes are run in the // order they're registered. diff --git a/bolt/src/BinaryPassManager.h b/bolt/src/BinaryPassManager.h index 08fa9c1af2f5..2a4632c46c59 100644 --- a/bolt/src/BinaryPassManager.h +++ b/bolt/src/BinaryPassManager.h @@ -27,8 +27,6 @@ namespace bolt { class BinaryFunctionPassManager { private: BinaryContext &BC; - std::map &BFs; - std::set &LargeFunctions; std::vector>> Passes; @@ -36,10 +34,8 @@ class BinaryFunctionPassManager { static const char TimerGroupName[]; static const char TimerGroupDesc[]; - BinaryFunctionPassManager(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) - : BC(BC), BFs(BFs), LargeFunctions(LargeFunctions) {} + BinaryFunctionPassManager(BinaryContext &BC) + : BC(BC) {} /// Adds a pass to this manager based on the value of its corresponding /// command-line option. @@ -57,10 +53,7 @@ class BinaryFunctionPassManager { void runPasses(); /// Runs all enabled implemented passes on all functions. 
- static void runAllPasses(BinaryContext &BC, - std::map &Functions, - std::set &LargeFunctions); - + static void runAllPasses(BinaryContext &BC); }; } // namespace bolt diff --git a/bolt/src/BoltDiff.cpp b/bolt/src/BoltDiff.cpp index 3ecea1490b47..90c484b3ed8e 100644 --- a/bolt/src/BoltDiff.cpp +++ b/bolt/src/BoltDiff.cpp @@ -204,7 +204,7 @@ class RewriteInstanceDiff { /// later when matching functions in binary 2 to corresponding functions /// in binary 1 void buildLookupMaps() { - for (const auto &BFI : RI1.BinaryFunctions) { + for (const auto &BFI : RI1.BC->getBinaryFunctions()) { StringRef LTOName; const auto &Function = BFI.second; const auto Score = getNormalizedScore(Function, RI1); @@ -224,7 +224,7 @@ class RewriteInstanceDiff { } // Compute LTONameLookup2 and LargestBin2 - for (const auto &BFI : RI2.BinaryFunctions) { + for (const auto &BFI : RI2.BC->getBinaryFunctions()) { StringRef LTOName; const auto &Function = BFI.second; const auto Score = getNormalizedScore(Function, RI2); @@ -245,7 +245,7 @@ class RewriteInstanceDiff { void matchFunctions() { outs() << "BOLT-DIFF: Mapping functions in Binary2 to Binary1\n"; - for (const auto &BFI2 : RI2.BinaryFunctions) { + for (const auto &BFI2 : RI2.BC->getBinaryFunctions()) { const auto &Function2 = BFI2.second; StringRef LTOName; bool Match = false; @@ -451,7 +451,7 @@ class RewriteInstanceDiff { /// having a large difference in performance because hotness shifted from /// LTO variant 1 to variant 2, even though they represent the same function. 
void computeAggregatedLTOScore() { - for (const auto &BFI : RI1.BinaryFunctions) { + for (const auto &BFI : RI1.BC->getBinaryFunctions()) { const auto &Function = BFI.second; double Score = getNormalizedScore(Function, RI1); auto Iter = LTOMap1.find(&Function); @@ -461,7 +461,7 @@ class RewriteInstanceDiff { } double UnmappedScore{0}; - for (const auto &BFI : RI2.BinaryFunctions) { + for (const auto &BFI : RI2.BC->getBinaryFunctions()) { const auto &Function = BFI.second; bool Matched = FuncMap.find(&Function) != FuncMap.end(); double Score = getNormalizedScore(Function, RI2); @@ -475,7 +475,8 @@ class RewriteInstanceDiff { if (FuncMap.find(Iter->second) == FuncMap.end()) UnmappedScore += Score; } - int64_t Unmapped = RI2.BinaryFunctions.size() - Bin2MappedFuncs.size(); + int64_t Unmapped = + RI2.BC->getBinaryFunctions().size() - Bin2MappedFuncs.size(); outs() << "BOLT-DIFF: " << Unmapped << " functions in Binary2 have no correspondence to any other " "function in Binary1.\n"; @@ -595,7 +596,7 @@ class RewriteInstanceDiff { void reportUnmapped() { outs() << "List of functions from binary 2 that were not matched with any " << "function in binary 1:\n"; - for (const auto &BFI2 : RI2.BinaryFunctions) { + for (const auto &BFI2 : RI2.BC->getBinaryFunctions()) { const auto &Function2 = BFI2.second; if (Bin2MappedFuncs.count(&Function2)) continue; @@ -654,9 +655,9 @@ void RewriteInstance::compare(RewriteInstance &RI2) { if (opts::ICF) { IdenticalCodeFolding ICF(opts::NeverPrint); outs() << "BOLT-DIFF: Starting ICF pass for binary 1"; - ICF.runOnFunctions(*BC, BinaryFunctions, LargeFunctions); + ICF.runOnFunctions(*BC); outs() << "BOLT-DIFF: Starting ICF pass for binary 2"; - ICF.runOnFunctions(*RI2.BC, RI2.BinaryFunctions, RI2.LargeFunctions); + ICF.runOnFunctions(*RI2.BC); } RewriteInstanceDiff RID(*this, RI2); diff --git a/bolt/src/DWARFRewriter.cpp b/bolt/src/DWARFRewriter.cpp index 88d7db2adcc3..abc3d67ea83c 100644 --- a/bolt/src/DWARFRewriter.cpp +++ 
b/bolt/src/DWARFRewriter.cpp @@ -9,11 +9,9 @@ // //===----------------------------------------------------------------------===// - -#include "BinaryBasicBlock.h" +#include "DWARFRewriter.h" #include "BinaryContext.h" #include "BinaryFunction.h" -#include "RewriteInstance.h" #include "llvm/ADT/STLExtras.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" @@ -59,14 +57,14 @@ KeepARanges("keep-aranges", } // namespace opts -void RewriteInstance::updateDebugInfo() { +void DWARFRewriter::updateDebugInfo() { SectionPatchers[".debug_abbrev"] = llvm::make_unique(); SectionPatchers[".debug_info"] = llvm::make_unique(); - RangesSectionsWriter = llvm::make_unique(BC.get()); - LocationListWriter = llvm::make_unique(BC.get()); + RangesSectionsWriter = llvm::make_unique(&BC); + LocationListWriter = llvm::make_unique(&BC); - for (auto &CU : BC->DwCtx->compile_units()) { + for (auto &CU : BC.DwCtx->compile_units()) { updateUnitDebugInfo(CU->getUnitDIE(false), std::vector{}); } @@ -76,7 +74,7 @@ void RewriteInstance::updateDebugInfo() { updateGdbIndexSection(); } -void RewriteInstance::updateUnitDebugInfo( +void DWARFRewriter::updateUnitDebugInfo( const DWARFDie DIE, std::vector FunctionStack) { @@ -85,7 +83,7 @@ void RewriteInstance::updateUnitDebugInfo( case dwarf::DW_TAG_compile_unit: { const auto ModuleRanges = DIE.getAddressRanges(); - auto OutputRanges = translateModuleAddressRanges(ModuleRanges); + auto OutputRanges = BC.translateModuleAddressRanges(ModuleRanges); const auto RangesSectionOffset = RangesSectionsWriter->addCURanges(DIE.getDwarfUnit()->getOffset(), std::move(OutputRanges)); @@ -99,7 +97,7 @@ void RewriteInstance::updateUnitDebugInfo( uint64_t SectionIndex, LowPC, HighPC; if (DIE.getLowAndHighPC(LowPC, HighPC, SectionIndex)) { IsFunctionDef = true; - const auto *Function = getBinaryFunctionAtAddress(LowPC); + const auto *Function = BC.getBinaryFunctionAtAddress(LowPC); if (Function && Function->isFolded()) { Function = 
nullptr; } @@ -229,7 +227,7 @@ void RewriteInstance::updateUnitDebugInfo( FunctionStack.pop_back(); } -void RewriteInstance::updateDWARFObjectAddressRanges( +void DWARFRewriter::updateDWARFObjectAddressRanges( const DWARFDie DIE, uint64_t DebugRangesOffset) { // Some objects don't have an associated DIE and cannot be updated (such as @@ -337,8 +335,8 @@ void RewriteInstance::updateDWARFObjectAddressRanges( } } -void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { - for (auto &It : BinaryFunctions) { +void DWARFRewriter::updateDebugLineInfoForNonSimpleFunctions() { + for (auto &It : BC.getBinaryFunctions()) { const auto &Function = It.second; if (Function.isSimple()) @@ -353,7 +351,7 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { std::vector Results; MCSectionELF *FunctionSection = - BC->Ctx->getELFSection(Function.getCodeSectionName(), + BC.Ctx->getELFSection(Function.getCodeSectionName(), ELF::SHT_PROGBITS, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); @@ -361,10 +359,10 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { if (LineTable->lookupAddressRange(Address, Function.getMaxSize(), Results)) { auto &OutputLineTable = - BC->Ctx->getMCDwarfLineTable(Unit->getOffset()).getMCLineSections(); + BC.Ctx->getMCDwarfLineTable(Unit->getOffset()).getMCLineSections(); for (auto RowIndex : Results) { const auto &Row = LineTable->Rows[RowIndex]; - BC->Ctx->setCurrentDwarfLoc( + BC.Ctx->setCurrentDwarfLoc( Row.File, Row.Line, Row.Column, @@ -375,17 +373,17 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { Row.Isa, Row.Discriminator, Row.Address); - auto Loc = BC->Ctx->getCurrentDwarfLoc(); - BC->Ctx->clearDwarfLocSeen(); + auto Loc = BC.Ctx->getCurrentDwarfLoc(); + BC.Ctx->clearDwarfLocSeen(); OutputLineTable.addLineEntry(MCDwarfLineEntry{nullptr, Loc}, FunctionSection); } // Add an empty entry past the end of the function // for end_sequence mark. 
- BC->Ctx->setCurrentDwarfLoc(0, 0, 0, 0, 0, 0, + BC.Ctx->setCurrentDwarfLoc(0, 0, 0, 0, 0, 0, Address + Function.getMaxSize()); - auto Loc = BC->Ctx->getCurrentDwarfLoc(); - BC->Ctx->clearDwarfLocSeen(); + auto Loc = BC.Ctx->getCurrentDwarfLoc(); + BC.Ctx->clearDwarfLocSeen(); OutputLineTable.addLineEntry(MCDwarfLineEntry{nullptr, Loc}, FunctionSection); } else { @@ -395,9 +393,9 @@ void RewriteInstance::updateDebugLineInfoForNonSimpleFunctions() { } } -void RewriteInstance::updateLineTableOffsets() { +void DWARFRewriter::updateLineTableOffsets() { const auto *LineSection = - BC->Ctx->getObjectFileInfo()->getDwarfLineSection(); + BC.Ctx->getObjectFileInfo()->getDwarfLineSection(); auto CurrentFragment = LineSection->begin(); uint32_t CurrentOffset = 0; uint32_t Offset = 0; @@ -406,7 +404,7 @@ void RewriteInstance::updateLineTableOffsets() { // output file, thus we can compute all table's offset by passing through // each fragment at most once, continuing from the last CU's beginning // instead of from the first fragment. 
- for (const auto &CUIDLineTablePair : BC->Ctx->getMCDwarfLineTables()) { + for (const auto &CUIDLineTablePair : BC.Ctx->getMCDwarfLineTables()) { auto Label = CUIDLineTablePair.second.getLabel(); if (!Label) continue; @@ -415,10 +413,10 @@ void RewriteInstance::updateLineTableOffsets() { if (CUOffset == -1U) continue; - auto *CU = BC->DwCtx->getCompileUnitForOffset(CUOffset); + auto *CU = BC.DwCtx->getCompileUnitForOffset(CUOffset); assert(CU && "no CU found at offset"); auto LTOffset = - BC->DwCtx->getAttrFieldOffsetForUnit(CU, dwarf::DW_AT_stmt_list); + BC.DwCtx->getAttrFieldOffsetForUnit(CU, dwarf::DW_AT_stmt_list); if (!LTOffset) continue; @@ -444,9 +442,9 @@ void RewriteInstance::updateLineTableOffsets() { Offset += Label->getOffset() - CurrentOffset; CurrentOffset = Label->getOffset(); - auto DbgInfoSection = BC->getUniqueSectionByName(".debug_info"); + auto DbgInfoSection = BC.getUniqueSectionByName(".debug_info"); assert(DbgInfoSection && ".debug_info section must exist"); - auto *Zero = BC->registerNameAtAddress("Zero", 0, 0, 0); + auto *Zero = BC.registerNameAtAddress("Zero", 0, 0, 0); DbgInfoSection->addRelocation(LTOffset, Zero, ELF::R_X86_64_32, @@ -463,43 +461,43 @@ void RewriteInstance::updateLineTableOffsets() { } } -void RewriteInstance::finalizeDebugSections() { +void DWARFRewriter::finalizeDebugSections() { // Skip .debug_aranges if we are re-generating .gdb_index. 
- if (opts::KeepARanges || !GdbIndexSection) { + if (opts::KeepARanges || !BC.getGdbIndexSection()) { SmallVector ARangesBuffer; raw_svector_ostream OS(ARangesBuffer); - auto MAB = std::unique_ptr(BC->TheTarget->createMCAsmBackend( - *BC->STI, *BC->MRI, MCTargetOptions())); + auto MAB = std::unique_ptr(BC.TheTarget->createMCAsmBackend( + *BC.STI, *BC.MRI, MCTargetOptions())); auto Writer = std::unique_ptr(MAB->createObjectWriter(OS)); RangesSectionsWriter->writeArangesSection(Writer.get()); const auto &ARangesContents = OS.str(); - BC->registerOrUpdateNoteSection(".debug_aranges", + BC.registerOrUpdateNoteSection(".debug_aranges", copyByteArray(ARangesContents), ARangesContents.size()); } auto RangesSectionContents = RangesSectionsWriter->finalize(); - BC->registerOrUpdateNoteSection(".debug_ranges", + BC.registerOrUpdateNoteSection(".debug_ranges", copyByteArray(*RangesSectionContents), RangesSectionContents->size()); auto LocationListSectionContents = LocationListWriter->finalize(); - BC->registerOrUpdateNoteSection(".debug_loc", + BC.registerOrUpdateNoteSection(".debug_loc", copyByteArray(*LocationListSectionContents), LocationListSectionContents->size()); } -void RewriteInstance::updateGdbIndexSection() { - if (!GdbIndexSection) +void DWARFRewriter::updateGdbIndexSection() { + if (!BC.getGdbIndexSection()) return; // See https://sourceware.org/gdb/onlinedocs/gdb/Index-Section-Format.html for // .gdb_index section format. - StringRef GdbIndexContents = GdbIndexSection->getContents(); + StringRef GdbIndexContents = BC.getGdbIndexSection()->getContents(); const auto *Data = GdbIndexContents.data(); @@ -523,13 +521,13 @@ void RewriteInstance::updateGdbIndexSection() { // Map CUs offsets to indices and verify existing index table. 
std::map OffsetToIndexMap; const auto CUListSize = CUTypesOffset - CUListOffset; - const auto NumCUs = BC->DwCtx->getNumCompileUnits(); + const auto NumCUs = BC.DwCtx->getNumCompileUnits(); if (CUListSize != NumCUs * 16) { errs() << "BOLT-ERROR: .gdb_index: CU count mismatch\n"; exit(1); } for (unsigned Index = 0; Index < NumCUs; ++Index, Data += 16) { - const auto *CU = BC->DwCtx->getCompileUnitAtIndex(Index); + const auto *CU = BC.DwCtx->getCompileUnitAtIndex(Index); const auto Offset = read64le(Data); if (CU->getOffset() != Offset) { errs() << "BOLT-ERROR: .gdb_index CU offset mismatch\n"; @@ -595,7 +593,7 @@ void RewriteInstance::updateGdbIndexSection() { memcpy(Buffer, Data, TrailingSize); // Register the new section. - BC->registerOrUpdateNoteSection(".gdb_index", + BC.registerOrUpdateNoteSection(".gdb_index", NewGdbIndexContents, NewGdbIndexSize); } diff --git a/bolt/src/DWARFRewriter.h b/bolt/src/DWARFRewriter.h new file mode 100644 index 000000000000..58285a63b216 --- /dev/null +++ b/bolt/src/DWARFRewriter.h @@ -0,0 +1,85 @@ +//===--- DWARFRewriter.h --------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_DWARF_REWRITER_H +#define LLVM_TOOLS_LLVM_BOLT_DWARF_REWRITER_H + +#include "DebugData.h" +#include "RewriteInstance.h" + +namespace llvm { + +namespace bolt { + +class BinaryFunction; + +class DWARFRewriter { + DWARFRewriter() = delete; + + BinaryContext &BC; + + using SectionPatchersType = RewriteInstance::SectionPatchersType; + + SectionPatchersType &SectionPatchers; + + /// Stores and serializes information that will be put into the .debug_ranges + /// and .debug_aranges DWARF sections. 
+ std::unique_ptr RangesSectionsWriter; + + std::unique_ptr LocationListWriter; + + /// Recursively update debug info for all DIEs in \p Unit. + /// If \p Function is not empty, it points to a function corresponding + /// to a parent DW_TAG_subprogram node of the current \p DIE. + void updateUnitDebugInfo(const DWARFDie DIE, + std::vector FunctionStack); + + /// Patches the binary for an object's address ranges to be updated. + /// The object can be a anything that has associated address ranges via either + /// DW_AT_low/high_pc or DW_AT_ranges (i.e. functions, lexical blocks, etc). + /// \p DebugRangesOffset is the offset in .debug_ranges of the object's + /// new address ranges in the output binary. + /// \p Unit Compile unit the object belongs to. + /// \p DIE is the object's DIE in the input binary. + void updateDWARFObjectAddressRanges(const DWARFDie DIE, + uint64_t DebugRangesOffset); + + /// Generate new contents for .debug_ranges and .debug_aranges section. + void finalizeDebugSections(); + + /// Patches the binary for DWARF address ranges (e.g. in functions and lexical + /// blocks) to be updated. + void updateDebugAddressRanges(); + + /// Rewrite .gdb_index section if present. + void updateGdbIndexSection(); + +public: + DWARFRewriter(BinaryContext &BC, + SectionPatchersType &SectionPatchers) + : BC(BC), SectionPatchers(SectionPatchers) {} + + /// Main function for updating the DWARF debug info. + void updateDebugInfo(); + + /// Computes output .debug_line line table offsets for each compile unit, + /// and updates stmt_list for a corresponding compile unit. + void updateLineTableOffsets(); + + /// Updates debug line information for non-simple functions, which are not + /// rewritten. 
+ void updateDebugLineInfoForNonSimpleFunctions(); +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index 70091d2fba70..dbc83687819c 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -422,11 +422,8 @@ std::error_code DataAggregator::writeAutoFDOData() { return std::error_code(); } -void DataAggregator::parseProfile( - BinaryContext &BC, - std::map &BFs) { +void DataAggregator::parseProfile(BinaryContext &BC) { this->BC = &BC; - this->BFs = &BFs; if (opts::ReadPreAggregated) { parsePreAggregated(); @@ -546,9 +543,7 @@ void DataAggregator::parseProfile( deleteTempFiles(); } -void DataAggregator::processProfile( - BinaryContext &BC, - std::map &BFs) { +void DataAggregator::processProfile(BinaryContext &BC) { if (opts::ReadPreAggregated) processPreAggregated(); else if (opts::BasicAggregation) @@ -559,7 +554,7 @@ void DataAggregator::processProfile( processMemEvents(); // Mark all functions with registered events as having a valid profile. - for (auto &BFI : BFs) { + for (auto &BFI : BC.getBinaryFunctions()) { auto &BF = BFI.second; if (BF.getBranchData()) { const auto Flags = opts::BasicAggregation ? 
BinaryFunction::PF_SAMPLE @@ -581,15 +576,8 @@ DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) { if (!BC->containsAddress(Address)) return nullptr; - auto FI = BFs->upper_bound(Address); - if (FI == BFs->begin()) - return nullptr; - --FI; - - const auto UsedSize = FI->second.getMaxSize(); - if (Address >= FI->first + UsedSize) - return nullptr; - return &FI->second; + return BC->getBinaryFunctionContainingAddress(Address, /*CheckPastEnd=*/false, + /*UseMaxSize=*/true); } bool DataAggregator::doSample(BinaryFunction &Func, uint64_t Address, diff --git a/bolt/src/DataAggregator.h b/bolt/src/DataAggregator.h index 26863ff14667..eca279daccf6 100644 --- a/bolt/src/DataAggregator.h +++ b/bolt/src/DataAggregator.h @@ -172,7 +172,6 @@ class DataAggregator : public DataReader { /// References to core BOLT data structures BinaryContext *BC{nullptr}; - std::map *BFs{nullptr}; /// Aggregation statistics uint64_t NumInvalidTraces{0}; @@ -400,12 +399,10 @@ class DataAggregator : public DataReader { /// Parse profile and mark functions/objects with profile. /// Don't assign profile to functions yet. - void parseProfile(BinaryContext &BC, - std::map &BFs); + void parseProfile(BinaryContext &BC); /// Populate functions with profile. - void processProfile(BinaryContext &BC, - std::map &BFs); + void processProfile(BinaryContext &BC); /// Check whether \p FileName is a perf.data file static bool checkPerfDataMagic(StringRef FileName); diff --git a/bolt/src/DebugData.h b/bolt/src/DebugData.h index 44fc1ccc4324..355c8b280423 100644 --- a/bolt/src/DebugData.h +++ b/bolt/src/DebugData.h @@ -25,19 +25,14 @@ #include #include -#include "BinaryBasicBlock.h" - namespace llvm { class DWARFCompileUnit; -class DWARFDebugInfoEntryMinimal; class MCObjectWriter; namespace bolt { class BinaryContext; -class BasicBlockTable; -class BinaryBasicBlock; class BinaryFunction; /// Address range representation. Takes less space than DWARFAddressRange. 
diff --git a/bolt/src/Passes/Aligner.cpp b/bolt/src/Passes/Aligner.cpp index d95634acbede..a261de569370 100644 --- a/bolt/src/Passes/Aligner.cpp +++ b/bolt/src/Passes/Aligner.cpp @@ -154,15 +154,13 @@ void AlignerPass::alignBlocks(BinaryFunction &Function) { } } -void AlignerPass::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) { +void AlignerPass::runOnFunctions(BinaryContext &BC) { if (!BC.HasRelocations) return; AlignHistogram.resize(opts::BlockAlignment); - for (auto &It : BFs) { + for (auto &It : BC.getBinaryFunctions()) { auto &Function = It.second; if (opts::UseCompactAligner) diff --git a/bolt/src/Passes/Aligner.h b/bolt/src/Passes/Aligner.h index 28e6f6d693b0..88adf932da7f 100644 --- a/bolt/src/Passes/Aligner.h +++ b/bolt/src/Passes/Aligner.h @@ -37,9 +37,7 @@ class AlignerPass : public BinaryFunctionPass { } /// Pass entry point - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; } // namespace bolt diff --git a/bolt/src/Passes/AllocCombiner.cpp b/bolt/src/Passes/AllocCombiner.cpp index 2513ea1fc88b..ad88a04f7aa9 100644 --- a/bolt/src/Passes/AllocCombiner.cpp +++ b/bolt/src/Passes/AllocCombiner.cpp @@ -100,14 +100,13 @@ void AllocCombinerPass::combineAdjustments(BinaryContext &BC, } } -void AllocCombinerPass::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) { +void AllocCombinerPass::runOnFunctions(BinaryContext &BC) { if (opts::FrameOptimization == FOP_NONE) return; runForAllWeCare( - BFs, [&](BinaryFunction &Function) { combineAdjustments(BC, Function); }); + BC.getBinaryFunctions(), + [&](BinaryFunction &Function) { combineAdjustments(BC, Function); }); outs() << "BOLT-INFO: Allocation combiner: " << NumCombined << " empty spaces coalesced.\n"; diff --git a/bolt/src/Passes/AllocCombiner.h b/bolt/src/Passes/AllocCombiner.h index 0e816048956d..cdfbf48c8c0d 100644 --- 
a/bolt/src/Passes/AllocCombiner.h +++ b/bolt/src/Passes/AllocCombiner.h @@ -40,9 +40,7 @@ class AllocCombinerPass : public BinaryFunctionPass { } /// Pass entry point - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; } // namespace bolt diff --git a/bolt/src/Passes/BinaryFunctionCallGraph.cpp b/bolt/src/Passes/BinaryFunctionCallGraph.cpp index 538b7378b668..c5db238fc465 100644 --- a/bolt/src/Passes/BinaryFunctionCallGraph.cpp +++ b/bolt/src/Passes/BinaryFunctionCallGraph.cpp @@ -77,7 +77,6 @@ std::deque BinaryFunctionCallGraph::buildTraversalOrder() { } BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, - std::map &BFs, CgFilterFunction Filter, bool CgFromPerfData, bool IncludeColdCalls, @@ -126,7 +125,7 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, uint64_t NoProfileCallsites = 0; uint64_t NumFallbacks = 0; uint64_t RecursiveCallsites = 0; - for (auto &It : BFs) { + for (auto &It : BC.getBinaryFunctions()) { auto *Function = &It.second; if (Filter(*Function)) { diff --git a/bolt/src/Passes/BinaryFunctionCallGraph.h b/bolt/src/Passes/BinaryFunctionCallGraph.h index 0bce5c9de92f..b840493c9cc1 100644 --- a/bolt/src/Passes/BinaryFunctionCallGraph.h +++ b/bolt/src/Passes/BinaryFunctionCallGraph.h @@ -57,7 +57,7 @@ class BinaryFunctionCallGraph : public CallGraph { using CgFilterFunction = std::function; inline bool NoFilter(const BinaryFunction &) { return false; } -/// Builds a call graph from the map of BinaryFunctions provided in BFs. +/// Builds a call graph from the map of BinaryFunctions provided in BC. /// The arguments control how the graph is constructed. /// Filter is called on each function, any function that it returns true for /// is omitted from the graph. 
@@ -68,7 +68,6 @@ inline bool NoFilter(const BinaryFunction &) { return false; } /// UseEdgeCounts is used to control if the Weight attribute on Arcs is computed /// using the number of calls. BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC, - std::map &BFs, CgFilterFunction Filter = NoFilter, bool CgFromPerfData = false, bool IncludeColdCalls = true, diff --git a/bolt/src/Passes/BinaryPasses.cpp b/bolt/src/Passes/BinaryPasses.cpp index f9ee20a07934..62dc7e2e717a 100644 --- a/bolt/src/Passes/BinaryPasses.cpp +++ b/bolt/src/Passes/BinaryPasses.cpp @@ -286,12 +286,8 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) { } } -void EliminateUnreachableBlocks::runOnFunctions( - BinaryContext&, - std::map &BFs, - std::set & -) { - for (auto &It : BFs) { +void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) { + for (auto &It : BC.getBinaryFunctions()) { auto &Function = It.second; if (shouldOptimize(Function)) { runOnFunction(Function); @@ -306,17 +302,14 @@ bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const { opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE); } -void ReorderBasicBlocks::runOnFunctions( - BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) { +void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) { if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE) return; IsAArch64 = BC.isAArch64(); uint64_t ModifiedFuncCount = 0; - for (auto &It : BFs) { + for (auto &It : BC.getBinaryFunctions()) { auto &Function = It.second; if (!shouldOptimize(Function)) @@ -326,7 +319,7 @@ void ReorderBasicBlocks::runOnFunctions( (opts::SplitFunctions == BinaryFunction::ST_ALL) || (opts::SplitFunctions == BinaryFunction::ST_EH && Function.hasEHRanges()) || - (LargeFunctions.find(It.first) != LargeFunctions.end()); + Function.shouldSplit(); modifyFunctionLayout(Function, opts::ReorderBlocks, opts::MinBranchClusters, ShouldSplit); @@ -337,12 +330,14 @@ void ReorderBasicBlocks::runOnFunctions( outs() 
<< "BOLT-INFO: basic block reordering modified layout of " << format("%zu (%.2lf%%) functions\n", - ModifiedFuncCount, 100.0 * ModifiedFuncCount / BFs.size()); + ModifiedFuncCount, + 100.0 * ModifiedFuncCount / BC.getBinaryFunctions().size()); if (opts::PrintFuncStat > 0) { raw_ostream &OS = outs(); // Copy all the values into vector in order to sort them std::map ScoreMap; + auto &BFs = BC.getBinaryFunctions(); for (auto It = BFs.begin(); It != BFs.end(); ++It) { ScoreMap.insert(std::pair( It->second.getFunctionScore(), It->second)); @@ -551,11 +546,8 @@ void ReorderBasicBlocks::splitFunction(BinaryFunction &BF) const { } } -void FixupBranches::runOnFunctions( - BinaryContext &BC, - std::map &BFs, - std::set &) { - for (auto &It : BFs) { +void FixupBranches::runOnFunctions(BinaryContext &BC) { + for (auto &It : BC.getBinaryFunctions()) { auto &Function = It.second; if (BC.HasRelocations || shouldOptimize(Function)) { if (BC.HasRelocations && !Function.isSimple()) @@ -565,12 +557,8 @@ void FixupBranches::runOnFunctions( } } -void FinalizeFunctions::runOnFunctions( - BinaryContext &BC, - std::map &BFs, - std::set & -) { - for (auto &It : BFs) { +void FinalizeFunctions::runOnFunctions(BinaryContext &BC) { + for (auto &It : BC.getBinaryFunctions()) { auto &Function = It.second; const auto ShouldOptimize = shouldOptimize(Function); @@ -596,11 +584,8 @@ void FinalizeFunctions::runOnFunctions( } } -void LowerAnnotations::runOnFunctions( - BinaryContext &BC, - std::map &BFs, - std::set &) { - for (auto &It : BFs) { +void LowerAnnotations::runOnFunctions(BinaryContext &BC) { + for (auto &It : BC.getBinaryFunctions()) { auto &BF = It.second; int64_t CurrentGnuArgsSize = 0; @@ -985,15 +970,11 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, return NumLocalCTCs > 0; } -void SimplifyConditionalTailCalls::runOnFunctions( - BinaryContext &BC, - std::map &BFs, - std::set & -) { +void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) { if 
(!BC.isX86()) return; - for (auto &It : BFs) { + for (auto &It : BC.getBinaryFunctions()) { auto &Function = It.second; if (!shouldOptimize(Function)) @@ -1081,9 +1062,7 @@ void Peepholes::removeUselessCondBranches(BinaryContext &BC, } } -void Peepholes::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) { +void Peepholes::runOnFunctions(BinaryContext &BC) { const char Opts = std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), @@ -1094,7 +1073,7 @@ void Peepholes::runOnFunctions(BinaryContext &BC, if (Opts == opts::PEEP_NONE || !BC.isX86()) return; - for (auto &It : BFs) { + for (auto &It : BC.getBinaryFunctions()) { auto &Function = It.second; if (shouldOptimize(Function)) { if (Opts & opts::PEEP_SHORTEN) @@ -1198,12 +1177,8 @@ bool SimplifyRODataLoads::simplifyRODataLoads( return NumLocalLoadsSimplified > 0; } -void SimplifyRODataLoads::runOnFunctions( - BinaryContext &BC, - std::map &BFs, - std::set & -) { - for (auto &It : BFs) { +void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) { + for (auto &It : BC.getBinaryFunctions()) { auto &Function = It.second; if (shouldOptimize(Function) && simplifyRODataLoads(BC, Function)) { Modified.insert(&Function); @@ -1217,9 +1192,7 @@ void SimplifyRODataLoads::runOnFunctions( << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n"; } -void AssignSections::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &) { +void AssignSections::runOnFunctions(BinaryContext &BC) { for (auto *Function : BC.getInjectedBinaryFunctions()) { Function->setCodeSectionName(BC.getInjectedCodeSectionName()); Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName()); @@ -1230,7 +1203,7 @@ void AssignSections::runOnFunctions(BinaryContext &BC, return; const auto UseColdSection = BC.NumProfiledFuncs > 0; - for (auto &BFI : BFs) { + for (auto &BFI : BC.getBinaryFunctions()) { auto &Function = BFI.second; if (opts::isHotTextMover(Function)) { 
Function.setCodeSectionName(BC.getHotTextMoverSectionName()); @@ -1252,15 +1225,13 @@ void AssignSections::runOnFunctions(BinaryContext &BC, } void -PrintProgramStats::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &) { +PrintProgramStats::runOnFunctions(BinaryContext &BC) { uint64_t NumSimpleFunctions{0}; uint64_t NumStaleProfileFunctions{0}; uint64_t NumNonSimpleProfiledFunctions{0}; std::vector ProfiledFunctions; const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; - for (auto &BFI : BFs) { + for (auto &BFI : BC.getBinaryFunctions()) { auto &Function = BFI.second; if (!Function.isSimple()) { if (Function.hasProfile()) { @@ -1356,7 +1327,7 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC, std::vector Functions; std::map Stats; - for (const auto &BFI : BFs) { + for (const auto &BFI : BC.getBinaryFunctions()) { const auto &BF = BFI.second; if (shouldOptimize(BF) && BF.hasValidProfile()) { Functions.push_back(&BF); @@ -1462,7 +1433,7 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC, // Collect and print information about suboptimal code layout on input. 
if (opts::ReportBadLayout) { std::vector SuboptimalFuncs; - for (auto &BFI : BFs) { + for (auto &BFI : BC.getBinaryFunctions()) { const auto &BF = BFI.second; if (!BF.hasValidProfile()) continue; @@ -1500,11 +1471,8 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC, } } -void InstructionLowering::runOnFunctions( - BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) { - for (auto &BFI : BFs) { +void InstructionLowering::runOnFunctions(BinaryContext &BC) { + for (auto &BFI : BC.getBinaryFunctions()) { for (auto &BB : BFI.second) { for (auto &Instruction : BB) { BC.MIB->lowerTailCall(Instruction); @@ -1513,13 +1481,10 @@ void InstructionLowering::runOnFunctions( } } -void StripRepRet::runOnFunctions( - BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) { +void StripRepRet::runOnFunctions(BinaryContext &BC) { uint64_t NumPrefixesRemoved = 0; uint64_t NumBytesSaved = 0; - for (auto &BFI : BFs) { + for (auto &BFI : BC.getBinaryFunctions()) { for (auto &BB : BFI.second) { auto LastInstRIter = BB.getLastNonPseudo(); if (LastInstRIter == BB.rend() || @@ -1539,15 +1504,13 @@ void StripRepRet::runOnFunctions( } } -void InlineMemcpy::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) { +void InlineMemcpy::runOnFunctions(BinaryContext &BC) { if (!BC.isX86()) return; uint64_t NumInlined = 0; uint64_t NumInlinedDyno = 0; - for (auto &BFI : BFs) { + for (auto &BFI : BC.getBinaryFunctions()) { for (auto &BB : BFI.second) { for(auto II = BB.begin(); II != BB.end(); ++II) { auto &Inst = *II; diff --git a/bolt/src/Passes/BinaryPasses.h b/bolt/src/Passes/BinaryPasses.h index 4cf7fee9c92c..2ad31cfec754 100644 --- a/bolt/src/Passes/BinaryPasses.h +++ b/bolt/src/Passes/BinaryPasses.h @@ -53,9 +53,7 @@ class BinaryFunctionPass { virtual bool shouldPrint(const BinaryFunction &BF) const; /// Execute this pass on the given functions. 
- virtual void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) = 0; + virtual void runOnFunctions(BinaryContext &BC) = 0; }; /// A pass to print program-wide dynostats. @@ -79,10 +77,8 @@ class DynoStatsPrintPass : public BinaryFunctionPass { return false; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override { - const auto NewDynoStats = getDynoStats(BFs); + void runOnFunctions(BinaryContext &BC) override { + const auto NewDynoStats = getDynoStats(BC.getBinaryFunctions()); const auto Changed = (NewDynoStats != PrevDynoStats); outs() << "BOLT-INFO: program-wide dynostats " << Title << (Changed ? "" : " (no change)") << ":\n\n" @@ -112,9 +108,7 @@ class EliminateUnreachableBlocks : public BinaryFunctionPass { bool shouldPrint(const BinaryFunction &BF) const override { return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0; } - void runOnFunctions(BinaryContext&, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext&) override; }; // Reorder the basic blocks for each function based on hotness. @@ -164,9 +158,7 @@ class ReorderBasicBlocks : public BinaryFunctionPass { return "reordering"; } bool shouldPrint(const BinaryFunction &BF) const override; - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; /// Sync local branches with CFG. 
@@ -178,9 +170,7 @@ class FixupBranches : public BinaryFunctionPass { const char *getName() const override { return "fix-branches"; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; /// Fix the CFI state and exception handling information after all other @@ -193,9 +183,7 @@ class FinalizeFunctions : public BinaryFunctionPass { const char *getName() const override { return "finalize-functions"; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; /// Convert and remove all BOLT-related annotations before LLVM code emission. @@ -207,9 +195,7 @@ class LowerAnnotations : public BinaryFunctionPass { const char *getName() const override { return "lower-annotations"; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; /// An optimization to simplify conditional tail calls by removing @@ -281,9 +267,7 @@ class SimplifyConditionalTailCalls : public BinaryFunctionPass { bool shouldPrint(const BinaryFunction &BF) const override { return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; /// Perform simple peephole optimizations. 
@@ -313,9 +297,7 @@ class Peepholes : public BinaryFunctionPass { const char *getName() const override { return "peepholes"; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; /// An optimization to simplify loads from read-only sections.The pass converts @@ -348,9 +330,7 @@ class SimplifyRODataLoads : public BinaryFunctionPass { bool shouldPrint(const BinaryFunction &BF) const override { return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; /// Assign output sections to all functions. @@ -363,9 +343,7 @@ class AssignSections : public BinaryFunctionPass { const char *getName() const override { return "assign-sections"; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; /// Prints a list of the top 100 functions sorted by a set of @@ -381,9 +359,7 @@ class PrintProgramStats : public BinaryFunctionPass { bool shouldPrint(const BinaryFunction &) const override { return false; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; /// Pass for lowering any instructions that we have raised and that have @@ -397,9 +373,7 @@ class InstructionLowering : public BinaryFunctionPass { return "inst-lowering"; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; /// Pass for stripping 'repz' from 'repz retq' sequence of instructions. 
@@ -412,9 +386,7 @@ class StripRepRet : public BinaryFunctionPass { return "strip-rep-ret"; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; /// Pass for inlining calls to memcpy using 'rep movsb' on X86. @@ -427,9 +399,7 @@ class InlineMemcpy : public BinaryFunctionPass { return "inline-memcpy"; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; enum FrameOptimizationType : char { diff --git a/bolt/src/Passes/FrameAnalysis.cpp b/bolt/src/Passes/FrameAnalysis.cpp index 251d160c2e97..485394be90de 100644 --- a/bolt/src/Passes/FrameAnalysis.cpp +++ b/bolt/src/Passes/FrameAnalysis.cpp @@ -485,7 +485,7 @@ bool FrameAnalysis::restoreFrameIndex(BinaryFunction &BF) { } void FrameAnalysis::cleanAnnotations() { - for (auto &I : BFs) { + for (auto &I : BC.getBinaryFunctions()) { for (auto &BB : I.second) { for (auto &Inst : BB) { BC.MIB->removeAnnotation(Inst, "ArgAccessEntry"); @@ -496,16 +496,15 @@ void FrameAnalysis::cleanAnnotations() { } FrameAnalysis::FrameAnalysis(BinaryContext &BC, - std::map &BFs, BinaryFunctionCallGraph &CG) - : BC(BC), BFs(BFs) { + : BC(BC) { // Position 0 of the vector should be always associated with "assume access // everything". 
ArgAccessesVector.emplace_back(ArgAccesses(/*AssumeEverything*/ true)); traverseCG(CG); - for (auto &I : BFs) { + for (auto &I : BC.getBinaryFunctions()) { auto Count = I.second.getExecutionCount(); if (Count != BinaryFunction::COUNT_NO_PROFILE) CountDenominator += Count; diff --git a/bolt/src/Passes/FrameAnalysis.h b/bolt/src/Passes/FrameAnalysis.h index 69c188c2e2e3..0ab24b7fba34 100644 --- a/bolt/src/Passes/FrameAnalysis.h +++ b/bolt/src/Passes/FrameAnalysis.h @@ -93,7 +93,7 @@ raw_ostream &operator<<(raw_ostream &OS, /// Initialization: /// /// FrameAnalysis FA(PrintPass); -/// FA.runOnFunctions(BC, BFs, LargeFunctions); +/// FA.runOnFunctions(BC); /// /// Usage (fetching frame access information about a given instruction): /// @@ -113,7 +113,6 @@ raw_ostream &operator<<(raw_ostream &OS, /// class FrameAnalysis { BinaryContext &BC; - std::map &BFs; /// Map functions to the set of tuples representing /// accesses to stack positions that belongs to caller @@ -170,7 +169,6 @@ class FrameAnalysis { public: explicit FrameAnalysis(BinaryContext &BC, - std::map &BFs, BinaryFunctionCallGraph &CG); /// Return true if we could fully analyze \p Func diff --git a/bolt/src/Passes/FrameOptimizer.cpp b/bolt/src/Passes/FrameOptimizer.cpp index 93818c8c416d..832a15e7f023 100644 --- a/bolt/src/Passes/FrameOptimizer.cpp +++ b/bolt/src/Passes/FrameOptimizer.cpp @@ -221,20 +221,18 @@ void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA, } } -void FrameOptimizerPass::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) { +void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) { if (opts::FrameOptimization == FOP_NONE) return; // Run FrameAnalysis pass - BinaryFunctionCallGraph CG = buildCallGraph(BC, BFs); - FrameAnalysis FA(BC, BFs, CG); - RegAnalysis RA(BC, &BFs, &CG); + BinaryFunctionCallGraph CG = buildCallGraph(BC); + FrameAnalysis FA(BC, CG); + RegAnalysis RA(BC, &BC.getBinaryFunctions(), &CG); // Our main loop: perform caller-saved 
register optimizations, then // callee-saved register optimizations (shrink wrapping). - for (auto &I : BFs) { + for (auto &I : BC.getBinaryFunctions()) { if (!FA.hasFrameInfo(I.second)) continue; // Restrict pass execution if user asked to only run on hot functions diff --git a/bolt/src/Passes/FrameOptimizer.h b/bolt/src/Passes/FrameOptimizer.h index b868e65fb974..77ed31cc5406 100644 --- a/bolt/src/Passes/FrameOptimizer.h +++ b/bolt/src/Passes/FrameOptimizer.h @@ -108,9 +108,7 @@ class FrameOptimizerPass : public BinaryFunctionPass { } /// Pass entry point - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; bool shouldPrint(const BinaryFunction &BF) const override { return BinaryFunctionPass::shouldPrint(BF) && FuncsChanged.count(&BF) > 0; diff --git a/bolt/src/Passes/IdenticalCodeFolding.cpp b/bolt/src/Passes/IdenticalCodeFolding.cpp index 65bfbf1f31e7..d41e48d29716 100644 --- a/bolt/src/Passes/IdenticalCodeFolding.cpp +++ b/bolt/src/Passes/IdenticalCodeFolding.cpp @@ -281,10 +281,8 @@ bool isIdenticalWith(const BinaryFunction &A, const BinaryFunction &B, namespace llvm { namespace bolt { -void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &) { - const auto OriginalFunctionCount = BFs.size(); +void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) { + const auto OriginalFunctionCount = BC.getBinaryFunctions().size(); uint64_t NumFunctionsFolded = 0; uint64_t NumJTFunctionsFolded = 0; uint64_t BytesSavedEstimate = 0; @@ -312,7 +310,7 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC, // be folded. 
std::unordered_map, KeyHash, KeyCongruent> CongruentBuckets; - for (auto &BFI : BFs) { + for (auto &BFI : BC.getBinaryFunctions()) { auto &BF = BFI.second; if (!shouldOptimize(BF) || BF.isFolded()) continue; @@ -375,7 +373,7 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC, BytesSavedEstimate += ChildBF->getSize(); CallsSavedEstimate += std::min(ChildBF->getKnownExecutionCount(), ParentBF->getKnownExecutionCount()); - BC.foldFunction(*ChildBF, *ParentBF, BFs); + BC.foldFunction(*ChildBF, *ParentBF); ++NumFoldedLastIteration; diff --git a/bolt/src/Passes/IdenticalCodeFolding.h b/bolt/src/Passes/IdenticalCodeFolding.h index 708fdb9a0107..5e42477749e9 100644 --- a/bolt/src/Passes/IdenticalCodeFolding.h +++ b/bolt/src/Passes/IdenticalCodeFolding.h @@ -30,9 +30,7 @@ class IdenticalCodeFolding : public BinaryFunctionPass { const char *getName() const override { return "identical-code-folding"; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; } // namespace bolt diff --git a/bolt/src/Passes/IndirectCallPromotion.cpp b/bolt/src/Passes/IndirectCallPromotion.cpp index 941d091f7d69..4457585d6a13 100644 --- a/bolt/src/Passes/IndirectCallPromotion.cpp +++ b/bolt/src/Passes/IndirectCallPromotion.cpp @@ -1082,14 +1082,12 @@ IndirectCallPromotion::printCallsiteInfo(const BinaryBasicBlock *BB, }); } -void IndirectCallPromotion::runOnFunctions( - BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions -) { +void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) { if (opts::IndirectCallPromotion == ICP_NONE) return; + auto &BFs = BC.getBinaryFunctions(); + const bool OptimizeCalls = (opts::IndirectCallPromotion == ICP_CALLS || opts::IndirectCallPromotion == ICP_ALL); @@ -1100,7 +1098,7 @@ void IndirectCallPromotion::runOnFunctions( std::unique_ptr RA; std::unique_ptr CG; if (OptimizeJumpTables) { - CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, 
BFs))); + CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC))); RA.reset(new RegAnalysis(BC, &BFs, &*CG)); } diff --git a/bolt/src/Passes/IndirectCallPromotion.h b/bolt/src/Passes/IndirectCallPromotion.h index f8ea575b2fa2..7c42a5f76da4 100644 --- a/bolt/src/Passes/IndirectCallPromotion.h +++ b/bolt/src/Passes/IndirectCallPromotion.h @@ -239,9 +239,7 @@ class IndirectCallPromotion : public BinaryFunctionPass { bool shouldPrint(const BinaryFunction &BF) const override { return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; } // namespace bolt diff --git a/bolt/src/Passes/Inliner.cpp b/bolt/src/Passes/Inliner.cpp index 4836240c662a..106bd6205929 100644 --- a/bolt/src/Passes/Inliner.cpp +++ b/bolt/src/Passes/Inliner.cpp @@ -248,9 +248,8 @@ Inliner::InliningInfo Inliner::getInliningInfo(const BinaryFunction &BF) const { } void -Inliner::findInliningCandidates(BinaryContext &BC, - const std::map &BFs) { - for (const auto &BFI : BFs) { +Inliner::findInliningCandidates(BinaryContext &BC) { + for (const auto &BFI : BC.getBinaryFunctions()) { const auto &Function = BFI.second; const auto InlInfo = getInliningInfo(Function); if (InlInfo.Type != INL_NONE) @@ -532,16 +531,14 @@ bool Inliner::inlineCallsInFunction(BinaryFunction &Function) { return DidInlining; } -void Inliner::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &) { +void Inliner::runOnFunctions(BinaryContext &BC) { opts::syncOptions(); if (!opts::inliningEnabled()) return; uint64_t TotalSize = 0; - for (auto &BFI : BFs) + for (auto &BFI : BC.getBinaryFunctions()) TotalSize += BFI.second.getSize(); bool InlinedOnce; @@ -553,10 +550,10 @@ void Inliner::runOnFunctions(BinaryContext &BC, InlinedOnce = false; InliningCandidates.clear(); - findInliningCandidates(BC, BFs); + findInliningCandidates(BC); std::vector ConsideredFunctions; 
- for (auto &BFI : BFs) { + for (auto &BFI : BC.getBinaryFunctions()) { auto &Function = BFI.second; if (!shouldOptimize(Function)) continue; diff --git a/bolt/src/Passes/Inliner.h b/bolt/src/Passes/Inliner.h index a890a9e27e26..db6f89195b23 100644 --- a/bolt/src/Passes/Inliner.h +++ b/bolt/src/Passes/Inliner.h @@ -68,8 +68,7 @@ class Inliner : public BinaryFunctionPass { /// Return the size in bytes of a tail call instruction. uint64_t getSizeOfTailCallInst(const BinaryContext &BC); - void findInliningCandidates(BinaryContext &BC, - const std::map &BFs); + void findInliningCandidates(BinaryContext &BC); bool inlineCallsInFunction(BinaryFunction &Function); @@ -97,9 +96,7 @@ class Inliner : public BinaryFunctionPass { return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; } // namespace bolt diff --git a/bolt/src/Passes/JTFootprintReduction.cpp b/bolt/src/Passes/JTFootprintReduction.cpp index 52d32f55daea..480ed56d62dd 100644 --- a/bolt/src/Passes/JTFootprintReduction.cpp +++ b/bolt/src/Passes/JTFootprintReduction.cpp @@ -243,21 +243,17 @@ void JTFootprintReduction::optimizeFunction(BinaryContext &BC, } } -void JTFootprintReduction::runOnFunctions( - BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions -) { +void JTFootprintReduction::runOnFunctions(BinaryContext &BC) { if (opts::JumpTables == JTS_BASIC && BC.HasRelocations) return; std::unique_ptr RA; std::unique_ptr CG; if (!opts::JTFootprintOnlyPIC) { - CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs))); - RA.reset(new RegAnalysis(BC, &BFs, &*CG)); + CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC))); + RA.reset(new RegAnalysis(BC, &BC.getBinaryFunctions(), &*CG)); } - for (auto &BFIt : BFs) { + for (auto &BFIt : BC.getBinaryFunctions()) { auto &Function = BFIt.second; if (!Function.isSimple() || 
!opts::shouldProcess(Function)) diff --git a/bolt/src/Passes/JTFootprintReduction.h b/bolt/src/Passes/JTFootprintReduction.h index 61726619bf23..e2b9f8335f1f 100644 --- a/bolt/src/Passes/JTFootprintReduction.h +++ b/bolt/src/Passes/JTFootprintReduction.h @@ -75,9 +75,7 @@ class JTFootprintReduction : public BinaryFunctionPass { bool shouldPrint(const BinaryFunction &BF) const override { return BinaryFunctionPass::shouldPrint(BF) && Modified.count(&BF) > 0; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; } // namespace bolt diff --git a/bolt/src/Passes/LongJmp.cpp b/bolt/src/Passes/LongJmp.cpp index f4810be5fb90..da6997ae5eb9 100644 --- a/bolt/src/Passes/LongJmp.cpp +++ b/bolt/src/Passes/LongJmp.cpp @@ -595,11 +595,9 @@ bool LongJmpPass::relax(BinaryFunction &Func) { return Modified; } -void LongJmpPass::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) { +void LongJmpPass::runOnFunctions(BinaryContext &BC) { outs() << "BOLT-INFO: Starting stub-insertion pass\n"; - auto Sorted = BinaryContext::getSortedFunctions(BFs); + auto Sorted = BC.getSortedFunctions(); bool Modified; uint32_t Iterations{0}; do { diff --git a/bolt/src/Passes/LongJmp.h b/bolt/src/Passes/LongJmp.h index 5bed5a30ac9c..1d317c89e55d 100644 --- a/bolt/src/Passes/LongJmp.h +++ b/bolt/src/Passes/LongJmp.h @@ -150,9 +150,7 @@ class LongJmpPass : public BinaryFunctionPass { const char *getName() const override { return "long-jmp"; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; } } diff --git a/bolt/src/Passes/PLTCall.cpp b/bolt/src/Passes/PLTCall.cpp index 966ff2e09192..7f5db420a015 100644 --- a/bolt/src/Passes/PLTCall.cpp +++ b/bolt/src/Passes/PLTCall.cpp @@ -43,15 +43,12 @@ PLT("plt", namespace llvm { namespace bolt { -void PLTCall::runOnFunctions( - BinaryContext &BC, 
- std::map &BFs, - std::set &) { +void PLTCall::runOnFunctions(BinaryContext &BC) { if (opts::PLT == OT_NONE) return; uint64_t NumCallsOptimized = 0; - for (auto &It : BFs) { + for (auto &It : BC.getBinaryFunctions()) { auto &Function = It.second; if (!shouldOptimize(Function)) continue; diff --git a/bolt/src/Passes/PLTCall.h b/bolt/src/Passes/PLTCall.h index 19daab31f2fb..8a27a4be1033 100644 --- a/bolt/src/Passes/PLTCall.h +++ b/bolt/src/Passes/PLTCall.h @@ -38,9 +38,7 @@ class PLTCall : public BinaryFunctionPass { bool shouldPrint(const BinaryFunction &BF) const override { return BinaryFunctionPass::shouldPrint(BF); } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; } // namespace bolt diff --git a/bolt/src/Passes/RegAnalysis.h b/bolt/src/Passes/RegAnalysis.h index 9c01fce5b8cb..a6416edcfe5c 100644 --- a/bolt/src/Passes/RegAnalysis.h +++ b/bolt/src/Passes/RegAnalysis.h @@ -36,7 +36,8 @@ class RegAnalysis { /// set of clobbered registers. 
BitVector getFunctionClobberList(const BinaryFunction *Func); - RegAnalysis(BinaryContext &BC, std::map *BFs, + RegAnalysis(BinaryContext &BC, + std::map *BFs, BinaryFunctionCallGraph *CG); /// Compute the set of registers \p Inst may read from, marking them in diff --git a/bolt/src/Passes/RegReAssign.cpp b/bolt/src/Passes/RegReAssign.cpp index 402cc796c910..64320b0c8384 100644 --- a/bolt/src/Passes/RegReAssign.cpp +++ b/bolt/src/Passes/RegReAssign.cpp @@ -339,7 +339,7 @@ bool RegReAssign::conservativePassOverFunction(BinaryContext &BC, void RegReAssign::setupAggressivePass(BinaryContext &BC, std::map &BFs) { setupConservativePass(BC, BFs); - CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC, BFs))); + CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC))); RA.reset(new RegAnalysis(BC, &BFs, &*CG)); GPRegs = BitVector(BC.MRI->getNumRegs(), false); @@ -380,18 +380,16 @@ void RegReAssign::setupConservativePass( }); } -void RegReAssign::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) { +void RegReAssign::runOnFunctions(BinaryContext &BC) { RegScore = std::vector(BC.MRI->getNumRegs(), 0); RankedRegs = std::vector(BC.MRI->getNumRegs(), 0); if (opts::AggressiveReAssign) - setupAggressivePass(BC, BFs); + setupAggressivePass(BC, BC.getBinaryFunctions()); else - setupConservativePass(BC, BFs); + setupConservativePass(BC, BC.getBinaryFunctions()); - for (auto &I : BFs) { + for (auto &I : BC.getBinaryFunctions()) { auto &Function = I.second; if (!Function.isSimple() || !opts::shouldProcess(Function)) diff --git a/bolt/src/Passes/RegReAssign.h b/bolt/src/Passes/RegReAssign.h index 602ae12c5d28..35da0ffc5bc7 100644 --- a/bolt/src/Passes/RegReAssign.h +++ b/bolt/src/Passes/RegReAssign.h @@ -58,9 +58,7 @@ class RegReAssign : public BinaryFunctionPass { return BinaryFunctionPass::shouldPrint(BF) && FuncsChanged.count(&BF) > 0; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void 
runOnFunctions(BinaryContext &BC) override; }; } } diff --git a/bolt/src/Passes/ReorderData.cpp b/bolt/src/Passes/ReorderData.cpp index 48b979dd3659..46edcc6ab07e 100644 --- a/bolt/src/Passes/ReorderData.cpp +++ b/bolt/src/Passes/ReorderData.cpp @@ -379,9 +379,7 @@ bool ReorderData::markUnmoveableSymbols(BinaryContext &BC, return FoundUnmoveable; } -void ReorderData::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) { +void ReorderData::runOnFunctions(BinaryContext &BC) { static const char* DefaultSections[] = { ".rodata", ".data", @@ -435,7 +433,8 @@ void ReorderData::runOnFunctions(BinaryContext &BC, std::tie(Order, SplitPointIdx) = sortedByCount(BC, *Section); } else { outs() << "BOLT-INFO: reorder-sections: ordering data by funcs\n"; - std::tie(Order, SplitPointIdx) = sortedByFunc(BC, *Section, BFs); + std::tie(Order, SplitPointIdx) = + sortedByFunc(BC, *Section, BC.getBinaryFunctions()); } auto SplitPoint = Order.begin() + SplitPointIdx; diff --git a/bolt/src/Passes/ReorderData.h b/bolt/src/Passes/ReorderData.h index 44698d65f5f0..59ddc0ec00a7 100644 --- a/bolt/src/Passes/ReorderData.h +++ b/bolt/src/Passes/ReorderData.h @@ -57,9 +57,7 @@ class ReorderData : public BinaryFunctionPass { return "reorder-data"; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; } // namespace bolt diff --git a/bolt/src/Passes/ReorderFunctions.cpp b/bolt/src/Passes/ReorderFunctions.cpp index b332d35101de..879ca581f48d 100644 --- a/bolt/src/Passes/ReorderFunctions.cpp +++ b/bolt/src/Passes/ReorderFunctions.cpp @@ -276,20 +276,18 @@ std::vector readFunctionOrderFile() { } -void ReorderFunctions::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) { +void ReorderFunctions::runOnFunctions(BinaryContext &BC) { if (!BC.HasRelocations && opts::ReorderFunctions != RT_NONE) { errs() << "BOLT-ERROR: Function reordering only works when " 
<< "relocs are enabled.\n"; exit(1); } + auto &BFs = BC.getBinaryFunctions(); if (opts::ReorderFunctions != RT_NONE && opts::ReorderFunctions != RT_EXEC_COUNT && opts::ReorderFunctions != RT_USER) { Cg = buildCallGraph(BC, - BFs, [](const BinaryFunction &BF) { if (!BF.hasProfile()) return true; diff --git a/bolt/src/Passes/ReorderFunctions.h b/bolt/src/Passes/ReorderFunctions.h index 1b1c58021dfd..57edd278a79e 100644 --- a/bolt/src/Passes/ReorderFunctions.h +++ b/bolt/src/Passes/ReorderFunctions.h @@ -41,9 +41,7 @@ class ReorderFunctions : public BinaryFunctionPass { const char *getName() const override { return "reorder-functions"; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; } // namespace bolt diff --git a/bolt/src/Passes/RetpolineInsertion.cpp b/bolt/src/Passes/RetpolineInsertion.cpp index 29feb3cfa428..ba7085fc29d8 100644 --- a/bolt/src/Passes/RetpolineInsertion.cpp +++ b/bolt/src/Passes/RetpolineInsertion.cpp @@ -273,10 +273,7 @@ IndirectBranchInfo::IndirectBranchInfo(MCInst &Inst, MCPlusBuilder &MIB) { } } -void RetpolineInsertion::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) { - +void RetpolineInsertion::runOnFunctions(BinaryContext &BC) { if (!opts::InsertRetpolines) return; @@ -287,7 +284,7 @@ void RetpolineInsertion::runOnFunctions(BinaryContext &BC, auto &MIB = *BC.MIB; uint32_t RetpolinedBranches = 0; - for (auto &It : BFs) { + for (auto &It : BC.getBinaryFunctions()) { auto &Function = It.second; for (auto &BB : Function) { for (auto It = BB.begin(); It != BB.end(); ++It) { diff --git a/bolt/src/Passes/RetpolineInsertion.h b/bolt/src/Passes/RetpolineInsertion.h index e3cf5bb3675b..f48a9c2a69e3 100644 --- a/bolt/src/Passes/RetpolineInsertion.h +++ b/bolt/src/Passes/RetpolineInsertion.h @@ -73,9 +73,7 @@ class RetpolineInsertion : public BinaryFunctionPass { const char *getName() const override { return 
"retpoline-insertion"; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; } // namespace bolt } // namespace llvm diff --git a/bolt/src/Passes/StokeInfo.cpp b/bolt/src/Passes/StokeInfo.cpp index 4ea885e72761..fb8d8763a352 100644 --- a/bolt/src/Passes/StokeInfo.cpp +++ b/bolt/src/Passes/StokeInfo.cpp @@ -137,10 +137,7 @@ bool StokeInfo::checkFunction(const BinaryContext &BC, BinaryFunction &BF, return true; } -void StokeInfo::runOnFunctions( - BinaryContext &BC, - std::map &BFs, - std::set &) { +void StokeInfo::runOnFunctions(BinaryContext &BC) { outs() << "STOKE-INFO: begin of stoke pass\n"; std::ofstream Outfile; @@ -156,8 +153,8 @@ void StokeInfo::runOnFunctions( DEBUG(dbgs() << "\tTripleName " << BC.TripleName << "\n"); DEBUG(dbgs() << "\tgetNumRegs " << BC.MRI->getNumRegs() << "\n"); - auto CG = buildCallGraph(BC, BFs); - RegAnalysis RA(BC, &BFs, &CG); + auto CG = buildCallGraph(BC); + RegAnalysis RA(BC, &BC.getBinaryFunctions(), &CG); NumRegs = BC.MRI->getNumRegs(); assert(NumRegs > 0 && "STOKE-INFO: the target register number is incorrect!"); @@ -174,7 +171,7 @@ void StokeInfo::runOnFunctions( StokeFuncInfo FuncInfo; // analyze all functions FuncInfo.printCsvHeader(Outfile); - for (auto &BF : BFs) { + for (auto &BF : BC.getBinaryFunctions()) { DataflowInfoManager DInfo(BC, BF.second, &RA/*RA.get()*/, nullptr); FuncInfo.reset(); if (checkFunction(BC, BF.second, DInfo, RA, FuncInfo)) { diff --git a/bolt/src/Passes/StokeInfo.h b/bolt/src/Passes/StokeInfo.h index d63a77668500..1a4d0117b00f 100644 --- a/bolt/src/Passes/StokeInfo.h +++ b/bolt/src/Passes/StokeInfo.h @@ -138,9 +138,7 @@ class StokeInfo : public BinaryFunctionPass { DataflowInfoManager &DInfo, RegAnalysis &RA, StokeFuncInfo &FuncInfo); - void runOnFunctions(BinaryContext &BC, std::map &BFs, - std::set &LargeFunctions) override; - + void runOnFunctions(BinaryContext &BC) override; }; } // namespace bolt 
diff --git a/bolt/src/Passes/ValidateInternalCalls.cpp b/bolt/src/Passes/ValidateInternalCalls.cpp index 28f9d3075a7c..e56f75d292b9 100644 --- a/bolt/src/Passes/ValidateInternalCalls.cpp +++ b/bolt/src/Passes/ValidateInternalCalls.cpp @@ -293,15 +293,13 @@ bool ValidateInternalCalls::analyzeFunction(BinaryFunction &Function) const { return true; } -void ValidateInternalCalls::runOnFunctions( - BinaryContext &BC, std::map &BFs, - std::set &LargeFunctions) { +void ValidateInternalCalls::runOnFunctions(BinaryContext &BC) { if (!BC.isX86()) return; // Look for functions that need validation. This should be pretty rare. std::set NeedsValidation; - for (auto &BFI : BFs) { + for (auto &BFI : BC.getBinaryFunctions()) { BinaryFunction &Function = BFI.second; for (auto &BB : Function) { for(auto &Inst : BB) { diff --git a/bolt/src/Passes/ValidateInternalCalls.h b/bolt/src/Passes/ValidateInternalCalls.h index de5e4b6e5c4b..a5e324b060af 100644 --- a/bolt/src/Passes/ValidateInternalCalls.h +++ b/bolt/src/Passes/ValidateInternalCalls.h @@ -58,9 +58,7 @@ class ValidateInternalCalls : public BinaryFunctionPass { return "validate-internal-calls"; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; private: /// Fix the CFG to take into consideration internal calls that do not diff --git a/bolt/src/Passes/VeneerElimination.cpp b/bolt/src/Passes/VeneerElimination.cpp index cf2ab8238f1f..73128553ab08 100644 --- a/bolt/src/Passes/VeneerElimination.cpp +++ b/bolt/src/Passes/VeneerElimination.cpp @@ -32,12 +32,11 @@ EliminateVeneers("elim-link-veneers", namespace llvm { namespace bolt { -void VeneerElimination::runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) { +void VeneerElimination::runOnFunctions(BinaryContext &BC) { if (!opts::EliminateVeneers || !BC.isAArch64()) return; + auto &BFs = BC.getBinaryFunctions(); std::unordered_map VeneerDestinations; uint64_t 
VeneersCount = 0; for (auto It = BFs.begin(); It != BFs.end();) { diff --git a/bolt/src/Passes/VeneerElimination.h b/bolt/src/Passes/VeneerElimination.h index 9948ef890e5d..4b1770c36536 100644 --- a/bolt/src/Passes/VeneerElimination.h +++ b/bolt/src/Passes/VeneerElimination.h @@ -30,9 +30,7 @@ class VeneerElimination : public BinaryFunctionPass { const char *getName() const override { return "veneer-elimination"; } - void runOnFunctions(BinaryContext &BC, - std::map &BFs, - std::set &LargeFunctions) override; + void runOnFunctions(BinaryContext &BC) override; }; } // namespace bolt } // namespace llvm diff --git a/bolt/src/ProfileWriter.cpp b/bolt/src/ProfileWriter.cpp index 00cc32bbfb6d..3d1583b499fc 100644 --- a/bolt/src/ProfileWriter.cpp +++ b/bolt/src/ProfileWriter.cpp @@ -158,7 +158,7 @@ convert(const BinaryFunction &BF, yaml::bolt::BinaryFunctionProfile &YamlBF) { std::error_code ProfileWriter::writeProfile(const RewriteInstance &RI) { - const auto &Functions = RI.getFunctions(); + const auto &Functions = RI.getBinaryContext().getBinaryFunctions(); std::error_code EC; OS = make_unique(FileName, EC, sys::fs::F_None); diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 57f9d4bedf17..87823fe1b4bf 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -15,6 +15,7 @@ #include "BinaryFunction.h" #include "BinaryPassManager.h" #include "CacheMetrics.h" +#include "DWARFRewriter.h" #include "DataAggregator.h" #include "DataReader.h" #include "Exceptions.h" @@ -710,12 +711,15 @@ RewriteInstance::RewriteInstance(ELFObjectFileBase *File, DataReader &DR, File, DR, DWARFContext::create(*File, nullptr, DWARFContext::defaultErrorHandler, "", false))), - SHStrTab(StringTableBuilder::ELF) {} + SHStrTab(StringTableBuilder::ELF) { + if (opts::UpdateDebugSections) { + DebugInfoRewriter = llvm::make_unique(*BC, SectionPatchers); + } +} RewriteInstance::~RewriteInstance() {} void RewriteInstance::reset() { - 
BinaryFunctions.clear(); FileSymRefs.clear(); auto &DR = BC->DR; BC = createBinaryContext( @@ -728,8 +732,9 @@ void RewriteInstance::reset() { Out.reset(nullptr); EHFrame = nullptr; FailedAddresses.clear(); - RangesSectionsWriter.reset(); - LocationListWriter.reset(); + if (opts::UpdateDebugSections) { + DebugInfoRewriter = llvm::make_unique(*BC, SectionPatchers); + } } bool RewriteInstance::shouldDisassemble(BinaryFunction &BF) const { @@ -943,7 +948,8 @@ void RewriteInstance::run() { return; } - auto executeRewritePass = [&](const std::set &NonSimpleFunctions) { + auto executeRewritePass = [&](const std::set &NonSimpleFunctions, + bool ShouldSplit) { discoverStorage(); readSpecialSections(); adjustCommandLineOptions(); @@ -956,9 +962,12 @@ void RewriteInstance::run() { return; postProcessFunctions(); for (uint64_t Address : NonSimpleFunctions) { - auto FI = BinaryFunctions.find(Address); - assert(FI != BinaryFunctions.end() && "bad non-simple function address"); - FI->second.setSimple(false); + auto *BF = BC->getBinaryFunctionAtAddress(Address); + assert(BF && "bad non-simple function address"); + if (ShouldSplit) + BF->setLarge(true); + else + BF->setSimple(false); } if (opts::DiffOnly) return; @@ -972,7 +981,7 @@ void RewriteInstance::run() { << "\n"; unsigned PassNumber = 1; - executeRewritePass({}); + executeRewritePass({}, false); if (opts::AggregateOnly || opts::DiffOnly) return; @@ -982,7 +991,7 @@ void RewriteInstance::run() { // Emit again because now some functions have been split outs() << "BOLT: split-functions: starting pass " << PassNumber << "...\n"; reset(); - executeRewritePass({}); + executeRewritePass(LargeFunctions, true); } // Emit functions again ignoring functions which still didn't fit in their @@ -995,11 +1004,11 @@ void RewriteInstance::run() { PassNumber, LargeFunctions.size()) << "...\n"; reset(); - executeRewritePass(LargeFunctions); + executeRewritePass(LargeFunctions, false); } if (opts::UpdateDebugSections) - updateDebugInfo(); + 
DebugInfoRewriter->updateDebugInfo(); addBoltInfoSection(); @@ -1019,7 +1028,7 @@ void RewriteInstance::discoverFileObjects() { TimerGroupName, TimerGroupDesc, opts::TimeRewrite); FileSymRefs.clear(); - BinaryFunctions.clear(); + BC->getBinaryFunctions().clear(); BC->clearBinaryData(); // For local symbols we want to keep track of associated FILE symbol name for @@ -1350,12 +1359,15 @@ void RewriteInstance::discoverFileObjects() { } TentativeSize = SymbolSize; } - + BinaryFunction *BF{nullptr}; - auto BFI = BinaryFunctions.find(Address); - if (BFI != BinaryFunctions.end()) { + // Since function may not have yet obtained its real size, do a search + // using the list of registered functions instead of calling + // getBinaryFunctionAtAddress(). + auto BFI = BC->getBinaryFunctions().find(Address); + if (BFI != BC->getBinaryFunctions().end()) { BF = &BFI->second; - // Duplicate function name. Make sure everything matches before we add + // Duplicate the function name. Make sure everything matches before we add // an alternative name. 
if (SymbolSize != BF->getSize()) { if (opts::Verbosity >= 1) { @@ -1373,8 +1385,8 @@ void RewriteInstance::discoverFileObjects() { } else { auto Section = BC->getSectionForAddress(Address); assert(Section && "section for functions must be registered."); - BF = createBinaryFunction(UniqueName, *Section, Address, - SymbolSize, IsSimple); + BF = BC->createBinaryFunction(UniqueName, *Section, Address, + SymbolSize, IsSimple); } if (!AlternativeName.empty()) BF->addAlternativeName(AlternativeName); @@ -1391,26 +1403,29 @@ void RewriteInstance::discoverFileObjects() { for (const auto &FDEI : CFIRdWrt->getFDEs()) { const auto Address = FDEI.first; const auto *FDE = FDEI.second; - const auto *BF = getBinaryFunctionAtAddress(Address); - if (!BF) { - if (const auto *PartialBF = getBinaryFunctionContainingAddress(Address)) { - errs() << "BOLT-WARNING: FDE [0x" << Twine::utohexstr(Address) << ", 0x" - << Twine::utohexstr(Address + FDE->getAddressRange()) - << ") conflicts with function " << *PartialBF << '\n'; - } else { - if (opts::Verbosity >= 1) { - errs() << "BOLT-WARNING: FDE [0x" << Twine::utohexstr(Address) - << ", 0x" << Twine::utohexstr(Address + FDE->getAddressRange()) - << ") has no corresponding symbol table entry\n"; - } - auto Section = BC->getSectionForAddress(Address); - assert(Section && "cannot get section for address from FDE"); - std::string FunctionName = - "__BOLT_FDE_FUNCat" + Twine::utohexstr(Address).str(); - createBinaryFunction(FunctionName, *Section, Address, - FDE->getAddressRange(), true); - } + const auto *BF = BC->getBinaryFunctionAtAddress(Address); + if (BF) + continue; + + BF = BC->getBinaryFunctionContainingAddress(Address); + if (BF) { + errs() << "BOLT-WARNING: FDE [0x" << Twine::utohexstr(Address) << ", 0x" + << Twine::utohexstr(Address + FDE->getAddressRange()) + << ") conflicts with function " << *BF << '\n'; + continue; } + + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: FDE [0x" << Twine::utohexstr(Address) + << ", 0x" << 
Twine::utohexstr(Address + FDE->getAddressRange()) + << ") has no corresponding symbol table entry\n"; + } + auto Section = BC->getSectionForAddress(Address); + assert(Section && "cannot get section for address from FDE"); + std::string FunctionName = + "__BOLT_FDE_FUNCat" + Twine::utohexstr(Address).str(); + BC->createBinaryFunction(FunctionName, *Section, Address, + FDE->getAddressRange(), true); } if (!SeenFileName && BC->DR.hasLocalsWithFileName() && !opts::AllowStripped) { @@ -1431,7 +1446,7 @@ void RewriteInstance::discoverFileObjects() { uint64_t Address = cantFail(Symbol.getAddress(), "cannot get symbol address"); auto SymbolSize = ELFSymbolRef(Symbol).getSize(); - auto *BF = getBinaryFunctionContainingAddress(Address, true, true); + auto *BF = BC->getBinaryFunctionContainingAddress(Address, true, true); if (!BF) { // Stray marker continue; @@ -1471,8 +1486,8 @@ void RewriteInstance::disassemblePLT() { // Pseudo function for the start of PLT. The table could have a matching // FDE that we want to match to pseudo function. 
- createBinaryFunction("__BOLT_PLT_PSEUDO", *PLTSection, PLTAddress, 0, false, - PLTSize, PLTAlignment); + BC->createBinaryFunction("__BOLT_PLT_PSEUDO", *PLTSection, PLTAddress, 0, + false, PLTSize, PLTAlignment); for (uint64_t Offset = 0; Offset < PLTSection->getSize(); Offset += PLTSize) { uint64_t InstrSize; MCInst Instruction; @@ -1512,13 +1527,13 @@ void RewriteInstance::disassemblePLT() { const auto SymbolName = cantFail((*SymbolIter).getName()); std::string Name = SymbolName.str() + "@PLT"; const auto PtrSize = BC->AsmInfo->getCodePointerSize(); - auto *BF = createBinaryFunction(Name, - *PLTSection, - InstrAddr, - 0, - /*IsSimple=*/false, - PLTSize, - PLTAlignment); + auto *BF = BC->createBinaryFunction(Name, + *PLTSection, + InstrAddr, + 0, + /*IsSimple=*/false, + PLTSize, + PLTAlignment); auto TargetSymbol = BC->registerNameAtAddress(SymbolName.str() + "@GOT", TargetAddress, PtrSize, @@ -1532,19 +1547,20 @@ void RewriteInstance::disassemblePLT() { if (PLTGOTSection) { // Check if we need to create a function for .plt.got. Some linkers // (depending on the version) would mark it with FDE while others wouldn't. 
- if (!getBinaryFunctionAtAddress(PLTGOTSection->getAddress())) { - createBinaryFunction("__BOLT_PLT_GOT_PSEUDO", - *PLTGOTSection, - PLTGOTSection->getAddress(), - 0, - false, - PLTAlignment); + if (!BC->getBinaryFunctionAtAddress(PLTGOTSection->getAddress())) { + BC->createBinaryFunction("__BOLT_PLT_GOT_PSEUDO", + *PLTGOTSection, + PLTGOTSection->getAddress(), + 0, + false, + PLTAlignment); } } } void RewriteInstance::adjustFunctionBoundaries() { - for (auto BFI = BinaryFunctions.begin(), BFE = BinaryFunctions.end(); + for (auto BFI = BC->getBinaryFunctions().begin(), + BFE = BC->getBinaryFunctions().end(); BFI != BFE; ++BFI) { auto &Function = BFI->second; @@ -1666,21 +1682,6 @@ void RewriteInstance::relocateEHFrameSection() { EHFrame.parse(DE, createReloc); } -BinaryFunction *RewriteInstance::createBinaryFunction( - const std::string &Name, BinarySection &Section, uint64_t Address, - uint64_t Size, bool IsSimple, uint64_t SymbolSize, uint16_t Alignment) { - auto Result = BinaryFunctions.emplace( - Address, BinaryFunction(Name, Section, Address, Size, *BC, IsSimple)); - assert(Result.second == true && "unexpected duplicate function"); - auto *BF = &Result.first->second; - BC->registerNameAtAddress(Name, - Address, - SymbolSize ? 
SymbolSize : Size, - Alignment); - BC->setSymbolToFunctionMap(BF->getSymbol(), BF); - return BF; -} - ArrayRef RewriteInstance::getLSDAData() { return ArrayRef(LSDASection->getData(), LSDASection->getContents().size()); @@ -1714,7 +1715,6 @@ void RewriteInstance::readSpecialSections() { HasTextRelocations = (bool)BC->getUniqueSectionByName(".rela.text"); LSDASection = BC->getUniqueSectionByName(".gcc_except_table"); EHFrameSection = BC->getUniqueSectionByName(".eh_frame"); - GdbIndexSection = BC->getUniqueSectionByName(".gdb_index"); PLTSection = BC->getUniqueSectionByName(".plt"); GOTPLTSection = BC->getUniqueSectionByName(".got.plt"); PLTGOTSection = BC->getUniqueSectionByName(".plt.got"); @@ -2007,9 +2007,9 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { << "; addend = 0x" << Twine::utohexstr(Addend) << "; address = 0x" << Twine::utohexstr(Address) << "; in = "; - if (auto *Func = getBinaryFunctionContainingAddress(Rel.getOffset(), - false, - IsAArch64)) { + if (auto *Func = BC->getBinaryFunctionContainingAddress(Rel.getOffset(), + false, + IsAArch64)) { dbgs() << Func->getPrintName() << "\n"; } else { dbgs() << BC->getSectionForAddress(Rel.getOffset())->getName() << "\n"; @@ -2052,9 +2052,9 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { BinaryFunction *ContainingBF = nullptr; if (IsFromCode) { ContainingBF = - getBinaryFunctionContainingAddress(Rel.getOffset(), - /*CheckPastEnd*/ false, - /*UseMaxSize*/ IsAArch64); + BC->getBinaryFunctionContainingAddress(Rel.getOffset(), + /*CheckPastEnd*/ false, + /*UseMaxSize*/ IsAArch64); assert(ContainingBF && "cannot find function for address in code"); } @@ -2106,11 +2106,11 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { // Occasionally we may see a reference past the last byte of the function // typically as a result of __builtin_unreachable(). Check it here. 
- auto *ReferencedBF = getBinaryFunctionContainingAddress( + auto *ReferencedBF = BC->getBinaryFunctionContainingAddress( Address, /*CheckPastEnd*/ true, /*UseMaxSize*/ IsAArch64); if (!IsSectionRelocation) { - if (auto *BF = getBinaryFunctionContainingAddress(SymbolAddress)) { + if (auto *BF = BC->getBinaryFunctionContainingAddress(SymbolAddress)) { if (BF != ReferencedBF) { // It's possible we are referencing a function without referencing any // code, e.g. when taking a bitmask action on a function address. @@ -2154,7 +2154,8 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { // We check if a code non-pc-relative relocation is pointing // to a (fptr - 1). if (ContainingBF && !Relocation::isPCRelative(Rel.getType())) { - if (const auto *NextBF = getBinaryFunctionAtAddress(Address + 1)) { + if (const auto *NextBF = + BC->getBinaryFunctionAtAddress(Address + 1)) { errs() << "BOLT-WARNING: detected possible compiler " "de-virtualization bug: -1 addend used with " "non-pc-relative relocation against function " @@ -2378,7 +2379,7 @@ void RewriteInstance::readDebugInfo() { if (!opts::UpdateDebugSections) return; - BC->preprocessDebugInfo(BinaryFunctions); + BC->preprocessDebugInfo(); } void RewriteInstance::preprocessProfileData() { @@ -2387,14 +2388,15 @@ void RewriteInstance::preprocessProfileData() { NamedRegionTimer T("preprocessprofile", "pre-process profile data", TimerGroupName, TimerGroupDesc, opts::TimeRewrite); - DA.parseProfile(*BC.get(), BinaryFunctions); + DA.parseProfile(*BC.get()); } void RewriteInstance::processProfileData() { NamedRegionTimer T("processprofile", "process profile data", TimerGroupName, TimerGroupDesc, opts::TimeRewrite); + auto &BinaryFunctions = BC->getBinaryFunctions(); if (DA.started()) { - DA.processProfile(*BC.get(), BinaryFunctions); + DA.processProfile(*BC.get()); for (auto &BFI : BinaryFunctions) { auto &Function = BFI.second; @@ -2446,7 +2448,7 @@ void RewriteInstance::processProfileData() { void 
RewriteInstance::disassembleFunctions() { NamedRegionTimer T("disassembleFunctions", "disassemble functions", TimerGroupName, TimerGroupDesc, opts::TimeRewrite); - for (auto &BFI : BinaryFunctions) { + for (auto &BFI : BC->getBinaryFunctions()) { BinaryFunction &Function = BFI.second; if (!shouldDisassemble(Function)) { @@ -2487,7 +2489,7 @@ void RewriteInstance::disassembleFunctions() { // Post-process inter-procedural references ASAP as it may affect // functions we are about to disassemble next. for (const auto Addr : BC->InterproceduralReferences) { - auto *ContainingFunction = getBinaryFunctionContainingAddress(Addr); + auto *ContainingFunction = BC->getBinaryFunctionContainingAddress(Addr); if (ContainingFunction && ContainingFunction->getAddress() != Addr) { ContainingFunction->addEntryPoint(Addr); if (!BC->HasRelocations) { @@ -2521,9 +2523,9 @@ void RewriteInstance::disassembleFunctions() { } ContainingFunction = - getBinaryFunctionContainingAddress(Addr, - /*CheckPastEnd=*/false, - /*UseMaxSize=*/true); + BC->getBinaryFunctionContainingAddress(Addr, + /*CheckPastEnd=*/false, + /*UseMaxSize=*/true); // We are not going to overwrite non-simple functions, but for simple // ones - adjust the padding size. 
if (ContainingFunction && ContainingFunction->isSimple()) { @@ -2538,7 +2540,7 @@ void RewriteInstance::disassembleFunctions() { BC->InterproceduralReferences.clear(); } - for (auto &BFI : BinaryFunctions) { + for (auto &BFI : BC->getBinaryFunctions()) { BinaryFunction &Function = BFI.second; if (!shouldDisassemble(Function)) @@ -2582,7 +2584,7 @@ void RewriteInstance::disassembleFunctions() { void RewriteInstance::postProcessFunctions() { BC->TotalScore = 0; BC->SumExecutionCount = 0; - for (auto &BFI : BinaryFunctions) { + for (auto &BFI : BC->getBinaryFunctions()) { BinaryFunction &Function = BFI.second; if (Function.empty()) @@ -2614,7 +2616,7 @@ void RewriteInstance::postProcessFunctions() { void RewriteInstance::runOptimizationPasses() { NamedRegionTimer T("runOptimizationPasses", "run optimization passes", TimerGroupName, TimerGroupDesc, opts::TimeRewrite); - BinaryFunctionPassManager::runAllPasses(*BC, BinaryFunctions, LargeFunctions); + BinaryFunctionPassManager::runAllPasses(*BC); } // Helper function to emit the contents of a function via a MCStreamer object. @@ -2773,7 +2775,7 @@ void RewriteInstance::emitSections() { emitFunctions(Streamer.get()); if (!BC->HasRelocations && opts::UpdateDebugSections) - updateDebugLineInfoForNonSimpleFunctions(); + DebugInfoRewriter->updateDebugLineInfoForNonSimpleFunctions(); emitDataSections(Streamer.get()); @@ -2796,7 +2798,7 @@ void RewriteInstance::emitSections() { if (opts::UpdateDebugSections) { // Compute offsets of tables in .debug_line for each compile unit. - updateLineTableOffsets(); + DebugInfoRewriter->updateLineTableOffsets(); } // Get output object as ObjectFile. @@ -2856,7 +2858,7 @@ void RewriteInstance::emitSections() { // Once the code is emitted, we can rename function sections to actual // output sections and de-register sections used for emission. 
if (!BC->HasRelocations) { - for (auto &BFI : BinaryFunctions) { + for (auto &BFI : BC->getBinaryFunctions()) { auto &Function = BFI.second; if (auto Section = Function.getCodeSection()) BC->deregisterSection(*Section); @@ -2871,7 +2873,7 @@ void RewriteInstance::emitSections() { if (opts::PrintCacheMetrics) { outs() << "BOLT-INFO: cache metrics after emitting functions:\n"; - CacheMetrics::printAll(BC->getSortedFunctions(BinaryFunctions)); + CacheMetrics::printAll(BC->getSortedFunctions()); } if (opts::KeepTmp) @@ -2903,8 +2905,7 @@ void RewriteInstance::emitFunctions(MCStreamer *Streamer) { } // Emit functions in sorted order. - std::vector SortedFunctions = - BinaryContext::getSortedFunctions(BinaryFunctions); + std::vector SortedFunctions = BC->getSortedFunctions(); emit(SortedFunctions); // Emit functions added by BOLT. @@ -3057,7 +3058,7 @@ void RewriteInstance::mapCodeSections(orc::VModuleKey Key) { NewTextSectionStartAddress); } - for (auto &BFI : BinaryFunctions) { + for (auto &BFI : BC->getBinaryFunctions()) { auto &Function = BFI.second; if (!Function.isSimple() || !opts::shouldProcess(Function)) continue; @@ -3299,7 +3300,7 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { Function.getOutputAddress() + Function.getOutputSize()); }; - for (auto &BFI : BinaryFunctions) { + for (auto &BFI : BC->getBinaryFunctions()) { auto &Function = BFI.second; updateOutputValue(Function); } @@ -3377,7 +3378,7 @@ bool RewriteInstance::checkLargeFunctions() { return false; LargeFunctions.clear(); - for (auto &BFI : BinaryFunctions) { + for (auto &BFI : BC->getBinaryFunctions()) { auto &Function = BFI.second; // Ignore this function if we failed to map it to the output binary @@ -3991,7 +3992,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { for (const Elf_Sym &Symbol : cantFail(Obj->symbols(Section))) { auto NewSymbol = Symbol; - const auto *Function = getBinaryFunctionAtAddress(Symbol.st_value); + const auto *Function = 
BC->getBinaryFunctionAtAddress(Symbol.st_value); // Some section symbols may be mistakenly associated with the first // function emitted in the section. Dismiss if it is a section symbol. if (Function && @@ -4391,7 +4392,7 @@ void RewriteInstance::patchELFDynamic(ELFObjectFile *File) { } uint64_t RewriteInstance::getNewFunctionAddress(uint64_t OldAddress) { - const auto *Function = getBinaryFunctionAtAddress(OldAddress); + const auto *Function = BC->getBinaryFunctionAtAddress(OldAddress); if (!Function) return 0; return Function->getOutputAddress(); @@ -4427,7 +4428,7 @@ void RewriteInstance::rewriteFile() { // Overwrite functions in the output file. uint64_t CountOverwrittenFunctions = 0; uint64_t OverwrittenScore = 0; - for (auto &BFI : BinaryFunctions) { + for (auto &BFI : BC->getBinaryFunctions()) { auto &Function = BFI.second; if (Function.getImageAddress() == 0 || Function.getImageSize() == 0) @@ -4509,7 +4510,7 @@ void RewriteInstance::rewriteFile() { // Print function statistics. outs() << "BOLT: " << CountOverwrittenFunctions - << " out of " << BinaryFunctions.size() + << " out of " << BC->getBinaryFunctions().size() << " functions were overwritten.\n"; if (BC->TotalScore != 0) { double Coverage = OverwrittenScore / (double) BC->TotalScore * 100.0; @@ -4522,7 +4523,7 @@ void RewriteInstance::rewriteFile() { if (BC->HasRelocations && opts::TrapOldCode) { auto SavedPos = OS.tell(); // Overwrite function body to make sure we never execute these instructions. 
- for (auto &BFI : BinaryFunctions) { + for (auto &BFI : BC->getBinaryFunctions()) { auto &BF = BFI.second; if (!BF.getFileOffset()) continue; @@ -4696,48 +4697,3 @@ bool RewriteInstance::willOverwriteSection(StringRef SectionName) { auto Section = BC->getUniqueSectionByName(SectionName); return Section && Section->isAllocatable() && Section->isFinalized(); } - -BinaryFunction * -RewriteInstance::getBinaryFunctionContainingAddress(uint64_t Address, - bool CheckPastEnd, - bool UseMaxSize) { - auto FI = BinaryFunctions.upper_bound(Address); - if (FI == BinaryFunctions.begin()) - return nullptr; - --FI; - - const auto UsedSize = UseMaxSize ? FI->second.getMaxSize() - : FI->second.getSize(); - - if (Address >= FI->first + UsedSize + (CheckPastEnd ? 1 : 0)) - return nullptr; - return &FI->second; -} - -const BinaryFunction * -RewriteInstance::getBinaryFunctionAtAddress(uint64_t Address) const { - if (const auto *BD = BC->getBinaryDataAtAddress(Address)) - return BC->getFunctionForSymbol(BD->getSymbol()); - return nullptr; -} - -DebugAddressRangesVector RewriteInstance::translateModuleAddressRanges( - const DWARFAddressRangesVector &InputRanges) const { - DebugAddressRangesVector OutputRanges; - - for (const auto Range : InputRanges) { - auto BFI = BinaryFunctions.lower_bound(Range.LowPC); - while (BFI != BinaryFunctions.end()) { - const auto &Function = BFI->second; - if (Function.getAddress() >= Range.HighPC) - break; - const auto FunctionRanges = Function.getOutputAddressRanges(); - std::move(std::begin(FunctionRanges), - std::end(FunctionRanges), - std::back_inserter(OutputRanges)); - std::advance(BFI, 1); - } - } - - return OutputRanges; -} diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index ffdbfb5584a7..7e9e7f5cff6a 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -15,7 +15,6 @@ #define LLVM_TOOLS_LLVM_BOLT_REWRITE_INSTANCE_H #include "BinaryFunction.h" -#include "DebugData.h" #include 
"ExecutableFileMemoryManager.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" @@ -28,14 +27,13 @@ namespace llvm { -class DWARFContext; -class DWARFFrame; class ToolOutputFile; namespace bolt { class BinaryContext; class CFIReaderWriter; +class DWARFRewriter; class DataAggregator; class DataReader; class RewriteInstanceDiff; @@ -114,12 +112,6 @@ class RewriteInstance { /// Update debug information in the file for re-written code. void updateDebugInfo(); - /// Recursively update debug info for all DIEs in \p Unit. - /// If \p Function is not empty, it points to a function corresponding - /// to a parent DW_TAG_subprogram node of the current \p DIE. - void updateUnitDebugInfo(const DWARFDie DIE, - std::vector FunctionStack); - /// Return the list of code sections in the output order. std::vector getCodeSections(); @@ -138,10 +130,6 @@ class RewriteInstance { /// second pass to emit those functions in two parts. bool checkLargeFunctions(); - /// Updates debug line information for non-simple functions, which are not - /// rewritten. - void updateDebugLineInfoForNonSimpleFunctions(); - /// Rewrite back all functions (hopefully optimized) that fit in the original /// memory footprint for that function. If the function is now larger and does /// not fit in the binary, reject it and preserve the original version of the @@ -159,33 +147,6 @@ class RewriteInstance { "findSymbol failed"); } - /// Return BinaryFunction containing a given \p Address or nullptr if - /// no registered function has it. - /// - /// In a binary a function has somewhat vague boundaries. E.g. a function can - /// refer to the first byte past the end of the function, and it will still be - /// referring to this function, not the function following it in the address - /// space. Thus we have the following flags that allow to lookup for - /// a function where a caller has more context for the search. 
- /// - /// If \p CheckPastEnd is true and the \p Address falls on a byte - /// immediately following the last byte of some function and there's no other - /// function that starts there, then return the function as the one containing - /// the \p Address. This is useful when we need to locate functions for - /// references pointing immediately past a function body. - /// - /// If \p UseMaxSize is true, then include the space between this function - /// body and the next object in address ranges that we check. - BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address, - bool CheckPastEnd = false, - bool UseMaxSize = false); - - const BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address) const; - - /// Produce output address ranges based on input ranges for some module. - DebugAddressRangesVector translateModuleAddressRanges( - const DWARFAddressRangesVector &InputRanges) const; - private: /// Emit a single function. void emitFunction(MCStreamer &Streamer, BinaryFunction &Function, @@ -278,30 +239,6 @@ class RewriteInstance { /// rewritten binary. void patchBuildID(); - /// Computes output .debug_line line table offsets for each compile unit, - /// and updates stmt_list for a corresponding compile unit. - void updateLineTableOffsets(); - - /// Generate new contents for .debug_ranges and .debug_aranges section. - void finalizeDebugSections(); - - /// Patches the binary for DWARF address ranges (e.g. in functions and lexical - /// blocks) to be updated. - void updateDebugAddressRanges(); - - /// Rewrite .gdb_index section if present. - void updateGdbIndexSection(); - - /// Patches the binary for an object's address ranges to be updated. - /// The object can be a anything that has associated address ranges via either - /// DW_AT_low/high_pc or DW_AT_ranges (i.e. functions, lexical blocks, etc). - /// \p DebugRangesOffset is the offset in .debug_ranges of the object's - /// new address ranges in the output binary. 
- /// \p Unit Compile unit the object belongs to. - /// \p DIE is the object's DIE in the input binary. - void updateDWARFObjectAddressRanges(const DWARFDie DIE, - uint64_t DebugRangesOffset); - /// Return file offset corresponding to a given virtual address. uint64_t getFileOffsetFor(uint64_t Address) { assert(Address >= NewTextSegmentAddress && @@ -318,15 +255,6 @@ class RewriteInstance { /// of appending contents to it. bool willOverwriteSection(StringRef SectionName); - /// Construct BinaryFunction object and add it to internal maps. - BinaryFunction *createBinaryFunction(const std::string &Name, - BinarySection &Section, - uint64_t Address, - uint64_t Size, - bool IsSimple, - uint64_t SymbolSize = 0, - uint16_t Alignment = 0); - /// Return true if the function \p BF should be disassembled. bool shouldDisassemble(BinaryFunction &BF) const; @@ -343,6 +271,9 @@ class RewriteInstance { ".gdb_index", }; + using SectionPatchersType = + std::map>; + private: /// Get the contents of the LSDA section for this binary. ArrayRef getLSDAData(); @@ -410,18 +341,11 @@ class RewriteInstance { /// Store all non-zero symbols in this map for a quick address lookup. std::map FileSymRefs; - /// Store all functions in the binary, sorted by original address. - std::map BinaryFunctions; - - /// Stores and serializes information that will be put into the .debug_ranges - /// and .debug_aranges DWARF sections. - std::unique_ptr RangesSectionsWriter; - - std::unique_ptr LocationListWriter; + std::unique_ptr DebugInfoRewriter; /// Patchers used to apply simple changes to sections of the input binary. /// Maps section name -> patcher. - std::map> SectionPatchers; + SectionPatchersType SectionPatchers; /// Number of local symbols in newly written symbol table. uint64_t NumLocalSymbols{0}; @@ -450,17 +374,12 @@ class RewriteInstance { /// Contains relocations against .got.plt. ErrorOr RelaPLTSection{std::errc::bad_address}; - /// .gdb_index section. 
- ErrorOr GdbIndexSection{std::errc::bad_address}; - /// .note.gnu.build-id section. ErrorOr BuildIDSection{std::errc::bad_address}; /// A reference to the build-id bytes in the original binary StringRef BuildID; - uint64_t NewSymTabOffset{0}; - /// Keep track of functions we fail to write in the binary. We need to avoid /// rewriting CFI info for these functions. std::vector FailedAddresses; @@ -499,11 +418,6 @@ class RewriteInstance { return BC->TotalScore; } - /// Return all functions for this rewrite instance. - const std::map &getFunctions() const { - return BinaryFunctions; - } - /// Return the name of the input file. Optional getInputFileName() const { if (InputFile) From f8e1e336c861c5cb4c50d7412f41d2423cddf3e5 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 3 Apr 2019 22:31:12 -0700 Subject: [PATCH 516/904] [BOLT] Detect internal references into a middle of instruction Summary: Some instructions in assembly-written functions could reference 8-byte constants from another instructions using 4-byte offsets, presumably to save a couple of bytes. Detect such cases, and skip processing such functions until we teach BOLT how to handle references into a middle of instruction. 
(cherry picked from commit a589c2d8ceb2ed198e22df3536890f50363e7512) --- bolt/src/BinaryFunction.cpp | 25 ++++++++++++++++++++++++- bolt/src/BinaryFunction.h | 3 +++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 37c4536b34b7..711d7282e30d 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -1464,9 +1464,32 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { updateState(State::Disassembled); + postProcessEntryPoints(); + postProcessJumpTables(); } +void BinaryFunction::postProcessEntryPoints() { + for (auto Offset : EntryOffsets) { + if (!getInstructionAtOffset(Offset)) { + // On AArch64 there are legitimate reasons to have references past the + // end of the function, e.g. jump tables. + if (BC.isAArch64() && Offset == getSize()) { + continue; + } + + errs() << "BOLT-WARNING: reference in the middle of instruction " + "detected in function " << *this + << " at offset 0x" << Twine::utohexstr(Offset) << '\n'; + if (BC.HasRelocations) { + errs() << "BOLT-ERROR: unable to keep processing in relocation mode\n"; + exit(1); + } + setSimple(false); + } + } +} + void BinaryFunction::postProcessJumpTables() { // Create labels for all entries. for (auto &JTI : JumpTables) { @@ -1683,7 +1706,7 @@ bool BinaryFunction::buildCFG() { return false; } - if (!(CurrentState == State::Disassembled)) + if (CurrentState != State::Disassembled) return false; assert(BasicBlocks.empty() && "basic block list should be empty"); diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index be33aab8c50f..5d117e1d14d9 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -1960,6 +1960,9 @@ class BinaryFunction { /// Returns false if disassembly failed. void disassemble(ArrayRef FunctionData); + /// Validate entry points. + void postProcessEntryPoints(); + /// Post-processing for jump tables after disassembly. 
Since their /// boundaries are not known until all call sites are seen, we need this /// extra pass to perform any final adjustments. From a21457eef70dd0235d00c5af5fded6aa1f27f99b Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 1 Apr 2019 20:26:41 -0700 Subject: [PATCH 517/904] [DWARF][BOLT] Convert DW_AT_(low|high)_pc to DW_AT_ranges only if necessary Summary: While updating DWARF, we used to convert address ranges for functions into DW_AT_ranges format, even if the ranges were not split and still had a simple [low, high) form. We had to do this because functions with contiguous ranges could be sharing an abbrev with non-contiguous range function, and we had to convert the abbrev. It turns out, that the excessive usage of DW_AT_ranges may lead to internal core dumps in gdb in the presence of .gdb_index. I still don't know the root cause of it, but reducing the number DW_AT_ranges used by DW_TAG_subprogram DIEs does alleviate the issue. We can keep a simple range for DIEs that are guaranteed not to share an abbrev with any non-contiguous function. Hence we have to postpone the update of function ranges until we've seen all DIEs. Note that DIEs from different compilation units could share the same abbrev, and hence we have to process DIEs from all compilation units. 
(cherry picked from commit d49cfc3028dcb5e15eb5135878bbfa6202f6ab6e) --- bolt/src/DWARFRewriter.cpp | 214 +++++++++++++++++++++++++------------ bolt/src/DWARFRewriter.h | 32 ++++++ bolt/src/DebugData.cpp | 25 ++--- bolt/src/DebugData.h | 16 ++- 4 files changed, 191 insertions(+), 96 deletions(-) diff --git a/bolt/src/DWARFRewriter.cpp b/bolt/src/DWARFRewriter.cpp index abc3d67ea83c..e5dd2b26724e 100644 --- a/bolt/src/DWARFRewriter.cpp +++ b/bolt/src/DWARFRewriter.cpp @@ -61,6 +61,12 @@ void DWARFRewriter::updateDebugInfo() { SectionPatchers[".debug_abbrev"] = llvm::make_unique(); SectionPatchers[".debug_info"] = llvm::make_unique(); + DebugInfoPatcher = + static_cast(SectionPatchers[".debug_info"].get()); + AbbrevPatcher = + static_cast(SectionPatchers[".debug_abbrev"].get()); + assert(DebugInfoPatcher && AbbrevPatcher && "Patchers not initialized."); + RangesSectionsWriter = llvm::make_unique(&BC); LocationListWriter = llvm::make_unique(&BC); @@ -69,6 +75,8 @@ void DWARFRewriter::updateDebugInfo() { std::vector{}); } + flushPendingRanges(); + finalizeDebugSections(); updateGdbIndexSection(); @@ -98,19 +106,29 @@ void DWARFRewriter::updateUnitDebugInfo( if (DIE.getLowAndHighPC(LowPC, HighPC, SectionIndex)) { IsFunctionDef = true; const auto *Function = BC.getBinaryFunctionAtAddress(LowPC); - if (Function && Function->isFolded()) { + if (Function && Function->isFolded()) Function = nullptr; - } FunctionStack.push_back(Function); - auto RangesSectionOffset = - RangesSectionsWriter->getEmptyRangesOffset(); - if (Function) { - auto FunctionRanges = Function->getOutputAddressRanges(); - RangesSectionOffset = - RangesSectionsWriter->addRanges(Function, - std::move(FunctionRanges)); + + const auto *Abbrev = DIE.getAbbreviationDeclarationPtr(); + assert(Abbrev && "abbrev expected"); + + DebugAddressRangesVector FunctionRanges; + if (Function) + FunctionRanges = Function->getOutputAddressRanges(); + + if (FunctionRanges.size() > 1) { + convertPending(Abbrev); + 
convertToRanges(DIE, FunctionRanges); + } else if (ConvertedRangesAbbrevs.find(Abbrev) != + ConvertedRangesAbbrevs.end()) { + convertToRanges(DIE, FunctionRanges); + } else { + if (FunctionRanges.empty()) + FunctionRanges.emplace_back(DebugAddressRange()); + PendingRanges[Abbrev].emplace_back( + std::make_pair(DIE, FunctionRanges.front())); } - updateDWARFObjectAddressRanges(DIE, RangesSectionOffset); } } break; @@ -184,9 +202,6 @@ void DWARFRewriter::updateUnitDebugInfo( } } - auto DebugInfoPatcher = - static_cast( - SectionPatchers[".debug_info"].get()); DebugInfoPatcher->addLE32Patch(AttrOffset, LocListSectionOffset); } else { assert((Value.isFormClass(DWARFFormValue::FC_Exprloc) || @@ -206,9 +221,6 @@ void DWARFRewriter::updateUnitDebugInfo( << " for DIE with tag " << DIE.getTag() << " to 0x" << Twine::utohexstr(NewAddress) << '\n'); } - auto DebugInfoPatcher = - static_cast( - SectionPatchers[".debug_info"].get()); DebugInfoPatcher->addLE64Patch(AttrOffset, NewAddress); } else if (opts::Verbosity >= 1) { errs() << "BOLT-WARNING: unexpected form value for attribute at 0x" @@ -237,17 +249,10 @@ void DWARFRewriter::updateDWARFObjectAddressRanges( } if (opts::Verbosity >= 2 && DebugRangesOffset == -1U) { - errs() << "BOLT-WARNING: using invalid DW_AT_range for DIE at offset 0x" + errs() << "BOLT-WARNING: using invalid DW_AT_ranges for DIE at offset 0x" << Twine::utohexstr(DIE.getOffset()) << '\n'; } - auto DebugInfoPatcher = - static_cast(SectionPatchers[".debug_info"].get()); - auto AbbrevPatcher = - static_cast(SectionPatchers[".debug_abbrev"].get()); - - assert(DebugInfoPatcher && AbbrevPatcher && "Patchers not initialized."); - const auto *AbbreviationDecl = DIE.getAbbreviationDeclarationPtr(); if (!AbbreviationDecl) { if (opts::Verbosity >= 1) { @@ -258,8 +263,6 @@ void DWARFRewriter::updateDWARFObjectAddressRanges( return; } - auto AbbrevCode = AbbreviationDecl->getCode(); - if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_ranges)) { // Case 1: The 
object was already non-contiguous and had DW_AT_ranges. // In this case we simply need to update the value of DW_AT_ranges. @@ -282,50 +285,8 @@ void DWARFRewriter::updateDWARFObjectAddressRanges( // large size. if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_low_pc) && AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_high_pc)) { - uint32_t LowPCOffset = -1U; - uint32_t HighPCOffset = -1U; - DWARFFormValue LowPCFormValue = - *DIE.find(dwarf::DW_AT_low_pc, &LowPCOffset); - DWARFFormValue HighPCFormValue = - *DIE.find(dwarf::DW_AT_high_pc, &HighPCOffset); - - if (LowPCFormValue.getForm() != dwarf::DW_FORM_addr || - (HighPCFormValue.getForm() != dwarf::DW_FORM_addr && - HighPCFormValue.getForm() != dwarf::DW_FORM_data8 && - HighPCFormValue.getForm() != dwarf::DW_FORM_data4)) { - errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE " - << "at offset 0x" << Twine::utohexstr(DIE.getOffset()) - << "\n"; - return; - } - if (LowPCOffset == -1U || (LowPCOffset + 8 != HighPCOffset)) { - errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. 
" - << "Cannot update DIE at offset 0x" - << Twine::utohexstr(DIE.getOffset()) << '\n'; - return; - } - - AbbrevPatcher->addAttributePatch(DIE.getDwarfUnit(), - AbbrevCode, - dwarf::DW_AT_low_pc, - dwarf::DW_AT_ranges, - dwarf::DW_FORM_sec_offset); - AbbrevPatcher->addAttributePatch(DIE.getDwarfUnit(), - AbbrevCode, - dwarf::DW_AT_high_pc, - dwarf::DW_AT_low_pc, - dwarf::DW_FORM_udata); - unsigned LowPCSize = 0; - if (HighPCFormValue.getForm() == dwarf::DW_FORM_addr || - HighPCFormValue.getForm() == dwarf::DW_FORM_data8) { - LowPCSize = 12; - } else if (HighPCFormValue.getForm() == dwarf::DW_FORM_data4) { - LowPCSize = 8; - } else { - llvm_unreachable("unexpected form"); - } - DebugInfoPatcher->addLE32Patch(LowPCOffset, DebugRangesOffset); - DebugInfoPatcher->addUDataPatch(LowPCOffset + 4, 0, LowPCSize); + convertToRanges(AbbreviationDecl); + convertToRanges(DIE, DebugRangesOffset); } else { if (opts::Verbosity >= 1) { errs() << "BOLT-WARNING: Cannot update ranges for DIE at offset 0x" @@ -597,3 +558,116 @@ void DWARFRewriter::updateGdbIndexSection() { NewGdbIndexContents, NewGdbIndexSize); } + +void +DWARFRewriter::convertToRanges(const DWARFAbbreviationDeclaration *Abbrev) { + AbbrevPatcher->addAttributePatch(Abbrev, + dwarf::DW_AT_low_pc, + dwarf::DW_AT_ranges, + dwarf::DW_FORM_sec_offset); + AbbrevPatcher->addAttributePatch(Abbrev, + dwarf::DW_AT_high_pc, + dwarf::DW_AT_low_pc, + dwarf::DW_FORM_udata); +} + +void DWARFRewriter::convertToRanges(DWARFDie DIE, + const DebugAddressRangesVector &Ranges) { + uint64_t RangesSectionOffset; + if (Ranges.empty()) { + RangesSectionOffset = RangesSectionsWriter->getEmptyRangesOffset(); + } else { + RangesSectionOffset = RangesSectionsWriter->addRanges(Ranges); + } + + convertToRanges(DIE, RangesSectionOffset); +} + +void DWARFRewriter::convertPending(const DWARFAbbreviationDeclaration *Abbrev) { + if (ConvertedRangesAbbrevs.count(Abbrev)) + return; + + convertToRanges(Abbrev); + + auto I = PendingRanges.find(Abbrev); + if 
(I != PendingRanges.end()) { + for (auto &Pair : I->second) { + convertToRanges(Pair.first, {Pair.second}); + } + PendingRanges.erase(I); + } + + ConvertedRangesAbbrevs.emplace(Abbrev); +} + +void DWARFRewriter::flushPendingRanges() { + for (auto &I : PendingRanges) { + for (auto &RangePair : I.second) { + patchLowHigh(RangePair.first, RangePair.second); + } + } +} + +namespace { + +void getRangeAttrData( + DWARFDie DIE, + uint32_t &LowPCOffset, uint32_t &HighPCOffset, + DWARFFormValue &LowPCFormValue, DWARFFormValue &HighPCFormValue) { + LowPCOffset = -1U; + HighPCOffset = -1U; + LowPCFormValue = *DIE.find(dwarf::DW_AT_low_pc, &LowPCOffset); + HighPCFormValue = *DIE.find(dwarf::DW_AT_high_pc, &HighPCOffset); + + if (LowPCFormValue.getForm() != dwarf::DW_FORM_addr || + (HighPCFormValue.getForm() != dwarf::DW_FORM_addr && + HighPCFormValue.getForm() != dwarf::DW_FORM_data8 && + HighPCFormValue.getForm() != dwarf::DW_FORM_data4)) { + errs() << "BOLT-WARNING: unexpected form value. Cannot update DIE " + << "at offset 0x" << Twine::utohexstr(DIE.getOffset()) << "\n"; + return; + } + if (LowPCOffset == -1U || (LowPCOffset + 8 != HighPCOffset)) { + errs() << "BOLT-WARNING: high_pc expected immediately after low_pc. 
" + << "Cannot update DIE at offset 0x" + << Twine::utohexstr(DIE.getOffset()) << '\n'; + return; + } +} + +} + +void DWARFRewriter::patchLowHigh(DWARFDie DIE, DebugAddressRange Range) { + uint32_t LowPCOffset, HighPCOffset; + DWARFFormValue LowPCFormValue, HighPCFormValue; + getRangeAttrData( + DIE, LowPCOffset, HighPCOffset, LowPCFormValue, HighPCFormValue); + DebugInfoPatcher->addLE64Patch(LowPCOffset, Range.LowPC); + if (HighPCFormValue.getForm() == dwarf::DW_FORM_addr || + HighPCFormValue.getForm() == dwarf::DW_FORM_data8) { + DebugInfoPatcher->addLE64Patch(HighPCOffset, Range.HighPC - Range.LowPC); + } else { + DebugInfoPatcher->addLE32Patch(HighPCOffset, Range.HighPC - Range.LowPC); + } +} + +void DWARFRewriter::convertToRanges(DWARFDie DIE, + uint64_t RangesSectionOffset) { + uint32_t LowPCOffset, HighPCOffset; + DWARFFormValue LowPCFormValue, HighPCFormValue; + getRangeAttrData( + DIE, LowPCOffset, HighPCOffset, LowPCFormValue, HighPCFormValue); + + unsigned LowPCSize = 0; + if (HighPCFormValue.getForm() == dwarf::DW_FORM_addr || + HighPCFormValue.getForm() == dwarf::DW_FORM_data8) { + LowPCSize = 12; + } else if (HighPCFormValue.getForm() == dwarf::DW_FORM_data4) { + LowPCSize = 8; + } else { + llvm_unreachable("unexpected form"); + } + DebugInfoPatcher->addLE32Patch(LowPCOffset, RangesSectionOffset); + DebugInfoPatcher->addUDataPatch(LowPCOffset + 4, 0, LowPCSize); +} + diff --git a/bolt/src/DWARFRewriter.h b/bolt/src/DWARFRewriter.h index 58285a63b216..dd4a3ab097a5 100644 --- a/bolt/src/DWARFRewriter.h +++ b/bolt/src/DWARFRewriter.h @@ -14,6 +14,7 @@ #include "DebugData.h" #include "RewriteInstance.h" +#include namespace llvm { @@ -30,6 +31,9 @@ class DWARFRewriter { SectionPatchersType &SectionPatchers; + SimpleBinaryPatcher *DebugInfoPatcher{nullptr}; + DebugAbbrevPatcher *AbbrevPatcher{nullptr}; + /// Stores and serializes information that will be put into the .debug_ranges /// and .debug_aranges DWARF sections. 
std::unique_ptr RangesSectionsWriter; @@ -62,6 +66,34 @@ class DWARFRewriter { /// Rewrite .gdb_index section if present. void updateGdbIndexSection(); + /// Abbreviations that were converted to use DW_AT_ranges. + std::set ConvertedRangesAbbrevs; + + /// DIEs with abbrevs that were not converted to DW_AT_ranges. + /// We only update those when all DIEs have been processed to guarantee that + /// the abbrev (which is shared) is intact. + std::map>> PendingRanges; + + /// Convert \p Abbrev from using a simple DW_AT_(low|high)_pc range to + /// DW_AT_ranges. + void convertToRanges(const DWARFAbbreviationDeclaration *Abbrev); + + /// Update \p DIE that was using DW_AT_(low|high)_pc with DW_AT_ranges offset. + void convertToRanges(DWARFDie DIE, uint64_t RangesSectionOffset); + + /// Same as above, but takes a vector of \p Ranges as a parameter. + void convertToRanges(DWARFDie DIE, const DebugAddressRangesVector &Ranges); + + /// Patch DW_AT_(low|high)_pc values for the \p DIE based on \p Range. + void patchLowHigh(DWARFDie DIE, DebugAddressRange Range); + + /// Convert pending ranges associated with the given \p Abbrev. + void convertPending(const DWARFAbbreviationDeclaration *Abbrev); + + /// Once all DIEs were seen, update DW_AT_(low|high)_pc values. 
+ void flushPendingRanges(); + public: DWARFRewriter(BinaryContext &BC, SectionPatchersType &SectionPatchers) diff --git a/bolt/src/DebugData.cpp b/bolt/src/DebugData.cpp index ff122dd0575c..f628bc10fc45 100644 --- a/bolt/src/DebugData.cpp +++ b/bolt/src/DebugData.cpp @@ -229,30 +229,23 @@ void SimpleBinaryPatcher::patchBinary(std::string &BinaryContents) { } } -void DebugAbbrevPatcher::addAttributePatch(const DWARFUnit *Unit, - uint32_t AbbrevCode, - dwarf::Attribute AttrTag, - uint8_t NewAttrTag, - uint8_t NewAttrForm) { - assert(Unit && "No compile unit specified."); +void DebugAbbrevPatcher::addAttributePatch( + const DWARFAbbreviationDeclaration *Abbrev, + dwarf::Attribute AttrTag, + uint8_t NewAttrTag, + uint8_t NewAttrForm) { + assert(Abbrev && "no abbreviation specified"); AbbrevPatches.emplace( - AbbrevAttrPatch{Unit, AbbrevCode, AttrTag, NewAttrTag, NewAttrForm}); + AbbrevAttrPatch{Abbrev, AttrTag, NewAttrTag, NewAttrForm}); } void DebugAbbrevPatcher::patchBinary(std::string &Contents) { SimpleBinaryPatcher Patcher; for (const auto &Patch : AbbrevPatches) { - const auto *UnitAbbreviations = Patch.Unit->getAbbreviations(); - assert(UnitAbbreviations && - "Compile unit doesn't have associated abbreviations."); - const auto *AbbreviationDeclaration = - UnitAbbreviations->getAbbreviationDeclaration(Patch.Code); - assert(AbbreviationDeclaration && "No abbreviation with given code."); - const auto Attribute = - AbbreviationDeclaration->findAttribute(Patch.Attr); - + const auto Attribute = Patch.Abbrev->findAttribute(Patch.Attr); assert(Attribute && "Specified attribute doesn't occur in abbreviation."); + // Because we're only handling standard values (i.e. 
no DW_FORM_GNU_* or // DW_AT_APPLE_*), they are all small (< 128) and encoded in a single // byte in ULEB128, otherwise it'll be more tricky as we may need to diff --git a/bolt/src/DebugData.h b/bolt/src/DebugData.h index 355c8b280423..55caecd35ca9 100644 --- a/bolt/src/DebugData.h +++ b/bolt/src/DebugData.h @@ -234,21 +234,19 @@ class DebugAbbrevPatcher : public BinaryPatcher { private: /// Patch of changing one attribute to another. struct AbbrevAttrPatch { - const DWARFUnit *Unit; // Containing DWARF unit - uint32_t Code; // Code of abbreviation to be modified. + const DWARFAbbreviationDeclaration *Abbrev; dwarf::Attribute Attr; // ID of attribute to be replaced. uint8_t NewAttr; // ID of the new attribute. uint8_t NewForm; // Form of the new attribute. bool operator==(const AbbrevAttrPatch &RHS) const { - return Unit == RHS.Unit && Code == RHS.Code && Attr == RHS.Attr; + return Abbrev == RHS.Abbrev && Attr == RHS.Attr; } }; struct AbbrevHash { std::size_t operator()(const AbbrevAttrPatch &P) const { - return std::hash()( - ((uint64_t)P.Unit->getOffset() << 32) + (P.Code << 16) + P.Attr); + return std::hash()(((uint64_t)P.Abbrev << 16) + P.Attr); } }; @@ -256,15 +254,13 @@ class DebugAbbrevPatcher : public BinaryPatcher { public: ~DebugAbbrevPatcher() { } - /// Adds a patch to change an attribute of an abbreviation that belongs to - /// \p Unit to another attribute. - /// \p AbbrevCode code of the abbreviation to be modified. + /// Adds a patch to change an attribute of the abbreviation + /// \p Abbrev the abbreviation to be modified. /// \p AttrTag ID of the attribute to be replaced. /// \p NewAttrTag ID of the new attribute. /// \p NewAttrForm Form of the new attribute. /// We only handle standard forms, that are encoded in a single byte. 
- void addAttributePatch(const DWARFUnit *Unit, - uint32_t AbbrevCode, + void addAttributePatch(const DWARFAbbreviationDeclaration *Abbrev, dwarf::Attribute AttrTag, uint8_t NewAttrTag, uint8_t NewAttrForm); From b3686cb4729236fc4858ca26db984a80a5c308d2 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 5 Apr 2019 17:27:25 -0700 Subject: [PATCH 518/904] [PERF2BOLT] Print a better message if perf.data lacks LBR Summary: If processing the perf.data in LBR mode but the data was collected without -j, currently we confusingly report all samples to mismatch the input binary, even though the samples match but lack LBR info. Change perf2bolt to detect this scenario and print a helpful message instructing the user to collect data with LBR. (cherry picked from commit ac618f24b51dd07a06ff3e0e2960a99432b9883e) --- bolt/src/DataAggregator.cpp | 59 ++++++++++++++++++++++++++----------- bolt/src/DataAggregator.h | 4 ++- 2 files changed, 45 insertions(+), 18 deletions(-) diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index dbc83687819c..54da55860caf 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -232,13 +232,18 @@ void DataAggregator::launchPerfProcess(StringRef Name, PerfProcessInfo &PPI, TempFiles.push_back(PPI.StderrPath.data()); Optional Redirects[] = { - llvm::None, // Stdin + llvm::None, // Stdin StringRef(PPI.StdoutPath.data()), // Stdout StringRef(PPI.StderrPath.data())}; // Stderr - DEBUG(dbgs() << "Launching perf: " << PerfPath.data() << " 1> " - << PPI.StdoutPath.data() << " 2> " - << PPI.StderrPath.data() << "\n"); + DEBUG({ + dbgs() << "Launching perf: "; + for (const char *Arg : Argv) + dbgs() << Arg << " "; + dbgs() << " 1> " + << PPI.StdoutPath.data() << " 2> " + << PPI.StderrPath.data() << "\n"; + }); if (Wait) { PPI.PI.ReturnCode = @@ -784,7 +789,7 @@ ErrorOr DataAggregator::parseBranchSample() { auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes); if (MMapInfoIter == BinaryMMapInfo.end()) { 
consumeRestOfLine(); - return Res; + return make_error_code(std::errc::no_such_process); } while (checkAndConsumeFS()) {} @@ -997,8 +1002,11 @@ std::error_code DataAggregator::printLBRHeatMap() { while (hasData()) { auto SampleRes = parseBranchSample(); - if (std::error_code EC = SampleRes.getError()) + if (std::error_code EC = SampleRes.getError()) { + if (EC == std::errc::no_such_process) + continue; return EC; + } auto &Sample = SampleRes.get(); @@ -1059,23 +1067,29 @@ std::error_code DataAggregator::parseBranchEvents() { uint64_t NumTotalSamples{0}; uint64_t NumEntries{0}; uint64_t NumSamples{0}; + uint64_t NumSamplesNoLBR{0}; uint64_t NumTraces{0}; while (hasData()) { ++NumTotalSamples; auto SampleRes = parseBranchSample(); - if (std::error_code EC = SampleRes.getError()) + if (std::error_code EC = SampleRes.getError()) { + if (EC == std::errc::no_such_process) + continue; return EC; + } + ++NumSamples; auto &Sample = SampleRes.get(); if (opts::WriteAutoFDOData) ++BasicSamples[Sample.PC]; - if (Sample.LBR.empty()) + if (Sample.LBR.empty()) { + ++NumSamplesNoLBR; continue; + } - ++NumSamples; NumEntries += Sample.LBR.size(); // LBRs are stored in reverse execution order. 
NextLBR refers to the next @@ -1147,14 +1161,25 @@ std::error_code DataAggregator::parseBranchEvents() { outs() << "PERF2BOLT: read " << NumSamples << " samples and " << NumEntries << " LBR entries\n"; if (NumTotalSamples) { - const auto IgnoredSamples = NumTotalSamples - NumSamples; - const auto PercentIgnored = 100.0f * IgnoredSamples / NumTotalSamples; - outs() << "PERF2BOLT: " << IgnoredSamples << " samples"; - printColored(outs(), PercentIgnored, 20, 50); - outs() << " were ignored\n"; - if (PercentIgnored > 50.0f) { - errs() << "PERF2BOLT-WARNING: less than 50% of all recorded samples were " - "attributed to the input binary\n"; + if (NumSamples && NumSamplesNoLBR == NumSamples) { + if (errs().has_colors()) + errs().changeColor(raw_ostream::RED); + errs() << "PERF2BOLT-WARNING: all recorded samples for this binary lack " + "LBR. Record profile with perf record -j any or run perf2bolt " + "in no-LBR mode with -nl (the performance improvement in -nl " + "mode may be limited)\n"; + if (errs().has_colors()) + errs().resetColor(); + } else { + const auto IgnoredSamples = NumTotalSamples - NumSamples; + const auto PercentIgnored = 100.0f * IgnoredSamples / NumTotalSamples; + outs() << "PERF2BOLT: " << IgnoredSamples << " samples"; + printColored(outs(), PercentIgnored, 20, 50); + outs() << " were ignored\n"; + if (PercentIgnored > 50.0f) { + errs() << "PERF2BOLT-WARNING: less than 50% of all recorded samples " + "were attributed to the input binary\n"; + } } } outs() << "PERF2BOLT: traces mismatching disassembled function contents: " diff --git a/bolt/src/DataAggregator.h b/bolt/src/DataAggregator.h index eca279daccf6..e1585abda89f 100644 --- a/bolt/src/DataAggregator.h +++ b/bolt/src/DataAggregator.h @@ -225,7 +225,9 @@ class DataAggregator : public DataReader { std::error_code printLBRHeatMap(); /// Parse a single perf sample containing a PID associated with a sequence of - /// LBR entries + /// LBR entries. 
If the PID does not correspond to the binary we are looking + /// for, return std::errc::no_such_process. If other parsing errors occur, + /// return the error. Otherwise, return the parsed sample. ErrorOr parseBranchSample(); /// Parse a single perf sample containing a PID associated with an event name From b50ee79951facfdeae2f4ce39731c9daf45e3548 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 9 Apr 2019 11:31:45 -0700 Subject: [PATCH 519/904] [BOLT][NFC] Indentation fix (cherry picked from commit e79ad8757e26c9920c7e1137e2d9576f05ca335b) --- bolt/src/BinaryContext.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index 4be1af4dc79e..ab430ace9e87 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -522,7 +522,7 @@ void BinaryContext::fixBinaryDataHoles() { while (Itr != End) { if (Itr->second->getAddress() > EndAddress) { - auto Gap = Itr->second->getAddress() - EndAddress; + auto Gap = Itr->second->getAddress() - EndAddress; Holes.push_back(std::make_pair(EndAddress, Gap)); } EndAddress = Itr->second->getEndAddress(); From 12243fcf6b5ecd0a2c327f669daa393e673b1520 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 9 Apr 2019 12:29:40 -0700 Subject: [PATCH 520/904] [BOLT] Add interface to extract values from static addresses (cherry picked from commit 301c4ac0aa08f49f57f0add3c94443f3dfef9851) --- bolt/src/BinaryContext.cpp | 32 ++++++++++++++++++----- bolt/src/BinaryContext.h | 18 ++++++++++--- bolt/src/Exceptions.cpp | 7 +++-- bolt/src/Passes/IndirectCallPromotion.cpp | 2 +- bolt/src/RewriteInstance.cpp | 17 ++++-------- bolt/src/RewriteInstance.h | 3 +-- bolt/src/Target/X86/X86MCPlusBuilder.cpp | 2 +- 7 files changed, 50 insertions(+), 31 deletions(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index ab430ace9e87..76a2d69311ba 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -1079,17 +1079,35 
@@ BinarySection &BinaryContext::absoluteSection() { } ErrorOr -BinaryContext::extractPointerAtAddress(uint64_t Address) const { - auto Section = getSectionForAddress(Address); +BinaryContext::getUnsignedValueAtAddress(uint64_t Address, + size_t Size) const { + const auto Section = getSectionForAddress(Address); + if (!Section) + return std::make_error_code(std::errc::bad_address); + + if (Section->isVirtual()) + return 0; + + DataExtractor DE(Section->getContents(), AsmInfo->isLittleEndian(), + AsmInfo->getCodePointerSize()); + auto ValueOffset = static_cast(Address - Section->getAddress()); + return DE.getUnsigned(&ValueOffset, Size); +} + +ErrorOr +BinaryContext::getSignedValueAtAddress(uint64_t Address, + size_t Size) const { + const auto Section = getSectionForAddress(Address); if (!Section) return std::make_error_code(std::errc::bad_address); - StringRef SectionContents = Section->getContents(); - DataExtractor DE(SectionContents, - AsmInfo->isLittleEndian(), + if (Section->isVirtual()) + return 0; + + DataExtractor DE(Section->getContents(), AsmInfo->isLittleEndian(), AsmInfo->getCodePointerSize()); - uint32_t SectionOffset = Address - Section->getAddress(); - return DE.getAddress(&SectionOffset); + auto ValueOffset = static_cast(Address - Section->getAddress()); + return DE.getSigned(&ValueOffset, Size); } void BinaryContext::addRelocation(uint64_t Address, diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index 18638bf46518..977cacb518f0 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -731,10 +731,20 @@ class BinaryContext { return std::make_error_code(std::errc::bad_address); } - /// Given \p Address in the binary, extract and return a pointer value at that - /// address. The address has to be a valid statically allocated address for - /// the binary. - ErrorOr extractPointerAtAddress(uint64_t Address) const; + /// Return an unsigned value of \p Size stored at \p Address. 
The address has + /// to be a valid statically allocated address for the binary. + ErrorOr getUnsignedValueAtAddress(uint64_t Address, + size_t Size) const; + + /// Return a signed value of \p Size stored at \p Address. The address has + /// to be a valid statically allocated address for the binary. + ErrorOr getSignedValueAtAddress(uint64_t Address, + size_t Size) const; + + /// Special case of getUnsignedValueAtAddress() that uses a pointer size. + ErrorOr getPointerAtAddress(uint64_t Address) const { + return getUnsignedValueAtAddress(Address, AsmInfo->getCodePointerSize()); + } /// Replaces all references to \p ChildBF with \p ParentBF. \p ChildBF is then /// removed from the list of functions \p BFs. The profile data of \p ChildBF diff --git a/bolt/src/Exceptions.cpp b/bolt/src/Exceptions.cpp index 9b9258884bde..87f1eb853d81 100644 --- a/bolt/src/Exceptions.cpp +++ b/bolt/src/Exceptions.cpp @@ -266,7 +266,7 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, return; } if (TTypeEncoding & DW_EH_PE_indirect) { - auto PointerOrErr = BC.extractPointerAtAddress(TypeAddress); + auto PointerOrErr = BC.getPointerAtAddress(TypeAddress); assert(PointerOrErr && "failed to decode indirect address"); TypeAddress = *PointerOrErr; } @@ -349,9 +349,8 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, if ((TTypeEncoding & DW_EH_PE_pcrel) && (TypeAddress == TTEntryAddress)) { TypeAddress = 0; } - if (TypeAddress && - (TTypeEncoding & DW_EH_PE_indirect)) { - auto PointerOrErr = BC.extractPointerAtAddress(TypeAddress); + if (TypeAddress && (TTypeEncoding & DW_EH_PE_indirect)) { + auto PointerOrErr = BC.getPointerAtAddress(TypeAddress); assert(PointerOrErr && "failed to decode indirect address"); TypeAddress = *PointerOrErr; } diff --git a/bolt/src/Passes/IndirectCallPromotion.cpp b/bolt/src/Passes/IndirectCallPromotion.cpp index 4457585d6a13..68d2fe685be5 100644 --- a/bolt/src/Passes/IndirectCallPromotion.cpp +++ b/bolt/src/Passes/IndirectCallPromotion.cpp @@ 
-647,7 +647,7 @@ IndirectCallPromotion::maybeGetVtableSyms( << "+" << MethodOffset << "/" << MI.Count << "\n"); - if (auto MethodAddr = BC.extractPointerAtAddress(Address)) { + if (auto MethodAddr = BC.getPointerAtAddress(Address)) { auto *MethodBD = BC.getBinaryDataAtAddress(MethodAddr.get()); if (!MethodBD) // skip unknown methods continue; diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 87823fe1b4bf..5d082f73a72b 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -1838,7 +1838,6 @@ int64_t getRelocationAddend(const ELFObjectFileBase *Obj, } // anonymous namespace bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, - SectionRef RelocatedSection, std::string &SymbolName, bool &IsSectionRelocation, uint64_t &SymbolAddress, @@ -1849,16 +1848,11 @@ bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, const bool IsAArch64 = BC->isAArch64(); - // Extract the value. - StringRef RelocatedSectionContents; - RelocatedSection.getContents(RelocatedSectionContents); - DataExtractor DE(RelocatedSectionContents, - BC->AsmInfo->isLittleEndian(), - BC->AsmInfo->getCodePointerSize()); - uint32_t RelocationOffset = Rel.getOffset() - RelocatedSection.getAddress(); const auto RelSize = Relocation::getSizeForType(Rel.getType()); - ExtractedValue = static_cast(DE.getSigned(&RelocationOffset, - RelSize)); + + auto Value = BC->getUnsignedValueAtAddress(Rel.getOffset(), RelSize); + assert(Value && "failed to extract relocated value"); + ExtractedValue = *Value; if (IsAArch64) { ExtractedValue = Relocation::extractValue(Rel.getType(), ExtractedValue, @@ -1867,7 +1861,7 @@ bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, Addend = getRelocationAddend(InputFile, Rel); - const bool IsPCRelative = Relocation::isPCRelative(Rel.getType()); + const auto IsPCRelative = Relocation::isPCRelative(Rel.getType()); const auto PCRelOffset = IsPCRelative && !IsAArch64 ? 
Rel.getOffset() : 0; bool SkipVerification = false; auto SymbolIter = Rel.getSymbol(); @@ -2026,7 +2020,6 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { uint64_t ExtractedValue; bool IsSectionRelocation; if (!analyzeRelocation(Rel, - RelocatedSection, SymbolName, IsSectionRelocation, SymbolAddress, diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index 7e9e7f5cff6a..fa7a1a9e2c85 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -163,12 +163,11 @@ class RewriteInstance { /// Make .eh_frame section relocatable. void relocateEHFrameSection(); - /// Analyze relocation \p Rel contained in section \p RelocatedSection. + /// Analyze relocation \p Rel. /// Return true if the relocation was successfully processed, false otherwise. /// The \p SymbolName, \p SymbolAddress, \p Addend and \p ExtractedValue /// parameters will be set on success. bool analyzeRelocation(const RelocationRef &Rel, - SectionRef RelocatedSection, std::string &SymbolName, bool &IsSectionRelocation, uint64_t &SymbolAddress, diff --git a/bolt/src/Target/X86/X86MCPlusBuilder.cpp b/bolt/src/Target/X86/X86MCPlusBuilder.cpp index b758146999a1..ea8e0eb1943a 100644 --- a/bolt/src/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/src/Target/X86/X86MCPlusBuilder.cpp @@ -1893,7 +1893,7 @@ class X86MCPlusBuilder : public MCPlusBuilder { assert(Offset + I.DataSize <= ConstantData.size() && "invalid offset for given constant data"); int64_t ImmVal = - DataExtractor(ConstantData, true, 64).getSigned(&Offset, I.DataSize); + DataExtractor(ConstantData, true, 8).getSigned(&Offset, I.DataSize); // Compute the new opcode. 
unsigned NewOpcode = 0; From 646becc06f11a1fb3283bbb0c9d4f04f46c1d342 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 9 Apr 2019 11:27:23 -0700 Subject: [PATCH 521/904] [BOLT] Sort basic block successors for printing Summary: For easier analysis of the hottest targets of jump tables it helps to have basic block successors sorted based on the taken frequency. (cherry picked from commit 67553364e3eb8e83d919e24806d2f105f0e4136f) --- bolt/src/BinaryBasicBlock.h | 6 ++++++ bolt/src/BinaryFunction.cpp | 26 +++++++++++++++++--------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/bolt/src/BinaryBasicBlock.h b/bolt/src/BinaryBasicBlock.h index 2fbfb9c38c33..30b42c09528a 100644 --- a/bolt/src/BinaryBasicBlock.h +++ b/bolt/src/BinaryBasicBlock.h @@ -49,6 +49,12 @@ class BinaryBasicBlock { struct BinaryBranchInfo { uint64_t Count; uint64_t MispredictedCount; /// number of branches mispredicted + + bool operator<(const BinaryBranchInfo &Other) const { + return (Count < Other.Count) || + (Count == Other.Count && + MispredictedCount < Other.MispredictedCount); + } }; static constexpr uint32_t INVALID_OFFSET = diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 711d7282e30d..1d8b81b49a3c 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -556,21 +556,29 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, if (!BB->succ_empty()) { OS << " Successors: "; - auto BI = BB->branch_info_begin(); + // For more than 2 successors, sort them based on frequency. 
+ std::vector Indices(BB->succ_size()); + std::iota(Indices.begin(), Indices.end(), 0); + if (BB->succ_size() > 2 && BB->getKnownExecutionCount()) { + std::stable_sort(Indices.begin(), Indices.end(), + [&](const uint64_t A, const uint64_t B) { + return BB->BranchInfo[B] < BB->BranchInfo[A]; + }); + } auto Sep = ""; - for (auto Succ : BB->successors()) { - assert(BI != BB->branch_info_end() && "missing BranchInfo entry"); + for (unsigned I = 0; I < Indices.size(); ++I) { + auto *Succ = BB->Successors[Indices[I]]; + auto &BI = BB->BranchInfo[Indices[I]]; OS << Sep << Succ->getName(); if (ExecutionCount != COUNT_NO_PROFILE && - BI->MispredictedCount != BinaryBasicBlock::COUNT_INFERRED) { - OS << " (mispreds: " << BI->MispredictedCount - << ", count: " << BI->Count << ")"; + BI.MispredictedCount != BinaryBasicBlock::COUNT_INFERRED) { + OS << " (mispreds: " << BI.MispredictedCount + << ", count: " << BI.Count << ")"; } else if (ExecutionCount != COUNT_NO_PROFILE && - BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) { - OS << " (inferred count: " << BI->Count << ")"; + BI.Count != BinaryBasicBlock::COUNT_NO_PROFILE) { + OS << " (inferred count: " << BI.Count << ")"; } Sep = ", "; - ++BI; } OS << '\n'; } From b828b56039abd3a4f3244ee9596f30ea551e984f Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 9 Apr 2019 21:22:41 -0700 Subject: [PATCH 522/904] [BOLT] Include for std::iota Summary: Some compilers require header. 
(cherry picked from commit cf3eda87eb3f8c56e2b461370da81ccb236865e8) --- bolt/src/BinaryFunction.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 1d8b81b49a3c..77a47e262171 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -33,10 +33,11 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Regex.h" #include +#include #include +#include #include #include -#include #undef DEBUG_TYPE #define DEBUG_TYPE "bolt" From 16c3acd8aea93ff3c2c56c976956755483a78e93 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 11 Apr 2019 17:11:08 -0700 Subject: [PATCH 523/904] [BOLT] Handle R_X86_64_converted_reloc_bit Summary: In binutils 2.30 a bfd linker accidentally started modifying some relocations on output under `-q/--emit-relocs` by turning on R_X86_64_converted_reloc_bit. As a result, BOLT ignored such relocations and failed to correctly update the binary. This diff filters out R_X86_64_converted_reloc_bit from the relocation type. (cherry picked from commit f08c61e14d18a625359adeab3551bd5b1502ebc3) --- bolt/src/Relocation.h | 8 ++++++++ bolt/src/RewriteInstance.cpp | 38 ++++++++++++++++++++++++------------ bolt/src/RewriteInstance.h | 1 + 3 files changed, 34 insertions(+), 13 deletions(-) diff --git a/bolt/src/Relocation.h b/bolt/src/Relocation.h index f6cd6791c565..ba00f7679753 100644 --- a/bolt/src/Relocation.h +++ b/bolt/src/Relocation.h @@ -18,6 +18,14 @@ #include "llvm/Support/raw_ostream.h" namespace llvm { + +namespace ELF { +/// Relocation type mask that was accidentally output by bfd 2.30 linker. +enum { + R_X86_64_converted_reloc_bit = 0x80 +}; +} + namespace bolt { /// Relocation class. 
diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 5d082f73a72b..e05f48c76c17 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -23,6 +23,7 @@ #include "MCPlusBuilder.h" #include "ProfileReader.h" #include "ProfileWriter.h" +#include "Relocation.h" #include "RewriteInstance.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" @@ -1838,30 +1839,31 @@ int64_t getRelocationAddend(const ELFObjectFileBase *Obj, } // anonymous namespace bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, + uint64_t RType, std::string &SymbolName, bool &IsSectionRelocation, uint64_t &SymbolAddress, int64_t &Addend, uint64_t &ExtractedValue) const { - if (!Relocation::isSupported(Rel.getType())) + if (!Relocation::isSupported(RType)) return false; const bool IsAArch64 = BC->isAArch64(); - const auto RelSize = Relocation::getSizeForType(Rel.getType()); + const auto RelSize = Relocation::getSizeForType(RType); auto Value = BC->getUnsignedValueAtAddress(Rel.getOffset(), RelSize); assert(Value && "failed to extract relocated value"); ExtractedValue = *Value; if (IsAArch64) { - ExtractedValue = Relocation::extractValue(Rel.getType(), + ExtractedValue = Relocation::extractValue(RType, ExtractedValue, Rel.getOffset()); } Addend = getRelocationAddend(InputFile, Rel); - const auto IsPCRelative = Relocation::isPCRelative(Rel.getType()); + const auto IsPCRelative = Relocation::isPCRelative(RType); const auto PCRelOffset = IsPCRelative && !IsAArch64 ? Rel.getOffset() : 0; bool SkipVerification = false; auto SymbolIter = Rel.getSymbol(); @@ -1906,7 +1908,7 @@ bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, // For GOT relocs, do not subtract addend as the addend does not refer // to this instruction's target, but it refers to the target in the GOT // entry. 
- if (Relocation::isGOT(Rel.getType())) { + if (Relocation::isGOT(RType)) { Addend = 0; SymbolAddress = ExtractedValue + PCRelOffset; } else if (!SymbolAddress) { @@ -1936,7 +1938,7 @@ bool RewriteInstance::analyzeRelocation(const RelocationRef &Rel, if (SymbolName == "__hot_start" || SymbolName == "__hot_end") return true; - if (Relocation::isTLS(Rel.getType())) + if (Relocation::isTLS(RType)) return true; return truncateToSize(ExtractedValue, RelSize) == @@ -2013,6 +2015,15 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { for (const auto &Rel : Section.relocations()) { SmallString<16> TypeName; Rel.getTypeName(TypeName); + auto RType = Rel.getType(); + + // Adjust the relocation type as the linker might have skewed it. + if (BC->isX86() && (RType & ELF::R_X86_64_converted_reloc_bit)) { + if (opts::Verbosity >= 1) { + dbgs() << "BOLT-WARNING: ignoring R_X86_64_converted_reloc_bit\n"; + } + RType &= ~ELF::R_X86_64_converted_reloc_bit; + } std::string SymbolName; uint64_t SymbolAddress; @@ -2020,6 +2031,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { uint64_t ExtractedValue; bool IsSectionRelocation; if (!analyzeRelocation(Rel, + RType, SymbolName, IsSectionRelocation, SymbolAddress, @@ -2058,7 +2070,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { // between the two. If we blindly apply the relocation it will appear // that it references an arbitrary location in the code, possibly even // in a different function from that containing the jump table. - if (!IsAArch64 && Relocation::isPCRelative(Rel.getType())) { + if (!IsAArch64 && Relocation::isPCRelative(RType)) { // Just register the fact that we have PC-relative relocation at a given // address. The actual referenced label/address cannot be determined // from linker data alone. 
@@ -2086,7 +2098,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { return false; }(SymbolName); - if (BC->isAArch64() && Rel.getType() == ELF::R_AARCH64_ADR_GOT_PAGE) + if (BC->isAArch64() && RType == ELF::R_AARCH64_ADR_GOT_PAGE) ForceRelocation = true; auto RefSection = BC->getSectionForAddress(SymbolAddress); @@ -2129,10 +2141,10 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { uint64_t RefFunctionOffset = 0; MCSymbol *ReferencedSymbol = nullptr; if (ForceRelocation) { - auto Name = Relocation::isGOT(Rel.getType()) ? "Zero" : SymbolName; + auto Name = Relocation::isGOT(RType) ? "Zero" : SymbolName; ReferencedSymbol = BC->registerNameAtAddress(Name, 0, 0, 0); SymbolAddress = 0; - if (Relocation::isGOT(Rel.getType())) + if (Relocation::isGOT(RType)) Addend = Address; DEBUG(dbgs() << "BOLT-DEBUG: forcing relocation against symbol " << SymbolName << " with addend " << Addend << '\n'); @@ -2146,7 +2158,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { // Workaround for member function pointer de-virtualization bug. // We check if a code non-pc-relative relocation is pointing // to a (fptr - 1). 
- if (ContainingBF && !Relocation::isPCRelative(Rel.getType())) { + if (ContainingBF && !Relocation::isPCRelative(RType)) { if (const auto *NextBF = BC->getBinaryFunctionAtAddress(Address + 1)) { errs() << "BOLT-WARNING: detected possible compiler " @@ -2348,7 +2360,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { if (ReferencedBF || ForceRelocation) { ContainingBF->addRelocation(Rel.getOffset(), ReferencedSymbol, - Rel.getType(), + RType, Addend, ExtractedValue); } else { @@ -2358,7 +2370,7 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { } else if (IsToCode || ForceRelocation) { BC->addRelocation(Rel.getOffset(), ReferencedSymbol, - Rel.getType(), + RType, Addend); } else { DEBUG(dbgs() << "BOLT-DEBUG: ignoring relocation from data to data\n"); diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index fa7a1a9e2c85..e6bd0acfee87 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -168,6 +168,7 @@ class RewriteInstance { /// The \p SymbolName, \p SymbolAddress, \p Addend and \p ExtractedValue /// parameters will be set on success. bool analyzeRelocation(const RelocationRef &Rel, + uint64_t RType, std::string &SymbolName, bool &IsSectionRelocation, uint64_t &SymbolAddress, From 7a5673129f3641a6383ddc602d352d22a020c5dc Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 15 Apr 2019 11:56:55 -0700 Subject: [PATCH 524/904] [BOLT] Reduce warnings for non-simple functions Summary: If a function was already marked as non-simple, there's no reason to issue a warning that it has a reference in the middle of an instruction. Besides, sometimes there wouldn't be instructions disassembled at a given entry, and the warning would be incorrect. 
(cherry picked from commit 1f688e99ab78186d7f766a959a2c5164577fc65b) --- bolt/src/BinaryFunction.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 77a47e262171..5ea1c476c380 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -1479,6 +1479,9 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { } void BinaryFunction::postProcessEntryPoints() { + if (!isSimple()) + return; + for (auto Offset : EntryOffsets) { if (!getInstructionAtOffset(Offset)) { // On AArch64 there are legitimate reasons to have references past the From 37c9a15af07dd905c72cfc8f8fd04b0d1a87986d Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 15 Apr 2019 14:03:01 -0700 Subject: [PATCH 525/904] [BOLT] Abort processing if the profile has no valid data Summary: It's possible to pass a profile in invalid format to BOLT, and we silently ignore it. This could cause a regression as such scenario can go undetected. We should abort processing if no valid data was seen in the profile and issue a warning if it was partially invalid. 
(cherry picked from commit 9c5716e4ffcb43f15d98e881927ee973d49a2eb0) --- bolt/src/DataReader.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/bolt/src/DataReader.cpp b/bolt/src/DataReader.cpp index be179e482918..65f67c3bce7d 100644 --- a/bolt/src/DataReader.cpp +++ b/bolt/src/DataReader.cpp @@ -253,14 +253,20 @@ void FuncMemData::update(const Location &Offset, const Location &Addr) { ErrorOr> DataReader::readPerfData(StringRef Path, raw_ostream &Diag) { - ErrorOr> MB = - MemoryBuffer::getFileOrSTDIN(Path); - if (std::error_code EC = MB.getError()) { - Diag << "Cannot open " << Path << ": " << EC.message() << "\n"; + auto MB = MemoryBuffer::getFileOrSTDIN(Path); + if (auto EC = MB.getError()) { + Diag << "cannot open " << Path << ": " << EC.message() << "\n"; return EC; } auto DR = make_unique(std::move(MB.get()), Diag); - DR->parse(); + if (auto EC = DR->parse()) { + return EC; + } + if (!DR->ParsingBuf.empty()) { + Diag << "WARNING: invalid profile data detected at line " << DR->Line + << ". 
Possibly corrupted profile.\n"; + } + DR->buildLTONameMaps(); return std::move(DR); } @@ -599,6 +605,12 @@ std::error_code DataReader::parse() { if (!FlagOrErr) return FlagOrErr.getError(); NoLBRMode = *FlagOrErr; + + if (!hasBranchData() && !hasMemData()) { + Diag << "ERROR: no valid profile data found\n"; + return make_error_code(llvm::errc::io_error); + } + if (NoLBRMode) return parseInNoLBRMode(); From eceb151a10b7e28eb538d027cfd1ea4e68a69868 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 16 Apr 2019 10:39:05 -0700 Subject: [PATCH 526/904] [BOLT] Add another section to the list of hot text movers Summary: (cherry picked from commit 4c42911ef3ec020b36262ac31f60f89e754f262b) --- bolt/src/RewriteInstance.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index e05f48c76c17..4b481e2c75bf 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -1800,6 +1800,7 @@ void RewriteInstance::adjustCommandLineOptions() { if (opts::HotText && opts::HotTextMoveSections.getNumOccurrences() == 0) { opts::HotTextMoveSections.addValue(".stub"); opts::HotTextMoveSections.addValue(".mover"); + opts::HotTextMoveSections.addValue(".never_hugify"); } } From 36accdef66683b910f04c920b11f83eefe80b179 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Tue, 16 Apr 2019 14:35:29 -0700 Subject: [PATCH 527/904] [BOLT] Fix adjustFunctionBoundaries w.r.t. entry points Summary: Don't consider symbols in another section when processing additional entry points for a function. 
(cherry picked from commit fea1f3e788ef17fe0ac95749c3f15a61e036a8dc) --- bolt/src/BinaryFunction.cpp | 6 ++++++ bolt/src/RewriteInstance.cpp | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 5ea1c476c380..81e9c65d3624 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -3721,6 +3721,12 @@ bool BinaryFunction::isCodeMarker(const SymbolRef &Symbol, bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol, uint64_t SymbolSize) const { + // If this symbol is in a different section from the one where the + // function symbol is, don't consider it as valid. + if (!getSection().containsAddress( + cantFail(Symbol.getAddress(), "cannot get symbol address"))) + return false; + // Some symbols are tolerated inside function bodies, others are not. // The real function boundaries may not be known at this point. if (isDataMarker(Symbol, SymbolSize) || isCodeMarker(Symbol, SymbolSize)) diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 4b481e2c75bf..d946c10c910a 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -1360,7 +1360,7 @@ void RewriteInstance::discoverFileObjects() { } TentativeSize = SymbolSize; } - + BinaryFunction *BF{nullptr}; // Since function may not have yet obtained its real size, do a search // using the list of registered functions instead of calling From 579f4d074b74b6104bca2003da5b6eb200f49d07 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 15 Apr 2019 16:42:49 -0700 Subject: [PATCH 528/904] [BOLT] Fix an issue with std:errc Summary: On some platforms `llvm::make_error_code(std::errc::no_such_process) == std::errc::no_such_process` evaluates to false. 
(cherry picked from commit 298285cbbc14ba1b35c15826cb450c25f1722b5f) --- bolt/src/DataAggregator.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index 54da55860caf..89d0e687affa 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -789,7 +789,7 @@ ErrorOr DataAggregator::parseBranchSample() { auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes); if (MMapInfoIter == BinaryMMapInfo.end()) { consumeRestOfLine(); - return make_error_code(std::errc::no_such_process); + return make_error_code(errc::no_such_process); } while (checkAndConsumeFS()) {} @@ -1002,8 +1002,8 @@ std::error_code DataAggregator::printLBRHeatMap() { while (hasData()) { auto SampleRes = parseBranchSample(); - if (std::error_code EC = SampleRes.getError()) { - if (EC == std::errc::no_such_process) + if (auto EC = SampleRes.getError()) { + if (EC == errc::no_such_process) continue; return EC; } @@ -1074,8 +1074,8 @@ std::error_code DataAggregator::parseBranchEvents() { ++NumTotalSamples; auto SampleRes = parseBranchSample(); - if (std::error_code EC = SampleRes.getError()) { - if (EC == std::errc::no_such_process) + if (auto EC = SampleRes.getError()) { + if (EC == errc::no_such_process) continue; return EC; } From ad97e95b7707127a69b198bff9e02e7ae822e35d Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 16 Apr 2019 10:24:34 -0700 Subject: [PATCH 529/904] [BOLT] Basic support for split functions Summary: This adds very basic and limited support for split functions. In non-relocation mode, split functions are ignored, while their debug info is properly updated. No support in the relocation mode yet. Split functions consist of a main body and one or more fragments. For fragments, the main part is called their parent. Any fragment could only be entered via its parent or another fragment. 
The short-term goal is to correctly update debug information for split functions, while the long-term goal is to have a complete support including full optimization. Note that if we don't detect split bodies, we would have to add multiple entry points via tail calls, which we would rather avoid. Parent functions and fragments are represented by a `BinaryFunction` and are marked accordingly. For now they are marked as non-simple, and thus only supported in non-relocation mode. Once we start building a CFG, it should be a common graph (i.e. the one that includes all fragments) in the parent function. The function discovery is unchanged, except for the detection of `\.cold\.` pattern in the function name, which automatically marks the function as a fragment of another function. Because of the local function name ambiguity, we cannot rely on the function name to establish child fragment and parent relationship. Instead we rely on disassembly processing. `BinaryContext::getBinaryFunctionContainingAddress()` now returns a parent function if an address from its fragment is passed. There's no jump table support at the moment. Jump tables can have source and destinations in both fragment and parent. Parent functions that enter their fragments via C++ exception handling mechanism are not yet supported. 
(cherry picked from commit fd8519310fccaeab11330bea8fa0a4f0525c4274) --- bolt/src/BinaryContext.cpp | 34 +++++++++++---- bolt/src/BinaryContext.h | 21 +++++----- bolt/src/BinaryFunction.cpp | 48 ++++++++++++++++++---- bolt/src/BinaryFunction.h | 30 +++++++++++++- bolt/src/DWARFRewriter.cpp | 41 ++++++++++++------ bolt/src/RewriteInstance.cpp | 80 ++++++++++++++++++++++++++++-------- 6 files changed, 195 insertions(+), 59 deletions(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index 76a2d69311ba..113229605f81 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -647,7 +647,6 @@ namespace { void findSubprograms(const DWARFDie DIE, std::map &BinaryFunctions) { if (DIE.isSubprogramDIE()) { - // TODO: handle DW_AT_ranges. uint64_t LowPC, HighPC, SectionIndex; if (DIE.getLowAndHighPC(LowPC, HighPC, SectionIndex)) { auto It = BinaryFunctions.find(LowPC); @@ -658,10 +657,11 @@ void findSubprograms(const DWARFDie DIE, } } else { const auto RangesVector = DIE.getAddressRanges(); - if (!RangesVector.empty()) { - errs() << "BOLT-ERROR: split function detected in .debug_info. " - "Split functions are not supported.\n"; - exit(1); + for (const auto Range : DIE.getAddressRanges()) { + auto It = BinaryFunctions.find(Range.LowPC); + if (It != BinaryFunctions.end()) { + It->second.addSubprogramDIE(DIE); + } } } } @@ -1241,8 +1241,8 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF) { BinaryFunction * BinaryContext::getBinaryFunctionContainingAddress(uint64_t Address, - bool CheckPastEnd, - bool UseMaxSize) { + bool CheckPastEnd, + bool UseMaxSize) { auto FI = BinaryFunctions.upper_bound(Address); if (FI == BinaryFunctions.begin()) return nullptr; @@ -1253,7 +1253,25 @@ BinaryContext::getBinaryFunctionContainingAddress(uint64_t Address, if (Address >= FI->first + UsedSize + (CheckPastEnd ? 
1 : 0)) return nullptr; - return &FI->second; + + auto *BF = &FI->second; + while (BF->getParentFunction()) + BF = BF->getParentFunction(); + + return BF; +} + +BinaryFunction * +BinaryContext::getBinaryFunctionAtAddress(uint64_t Address, bool Shallow) { + if (const auto *BD = getBinaryDataAtAddress(Address)) { + if (auto *BF = getFunctionForSymbol(BD->getSymbol())) { + while (BF->getParentFunction() && !Shallow) { + BF = BF->getParentFunction(); + } + return BF; + } + } + return nullptr; } DebugAddressRangesVector BinaryContext::translateModuleAddressRanges( diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index 977cacb518f0..49c77129fe46 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -185,17 +185,16 @@ class BinaryContext { bool CheckPastEnd = false, bool UseMaxSize = false); - /// Return BinaryFunction that starts at a given \p Address. - BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address) { - if (const auto *BD = getBinaryDataAtAddress(Address)) - return getFunctionForSymbol(BD->getSymbol()); - return nullptr; - } + /// Return BinaryFunction which has a fragment that starts at a given + /// \p Address. If the BinaryFunction is a child fragment, then return its + /// parent unless \p Shallow parameter is set to true. + BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address, + bool Shallow = false); - const BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address) const { - if (const auto *BD = getBinaryDataAtAddress(Address)) - return getFunctionForSymbol(BD->getSymbol()); - return nullptr; + const BinaryFunction *getBinaryFunctionAtAddress(uint64_t Address, + bool Shallow = false) const { + return const_cast(this)-> + getBinaryFunctionAtAddress(Address, Shallow); } /// [MCSymbol] -> [BinaryFunction] @@ -284,7 +283,7 @@ class BinaryContext { /// Set of addresses in the code that are not a function start, and are /// referenced from outside of containing function. E.g. 
this could happen /// when a function has more than a single entry point. - std::set InterproceduralReferences; + std::set> InterproceduralReferences; std::unique_ptr Ctx; diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 81e9c65d3624..9eb18e9d8bdc 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -272,12 +272,13 @@ bool DynoStats::lessThan(const DynoStats &Other, uint64_t BinaryFunction::Count = 0; -bool BinaryFunction::hasNameRegex(const std::string &NameRegex) const { +const std::string * +BinaryFunction::hasNameRegex(const std::string &NameRegex) const { Regex MatchName(NameRegex); for (auto &Name : Names) if (MatchName.match(Name)) - return true; - return false; + return &Name; + return nullptr; } std::string BinaryFunction::getDemangledName() const { @@ -455,6 +456,20 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, << "\n IsSplit : " << isSplit() << "\n BB Count : " << size(); + if (IsFragment) { + OS << "\n IsFragment : true"; + } + if (ParentFunction) { + OS << "\n Parent : " << *ParentFunction; + } + if (!Fragments.empty()) { + OS << "\n Fragments : "; + auto Sep = ""; + for (auto *Frag : Fragments) { + OS << Sep << *Frag; + Sep = ", "; + } + } if (hasCFG()) { OS << "\n Hash : " << Twine::utohexstr(hash()); } @@ -950,7 +965,7 @@ BinaryFunction::processIndirectBranch(MCInst &Instruction, // postProcessIndirectBranches() is going to mark the function as non-simple // in this case. if (Value && BC.getSectionForAddress(Value)) - BC.InterproceduralReferences.insert(Value); + BC.InterproceduralReferences.insert(std::make_pair(this, Value)); return IndirectBranchType::POSSIBLE_TAIL_CALL; } @@ -1040,8 +1055,7 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { // a section, it could be an absolute address too. 
auto Section = BC.getSectionForAddress(TargetAddress); if (Section && Section->isText()) { - if (containsAddress(TargetAddress, /*UseMaxSize=*/ - BC.isAArch64())) { + if (containsAddress(TargetAddress, /*UseMaxSize=*/ BC.isAArch64())) { if (TargetAddress != getAddress()) { // The address could potentially escape. Mark it as another entry // point into the function. @@ -1051,7 +1065,8 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { return addEntryPointAtOffset(TargetAddress - getAddress()); } } else { - BC.InterproceduralReferences.insert(TargetAddress); + BC.InterproceduralReferences.insert( + std::make_pair(this, TargetAddress)); } } @@ -1291,7 +1306,8 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { } goto add_instruction; } - BC.InterproceduralReferences.insert(TargetAddress); + BC.InterproceduralReferences.insert( + std::make_pair(this, TargetAddress)); if (opts::Verbosity >= 2 && !IsCall && Size == 2 && !BC.HasRelocations) { errs() << "BOLT-WARNING: relaxed tail call detected at 0x" @@ -1400,7 +1416,8 @@ void BinaryFunction::disassemble(ArrayRef FunctionData) { HasFixedIndirectBranch = true; } else { MIB->convertJmpToTailCall(Instruction, BC.Ctx.get()); - BC.InterproceduralReferences.insert(IndirectTarget); + BC.InterproceduralReferences.insert( + std::make_pair(this, IndirectTarget)); } break; } @@ -3944,6 +3961,9 @@ void BinaryFunction::calculateLoopInfo() { DebugAddressRangesVector BinaryFunction::getOutputAddressRanges() const { DebugAddressRangesVector OutputRanges; + if (IsFragment) + return OutputRanges; + OutputRanges.emplace_back(getOutputAddress(), getOutputAddress() + getOutputSize()); if (isSplit()) { @@ -3952,6 +3972,16 @@ DebugAddressRangesVector BinaryFunction::getOutputAddressRanges() const { cold().getAddress() + cold().getImageSize()); } + if (isSimple()) + return OutputRanges; + + for (auto *Frag : Fragments) { + assert(!Frag->isSimple() && + "fragment of non-simple function should also be non-simple"); + 
OutputRanges.emplace_back(Frag->getOutputAddress(), + Frag->getOutputAddress() + Frag->getOutputSize()); + } + return OutputRanges; } diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index 5d117e1d14d9..52127a180dd1 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -340,6 +340,12 @@ class BinaryFunction { /// Is the function known to exceed its input size? bool IsLarge{false}; + /// True if the function is a fragment of another function. This means that + /// this function could only be entered via its parent or one of its sibling + /// fragments. It could be entered at any basic block. It can also return + /// the control to any basic block of its parent or its sibling. + bool IsFragment{false}; + /// The address for the code for this function in codegen memory. uint64_t ImageAddress{0}; @@ -352,6 +358,12 @@ class BinaryFunction { /// Name for the corresponding cold code section. std::string ColdCodeSectionName; + /// Parent function for split function fragments. + BinaryFunction *ParentFunction{nullptr}; + + /// All fragments for a parent function. + std::unordered_set Fragments; + /// The profile data for the number of times the function was executed. uint64_t ExecutionCount{COUNT_NO_PROFILE}; @@ -665,6 +677,16 @@ class BinaryFunction { /// This is called in disassembled state. void addEntryPoint(uint64_t Address); + void setParentFunction(BinaryFunction *BF) { + assert((!ParentFunction || ParentFunction == BF) && + "cannot have more than one parent function"); + ParentFunction = BF; + } + + void addFragment(BinaryFunction *BF) { + Fragments.insert(BF); + } + /// Return true if there is a registered entry point at a given offset /// into the function. bool hasEntryPointAtOffset(uint64_t Offset) { @@ -1006,7 +1028,7 @@ class BinaryFunction { /// Check if (possibly one out of many) function name matches the given /// regex. 
- bool hasNameRegex(const std::string &NameRegex) const; + const std::string *hasNameRegex(const std::string &NameRegex) const; /// Return a vector of all possible names for the function. const std::vector &getNames() const { @@ -1365,7 +1387,7 @@ class BinaryFunction { } /// Return true if the given address \p PC is inside the function body. - bool containsAddress(uint64_t PC, bool UseMaxSize=false) const { + bool containsAddress(uint64_t PC, bool UseMaxSize = false) const { if (UseMaxSize) return Address <= PC && PC < Address + MaxSize; return Address <= PC && PC < Address + Size; @@ -1742,6 +1764,10 @@ class BinaryFunction { return ImageSize; } + BinaryFunction *getParentFunction() const { + return ParentFunction; + } + /// Set the profile data for the number of times the function was called. BinaryFunction &setExecutionCount(uint64_t Count) { ExecutionCount = Count; diff --git a/bolt/src/DWARFRewriter.cpp b/bolt/src/DWARFRewriter.cpp index e5dd2b26724e..4b3039b2e742 100644 --- a/bolt/src/DWARFRewriter.cpp +++ b/bolt/src/DWARFRewriter.cpp @@ -101,22 +101,39 @@ void DWARFRewriter::updateUnitDebugInfo( case dwarf::DW_TAG_subprogram: { - // The function cannot have multiple ranges on the input. - uint64_t SectionIndex, LowPC, HighPC; - if (DIE.getLowAndHighPC(LowPC, HighPC, SectionIndex)) { - IsFunctionDef = true; - const auto *Function = BC.getBinaryFunctionAtAddress(LowPC); - if (Function && Function->isFolded()) - Function = nullptr; - FunctionStack.push_back(Function); + // Get function address either from ranges or [LowPC, HighPC) pair. + bool UsesRanges = false; + uint64_t Address; + uint64_t SectionIndex, HighPC; + if (!DIE.getLowAndHighPC(Address, HighPC, SectionIndex)) { + auto Ranges = DIE.getAddressRanges(); + // Not a function definition. 
+ if (Ranges.empty()) + break; + + Address = Ranges.front().LowPC; + UsesRanges = true; + } + IsFunctionDef = true; + const auto *Function = BC.getBinaryFunctionAtAddress(Address); + if (Function && Function->isFolded()) + Function = nullptr; + FunctionStack.push_back(Function); + + DebugAddressRangesVector FunctionRanges; + if (Function) + FunctionRanges = Function->getOutputAddressRanges(); + + // Update ranges. + if (UsesRanges) { + updateDWARFObjectAddressRanges(DIE, + RangesSectionsWriter->addRanges(FunctionRanges)); + } else { + // Delay conversion of [LowPC, HighPC) into DW_AT_ranges if possible. const auto *Abbrev = DIE.getAbbreviationDeclarationPtr(); assert(Abbrev && "abbrev expected"); - DebugAddressRangesVector FunctionRanges; - if (Function) - FunctionRanges = Function->getOutputAddressRanges(); - if (FunctionRanges.size() > 1) { convertPending(Abbrev); convertToRanges(DIE, FunctionRanges); diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index d946c10c910a..f1dc44f6201d 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -1094,14 +1094,23 @@ void RewriteInstance::discoverFileObjects() { std::stable_sort(SortedFileSymbols.begin(), SortedFileSymbols.end(), [](const SymbolRef &A, const SymbolRef &B) { - // FUNC symbols have higher precedence. + // FUNC symbols have the highest precedence, while SECTIONs + // have the lowest. 
auto AddressA = cantFail(A.getAddress()); auto AddressB = cantFail(B.getAddress()); - if (AddressA == AddressB) { - return cantFail(A.getType()) == SymbolRef::ST_Function && - cantFail(B.getType()) != SymbolRef::ST_Function; - } - return AddressA < AddressB; + if (AddressA != AddressB) + return AddressA < AddressB; + + auto AType = cantFail(A.getType()); + auto BType = cantFail(B.getType()); + if (AType == SymbolRef::ST_Function && + BType != SymbolRef::ST_Function) + return true; + if (BType == SymbolRef::ST_Debug && + AType != SymbolRef::ST_Debug) + return true; + + return false; }); // For aarch64, the ABI defines mapping symbols so we identify data in the @@ -1565,6 +1574,17 @@ void RewriteInstance::adjustFunctionBoundaries() { BFI != BFE; ++BFI) { auto &Function = BFI->second; + // Check if it's a fragment of a function. + if (auto *FragName = Function.hasNameRegex("\\.cold\\.")) { + static bool PrintedWarning = false; + if (BC->HasRelocations && !PrintedWarning) { + errs() << "BOLT-WARNING: split function detected on input : " + << *FragName <<". The support is limited in relocation mode.\n"; + PrintedWarning = true; + } + Function.IsFragment = true; + } + // Check if there's a symbol or a function with a larger address in the // same section. If there is - it determines the maximum size for the // current function. Otherwise, it is the size of a containing section @@ -2494,19 +2514,43 @@ void RewriteInstance::disassembleFunctions() { // Post-process inter-procedural references ASAP as it may affect // functions we are about to disassemble next. 
- for (const auto Addr : BC->InterproceduralReferences) { + for (auto &Pair : BC->InterproceduralReferences) { + auto *FromBF = Pair.first; + auto Addr = Pair.second; auto *ContainingFunction = BC->getBinaryFunctionContainingAddress(Addr); - if (ContainingFunction && ContainingFunction->getAddress() != Addr) { - ContainingFunction->addEntryPoint(Addr); - if (!BC->HasRelocations) { + if (FromBF == ContainingFunction) + continue; + + if (ContainingFunction) { + // Only a parent function (or a sibling) can reach its fragment. + if (ContainingFunction->IsFragment) { + assert(!FromBF->IsFragment && + "only one cold fragment is supported at this time"); + ContainingFunction->setParentFunction(FromBF); + FromBF->addFragment(ContainingFunction); + if (!BC->HasRelocations) { + ContainingFunction->setSimple(false); + FromBF->setSimple(false); + } if (opts::Verbosity >= 1) { - errs() << "BOLT-WARNING: Function " << *ContainingFunction - << " has internal BBs that are target of a reference located" - << " in another function. Skipping the function.\n"; + outs() << "BOLT-INFO: marking " << *ContainingFunction + << " as a fragment of " << *FromBF << '\n'; + } + continue; + } + + if (ContainingFunction->getAddress() != Addr) { + ContainingFunction->addEntryPoint(Addr); + if (!BC->HasRelocations) { + if (opts::Verbosity >= 1) { + errs() << "BOLT-WARNING: Function " << *ContainingFunction + << " has internal BBs that are target of a reference " + << "located in another function. Skipping the function.\n"; + } + ContainingFunction->setSimple(false); } - ContainingFunction->setSimple(false); } - } else if (!ContainingFunction && Addr) { + } else if (Addr) { // Check if address falls in function padding space - this could be // unmarked data in code. In this case adjust the padding space size. 
auto Section = BC->getSectionForAddress(Addr); @@ -3998,7 +4042,8 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { for (const Elf_Sym &Symbol : cantFail(Obj->symbols(Section))) { auto NewSymbol = Symbol; - const auto *Function = BC->getBinaryFunctionAtAddress(Symbol.st_value); + const auto *Function = BC->getBinaryFunctionAtAddress(Symbol.st_value, + /*Shallow=*/true); // Some section symbols may be mistakenly associated with the first // function emitted in the section. Dismiss if it is a section symbol. if (Function && @@ -4398,7 +4443,8 @@ void RewriteInstance::patchELFDynamic(ELFObjectFile *File) { } uint64_t RewriteInstance::getNewFunctionAddress(uint64_t OldAddress) { - const auto *Function = BC->getBinaryFunctionAtAddress(OldAddress); + const auto *Function = BC->getBinaryFunctionAtAddress(OldAddress, + /*Shallow=*/true); if (!Function) return 0; return Function->getOutputAddress(); From 3aa5e2133f4ed003e22e79429555881f7c7d3253 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 17 Apr 2019 15:17:55 -0700 Subject: [PATCH 530/904] [BOLT] Process CFIs for functions with FDE size mismatch Summary: If a function size indicated in FDE is different from the one in the symbol table, we can keep processing the function as we are using the max size for internal purposes. Typically this happens for assembly-written functions with padding at the end. This padding is not included in FDE, but it is in the symbol table. 
(cherry picked from commit 5f44197c4dd7050254b40335f233b3bbded95fac) --- bolt/src/Exceptions.cpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/bolt/src/Exceptions.cpp b/bolt/src/Exceptions.cpp index 87f1eb853d81..0e459c2ebc9f 100644 --- a/bolt/src/Exceptions.cpp +++ b/bolt/src/Exceptions.cpp @@ -673,17 +673,6 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { return true; const FDE &CurFDE = *I->second; - if (Function.getSize() != CurFDE.getAddressRange()) { - if (opts::Verbosity >= 1) { - errs() << "BOLT-WARNING: CFI information size mismatch for function \"" - << Function << "\"" - << format(": Function size is %dB, CFI covers " - "%dB\n", - Function.getSize(), CurFDE.getAddressRange()); - } - return false; - } - auto LSDA = CurFDE.getLSDAAddress(); Function.setLSDAAddress(LSDA ? *LSDA : 0); @@ -844,7 +833,8 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const { return false; default: if (opts::Verbosity >= 1) { - errs() << "BOLT-WARNING: Unrecognized CFI instruction\n"; + errs() << "BOLT-WARNING: Unrecognized CFI instruction: " + << Instr.Opcode << '\n'; } return false; } From 42624c86069c4f43a37a50172f06e98a3e5bed87 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 17 Apr 2019 18:20:56 -0700 Subject: [PATCH 531/904] [BOLT] Fix non-determinism in shrink wrapping Summary: Iterating over SmallPtrSet is non-deterministic. Change it to SmallSetVector. Similarly, do not sort a vector of ProgramPoint when computing the dominance frontier, as ProgramPoint uses the pointer value to determine order. Use a SmallSetVector there too to avoid duplicates instead of sorting + uniqueing. 
(cherry picked from commit c8df6fd44ed6a686172cdb6702043f8f35ed283e) --- bolt/src/Passes/DominatorAnalysis.h | 14 ++++++-------- bolt/src/Passes/ShrinkWrapping.cpp | 4 ++-- bolt/src/Passes/ShrinkWrapping.h | 2 +- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/bolt/src/Passes/DominatorAnalysis.h b/bolt/src/Passes/DominatorAnalysis.h index ffd045ef3026..3a3f83318b72 100644 --- a/bolt/src/Passes/DominatorAnalysis.h +++ b/bolt/src/Passes/DominatorAnalysis.h @@ -39,30 +39,28 @@ class DominatorAnalysis : InstrsDataflowAnalysis, Backward>(BC, BF) {} virtual ~DominatorAnalysis() {} - SmallVector getDominanceFrontierFor(const MCInst &Dom) { - SmallVector Result; + SmallSetVector getDominanceFrontierFor(const MCInst &Dom) { + SmallSetVector Result; auto DomIdx = this->ExprToIdx[&Dom]; assert(!Backward && "Post-dom frontier not implemented"); for (auto &BB : this->Func) { bool HasDominatedPred = false; bool HasNonDominatedPred = false; - SmallVector Candidates; + SmallSetVector Candidates; this->doForAllSuccsOrPreds(BB, [&](ProgramPoint P) { if ((*this->getStateAt(P))[DomIdx]) { - Candidates.emplace_back(P); + Candidates.insert(P); HasDominatedPred = true; return; } HasNonDominatedPred = true; }); if (HasDominatedPred && HasNonDominatedPred) - Result.append(Candidates.begin(), Candidates.end()); + Result.insert(Candidates.begin(), Candidates.end()); if ((*this->getStateAt(ProgramPoint::getLastPointAt(BB)))[DomIdx] && BB.succ_begin() == BB.succ_end()) - Result.emplace_back(ProgramPoint::getLastPointAt(BB)); + Result.insert(ProgramPoint::getLastPointAt(BB)); } - std::sort(Result.begin(), Result.end()); - Result.erase(std::unique(Result.begin(), Result.end()), Result.end()); return Result; } diff --git a/bolt/src/Passes/ShrinkWrapping.cpp b/bolt/src/Passes/ShrinkWrapping.cpp index 4de7d9fec124..5800b8285d9f 100644 --- a/bolt/src/Passes/ShrinkWrapping.cpp +++ b/bolt/src/Passes/ShrinkWrapping.cpp @@ -771,7 +771,7 @@ void ShrinkWrapping::pruneUnwantedCSRs() { } void 
ShrinkWrapping::computeSaveLocations() { - SavePos = std::vector>(BC.MRI->getNumRegs()); + SavePos = std::vector>(BC.MRI->getNumRegs()); auto &RI = Info.getReachingInsnsBackwards(); auto &DA = Info.getDominatorAnalysis(); auto &SPT = Info.getStackPointerTracking(); @@ -960,7 +960,7 @@ ShrinkWrapping::doRestorePlacement(MCInst *BestPosSave, unsigned CSR, // In case of a critical edge, we need to create extra BBs to host restores // into edges transitioning to the dominance frontier, otherwise we pull these // restores to inside the dominated area. - Frontier = DA.getDominanceFrontierFor(*BestPosSave); + Frontier = DA.getDominanceFrontierFor(*BestPosSave).takeVector(); DEBUG({ dbgs() << "Dumping dominance frontier for "; BC.printInstruction(dbgs(), *BestPosSave); diff --git a/bolt/src/Passes/ShrinkWrapping.h b/bolt/src/Passes/ShrinkWrapping.h index 916e02e6dbb6..cda195ac7439 100644 --- a/bolt/src/Passes/ShrinkWrapping.h +++ b/bolt/src/Passes/ShrinkWrapping.h @@ -306,7 +306,7 @@ class ShrinkWrapping { std::vector PopOffsetByReg; std::vector DomOrder; CalleeSavedAnalysis CSA; - std::vector> SavePos; + std::vector> SavePos; std::vector BestSaveCount; std::vector BestSavePos; From e9af1c333b8380ffbfe8c512594e1aef60960508 Mon Sep 17 00:00:00 2001 From: Brian Gesiak Date: Mon, 22 Apr 2019 11:19:02 -0400 Subject: [PATCH 532/904] [cmake] Only build enabled targets Summary: When attempting to build llvm-bolt with `-DLLVM_ENABLE_TARGETS="X86"`, I encountered an error: ``` CMake Error at cmake/modules/AddLLVM.cmake:559 (add_dependencies): The dependency target "AArch64CommonTableGen" of target "LLVMBOLTTargetAArch64" does not exist. Call Stack (most recent call first): cmake/modules/AddLLVM.cmake:607 (llvm_add_library) tools/llvm-bolt/src/Target/AArch64/CMakeLists.txt:1 (add_llvm_library) ``` The issue is that the `llvm-bolt/src/Target/AArch64` subdirectory is added by CMake unconditionally. 
The LLVM project, on the other hand, only adds the subdirectories that are enabled, by using a `foreach` loop over `LLVM_TARGETS_TO_BUILD`. Copying that same loop, from `llvm/lib/Target/CMakeLists.txt`, to this project avoids the error. (cherry picked from commit b0fa812ca7428bcf61ea8f1d6b4130fed56a028f) --- bolt/src/Target/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bolt/src/Target/CMakeLists.txt b/bolt/src/Target/CMakeLists.txt index f8e7a0bd9a0c..d929366e7fd2 100644 --- a/bolt/src/Target/CMakeLists.txt +++ b/bolt/src/Target/CMakeLists.txt @@ -1,2 +1,4 @@ -add_subdirectory(AArch64) -add_subdirectory(X86) +foreach(t ${LLVM_TARGETS_TO_BUILD}) + message(STATUS "Targeting llvm-bolt ${t}") + add_subdirectory(${t}) +endforeach() From 6677f61df2e2dcbdd70014db5970c8b040c3bf2b Mon Sep 17 00:00:00 2001 From: Brian Gesiak Date: Mon, 22 Apr 2019 11:27:50 -0400 Subject: [PATCH 533/904] Fix casting issues on macOS Summary: `size_t` is platform-dependent, and on macOS it is defined as `unsigned long long`. This is not the same type as is used in many calls to templated functions that expect the same type. As a result, on macOS, calls to `std::max` fail because a template function that takes `uint64_t, unsigned long long` cannot be found. To work around the issue: * Specify explicit `std::max` and `std::min` functions where necessary, to work around the compiler trying (and failing) to find a suitable instantiation. * For lambda return types, specify an explicit return type where necessary. * For `operator ==()` calls, use an explicit cast where necessary. 
(cherry picked from commit e3d1d940ab241f1c7794f27b7996582349176145) --- bolt/src/BinaryContext.cpp | 4 ++-- bolt/src/Passes/Aligner.cpp | 6 ++++-- bolt/src/Passes/BinaryPasses.cpp | 3 ++- bolt/src/Passes/CachePlusReorderAlgorithm.cpp | 2 +- bolt/src/Passes/IndirectCallPromotion.cpp | 12 ++++++------ bolt/src/ProfileReader.cpp | 3 ++- 6 files changed, 17 insertions(+), 13 deletions(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index 113229605f81..deb3086a2b38 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -590,14 +590,14 @@ void BinaryContext::printGlobalSymbols(raw_ostream& OS) const { } void BinaryContext::assignMemData() { - auto getAddress = [&](const MemInfo &MI) { + auto getAddress = [&](const MemInfo &MI) -> uint64_t { if (!MI.Addr.IsSymbol) return MI.Addr.Offset; if (auto *BD = getBinaryDataByName(MI.Addr.Name)) return BD->getAddress() + MI.Addr.Offset; - return 0ul; + return 0; }; // Map of sections (or heap/stack) to count/size. 
diff --git a/bolt/src/Passes/Aligner.cpp b/bolt/src/Passes/Aligner.cpp index a261de569370..c18c75e3cd9d 100644 --- a/bolt/src/Passes/Aligner.cpp +++ b/bolt/src/Passes/Aligner.cpp @@ -120,7 +120,8 @@ void AlignerPass::alignBlocks(BinaryFunction &Function) { const auto &BC = Function.getBinaryContext(); - const auto FuncCount = std::max(1UL, Function.getKnownExecutionCount()); + const auto FuncCount = + std::max(1, Function.getKnownExecutionCount()); BinaryBasicBlock *PrevBB{nullptr}; for (auto *BB : Function.layout()) { auto Count = BB->getKnownExecutionCount(); @@ -140,7 +141,8 @@ void AlignerPass::alignBlocks(BinaryFunction &Function) { continue; const auto BlockSize = BC.computeCodeSize(BB->begin(), BB->end()); - const auto BytesToUse = std::min(opts::BlockAlignment - 1UL, BlockSize); + const auto BytesToUse = + std::min(opts::BlockAlignment - 1, BlockSize); if (opts::AlignBlocksMinSize && BlockSize < opts::AlignBlocksMinSize) continue; diff --git a/bolt/src/Passes/BinaryPasses.cpp b/bolt/src/Passes/BinaryPasses.cpp index 62dc7e2e717a..e284a164475b 100644 --- a/bolt/src/Passes/BinaryPasses.cpp +++ b/bolt/src/Passes/BinaryPasses.cpp @@ -1438,7 +1438,8 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC) { if (!BF.hasValidProfile()) continue; - const auto HotThreshold = std::max(BF.getKnownExecutionCount(), 1UL); + const auto HotThreshold = + std::max(BF.getKnownExecutionCount(), 1); bool HotSeen = false; for (const auto *BB : BF.rlayout()) { if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) { diff --git a/bolt/src/Passes/CachePlusReorderAlgorithm.cpp b/bolt/src/Passes/CachePlusReorderAlgorithm.cpp index 52db989c9105..5148220e7752 100644 --- a/bolt/src/Passes/CachePlusReorderAlgorithm.cpp +++ b/bolt/src/Passes/CachePlusReorderAlgorithm.cpp @@ -295,7 +295,7 @@ class CachePlus { Size.reserve(BF.layout_size()); for (auto BB : BF.layout()) { size_t Index = BB->getLayoutIndex(); - Size.push_back(std::max(BB->estimateSize(), size_t(1))); + 
Size.push_back(std::max(BB->estimateSize(), 1)); AllClusters.emplace_back(BB, ExecutionCounts[Index], Size[Index]); Clusters.push_back(&AllClusters[Index]); CurCluster.push_back(&AllClusters[Index]); diff --git a/bolt/src/Passes/IndirectCallPromotion.cpp b/bolt/src/Passes/IndirectCallPromotion.cpp index 68d2fe685be5..628ca815949d 100644 --- a/bolt/src/Passes/IndirectCallPromotion.cpp +++ b/bolt/src/Passes/IndirectCallPromotion.cpp @@ -1444,12 +1444,12 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) { << "BOLT-INFO: ICP percentage of indirect calls that can be " "optimized = " << format("%.1f", (100.0 * TotalNumFrequentCalls) / - std::max(TotalIndirectCalls, 1ul)) + std::max(TotalIndirectCalls, 1)) << "%\n" << "BOLT-INFO: ICP percentage of indirect callsites that are " "optimized = " << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) / - std::max(TotalIndirectCallsites, 1ul)) + std::max(TotalIndirectCallsites, 1)) << "%\n" << "BOLT-INFO: ICP number of method load elimination candidates = " << TotalMethodLoadEliminationCandidates @@ -1457,17 +1457,17 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) { << "BOLT-INFO: ICP percentage of method calls candidates that have " "loads eliminated = " << format("%.1f", (100.0 * TotalMethodLoadsEliminated) / - std::max(TotalMethodLoadEliminationCandidates, 1ul)) + std::max(TotalMethodLoadEliminationCandidates, 1)) << "%\n" << "BOLT-INFO: ICP percentage of indirect branches that are " "optimized = " << format("%.1f", (100.0 * TotalNumFrequentJmps) / - std::max(TotalIndirectJmps, 1ul)) + std::max(TotalIndirectJmps, 1)) << "%\n" << "BOLT-INFO: ICP percentage of jump table callsites that are " << "optimized = " << format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) / - std::max(TotalJumpTableCallsites, 1ul)) + std::max(TotalJumpTableCallsites, 1)) << "%\n" << "BOLT-INFO: ICP number of jump table callsites that can use hot " << "indices = " << TotalIndexBasedCandidates @@ -1475,7 +1475,7 
@@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) { << "BOLT-INFO: ICP percentage of jump table callsites that use hot " "indices = " << format("%.1f", (100.0 * TotalIndexBasedJumps) / - std::max(TotalIndexBasedCandidates, 1ul)) + std::max(TotalIndexBasedCandidates, 1)) << "%\n"; #ifndef NDEBUG diff --git a/bolt/src/ProfileReader.cpp b/bolt/src/ProfileReader.cpp index 9851814fcbfd..7e3d38be5e4a 100644 --- a/bolt/src/ProfileReader.cpp +++ b/bolt/src/ProfileReader.cpp @@ -268,7 +268,8 @@ ProfileReader::readProfile(const std::string &FileName, BinaryFunction &BF) { if (opts::IgnoreHash && Profile.NumBasicBlocks == BF.size()) return true; - if (!opts::IgnoreHash && Profile.Hash == BF.hash(/*Recompute = */false)) + if (!opts::IgnoreHash && + Profile.Hash == static_cast(BF.hash(/*Recompute = */false))) return true; return false; }; From 9b74d2268e0cdaa4d7e5f6dddb10c2664c5699ee Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Thu, 18 Apr 2019 16:32:22 -0700 Subject: [PATCH 534/904] [BOLT] Update symbols for secondary entry points Summary: Update the output ELF symbol table for symbols representing secondary entry points for functions. Previously, those were left unchanged in the symtab. 
(cherry picked from commit 330f12580c1ef1f0ce313d49c6e23a5d67cf6e02) --- bolt/src/BinaryContext.cpp | 6 +++++- bolt/src/BinaryContext.h | 4 +++- bolt/src/RewriteInstance.cpp | 38 ++++++++++++++++++++++++++++-------- 3 files changed, 38 insertions(+), 10 deletions(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index deb3086a2b38..62c71af75d6e 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -1242,7 +1242,8 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF) { BinaryFunction * BinaryContext::getBinaryFunctionContainingAddress(uint64_t Address, bool CheckPastEnd, - bool UseMaxSize) { + bool UseMaxSize, + bool Shallow) { auto FI = BinaryFunctions.upper_bound(Address); if (FI == BinaryFunctions.begin()) return nullptr; @@ -1255,6 +1256,9 @@ BinaryContext::getBinaryFunctionContainingAddress(uint64_t Address, return nullptr; auto *BF = &FI->second; + if (Shallow) + return BF; + while (BF->getParentFunction()) BF = BF->getParentFunction(); diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index 49c77129fe46..d25fb4d46d50 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -56,6 +56,7 @@ using namespace object; namespace bolt { class BinaryFunction; +class BinaryBasicBlock; class DataReader; /// Helper function to truncate a \p Value to given size in \p Bytes. @@ -183,7 +184,8 @@ class BinaryContext { /// body and the next object in address ranges that we check. BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address, bool CheckPastEnd = false, - bool UseMaxSize = false); + bool UseMaxSize = false, + bool Shallow = false); /// Return BinaryFunction which has a fragment that starts at a given /// \p Address. 
If the BinaryFunction is a child fragment, then return its diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index f1dc44f6201d..b850680287b5 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -3307,8 +3307,9 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { Layout.getSymbolOffset(*Function.getFunctionEndLabel())); } - // Update basic block output ranges only for the debug info. - if (!opts::UpdateDebugSections) + // Update basic block output ranges only for the debug info or if we have + // secondary entry points in the symbol table to update + if (!opts::UpdateDebugSections && !Function.isMultiEntry()) return; // Output ranges should match the input if the body hasn't changed. @@ -4042,11 +4043,17 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { for (const Elf_Sym &Symbol : cantFail(Obj->symbols(Section))) { auto NewSymbol = Symbol; - const auto *Function = BC->getBinaryFunctionAtAddress(Symbol.st_value, - /*Shallow=*/true); + + const auto *Function = + BC->getBinaryFunctionContainingAddress(NewSymbol.st_value, + /*CheckPastEnd=*/false, + /*UseMaxSize=*/true, + /*Shallow=*/true); + // Some section symbols may be mistakenly associated with the first // function emitted in the section. Dismiss if it is a section symbol. if (Function && + Function->getAddress() == NewSymbol.st_value && !Function->getPLTSymbol() && NewSymbol.getType() != ELF::STT_SECTION) { NewSymbol.st_value = Function->getOutputAddress(); @@ -4102,9 +4109,24 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { uint32_t OldSectionIndex = NewSymbol.st_shndx; auto *BD = !Function ? BC->getBinaryDataAtAddress(NewSymbol.st_value) : nullptr; - if (BD && BD->isMoved() && !BD->isJumpTable()) { - assert((!BD->getSize() || - !NewSymbol.st_size || + auto Output = + Function && !Function->getPLTSymbol() + ? 
Function->translateInputToOutputAddress(NewSymbol.st_value) + : 0; + + // Handle secondary entry points for this function + // (when Function->getAddress() != Symbol.st_value) + if (Output && NewSymbol.getType() != ELF::STT_SECTION) { + NewSymbol.st_value = Output; + // Force secondary entry points to have zero size + NewSymbol.st_size = 0; + NewSymbol.st_shndx = Output >= Function->cold().getAddress() && + Output < Function->cold().getImageSize() + ? Function->getColdCodeSection()->getIndex() + : Function->getCodeSection()->getIndex(); + OldSectionIndex = ELF::SHN_LORESERVE; + } else if (BD && BD->isMoved() && !BD->isJumpTable()) { + assert((!BD->getSize() || !NewSymbol.st_size || NewSymbol.st_size == BD->getSize()) && "sizes must match"); @@ -4133,7 +4155,7 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { if (NewSymbol.getType() == ELF::STT_NOTYPE && NewSymbol.getBinding() == ELF::STB_LOCAL && NewSymbol.st_size == 0) { - auto ExpectedSec = File->getELFFile()->getSection(OldSectionIndex); + auto ExpectedSec = File->getELFFile()->getSection(Symbol.st_shndx); if (ExpectedSec) { auto Section = *ExpectedSec; if (Section->sh_type == ELF::SHT_PROGBITS && From 7b4b866c92d0246bf7184c4770e7e731032fd90c Mon Sep 17 00:00:00 2001 From: Brian Gesiak Date: Wed, 24 Apr 2019 11:24:15 -0400 Subject: [PATCH 535/904] [BOLT] Minimize BOLT's diff with LLVM by removing trivial changes (NFC) Summary: BOLT works as a series of patches rebased onto upstream LLVM at revision `f137ed238db`. Some of these patches introduce unnecessary whitespace changes or includes. Remove these to minimize the diff with upstream LLVM. 
(cherry picked from commit 46fcef4d2e298d9f075da563871183ead431a085) --- bolt/llvm.patch | 114 ++++++++++-------------------------------------- 1 file changed, 22 insertions(+), 92 deletions(-) diff --git a/bolt/llvm.patch b/bolt/llvm.patch index 64904bb34d68..4d59eb0443bd 100644 --- a/bolt/llvm.patch +++ b/bolt/llvm.patch @@ -848,7 +848,7 @@ index 8e9b4ac5632..d2c569e3399 100644 SMLoc Loc) override; void diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h -index 582a836023b..0b15454ecd6 100644 +index 582a836023b..f1e341bd624 100644 --- a/include/llvm/MC/MCStreamer.h +++ b/include/llvm/MC/MCStreamer.h @@ -199,7 +199,7 @@ class MCStreamer { @@ -860,17 +860,6 @@ index 582a836023b..0b15454ecd6 100644 /// \brief This is stack of current and previous section values saved by /// PushSection. -@@ -290,8 +290,8 @@ public: - /// If the comment includes embedded \n's, they will each get the comment - /// prefix as appropriate. The added comment should not end with a \n. - /// By default, each comment is terminated with an end of line, i.e. the -- /// EOL param is set to true by default. If one prefers not to end the -- /// comment with a new line then the EOL param should be passed -+ /// EOL param is set to true by default. If one prefers not to end the -+ /// comment with a new line then the EOL param should be passed - /// with a false value. - virtual void AddComment(const Twine &T, bool EOL = true) {} - @@ -338,9 +338,7 @@ public: /// \brief Returns an index to represent the order a symbol was emitted in. 
@@ -1009,11 +998,10 @@ index 46504e74bc2..836fd8ddc45 100644 Expected sections() const; Expected symbols(const Elf_Shdr *Sec) const { -@@ -396,6 +408,34 @@ void ELFFile::getRelocationTypeName(uint32_t Type, - } +@@ -397,6 +409,34 @@ void ELFFile::getRelocationTypeName(uint32_t Type, } -+template + template +Expected::Elf_Dyn *> +ELFFile::dynamic_table_begin(const Elf_Phdr *Phdr) const { + if (!Phdr) @@ -1041,9 +1029,10 @@ index 46504e74bc2..836fd8ddc45 100644 + return reinterpret_cast(base() + End); +} + - template ++template Expected ELFFile::getRelocationSymbol(const Elf_Rel *Rel, + const Elf_Shdr *SymTab) const { diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h index 4d001039238..62837bbcaa0 100644 --- a/include/llvm/Object/ELFObjectFile.h @@ -1056,11 +1045,10 @@ index 4d001039238..62837bbcaa0 100644 relocation_iterator section_rel_begin(DataRefImpl Sec) const override; relocation_iterator section_rel_end(DataRefImpl Sec) const override; section_iterator getRelocatedSection(DataRefImpl Sec) const override; -@@ -716,6 +717,14 @@ bool ELFObjectFile::isSectionVirtual(DataRefImpl Sec) const { - return getSection(Sec)->sh_type == ELF::SHT_NOBITS; +@@ -717,6 +718,14 @@ bool ELFObjectFile::isSectionVirtual(DataRefImpl Sec) const { } -+template + template +bool ELFObjectFile::isSectionReadOnly(DataRefImpl Sec) const { + const Elf_Shdr *EShdr = getSection(Sec); + return EShdr->sh_flags & ELF::SHF_ALLOC && @@ -1068,9 +1056,10 @@ index 4d001039238..62837bbcaa0 100644 + EShdr->sh_type == ELF::SHT_PROGBITS; +} + - template ++template relocation_iterator ELFObjectFile::section_rel_begin(DataRefImpl Sec) const { + DataRefImpl RelData; @@ -751,9 +760,6 @@ ELFObjectFile::section_rel_end(DataRefImpl Sec) const { template section_iterator @@ -1101,7 +1090,7 @@ index 4d001039238..62837bbcaa0 100644 if (sec->sh_type == ELF::SHT_REL) return getRel(Rel)->r_offset; diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h -index 
bfd3462bf69..9be0b260f34 100644 +index bfd3462bf69..52bc210b577 100644 --- a/include/llvm/Object/MachO.h +++ b/include/llvm/Object/MachO.h @@ -320,6 +320,7 @@ public: @@ -1112,15 +1101,6 @@ index bfd3462bf69..9be0b260f34 100644 relocation_iterator section_rel_begin(DataRefImpl Sec) const override; relocation_iterator section_rel_end(DataRefImpl Sec) const override; -@@ -331,7 +332,7 @@ public: - - relocation_iterator locrel_begin() const; - relocation_iterator locrel_end() const; -- -+ - void moveRelocationNext(DataRefImpl &Rel) const override; - uint64_t getRelocationOffset(DataRefImpl Rel) const override; - symbol_iterator getRelocationSymbol(DataRefImpl Rel) const override; diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h index 9c4ae94d3a6..64342723371 100644 --- a/include/llvm/Object/ObjectFile.h @@ -1215,18 +1195,9 @@ index d11f5a83779..0ad115c886b 100644 /// FD is the file descriptor that this writes to. If ShouldClose is true, /// this closes the file when the stream is destroyed. If FD is for stdout or diff --git a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp -index adada672af0..c9c79971a25 100644 +index adada672af0..b3d68ed66af 100644 --- a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp +++ b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp -@@ -38,7 +38,7 @@ DWARFAbbreviationDeclaration::DWARFAbbreviationDeclaration() { - } - - bool --DWARFAbbreviationDeclaration::extract(DataExtractor Data, -+DWARFAbbreviationDeclaration::extract(DataExtractor Data, - uint32_t* OffsetPtr) { - clear(); - const uint32_t Offset = *OffsetPtr; @@ -61,13 +61,15 @@ DWARFAbbreviationDeclaration::extract(DataExtractor Data, // Read all of the abbreviation attributes and forms. 
@@ -1587,7 +1558,7 @@ index 3d274b63a4f..cef29f4b41d 100644 StringRef RuntimeDyld::getErrorString() { return Dyld->getErrorString(); } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp -index 36b43ec9b78..3dc3e8f325c 100644 +index 36b43ec9b78..1a56e590014 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -270,6 +270,25 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section, @@ -1688,15 +1659,6 @@ index 36b43ec9b78..3dc3e8f325c 100644 resolveAArch64Branch(SectionID, Value, RelI, Stubs); } else if (RelType == ELF::R_AARCH64_ADR_GOT_PAGE) { // Craete new GOT entry or find existing one. If GOT entry is -@@ -1410,7 +1478,7 @@ RuntimeDyldELF::processRelocationRef( - } else { - processSimpleRelocation(SectionID, Offset, RelType, Value); - } -- -+ - } else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le) { - if (RelType == ELF::R_PPC64_REL24) { - // Determine ABI variant in use for this object. @@ -1632,7 +1700,7 @@ RuntimeDyldELF::processRelocationRef( // equivalent to the usual PLT implementation except that we use the stub // mechanism in RuntimeDyld (which puts stubs at the end of the section) @@ -1819,18 +1781,10 @@ index a0f9a857e3c..be32963b705 100644 assert((cast(F).getValue() == 0) && "Invalid fill in virtual section!"); diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp -index 0e0ea965d14..0044566d9ab 100644 +index 0e0ea965d14..49885269d06 100644 --- a/lib/MC/MCDwarf.cpp +++ b/lib/MC/MCDwarf.cpp -@@ -41,6 +41,7 @@ - #include - #include - #include -+#include - #include - #include - -@@ -156,12 +157,36 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section, +@@ -156,12 +156,36 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section, unsigned Flags = DWARF2_LINE_DEFAULT_IS_STMT ? 
DWARF2_FLAG_IS_STMT : 0; unsigned Isa = 0; unsigned Discriminator = 0; @@ -1868,7 +1822,7 @@ index 0e0ea965d14..0044566d9ab 100644 if (FileNum != LineEntry.getFileNum()) { FileNum = LineEntry.getFileNum(); MCOS->EmitIntValue(dwarf::DW_LNS_set_file, 1); -@@ -197,18 +222,33 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section, +@@ -197,18 +221,33 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section, if (LineEntry.getFlags() & DWARF2_FLAG_EPILOGUE_BEGIN) MCOS->EmitIntValue(dwarf::DW_LNS_set_epilogue_begin, 1); @@ -1910,7 +1864,7 @@ index 0e0ea965d14..0044566d9ab 100644 } // Emit a DW_LNE_end_sequence for the end of the section. -@@ -250,7 +290,7 @@ void MCDwarfLineTable::Emit(MCObjectStreamer *MCOS, +@@ -250,7 +289,7 @@ void MCDwarfLineTable::Emit(MCObjectStreamer *MCOS, MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfLineSection()); // Handle the rest of the Compile Units. @@ -1919,16 +1873,7 @@ index 0e0ea965d14..0044566d9ab 100644 CUIDTablePair.second.EmitCU(MCOS, Params, LineStr); if (LineStr) -@@ -484,7 +524,7 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params, - - // Parameters of the state machine, are next. - MCOS->EmitIntValue(context.getAsmInfo()->getMinInstAlignment(), 1); -- // maximum_operations_per_instruction -+ // maximum_operations_per_instruction - // For non-VLIW architectures this field is always 1. - // FIXME: VLIW architectures need to update this field accordingly. - if (LineTableVersion >= 4) -@@ -514,8 +554,12 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params, +@@ -514,8 +553,12 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params, void MCDwarfLineTable::EmitCU(MCObjectStreamer *MCOS, MCDwarfLineTableParams Params, @@ -1943,7 +1888,7 @@ index 0e0ea965d14..0044566d9ab 100644 // Put out the line tables. 
for (const auto &LineSec : MCLineSections.getMCLineEntries()) -@@ -1253,12 +1297,217 @@ public: +@@ -1253,12 +1296,217 @@ public: void EmitCFIInstruction(const MCCFIInstruction &Instr); }; @@ -2161,7 +2106,7 @@ index 0e0ea965d14..0044566d9ab 100644 void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) { int dataAlignmentFactor = getDataAlignmentFactor(Streamer); auto *MRI = Streamer.getContext().getRegisterInfo(); -@@ -1373,7 +1622,28 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) { +@@ -1373,7 +1621,28 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) { Streamer.EmitIntValue(dwarf::DW_CFA_GNU_args_size, 1); Streamer.EmitULEB128IntValue(Instr.getOffset()); return; @@ -2286,7 +2231,7 @@ index 0a684588110..58199c97420 100644 unsigned char Value, SMLoc Loc) { diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp -index 776569894a5..0954b70df49 100644 +index 776569894a5..aa130bb2d6a 100644 --- a/lib/MC/MCStreamer.cpp +++ b/lib/MC/MCStreamer.cpp @@ -85,11 +85,15 @@ void MCStreamer::reset() { @@ -2329,15 +2274,6 @@ index 776569894a5..0954b70df49 100644 } void MCStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) { -@@ -513,7 +524,7 @@ void MCStreamer::EmitCFIEscape(StringRef Values) { - - void MCStreamer::EmitCFIGnuArgsSize(int64_t Size) { - MCSymbol *Label = EmitCFILabel(); -- MCCFIInstruction Instruction = -+ MCCFIInstruction Instruction = - MCCFIInstruction::createGnuArgsSize(Label, Size); - MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); - if (!CurFrame) @@ -884,6 +895,14 @@ void MCStreamer::visitUsedExpr(const MCExpr &Expr) { } } @@ -2363,16 +2299,10 @@ index 776569894a5..0954b70df49 100644 SMLoc Loc) {} void MCStreamer::EmitBundleAlignMode(unsigned AlignPow2) {} diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp -index b544fa5c147..746c9f32865 100644 +index b544fa5c147..c885bf9f037 100644 --- a/lib/Object/COFFObjectFile.cpp +++ 
b/lib/Object/COFFObjectFile.cpp -@@ -339,11 +339,16 @@ unsigned COFFObjectFile::getSectionID(SectionRef Sec) const { - - bool COFFObjectFile::isSectionVirtual(DataRefImpl Ref) const { - const coff_section *Sec = toSec(Ref); -- // In COFF, a virtual section won't have any in-file -+ // In COFF, a virtual section won't have any in-file - // content, so the file pointer to the content will be zero. +@@ -344,6 +344,11 @@ bool COFFObjectFile::isSectionVirtual(DataRefImpl Ref) const { return Sec->PointerToRawData == 0; } From 382af1044186164b7568e047b762d5da4d5d808f Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 25 Apr 2019 17:00:05 -0700 Subject: [PATCH 536/904] [BOLT] Automatically enable -hot-text Summary: Enable -hot-text by default if reordering functions. Also fail immediately if function reordering is specified on the command line in non-relocation mode. (cherry picked from commit c81cf6a2f9097409460b32229d6a291ce9849fc1) --- bolt/src/Passes/ReorderFunctions.cpp | 6 ------ bolt/src/RewriteInstance.cpp | 20 ++++++++++++++++---- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/bolt/src/Passes/ReorderFunctions.cpp b/bolt/src/Passes/ReorderFunctions.cpp index 879ca581f48d..2ae600d65a2d 100644 --- a/bolt/src/Passes/ReorderFunctions.cpp +++ b/bolt/src/Passes/ReorderFunctions.cpp @@ -277,12 +277,6 @@ std::vector readFunctionOrderFile() { } void ReorderFunctions::runOnFunctions(BinaryContext &BC) { - if (!BC.HasRelocations && opts::ReorderFunctions != RT_NONE) { - errs() << "BOLT-ERROR: Function reordering only works when " - << "relocs are enabled.\n"; - exit(1); - } - auto &BFs = BC.getBinaryFunctions(); if (opts::ReorderFunctions != RT_NONE && opts::ReorderFunctions != RT_EXEC_COUNT && diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index b850680287b5..0c3a51d28e32 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -10,6 +10,7 @@ 
//===----------------------------------------------------------------------===// +#include "RewriteInstance.h" #include "BinaryBasicBlock.h" #include "BinaryContext.h" #include "BinaryFunction.h" @@ -21,10 +22,10 @@ #include "Exceptions.h" #include "ExecutableFileMemoryManager.h" #include "MCPlusBuilder.h" +#include "Passes/ReorderFunctions.h" #include "ProfileReader.h" #include "ProfileWriter.h" #include "Relocation.h" -#include "RewriteInstance.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/BinaryFormat/Dwarf.h" @@ -89,6 +90,7 @@ extern cl::OptionCategory AggregatorCategory; extern cl::opt AlignMacroOpFusion; extern cl::opt JumpTables; extern cl::list ReorderData; +extern cl::opt ReorderFunctions; static cl::opt ForceToDataRelocations("force-data-relocations", @@ -1799,7 +1801,7 @@ void RewriteInstance::adjustCommandLineOptions() { } if (opts::SplitEH && !BC->HasRelocations) { - outs() << "BOLT-WARNING: disabling -split-eh in non-relocation mode\n"; + errs() << "BOLT-WARNING: disabling -split-eh in non-relocation mode\n"; opts::SplitEH = false; } @@ -1812,8 +1814,18 @@ void RewriteInstance::adjustCommandLineOptions() { opts::AlignMacroOpFusion = MFT_ALL; } - if (opts::HotText && !BC->HasRelocations) { - outs() << "BOLT-WARNING: hot text is disabled in non-relocation mode\n"; + if (!BC->HasRelocations && + opts::ReorderFunctions != ReorderFunctions::RT_NONE) { + errs() << "BOLT-ERROR: function reordering only works when " + << "relocations are enabled\n"; + exit(1); + } + + if (opts::ReorderFunctions != ReorderFunctions::RT_NONE && + !opts::HotText.getNumOccurrences()) { + opts::HotText = true; + } else if (opts::HotText && !BC->HasRelocations) { + errs() << "BOLT-WARNING: hot text is disabled in non-relocation mode\n"; opts::HotText = false; } From 476fcb0c5f831b5c944e1443df36f0939e96990b Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 25 Apr 2019 16:34:50 -0700 Subject: [PATCH 537/904] [perf2bolt] Fix print report for 
pre-aggregated profile Summary: For pre-aggregated profile, we were using the number of records in the profile for `NumTraces` ignoring the counts per record. As a result, the reported percentage of mismatched traces was bogus. (cherry picked from commit cbee2be4c8efb229e7fc89ad34fda4ebecee5515) --- bolt/src/DataAggregator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index 89d0e687affa..f99daf4e7d56 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -1407,7 +1407,7 @@ void DataAggregator::processPreAggregated() { AggrEntry.From.Offset, false}; LBREntry Second{AggrEntry.To.Offset, AggrEntry.To.Offset, false}; doTrace(First, Second, AggrEntry.Count); - ++NumTraces; + NumTraces += AggrEntry.Count; break; } } From c6de6a8a94fd0c09196c6840466e8c6a65d350d9 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 26 Apr 2019 16:32:28 -0700 Subject: [PATCH 538/904] [BOLT] Fix profile reading in non-reloc mode Summary: In non-relocation mode we may execute multiple re-write passes either because we need to split large functions or update debug information for large functions (in this context large functions are functions that do not fit into the original function boundaries after optimizations). When we execute another pass, we reset RewriteInstance and run most of the steps such as disassembly and profile matching for the 2nd or 3rd time. However, when we match a profile, we check `Used` flag, and don't use the profile for the 2nd time. Since we didn't reset the flag while resetting the rest of the states, we ignored profile for all functions. Resetting the flag in-between rewrite passes solves the problem. 
(cherry picked from commit fad44e2c50ae24d5d14f598af7dc5ee5a353e8eb) --- bolt/src/DataReader.cpp | 9 +++++++++ bolt/src/DataReader.h | 3 +++ bolt/src/RewriteInstance.cpp | 1 + 3 files changed, 13 insertions(+) diff --git a/bolt/src/DataReader.cpp b/bolt/src/DataReader.cpp index 65f67c3bce7d..c84773467a99 100644 --- a/bolt/src/DataReader.cpp +++ b/bolt/src/DataReader.cpp @@ -251,6 +251,15 @@ void FuncMemData::update(const Location &Offset, const Location &Addr) { ++Data[Iter->second].Count; } +void DataReader::reset() { + for (auto &Pair : getAllFuncsBranchData()) { + Pair.second.Used = false; + } + for (auto &Pair : getAllFuncsMemData()) { + Pair.second.Used = false; + } +} + ErrorOr> DataReader::readPerfData(StringRef Path, raw_ostream &Diag) { auto MB = MemoryBuffer::getFileOrSTDIN(Path); diff --git a/bolt/src/DataReader.h b/bolt/src/DataReader.h index 50b901b9f5b5..342db39c12d6 100644 --- a/bolt/src/DataReader.h +++ b/bolt/src/DataReader.h @@ -303,6 +303,9 @@ class DataReader { static ErrorOr> readPerfData(StringRef Path, raw_ostream &Diag); + /// Mark all profile objects unused. + void reset(); + /// Parses the input bolt data file into internal data structures. We expect /// the file format to follow the syntax below. 
/// diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 0c3a51d28e32..8c2fbad93c03 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -725,6 +725,7 @@ RewriteInstance::~RewriteInstance() {} void RewriteInstance::reset() { FileSymRefs.clear(); auto &DR = BC->DR; + DR.reset(); BC = createBinaryContext( InputFile, DR, DWARFContext::create(*InputFile, nullptr, From 6ee086734843a35c4a571eed49d9b122aaa6402c Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 26 Apr 2019 19:52:36 -0700 Subject: [PATCH 539/904] [BOLT] Fix symboltable update bug Summary: Commit "Update symbols for secondary entry points" introduced a bug by using getBinaryFunctionContainingAddress() instead of getBinaryFunctionAtAddress() regarding ICF'd functions. Only the latter would fetch the correct BinaryFunction object for addresses of functions that were ICF'd. As a result of this bug, the dynamic symbol table was not updated for function symbols that were folded by ICF. (cherry picked from commit 61924ee3f52cf7f6e89575413c3718f10b769d98) --- bolt/src/RewriteInstance.cpp | 55 +++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 8c2fbad93c03..57dfea171ac4 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -4057,16 +4057,12 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { for (const Elf_Sym &Symbol : cantFail(Obj->symbols(Section))) { auto NewSymbol = Symbol; - const auto *Function = - BC->getBinaryFunctionContainingAddress(NewSymbol.st_value, - /*CheckPastEnd=*/false, - /*UseMaxSize=*/true, - /*Shallow=*/true); + const auto *Function = BC->getBinaryFunctionAtAddress(Symbol.st_value, + /*Shallow=*/true); // Some section symbols may be mistakenly associated with the first // function emitted in the section. Dismiss if it is a section symbol. 
if (Function && - Function->getAddress() == NewSymbol.st_value && !Function->getPLTSymbol() && NewSymbol.getType() != ELF::STT_SECTION) { NewSymbol.st_value = Function->getOutputAddress(); @@ -4120,6 +4116,28 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { } } else { uint32_t OldSectionIndex = NewSymbol.st_shndx; + // Check if the original section where this symbol links to is + // code that we may have reordered. + auto ExpectedSec = File->getELFFile()->getSection(Symbol.st_shndx); + bool IsCodeSym{false}; + if (ExpectedSec) { + auto Section = *ExpectedSec; + IsCodeSym = (Section->sh_type == ELF::SHT_PROGBITS && + Section->sh_flags & ELF::SHF_ALLOC && + Section->sh_flags & ELF::SHF_EXECINSTR && + !(Section->sh_flags & ELF::SHF_WRITE)); + } else { + consumeError(ExpectedSec.takeError()); + } + // Try to fetch a containing function to check if this symbol is + // a secondary entry point of it + if (!Function && IsCodeSym && NewSymbol.getType() == ELF::STT_FUNC) { + Function = + BC->getBinaryFunctionContainingAddress(NewSymbol.st_value, + /*CheckPastEnd=*/false, + /*UseMaxSize=*/true, + /*Shallow=*/true); + } auto *BD = !Function ? BC->getBinaryDataAtAddress(NewSymbol.st_value) : nullptr; auto Output = @@ -4167,23 +4185,14 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile *File) { // .text (t15274167). Remove then from the symtab. if (NewSymbol.getType() == ELF::STT_NOTYPE && NewSymbol.getBinding() == ELF::STB_LOCAL && - NewSymbol.st_size == 0) { - auto ExpectedSec = File->getELFFile()->getSection(Symbol.st_shndx); - if (ExpectedSec) { - auto Section = *ExpectedSec; - if (Section->sh_type == ELF::SHT_PROGBITS && - Section->sh_flags & ELF::SHF_ALLOC && - Section->sh_flags & ELF::SHF_EXECINSTR) { - // This will cause the symbol to not be emitted if we are - // creating a new symtab from scratch instead of patching one. - if (!PatchExisting) - continue; - // If patching an existing symtab, patch this value to zero. 
- NewSymbol.st_value = 0; - } - } else { - consumeError(ExpectedSec.takeError()); - } + NewSymbol.st_size == 0 && + IsCodeSym) { + // This will cause the symbol to not be emitted if we are + // creating a new symtab from scratch instead of patching one. + if (!PatchExisting) + continue; + // If patching an existing symtab, patch this value to zero. + NewSymbol.st_value = 0; } } From e175919fc4b374a3024d52cedc9b0b32ae749b3a Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 26 Apr 2019 15:30:12 -0700 Subject: [PATCH 540/904] [BOLT] Strip debug sections by default Summary: We used to ignore debug sections by default, but we kept them in the binary which led to invalid debug information in the output. It's better to strip debug info and print a warning to the user. Note: we are not updating debug info by default due to high memory requirements for large applications. (cherry picked from commit a8185a376c68f9a3143263f12ef1811f6a15df91) --- bolt/src/ExecutableFileMemoryManager.cpp | 22 +++++++++---------- bolt/src/RewriteInstance.cpp | 27 ++++++++++++++++++++++-- bolt/src/RewriteInstance.h | 9 +++++++- 3 files changed, 43 insertions(+), 15 deletions(-) diff --git a/bolt/src/ExecutableFileMemoryManager.cpp b/bolt/src/ExecutableFileMemoryManager.cpp index e0aea5d8b96e..9744821d848d 100644 --- a/bolt/src/ExecutableFileMemoryManager.cpp +++ b/bolt/src/ExecutableFileMemoryManager.cpp @@ -29,18 +29,16 @@ uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size, StringRef SectionName, bool IsCode, bool IsReadOnly) { - // Register as note section (non-allocatable) if we recognize it as so - for (auto &OverwriteName : RewriteInstance::SectionsToOverwrite) { - if (SectionName == OverwriteName) { - uint8_t *DataCopy = new uint8_t[Size]; - auto &Section = BC.registerOrUpdateNoteSection(SectionName, - DataCopy, - Size, - Alignment); - Section.setSectionID(SectionID); - assert(!Section.isAllocatable() && "note sections cannot be allocatable"); - return DataCopy; - 
} + // Register a debug section as a note section. + if (RewriteInstance::isDebugSection(SectionName)) { + uint8_t *DataCopy = new uint8_t[Size]; + auto &Section = BC.registerOrUpdateNoteSection(SectionName, + DataCopy, + Size, + Alignment); + Section.setSectionID(SectionID); + assert(!Section.isAllocatable() && "note sections cannot be allocatable"); + return DataCopy; } uint8_t *Ret; diff --git a/bolt/src/RewriteInstance.cpp b/bolt/src/RewriteInstance.cpp index 57dfea171ac4..76735023a29f 100644 --- a/bolt/src/RewriteInstance.cpp +++ b/bolt/src/RewriteInstance.cpp @@ -512,6 +512,7 @@ MCPlusBuilder *createMCPlusBuilder(const Triple::ArchType Arch, } constexpr const char *RewriteInstance::SectionsToOverwrite[]; +constexpr const char *RewriteInstance::DebugSectionsToOverwrite[]; const std::string RewriteInstance::OrgSecPrefix = ".bolt.org"; @@ -1720,6 +1721,7 @@ void RewriteInstance::readSpecialSections() { TimerGroupName, TimerGroupDesc, opts::TimeRewrite); bool HasTextRelocations = false; + bool HasDebugInfo = false; // Process special sections. for (const auto &Section : InputFile->sections()) { @@ -1733,9 +1735,16 @@ void RewriteInstance::readSpecialSections() { << " @ 0x" << Twine::utohexstr(Section.getAddress()) << ":0x" << Twine::utohexstr(Section.getAddress() + Section.getSize()) << "\n"); + if (isDebugSection(SectionName)) + HasDebugInfo = true; } } + if (HasDebugInfo && !opts::UpdateDebugSections) { + errs() << "BOLT-WARNING: debug info will be stripped from the binary. 
" + "Use -update-debug-sections to keep it.\n"; + } + HasTextRelocations = (bool)BC->getUniqueSectionByName(".rela.text"); LSDASection = BC->getUniqueSectionByName(".gcc_except_table"); EHFrameSection = BC->getUniqueSectionByName(".eh_frame"); @@ -3763,8 +3772,7 @@ std::string RewriteInstance::getOutputSectionName(const ELFObjType *Obj, StringRef SectionName = cantFail(Obj->getSectionName(&Section), "cannot get section name"); - if ((Section.sh_flags & ELF::SHF_ALLOC) && - willOverwriteSection(SectionName)) + if ((Section.sh_flags & ELF::SHF_ALLOC) && willOverwriteSection(SectionName)) return OrgSecPrefix + SectionName.str(); return SectionName; @@ -3868,6 +3876,10 @@ std::vector RewriteInstance::getOutputSections( StringRef SectionName = cantFail(Obj->getSectionName(&Section), "cannot get section name"); + // Strip debug sections if not updating them. + if (isDebugSection(SectionName) && !opts::UpdateDebugSections) + continue; + auto BSec = BC->getUniqueSectionByName(SectionName); assert(BSec && "missing section info for non-allocatable section"); @@ -4789,7 +4801,18 @@ bool RewriteInstance::willOverwriteSection(StringRef SectionName) { if (SectionName == OverwriteName) return true; } + for (auto &OverwriteName : DebugSectionsToOverwrite) { + if (SectionName == OverwriteName) + return true; + } auto Section = BC->getUniqueSectionByName(SectionName); return Section && Section->isAllocatable() && Section->isFinalized(); } + +bool RewriteInstance::isDebugSection(StringRef SectionName) { + if (SectionName.startswith(".debug_") || SectionName == ".gdb_index") + return true; + + return false; +} diff --git a/bolt/src/RewriteInstance.h b/bolt/src/RewriteInstance.h index e6bd0acfee87..78aeba88c2cf 100644 --- a/bolt/src/RewriteInstance.h +++ b/bolt/src/RewriteInstance.h @@ -259,11 +259,15 @@ class RewriteInstance { bool shouldDisassemble(BinaryFunction &BF) const; public: - /// When updating debug info, these are the sections we overwrite. 
+ /// Standard ELF sections we overwrite. static constexpr const char *SectionsToOverwrite[] = { ".shstrtab", ".symtab", ".strtab", + }; + + /// Debug section to we overwrite while updating the debug info. + static constexpr const char *DebugSectionsToOverwrite[] = { ".debug_aranges", ".debug_line", ".debug_loc", @@ -271,6 +275,9 @@ class RewriteInstance { ".gdb_index", }; + /// Return true if the section holds debug information. + static bool isDebugSection(StringRef SectionName); + using SectionPatchersType = std::map>; From 4a480b22575b2ac246279a0a4b613a4895090487 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 29 Apr 2019 12:51:10 -0700 Subject: [PATCH 541/904] [BOLT][NFC] Move DynoStats out of BinaryFunction Summary: Move DynoStats into separate source files. (cherry picked from commit 1580877391ef3e6c2f43e50baec260d45c84194f) --- bolt/src/BinaryFunction.cpp | 212 +------------------------- bolt/src/BinaryFunction.h | 151 ------------------ bolt/src/CMakeLists.txt | 1 + bolt/src/DynoStats.cpp | 253 +++++++++++++++++++++++++++++++ bolt/src/DynoStats.h | 178 ++++++++++++++++++++++ bolt/src/Passes/BinaryPasses.cpp | 4 +- bolt/src/Passes/BinaryPasses.h | 1 + 7 files changed, 437 insertions(+), 363 deletions(-) create mode 100644 bolt/src/DynoStats.cpp create mode 100644 bolt/src/DynoStats.h diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index 9eb18e9d8bdc..ae59104e8334 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -13,6 +13,7 @@ #include "BinaryBasicBlock.h" #include "BinaryFunction.h" #include "DataReader.h" +#include "DynoStats.h" #include "MCPlusBuilder.h" #include "llvm/ADT/edit_distance.h" #include "llvm/ADT/StringRef.h" @@ -90,14 +91,6 @@ DotToolTipCode("dot-tooltip-code", cl::Hidden, cl::cat(BoltCategory)); -static cl::opt -DynoStatsScale("dyno-stats-scale", - cl::desc("scale to be applied while reporting dyno stats"), - cl::Optional, - cl::init(1), - cl::Hidden, - cl::cat(BoltCategory)); - 
cl::opt JumpTables("jump-tables", cl::desc("jump tables support (default=basic)"), @@ -193,7 +186,6 @@ bool shouldPrint(const BinaryFunction &Function) { namespace llvm { namespace bolt { -constexpr const char *DynoStats::Desc[]; constexpr unsigned BinaryFunction::MinAlign; const char BinaryFunction::TimerGroupName[] = "buildfuncs"; const char BinaryFunction::TimerGroupDesc[] = "Build Binary Functions"; @@ -245,31 +237,6 @@ SMLoc findDebugLineInformationForInstructionAt( } // namespace -bool DynoStats::operator<(const DynoStats &Other) const { - return std::lexicographical_compare( - &Stats[FIRST_DYNO_STAT], &Stats[LAST_DYNO_STAT], - &Other.Stats[FIRST_DYNO_STAT], &Other.Stats[LAST_DYNO_STAT] - ); -} - -bool DynoStats::operator==(const DynoStats &Other) const { - return std::equal( - &Stats[FIRST_DYNO_STAT], &Stats[LAST_DYNO_STAT], - &Other.Stats[FIRST_DYNO_STAT] - ); -} - -bool DynoStats::lessThan(const DynoStats &Other, - ArrayRef Keys) const { - return std::lexicographical_compare( - Keys.begin(), Keys.end(), - Keys.begin(), Keys.end(), - [this,&Other](const Category A, const Category) { - return Stats[A] < Other.Stats[A]; - } - ); -} - uint64_t BinaryFunction::Count = 0; const std::string * @@ -493,7 +460,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation, if (opts::PrintDynoStats && !BasicBlocksLayout.empty()) { OS << '\n'; - DynoStats dynoStats = getDynoStats(); + DynoStats dynoStats = getDynoStats(*this); OS << dynoStats; } @@ -4284,145 +4251,6 @@ void BinaryFunction::printLoopInfo(raw_ostream &OS) const { OS << "Maximum nested loop depth: " << BLI->MaximumDepth << "\n\n"; } -DynoStats BinaryFunction::getDynoStats() const { - DynoStats Stats(/*PrintAArch64Stats*/ BC.isAArch64()); - - // Return empty-stats about the function we don't completely understand. - if (!isSimple() || !hasValidProfile()) - return Stats; - - // If the function was folded in non-relocation mode we keep its profile - // for optimization. 
However, it should be excluded from the dyno stats. - if (isFolded()) - return Stats; - - // Update enumeration of basic blocks for correct detection of branch' - // direction. - updateLayoutIndices(); - - for (const auto &BB : layout()) { - // The basic block execution count equals to the sum of incoming branch - // frequencies. This may deviate from the sum of outgoing branches of the - // basic block especially since the block may contain a function that - // does not return or a function that throws an exception. - const uint64_t BBExecutionCount = BB->getKnownExecutionCount(); - - // Ignore empty blocks and blocks that were not executed. - if (BB->getNumNonPseudos() == 0 || BBExecutionCount == 0) - continue; - - // Count AArch64 linker-inserted veneers - if(isAArch64Veneer()) - Stats[DynoStats::VENEER_CALLS_AARCH64] += getKnownExecutionCount(); - - // Count the number of calls by iterating through all instructions. - for (const auto &Instr : *BB) { - if (BC.MIB->isStore(Instr)) { - Stats[DynoStats::STORES] += BBExecutionCount; - } - if (BC.MIB->isLoad(Instr)) { - Stats[DynoStats::LOADS] += BBExecutionCount; - } - - if (!BC.MIB->isCall(Instr)) - continue; - - uint64_t CallFreq = BBExecutionCount; - if (BC.MIB->getConditionalTailCall(Instr)) { - CallFreq = - BC.MIB->getAnnotationWithDefault(Instr, "CTCTakenCount"); - } - Stats[DynoStats::FUNCTION_CALLS] += CallFreq; - if (BC.MIB->isIndirectCall(Instr)) { - Stats[DynoStats::INDIRECT_CALLS] += CallFreq; - } else if (const auto *CallSymbol = BC.MIB->getTargetSymbol(Instr)) { - const auto *BF = BC.getFunctionForSymbol(CallSymbol); - if (BF && BF->isPLTFunction()) { - Stats[DynoStats::PLT_CALLS] += CallFreq; - - // We don't process PLT functions and hence have to adjust relevant - // dynostats here for: - // - // jmp *GOT_ENTRY(%rip) - // - // NOTE: this is arch-specific. 
- Stats[DynoStats::FUNCTION_CALLS] += CallFreq; - Stats[DynoStats::INDIRECT_CALLS] += CallFreq; - Stats[DynoStats::LOADS] += CallFreq; - Stats[DynoStats::INSTRUCTIONS] += CallFreq; - } - } - } - - Stats[DynoStats::INSTRUCTIONS] += BB->getNumNonPseudos() * BBExecutionCount; - - // Jump tables. - const auto *LastInstr = BB->getLastNonPseudoInstr(); - if (BC.MIB->getJumpTable(*LastInstr)) { - Stats[DynoStats::JUMP_TABLE_BRANCHES] += BBExecutionCount; - DEBUG( - static uint64_t MostFrequentJT; - if (BBExecutionCount > MostFrequentJT) { - MostFrequentJT = BBExecutionCount; - dbgs() << "BOLT-INFO: most frequently executed jump table is in " - << "function " << *this << " in basic block " << BB->getName() - << " executed totally " << BBExecutionCount << " times.\n"; - } - ); - continue; - } - - // Update stats for branches. - const MCSymbol *TBB = nullptr; - const MCSymbol *FBB = nullptr; - MCInst *CondBranch = nullptr; - MCInst *UncondBranch = nullptr; - if (!BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) { - continue; - } - - if (!CondBranch && !UncondBranch) { - continue; - } - - // Simple unconditional branch. - if (!CondBranch) { - Stats[DynoStats::UNCOND_BRANCHES] += BBExecutionCount; - continue; - } - - // CTCs - if (BC.MIB->getConditionalTailCall(*CondBranch)) { - if (BB->branch_info_begin() != BB->branch_info_end()) - Stats[DynoStats::UNCOND_BRANCHES] += BB->branch_info_begin()->Count; - continue; - } - - // Conditional branch that could be followed by an unconditional branch. 
- auto TakenCount = BB->getTakenBranchInfo().Count; - if (TakenCount == COUNT_NO_PROFILE) - TakenCount = 0; - - auto NonTakenCount = BB->getFallthroughBranchInfo().Count; - if (NonTakenCount == COUNT_NO_PROFILE) - NonTakenCount = 0; - - if (isForwardBranch(BB, BB->getConditionalSuccessor(true))) { - Stats[DynoStats::FORWARD_COND_BRANCHES] += BBExecutionCount; - Stats[DynoStats::FORWARD_COND_BRANCHES_TAKEN] += TakenCount; - } else { - Stats[DynoStats::BACKWARD_COND_BRANCHES] += BBExecutionCount; - Stats[DynoStats::BACKWARD_COND_BRANCHES_TAKEN] += TakenCount; - } - - if (UncondBranch) { - Stats[DynoStats::UNCOND_BRANCHES] += NonTakenCount; - } - } - - return Stats; -} - bool BinaryFunction::isAArch64Veneer() const { if (BasicBlocks.size() != 1) return false; @@ -4439,41 +4267,5 @@ bool BinaryFunction::isAArch64Veneer() const { return true; } -void DynoStats::print(raw_ostream &OS, const DynoStats *Other) const { - auto printStatWithDelta = [&](const std::string &Name, uint64_t Stat, - uint64_t OtherStat) { - OS << format("%'20lld : ", Stat * opts::DynoStatsScale) << Name; - if (Other) { - if (Stat != OtherStat) { - OtherStat = std::max(OtherStat, uint64_t(1)); // to prevent divide by 0 - OS << format(" (%+.1f%%)", - ( (float) Stat - (float) OtherStat ) * 100.0 / - (float) (OtherStat) ); - } else { - OS << " (=)"; - } - } - OS << '\n'; - }; - - for (auto Stat = DynoStats::FIRST_DYNO_STAT + 1; - Stat < DynoStats::LAST_DYNO_STAT; - ++Stat) { - - if (!PrintAArch64Stats && Stat == DynoStats::VENEER_CALLS_AARCH64) - continue; - - printStatWithDelta(Desc[Stat], Stats[Stat], Other ? 
(*Other)[Stat] : 0); - } -} - -void DynoStats::operator+=(const DynoStats &Other) { - for (auto Stat = DynoStats::FIRST_DYNO_STAT + 1; - Stat < DynoStats::LAST_DYNO_STAT; - ++Stat) { - Stats[Stat] += Other[Stat]; - } -} - } // namespace bolt } // namespace llvm diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index 52127a180dd1..3bca1456052a 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -54,108 +54,6 @@ namespace bolt { using DWARFUnitLineTable = std::pair; -/// Class encapsulating runtime statistics about an execution unit. -class DynoStats { - -#define DYNO_STATS\ - D(FIRST_DYNO_STAT, "", Fn)\ - D(FORWARD_COND_BRANCHES, "executed forward branches", Fn)\ - D(FORWARD_COND_BRANCHES_TAKEN, "taken forward branches", Fn)\ - D(BACKWARD_COND_BRANCHES, "executed backward branches", Fn)\ - D(BACKWARD_COND_BRANCHES_TAKEN, "taken backward branches", Fn)\ - D(UNCOND_BRANCHES, "executed unconditional branches", Fn)\ - D(FUNCTION_CALLS, "all function calls", Fn)\ - D(INDIRECT_CALLS, "indirect calls", Fn)\ - D(PLT_CALLS, "PLT calls", Fn)\ - D(INSTRUCTIONS, "executed instructions", Fn)\ - D(LOADS, "executed load instructions", Fn)\ - D(STORES, "executed store instructions", Fn)\ - D(JUMP_TABLE_BRANCHES, "taken jump table branches", Fn)\ - D(ALL_BRANCHES, "total branches",\ - Fadd(ALL_CONDITIONAL, UNCOND_BRANCHES))\ - D(ALL_TAKEN, "taken branches",\ - Fadd(TAKEN_CONDITIONAL, UNCOND_BRANCHES))\ - D(NONTAKEN_CONDITIONAL, "non-taken conditional branches",\ - Fsub(ALL_CONDITIONAL, TAKEN_CONDITIONAL))\ - D(TAKEN_CONDITIONAL, "taken conditional branches",\ - Fadd(FORWARD_COND_BRANCHES_TAKEN, BACKWARD_COND_BRANCHES_TAKEN))\ - D(ALL_CONDITIONAL, "all conditional branches",\ - Fadd(FORWARD_COND_BRANCHES, BACKWARD_COND_BRANCHES))\ - D(VENEER_CALLS_AARCH64, "linker-inserted veneer calls", Fn)\ - D(LAST_DYNO_STAT, "", 0) - -public: -#define D(name, ...) 
name, - enum Category : uint8_t { DYNO_STATS }; -#undef D - - -private: - uint64_t Stats[LAST_DYNO_STAT+1]; - bool PrintAArch64Stats; - -#define D(name, desc, ...) desc, - static constexpr const char *Desc[] = { DYNO_STATS }; -#undef D - -public: - DynoStats(bool PrintAArch64Stats ) { - this->PrintAArch64Stats = PrintAArch64Stats; - for (auto Stat = FIRST_DYNO_STAT + 0; Stat < LAST_DYNO_STAT; ++Stat) - Stats[Stat] = 0; - } - - uint64_t &operator[](size_t I) { - assert(I > FIRST_DYNO_STAT && I < LAST_DYNO_STAT && - "index out of bounds"); - return Stats[I]; - } - - uint64_t operator[](size_t I) const { - switch (I) { -#define D(name, desc, func) \ - case name: \ - return func; -#define Fn Stats[I] -#define Fadd(a, b) operator[](a) + operator[](b) -#define Fsub(a, b) operator[](a) - operator[](b) -#define F(a) operator[](a) -#define Radd(a, b) (a + b) -#define Rsub(a, b) (a - b) - DYNO_STATS -#undef Rsub -#undef Radd -#undef F -#undef Fsub -#undef Fadd -#undef Fn -#undef D - default: - llvm_unreachable("index out of bounds"); - } - return 0; - } - - void print(raw_ostream &OS, const DynoStats *Other = nullptr) const; - - void operator+=(const DynoStats &Other); - bool operator<(const DynoStats &Other) const; - bool operator==(const DynoStats &Other) const; - bool operator!=(const DynoStats &Other) const { return !operator==(Other); } - bool lessThan(const DynoStats &Other, ArrayRef Keys) const; - - static const char* Description(const Category C) { - return Desc[C]; - } -}; - -inline raw_ostream &operator<<(raw_ostream &OS, const DynoStats &Stats) { - Stats.print(OS, nullptr); - return OS; -} - -DynoStats operator+(const DynoStats &A, const DynoStats &B); - /// Types of macro-fusion alignment corrections. enum MacroFusionType { MFT_NONE, @@ -927,13 +825,6 @@ class BinaryFunction { /// Attempt to validate CFG invariants. bool validateCFG() const; - /// Return dynostats for the function. 
- /// - /// The function relies on branch instructions being in-sync with CFG for - /// branch instructions stats. Thus it is better to call it after - /// fixBranches(). - DynoStats getDynoStats() const; - BinaryBasicBlock *getBasicBlockForLabel(const MCSymbol *Label) { auto I = LabelToBB.find(Label); return I == LabelToBB.end() ? nullptr : I->second; @@ -2377,48 +2268,6 @@ class BinaryFunction { const FragmentInfo &cold() const { return ColdFragment; } }; -/// Return program-wide dynostats. -template -inline DynoStats getDynoStats(const FuncsType &Funcs) { - bool IsAArch64 = Funcs.begin()->second.getBinaryContext().isAArch64(); - DynoStats dynoStats(IsAArch64); - for (auto &BFI : Funcs) { - auto &BF = BFI.second; - if (BF.isSimple()) { - dynoStats += BF.getDynoStats(); - } - } - return dynoStats; -} - -/// Call a function with optional before and after dynostats printing. -template -inline void -callWithDynoStats(FnType &&Func, - const FuncsType &Funcs, - StringRef Phase, - const bool Flag) { - bool IsAArch64 = Funcs.begin()->second.getBinaryContext().isAArch64(); - DynoStats DynoStatsBefore(IsAArch64); - if (Flag) { - DynoStatsBefore = getDynoStats(Funcs); - } - - Func(); - - if (Flag) { - const auto DynoStatsAfter = getDynoStats(Funcs); - const auto Changed = (DynoStatsAfter != DynoStatsBefore); - outs() << "BOLT-INFO: program-wide dynostats after running " - << Phase << (Changed ? 
"" : " (no change)") << ":\n\n" - << DynoStatsBefore << '\n'; - if (Changed) { - DynoStatsAfter.print(outs(), &DynoStatsBefore); - } - outs() << '\n'; - } -} - inline raw_ostream &operator<<(raw_ostream &OS, const BinaryFunction &Function) { OS << Function.getPrintName(); diff --git a/bolt/src/CMakeLists.txt b/bolt/src/CMakeLists.txt index 28b2392d2338..01959cf71f18 100644 --- a/bolt/src/CMakeLists.txt +++ b/bolt/src/CMakeLists.txt @@ -76,6 +76,7 @@ add_llvm_tool(llvm-bolt DataReader.cpp DebugData.cpp DWARFRewriter.cpp + DynoStats.cpp Exceptions.cpp ExecutableFileMemoryManager.cpp Heatmap.cpp diff --git a/bolt/src/DynoStats.cpp b/bolt/src/DynoStats.cpp new file mode 100644 index 000000000000..133348098c2d --- /dev/null +++ b/bolt/src/DynoStats.cpp @@ -0,0 +1,253 @@ +//===--- DynoStats.cpp ----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + + +#include "DynoStats.h" +#include "BinaryBasicBlock.h" +#include "BinaryFunction.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include + +#undef DEBUG_TYPE +#define DEBUG_TYPE "bolt" + +using namespace llvm; +using namespace bolt; + +namespace opts { + +extern cl::OptionCategory BoltCategory; + +static cl::opt +DynoStatsScale("dyno-stats-scale", + cl::desc("scale to be applied while reporting dyno stats"), + cl::Optional, + cl::init(1), + cl::Hidden, + cl::cat(BoltCategory)); + +} // namespace opts + +namespace llvm { +namespace bolt { + +constexpr const char *DynoStats::Desc[]; + +bool DynoStats::operator<(const DynoStats &Other) const { + return std::lexicographical_compare( + &Stats[FIRST_DYNO_STAT], &Stats[LAST_DYNO_STAT], + &Other.Stats[FIRST_DYNO_STAT], &Other.Stats[LAST_DYNO_STAT] + ); +} + +bool DynoStats::operator==(const DynoStats &Other) const { + return std::equal( + &Stats[FIRST_DYNO_STAT], &Stats[LAST_DYNO_STAT], + &Other.Stats[FIRST_DYNO_STAT] + ); +} + +bool DynoStats::lessThan(const DynoStats &Other, + ArrayRef Keys) const { + return std::lexicographical_compare( + Keys.begin(), Keys.end(), + Keys.begin(), Keys.end(), + [this,&Other](const Category A, const Category) { + return Stats[A] < Other.Stats[A]; + } + ); +} + +void DynoStats::print(raw_ostream &OS, const DynoStats *Other) const { + auto printStatWithDelta = [&](const std::string &Name, uint64_t Stat, + uint64_t OtherStat) { + OS << format("%'20lld : ", Stat * opts::DynoStatsScale) << Name; + if (Other) { + if (Stat != OtherStat) { + OtherStat = std::max(OtherStat, uint64_t(1)); // to prevent divide by 0 + OS << format(" (%+.1f%%)", + ( (float) Stat - (float) OtherStat 
) * 100.0 / + (float) (OtherStat) ); + } else { + OS << " (=)"; + } + } + OS << '\n'; + }; + + for (auto Stat = DynoStats::FIRST_DYNO_STAT + 1; + Stat < DynoStats::LAST_DYNO_STAT; + ++Stat) { + + if (!PrintAArch64Stats && Stat == DynoStats::VENEER_CALLS_AARCH64) + continue; + + printStatWithDelta(Desc[Stat], Stats[Stat], Other ? (*Other)[Stat] : 0); + } +} + +void DynoStats::operator+=(const DynoStats &Other) { + for (auto Stat = DynoStats::FIRST_DYNO_STAT + 1; + Stat < DynoStats::LAST_DYNO_STAT; + ++Stat) { + Stats[Stat] += Other[Stat]; + } +} + +DynoStats getDynoStats(const BinaryFunction &BF) { + auto &BC = BF.getBinaryContext(); + + DynoStats Stats(/*PrintAArch64Stats*/ BC.isAArch64()); + + // Return empty-stats about the function we don't completely understand. + if (!BF.isSimple() || !BF.hasValidProfile()) + return Stats; + + // If the function was folded in non-relocation mode we keep its profile + // for optimization. However, it should be excluded from the dyno stats. + if (BF.isFolded()) + return Stats; + + // Update enumeration of basic blocks for correct detection of branch' + // direction. + BF.updateLayoutIndices(); + + for (const auto &BB : BF.layout()) { + // The basic block execution count equals to the sum of incoming branch + // frequencies. This may deviate from the sum of outgoing branches of the + // basic block especially since the block may contain a function that + // does not return or a function that throws an exception. + const uint64_t BBExecutionCount = BB->getKnownExecutionCount(); + + // Ignore empty blocks and blocks that were not executed. + if (BB->getNumNonPseudos() == 0 || BBExecutionCount == 0) + continue; + + // Count AArch64 linker-inserted veneers + if(BF.isAArch64Veneer()) + Stats[DynoStats::VENEER_CALLS_AARCH64] += BF.getKnownExecutionCount(); + + // Count the number of calls by iterating through all instructions. 
+ for (const auto &Instr : *BB) { + if (BC.MIB->isStore(Instr)) { + Stats[DynoStats::STORES] += BBExecutionCount; + } + if (BC.MIB->isLoad(Instr)) { + Stats[DynoStats::LOADS] += BBExecutionCount; + } + + if (!BC.MIB->isCall(Instr)) + continue; + + uint64_t CallFreq = BBExecutionCount; + if (BC.MIB->getConditionalTailCall(Instr)) { + CallFreq = + BC.MIB->getAnnotationWithDefault(Instr, "CTCTakenCount"); + } + Stats[DynoStats::FUNCTION_CALLS] += CallFreq; + if (BC.MIB->isIndirectCall(Instr)) { + Stats[DynoStats::INDIRECT_CALLS] += CallFreq; + } else if (const auto *CallSymbol = BC.MIB->getTargetSymbol(Instr)) { + const auto *BF = BC.getFunctionForSymbol(CallSymbol); + if (BF && BF->isPLTFunction()) { + Stats[DynoStats::PLT_CALLS] += CallFreq; + + // We don't process PLT functions and hence have to adjust relevant + // dynostats here for: + // + // jmp *GOT_ENTRY(%rip) + // + // NOTE: this is arch-specific. + Stats[DynoStats::FUNCTION_CALLS] += CallFreq; + Stats[DynoStats::INDIRECT_CALLS] += CallFreq; + Stats[DynoStats::LOADS] += CallFreq; + Stats[DynoStats::INSTRUCTIONS] += CallFreq; + } + } + } + + Stats[DynoStats::INSTRUCTIONS] += BB->getNumNonPseudos() * BBExecutionCount; + + // Jump tables. + const auto *LastInstr = BB->getLastNonPseudoInstr(); + if (BC.MIB->getJumpTable(*LastInstr)) { + Stats[DynoStats::JUMP_TABLE_BRANCHES] += BBExecutionCount; + DEBUG( + static uint64_t MostFrequentJT; + if (BBExecutionCount > MostFrequentJT) { + MostFrequentJT = BBExecutionCount; + dbgs() << "BOLT-INFO: most frequently executed jump table is in " + << "function " << BF << " in basic block " << BB->getName() + << " executed totally " << BBExecutionCount << " times.\n"; + } + ); + continue; + } + + // Update stats for branches. 
+ const MCSymbol *TBB = nullptr; + const MCSymbol *FBB = nullptr; + MCInst *CondBranch = nullptr; + MCInst *UncondBranch = nullptr; + if (!BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) { + continue; + } + + if (!CondBranch && !UncondBranch) { + continue; + } + + // Simple unconditional branch. + if (!CondBranch) { + Stats[DynoStats::UNCOND_BRANCHES] += BBExecutionCount; + continue; + } + + // CTCs + if (BC.MIB->getConditionalTailCall(*CondBranch)) { + if (BB->branch_info_begin() != BB->branch_info_end()) + Stats[DynoStats::UNCOND_BRANCHES] += BB->branch_info_begin()->Count; + continue; + } + + // Conditional branch that could be followed by an unconditional branch. + auto TakenCount = BB->getTakenBranchInfo().Count; + if (TakenCount == BinaryBasicBlock::COUNT_NO_PROFILE) + TakenCount = 0; + + auto NonTakenCount = BB->getFallthroughBranchInfo().Count; + if (NonTakenCount == BinaryBasicBlock::COUNT_NO_PROFILE) + NonTakenCount = 0; + + if (BF.isForwardBranch(BB, BB->getConditionalSuccessor(true))) { + Stats[DynoStats::FORWARD_COND_BRANCHES] += BBExecutionCount; + Stats[DynoStats::FORWARD_COND_BRANCHES_TAKEN] += TakenCount; + } else { + Stats[DynoStats::BACKWARD_COND_BRANCHES] += BBExecutionCount; + Stats[DynoStats::BACKWARD_COND_BRANCHES_TAKEN] += TakenCount; + } + + if (UncondBranch) { + Stats[DynoStats::UNCOND_BRANCHES] += NonTakenCount; + } + } + + return Stats; +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/src/DynoStats.h b/bolt/src/DynoStats.h new file mode 100644 index 000000000000..1e88b2fa4266 --- /dev/null +++ b/bolt/src/DynoStats.h @@ -0,0 +1,178 @@ +//===--- DynoStats.h ------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_DYNO_STATS_H +#define LLVM_TOOLS_LLVM_BOLT_DYNO_STATS_H + +#include "BinaryFunction.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +namespace bolt { + +/// Class encapsulating runtime statistics about an execution unit. +class DynoStats { + +#define DYNO_STATS\ + D(FIRST_DYNO_STAT, "", Fn)\ + D(FORWARD_COND_BRANCHES, "executed forward branches", Fn)\ + D(FORWARD_COND_BRANCHES_TAKEN, "taken forward branches", Fn)\ + D(BACKWARD_COND_BRANCHES, "executed backward branches", Fn)\ + D(BACKWARD_COND_BRANCHES_TAKEN, "taken backward branches", Fn)\ + D(UNCOND_BRANCHES, "executed unconditional branches", Fn)\ + D(FUNCTION_CALLS, "all function calls", Fn)\ + D(INDIRECT_CALLS, "indirect calls", Fn)\ + D(PLT_CALLS, "PLT calls", Fn)\ + D(INSTRUCTIONS, "executed instructions", Fn)\ + D(LOADS, "executed load instructions", Fn)\ + D(STORES, "executed store instructions", Fn)\ + D(JUMP_TABLE_BRANCHES, "taken jump table branches", Fn)\ + D(ALL_BRANCHES, "total branches",\ + Fadd(ALL_CONDITIONAL, UNCOND_BRANCHES))\ + D(ALL_TAKEN, "taken branches",\ + Fadd(TAKEN_CONDITIONAL, UNCOND_BRANCHES))\ + D(NONTAKEN_CONDITIONAL, "non-taken conditional branches",\ + Fsub(ALL_CONDITIONAL, TAKEN_CONDITIONAL))\ + D(TAKEN_CONDITIONAL, "taken conditional branches",\ + Fadd(FORWARD_COND_BRANCHES_TAKEN, BACKWARD_COND_BRANCHES_TAKEN))\ + D(ALL_CONDITIONAL, "all conditional branches",\ + Fadd(FORWARD_COND_BRANCHES, BACKWARD_COND_BRANCHES))\ + D(VENEER_CALLS_AARCH64, "linker-inserted veneer calls", Fn)\ + D(LAST_DYNO_STAT, "", 0) + +public: +#define D(name, ...) name, + enum Category : uint8_t { DYNO_STATS }; +#undef D + + +private: + uint64_t Stats[LAST_DYNO_STAT+1]; + bool PrintAArch64Stats; + +#define D(name, desc, ...) 
desc, + static constexpr const char *Desc[] = { DYNO_STATS }; +#undef D + +public: + DynoStats(bool PrintAArch64Stats) { + this->PrintAArch64Stats = PrintAArch64Stats; + for (auto Stat = FIRST_DYNO_STAT + 0; Stat < LAST_DYNO_STAT; ++Stat) + Stats[Stat] = 0; + } + + uint64_t &operator[](size_t I) { + assert(I > FIRST_DYNO_STAT && I < LAST_DYNO_STAT && + "index out of bounds"); + return Stats[I]; + } + + uint64_t operator[](size_t I) const { + switch (I) { +#define D(name, desc, func) \ + case name: \ + return func; +#define Fn Stats[I] +#define Fadd(a, b) operator[](a) + operator[](b) +#define Fsub(a, b) operator[](a) - operator[](b) +#define F(a) operator[](a) +#define Radd(a, b) (a + b) +#define Rsub(a, b) (a - b) + DYNO_STATS +#undef Rsub +#undef Radd +#undef F +#undef Fsub +#undef Fadd +#undef Fn +#undef D + default: + llvm_unreachable("index out of bounds"); + } + return 0; + } + + void print(raw_ostream &OS, const DynoStats *Other = nullptr) const; + + void operator+=(const DynoStats &Other); + bool operator<(const DynoStats &Other) const; + bool operator==(const DynoStats &Other) const; + bool operator!=(const DynoStats &Other) const { return !operator==(Other); } + bool lessThan(const DynoStats &Other, ArrayRef Keys) const; + + static const char* Description(const Category C) { + return Desc[C]; + } +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const DynoStats &Stats) { + Stats.print(OS, nullptr); + return OS; +} + +DynoStats operator+(const DynoStats &A, const DynoStats &B); + +/// Return dynostats for the function. +/// +/// The function relies on branch instructions being in-sync with CFG for +/// branch instructions stats. Thus it is better to call it after +/// fixBranches(). +DynoStats getDynoStats(const BinaryFunction &BF); + +/// Return program-wide dynostats. 
+template +inline DynoStats getDynoStats(const FuncsType &Funcs) { + bool IsAArch64 = Funcs.begin()->second.getBinaryContext().isAArch64(); + DynoStats dynoStats(IsAArch64); + for (auto &BFI : Funcs) { + auto &BF = BFI.second; + if (BF.isSimple()) { + dynoStats += getDynoStats(BF); + } + } + return dynoStats; +} + +/// Call a function with optional before and after dynostats printing. +template +inline void +callWithDynoStats(FnType &&Func, + const FuncsType &Funcs, + StringRef Phase, + const bool Flag) { + bool IsAArch64 = Funcs.begin()->second.getBinaryContext().isAArch64(); + DynoStats DynoStatsBefore(IsAArch64); + if (Flag) { + DynoStatsBefore = getDynoStats(Funcs); + } + + Func(); + + if (Flag) { + const auto DynoStatsAfter = getDynoStats(Funcs); + const auto Changed = (DynoStatsAfter != DynoStatsBefore); + outs() << "BOLT-INFO: program-wide dynostats after running " + << Phase << (Changed ? "" : " (no change)") << ":\n\n" + << DynoStatsBefore << '\n'; + if (Changed) { + DynoStatsAfter.print(outs(), &DynoStatsBefore); + } + outs() << '\n'; + } +} + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/src/Passes/BinaryPasses.cpp b/bolt/src/Passes/BinaryPasses.cpp index e284a164475b..e16a6349247c 100644 --- a/bolt/src/Passes/BinaryPasses.cpp +++ b/bolt/src/Passes/BinaryPasses.cpp @@ -1331,7 +1331,7 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC) { const auto &BF = BFI.second; if (shouldOptimize(BF) && BF.hasValidProfile()) { Functions.push_back(&BF); - Stats.emplace(&BF, BF.getDynoStats()); + Stats.emplace(&BF, getDynoStats(BF)); } } @@ -1383,7 +1383,7 @@ PrintProgramStats::runOnFunctions(BinaryContext &BC) { outs() << " are:\n"; auto SFI = Functions.begin(); for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) { - const auto Stats = (*SFI)->getDynoStats(); + const auto Stats = getDynoStats(**SFI); outs() << " " << **SFI; if (!SortAll) { outs() << " ("; diff --git a/bolt/src/Passes/BinaryPasses.h 
b/bolt/src/Passes/BinaryPasses.h index 2ad31cfec754..9664cda0052a 100644 --- a/bolt/src/Passes/BinaryPasses.h +++ b/bolt/src/Passes/BinaryPasses.h @@ -16,6 +16,7 @@ #include "BinaryContext.h" #include "BinaryFunction.h" +#include "DynoStats.h" #include "HFSort.h" #include "llvm/Support/CommandLine.h" From fe01cb5c96dad3b2e10cd4ad99cb6441f1407b5f Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 30 Apr 2019 15:47:10 -0700 Subject: [PATCH 542/904] [BOLT] Limit jump table size by containing object Summary: While checking for a size of a jump table, we've used containing section as a boundary. This worked for most cases as typically jump tables are not marked with symbol table entries. However, the compiler may generate objects for indirect goto's. (cherry picked from commit b401fa0724a5b4db5139bf50edf66f181562f499) --- bolt/src/BinaryFunction.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index ae59104e8334..c991e3bc3ded 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -840,7 +840,15 @@ BinaryFunction::processIndirectBranch(MCInst &Instruction, auto ValueOffset = static_cast(ArrayStart - Section->getAddress()); uint64_t Value = 0; std::vector JTOffsetCandidates; - while (ValueOffset <= Section->getSize() - EntrySize) { + auto UpperBound = Section->getSize(); + const auto *JumpTableBD = BC.getBinaryDataAtAddress(ArrayStart); + if (JumpTableBD && JumpTableBD->getSize()) { + UpperBound = ValueOffset + JumpTableBD->getSize(); + assert(UpperBound <= Section->getSize() && + "data object cannot cross a section boundary"); + } + + while (ValueOffset <= UpperBound - EntrySize) { DEBUG(dbgs() << "BOLT-DEBUG: indirect jmp at 0x" << Twine::utohexstr(getAddress() + Offset) << " is referencing address 0x" From d0fb0b014c2fdbacd597f6140b9ee7570c3d012e Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 30 Apr 2019 17:08:22 -0700 Subject: [PATCH 
543/904] [perf2bot] Pass `-f` flag to perf Summary: perf tool requires the input data to be owned by the current user or root, otherwise it rejects the input. Use `-f` option to override this behavior. (cherry picked from commit de93ea4ff7137ba4113b2ad3a9693cb3d8e5aa8a) --- bolt/src/DataAggregator.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/bolt/src/DataAggregator.cpp b/bolt/src/DataAggregator.cpp index f99daf4e7d56..5226e8b5209c 100644 --- a/bolt/src/DataAggregator.cpp +++ b/bolt/src/DataAggregator.cpp @@ -210,6 +210,7 @@ void DataAggregator::launchPerfProcess(StringRef Name, PerfProcessInfo &PPI, *Str++ = 0; } while (true); + Argv.push_back("-f"); Argv.push_back("-i"); Argv.push_back(PerfDataFilename.data()); Argv.push_back(nullptr); From d46b057fe99602dd3e9645897c32fc7e47cde806 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Thu, 2 May 2019 17:42:06 -0700 Subject: [PATCH 544/904] [BOLT] Move JumpTable management to BinaryContext Summary: Make BinaryContext responsible for creation and management of JumpTables. This will be used for detection and resolution of jump table conflicts across functions. 
(cherry picked from commit 20de117816fdca9ad863d898ac53091c5be10b7e) --- bolt/src/BinaryContext.cpp | 75 +++++++++++++++++++++++++++++ bolt/src/BinaryContext.h | 33 +++++++++++++ bolt/src/BinaryFunction.cpp | 94 ++++++++++--------------------------- bolt/src/BinaryFunction.h | 15 ++---- bolt/src/JumpTable.cpp | 32 +++++++------ bolt/src/JumpTable.h | 36 ++++++++++---- 6 files changed, 181 insertions(+), 104 deletions(-) diff --git a/bolt/src/BinaryContext.cpp b/bolt/src/BinaryContext.cpp index 62c71af75d6e..7af8c5cbb61a 100644 --- a/bolt/src/BinaryContext.cpp +++ b/bolt/src/BinaryContext.cpp @@ -253,6 +253,81 @@ BinaryFunction *BinaryContext::createBinaryFunction( return BF; } +std::pair +BinaryContext::createJumpTable(BinaryFunction &Function, + uint64_t Address, + JumpTable::JumpTableType Type, + JumpTable::OffsetEntriesType &&OffsetEntries) { + const auto JumpTableName = generateJumpTableName(Function, Address); + if (auto *JT = getJumpTableContainingAddress(Address)) { + assert(JT->Type == Type && "jump table types have to match"); + assert(JT->Parent == &Function && + "cannot re-use jump table of a different function"); + assert((Address == JT->getAddress() || Type != JumpTable::JTT_PIC) && + "cannot re-use part of PIC jump table"); + // Get or create a new label for the table. + const auto JTOffset = Address - JT->getAddress(); + auto LI = JT->Labels.find(JTOffset); + if (LI == JT->Labels.end()) { + auto *JTStartLabel = registerNameAtAddress(JumpTableName, + Address, + 0, + JT->EntrySize); + auto Result = JT->Labels.emplace(JTOffset, JTStartLabel); + assert(Result.second && "error adding jump table label"); + LI = Result.first; + } + + return std::make_pair(JT, LI->second); + } + + auto *JTStartLabel = Ctx->getOrCreateSymbol(JumpTableName); + const auto EntrySize = + Type == JumpTable::JTT_PIC ? 
4 : AsmInfo->getCodePointerSize(); + + DEBUG(dbgs() << "BOLT-DEBUG: creating jump table " + << JTStartLabel->getName() + << " in function " << Function << " with " + << OffsetEntries.size() << " entries\n"); + + auto *JT = new JumpTable(JumpTableName, + Address, + EntrySize, + Type, + std::move(OffsetEntries), + JumpTable::LabelMapType{{0, JTStartLabel}}, + Function, + *getSectionForAddress(Address)); + + const auto *JTLabel = registerNameAtAddress(JumpTableName, Address, JT); + assert(JTLabel == JTStartLabel); + + JumpTables.emplace(Address, JT); + + // Duplicate the entry for the parent function for easy access. + Function.JumpTables.emplace(Address, JT); + + return std::make_pair(JT, JTLabel); +} + +std::string BinaryContext::generateJumpTableName(const BinaryFunction &BF, + uint64_t Address) { + size_t Id; + uint64_t Offset = 0; + if (const auto *JT = BF.getJumpTableContainingAddress(Address)) { + Offset = Address - JT->getAddress(); + auto Itr = JT->Labels.find(Offset); + if (Itr != JT->Labels.end()) { + return Itr->second->getName(); + } + Id = JumpTableIds.at(JT->getAddress()); + } else { + Id = JumpTableIds[Address] = BF.JumpTables.size(); + } + return ("JUMP_TABLE/" + BF.Names[0] + "." + std::to_string(Id) + + (Offset ? ("." + std::to_string(Offset)) : "")); +} + MCSymbol *BinaryContext::registerNameAtAddress(StringRef Name, uint64_t Address, uint64_t Size, diff --git a/bolt/src/BinaryContext.h b/bolt/src/BinaryContext.h index d25fb4d46d50..1e3126b1aa18 100644 --- a/bolt/src/BinaryContext.h +++ b/bolt/src/BinaryContext.h @@ -17,6 +17,7 @@ #include "BinaryData.h" #include "BinarySection.h" #include "DebugData.h" +#include "JumpTable.h" #include "MCPlusBuilder.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/Triple.h" @@ -145,6 +146,9 @@ class BinaryContext { /// Functions injected by BOLT std::vector InjectedBinaryFunctions; + /// Jump tables for all functions mapped by address. 
+ std::map JumpTables; + public: /// [name] -> [BinaryData*] map used for global symbol resolution. using SymbolMapType = std::map; @@ -199,6 +203,18 @@ class BinaryContext { getBinaryFunctionAtAddress(Address, Shallow); } + /// Return JumpTable containing a given \p Address. + JumpTable *getJumpTableContainingAddress(uint64_t Address) { + auto JTI = JumpTables.upper_bound(Address); + if (JTI == JumpTables.begin()) + return nullptr; + --JTI; + if (JTI->first + JTI->second->getSize() > Address) { + return JTI->second; + } + return nullptr; + } + /// [MCSymbol] -> [BinaryFunction] /// /// As we fold identical functions, multiple symbols can point @@ -272,6 +288,19 @@ class BinaryContext { return InjectedBinaryFunctions; } + /// Construct a jump table for \p Function at \p Address. + /// May create an embedded jump table and return its label as the second + /// element of the pair. + std::pair + createJumpTable(BinaryFunction &Function, + uint64_t Address, + JumpTable::JumpTableType Type, + JumpTable::OffsetEntriesType &&OffsetEntries); + + /// Generate a unique name for jump table at a given \p Address belonging + /// to function \p BF. + std::string generateJumpTableName(const BinaryFunction &BF, uint64_t Address); + public: /// Regular page size. static constexpr unsigned RegularPageSize = 0x1000; @@ -282,6 +311,10 @@ class BinaryContext { /// Map address to a constant island owner (constant data in code section) std::map AddressToConstantIslandMap; + /// A map from jump table address to insertion order. Used for generating + /// jump table names. + std::map JumpTableIds; + /// Set of addresses in the code that are not a function start, and are /// referenced from outside of containing function. E.g. this could happen /// when a function has more than a single entry point. 
diff --git a/bolt/src/BinaryFunction.cpp b/bolt/src/BinaryFunction.cpp index c991e3bc3ded..6eb4218dc5f0 100644 --- a/bolt/src/BinaryFunction.cpp +++ b/bolt/src/BinaryFunction.cpp @@ -772,9 +772,27 @@ BinaryFunction::processIndirectBranch(MCInst &Instruction, DEBUG(dbgs() << "BOLT-DEBUG: addressed memory is 0x" << Twine::utohexstr(ArrayStart) << '\n'); + // List of possible jump targets. + std::vector JTOffsetCandidates; + + auto useJumpTableForInstruction = [&](JumpTable::JumpTableType JTType) { + JumpTable *JT; + const MCSymbol *JTLabel; + std::tie(JT, JTLabel) = BC.createJumpTable(*this, + ArrayStart, + JTType, + std::move(JTOffsetCandidates)); + + BC.MIB->replaceMemOperandDisp(const_cast(*MemLocInstr), + JTLabel, BC.Ctx.get()); + BC.MIB->setJumpTable(Instruction, ArrayStart, IndexRegNum); + + JTSites.emplace_back(Offset, ArrayStart); + }; + // Check if there's already a jump table registered at this address. - if (auto *JT = getJumpTableContainingAddress(ArrayStart)) { - auto JTOffset = ArrayStart - JT->getAddress(); + if (auto *JT = BC.getJumpTableContainingAddress(ArrayStart)) { + const auto JTOffset = ArrayStart - JT->getAddress(); if (Type == IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE && JTOffset != 0) { // Adjust the size of this jump table and create a new one if necessary. // We cannot re-use the entries since the offsets are relative to the @@ -783,7 +801,7 @@ BinaryFunction::processIndirectBranch(MCInst &Instruction, << Twine::utohexstr(JT->getAddress()) << '\n'); JT->OffsetEntries.resize(JTOffset / JT->EntrySize); } else if (Type != IndirectBranchType::POSSIBLE_FIXED_BRANCH) { - // Re-use an existing jump table. Perhaps parts of it. + // Re-use the existing jump table or parts of it. 
if (Type != IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE) { assert(JT->Type == JumpTable::JTT_NORMAL && "normal jump table expected"); @@ -792,24 +810,7 @@ BinaryFunction::processIndirectBranch(MCInst &Instruction, assert(JT->Type == JumpTable::JTT_PIC && "PIC jump table expected"); } - // Get or create a new label for the table. - auto LI = JT->Labels.find(JTOffset); - if (LI == JT->Labels.end()) { - auto *JTStartLabel = - BC.registerNameAtAddress(generateJumpTableName(ArrayStart), - ArrayStart, - 0, - JT->EntrySize); - auto Result = JT->Labels.emplace(JTOffset, JTStartLabel); - assert(Result.second && "error adding jump table label"); - LI = Result.first; - } - - BC.MIB->replaceMemOperandDisp(const_cast(*MemLocInstr), - LI->second, BC.Ctx.get()); - BC.MIB->setJumpTable(Instruction, ArrayStart, IndexRegNum); - - JTSites.emplace_back(Offset, ArrayStart); + useJumpTableForInstruction(JT->Type); return Type; } @@ -839,7 +840,6 @@ BinaryFunction::processIndirectBranch(MCInst &Instruction, DataExtractor DE(SectionContents, BC.AsmInfo->isLittleEndian(), EntrySize); auto ValueOffset = static_cast(ArrayStart - Section->getAddress()); uint64_t Value = 0; - std::vector JTOffsetCandidates; auto UpperBound = Section->getSize(); const auto *JumpTableBD = BC.getBinaryDataAtAddress(ArrayStart); if (JumpTableBD && JumpTableBD->getSize()) { @@ -897,37 +897,10 @@ BinaryFunction::processIndirectBranch(MCInst &Instruction, assert(JTOffsetCandidates.size() > 1 && "expected more than one jump table entry"); - auto JumpTableName = generateJumpTableName(ArrayStart); - auto JumpTableType = - Type == IndirectBranchType::POSSIBLE_JUMP_TABLE + const auto JumpTableType = Type == IndirectBranchType::POSSIBLE_JUMP_TABLE ? 
JumpTable::JTT_NORMAL : JumpTable::JTT_PIC; - - auto *JTStartLabel = BC.Ctx->getOrCreateSymbol(JumpTableName); - - auto JT = llvm::make_unique(JumpTableName, - ArrayStart, - EntrySize, - JumpTableType, - std::move(JTOffsetCandidates), - JumpTable::LabelMapType{{0, JTStartLabel}}, - *BC.getSectionForAddress(ArrayStart)); - - auto *JTLabel = BC.registerNameAtAddress(JumpTableName, - ArrayStart, - JT.get()); - assert(JTLabel == JTStartLabel); - - DEBUG(dbgs() << "BOLT-DEBUG: creating jump table " - << JTStartLabel->getName() - << " in function " << *this << " with " - << JTOffsetCandidates.size() << " entries.\n"); - JumpTables.emplace(ArrayStart, JT.release()); - BC.MIB->replaceMemOperandDisp(const_cast(*MemLocInstr), - JTStartLabel, BC.Ctx.get()); - BC.MIB->setJumpTable(Instruction, ArrayStart, IndexRegNum); - - JTSites.emplace_back(Offset, ArrayStart); + useJumpTableForInstruction(JumpTableType); return Type; } @@ -1558,6 +1531,7 @@ void BinaryFunction::postProcessJumpTables() { break; } } + clearList(JTSites); // Free memory used by jump table offsets. for (auto &JTI : JumpTables) { @@ -3484,24 +3458,6 @@ BinaryFunction::BasicBlockOrderType BinaryFunction::dfs() const { return DFS; } -std::string BinaryFunction::generateJumpTableName(uint64_t Address) const { - auto *JT = getJumpTableContainingAddress(Address); - size_t Id; - uint64_t Offset = 0; - if (JT) { - Offset = Address - JT->getAddress(); - auto Itr = JT->Labels.find(Offset); - if (Itr != JT->Labels.end()) { - return Itr->second->getName(); - } - Id = JumpTableIds.at(JT->getAddress()); - } else { - Id = JumpTableIds[Address] = JumpTables.size(); - } - return ("JUMP_TABLE/" + Names[0] + "." + std::to_string(Id) + - (Offset ? ("." 
+ std::to_string(Offset)) : "")); -} - std::size_t BinaryFunction::hash(bool Recompute, bool UseDFS) const { if (size() == 0) return 0; diff --git a/bolt/src/BinaryFunction.h b/bolt/src/BinaryFunction.h index 3bca1456052a..cafb45c3f297 100644 --- a/bolt/src/BinaryFunction.h +++ b/bolt/src/BinaryFunction.h @@ -451,25 +451,20 @@ class BinaryFunction { /// function and that apply before the entry basic block). CFIInstrMapType CIEFrameInstructions; - /// All compound jump tables for this function. + /// All compound jump tables for this function. This duplicates what's stored + /// in the BinaryContext, but additionally it gives quick access for all + /// jump tables used by this function. + /// /// -> std::map JumpTables; - /// A map from jump table address to insertion order. Used for generating - /// jump table names. - mutable std::map JumpTableIds; - - /// Generate a unique name for this jump table at the given address that - /// should be repeatable no matter what the start address of the table is. - std::string generateJumpTableName(uint64_t Address) const; - /// Iterate over all jump tables associated with this function. iterator_range::const_iterator> jumpTables() const { return make_range(JumpTables.begin(), JumpTables.end()); } - /// All jump table sites in the function. + /// All jump table sites in the function before CFG is built. std::vector> JTSites; /// List of relocations in this function. 
diff --git a/bolt/src/JumpTable.cpp b/bolt/src/JumpTable.cpp index 582d5cf7dee9..7c46a6e434b7 100644 --- a/bolt/src/JumpTable.cpp +++ b/bolt/src/JumpTable.cpp @@ -27,6 +27,23 @@ extern cl::opt JumpTables; extern cl::opt Verbosity; } +JumpTable::JumpTable(StringRef Name, + uint64_t Address, + std::size_t EntrySize, + JumpTableType Type, + OffsetEntriesType &&OffsetEntries, + LabelMapType &&Labels, + BinaryFunction &BF, + BinarySection &Section) + : BinaryData(Name, Address, 0, EntrySize, Section), + EntrySize(EntrySize), + OutputEntrySize(EntrySize), + Type(Type), + OffsetEntries(OffsetEntries), + Labels(Labels), + Parent(&BF) { +} + std::pair JumpTable::getEntriesForAddress(const uint64_t Addr) const { const uint64_t InstOffset = Addr - getAddress(); @@ -174,18 +191,3 @@ void JumpTable::print(raw_ostream &OS) const { } OS << "\n\n"; } - -JumpTable::JumpTable(StringRef Name, - uint64_t Address, - std::size_t EntrySize, - JumpTableType Type, - decltype(OffsetEntries) &&OffsetEntries, - decltype(Labels) &&Labels, - BinarySection &Section) - : BinaryData(Name, Address, 0, EntrySize, Section), - EntrySize(EntrySize), - OutputEntrySize(EntrySize), - Type(Type), - OffsetEntries(OffsetEntries), - Labels(Labels) -{ } diff --git a/bolt/src/JumpTable.h b/bolt/src/JumpTable.h index 26a4abcb9460..7ab462acad76 100644 --- a/bolt/src/JumpTable.h +++ b/bolt/src/JumpTable.h @@ -30,17 +30,26 @@ enum JumpTableSupportLevel : char { JTS_AGGRESSIVE = 4, /// Aggressive splitting of jump tables. }; +class BinaryFunction; + /// Representation of a jump table. /// /// The jump table may include other jump tables that are referenced by /// a different label at a different offset in this jump table. 
class JumpTable : public BinaryData { + friend class BinaryContext; + + JumpTable() = delete; + JumpTable(const JumpTable &) = delete; + JumpTable &operator=(const JumpTable &) = delete; + public: enum JumpTableType : char { JTT_NORMAL, JTT_PIC, }; +public: /// Branch statistics for jump table entries. struct JumpInfo { uint64_t Mispreds{0}; @@ -60,7 +69,8 @@ class JumpTable : public BinaryData { std::vector Entries; /// All the entries as offsets into a function. Invalid after CFG is built. - std::vector OffsetEntries; + using OffsetEntriesType = std::vector; + OffsetEntriesType OffsetEntries; /// Map ->